From ffb5c510181bfbc0bb93fae3f5c31d948dcf1ed4 Mon Sep 17 00:00:00 2001
From: Ankit Aggarwal
Date: Fri, 2 Jun 2023 02:26:32 -0700
Subject: [PATCH 1/2] [Hexagon] Remove v62 support

Remove v62 and make v65 the default ISA version for HVX.
---
 .../src/halide/halide_/PyEnums.cpp     |  2 -
 src/CodeGen_Hexagon.cpp                | 61 ++++++-------------
 src/HexagonOffload.cpp                 | 10 +--
 src/HexagonOptimize.cpp                | 19 +++---
 src/LLVM_Runtime_Linker.cpp            |  5 +-
 src/Target.cpp                         | 27 ++------
 src/Target.h                           |  2 -
 src/runtime/HalideRuntime.h            |  2 -
 test/correctness/gather.cpp            | 25 +++-----
 test/correctness/hexagon_scatter.cpp   |  4 +-
 test/correctness/histogram.cpp         | 12 ++--
 test/correctness/simd_op_check_hvx.cpp | 44 +++++--------
 12 files changed, 67 insertions(+), 146 deletions(-)

diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp
index 4cea2899fcf1..ba07aeed2081 100644
--- a/python_bindings/src/halide/halide_/PyEnums.cpp
+++ b/python_bindings/src/halide/halide_/PyEnums.cpp
@@ -142,8 +142,6 @@ void define_enums(py::module &m) {
         .value("LargeBuffers", Target::Feature::LargeBuffers)
         .value("HVX", Target::Feature::HVX)
         .value("HVX_128", Target::Feature::HVX_128)
-        .value("HVX_v62", Target::Feature::HVX_v62)
-        .value("HVX_v65", Target::Feature::HVX_v65)
         .value("HVX_v66", Target::Feature::HVX_v66)
         .value("FuzzFloatStores", Target::Feature::FuzzFloatStores)
         .value("SoftFloatABI", Target::Feature::SoftFloatABI)
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
index 87f732dc0d54..96fe259d241d 100644
--- a/src/CodeGen_Hexagon.cpp
+++ b/src/CodeGen_Hexagon.cpp
@@ -54,10 +54,6 @@ class CodeGen_Hexagon : public CodeGen_Posix {
                                          std::vector<Type> arg_types, int flags);
 
-    int is_hvx_v65_or_later() const {
-        return (isa_version >= 65);
-    }
-
     using CodeGen_Posix::visit;
 
     /** Nodes for which we want to emit specific hexagon intrinsics */
@@ -130,10 +126,8 @@ CodeGen_Hexagon::CodeGen_Hexagon(const Target &t)
     : CodeGen_Posix(t) {
     if (target.has_feature(Halide::Target::HVX_v66)) {
         isa_version = 66;
-    } else if (target.has_feature(Halide::Target::HVX_v65)) {
-        isa_version = 65;
     } else {
-        isa_version = 62;
+        isa_version = 65;
     }
     user_assert(target.has_feature(Target::HVX))
         << "Creating a Codegen target for Hexagon without the hvx target feature.\n";
@@ -484,13 +478,11 @@ void CodeGen_Hexagon::compile_func(const LoweredFunc &f,
     debug(2) << "Hexagon: Lowering after unpredicating loads/stores:\n"
              << body << "\n\n";
 
-    if (is_hvx_v65_or_later()) {
-        // Generate vscatter-vgathers before optimize_hexagon_shuffles.
-        debug(1) << "Hexagon: Looking for vscatter-vgather...\n";
-        body = scatter_gather_generator(body);
-        debug(2) << "Hexagon: Lowering after vscatter-vgather:\n"
-                 << body << "\n\n";
-    }
+    // Generate vscatter-vgathers before optimize_hexagon_shuffles.
+    debug(1) << "Hexagon: Looking for vscatter-vgather...\n";
+    body = scatter_gather_generator(body);
+    debug(2) << "Hexagon: Lowering after vscatter-vgather:\n"
+                 << body << "\n\n";
 
     debug(1) << "Hexagon: Optimizing shuffles...\n";
     // vlut always indexes 64 bytes of the LUT at a time, even in 128 byte mode.
@@ -537,7 +529,6 @@ struct HvxIntrinsic {
     enum {
         BroadcastScalarsToWords = 1 << 0,  // Some intrinsics need scalar arguments
                                            // broadcasted up to 32 bits.
- v65OrLater = 1 << 1, }; llvm::Intrinsic::ID id; halide_type_t ret_type; @@ -664,7 +655,7 @@ const HvxIntrinsic intrinsic_wrappers[] = { // Absolute value: {INTRINSIC_128B(vabsh), u16v1, "abs.vh", {i16v1}}, {INTRINSIC_128B(vabsw), u32v1, "abs.vw", {i32v1}}, - {INTRINSIC_128B(vabsb), u8v1, "abs.vb", {i8v1}, HvxIntrinsic::v65OrLater}, + {INTRINSIC_128B(vabsb), u8v1, "abs.vb", {i8v1}}, // Absolute difference: {INTRINSIC_128B(vabsdiffub), u8v1, "absd.vub.vub", {u8v1, u8v1}}, @@ -675,21 +666,21 @@ const HvxIntrinsic intrinsic_wrappers[] = { // Averaging: {INTRINSIC_128B(vavgub), u8v1, "avg.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vavguh), u16v1, "avg.vuh.vuh", {u16v1, u16v1}}, - {INTRINSIC_128B(vavguw), u32v1, "avg.vuw.vuw", {u32v1, u32v1}, HvxIntrinsic::v65OrLater}, - {INTRINSIC_128B(vavgb), i8v1, "avg.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater}, + {INTRINSIC_128B(vavguw), u32v1, "avg.vuw.vuw", {u32v1, u32v1}}, + {INTRINSIC_128B(vavgb), i8v1, "avg.vb.vb", {i8v1, i8v1}}, {INTRINSIC_128B(vavgh), i16v1, "avg.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vavgw), i32v1, "avg.vw.vw", {i32v1, i32v1}}, {INTRINSIC_128B(vavgubrnd), u8v1, "avg_rnd.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vavguhrnd), u16v1, "avg_rnd.vuh.vuh", {u16v1, u16v1}}, - {INTRINSIC_128B(vavguwrnd), u32v1, "avg_rnd.vuw.vuw", {u32v1, u32v1}, HvxIntrinsic::v65OrLater}, - {INTRINSIC_128B(vavgbrnd), i8v1, "avg_rnd.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater}, + {INTRINSIC_128B(vavguwrnd), u32v1, "avg_rnd.vuw.vuw", {u32v1, u32v1}}, + {INTRINSIC_128B(vavgbrnd), i8v1, "avg_rnd.vb.vb", {i8v1, i8v1}}, {INTRINSIC_128B(vavghrnd), i16v1, "avg_rnd.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vavgwrnd), i32v1, "avg_rnd.vw.vw", {i32v1, i32v1}}, // This one is weird: i8_sat((u8 - u8)/2). It both saturates and averages. 
{INTRINSIC_128B(vnavgub), i8v1, "navg.vub.vub", {u8v1, u8v1}}, - {INTRINSIC_128B(vnavgb), i8v1, "navg.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater}, + {INTRINSIC_128B(vnavgb), i8v1, "navg.vb.vb", {i8v1, i8v1}}, {INTRINSIC_128B(vnavgh), i16v1, "navg.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vnavgw), i32v1, "navg.vw.vw", {i32v1, i32v1}}, @@ -805,7 +796,7 @@ const HvxIntrinsic intrinsic_wrappers[] = { // Rounding shift right {INTRINSIC_128B(vasrhubrndsat), u8v1, "trunc_satub_shr_rnd.vh", {i16v2, u16}}, {INTRINSIC_128B(vasrhbrndsat), i8v1, "trunc_satb_shr_rnd.vh", {i16v2, u16}}, - {INTRINSIC_128B(vasruhubrndsat), u8v1, "trunc_satub_shr_rnd.vuh", {u16v2, u16}, HvxIntrinsic::v65OrLater}, + {INTRINSIC_128B(vasruhubrndsat), u8v1, "trunc_satub_shr_rnd.vuh", {u16v2, u16}}, {INTRINSIC_128B(vasrwuhrndsat), u16v1, "trunc_satuh_shr_rnd.vw", {i32v2, u32}}, {INTRINSIC_128B(vasrwhrndsat), i16v1, "trunc_sath_shr_rnd.vw", {i32v2, u32}}, {INTRINSIC_128B(vasruwuhrndsat), u16v1, "trunc_satuh_shr_rnd.vuw", {u32v2, u32}}, @@ -824,8 +815,8 @@ const HvxIntrinsic intrinsic_wrappers[] = { {INTRINSIC_128B(vaslw), u32v1, "shl.vuw.w", {u32v1, u32}}, {INTRINSIC_128B(vaslh), i16v1, "shl.vh.h", {i16v1, u16}}, {INTRINSIC_128B(vaslw), i32v1, "shl.vw.w", {i32v1, u32}}, - {INTRINSIC_128B(vasrh_acc), i16v1, "add_shr.vh.vh.uh", {i16v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords | HvxIntrinsic::v65OrLater}, - {INTRINSIC_128B(vaslh_acc), i16v1, "add_shl.vh.vh.uh", {i16v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords | HvxIntrinsic::v65OrLater}, + {INTRINSIC_128B(vasrh_acc), i16v1, "add_shr.vh.vh.uh", {i16v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, + {INTRINSIC_128B(vaslh_acc), i16v1, "add_shl.vh.vh.uh", {i16v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vasrw_acc), i32v1, "add_shr.vw.vw.uw", {i32v1, i32v1, i32}}, {INTRINSIC_128B(vaslw_acc), i32v1, "add_shl.vw.vw.uw", {i32v1, i32v1, i32}}, @@ -886,11 +877,6 @@ llvm::Function *CodeGen_Hexagon::define_hvx_intrinsic(llvm::Function *intrin, internal_assert(intrin) << "Null definition for intrinsic '" << name << "'\n"; llvm::FunctionType *intrin_ty = intrin->getFunctionType(); bool broadcast_scalar_word = flags & HvxIntrinsic::BroadcastScalarsToWords; - bool v65OrLater = flags & HvxIntrinsic::v65OrLater; - - if (v65OrLater && !is_hvx_v65_or_later()) { - return nullptr; - } // Get the types of the arguments we want to pass. 
vector llvm_arg_types; @@ -1789,10 +1775,8 @@ Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name, string CodeGen_Hexagon::mcpu_target() const { if (target.has_feature(Halide::Target::HVX_v66)) { return "hexagonv66"; - } else if (target.has_feature(Halide::Target::HVX_v65)) { - return "hexagonv65"; } else { - return "hexagonv62"; + return "hexagonv65"; } } @@ -1923,12 +1907,10 @@ void CodeGen_Hexagon::visit(const Call *op) { internal_assert(op->args.size() == 1); Type ty = op->args[0].type(); if ((ty.is_vector() && ty.is_int())) { - if (ty.bits() != 8 || is_hvx_v65_or_later()) { - value = call_intrin(op->type, - "halide.hexagon.abs" + type_suffix(op->args[0]), - op->args); - return; - } + value = call_intrin(op->type, + "halide.hexagon.abs" + type_suffix(op->args[0]), + op->args); + return; } } else if (op->is_intrinsic(Call::cast_mask)) { internal_error @@ -2252,9 +2234,6 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { } } else if (alloc->memory_type == MemoryType::VTCM && !alloc->new_expr.defined()) { - if (!is_hvx_v65_or_later()) { - user_error << "VTCM store_in requires HVX_v65 or later.\n"; - } // Calculate size of allocation. Expr size = alloc->type.bytes(); for (const auto &extent : alloc->extents) { diff --git a/src/HexagonOffload.cpp b/src/HexagonOffload.cpp index 1e6de70c2e9b..98a1177fe41f 100644 --- a/src/HexagonOffload.cpp +++ b/src/HexagonOffload.cpp @@ -39,8 +39,8 @@ enum { EF_HEXAGON_MACH_V5 = 0x4, EF_HEXAGON_MACH_V55 = 0x5, EF_HEXAGON_MACH_V60 = 0x60, // Deprecated - EF_HEXAGON_MACH_V61 = 0x61, // Deprecated? - EF_HEXAGON_MACH_V62 = 0x62, + EF_HEXAGON_MACH_V61 = 0x61, // Deprecated + EF_HEXAGON_MACH_V62 = 0x62, // Deprecated EF_HEXAGON_MACH_V65 = 0x65, EF_HEXAGON_MACH_V66 = 0x66, }; @@ -553,10 +553,8 @@ class HexagonLinker : public Linker { HexagonLinker(const Target &target) { if (target.has_feature(Target::HVX_v66)) { flags = Elf::EF_HEXAGON_MACH_V66; - } else if (target.has_feature(Target::HVX_v65)) { - flags = Elf::EF_HEXAGON_MACH_V65; } else { - flags = Elf::EF_HEXAGON_MACH_V62; + flags = Elf::EF_HEXAGON_MACH_V65; } } @@ -980,8 +978,6 @@ Stmt inject_hexagon_rpc(Stmt s, const Target &host_target, Target::Profile, Target::NoAsserts, Target::HVX_128, - Target::HVX_v62, - Target::HVX_v65, Target::HVX_v66, }; for (Target::Feature i : shared_features) { diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 5814bd7a0df4..ef2a6c03f4ce 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -184,7 +184,6 @@ struct Pattern { // re-interleave the result. ReinterleaveOp0 = InterleaveResult | DeinterleaveOp0, - v65orLater = 1 << 10, // Pattern should be matched only for v65 target or later v66orLater = 1 << 11, // Pattern should be matched only for v66 target or later }; @@ -218,10 +217,6 @@ Expr wild_i64x = Variable::make(Type(Type::Int, 64, 0), "*"); // Check if a pattern with flags 'flags' is supported on the target. 
bool check_pattern_target(int flags, const Target &target) { - if ((flags & (Pattern::v65orLater)) && - !target.features_any_of({Target::HVX_v65, Target::HVX_v66})) { - return false; - } if ((flags & (Pattern::v66orLater)) && !target.features_any_of({Target::HVX_v66})) { return false; @@ -697,11 +692,11 @@ class OptimizePatterns : public IRMutator { {"halide.hexagon.add_shr.vw.vw.uw", wild_i32x + (wild_i32x >> wild_u32)}, {"halide.hexagon.add_shl.vw.vw.uw", wild_i32x + (wild_i32x << wild_u32)}, {"halide.hexagon.add_shl.vw.vw.uw", wild_u32x + (wild_u32x << wild_u32)}, - {"halide.hexagon.add_shl.vh.vh.uh", wild_i16x + (wild_i16x << wild_u16), Pattern::v65orLater}, - {"halide.hexagon.add_shl.vh.vh.uh", wild_u16x + (wild_u16x << wild_u16), Pattern::v65orLater}, - {"halide.hexagon.add_shr.vh.vh.uh", wild_i16x + (wild_i16x >> wild_u16), Pattern::v65orLater}, - {"halide.hexagon.add_shl.vh.vh.uh", wild_i16x + (wild_i16x << wild_i16), Pattern::v65orLater}, - {"halide.hexagon.add_shl.vh.vh.uh", wild_u16x + (wild_u16x << wild_u16), Pattern::v65orLater}, + {"halide.hexagon.add_shl.vh.vh.uh", wild_i16x + (wild_i16x << wild_u16)}, + {"halide.hexagon.add_shl.vh.vh.uh", wild_u16x + (wild_u16x << wild_u16)}, + {"halide.hexagon.add_shr.vh.vh.uh", wild_i16x + (wild_i16x >> wild_u16)}, + {"halide.hexagon.add_shl.vh.vh.uh", wild_i16x + (wild_i16x << wild_i16)}, + {"halide.hexagon.add_shl.vh.vh.uh", wild_u16x + (wild_u16x << wild_u16)}, // Non-widening multiply-accumulates with a scalar. {"halide.hexagon.add_mul.vh.vh.b", wild_i16x + widen_right_mul(wild_i16x, wild_i8)}, @@ -892,7 +887,7 @@ class OptimizePatterns : public IRMutator { // Saturating narrowing casts with rounding {"halide.hexagon.trunc_satub_shr_rnd.vh", u8_sat(rounding_shift_right(wild_i16x, wild_u16)), Pattern::DeinterleaveOp0}, {"halide.hexagon.trunc_satb_shr_rnd.vh", i8_sat(rounding_shift_right(wild_i16x, wild_u16)), Pattern::DeinterleaveOp0}, - {"halide.hexagon.trunc_satub_shr_rnd.vuh", u8_sat(rounding_shift_right(wild_u16x, wild_u16)), Pattern::DeinterleaveOp0 | Pattern::v65orLater}, + {"halide.hexagon.trunc_satub_shr_rnd.vuh", u8_sat(rounding_shift_right(wild_u16x, wild_u16)), Pattern::DeinterleaveOp0}, {"halide.hexagon.trunc_satuh_shr_rnd.vw", u16_sat(rounding_shift_right(wild_i32x, wild_u32)), Pattern::DeinterleaveOp0}, {"halide.hexagon.trunc_sath_shr_rnd.vw", i16_sat(rounding_shift_right(wild_i32x, wild_u32)), Pattern::DeinterleaveOp0}, {"halide.hexagon.trunc_satuh_shr_rnd.vuw", u16_sat(rounding_shift_right(wild_u32x, wild_u32)), Pattern::DeinterleaveOp0}, @@ -2318,7 +2313,7 @@ Stmt optimize_hexagon_shuffles(const Stmt &s, int lut_alignment) { } Stmt scatter_gather_generator(Stmt s) { - // Generate vscatter-vgather instruction if target >= v65 + // Generate vscatter-vgather instruction s = substitute_in_all_lets(s); s = ScatterGatherGenerator().mutate(s); s = SyncronizationBarriers().mutate(s); diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index ff2f64986b27..b1d0eff926c8 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -1079,10 +1079,7 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM if (t.arch == Target::Hexagon) { modules.push_back(get_initmod_qurt_hvx(c, bits_64, debug)); modules.push_back(get_initmod_hvx_128_ll(c)); - if (t.features_any_of({Target::HVX_v65, Target::HVX_v66})) { - modules.push_back(get_initmod_qurt_hvx_vtcm(c, bits_64, - debug)); - } + modules.push_back(get_initmod_qurt_hvx_vtcm(c, bits_64, debug)); } else { 
modules.push_back(get_initmod_prefetch(c, bits_64, debug)); diff --git a/src/Target.cpp b/src/Target.cpp index 60c8dbd9cfcd..d5a4e82304cb 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -286,8 +286,6 @@ Target calculate_host_target() { bool is_using_hexagon(const Target &t) { return (t.has_feature(Target::HVX) || - t.has_feature(Target::HVX_v62) || - t.has_feature(Target::HVX_v65) || t.has_feature(Target::HVX_v66) || t.has_feature(Target::HexagonDma) || t.arch == Target::Hexagon); @@ -297,16 +295,10 @@ int get_hvx_lower_bound(const Target &t) { if (!is_using_hexagon(t)) { return -1; } - if (t.has_feature(Target::HVX_v62)) { - return 62; - } - if (t.has_feature(Target::HVX_v65)) { - return 65; - } if (t.has_feature(Target::HVX_v66)) { return 66; } - return 60; + return 65; } } // namespace @@ -491,8 +483,6 @@ const std::map feature_name_map = { {"large_buffers", Target::LargeBuffers}, {"hvx", Target::HVX_128}, {"hvx_128", Target::HVX_128}, - {"hvx_v62", Target::HVX_v62}, - {"hvx_v65", Target::HVX_v65}, {"hvx_v66", Target::HVX_v66}, {"fuzz_float_stores", Target::FuzzFloatStores}, {"soft_float_abi", Target::SoftFloatABI}, @@ -1248,8 +1238,6 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) CUDACapability75, CUDACapability80, CUDACapability86, - HVX_v62, - HVX_v65, HVX_v66, VulkanV10, VulkanV12, @@ -1385,12 +1373,6 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // Same trick as above for CUDA int hvx_version = std::min((unsigned)hvx_a, (unsigned)hvx_b); - if (hvx_version < 62) { - output.features.reset(HVX_v62); - } - if (hvx_version < 65) { - output.features.reset(HVX_v65); - } if (hvx_version < 66) { output.features.reset(HVX_v66); } @@ -1422,10 +1404,9 @@ void target_test() { {{"x86-64-linux-vulkan", "x86-64-linux", "x86-64-linux-vulkan"}}, {{"x86-64-linux-vulkan-vk_v13", "x86-64-linux-vulkan", "x86-64-linux-vulkan"}}, {{"x86-64-linux-vulkan-vk_v13", "x86-64-linux-vulkan-vk_v10", "x86-64-linux-vulkan-vk_v10"}}, - {{"hexagon-32-qurt-hvx_v65", "hexagon-32-qurt-hvx_v62", "hexagon-32-qurt-hvx_v62"}}, - {{"hexagon-32-qurt-hvx_v62", "hexagon-32-qurt", "hexagon-32-qurt"}}, - {{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt", ""}}, - {{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt-hvx", "hexagon-32-qurt-hvx"}}, + {{"hexagon-32-qurt-hvx_v66", "hexagon-32-qurt", "hexagon-32-qurt"}}, + {{"hexagon-32-qurt-hvx_v66-hvx", "hexagon-32-qurt", ""}}, + {{"hexagon-32-qurt-hvx_v66-hvx", "hexagon-32-qurt-hvx", "hexagon-32-qurt-hvx"}}, }; for (const auto &test : gcd_tests) { diff --git a/src/Target.h b/src/Target.h index b27f4b73a99d..580932143fba 100644 --- a/src/Target.h +++ b/src/Target.h @@ -119,8 +119,6 @@ struct Target { HexagonDma = halide_target_feature_hexagon_dma, HVX_128 = halide_target_feature_hvx_128, HVX = HVX_128, - HVX_v62 = halide_target_feature_hvx_v62, - HVX_v65 = halide_target_feature_hvx_v65, HVX_v66 = halide_target_feature_hvx_v66, FuzzFloatStores = halide_target_feature_fuzz_float_stores, SoftFloatABI = halide_target_feature_soft_float_abi, diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index d0c8e4e9fc4e..df3224268cd3 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1352,7 +1352,6 @@ typedef enum halide_target_feature_t { halide_target_feature_large_buffers, ///< Enable 64-bit buffer indexing to support buffers > 2GB. Ignored if bits != 64. halide_target_feature_hvx_128, ///< Enable HVX 128 byte mode. 
- halide_target_feature_hvx_v62, ///< Enable Hexagon v62 architecture. halide_target_feature_fuzz_float_stores, ///< On every floating point store, set the last bit of the mantissa to zero. Pipelines for which the output is very different with this feature enabled may also produce very different output on different processors. halide_target_feature_soft_float_abi, ///< Enable soft float ABI. This only enables the soft float ABI calling convention, which does not necessarily use soft floats. halide_target_feature_msan, ///< Enable hooks for MSAN support. @@ -1365,7 +1364,6 @@ typedef enum halide_target_feature_t { halide_target_feature_trace_stores, ///< Trace all stores done by the pipeline. Equivalent to calling Func::trace_stores on every non-inlined Func. halide_target_feature_trace_realizations, ///< Trace all realizations done by the pipeline. Equivalent to calling Func::trace_realizations on every non-inlined Func. halide_target_feature_trace_pipeline, ///< Trace the pipeline. - halide_target_feature_hvx_v65, ///< Enable Hexagon v65 architecture. halide_target_feature_hvx_v66, ///< Enable Hexagon v66 architecture. halide_target_feature_cl_half, ///< Enable half support on OpenCL targets halide_target_feature_strict_float, ///< Turn off all non-IEEE floating-point optimization. Currently applies only to LLVM targets. diff --git a/test/correctness/gather.cpp b/test/correctness/gather.cpp index 2d9bef71f326..9096b6927cb2 100644 --- a/test/correctness/gather.cpp +++ b/test/correctness/gather.cpp @@ -10,7 +10,7 @@ bool test() { const int W_img = 128; const int H_img = 8; const int W_lut = 256; - const int H_lut = (target.has_feature(Target::HVX_v65)) ? 32 : 1; + const int H_lut = (target.has_feature(Target::HVX)) ? 32 : 1; srand(time(0)); @@ -49,17 +49,15 @@ bool test() { .parallel(y) .vectorize(x, vector_size); - if (target.features_any_of({Target::HVX_v65, Target::HVX_v66})) { - lut_vtcm - .store_in(MemoryType::VTCM) - .compute_at(output, Var::outermost()) - .vectorize(x, vector_size); + lut_vtcm + .store_in(MemoryType::VTCM) + .compute_at(output, Var::outermost()) + .vectorize(x, vector_size); - output_vtcm - .store_in(MemoryType::VTCM) - .compute_at(output, y) - .vectorize(x, vector_size); - } + output_vtcm + .store_in(MemoryType::VTCM) + .compute_at(output, y) + .vectorize(x, vector_size); } Buffer output_buf = output.realize({W_img, H_img}); @@ -80,10 +78,7 @@ bool test() { } int main() { - // With hexagon targets >=v65 with hvx, we expect to see gathers for - // uint16_t, int16_t, uint32_t, int32_t - // For targets () || !test() || !test() || diff --git a/test/correctness/hexagon_scatter.cpp b/test/correctness/hexagon_scatter.cpp index 6fde0669ac19..23fe563fe8c7 100644 --- a/test/correctness/hexagon_scatter.cpp +++ b/test/correctness/hexagon_scatter.cpp @@ -76,9 +76,7 @@ int test() { .parallel(y) .vectorize(x, vector_size / 2); - if (target.features_any_of({Target::HVX_v65, Target::HVX_v66})) { - f.store_in(MemoryType::VTCM); - } + f.store_in(MemoryType::VTCM); } Buffer buf = g.realize({W, H}); diff --git a/test/correctness/histogram.cpp b/test/correctness/histogram.cpp index bfb69058402d..901367f2460a 100644 --- a/test/correctness/histogram.cpp +++ b/test/correctness/histogram.cpp @@ -50,14 +50,12 @@ bool test() { .compute_at(g, Var::outermost()) .vectorize(x, vector_size); - if (target.features_any_of({Target::HVX_v65, Target::HVX_v66})) { - hist.store_in(MemoryType::VTCM); + hist.store_in(MemoryType::VTCM); - hist - .update(0) - .allow_race_conditions() - .vectorize(r.x, 
vector_size); - } + hist + .update(0) + .allow_race_conditions() + .vectorize(r.x, vector_size); } else { hist.compute_root(); } diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 4ed6753301ec..0bf70395d65a 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -13,7 +13,7 @@ // simd_op_check into two tests, simd_op_check.cpp and simd_op_check_hvx.cpp // so that the latter is free to do its own thing - for simd_op_check_hvx.cpp // to run any tests, all that is needed is that HL_TARGET have a HVX related -// target feature, i.e. one of HVX, HVX_v62, HVX_v65 and HVX_v66. +// target feature, i.e. one of HVX and HVX_v66. using namespace Halide; using namespace Halide::ConciseCasts; @@ -52,10 +52,8 @@ class SimdOpCheckHVX : public SimdOpCheckTest { int isa_version; if (target.has_feature(Halide::Target::HVX_v66)) { isa_version = 66; - } else if (target.has_feature(Halide::Target::HVX_v65)) { - isa_version = 65; } else { - isa_version = 62; + isa_version = 65; } // Verify that unaligned loads use the right instructions, and don't try to use @@ -166,13 +164,11 @@ class SimdOpCheckHVX : public SimdOpCheckTest { check("vnavg(v*.ub,v*.ub)", hvx_width / 1, i8((i16(u8_1) - i16(u8_2)) / 2)); check("vnavg(v*.h,v*.h)", hvx_width / 2, i16((i32(i16_1) - i32(i16_2)) / 2)); check("vnavg(v*.w,v*.w)", hvx_width / 4, i32((i64(i32_1) - i64(i32_2)) / 2)); - if (isa_version >= 65) { - check("vavg(v*.b,v*.b)", hvx_width / 1, i8((i16(i8_1) + i16(i8_2)) / 2)); - check("vavg(v*.b,v*.b):rnd", hvx_width / 1, i8((i16(i8_1) + i16(i8_2) + 1) / 2)); - check("vavg(v*.uw,v*.uw)", hvx_width / 4, u32((u64(u32_1) + u64(u32_2)) / 2)); - check("vavg(v*.uw,v*.uw):rnd", hvx_width / 4, u32((u64(u32_1) + u64(u32_2) + 1) / 2)); - check("vnavg(v*.b,v*.b)", hvx_width / 1, i8((i16(i8_1) - i16(i8_2)) / 2)); - } + check("vavg(v*.b,v*.b)", hvx_width / 1, i8((i16(i8_1) + i16(i8_2)) / 2)); + check("vavg(v*.b,v*.b):rnd", hvx_width / 1, i8((i16(i8_1) + i16(i8_2) + 1) / 2)); + check("vavg(v*.uw,v*.uw)", hvx_width / 4, u32((u64(u32_1) + u64(u32_2)) / 2)); + check("vavg(v*.uw,v*.uw):rnd", hvx_width / 4, u32((u64(u32_1) + u64(u32_2) + 1) / 2)); + check("vnavg(v*.b,v*.b)", hvx_width / 1, i8((i16(i8_1) - i16(i8_2)) / 2)); // The behavior of shifts larger than the type behave differently // on HVX vs. the scalar processor, so we clamp. @@ -334,9 +330,7 @@ class SimdOpCheckHVX : public SimdOpCheckTest { check("v*.ub = vasr(v*.h,v*.h,r*):rnd:sat", hvx_width / 1, u8_sat((i32(i16_1) + 8) / 16)); check("v*.b = vasr(v*.h,v*.h,r*):rnd:sat", hvx_width / 1, i8_sat((i32(i16_1) + 16) / 32)); - if (isa_version >= 65) { - check("v*.ub = vasr(v*.uh,v*.uh,r*):rnd:sat", hvx_width / 1, u8_sat((u32(u16_1) + 32) / 64)); - } + check("v*.ub = vasr(v*.uh,v*.uh,r*):rnd:sat", hvx_width / 1, u8_sat((u32(u16_1) + 32) / 64)); // int32 is safe for overflow, allow non-widening rounding. 
check("v*.uh = vasr(v*.w,v*.w,r*):rnd:sat", hvx_width / 2, u16_sat((i32_1 + 64) / 128)); check("v*.h = vasr(v*.w,v*.w,r*):rnd:sat", hvx_width / 2, i16_sat((i32_1 + 128) / 256)); @@ -447,9 +441,7 @@ class SimdOpCheckHVX : public SimdOpCheckTest { check("vabs(v*.h)", hvx_width / 2, abs(i16_1)); check("vabs(v*.w)", hvx_width / 4, abs(i32_1)); - if (isa_version >= 65) { - check("vabs(v*.b)", hvx_width / 1, abs(i8_1)); - } + check("vabs(v*.b)", hvx_width / 1, abs(i8_1)); check("vmpy(v*.ub,v*.ub)", hvx_width / 1, u16(u8_1) * u16(u8_2)); check("vmpy(v*.b,v*.b)", hvx_width / 1, i16(i8_1) * i16(i8_2)); @@ -632,15 +624,13 @@ class SimdOpCheckHVX : public SimdOpCheckTest { check("v*.w += vasl(v*.w,r*)", hvx_width / 4, i32_1 + (i32_2 << u32(y % 32))); check("v*.w += vasr(v*.w,r*)", hvx_width / 4, i32_1 + (i32_2 >> u32(y % 32))); - if (isa_version >= 65) { - check("v*.h += vasl(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 << u16(y % 16))); - check("v*.h += vasr(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 >> u16(y % 16))); - check("v*.h += vasl(v*.h,r*)", hvx_width / 2, u16_1 + (u16_2 * 16)); - check("v*.h += vasl(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 * 16)); - check("v*.h += vasl(v*.h,r*)", hvx_width / 2, u16_1 + (16 * u16_2)); - check("v*.h += vasl(v*.h,r*)", hvx_width / 2, i16_1 + (16 * i16_2)); - check("v*.h += vasr(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 / 16)); - } + check("v*.h += vasl(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 << u16(y % 16))); + check("v*.h += vasr(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 >> u16(y % 16))); + check("v*.h += vasl(v*.h,r*)", hvx_width / 2, u16_1 + (u16_2 * 16)); + check("v*.h += vasl(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 * 16)); + check("v*.h += vasl(v*.h,r*)", hvx_width / 2, u16_1 + (16 * u16_2)); + check("v*.h += vasl(v*.h,r*)", hvx_width / 2, i16_1 + (16 * i16_2)); + check("v*.h += vasr(v*.h,r*)", hvx_width / 2, i16_1 + (i16_2 / 16)); check("vcl0(v*.uh)", hvx_width / 2, count_leading_zeros(u16_1)); check("vcl0(v*.uw)", hvx_width / 4, count_leading_zeros(u32_1)); @@ -710,8 +700,6 @@ int main(int argc, char **argv) { { Target("hexagon-32-noos-hvx"), Target("hexagon-32-noos-hvx-hvx_128"), - Target("hexagon-32-noos-hvx-hvx_128-hvx_v62"), - Target("hexagon-32-noos-hvx-hvx_128-hvx_v65"), Target("hexagon-32-noos-hvx-hvx_128-hvx_v66"), }); } From 73936357f1b6e9d12262feeb805edfc4610c0ab0 Mon Sep 17 00:00:00 2001 From: Ankit Aggarwal Date: Fri, 2 Jun 2023 05:37:57 -0700 Subject: [PATCH 2/2] Run clang-format --- src/CodeGen_Hexagon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 96fe259d241d..954fa39b716c 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -482,7 +482,7 @@ void CodeGen_Hexagon::compile_func(const LoweredFunc &f, debug(1) << "Hexagon: Looking for vscatter-vgather...\n"; body = scatter_gather_generator(body); debug(2) << "Hexagon: Lowering after vscatter-vgather:\n" - << body << "\n\n"; + << body << "\n\n"; debug(1) << "Hexagon: Optimizing shuffles...\n"; // vlut always indexes 64 bytes of the LUT at a time, even in 128 byte mode.