diff --git a/Android.bp b/Android.bp index 159aebb516..69379472b0 100644 --- a/Android.bp +++ b/Android.bp @@ -426,6 +426,8 @@ cc_library_static { "src/cpu/kernels/CpuDirectConv2dKernel.cpp", "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp", "src/cpu/kernels/CpuDirectConv3dKernel.cpp", + "src/cpu/kernels/CpuDynamicGemmKernel.cpp", + "src/cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp", "src/cpu/kernels/CpuElementwiseKernel.cpp", "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp", "src/cpu/kernels/CpuFillKernel.cpp", @@ -609,6 +611,7 @@ cc_library_static { "src/cpu/operators/CpuDequantize.cpp", "src/cpu/operators/CpuDirectConv2d.cpp", "src/cpu/operators/CpuDirectConv3d.cpp", + "src/cpu/operators/CpuDynamicGemm.cpp", "src/cpu/operators/CpuElementwise.cpp", "src/cpu/operators/CpuElementwiseUnary.cpp", "src/cpu/operators/CpuFill.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a31e61a76..2e05612398 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 43.0.0 + VERSION 44.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) @@ -333,3 +333,16 @@ if(ARM_COMPUTE_BUILD_EXAMPLES) endforeach() endif() # ARM_COMPUTE_BUILD_EXAMPLES + +# Install libraries +install(TARGETS arm_compute arm_compute_graph + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +# Install test executables +if(ARM_COMPUTE_BUILD_TESTING) + install(TARGETS arm_compute_validation_framework arm_compute_benchmark arm_compute_validation + RUNTIME DESTINATION "${CMAKE_INSTALL_LIBDIR}/tests" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tests" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tests") +endif() diff --git a/README.md b/README.md index a5387961b4..84a3639156 100644 --- a/README.md 
+++ b/README.md @@ -9,7 +9,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-24.11-green) +# Compute Library ![](https://img.shields.io/badge/latest_release-24.11.1-green) The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
@@ -37,7 +37,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-24.11-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/index.xhtml) +[![Documentation](https://img.shields.io/badge/documentation-24.11.1-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11.1/index.xhtml) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | 
[![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-armv7a-cpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-bin.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-gpu-bin.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-armv7a-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-android-aarch64-cpu-bin.tar.gz) 
[![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-android-aarch64-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11.1/arm_compute-v24.11.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.11) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.11.1-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.11.1) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong @@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract ## Experimental builds -**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/how_to_build.xhtml) for more details. +**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11.1/how_to_build.xhtml) for more details.
## How to contribute -Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/contribution_guidelines.xhtml). +Contributions to the Compute Library are more than welcome. If you are interested in contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11.1/contribution_guidelines.xhtml). ### Developer Certificate of Origin (DCO) Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/) diff --git a/SConscript b/SConscript index 784db8edcb..dca3ce44af 100644 --- a/SConscript +++ b/SConscript @@ -33,8 +33,8 @@ import codecs import platform import SCons -VERSION = "v24.11" -LIBRARY_VERSION_MAJOR = 43 +VERSION = "v24.11.1" +LIBRARY_VERSION_MAJOR = 44 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h index c1707e262f..8dcd94293c 100644 --- a/arm_compute/core/TensorShape.h +++ b/arm_compute/core/TensorShape.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021, 2023 Arm Limited. + * Copyright (c) 2016-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TENSORSHAPE_H -#define ARM_COMPUTE_TENSORSHAPE_H +#ifndef ACL_ARM_COMPUTE_CORE_TENSORSHAPE_H +#define ACL_ARM_COMPUTE_CORE_TENSORSHAPE_H #include "arm_compute/core/Dimensions.h" #include "arm_compute/core/Error.h" @@ -35,7 +35,12 @@ namespace arm_compute { -/** Shape of a tensor */ +/** Shape of a tensor. + * + * It is allowed to set one or several dimensions of a tensor shape to size 0. + * In this case the dimensions of size 0 and the whole tensor shape are + * considered dynamic. + */ class TensorShape : public Dimensions { public: @@ -77,26 +82,17 @@ class TensorShape : public Dimensions */ TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true, bool increase_dim_unit = true) { - // Clear entire shape if one dimension is zero - if (value == 0) - { - _num_dimensions = 0; - std::fill(_id.begin(), _id.end(), 0); - } - else - { - // Make sure all empty dimensions are filled with 1 - std::fill(_id.begin() + _num_dimensions, _id.end(), 1); + // Make sure all empty dimensions are filled with 1 + std::fill(_id.begin() + _num_dimensions, _id.end(), 1); - // Set the specified dimension and increase the number of dimensions if - // necessary - Dimensions::set(dimension, value, increase_dim_unit); + // Set the specified dimension and increase the number of dimensions if + // necessary + Dimensions::set(dimension, value, increase_dim_unit); - // Correct number dimensions to ignore trailing dimensions of size 1 - if (apply_dim_correction) - { - apply_dimension_correction(); - } + // Correct number dimensions to ignore trailing dimensions of size 1 + if (apply_dim_correction) + { + apply_dimension_correction(); } return *this; } @@ -244,6 +240,33 @@ class TensorShape : public Dimensions return bc_shape; } + /** Check if the tensor shape is dynamic. + * + * If any dimension of the tensor shape has size 0, then this dimension + * and the whole shape are considered dynamic. 
+ * + * @return True if the tensor shape is dynamic, else false. + */ + bool is_dynamic() const + { + return std::any_of(cbegin(), cend(), [](const auto &s) { return s == 0; }); + } + + /** Check if a given dimension of the tensor shape is dynamic. + * + * If a dimension of the tensor shape has size 0, then this dimension + * and the whole shape are considered dynamic. + * + * @param[in] dim Dimension index. + * + * @return True if dimension dim is dynamic, else false. + */ + bool is_dynamic(const size_t dim) const + { + ARM_COMPUTE_ERROR_ON(dim >= TensorShape::num_max_dimensions); + return _id[dim] == 0; + } + private: /** Remove trailing dimensions of size 1 from the reported number of dimensions. */ void apply_dimension_correction() @@ -262,4 +285,4 @@ class TensorShape : public Dimensions } }; } // namespace arm_compute -#endif /*ARM_COMPUTE_TENSORSHAPE_H*/ +#endif // ACL_ARM_COMPUTE_CORE_TENSORSHAPE_H diff --git a/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h b/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h index 5958382f6c..759ff120e5 100644 --- a/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h +++ b/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h @@ -149,6 +149,11 @@ class CpuGemmAssemblyDispatch : arm_compute::experimental::IOperator const ITensorInfo *d, const GEMMInfo &gemm_info = GEMMInfo()); + /** Indicates whether or not there is a stateless implementation for the configured GEMM + * @return a bool: true if the implementation is stateless; false if not. + */ + bool has_stateless_impl() const; + /** Checks if activation is supported by the gemm assembly dispatcher * * @param[in] activation Activation to check diff --git a/docs/Doxyfile b/docs/Doxyfile index 0621168e94..5718686cf0 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -60,7 +60,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 24.11 +PROJECT_NUMBER = 24.11.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/filelist.json b/filelist.json index 38cdff601d..1f53a1e1ad 100644 --- a/filelist.json +++ b/filelist.json @@ -1581,6 +1581,8 @@ "files": { "common": [ "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp", + "src/cpu/kernels/CpuDynamicGemmKernel.cpp", + "src/cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp", "src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp", "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp", "src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp", @@ -1593,6 +1595,7 @@ "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp", "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp", "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp", + "src/cpu/operators/CpuDynamicGemm.cpp", "src/cpu/operators/CpuGemm.cpp", "src/cpu/operators/CpuGemmLowpOutputStage.cpp", "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp", diff --git a/src/BUILD.bazel b/src/BUILD.bazel index ed869de9aa..547c98576e 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -709,6 +709,8 @@ filegroup( "cpu/kernels/CpuDirectConv2dKernel.cpp", "cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp", "cpu/kernels/CpuDirectConv3dKernel.cpp", + "cpu/kernels/CpuDynamicGemmKernel.cpp", + "cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp", "cpu/kernels/CpuElementwiseKernel.cpp", "cpu/kernels/CpuElementwiseUnaryKernel.cpp", "cpu/kernels/CpuFillKernel.cpp", @@ -892,6 +894,7 @@ filegroup( "cpu/operators/CpuDequantize.cpp", "cpu/operators/CpuDirectConv2d.cpp", "cpu/operators/CpuDirectConv3d.cpp", + "cpu/operators/CpuDynamicGemm.cpp", "cpu/operators/CpuElementwise.cpp", "cpu/operators/CpuElementwiseUnary.cpp", "cpu/operators/CpuFill.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e8ae6705ac..ff3bed96df 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ 
-700,6 +700,8 @@ target_sources( cpu/kernels/CpuDirectConv2dKernel.cpp cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp cpu/kernels/CpuDirectConv3dKernel.cpp + cpu/kernels/CpuDynamicGemmKernel.cpp + cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp cpu/kernels/CpuElementwiseKernel.cpp cpu/kernels/CpuElementwiseUnaryKernel.cpp cpu/kernels/CpuFillKernel.cpp @@ -883,6 +885,7 @@ target_sources( cpu/operators/CpuDequantize.cpp cpu/operators/CpuDirectConv2d.cpp cpu/operators/CpuDirectConv3d.cpp + cpu/operators/CpuDynamicGemm.cpp cpu/operators/CpuElementwise.cpp cpu/operators/CpuElementwiseUnary.cpp cpu/operators/CpuFill.cpp diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index 09e220e75e..4f96eeebbe 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -416,7 +416,7 @@ CpuInfo CpuInfo::build() return info; #elif defined(__aarch64__) && defined(_WIN64) /* #elif defined(__aarch64__) && defined(__APPLE__) */ CpuIsaInfo isainfo; - isainfo.neon = true; + isainfo.neon = IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE); isainfo.dot = IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE); if (NTDDI_VERSION >= NTDDI_WIN11_GE) { diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 166260a3c0..0f28e26884 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_HELPERS_ASYMM_H -#define ARM_COMPUTE_HELPERS_ASYMM_H +#ifndef ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_ASYMM_H +#define ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_ASYMM_H #include "helpers.h" @@ -115,17 +115,16 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ - { \ - const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ - const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ - VEC_DATA_TYPE(int, size) \ - mask = (one << exponent) - one; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ - return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + rounding = one << (exponent - one); \ + VEC_DATA_TYPE(int, size) \ + _tmp = (x + rounding) >> exponent; \ + return select(x, _tmp, (SELECT_VEC_DATA_TYPE(int, size))(exponent > 0)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -146,19 +145,8 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) b_64 = convert_long##size(b); \ VEC_DATA_TYPE(long, size) \ ab_64 = a_64 * b_64; \ - /* Revert COMPMID-907 */ \ - VEC_DATA_TYPE(long, size) \ - mask1 = 1 << 30; \ - VEC_DATA_TYPE(long, size) \ - mask2 = 1 - (1 << 30); \ - VEC_DATA_TYPE(long, size) \ - is_positive_or_zero = ab_64 >= 0; \ - VEC_DATA_TYPE(long, size) \ 
- nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \ - VEC_DATA_TYPE(long, size) \ - mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + ab_x2_high32 = convert_int##size(ab_64 >> 31); \ return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \ } @@ -583,4 +571,4 @@ MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) -#endif // ARM_COMPUTE_HELPERS_ASYMM_H +#endif // ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_ASYMM_H diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 8129606277..f2bb16b650 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS -#define ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS +#ifndef ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS_H +#define ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS_H // *INDENT-OFF* // clang-format off @@ -993,17 +993,12 @@ long a_64 = (long)(_src); \ long b_64 = (long)(DST_MULTIPLIER); \ long ab_64 = a_64 * b_64; \ - long mask1 = 1 << 30; \ - long mask2 = 1 - (1 << 30); \ - long is_positive_or_zero = ab_64 >= 0; \ - long nudge = select(mask2, mask1, is_positive_or_zero); \ - SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \ + SRC_DATA_TYPE ab_x2_high32 = CONVERT(ab_64 >> 31, SRC_DATA_TYPE); \ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \ - if(DST_SHIFT >= 0) \ + if(DST_SHIFT > 0) \ { \ - long mask = ((((int)1) << DST_SHIFT) - (long)1); \ - long threshold = _tmp < (int)0 ? 
(mask >> 1) + (long)1 : (mask >> 1) + 0; \ - _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \ + int rounding = (int)1 << (DST_SHIFT - (int)1); \ + _tmp = (_tmp + rounding) >> DST_SHIFT; \ } \ _tmp += DST_OFFSET; \ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \ @@ -1041,17 +1036,11 @@ long a_64 = (long)(_src); \ long b_64 = (long)(_dst_multiplier); \ long ab_64 = a_64 * b_64; \ - long mask1 = 1 << 30; \ - long mask2 = 1 - (1 << 30); \ - long is_positive_or_zero = ab_64 >= 0; \ - long nudge = select(mask2, mask1, is_positive_or_zero); \ - SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \ + SRC_DATA_TYPE ab_x2_high32 = CONVERT(ab_64 >> 31, SRC_DATA_TYPE); \ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \ - long mask = ((((int)1) << _dst_shift) - (int)1); \ - long threshold = (mask >> 1) + any(_tmp); \ - _tmp2 = _tmp >> _dst_shift; \ - _tmp2 += select(0, 1, (_tmp & mask) > threshold); \ - _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \ + int rounding = (int)1 << (_dst_shift - (int)1); \ + _tmp2 = (_tmp + rounding) >> _dst_shift; \ + _tmp = select(_tmp, _tmp2, _dst_shift > 0); \ _tmp += DST_OFFSET; \ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \ }) \ @@ -1083,17 +1072,12 @@ long a_64 = (long)(_src); \ long b_64 = (long)(DST_MULTIPLIER); \ long ab_64 = a_64 * b_64; \ - long mask1 = 1 << 30; \ - long mask2 = 1 - (1 << 30); \ - long is_positive_or_zero = ab_64 >= 0; \ - long nudge = select(mask2, mask1, is_positive_or_zero); \ - SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \ + SRC_DATA_TYPE ab_x2_high32 = CONVERT(ab_64 >> 31, SRC_DATA_TYPE); \ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \ - if(DST_SHIFT >= 0) \ + if(DST_SHIFT > 0) \ { \ - long mask = ((((int)1) << DST_SHIFT) - (int)1); \ - long threshold = _tmp < (int)0 ? 
(mask >> 1) + (long)1 : (mask >> 1) + 0; \ - _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \ + int rounding = (int)1 << (DST_SHIFT - (int)1); \ + _tmp = (_tmp + rounding) >> DST_SHIFT; \ } \ _tmp += DST_OFFSET; \ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \ @@ -1448,4 +1432,4 @@ }) \ }) -#endif /* ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS */ +#endif // ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS_H diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h index 522369309b..a1c5855a70 100644 --- a/src/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -24,6 +24,9 @@ #ifndef ACL_SRC_CORE_NEON_NEASYMM_H #define ACL_SRC_CORE_NEON_NEASYMM_H +#include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/core/Rounding.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" @@ -98,18 +101,18 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift))); in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift))); - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + in_s32.val[0] = vqdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); } else { // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - 
in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + in_s32.val[0] = vqdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); // Round to the nearest division by a power-of-two using result_shift_s32 in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); @@ -173,18 +176,18 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift))); in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift))); - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + in_s32.val[0] = vqdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); } else { // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + in_s32.val[0] = vqdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = 
vqdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); // Round to the nearest division by a power-of-two using result_shift_s32 in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); @@ -239,10 +242,10 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar int32x4x4_t res_shift_gt0 = { - vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), - vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), - vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), - vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]), + vqdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), + vqdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), + vqdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), + vqdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]), }; // Round to the nearest division by a power-of-two using result_shift_s32 res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]); @@ -256,10 +259,10 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))), vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))), }; - res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]); - res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]); - res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]); - res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); + res_shift_lt0.val[0] = vqdmulhq_s32(res_shift_lt0.val[0], 
result_fixedpoint_multiplier.val[0]); + res_shift_lt0.val[1] = vqdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]); + res_shift_lt0.val[2] = vqdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]); + res_shift_lt0.val[3] = vqdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); // Select result depending on shift value const uint32x4x4_t mask_lt0 = { @@ -328,12 +331,12 @@ inline uint8_t finalize_quantization(int32_t in_value, if (result_shift < 0) { in_value = vgetq_lane_s32( - vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + vqdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32(vqdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); // Shift value by result_shift_s32 in_value = rounding_divide_by_pow2(in_value, result_shift); } @@ -376,12 +379,12 @@ inline int8_t finalize_quantization(int32_t in_value, if (result_shift < 0) { in_value = vgetq_lane_s32( - vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + vqdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32(vqdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); // Shift value by result_shift_s32 in_value = rounding_divide_by_pow2(in_value, result_shift); @@ -444,17 +447,14 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn /** Dequantize a neon vector holding 16 quantized values. 
* - * @param[in] qv Input values to be dequantized. - * @param[in] qi Quantization information to be used in the computation. + * @param[in] qv Input values to be dequantized. + * @param[in] voffset Vectorized dequantization offset. + * @param[in] vscale Vectorized dequantization scale. * * @return Dequantized values in a neon vector */ -inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi) +inline float32x4x4_t vdequantize(const uint8x16_t &qv, const int32x4_t &voffset, const float32x4_t &vscale) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t vdequantized_input = {{ vmulq_f32(vcvtq_f32_s32( vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), @@ -472,19 +472,32 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization return vdequantized_input; } -/** Dequantize a neon vector holding 16 signed quantized values. +/** Dequantize a neon vector holding 16 quantized values. * * @param[in] qv Input values to be dequantized. * @param[in] qi Quantization information to be used in the computation. * * @return Dequantized values in a neon vector */ -inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) +inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + return vdequantize(qv, voffset, vscale); +} + +/** Dequantize a neon vector holding 16 signed quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] voffset Vectorized dequantization offset. + * @param[in] vscale Vectorized dequantization scale. 
+ * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, const int32x4_t &voffset, const float32x4_t &vscale) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t vdequantized_input = {{ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), @@ -494,6 +507,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI return vdequantized_input; } +/** Dequantize a neon vector holding 16 signed quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + return vdequantize(qv, voffset, vscale); +} + /** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values. * * @param[in] qv Input values to be dequantized. 
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl index d995b6e2fc..be9cc961ce 100644 --- a/src/core/NEON/NEMath.inl +++ b/src/core/NEON/NEMath.inl @@ -429,25 +429,19 @@ inline float32x2_t vsin_f32(float32x2_t val) inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent) { - const int32x4_t shift_vec = vnegq_s32(exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); - const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed_up_x, shift_vec); + const int32x4_t shift_vec = vnegq_s32(exponent); + return vrshlq_s32(x, shift_vec); } inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) { - const int32x4_t shift_vec = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); - const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed_up_x, shift_vec); + const int32x4_t shift_vec = vdupq_n_s32(-exponent); + return vrshlq_s32(x, shift_vec); } inline int32_t rounding_divide_by_pow2(int32_t x, int exponent) { - const int32_t mask = (1 << exponent) - 1; - const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); - return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); + return (exponent == 0) ? x : ((x + (1 << (exponent - 1))) >> exponent); } inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in) diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h index ec246efc8c..ac6cc130c8 100644 --- a/src/core/NEON/NESymm.h +++ b/src/core/NEON/NESymm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NESYMM_H -#define ARM_COMPUTE_NESYMM_H +#ifndef ACL_SRC_CORE_NEON_NESYMM_H +#define ACL_SRC_CORE_NEON_NESYMM_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" @@ -229,11 +229,11 @@ inline int32x4x2_t multiply_by_quantized_multiplier_2row(int32x4x2_t input, int3 const auto one_shifted = 1 << left_shift; int32x4x2_t result; - result.val[0] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift); - result.val[1] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift); + result.val[0] = rounding_divide_by_pow2(vqdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift); + result.val[1] = rounding_divide_by_pow2(vqdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift); return result; } } // namespace arm_compute -#endif // ARM_COMPUTE_NESYMM_H +#endif // ACL_SRC_CORE_NEON_NESYMM_H diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index 834751f1fe..ec87319941 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -145,8 +145,8 @@ class GemmHybrid : public GemmCommon { return true; } - // Execute - void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { + // Stateless execute + void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &, int, GemmArrays& g_array) override { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -190,17 +190,17 @@ class GemmHybrid : public GemmCommon { auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); #endif - strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, + strat.kernel(g_array._Aptr + (multi * g_array._A_multi_stride) + (batch * g_array._A_batch_stride) + (m_start * 
g_array._lda) + k0, g_array._lda, b_panel, - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, + g_array._Cptr + (multi * g_array._C_multi_stride) + (batch * g_array._C_batch_stride) + (m_start * g_array._ldc) + n0, g_array._ldc, (m_end - m_start), (nmax - n0), kmax-k0, - (strategy::supports_bias() && first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + (strategy::supports_bias() && first_pass && g_array._bias) ? g_array._bias + (multi * g_array._bias_multi_stride) + n0 : nullptr, last_pass ? _act : Activation(), !first_pass); // Add bias externally if needed - if (!strategy::supports_bias() && this->_bias && first_pass) { - bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, - this->_bias + (multi * this->_bias_multi_stride) + n0, + if (!strategy::supports_bias() && g_array._bias && first_pass) { + bias_adder(g_array._Cptr + (multi * g_array._C_multi_stride) + (batch * g_array._C_batch_stride) + (m_start * g_array._ldc) + n0, g_array._ldc, + g_array._bias + (multi * g_array._bias_multi_stride) + n0, (m_end - m_start), (nmax - n0)); } @@ -208,6 +208,11 @@ class GemmHybrid : public GemmCommon { } } + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t & thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + // Interface implementation - pretransposed bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp index 7c155d1677..4d11f042e4 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp @@ -423,8 +423,8 @@ class GemmHybridIndirect : public GemmCommon { return true; } - // Execute - 
void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { + // Stateless execute + void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &, int, GemmArrays& g_array) override { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -504,9 +504,9 @@ class GemmHybridIndirect : public GemmCommon { const Troi *b_panel; if (FixedFormat) { - b_panel = reinterpret_cast(this->_Bptr) + - (multi * this->_B_multi_stride) + - ((n0 / stripe_width::get()) * this->_ldb) + + b_panel = reinterpret_cast(g_array._Bptr) + + (multi * g_array._B_multi_stride) + + ((n0 / stripe_width::get()) * g_array._ldb) + (k0 * stripe_width::get()); } else { b_panel = _B_transposed + @@ -515,7 +515,7 @@ class GemmHybridIndirect : public GemmCommon { (n0 * kern_k); } - IndirectOutputArg out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc); + IndirectOutputArg out_arg(g_array._Cptr + (multi * g_array._C_multi_stride) + (batch * g_array._C_batch_stride) + (m_start * g_array._ldc) + n0, g_array._ldc); #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); @@ -527,14 +527,14 @@ class GemmHybridIndirect : public GemmCommon { #endif strat, sections, string_lengths.data(), IndirectInputArg(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset), - (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg, - (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + (m_end - m_start), (nmax - n0), kern_k, b_panel, g_array._ldb, out_arg, + (g_array._bias && first_pass) ? g_array._bias + (multi * g_array._bias_multi_stride) + n0 : nullptr, last_pass ? 
_args._act : Activation(), !first_pass || _args._accumulate, // Quantization parameters _os, _col_bias+(multi * _args._Nsize), n0); } else if (_convolver) { - auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize); + auto conv_cols = _convolver->process_columns(g_array._Aptr + (multi * g_array._A_multi_stride) + (batch * g_array._A_batch_stride), g_array._lda, k0, kmax, _rounded_Ksize); unsigned int pos=0; auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start); @@ -560,8 +560,8 @@ class GemmHybridIndirect : public GemmCommon { #endif strat, sections, string_lengths.data(), IndirectInputArg(in_row_strings.data(), 0, first_offset), - (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg, - (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + (m_end - m_start), (nmax - n0), kern_k, b_panel, g_array._ldb, out_arg, + (g_array._bias && first_pass) ? g_array._bias + (multi * g_array._bias_multi_stride) + n0 : nullptr, last_pass ? _args._act : Activation(), !first_pass || _args._accumulate, // Quantization parameters @@ -575,9 +575,9 @@ class GemmHybridIndirect : public GemmCommon { prof, #endif strat, 1, &len, - IndirectInputArg(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda), - (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg, - (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + IndirectInputArg(g_array._Aptr + (multi * g_array._A_multi_stride) + (batch * g_array._A_batch_stride) + m_start * g_array._lda + k0, g_array._lda), + (m_end - m_start), (nmax - n0), kern_k, b_panel, g_array._ldb, out_arg, + (g_array._bias && first_pass) ? g_array._bias + (multi * g_array._bias_multi_stride) + n0 : nullptr, last_pass ? 
_args._act : Activation(), !first_pass || _args._accumulate, // Quantization parameters @@ -587,6 +587,11 @@ class GemmHybridIndirect : public GemmCommon { } } + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t & thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + // Interface implementation - pretransposed bool B_is_pretransposed() const override { return (FixedFormat == false); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp index a4ccb24dac..073012e5a7 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp @@ -171,8 +171,12 @@ class GemmHybridQuantized : public GemmCommon { return true; } - // Execute - void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override { + // Stateless execute + // TODO: Make this actually stateless. This still uses the stateful + // execution data because it requires a workspace which would also need to + // be handled statelessly. 
+ void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &, int threadid, GemmArrays &) override { + auto& g_array = this->_gemm_array; #ifdef CYCLE_PROFILING profiler prof; #endif @@ -218,7 +222,7 @@ class GemmHybridQuantized : public GemmCommon { #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); #endif - strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, + strat.kernel(g_array._Aptr + (multi * g_array._A_multi_stride) + (batch * g_array._A_batch_stride) + (m_start * g_array._lda) + k0, g_array._lda, b_panel, result_buffer, (nmax-n0), (m_end - m_start), (nmax - n0), kern_k, @@ -230,7 +234,7 @@ class GemmHybridQuantized : public GemmCommon { auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (m_end - m_start) * _Ksize); #endif compute_row_sums(_qp, _Ksize, (m_end - m_start), - this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda), this->_lda, + g_array._Aptr + (multi * g_array._A_multi_stride) + (batch * g_array._A_batch_stride) + (m_start * g_array._lda), g_array._lda, local_row_sums); } @@ -240,13 +244,18 @@ class GemmHybridQuantized : public GemmCommon { #endif requantize_block_32(_qp, (nmax - n0), (m_end - m_start), result_buffer, (nmax - n0), - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, + g_array._Cptr + (multi * g_array._C_multi_stride) + (batch * g_array._C_batch_stride) + (m_start * g_array._ldc) + n0, g_array._ldc, local_row_sums, col_bias + (multi * _Nsize) + n0, n0); } } while (p.next_dim0()); } } + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t & thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + // Working space needed for intermediate result 
buffers. size_t get_working_size() const override { return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri)); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 25570a5f69..6e1ea65890 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -832,8 +832,12 @@ class GemmInterleaved : public GemmCommon { _nthreads = std::min(nthreads, _maxthreads); } - // Execute - void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override { + // Stateless execute + // TODO: Make this actually stateless. This still uses the stateful + // execution data because it requires a workspace which would also need to + // be handled statelessly. + void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &, int threadid, GemmArrays &) override { + auto& g_array = this->_gemm_array; #ifdef CYCLE_PROFILING profiler prof; #endif @@ -886,8 +890,8 @@ class GemmInterleaved : public GemmCommon { unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll()); const Troi *b_ptr = FixedFormat ? 
- reinterpret_cast(this->_Bptr) + (multi * this->_B_multi_stride) + - ((start_x / get_stripe_width::get()) * this->_ldb) + + reinterpret_cast(g_array._Bptr) + (multi * g_array._B_multi_stride) + + ((start_x / get_stripe_width::get()) * g_array._ldb) + (k0 * get_stripe_width::get()) : _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k); @@ -912,16 +916,16 @@ class GemmInterleaved : public GemmCommon { _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); } else if (_convolver) { transforms.PrepareA_convolution(a_panel, - this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride), - this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); + g_array._Aptr + (batch * g_array._A_batch_stride) + (multi * g_array._A_multi_stride), + g_array._lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); } else { transforms.PrepareA(a_panel, - this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride), - this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier()); + g_array._Aptr + (batch * g_array._A_batch_stride) + (multi * g_array._A_multi_stride), + g_array._lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier()); } } - Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride); + Tr *result_ptr = g_array._Cptr + (batch * g_array._C_batch_stride) + (multi * g_array._C_multi_stride); // If we are using an accumulation buffer and this isn't the last pass, don't pass a result pointer. 
if (_accumulation_buffer && !last_pass) { @@ -934,13 +938,13 @@ class GemmInterleaved : public GemmCommon { prof, #endif // Strategy and panel pointers - strat, a_panel, b_ptr, this->_ldb, c_panel, + strat, a_panel, b_ptr, g_array._ldb, c_panel, // Result buffer pointers - result_ptr, this->_ldc, + result_ptr, g_array._ldc, // K size, and M/N ranges kern_k, start_row, end_row, start_x, end_x, // Only do bias on the first pass - ((bias_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr), + ((bias_pass && g_array._bias) ? g_array._bias + (multi * g_array._bias_multi_stride) : nullptr), // Only do activation on the last pass, and accumulation on any non-first pass. (last_pass ? _act : Activation()), (!first_pass || _accumulate), // Pass in quantization parameters for requantizing kernels (others will ignore) @@ -1009,12 +1013,12 @@ class GemmInterleaved : public GemmCommon { _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); } else if (_convolver) { transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), - this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), - this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); + g_array._Aptr + (batch * g_array._A_batch_stride) + (current.multi() * g_array._A_multi_stride), + g_array._lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); } else { transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), - this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), - this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier()); + g_array._Aptr + (batch * g_array._A_batch_stride) + (current.multi() * g_array._A_multi_stride), + g_array._lda, first_m, last_m, current.k0(), std::min(_Ksize, 
current.kmax()), row_sum_multiplier()); } } @@ -1034,8 +1038,8 @@ class GemmInterleaved : public GemmCommon { // For FixedFormat cases, figure out the B pointer. The loop below moves through batches and vertically through the output so this will be the same throughout. if (FixedFormat) { - b_panel = reinterpret_cast(this->_Bptr) + (current.multi() * this->_B_multi_stride) + - ((current.x0() / get_stripe_width::get()) * this->_ldb) + + b_panel = reinterpret_cast(g_array._Bptr) + (current.multi() * g_array._B_multi_stride) + + ((current.x0() / get_stripe_width::get()) * g_array._ldb) + (current.k0() * get_stripe_width::get()); } @@ -1071,7 +1075,7 @@ class GemmInterleaved : public GemmCommon { const bool bias_pass = (std::is_same::value && !MergeStep) ? last_pass : first_pass; // Pointer to appropriate part of result array. - Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride); + Tr *result_ptr = g_array._Cptr + (batch * g_array._C_batch_stride) + (current.multi() * g_array._C_multi_stride); // If we are using an accumulation buffer, we don't pass the result buffer to ask the kernel // to write things into the accumulation buffer instead, except on the last pass. @@ -1085,13 +1089,13 @@ class GemmInterleaved : public GemmCommon { prof, #endif // Strategy and panel pointers - strat, a_ptr, b_panel, this->_ldb, c_panel, + strat, a_ptr, b_panel, g_array._ldb, c_panel, // Result buffer pointers - result_ptr, this->_ldc, + result_ptr, g_array._ldc, // K size, and M/N ranges kern_k, y, ymax, current.x0(), current.xmax(), // Only do bias on the first pass - ((bias_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), + ((bias_pass && g_array._bias) ? g_array._bias + (current.multi() * g_array._bias_multi_stride) : nullptr), // Only do activation on the last pass, and accumulation on any non-first pass. (last_pass ? 
_act : Activation()), (!first_pass || _accumulate), // Pass in quantization parameters for requantizing kernels (others will ignore) @@ -1110,6 +1114,11 @@ class GemmInterleaved : public GemmCommon { } } + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t & thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + // Interface implementation - working space size_t get_working_size() const override { // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer. diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index aa03fb6aa1..0ba7b7870e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -64,10 +64,17 @@ class GemvBatched : public GemmCommon { _subgemm->set_nthreads(nthreads); } - void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { + // TODO: Make this actually stateless. This still uses the stateful + // execution data because it requires a workspace which would also need to + // be handled statelessly. 
+ void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid, GemmArrays &) override { _subgemm->execute(work_range, thread_locator, threadid); } + void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + size_t get_working_size() const override { return _subgemm->get_working_size(); } diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 0acad28998..08c4192526 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -139,8 +139,8 @@ class GemvPretransposed : public GemmCommon { return { iceildiv(_args._Nsize, strategy::out_width()) * _args._nmulti }; } - // Actually execute the GEMV. - void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { + // Use the stateless interface to execute the GEMV. + void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &, int, GemmArrays& g_array) override { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -175,11 +175,11 @@ class GemvPretransposed : public GemmCommon { #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n)); #endif - run_gemv_kernel::run(strat, this->_Aptr + (multi * this->_A_multi_stride) + k0, + run_gemv_kernel::run(strat, g_array._Aptr + (multi * g_array._A_multi_stride) + k0, _B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()), - this->_Cptr + (multi * this->_C_multi_stride) + n, + g_array._Cptr + (multi * g_array._C_multi_stride) + n, (nmax - n), (kmax-k0), - this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr, + g_array._bias ? 
g_array._bias + (multi * g_array._bias_multi_stride) + n : nullptr, _args._act, (k0 != 0) || _args._accumulate, _os, col_bias, n + (_args._Nsize * multi)); } @@ -187,6 +187,11 @@ class GemvPretransposed : public GemmCommon { } } + // Actually execute the GEMV. + void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + /* Pretransposed interface implementation */ bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp index 4d23660942..6a3e182e04 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,29 +73,25 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 91f\n" + "bge 88f\n" "cmp %x[M], #0x2\n" - "bgt 61f\n" - "beq 31f\n" - "mov x16, %x[col_bias]\n" + "bgt 59f\n" + "beq 30f\n" "movi v11.4s, #0x0\n" - "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "movi v15.16b, #0x1\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "3:" // Height 1: setup done "mov x12, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ 
-117,104 +112,104 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "blt 11f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr d21, [x14, #0x70]\n" - "ldr x20, [x14, #0x78]\n" + "ldr d4, [x15, #0x70]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr d20, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr d26, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr d25, [x14, #0xa0]\n" - "mov v21.d[1], x20\n" - "ldr x20, [x14, #0x88]\n" + "ldr d7, [x15, #0xa0]\n" + "mov v4.d[1], x20\n" + "ldr x20, [x15, #0x88]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr d24, [x14, #0xb0]\n" + "ldr d8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr d23, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr d22, [x14, #0xd0]\n" - ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" - "ldr d21, [x14, #0xe0]\n" - "mov v20.d[1], x20\n" - "ldr x22, [x14, #0x98]\n" + "ldr d10, [x15, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr d4, [x15, #0xe0]\n" + "mov v5.d[1], x20\n" + "ldr x22, [x15, #0x98]\n" "add x10, x10, #0x10\n" - "ldr x21, [x14, #0xa8]\n" - ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" - "ldr d20, [x14, #0xf0]\n" - "ldr x20, [x14, #0xb8]\n" - "mov v26.d[1], x22\n" - "mov v25.d[1], x21\n" - "ldr x23, [x14, #0xc8]\n" - "ldr x22, 
[x14, #0xd8]\n" - ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" - "mov v24.d[1], x20\n" - "ldr x21, [x14, #0xe8]\n" - "ldr x20, [x14, #0xf8]\n" - ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" - "mov v23.d[1], x23\n" - "mov v22.d[1], x22\n" - "add x14, x14, #0x100\n" - "mov v21.d[1], x21\n" - ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" - "mov v20.d[1], x20\n" - ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" + "ldr x21, [x15, #0xa8]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr d5, [x15, #0xf0]\n" + "ldr x20, [x15, #0xb8]\n" + "mov v6.d[1], x22\n" + "mov v7.d[1], x21\n" + "ldr x23, [x15, #0xc8]\n" + "ldr x22, [x15, #0xd8]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "mov v8.d[1], x20\n" + "ldr x21, [x15, #0xe8]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "mov v9.d[1], x23\n" + "mov v10.d[1], x22\n" + "add x15, x15, #0x100\n" + "mov v4.d[1], x21\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + "mov v5.d[1], x20\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" - "ldr q4, [x14, #0x0]\n" + "ldr q4, [x15, #0x0]\n" "cmp x11, #0x20\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, 
#0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x14, #0xd0]\n" - ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x14, #0xe0]\n" - ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x14, #0xf0]\n" - ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" + "ldr q10, [x15, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x15, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x15, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" "sub x11, x11, #0x10\n" - ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" "add x10, x10, #0x10\n" - ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, 
v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum @@ -228,17 +223,17 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q23, [x14, #0x0]\n" + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q22, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q21, [x14, #0x20]\n" - "ldr q20, [x14, #0x30]\n" - ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x11, 18f\n" @@ -253,15 +248,15 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q23, [x14, #0x0]\n" - "ldr q22, [x14, #0x10]\n" - "ldr q21, [x14, #0x20]\n" - "ldr q20, [x14, #0x30]\n" - ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 
0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" @@ -271,136 +266,122 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "tbnz %x[flags], #31, 19f\n" "addp v11.4s, v11.4s, v11.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v20.4s }, [x20]\n" - "neg v20.4s, v20.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v20.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q24, [x16, #0x0]\n" + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q23, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q22, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q21, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v24.4s\n" - "add v17.4s, v17.4s, v23.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x21]\n" + "add v16.4s, v16.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "add x16, x16, #0x40\n" - "sqrdmulh v16.4s, v16.4s, v20.4s\n" - "sqrdmulh v17.4s, v17.4s, v20.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "sqrdmulh v19.4s, v19.4s, v20.4s\n" - "tbz %x[flags], #5, 20f\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v22.16b, v17.16b, v0.16b\n" - "and v21.16b, v18.16b, v0.16b\n" - "and v20.16b, v19.16b, v0.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, 
#0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "20:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add x21, %x[qp], %[c_offset]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "cmp x16, #0x10\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" + "add x14, x14, #0x40\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x20]\n" - "add x20, %x[qp], %[minval]\n" - "ld1r { v21.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 29f\n" - "tbz x15, #3, 24f\n" + "bge 28f\n" + "tbz 
x16, #3, 23f\n" "str d16, [x13], #0x8\n" - "tbz x15, #2, 22f\n" + "tbz x16, #2, 21f\n" "st1 { v16.s }[2], [x13], #0x4\n" - "tbz x15, #1, 21f\n" + "tbz x16, #1, 20f\n" "st1 { v16.h }[6], [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[14], [x13]\n" - "b 28f\n" - "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 28f\n" + "b 27f\n" + "20:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 27f\n" "st1 { v16.b }[12], [x13]\n" - "b 28f\n" - "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 23f\n" + "b 27f\n" + "21:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 22f\n" "st1 { v16.h }[4], [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[10], [x13]\n" - "b 28f\n" - "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 28f\n" + "b 27f\n" + "22:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 27f\n" "st1 { v16.b }[8], [x13]\n" - "b 28f\n" - "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, #2, 26f\n" + "b 27f\n" + "23:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 25f\n" "str s16, [x13], #0x4\n" - "tbz x15, #1, 25f\n" + "tbz x16, #1, 24f\n" "st1 { v16.h }[2], [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[6], [x13]\n" - "b 28f\n" - "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 28f\n" + "b 27f\n" + "24:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 27f\n" "st1 { v16.b }[4], [x13]\n" - "b 28f\n" - "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 27f\n" + "b 27f\n" + "25:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 26f\n" "str h16, [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[2], [x13]\n" - "b 28f\n" - "27:" // Height 1: Partial direct writeback: partial_1_0 + "b 27f\n" + "26:" // Height 1: Partial direct 
writeback: partial_1_0 "str b16, [x13, #0x0]\n" - "28:" // Height 1: Partial direct writeback: Done - "b 30f\n" - "29:" // Height 1: Full writeback + "27:" // Height 1: Partial direct writeback: Done + "b 29f\n" + "28:" // Height 1: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" - "30:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" + "29:" // Height 1: Writeback done + "subs x16, x16, #0x10\n" "bgt 2b\n" - "b 122f\n" - "31:" // Height 2 - "mov x16, %x[col_bias]\n" + "b 118f\n" + "30:" // Height 2 "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "movi v12.4s, #0x0\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "32:" // Height 2: Column loop + "31:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -409,434 +390,407 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "33:" // Height 2: setup done "mov x12, #0x0\n" - "34:" // Height 2: String loop + "33:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 35f\n" + "tbz %x[flags], #3, 34f\n" "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x10, [x20, #0x0]\n" "ldr x9, [x20, #0x8]\n" - "cbnz x12, 36f\n" + "cbnz x12, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x10, x10, x20\n" "add x9, x9, x20\n" - "b 36f\n" - "35:" // Height 2: setup direct input + "b 35f\n" + "34:" // Height 2: setup direct input "mov x10, %x[input_ptr]\n" "add x9, x10, x21\n" - "36:" // Height 2: input setup done + "35:" // Height 2: input setup done 
"cmp x11, #0x10\n" - "blt 41f\n" + "blt 40f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" "ldr q1, [x9, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" - "blt 39f\n" - "37:" // Height 2: Multiply loop: Main loop head + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" + "blt 38f\n" + "36:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x20, [x14, #0x78]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr d25, [x14, #0x70]\n" + "ldr d4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr x23, [x14, #0x88]\n" + "ldr x23, [x15, #0x88]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr d24, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "mov v25.d[1], x20\n" + "mov v4.d[1], x20\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr d30, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr x22, [x14, #0x98]\n" + "ldr x22, [x15, #0x98]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr d29, [x14, #0xa0]\n" - "ldr x21, [x14, #0xa8]\n" + "ldr d7, [x15, #0xa0]\n" + "ldr x21, [x15, #0xa8]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr d28, [x14, #0xb0]\n" - "ldr x20, [x14, #0xb8]\n" + "ldr d8, [x15, #0xb0]\n" + "ldr x20, [x15, #0xb8]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr d27, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "mov v24.d[1], 
x23\n" + "mov v5.d[1], x23\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr d26, [x14, #0xd0]\n" - ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" - "mov v30.d[1], x22\n" - ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" - "ldr d25, [x14, #0xe0]\n" - "mov v29.d[1], x21\n" - "ldr x23, [x14, #0xc8]\n" - "mov v28.d[1], x20\n" - "ldr x22, [x14, #0xd8]\n" - "ldr x21, [x14, #0xe8]\n" - ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" - "ldr d24, [x14, #0xf0]\n" - "ldr x20, [x14, #0xf8]\n" - ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" - "mov v27.d[1], x23\n" - ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" - "mov v26.d[1], x22\n" - ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" - "mov v25.d[1], x21\n" - ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" - "mov v24.d[1], x20\n" - ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" + "ldr d10, [x15, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "mov v6.d[1], x22\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr d4, [x15, #0xe0]\n" + "mov v7.d[1], x21\n" + "ldr x23, [x15, #0xc8]\n" + "mov v8.d[1], x20\n" + "ldr x22, [x15, #0xd8]\n" + "ldr x21, [x15, #0xe8]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr d5, [x15, #0xf0]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + "mov v9.d[1], x23\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + "mov v10.d[1], x22\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + "mov v4.d[1], x21\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "mov v5.d[1], x20\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" "add x10, x10, #0x10\n" "add x9, x9, #0x10\n" - "add x14, x14, 
#0x100\n" - ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 38f\n" + "add x15, x15, #0x100\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 37f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "38:" // Height 2: Multiply loop: unique 5: skip row sum + "37:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" "ldr q1, [x9, #0x0]\n" "cmp x11, #0x20\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" - "bge 37b\n" - "39:" // Height 2: Multiply loop: Single iteration only + "bge 36b\n" + "38:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" "sub x11, x11, #0x10\n" ".inst 
0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x14, #0xd0]\n" - ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x14, #0xe0]\n" - ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x14, #0xf0]\n" - ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 
0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q10, [x15, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x15, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x15, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 6: skip row sum + "39:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" - "41:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 48f\n" + "40:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 47f\n" "cmp x11, #0x4\n" - "blt 44f\n" - "42:" // Height 2: Multiply loop: Odd block loop + "blt 43f\n" + "41:" // Height 2: Multiply loop: Odd block loop "ldr s0, [x10], 
#0x4\n" "ldr s1, [x9], #0x4\n" - "tbnz %x[flags], #31, 43f\n" + "tbnz %x[flags], #31, 42f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q27, [x14, #0x0]\n" + "42:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q26, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" - ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" - "bge 42b\n" - "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x11, 48f\n" - "tbz x11, #1, 45f\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "bge 41b\n" + "43:" // Height 2: Multiply loop: Skip odd blocks + "cbz x11, 47f\n" + "tbz x11, #1, 44f\n" "ldr h0, [x10], #0x2\n" "ldr h1, [x9], #0x2\n" - "tbz x11, #0, 46f\n" + "tbz x11, #0, 45f\n" "ld1 { v0.b }[2], [x10]\n" "ld1 { v1.b }[2], [x9]\n" - "b 46f\n" - "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 45f\n" + "44:" // 
Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x10, #0x0]\n" "ldr b1, [x9, #0x0]\n" - "46:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 47f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 46f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q27, [x14, #0x0]\n" - "ldr q26, [x14, #0x10]\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" - ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" - "48:" // Height 2: Multiply loop: No odd multiplies + "46:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + "47:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "bne 34b\n" + "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x13, 
#0x0]\n" "add x24, x13, x20\n" "prfm pstl1keep, [x24, #0x0]\n" - "tbnz %x[flags], #31, 49f\n" + "tbnz %x[flags], #31, 48f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "49:" // Height 2: skip row sum fixup - "ldr q28, [x16, #0x0]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "48:" // Height 2: skip row sum fixup + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q27, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q26, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q25, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v26.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1r { v24.4s }, [x20]\n" - "add v19.4s, v19.4s, v25.4s\n" - "add v20.4s, v20.4s, v28.4s\n" - "add v21.4s, v21.4s, v27.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v25.4s\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "ld1r { v4.4s }, [x20]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "ld1r { v0.4s }, [x20]\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh 
v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "add x16, x16, #0x40\n" - "tbz %x[flags], #5, 50f\n" - "and v24.16b, v16.16b, v0.16b\n" - "and v30.16b, v17.16b, v0.16b\n" - "and v29.16b, v18.16b, v0.16b\n" - "and v28.16b, v19.16b, v0.16b\n" - "and v27.16b, v20.16b, v0.16b\n" - "and v26.16b, v21.16b, v0.16b\n" - "and v25.16b, v22.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v24.4s\n" - "and v24.16b, v23.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "50:" // Height 2: no shift correction + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x20]\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x20]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, 
v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v25.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v24.4s }, [x20]\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "ld1r { v5.4s }, [x20]\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "cmp x16, #0x10\n" + "smax v19.4s, v19.4s, v5.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "add x14, x14, #0x40\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 59f\n" - "tbz x15, #3, 54f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" 
+ "bge 57f\n" + "tbz x16, #3, 52f\n" "str d16, [x13], #0x8\n" "str d20, [x24], #0x8\n" - "tbz x15, #2, 52f\n" + "tbz x16, #2, 50f\n" "st1 { v16.s }[2], [x13], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" - "tbz x15, #1, 51f\n" + "tbz x16, #1, 49f\n" "st1 { v16.h }[6], [x13], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[14], [x13]\n" "st1 { v20.b }[14], [x24]\n" - "b 58f\n" - "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 58f\n" + "b 56f\n" + "49:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 56f\n" "st1 { v16.b }[12], [x13]\n" "st1 { v20.b }[12], [x24]\n" - "b 58f\n" - "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 53f\n" + "b 56f\n" + "50:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 51f\n" "st1 { v16.h }[4], [x13], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[10], [x13]\n" "st1 { v20.b }[10], [x24]\n" - "b 58f\n" - "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 58f\n" + "b 56f\n" + "51:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 56f\n" "st1 { v16.b }[8], [x13]\n" "st1 { v20.b }[8], [x24]\n" - "b 58f\n" - "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 56f\n" + "b 56f\n" + "52:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 54f\n" "str s16, [x13], #0x4\n" "str s20, [x24], #0x4\n" - "tbz x15, #1, 55f\n" + "tbz x16, #1, 53f\n" "st1 { v16.h }[2], [x13], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[6], [x13]\n" "st1 { v20.b }[6], [x24]\n" - "b 58f\n" - "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 58f\n" + "b 56f\n" + "53:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 56f\n" "st1 { v16.b }[4], [x13]\n" "st1 { v20.b }[4], [x24]\n" - "b 58f\n" - "56:" // Height 
2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 57f\n" + "b 56f\n" + "54:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 55f\n" "str h16, [x13], #0x2\n" "str h20, [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[2], [x13]\n" "st1 { v20.b }[2], [x24]\n" - "b 58f\n" - "57:" // Height 2: Partial direct writeback: partial_1_0 + "b 56f\n" + "55:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" "str b20, [x24, #0x0]\n" - "58:" // Height 2: Partial direct writeback: Done - "b 60f\n" - "59:" // Height 2: Full writeback + "56:" // Height 2: Partial direct writeback: Done + "b 58f\n" + "57:" // Height 2: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" "str q20, [x24, #0x0]\n" - "60:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" - "bgt 32b\n" - "b 122f\n" - "61:" // Height 3 - "mov x16, %x[col_bias]\n" + "58:" // Height 2: Writeback done + "subs x16, x16, #0x10\n" + "bgt 31b\n" + "b 118f\n" + "59:" // Height 3 "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "movi v12.4s, #0x0\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v15.16b, #0x1\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "62:" // Height 3: Column loop + "60:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -849,318 +803,317 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "63:" // Height 3: setup done "mov x12, #0x0\n" - "64:" // Height 3: String loop + "62:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 65f\n" + 
"tbz %x[flags], #3, 63f\n" "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x10, [x20, #0x0]\n" "ldr x9, [x20, #0x8]\n" "ldr x28, [x20, #0x10]\n" - "cbnz x12, 66f\n" + "cbnz x12, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x10, x10, x20\n" "add x9, x9, x20\n" "add x28, x28, x20\n" - "b 66f\n" - "65:" // Height 3: setup direct input + "b 64f\n" + "63:" // Height 3: setup direct input "mov x10, %x[input_ptr]\n" "add x9, x10, x21\n" "add x28, x9, x21\n" - "66:" // Height 3: input setup done + "64:" // Height 3: input setup done "cmp x11, #0x10\n" - "blt 71f\n" + "blt 69f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" "ldr q1, [x9, #0x0]\n" "ldr q2, [x28, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" - "blt 69f\n" - "67:" // Height 3: Multiply loop: Main loop head + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" + "blt 67f\n" + "65:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x20, [x14, #0x78]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x23, [x14, #0x88]\n" + "ldr x23, [x15, #0x88]\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr d29, [x14, #0x70]\n" + "ldr d4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr x22, [x14, #0x98]\n" + "ldr x22, [x15, #0x98]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x21, [x14, #0xa8]\n" + "ldr x21, [x15, #0xa8]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr d28, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "mov v29.d[1], x20\n" + "mov v4.d[1], x20\n" ".inst 
0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x20, [x14, #0xb8]\n" + "ldr x20, [x15, #0xb8]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr d5, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "mov v28.d[1], x23\n" + "mov v5.d[1], x23\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr x23, [x14, #0xc8]\n" + "ldr x23, [x15, #0xc8]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr d4, [x14, #0xa0]\n" + "ldr d7, [x15, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "mov v5.d[1], x22\n" + "mov v6.d[1], x22\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr x22, [x14, #0xd8]\n" + "ldr x22, [x15, #0xd8]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr d3, [x14, #0xb0]\n" + "ldr d8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "mov v4.d[1], x21\n" + "mov v7.d[1], x21\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr x21, [x14, #0xe8]\n" + "ldr x21, [x15, #0xe8]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr d31, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "mov v3.d[1], x20\n" + "mov v8.d[1], x20\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr x20, [x14, #0xf8]\n" + "ldr x20, [x15, #0xf8]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr d30, [x14, #0xd0]\n" - ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" - "mov v31.d[1], x23\n" - ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" + "ldr d10, [x15, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "mov v9.d[1], x23\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" "add x10, x10, #0x10\n" - ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" - "ldr d29, [x14, #0xe0]\n" - ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" - "mov v30.d[1], x22\n" - ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, 
v1.4b[2]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr d4, [x15, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "mov v10.d[1], x22\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" "add x9, x9, #0x10\n" - ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" - "ldr d28, [x14, #0xf0]\n" - ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" - "mov v29.d[1], x21\n" - ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr d5, [x15, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "mov v4.d[1], x21\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" "add x28, x28, #0x10\n" - ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" - "mov v28.d[1], x20\n" - ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 68f\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + "mov v5.d[1], x20\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, 
v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 66f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "68:" // Height 3: Multiply loop: unique 9: skip row sum + "66:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" "ldr q1, [x9, #0x0]\n" "cmp x11, #0x20\n" "ldr q2, [x28, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" - "bge 67b\n" - "69:" // Height 3: Multiply loop: Single iteration only + "bge 65b\n" + "67:" // Height 3: 
Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" "sub x11, x11, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" "add x28, x28, #0x10\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x14, #0xd0]\n" - ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x14, #0xe0]\n" - ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" - "ldr 
q28, [x14, #0xf0]\n" - ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 70f\n" + "ldr q10, [x15, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x15, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x15, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + 
".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "70:" // Height 3: Multiply loop: unique 10: skip row sum + "68:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" - "71:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 78f\n" + "69:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 76f\n" "cmp x11, #0x4\n" - "blt 74f\n" - "72:" // Height 3: Multiply loop: Odd block loop + "blt 72f\n" + "70:" // Height 3: Multiply loop: Odd block loop "ldr s0, [x10], #0x4\n" "ldr s1, [x9], #0x4\n" "ldr s2, [x28], #0x4\n" - "tbnz %x[flags], #31, 73f\n" + "tbnz %x[flags], #31, 71f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q31, [x14, #0x0]\n" + "71:" // Height 3: 
Multiply loop: unique 11: skip row sum + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q30, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q29, [x14, #0x20]\n" - "ldr q28, [x14, #0x30]\n" - ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" - "bge 72b\n" - "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x11, 78f\n" - "tbz x11, #1, 75f\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Skip odd blocks + "cbz x11, 76f\n" + "tbz x11, #1, 73f\n" "ldr h0, [x10], #0x2\n" "ldr h1, [x9], #0x2\n" "ldr h2, [x28], #0x2\n" - "tbz x11, #0, 76f\n" + "tbz 
x11, #0, 74f\n" "ld1 { v0.b }[2], [x10]\n" "ld1 { v1.b }[2], [x9]\n" "ld1 { v2.b }[2], [x28]\n" - "b 76f\n" - "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 74f\n" + "73:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x10, #0x0]\n" "ldr b1, [x9, #0x0]\n" "ldr b2, [x28, #0x0]\n" - "76:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 77f\n" + "74:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 75f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q31, [x14, #0x0]\n" - "ldr q30, [x14, #0x10]\n" - "ldr q29, [x14, #0x20]\n" - "ldr q28, [x14, #0x30]\n" - ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" - "78:" // Height 3: Multiply loop: No odd multiplies + "75:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f80e091 // 
sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + "76:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "bne 64b\n" + "bne 62b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x13, #0x0]\n" "add x24, x13, x20\n" "add x23, x24, x20\n" "prfm pstl1keep, [x24, #0x0]\n" "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 79f\n" + "tbnz %x[flags], #31, 77f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v28.4s }, [x20]\n" - "neg v28.4s, v28.4s\n" + "ld1r { v3.4s }, [x20]\n" + "neg v3.4s, v3.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v28.4s\n" - "mul v12.4s, v12.4s, v28.4s\n" - "mul v13.4s, v13.4s, v28.4s\n" - "79:" // Height 3: skip row sum fixup - "ldr q31, [x16, #0x0]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "77:" // Height 3: skip row sum fixup + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q30, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q29, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q28, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1170,74 +1123,40 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "add v25.4s, 
v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" - "add v16.4s, v16.4s, v31.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" - "add v20.4s, v20.4s, v31.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v24.4s, v24.4s, v31.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v28.4s }, [x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x21]\n" + "add x21, %x[qp], %[c_offset]\n" "ld1r { v0.4s }, [x20]\n" - "add x16, x16, #0x40\n" - "sqrdmulh v16.4s, v16.4s, v28.4s\n" - "sqrdmulh v17.4s, v17.4s, v28.4s\n" - "sqrdmulh v18.4s, v18.4s, v28.4s\n" - "sqrdmulh v19.4s, v19.4s, v28.4s\n" - "sqrdmulh v20.4s, v20.4s, v28.4s\n" - "sqrdmulh v21.4s, v21.4s, v28.4s\n" - "sqrdmulh v22.4s, v22.4s, v28.4s\n" - "sqrdmulh v23.4s, v23.4s, v28.4s\n" - "sqrdmulh v24.4s, v24.4s, v28.4s\n" - "sqrdmulh v25.4s, v25.4s, v28.4s\n" - "sqrdmulh v26.4s, v26.4s, v28.4s\n" - "sqrdmulh v27.4s, v27.4s, v28.4s\n" - "tbz %x[flags], #5, 80f\n" - "and v1.16b, v16.16b, v0.16b\n" - "and v31.16b, v17.16b, v0.16b\n" - "and v30.16b, v18.16b, v0.16b\n" - "and v29.16b, v19.16b, v0.16b\n" - "and v28.16b, v20.16b, v0.16b\n" - "and v3.16b, v21.16b, v0.16b\n" - "and v2.16b, v22.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - 
"sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "sqadd v18.4s, v18.4s, v30.4s\n" - "sqadd v19.4s, v19.4s, v29.4s\n" - "sqadd v20.4s, v20.4s, v28.4s\n" - "and v1.16b, v23.16b, v0.16b\n" - "and v31.16b, v24.16b, v0.16b\n" - "and v30.16b, v25.16b, v0.16b\n" - "and v29.16b, v26.16b, v0.16b\n" - "and v28.16b, v27.16b, v0.16b\n" - "sqadd v21.4s, v21.4s, v3.4s\n" - "sqadd v22.4s, v22.4s, v2.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "80:" // Height 3: no shift correction + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" @@ -1250,159 +1169,155 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x20]\n" - "add x20, %x[qp], %[minval]\n" - "ld1r { v29.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v28.4s }, [x20]\n" - "add v16.4s, v16.4s, v30.4s\n" - 
"add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v30.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v16.4s, v16.4s, v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - "smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax 
v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "cmp x16, #0x10\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "orr %x[flags], %x[flags], #0x80000000\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 89f\n" - "tbz x15, #3, 84f\n" + "add x14, x14, #0x40\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 86f\n" + "tbz x16, #3, 81f\n" "str d16, [x13], #0x8\n" "str d20, [x24], #0x8\n" "str d24, [x23], #0x8\n" - "tbz x15, #2, 82f\n" + "tbz x16, #2, 79f\n" "st1 { v16.s }[2], [x13], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x15, #1, 81f\n" + "tbz x16, #1, 78f\n" "st1 { v16.h }[6], [x13], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[14], [x13]\n" "st1 { v20.b }[14], [x24]\n" "st1 { v24.b }[14], [x23]\n" - "b 88f\n" - "81:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 88f\n" + "b 85f\n" + "78:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 85f\n" "st1 { v16.b }[12], [x13]\n" "st1 { v20.b }[12], [x24]\n" "st1 { v24.b }[12], [x23]\n" - "b 88f\n" - "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 83f\n" + "b 85f\n" + "79:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, 
#1, 80f\n" "st1 { v16.h }[4], [x13], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[10], [x13]\n" "st1 { v20.b }[10], [x24]\n" "st1 { v24.b }[10], [x23]\n" - "b 88f\n" - "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 88f\n" + "b 85f\n" + "80:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 85f\n" "st1 { v16.b }[8], [x13]\n" "st1 { v20.b }[8], [x24]\n" "st1 { v24.b }[8], [x23]\n" - "b 88f\n" - "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 86f\n" + "b 85f\n" + "81:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 83f\n" "str s16, [x13], #0x4\n" "str s20, [x24], #0x4\n" "str s24, [x23], #0x4\n" - "tbz x15, #1, 85f\n" + "tbz x16, #1, 82f\n" "st1 { v16.h }[2], [x13], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[6], [x13]\n" "st1 { v20.b }[6], [x24]\n" "st1 { v24.b }[6], [x23]\n" - "b 88f\n" - "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 88f\n" + "b 85f\n" + "82:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 85f\n" "st1 { v16.b }[4], [x13]\n" "st1 { v20.b }[4], [x24]\n" "st1 { v24.b }[4], [x23]\n" - "b 88f\n" - "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 87f\n" + "b 85f\n" + "83:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 84f\n" "str h16, [x13], #0x2\n" "str h20, [x24], #0x2\n" "str h24, [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[2], [x13]\n" "st1 { v20.b }[2], [x24]\n" "st1 { v24.b }[2], [x23]\n" - "b 88f\n" - "87:" // Height 3: Partial direct writeback: partial_1_0 + "b 85f\n" + "84:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" "str b20, [x24, #0x0]\n" "str b24, [x23, #0x0]\n" - "88:" // Height 3: Partial direct writeback: Done - "b 
90f\n" - "89:" // Height 3: Full writeback + "85:" // Height 3: Partial direct writeback: Done + "b 87f\n" + "86:" // Height 3: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" "str q20, [x24, #0x0]\n" "str q24, [x23, #0x0]\n" - "90:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" - "bgt 62b\n" - "b 122f\n" - "91:" // Height 4 + "87:" // Height 3: Writeback done + "subs x16, x16, #0x10\n" + "bgt 60b\n" + "b 118f\n" + "88:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "mov x20, #0x4\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "mov x16, %x[col_bias]\n" "movi v11.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" "movi v12.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "madd x20, x21, x20, x13\n" + "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "madd x20, x21, x20, x13\n" "movi v14.4s, #0x0\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v15.16b, #0x1\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "92:" // Height 4: Column loop + "89:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1419,80 +1334,79 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "93:" // Height 4: setup done "mov x12, #0x0\n" - "94:" // Height 4: String loop + "91:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 95f\n" + "tbz %x[flags], #3, 92f\n" "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x10, [x20, #0x0]\n" "ldr x9, [x20, #0x8]\n" "ldr x28, [x20, #0x10]\n" "ldr x27, [x20, #0x18]\n" - "cbnz x12, 96f\n" + "cbnz x12, 93f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add 
x10, x10, x20\n" "add x9, x9, x20\n" "add x28, x28, x20\n" "add x27, x27, x20\n" - "b 96f\n" - "95:" // Height 4: setup direct input + "b 93f\n" + "92:" // Height 4: setup direct input "mov x10, %x[input_ptr]\n" "add x9, x10, x21\n" "add x28, x9, x21\n" "add x27, x28, x21\n" - "96:" // Height 4: input setup done + "93:" // Height 4: input setup done "cmp x11, #0x10\n" - "blt 101f\n" + "blt 98f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" "ldr q1, [x9, #0x0]\n" "ldr q2, [x28, #0x0]\n" "ldr q3, [x27, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" - "blt 99f\n" - "97:" // Height 4: Multiply loop: Main loop head + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" + "blt 96f\n" + "94:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x21, [x14, #0x78]\n" + "ldr x21, [x15, #0x78]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x20, [x14, #0x88]\n" + "ldr x20, [x15, #0x88]\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr x26, [x14, #0x98]\n" + "ldr x26, [x15, #0x98]\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr d4, [x14, #0x70]\n" + "ldr d4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr x25, [x14, #0xa8]\n" + "ldr x25, [x15, #0xa8]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x24, [x14, #0xb8]\n" + "ldr x24, [x15, #0xb8]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" "mov v4.d[1], x21\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr d5, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr x23, [x14, #0xc8]\n" + "ldr x23, [x15, #0xc8]\n" ".inst 0x4f81e0d6 // sdot 
v22.4s, v6.16b, v1.4b[0]\n" - "ldr x22, [x14, #0xd8]\n" + "ldr x22, [x15, #0xd8]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" "mov v5.d[1], x20\n" ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr x21, [x14, #0xe8]\n" + "ldr x21, [x15, #0xe8]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x14, #0xf8]\n" + "ldr x20, [x15, #0xf8]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" "mov v6.d[1], x26\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x14, #0xa0]\n" + "ldr d7, [x15, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" "add x10, x10, #0x10\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" @@ -1500,7 +1414,7 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" "mov v7.d[1], x25\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr d8, [x14, #0xb0]\n" + "ldr d8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" "add x28, x28, #0x10\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" @@ -1508,27 +1422,27 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" "mov v8.d[1], x24\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr d9, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" "mov v9.d[1], x23\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr d10, [x14, #0xd0]\n" + "ldr d10, [x15, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" "mov v10.d[1], x22\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr d4, [x14, #0xe0]\n" + "ldr d4, [x15, 
#0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" "mov v4.d[1], x21\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr d5, [x14, #0xf0]\n" + "ldr d5, [x15, #0xf0]\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" + "add x15, x15, #0x100\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" "mov v5.d[1], x20\n" @@ -1557,31 +1471,31 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 98f\n" + "tbnz %x[flags], #31, 95f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "98:" // Height 4: Multiply loop: unique 13: skip row sum + "95:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" "ldr q1, [x9, #0x0]\n" "cmp x11, #0x20\n" "ldr q2, [x28, #0x0]\n" "ldr q3, [x27, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" "prfm pldl1keep, [x27, #0x80]\n" - "bge 97b\n" - "99:" // Height 4: Multiply loop: Single iteration only + "bge 94b\n" + "96:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" 
"sub x11, x11, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" @@ -1589,51 +1503,51 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x28, x28, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" "add x27, x27, #0x10\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x14, #0xd0]\n" + "ldr q10, [x15, #0xd0]\n" 
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x14, #0xe0]\n" + "ldr q4, [x15, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x14, #0xf0]\n" + "ldr q5, [x15, #0xf0]\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" + "add x15, x15, #0x100\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" @@ -1661,106 +1575,106 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 100f\n" + "tbnz %x[flags], #31, 97f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "100:" // Height 4: Multiply loop: unique 14: skip row sum + "97:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" "prfm pldl1keep, [x27, #0x80]\n" - "101:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 108f\n" + "98:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 105f\n" "cmp x11, #0x4\n" - "blt 104f\n" - "102:" // Height 4: Multiply loop: Odd block loop + "blt 101f\n" + "99:" // Height 4: Multiply loop: Odd block loop "ldr s0, [x10], #0x4\n" "ldr s1, [x9], #0x4\n" "ldr s2, [x28], #0x4\n" "ldr s3, [x27], #0x4\n" - "tbnz %x[flags], #31, 103f\n" + "tbnz 
%x[flags], #31, 100f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q7, [x14, #0x0]\n" + "100:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q6, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q5, [x14, #0x20]\n" - "ldr q4, [x14, #0x30]\n" - ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" - "bge 102b\n" - "104:" // Height 4: Multiply loop: Skip odd blocks - "cbz x11, 108f\n" - "tbz x11, #1, 105f\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, 
v1.4b[0]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" + "bge 99b\n" + "101:" // Height 4: Multiply loop: Skip odd blocks + "cbz x11, 105f\n" + "tbz x11, #1, 102f\n" "ldr h0, [x10], #0x2\n" "ldr h1, [x9], #0x2\n" "ldr h2, [x28], #0x2\n" "ldr h3, [x27], #0x2\n" - "tbz x11, #0, 106f\n" + "tbz x11, #0, 103f\n" "ld1 { v0.b }[2], [x10]\n" "ld1 { v1.b }[2], [x9]\n" "ld1 { v2.b }[2], [x28]\n" "ld1 { v3.b }[2], [x27]\n" - "b 106f\n" - "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 103f\n" + "102:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x10, #0x0]\n" "ldr b1, [x9, #0x0]\n" "ldr b2, [x28, #0x0]\n" "ldr b3, [x27, #0x0]\n" - "106:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 107f\n" + "103:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 104f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q7, [x14, #0x0]\n" - "ldr q6, [x14, #0x10]\n" - "ldr q5, [x14, #0x20]\n" - "ldr q4, [x14, #0x30]\n" - ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fc // sdot v28.4s, 
v7.16b, v3.4b[0]\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" + "104:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n" ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" - "108:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n" + "105:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "bne 94b\n" + "bne 91b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x13, #0x0]\n" "add x24, x13, x20\n" @@ -1769,30 +1683,30 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "add x22, x23, x20\n" "prfm pstl1keep, [x23, 
#0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "tbnz %x[flags], #31, 109f\n" + "tbnz %x[flags], #31, 106f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "neg v0.4s, v0.4s\n" + "neg v4.4s, v4.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "109:" // Height 4: skip row sum fixup - "ldr q3, [x16, #0x0]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "106:" // Height 4: skip row sum fixup + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q2, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q1, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q0, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1806,94 +1720,48 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "add v29.4s, v29.4s, v14.4s\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v1.4s\n" - "add v19.4s, v19.4s, v0.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v1.4s\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v1.4s\n" - "add v27.4s, v27.4s, v0.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v1.4s\n" - "add v31.4s, v31.4s, v0.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s 
}, [x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x21]\n" + "add x21, %x[qp], %[c_offset]\n" "ld1r { v0.4s }, [x20]\n" - "add x16, x16, #0x40\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "tbz %x[flags], #5, 110f\n" - "and v2.16b, v16.16b, v0.16b\n" - "and v1.16b, v17.16b, v0.16b\n" - "and v7.16b, v18.16b, v0.16b\n" - "and v6.16b, v19.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v4.16b, v21.16b, v0.16b\n" - "and v3.16b, v22.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v2.4s\n" - "sqadd v17.4s, v17.4s, v1.4s\n" - "and v2.16b, v23.16b, v0.16b\n" - "and v1.16b, v24.16b, v0.16b\n" - "sqadd v18.4s, v18.4s, v7.4s\n" - "sqadd v19.4s, v19.4s, 
v6.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "sqadd v22.4s, v22.4s, v3.4s\n" - "and v7.16b, v25.16b, v0.16b\n" - "and v6.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v3.16b, v29.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "sqadd v24.4s, v24.4s, v1.4s\n" - "and v2.16b, v30.16b, v0.16b\n" - "and v1.16b, v31.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v7.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v3.4s\n" - "sqadd v30.4s, v30.4s, v2.4s\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "110:" // Height 4: no shift correction + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" @@ -1910,185 +1778,181 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add 
x21, %x[qp], %[maxval]\n" - "ld1r { v2.4s }, [x20]\n" - "add x20, %x[qp], %[minval]\n" - "ld1r { v1.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v0.4s }, [x20]\n" - "add v16.4s, v16.4s, v2.4s\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v2.4s\n" - "add v20.4s, v20.4s, v2.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v2.4s\n" - "add v24.4s, v24.4s, v2.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v2.4s\n" - "add v28.4s, v28.4s, v2.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v2.4s\n" - "smin v16.4s, v16.4s, v1.4s\n" - "smin v17.4s, v17.4s, v1.4s\n" - "smin v18.4s, v18.4s, v1.4s\n" - "smin v19.4s, v19.4s, v1.4s\n" - "smin v20.4s, v20.4s, v1.4s\n" - "smin v21.4s, v21.4s, v1.4s\n" - "smin v22.4s, v22.4s, v1.4s\n" - "smin v23.4s, v23.4s, v1.4s\n" - "smin v24.4s, v24.4s, v1.4s\n" - "smin v25.4s, v25.4s, v1.4s\n" - "smin v26.4s, v26.4s, v1.4s\n" - "smin v27.4s, v27.4s, v1.4s\n" - "smin v28.4s, v28.4s, v1.4s\n" - "smin v29.4s, v29.4s, v1.4s\n" - "smin v30.4s, v30.4s, v1.4s\n" - "smin v31.4s, v31.4s, v1.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smax v19.4s, v19.4s, v0.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smax v21.4s, v21.4s, v0.4s\n" - "smax v22.4s, v22.4s, v0.4s\n" - "smax v23.4s, v23.4s, v0.4s\n" - "smax v24.4s, v24.4s, v0.4s\n" - "smax v25.4s, v25.4s, v0.4s\n" - "smax v26.4s, v26.4s, v0.4s\n" - "smax v27.4s, v27.4s, v0.4s\n" - "smax v28.4s, v28.4s, v0.4s\n" - "smax v29.4s, v29.4s, v0.4s\n" - "smax v30.4s, v30.4s, v0.4s\n" - "smax v31.4s, v31.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add 
v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - "uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 119f\n" - "tbz x15, #3, 114f\n" + "cmp x16, #0x10\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "add 
x14, x14, #0x40\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 115f\n" + "tbz x16, #3, 110f\n" "str d16, [x13], #0x8\n" "str d20, [x24], #0x8\n" "str d24, [x23], #0x8\n" "str d28, [x22], #0x8\n" - "tbz x15, #2, 112f\n" + "tbz x16, #2, 108f\n" "st1 { v16.s }[2], [x13], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" "st1 { v24.s }[2], [x23], #0x4\n" "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x15, #1, 111f\n" + "tbz x16, #1, 107f\n" "st1 { v16.h }[6], [x13], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" "st1 { v24.h }[6], [x23], #0x2\n" "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[14], [x13]\n" "st1 { v20.b }[14], [x24]\n" "st1 { v24.b }[14], [x23]\n" "st1 { v28.b }[14], [x22]\n" - "b 118f\n" - "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 118f\n" + "b 114f\n" + "107:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 114f\n" "st1 { v16.b }[12], [x13]\n" "st1 { v20.b }[12], [x24]\n" "st1 { v24.b }[12], [x23]\n" "st1 { v28.b }[12], [x22]\n" - "b 118f\n" - "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 113f\n" + "b 114f\n" + "108:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 109f\n" "st1 { v16.h }[4], [x13], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" "st1 { v24.h }[4], [x23], #0x2\n" "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[10], [x13]\n" "st1 { v20.b }[10], [x24]\n" "st1 { v24.b }[10], [x23]\n" "st1 { v28.b }[10], [x22]\n" - "b 118f\n" - "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 118f\n" + "b 114f\n" + "109:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 114f\n" "st1 { v16.b }[8], [x13]\n" "st1 { v20.b }[8], [x24]\n" "st1 { v24.b }[8], [x23]\n" "st1 { v28.b }[8], [x22]\n" - "b 118f\n" - "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, 
#2, 116f\n" + "b 114f\n" + "110:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 112f\n" "str s16, [x13], #0x4\n" "str s20, [x24], #0x4\n" "str s24, [x23], #0x4\n" "str s28, [x22], #0x4\n" - "tbz x15, #1, 115f\n" + "tbz x16, #1, 111f\n" "st1 { v16.h }[2], [x13], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" "st1 { v24.h }[2], [x23], #0x2\n" "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[6], [x13]\n" "st1 { v20.b }[6], [x24]\n" "st1 { v24.b }[6], [x23]\n" "st1 { v28.b }[6], [x22]\n" - "b 118f\n" - "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 118f\n" + "b 114f\n" + "111:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 114f\n" "st1 { v16.b }[4], [x13]\n" "st1 { v20.b }[4], [x24]\n" "st1 { v24.b }[4], [x23]\n" "st1 { v28.b }[4], [x22]\n" - "b 118f\n" - "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 117f\n" + "b 114f\n" + "112:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 113f\n" "str h16, [x13], #0x2\n" "str h20, [x24], #0x2\n" "str h24, [x23], #0x2\n" "str h28, [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[2], [x13]\n" "st1 { v20.b }[2], [x24]\n" "st1 { v24.b }[2], [x23]\n" "st1 { v28.b }[2], [x22]\n" - "b 118f\n" - "117:" // Height 4: Partial direct writeback: partial_1_0 + "b 114f\n" + "113:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" "str b20, [x24, #0x0]\n" "str b24, [x23, #0x0]\n" "str b28, [x22, #0x0]\n" - "118:" // Height 4: Partial direct writeback: Done - "b 120f\n" - "119:" // Height 4: Full writeback + "114:" // Height 4: Partial direct writeback: Done + "b 116f\n" + "115:" // Height 4: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" "str q20, [x24, #0x0]\n" "str q24, [x23, #0x0]\n" "str q28, [x22, #0x0]\n" - "120:" // Height 4: Writeback done - "subs x15, x15, #0x10\n" - "bgt 92b\n" + "116:" // Height 4: Writeback 
done + "subs x16, x16, #0x10\n" + "bgt 89b\n" "subs %x[M], %x[M], #0x4\n" - "beq 122f\n" + "beq 118f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 121f\n" + "tbz %x[flags], #3, 117f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "121:" // Update direct input + "117:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "122:" // Exit + "118:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp index 75e35a3e98..30001519a7 100644 
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,29 +73,25 @@ void a64_hybrid_s8qa_dot_4x16 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 91f\n" + "bge 88f\n" "cmp %x[M], #0x2\n" - "bgt 61f\n" - "beq 31f\n" - "mov x10, %x[col_bias]\n" + "bgt 59f\n" + "beq 30f\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -116,87 +111,87 @@ void a64_hybrid_s8qa_dot_4x16 ( "cmp x25, #0x10\n" "blt 11f\n" "ldr q0, [x24, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" 
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" "add x24, x24, #0x10\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x28, #0xd0]\n" - ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x28, #0xe0]\n" - ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" - ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q0, [x24, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q5, [x28, 
#0x10]\n" - "ldr q6, [x28, #0x20]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" "cmp x25, #0x20\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x28, #0xd0]\n" - ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x28, #0xe0]\n" - ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" - ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, 
x9, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum @@ -210,17 +205,17 @@ void a64_hybrid_s8qa_dot_4x16 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q23, [x28, #0x0]\n" - "ldr q22, [x28, #0x10]\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q21, [x28, #0x20]\n" - "ldr q20, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n" - ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x25, 18f\n" @@ -235,15 +230,15 @@ void a64_hybrid_s8qa_dot_4x16 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q23, [x28, #0x0]\n" - "ldr q22, [x28, #0x10]\n" - "ldr q21, [x28, #0x20]\n" - "ldr q20, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x4f80e2d1 // sdot v17.4s, 
v22.16b, v0.4b[0]\n" - ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -253,136 +248,122 @@ void a64_hybrid_s8qa_dot_4x16 ( "tbnz %x[flags], #31, 19f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v20.4s }, [x20]\n" - "neg v20.4s, v20.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v20.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q24, [x10, #0x0]\n" - "ldr q23, [x10, #0x10]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q22, [x10, #0x20]\n" - "ldr q21, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v24.4s\n" - "add v17.4s, v17.4s, v23.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add x10, x10, #0x40\n" + "ld1r { v4.4s }, [x21]\n" + "add v16.4s, v16.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "sqrdmulh v16.4s, v16.4s, v20.4s\n" - "sqrdmulh v17.4s, v17.4s, v20.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "sqrdmulh v19.4s, v19.4s, v20.4s\n" - "tbz %x[flags], #5, 20f\n" - "and v23.16b, v16.16b, 
v0.16b\n" - "and v22.16b, v17.16b, v0.16b\n" - "and v21.16b, v18.16b, v0.16b\n" - "and v20.16b, v19.16b, v0.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "20:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "add x28, x28, #0x40\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x21]\n" - "ld1r { v21.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, 
v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 29f\n" - "tbz x9, #3, 24f\n" + "bge 28f\n" + "tbz x10, #3, 23f\n" "str d16, [x27], #0x8\n" - "tbz x9, #2, 22f\n" + "tbz x10, #2, 21f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "tbz x9, #1, 21f\n" + "tbz x10, #1, 20f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[14], [x27]\n" - "b 28f\n" - "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x9, #0, 28f\n" + "b 27f\n" + "20:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 27f\n" "st1 { v16.b }[12], [x27]\n" - "b 28f\n" - "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x9, #1, 23f\n" + "b 27f\n" + "21:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 22f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[10], [x27]\n" - "b 28f\n" - "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x9, #0, 28f\n" + "b 27f\n" + "22:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 27f\n" "st1 { v16.b }[8], [x27]\n" - "b 28f\n" - "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x9, #2, 26f\n" + "b 27f\n" + "23:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 25f\n" "str s16, [x27], #0x4\n" - "tbz x9, #1, 25f\n" + "tbz x10, #1, 24f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[6], [x27]\n" - "b 28f\n" - "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x9, #0, 28f\n" + "b 27f\n" + "24:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 27f\n" "st1 { v16.b }[4], [x27]\n" - "b 28f\n" - "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x9, #1, 27f\n" + "b 27f\n" + "25:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 26f\n" "str h16, [x27], #0x2\n" - "tbz 
x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[2], [x27]\n" - "b 28f\n" - "27:" // Height 1: Partial direct writeback: partial_1_0 + "b 27f\n" + "26:" // Height 1: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "28:" // Height 1: Partial direct writeback: Done - "b 30f\n" - "29:" // Height 1: Full writeback + "27:" // Height 1: Partial direct writeback: Done + "b 29f\n" + "28:" // Height 1: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "30:" // Height 1: Writeback done - "subs x9, x9, #0x10\n" + "29:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" "bgt 2b\n" - "b 122f\n" - "31:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 118f\n" + "30:" // Height 2 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "32:" // Height 2: Column loop + "31:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -391,416 +372,389 @@ void a64_hybrid_s8qa_dot_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "33:" // Height 2: setup done "mov x26, #0x0\n" - "34:" // Height 2: String loop + "33:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 35f\n" + "tbz %x[flags], #3, 34f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 36f\n" + "cbnz x26, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 36f\n" - "35:" // Height 2: setup direct input + "b 35f\n" + "34:" // 
Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "36:" // Height 2: input setup done + "35:" // Height 2: input setup done "cmp x25, #0x10\n" - "blt 41f\n" + "blt 40f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 39f\n" - "37:" // Height 2: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 38f\n" + "36:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x23, x23, #0x10\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 
0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 38f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, 
v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 37f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "38:" // Height 2: Multiply loop: unique 5: skip row sum + "37:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "bge 37b\n" - "39:" // Height 2: Multiply loop: Single iteration only + "bge 36b\n" + "38:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "sub x25, x25, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" "add x23, x23, #0x10\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" 
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + 
".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 6: skip row sum + "39:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "41:" // Height 2: Multiply loop: Main loop skip - "cbz x25, 48f\n" + "40:" // Height 2: Multiply loop: Main loop skip + "cbz x25, 47f\n" "cmp x25, #0x4\n" - "blt 44f\n" - "42:" // Height 2: Multiply loop: Odd block loop + "blt 43f\n" + "41:" // Height 2: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" - "tbnz %x[flags], #31, 43f\n" + "tbnz %x[flags], #31, 42f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q27, [x28, #0x0]\n" - "ldr q26, [x28, #0x10]\n" + "42:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q25, [x28, #0x20]\n" - "ldr q24, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n" - ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x4f81e355 // sdot v21.4s, v26.16b, 
v1.4b[0]\n" - ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" - "bge 42b\n" - "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x25, 48f\n" - "tbz x25, #1, 45f\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "bge 41b\n" + "43:" // Height 2: Multiply loop: Skip odd blocks + "cbz x25, 47f\n" + "tbz x25, #1, 44f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" - "tbz x25, #0, 46f\n" + "tbz x25, #0, 45f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" - "b 46f\n" - "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 45f\n" + "44:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" - "46:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 47f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 46f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q27, [x28, #0x0]\n" - "ldr q26, [x28, #0x10]\n" - "ldr q25, [x28, #0x20]\n" - "ldr q24, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n" - ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x4f81e355 // sdot v21.4s, v26.16b, 
v1.4b[0]\n" - ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" - "48:" // Height 2: Multiply loop: No odd multiplies + "46:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + "47:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 34b\n" + "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" - "tbnz %x[flags], #31, 49f\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbnz %x[flags], #31, 48f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "49:" // Height 2: skip row sum fixup - "ldr q28, [x10, #0x0]\n" - "ldr q27, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "48:" // Height 2: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, 
v17.4s, v11.4s\n" - "ldr q26, [x10, #0x20]\n" - "ldr q25, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v24.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add x10, x10, #0x40\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v25.4s\n" - "add v20.4s, v20.4s, v28.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v21.4s, v21.4s, v27.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v25.4s\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "tbz %x[flags], #5, 50f\n" - "and v24.16b, v16.16b, v0.16b\n" - "and v30.16b, v17.16b, v0.16b\n" - "and v29.16b, v18.16b, v0.16b\n" - "and v28.16b, v19.16b, v0.16b\n" - "and v27.16b, v20.16b, v0.16b\n" - "and v26.16b, v21.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v25.16b, v22.16b, v0.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v24.4s\n" - "and v24.16b, v23.16b, v0.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sqadd 
v23.4s, v23.4s, v24.4s\n" - "50:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "add x22, %x[qp], %[c_offset]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v21.4s, v21.4s, v1.4s\n" + "cmp x10, #0x10\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add x28, x28, #0x40\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, 
v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 59f\n" - "tbz x9, #3, 54f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 57f\n" + "tbz x10, #3, 52f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "tbz x9, #2, 52f\n" + "str d20, [x26], #0x8\n" + "tbz x10, #2, 50f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "tbz x9, #1, 51f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "tbz x10, #1, 49f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "b 58f\n" - "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[14], 
[x26]\n" + "b 56f\n" + "49:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 56f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "b 58f\n" - "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x9, #1, 53f\n" + "st1 { v20.b }[12], [x26]\n" + "b 56f\n" + "50:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 51f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "b 58f\n" - "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[10], [x26]\n" + "b 56f\n" + "51:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 56f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "b 58f\n" - "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x9, #2, 56f\n" + "st1 { v20.b }[8], [x26]\n" + "b 56f\n" + "52:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 54f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "tbz x9, #1, 55f\n" + "str s20, [x26], #0x4\n" + "tbz x10, #1, 53f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "b 58f\n" - "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[6], [x26]\n" + "b 56f\n" + "53:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 56f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "b 58f\n" - "56:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x9, #1, 57f\n" + "st1 { v20.b }[4], [x26]\n" + "b 56f\n" + "54:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 55f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "str h20, 
[x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "b 58f\n" - "57:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "b 56f\n" + "55:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "58:" // Height 2: Partial direct writeback: Done - "b 60f\n" - "59:" // Height 2: Full writeback + "str b20, [x26, #0x0]\n" + "56:" // Height 2: Partial direct writeback: Done + "b 58f\n" + "57:" // Height 2: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "60:" // Height 2: Writeback done - "subs x9, x9, #0x10\n" - "bgt 32b\n" - "b 122f\n" - "61:" // Height 3 - "mov x10, %x[col_bias]\n" + "str q20, [x26, #0x0]\n" + "58:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 31b\n" + "b 118f\n" + "59:" // Height 3 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "62:" // Height 3: Column loop + "60:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -813,542 +767,503 @@ void a64_hybrid_s8qa_dot_4x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "63:" // Height 3: setup done "mov x26, #0x0\n" - "64:" // Height 3: String loop + "62:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 65f\n" + "tbz %x[flags], #3, 63f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, 
[x20, #0x10]\n" - "cbnz x26, 66f\n" + "cbnz x26, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 66f\n" - "65:" // Height 3: setup direct input + "b 64f\n" + "63:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "66:" // Height 3: input setup done + "64:" // Height 3: input setup done "cmp x25, #0x10\n" - "blt 71f\n" + "blt 69f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q2, [x22, #0x0]\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 69f\n" - "67:" // Height 3: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 67f\n" + "65:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x28, #0xa0]\n" + "ldr q7, [x9, 
#0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x28, #0xd0]\n" - ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x28, #0xe0]\n" - ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x28, #0xf0]\n" - ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebb6 // sdot v22.4s, 
v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 68f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 66f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 
0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "68:" // Height 3: Multiply loop: unique 9: skip row sum + "66:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q2, [x22, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 67b\n" - "69:" // Height 3: Multiply loop: Single iteration only + "bge 65b\n" + "67:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x22, x22, #0x10\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, 
v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x28, #0xd0]\n" - ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x28, #0xe0]\n" - ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x28, #0xf0]\n" - ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa0eb93 // sdot 
v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 70f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "70:" // 
Height 3: Multiply loop: unique 10: skip row sum + "68:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "71:" // Height 3: Multiply loop: Main loop skip - "cbz x25, 78f\n" + "69:" // Height 3: Multiply loop: Main loop skip + "cbz x25, 76f\n" "cmp x25, #0x4\n" - "blt 74f\n" - "72:" // Height 3: Multiply loop: Odd block loop + "blt 72f\n" + "70:" // Height 3: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" "ldr s2, [x22], #0x4\n" - "tbnz %x[flags], #31, 73f\n" + "tbnz %x[flags], #31, 71f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q31, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "71:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" - ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" - "bge 72b\n" - "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x25, 78f\n" - "tbz x25, #1, 75f\n" + "add x9, x9, 
#0x40\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Skip odd blocks + "cbz x25, 76f\n" + "tbz x25, #1, 73f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" "ldr h2, [x22], #0x2\n" - "tbz x25, #0, 76f\n" + "tbz x25, #0, 74f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" "ld1 { v2.b }[2], [x22]\n" - "b 76f\n" - "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 74f\n" + "73:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" "ldr b2, [x22, #0x0]\n" - "76:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 77f\n" + "74:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 75f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q31, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" - ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" - 
".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" - "78:" // Height 3: Multiply loop: No odd multiplies + "75:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + "76:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 64b\n" + "bne 62b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 79f\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 77f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp 
v12.4s, v12.4s, v12.4s\n" - "ld1r { v28.4s }, [x20]\n" + "ld1r { v3.4s }, [x20]\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v28.4s, v28.4s\n" + "neg v3.4s, v3.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v28.4s\n" - "mul v12.4s, v12.4s, v28.4s\n" - "mul v13.4s, v13.4s, v28.4s\n" - "79:" // Height 3: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q31, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "77:" // Height 3: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q30, [x10, #0x20]\n" - "ldr q29, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v28.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v31.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v29.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x28, x28, #0x40\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v31.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v29.4s\n" + "add 
v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v31.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "sqrdmulh v16.4s, v16.4s, v28.4s\n" - "sqrdmulh v17.4s, v17.4s, v28.4s\n" - "sqrdmulh v18.4s, v18.4s, v28.4s\n" - "sqrdmulh v19.4s, v19.4s, v28.4s\n" - "sqrdmulh v20.4s, v20.4s, v28.4s\n" - "sqrdmulh v21.4s, v21.4s, v28.4s\n" - "sqrdmulh v22.4s, v22.4s, v28.4s\n" - "sqrdmulh v23.4s, v23.4s, v28.4s\n" - "sqrdmulh v24.4s, v24.4s, v28.4s\n" - "sqrdmulh v25.4s, v25.4s, v28.4s\n" - "sqrdmulh v26.4s, v26.4s, v28.4s\n" - "sqrdmulh v27.4s, v27.4s, v28.4s\n" - "tbz %x[flags], #5, 80f\n" - "and v1.16b, v16.16b, v0.16b\n" - "and v31.16b, v17.16b, v0.16b\n" - "and v30.16b, v18.16b, v0.16b\n" - "and v29.16b, v19.16b, v0.16b\n" - "and v28.16b, v20.16b, v0.16b\n" - "and v3.16b, v21.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v2.16b, v22.16b, v0.16b\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "sqadd v18.4s, v18.4s, v30.4s\n" - "sqadd v19.4s, v19.4s, v29.4s\n" - "sqadd v20.4s, v20.4s, v28.4s\n" - "and v1.16b, v23.16b, v0.16b\n" - "and v31.16b, v24.16b, v0.16b\n" - "and v30.16b, v25.16b, v0.16b\n" - "and v29.16b, v26.16b, v0.16b\n" - "and v28.16b, v27.16b, v0.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v3.4s\n" - "sqadd v22.4s, v22.4s, v2.4s\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "80:" // Height 3: no shift correction - "add 
x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x21]\n" - "ld1r { v29.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v28.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v16.4s, v16.4s, v30.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v30.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v16.4s, v16.4s, 
v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - "smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 89f\n" - "tbz x9, #3, 84f\n" + "uzp1 v25.8h, v26.8h, 
v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 86f\n" + "tbz x10, #3, 81f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x9, #2, 82f\n" + "str d20, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x10, #2, 79f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x9, #1, 81f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x10, #1, 78f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 88f\n" - "81:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "b 85f\n" + "78:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 85f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 88f\n" - "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x9, #1, 83f\n" + "st1 { v20.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "b 85f\n" + "79:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 80f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 88f\n" - "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "b 85f\n" + "80:" // Height 3: Partial direct writeback: 
partial_1_8 + "tbz x10, #0, 85f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 88f\n" - "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x9, #2, 86f\n" + "st1 { v20.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "b 85f\n" + "81:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 83f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x9, #1, 85f\n" + "str s20, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x10, #1, 82f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 88f\n" - "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "b 85f\n" + "82:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 85f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 88f\n" - "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x9, #1, 87f\n" + "st1 { v20.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "b 85f\n" + "83:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 84f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "str h20, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 88f\n" - "87:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "b 85f\n" + "84:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "88:" // 
Height 3: Partial direct writeback: Done - "b 90f\n" - "89:" // Height 3: Full writeback + "str b20, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "85:" // Height 3: Partial direct writeback: Done + "b 87f\n" + "86:" // Height 3: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "90:" // Height 3: Writeback done - "subs x9, x9, #0x10\n" - "bgt 62b\n" - "b 122f\n" - "91:" // Height 4 + "str q20, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "87:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 60b\n" + "b 118f\n" + "88:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "movi v15.16b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "92:" // Height 4: Column loop + "89:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1365,98 +1280,97 @@ void a64_hybrid_s8qa_dot_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "93:" // Height 4: setup done "mov x26, #0x0\n" - "94:" // Height 4: String loop + "91:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 95f\n" + "tbz %x[flags], #3, 92f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr 
x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 96f\n" + "cbnz x26, 93f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 96f\n" - "95:" // Height 4: setup direct input + "b 93f\n" + "92:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "96:" // Height 4: input setup done + "93:" // Height 4: input setup done "cmp x25, #0x10\n" - "blt 101f\n" + "blt 98f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q2, [x22, #0x0]\n" "ldr q3, [x21, #0x0]\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 99f\n" - "97:" // Height 4: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 96f\n" + "94:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x22, x22, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" "add x21, x21, #0x10\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0de // sdot 
v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" + "ldr q10, [x9, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" + "ldr q4, [x9, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" @@ -1485,38 +1399,38 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4fa1e8b7 // sdot 
v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 98f\n" + "tbnz %x[flags], #31, 95f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "98:" // Height 4: Multiply loop: unique 13: skip row sum + "95:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q2, [x22, #0x0]\n" "ldr q3, [x21, #0x0]\n" "cmp x25, #0x20\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "bge 97b\n" - "99:" // Height 4: Multiply loop: Single iteration only + "bge 94b\n" + "96:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x23, x23, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" @@ -1524,43 +1438,43 @@ void a64_hybrid_s8qa_dot_4x16 ( "add x21, x21, #0x10\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q5, [x9, 
#0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" + "ldr q10, [x9, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" + "ldr q4, [x9, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, 
v0.4b[2]\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" @@ -1589,252 +1503,202 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 100f\n" + "tbnz %x[flags], #31, 97f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "100:" // Height 4: Multiply loop: unique 14: skip row sum + "97:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "101:" // Height 4: Multiply loop: Main loop skip - "cbz x25, 108f\n" + "98:" // Height 4: Multiply loop: Main loop skip + "cbz x25, 105f\n" "cmp x25, #0x4\n" - "blt 104f\n" - "102:" // Height 4: Multiply loop: Odd block loop + "blt 101f\n" + "99:" // Height 4: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" "ldr s2, [x22], #0x4\n" "ldr s3, [x21], #0x4\n" - "tbnz %x[flags], #31, 103f\n" + "tbnz %x[flags], #31, 100f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q7, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" + "100:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q5, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 
0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" - "bge 102b\n" - "104:" // Height 4: Multiply loop: Skip odd blocks - "cbz x25, 108f\n" - "tbz x25, #1, 105f\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" + "bge 99b\n" + "101:" // Height 4: Multiply loop: Skip odd blocks + "cbz x25, 105f\n" + "tbz x25, #1, 102f\n" "ldr h0, [x24], #0x2\n" 
"ldr h1, [x23], #0x2\n" "ldr h2, [x22], #0x2\n" "ldr h3, [x21], #0x2\n" - "tbz x25, #0, 106f\n" + "tbz x25, #0, 103f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" "ld1 { v2.b }[2], [x22]\n" "ld1 { v3.b }[2], [x21]\n" - "b 106f\n" - "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 103f\n" + "102:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" "ldr b2, [x22, #0x0]\n" "ldr b3, [x21, #0x0]\n" - "106:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 107f\n" + "103:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 104f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q7, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" + "104:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, 
v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n" ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" - "108:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n" + "105:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 94b\n" + "bne 91b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add x22, x23, x20\n" - "prfm pstl1keep, [x22, #0x0]\n" - "tbnz %x[flags], #31, 109f\n" + "tbnz %x[flags], #31, 106f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "neg v0.4s, v0.4s\n" + "neg v4.4s, v4.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul 
v13.4s, v13.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "109:" // Height 4: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q4, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "106:" // Height 4: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q3, [x10, #0x20]\n" - "ldr q2, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v28.4s, v28.4s, v14.4s\n" "add v29.4s, v29.4s, v14.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" + "add x28, x28, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v2.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v23.4s, v23.4s, v2.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add 
v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v2.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v31.4s, v31.4s, v2.4s\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "tbz %x[flags], #5, 110f\n" - "and v2.16b, v16.16b, v0.16b\n" - "and v1.16b, v17.16b, v0.16b\n" - "and v7.16b, v18.16b, v0.16b\n" - "and v6.16b, v19.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v4.16b, v21.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v3.16b, v22.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v2.4s\n" - "sqadd v17.4s, v17.4s, v1.4s\n" - "and v2.16b, v23.16b, v0.16b\n" - "and v1.16b, v24.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v7.4s\n" - "sqadd v19.4s, v19.4s, v6.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "sqadd v22.4s, v22.4s, v3.4s\n" - "and v7.16b, v25.16b, v0.16b\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "sqadd v24.4s, v24.4s, v1.4s\n" - "and v6.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v3.16b, v29.16b, v0.16b\n" - "and 
v2.16b, v30.16b, v0.16b\n" - "and v1.16b, v31.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v7.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v3.4s\n" - "sqadd v30.4s, v30.4s, v2.4s\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "110:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v3.4s }, [x21]\n" - "ld1r { v2.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v1.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" @@ -1845,178 +1709,178 @@ void a64_hybrid_s8qa_dot_4x16 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - 
"add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" - "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - "smin v29.4s, v29.4s, v2.4s\n" - "smin v30.4s, v30.4s, v2.4s\n" - "smin v31.4s, v31.4s, v2.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - "smax v29.4s, v29.4s, v1.4s\n" - "smax v30.4s, v30.4s, v1.4s\n" - "smax v31.4s, v31.4s, v1.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, 
v31.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - "uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 119f\n" - "tbz x9, #3, 114f\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 115f\n" + "tbz x10, #3, 110f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x22], #0x8\n" - "tbz x9, #2, 112f\n" + "str d20, [x26], #0x8\n" 
+ "str d24, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "tbz x10, #2, 108f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x9, #1, 111f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "st1 { v28.s }[2], [x24], #0x4\n" + "tbz x10, #1, 107f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "st1 { v28.h }[6], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b }[14], [x22]\n" - "b 118f\n" - "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "st1 { v28.b }[14], [x24]\n" + "b 114f\n" + "107:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 114f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x22]\n" - "b 118f\n" - "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x9, #1, 113f\n" + "st1 { v20.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "st1 { v28.b }[12], [x24]\n" + "b 114f\n" + "108:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 109f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "st1 { v28.h }[4], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x22]\n" - "b 118f\n" - "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x9, #0, 
118f\n" + "st1 { v20.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "st1 { v28.b }[10], [x24]\n" + "b 114f\n" + "109:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 114f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "st1 { v28.b }[8], [x22]\n" - "b 118f\n" - "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x9, #2, 116f\n" + "st1 { v20.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "st1 { v28.b }[8], [x24]\n" + "b 114f\n" + "110:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 112f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x22], #0x4\n" - "tbz x9, #1, 115f\n" + "str s20, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "str s28, [x24], #0x4\n" + "tbz x10, #1, 111f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "st1 { v28.h }[2], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "st1 { v28.b }[6], [x22]\n" - "b 118f\n" - "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "st1 { v28.b }[6], [x24]\n" + "b 114f\n" + "111:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 114f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x22]\n" - "b 118f\n" - "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x9, #1, 117f\n" + "st1 { v20.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "st1 { v28.b }[4], [x24]\n" + "b 114f\n" + "112:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 113f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, 
[x23], #0x2\n" - "str h28, [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "str h20, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "str h28, [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x22]\n" - "b 118f\n" - "117:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "st1 { v28.b }[2], [x24]\n" + "b 114f\n" + "113:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x22, #0x0]\n" - "118:" // Height 4: Partial direct writeback: Done - "b 120f\n" - "119:" // Height 4: Full writeback + "str b20, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "str b28, [x24, #0x0]\n" + "114:" // Height 4: Partial direct writeback: Done + "b 116f\n" + "115:" // Height 4: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x22, #0x0]\n" - "120:" // Height 4: Writeback done - "subs x9, x9, #0x10\n" - "bgt 92b\n" + "str q20, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "116:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 89b\n" "subs %x[M], %x[M], #0x4\n" - "beq 122f\n" + "beq 118f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 121f\n" + "tbz %x[flags], #3, 117f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "121:" // Update direct input + "117:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "122:" // Exit + "118:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" 
(offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp index 8f70b3dc26..efcf2f84df 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,22 +73,19 @@ void a64_hybrid_s8qa_mmla_4x16 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 97f\n" + "bge 94f\n" "cmp %x[M], #0x2\n" - "bgt 65f\n" - "beq 33f\n" - "mov x10, %x[col_bias]\n" + "bgt 63f\n" + "beq 32f\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" 
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" @@ -100,7 +96,6 @@ void a64_hybrid_s8qa_mmla_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -120,92 +115,92 @@ void a64_hybrid_s8qa_mmla_4x16 ( "cmp x25, #0x10\n" "blt 11f\n" "ldr q1, [x24, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head "add x24, x24, #0x10\n" - "trn1 v0.2d, v1.2d, v27.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v27.2d\n" + "ldr q5, [x9, #0x70]\n" + "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - 
".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q1, [x24, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" "cmp x25, #0x20\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "trn1 v0.2d, v1.2d, v24.2d\n" - "trn2 v1.2d, v1.2d, v24.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, 
v2.2d\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, 
v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" @@ -216,30 +211,30 @@ void a64_hybrid_s8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr d25, [x24], #0x8\n" - "trn1 v0.2d, v25.2d, v24.2d\n" + "ldr d1, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x25, 20f\n" @@ -263,27 +258,27 @@ void a64_hybrid_s8qa_mmla_4x16 ( 
"17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "18:" // Height 1: Multiply loop: Ragged operand read: Done - "trn1 v0.2d, v1.2d, v24.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 19f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "19:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" "20:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -298,136 +293,122 @@ void a64_hybrid_s8qa_mmla_4x16 ( "tbnz %x[flags], #31, 21f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, 
v11.4s\n" - "ld1r { v16.4s }, [x20]\n" - "neg v16.4s, v16.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v16.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "21:" // Height 1: skip row sum fixup - "ldr q24, [x10, #0x0]\n" - "ldr q22, [x10, #0x10]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q21, [x10, #0x20]\n" - "ldr q20, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v16.4s }, [x20]\n" - "add v23.4s, v23.4s, v24.4s\n" - "add v17.4s, v17.4s, v22.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add x10, x10, #0x40\n" + "ld1r { v4.4s }, [x21]\n" + "add v23.4s, v23.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v20.4s\n" - "sqrdmulh v23.4s, v23.4s, v16.4s\n" - "sqrdmulh v17.4s, v17.4s, v16.4s\n" - "sqrdmulh v18.4s, v18.4s, v16.4s\n" - "sqrdmulh v19.4s, v19.4s, v16.4s\n" - "tbz %x[flags], #5, 22f\n" - "and v22.16b, v23.16b, v0.16b\n" - "and v21.16b, v17.16b, v0.16b\n" - "and v20.16b, v18.16b, v0.16b\n" - "and v16.16b, v19.16b, v0.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v22.4s\n" - "sqadd v17.4s, v17.4s, v21.4s\n" - "sqadd v18.4s, v18.4s, v20.4s\n" - "sqadd v19.4s, v19.4s, v16.4s\n" - "22:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], 
#0x80000000\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "add x28, x28, #0x40\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v21.4s }, [x21]\n" - "ld1r { v20.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v16.4s }, [x20]\n" - "add v23.4s, v23.4s, v21.4s\n" - "add v17.4s, v17.4s, v21.4s\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "smin v23.4s, v23.4s, v20.4s\n" - "smin v17.4s, v17.4s, v20.4s\n" - "smin v18.4s, v18.4s, v20.4s\n" - "smin v19.4s, v19.4s, v20.4s\n" - "smax v23.4s, v23.4s, v16.4s\n" - "smax v17.4s, v17.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smax v19.4s, v19.4s, v16.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v17.8h\n" - "uzp1 v16.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v16.16b\n" - "bge 31f\n" - "tbz x9, #3, 26f\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v17.16b\n" + "bge 30f\n" + "tbz x10, #3, 25f\n" "str d23, [x27], #0x8\n" - "tbz x9, #2, 24f\n" + "tbz x10, #2, 23f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "tbz x9, #1, 23f\n" + "tbz x10, #1, 22f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[14], [x27]\n" - "b 30f\n" - "23:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x9, #0, 30f\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 29f\n" "st1 { v23.b }[12], [x27]\n" - 
"b 30f\n" - "24:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x9, #1, 25f\n" + "b 29f\n" + "23:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 24f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[10], [x27]\n" - "b 30f\n" - "25:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x9, #0, 30f\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 29f\n" "st1 { v23.b }[8], [x27]\n" - "b 30f\n" - "26:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x9, #2, 28f\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 27f\n" "str s23, [x27], #0x4\n" - "tbz x9, #1, 27f\n" + "tbz x10, #1, 26f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[6], [x27]\n" - "b 30f\n" - "27:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x9, #0, 30f\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 29f\n" "st1 { v23.b }[4], [x27]\n" - "b 30f\n" - "28:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x9, #1, 29f\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 28f\n" "str h23, [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[2], [x27]\n" - "b 30f\n" - "29:" // Height 1: Partial direct writeback: partial_1_0 + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "30:" // Height 1: Partial direct writeback: Done - "b 32f\n" - "31:" // Height 1: Full writeback + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "32:" // Height 1: Writeback done - "subs x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" "bgt 2b\n" - "b 130f\n" - "33:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 126f\n" + 
"32:" // Height 2 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "34:" // Height 2: Column loop + "33:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -436,420 +417,393 @@ void a64_hybrid_s8qa_mmla_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "35:" // Height 2: setup done "mov x26, #0x0\n" - "36:" // Height 2: String loop + "35:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 37f\n" + "tbz %x[flags], #3, 36f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 38f\n" + "cbnz x26, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 38f\n" - "37:" // Height 2: setup direct input + "b 37f\n" + "36:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "38:" // Height 2: input setup done + "37:" // Height 2: input setup done "cmp x25, #0x10\n" - "blt 43f\n" + "blt 42f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" - "blt 41f\n" - "39:" // Height 2: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, 
[x9, #0x60]\n" + "blt 40f\n" + "38:" // Height 2: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, 
v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 5: skip row sum + "39:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" "cmp x25, #0x20\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "bge 39b\n" - "41:" // Height 2: Multiply loop: Single iteration only + "bge 38b\n" + "40:" // Height 2: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - 
".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" - "tbnz %x[flags], #31, 42f\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 41f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" - "42:" // Height 2: Multiply loop: unique 6: skip row sum + "41:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "43:" // Height 2: Multiply loop: Main loop skip - "cbz x25, 52f\n" + "42:" // Height 2: Multiply loop: Main loop skip + "cbz x25, 51f\n" "cmp x25, #0x8\n" - "blt 46f\n" - "44:" // Height 2: Multiply loop: Odd block loop - "ldr d25, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "trn1 v0.2d, v25.2d, v24.2d\n" - "tbnz %x[flags], #31, 45f\n" + "blt 45f\n" + "43:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 44f\n" ".inst 0x4e8f940b // sdot 
v11.4s, v0.16b, v15.16b\n" - "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "44:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" - "bge 44b\n" - "46:" // Height 2: Multiply loop: Skip odd blocks - "cbz x25, 52f\n" - "tbz x25, #2, 48f\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + "bge 43b\n" + "45:" // Height 2: Multiply loop: Skip odd blocks + "cbz x25, 51f\n" + "tbz x25, #2, 47f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" - "tbz x25, #1, 47f\n" + "tbz x25, #1, 46f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" - "tbz x25, #0, 50f\n" + "tbz x25, #0, 49f\n" "ld1 { v1.b }[6], 
[x24]\n" "ld1 { v2.b }[6], [x23]\n" - "b 50f\n" - "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 50f\n" + "b 49f\n" + "46:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 49f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" - "b 50f\n" - "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 49f\n" + "b 49f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 48f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" - "tbz x25, #0, 50f\n" + "tbz x25, #0, 49f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" - "b 50f\n" - "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 49f\n" + "48:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" - "50:" // Height 2: Multiply loop: Ragged operand read: Done + "49:" // Height 2: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "tbnz %x[flags], #31, 51f\n" + "tbnz %x[flags], #31, 50f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - "51:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9ea414 // smmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9da411 // smmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9ca415 // smmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" - "52:" // Height 2: Multiply loop: No odd multiplies + "50:" // Height 2: Multiply loop: unique 8: skip row 
sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + "51:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 36b\n" + "bne 35b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v24.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" + "add x26, x27, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "mov v23.16b, v24.16b\n" - "tbnz %x[flags], #31, 53f\n" + "prfm pstl1keep, [x26, #0x0]\n" + "mov v23.16b, v4.16b\n" + "tbnz %x[flags], #31, 52f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "53:" // Height 2: skip row sum fixup - "ldr q28, [x10, #0x0]\n" - "ldr q27, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "52:" // Height 2: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, 
#0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q26, [x10, #0x20]\n" - "ldr q25, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v24.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v20.4s, v20.4s, v27.4s\n" - "add x10, x10, #0x40\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v25.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v25.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "tbz %x[flags], #5, 54f\n" - "and v24.16b, v23.16b, v0.16b\n" - "and v30.16b, v20.16b, v0.16b\n" - "and v29.16b, v21.16b, v0.16b\n" - "and v28.16b, v22.16b, v0.16b\n" - "and v27.16b, v16.16b, v0.16b\n" - "and v26.16b, v17.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v25.16b, v18.16b, v0.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "and v24.16b, v19.16b, v0.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v22.4s, v22.4s, v28.4s\n" - "sqadd v16.4s, v16.4s, v27.4s\n" - "sqadd v17.4s, v17.4s, 
v26.4s\n" - "sqadd v18.4s, v18.4s, v25.4s\n" - "sqadd v19.4s, v19.4s, v24.4s\n" - "54:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "add x22, %x[qp], %[c_offset]\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v17.4s, v17.4s, v1.4s\n" + "cmp x10, #0x10\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add x28, x28, #0x40\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smax v23.4s, v23.4s, 
v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.16b, v23.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 63f\n" - "tbz x9, #3, 58f\n" + "bge 61f\n" + "tbz x10, #3, 56f\n" "str d23, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "tbz x9, #2, 56f\n" + "str d16, [x26], #0x8\n" + "tbz x10, #2, 54f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "tbz x9, #1, 55f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz x10, #1, 53f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "b 62f\n" - "55:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[14], [x26]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, 
#0, 60f\n" "st1 { v23.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "b 62f\n" - "56:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x9, #1, 57f\n" + "st1 { v16.b }[12], [x26]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 55f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "b 62f\n" - "57:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[10], [x26]\n" + "b 60f\n" + "55:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 60f\n" "st1 { v23.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "b 62f\n" - "58:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x9, #2, 60f\n" + "st1 { v16.b }[8], [x26]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 58f\n" "str s23, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "tbz x9, #1, 59f\n" + "str s16, [x26], #0x4\n" + "tbz x10, #1, 57f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "b 62f\n" - "59:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[6], [x26]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 60f\n" "st1 { v23.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "b 62f\n" - "60:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x9, #1, 61f\n" + "st1 { v16.b }[4], [x26]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 59f\n" "str h23, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "str h16, [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" 
- "b 62f\n" - "61:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "62:" // Height 2: Partial direct writeback: Done - "b 64f\n" - "63:" // Height 2: Full writeback + "str b16, [x26, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "64:" // Height 2: Writeback done - "subs x9, x9, #0x10\n" - "bgt 34b\n" - "b 130f\n" - "65:" // Height 3 - "mov x10, %x[col_bias]\n" + "str q16, [x26, #0x0]\n" + "62:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 33b\n" + "b 126f\n" + "63:" // Height 3 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "66:" // Height 3: Column loop + "64:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -866,43 +820,42 @@ void a64_hybrid_s8qa_mmla_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "67:" // Height 3: setup done "mov x26, #0x0\n" - "68:" // Height 3: String loop + "66:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 69f\n" + "tbz %x[flags], #3, 67f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 70f\n" + "cbnz x26, 68f\n" "ldr x20, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 70f\n" - "69:" // Height 3: setup direct input + "b 68f\n" + "67:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "70:" // Height 3: input setup done + "68:" // Height 3: input setup done "cmp x25, #0x10\n" - "blt 75f\n" + "blt 73f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q3, [x22, #0x0]\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "blt 73f\n" - "71:" // Height 3: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "blt 71f\n" + "69:" // Height 3: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" @@ -910,35 +863,35 @@ void a64_hybrid_s8qa_mmla_4x16 ( "add x22, x22, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q14, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" - "ldr q4, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, 
#0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" - ".inst 0x4e8ea413 // smmla v19.4s, v0.16b, v14.16b\n" - ".inst 0x4e8ea45b // smmla v27.4s, v2.16b, v14.16b\n" - "ldr q6, [x28, #0xd0]\n" + "ldr q10, [x9, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" - ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n" - "ldr q4, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" @@ -947,34 +900,34 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" - ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" - ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n" - ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n" - "tbnz %x[flags], #31, 72f\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 70f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot 
v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" - "72:" // Height 3: Multiply loop: unique 9: skip row sum + "70:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q3, [x22, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 71b\n" - "73:" // Height 3: Multiply loop: Single iteration only + "bge 69b\n" + "71:" // Height 3: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" @@ -983,35 +936,35 @@ void a64_hybrid_s8qa_mmla_4x16 ( "add x22, x22, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q14, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" - "ldr q4, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" 
".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" - ".inst 0x4e8ea413 // smmla v19.4s, v0.16b, v14.16b\n" - ".inst 0x4e8ea45b // smmla v27.4s, v2.16b, v14.16b\n" - "ldr q6, [x28, #0xd0]\n" + "ldr q10, [x9, #0xc0]\n" + ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" - ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n" - "ldr q4, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" @@ -1020,416 +973,378 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" - ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n" + ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" - ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n" - ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n" - "tbnz %x[flags], #31, 74f\n" + ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 72f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, 
v2.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" - "74:" // Height 3: Multiply loop: unique 10: skip row sum + "72:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "75:" // Height 3: Multiply loop: Main loop skip - "cbz x25, 84f\n" + "73:" // Height 3: Multiply loop: Main loop skip + "cbz x25, 82f\n" "cmp x25, #0x8\n" - "blt 78f\n" - "76:" // Height 3: Multiply loop: Odd block loop - "ldr d3, [x24], #0x8\n" - "ldr d0, [x23], #0x8\n" - "ldr d1, [x22], #0x8\n" - "trn1 v0.2d, v3.2d, v0.2d\n" - "trn1 v2.2d, v1.2d, v2.2d\n" - "tbnz %x[flags], #31, 77f\n" + "blt 76f\n" + "74:" // Height 3: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 75f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" + "75:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a45d // smmla 
v29.4s, v2.16b, v6.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n" ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" - "bge 76b\n" - "78:" // Height 3: Multiply loop: Skip odd blocks - "cbz x25, 84f\n" - "tbz x25, #2, 80f\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n" + "bge 74b\n" + "76:" // Height 3: Multiply loop: Skip odd blocks + "cbz x25, 82f\n" + "tbz x25, #2, 78f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" "ldr s3, [x22], #0x4\n" - "tbz x25, #1, 79f\n" + "tbz x25, #1, 77f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" "ld1 { v3.h }[2], [x22], #0x2\n" - "tbz x25, #0, 82f\n" + "tbz x25, #0, 80f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" "ld1 { v3.b }[6], [x22]\n" - "b 82f\n" - "79:" // Height 3: Multiply loop: Ragged operand read: 
partial_1_4 - "tbz x25, #0, 82f\n" + "b 80f\n" + "77:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 80f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" "ld1 { v3.b }[4], [x22]\n" - "b 82f\n" - "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 81f\n" + "b 80f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 79f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" "ldr h3, [x22], #0x2\n" - "tbz x25, #0, 82f\n" + "tbz x25, #0, 80f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" "ld1 { v3.b }[2], [x22]\n" - "b 82f\n" - "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 80f\n" + "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" "ldr b3, [x22, #0x0]\n" - "82:" // Height 3: Multiply loop: Ragged operand read: Done + "80:" // Height 3: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "tbnz %x[flags], #31, 83f\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 81f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "83:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "81:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, 
[x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" - ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" - ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" - "84:" // Height 3: Multiply loop: No odd multiplies + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n" + "82:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 68b\n" + "bne 66b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v0.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" 
"uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" "uzp1 v24.2d, v24.2d, v28.2d\n" "uzp1 v25.2d, v25.2d, v29.2d\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v0.16b\n" - "tbnz %x[flags], #31, 85f\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 83f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "ld1r { v23.4s }, [x20]\n" - "neg v23.4s, v23.4s\n" + "ld1r { v3.4s }, [x20]\n" + "neg v3.4s, v3.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v23.4s\n" - "mul v12.4s, v12.4s, v23.4s\n" - "mul v13.4s, v13.4s, v23.4s\n" - "85:" // Height 3: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q30, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "83:" // Height 3: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q29, [x10, #0x20]\n" - "ldr q28, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v23.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], 
%[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v28.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add x28, x28, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "sqrdmulh v31.4s, v31.4s, v23.4s\n" - "sqrdmulh v20.4s, v20.4s, v23.4s\n" - "sqrdmulh v21.4s, v21.4s, v23.4s\n" - "sqrdmulh v22.4s, v22.4s, v23.4s\n" - "sqrdmulh v16.4s, v16.4s, v23.4s\n" - "sqrdmulh v17.4s, v17.4s, v23.4s\n" - "sqrdmulh v18.4s, v18.4s, v23.4s\n" - "sqrdmulh v19.4s, v19.4s, v23.4s\n" - "sqrdmulh v24.4s, v24.4s, v23.4s\n" - "sqrdmulh v25.4s, v25.4s, v23.4s\n" - "sqrdmulh v26.4s, v26.4s, v23.4s\n" - "sqrdmulh v27.4s, v27.4s, v23.4s\n" - "tbz %x[flags], #5, 86f\n" - "and v1.16b, v31.16b, v0.16b\n" - "and v30.16b, v20.16b, v0.16b\n" - "and v29.16b, v21.16b, v0.16b\n" - "and v28.16b, v22.16b, v0.16b\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v3.16b, v17.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "and v2.16b, v18.16b, v0.16b\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "sqadd v20.4s, v20.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "sqadd v22.4s, v22.4s, v28.4s\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "and v1.16b, v19.16b, v0.16b\n" - "and v30.16b, v24.16b, v0.16b\n" - "and v29.16b, v25.16b, v0.16b\n" - 
"and v28.16b, v26.16b, v0.16b\n" - "and v23.16b, v27.16b, v0.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v18.4s, v18.4s, v2.4s\n" - "sqadd v19.4s, v19.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v30.4s\n" - "sqadd v25.4s, v25.4s, v29.4s\n" - "sqadd v26.4s, v26.4s, v28.4s\n" - "sqadd v27.4s, v27.4s, v23.4s\n" - "86:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v29.4s }, [x21]\n" - "ld1r { v28.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v23.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v29.4s\n" - "add v20.4s, v20.4s, v29.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v16.4s, v16.4s, v29.4s\n" - "add v17.4s, v17.4s, v29.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, 
v19.4s, v29.4s\n" - "add v24.4s, v24.4s, v29.4s\n" - "add v25.4s, v25.4s, v29.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "smin v31.4s, v31.4s, v28.4s\n" - "smin v20.4s, v20.4s, v28.4s\n" - "smin v21.4s, v21.4s, v28.4s\n" - "smin v22.4s, v22.4s, v28.4s\n" - "smin v16.4s, v16.4s, v28.4s\n" - "smin v17.4s, v17.4s, v28.4s\n" - "smin v18.4s, v18.4s, v28.4s\n" - "smin v19.4s, v19.4s, v28.4s\n" - "smin v24.4s, v24.4s, v28.4s\n" - "smin v25.4s, v25.4s, v28.4s\n" - "smin v26.4s, v26.4s, v28.4s\n" - "smin v27.4s, v27.4s, v28.4s\n" - "smax v31.4s, v31.4s, v23.4s\n" - "smax v20.4s, v20.4s, v23.4s\n" - "smax v21.4s, v21.4s, v23.4s\n" - "smax v22.4s, v22.4s, v23.4s\n" - "smax v16.4s, v16.4s, v23.4s\n" - "smax v17.4s, v17.4s, v23.4s\n" - "smax v18.4s, v18.4s, v23.4s\n" - "smax v19.4s, v19.4s, v23.4s\n" - "smax v24.4s, v24.4s, v23.4s\n" - "smax v25.4s, v25.4s, v23.4s\n" - "smax v26.4s, v26.4s, v23.4s\n" - "smax v27.4s, v27.4s, v23.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, 
v19.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 95f\n" - "tbz x9, #3, 90f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x10, #3, 87f\n" "str d31, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x9, #2, 88f\n" + "str d16, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x10, #2, 85f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x9, #1, 87f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x10, #1, 84f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 94f\n" - "87:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "b 91f\n" + "84:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 91f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 94f\n" - "88:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x9, #1, 89f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 86f\n" "st1 { v31.h }[4], 
[x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 94f\n" - "89:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "b 91f\n" + "86:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 91f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 94f\n" - "90:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x9, #2, 92f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 89f\n" "str s31, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x9, #1, 91f\n" + "str s16, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x10, #1, 88f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 94f\n" - "91:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "b 91f\n" + "88:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 91f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 94f\n" - "92:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x9, #1, 93f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 90f\n" "str h31, [x27], #0x2\n" - "str h16, [x24], 
#0x2\n" - "str h24, [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "str h16, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 94f\n" - "93:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "94:" // Height 3: Partial direct writeback: Done - "b 96f\n" - "95:" // Height 3: Full writeback + "str b16, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "96:" // Height 3: Writeback done - "subs x9, x9, #0x10\n" - "bgt 66b\n" - "b 130f\n" - "97:" // Height 4 + "str q16, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "93:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 64b\n" + "b 126f\n" + "94:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "movi v15.16b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "98:" // Height 4: Column loop + "95:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1446,54 +1361,53 @@ void a64_hybrid_s8qa_mmla_4x16 ( 
"movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "99:" // Height 4: setup done "mov x26, #0x0\n" - "100:" // Height 4: String loop + "97:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 101f\n" + "tbz %x[flags], #3, 98f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 102f\n" + "cbnz x26, 99f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 102f\n" - "101:" // Height 4: setup direct input + "b 99f\n" + "98:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "102:" // Height 4: input setup done + "99:" // Height 4: input setup done "cmp x25, #0x10\n" - "blt 107f\n" + "blt 104f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "blt 105f\n" - "103:" // Height 4: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "blt 102f\n" + "100:" // Height 4: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" @@ -1501,29 +1415,29 @@ void 
a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" + "ldr q5, [x9, #0xe0]\n" ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" @@ -1538,37 +1452,37 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" - "tbnz %x[flags], #31, 104f\n" + "tbnz %x[flags], #31, 101f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, 
v15.16b\n" ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" - "104:" // Height 4: Multiply loop: unique 13: skip row sum + "101:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "bge 103b\n" - "105:" // Height 4: Multiply loop: Single iteration only + "bge 100b\n" + "102:" // Height 4: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" @@ -1577,29 +1491,29 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, 
v0.16b, v10.16b\n" ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" + "ldr q5, [x9, #0xe0]\n" ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" @@ -1614,299 +1528,249 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" - "tbnz %x[flags], #31, 106f\n" + "tbnz %x[flags], #31, 103f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n" - "106:" // Height 4: Multiply loop: unique 14: skip row sum + "103:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "107:" // Height 4: Multiply loop: Main loop skip - "cbz x25, 116f\n" + "104:" // Height 4: Multiply loop: Main loop skip + "cbz x25, 113f\n" "cmp x25, #0x8\n" - "blt 110f\n" - "108:" // Height 4: Multiply loop: Odd block loop - "ldr d3, [x24], #0x8\n" - "ldr d0, [x23], #0x8\n" - "ldr d2, [x22], #0x8\n" - "ldr d1, [x21], #0x8\n" - "trn1 v0.2d, v3.2d, v0.2d\n" - "trn1 v2.2d, v2.2d, v1.2d\n" - "tbnz %x[flags], #31, 109f\n" + "blt 
107f\n" + "105:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "ldr d7, [x21], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 106f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "109:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" + "106:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n" + ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n" ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" - ".inst 
0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" - "bge 108b\n" - "110:" // Height 4: Multiply loop: Skip odd blocks - "cbz x25, 116f\n" - "tbz x25, #2, 112f\n" + ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n" + "bge 105b\n" + "107:" // Height 4: Multiply loop: Skip odd blocks + "cbz x25, 113f\n" + "tbz x25, #2, 109f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" "ldr s3, [x22], #0x4\n" "ldr s9, [x21], #0x4\n" - "tbz x25, #1, 111f\n" + "tbz x25, #1, 108f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" "ld1 { v3.h }[2], [x22], #0x2\n" "ld1 { v9.h }[2], [x21], #0x2\n" - "tbz x25, #0, 114f\n" + "tbz x25, #0, 111f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" "ld1 { v3.b }[6], [x22]\n" "ld1 { v9.b }[6], [x21]\n" - "b 114f\n" - "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 114f\n" + "b 111f\n" + "108:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 111f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" "ld1 { v3.b }[4], [x22]\n" "ld1 { v9.b }[4], [x21]\n" - "b 114f\n" - "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 113f\n" + "b 111f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 110f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" "ldr h3, [x22], #0x2\n" "ldr h9, [x21], #0x2\n" - "tbz x25, #0, 114f\n" + "tbz x25, #0, 111f\n" "ld1 { v1.b 
}[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" "ld1 { v3.b }[2], [x22]\n" "ld1 { v9.b }[2], [x21]\n" - "b 114f\n" - "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 111f\n" + "110:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" "ldr b3, [x22, #0x0]\n" "ldr b9, [x21, #0x0]\n" - "114:" // Height 4: Multiply loop: Ragged operand read: Done + "111:" // Height 4: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" "trn1 v2.2d, v3.2d, v9.2d\n" - "tbnz %x[flags], #31, 115f\n" + "tbnz %x[flags], #31, 112f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "115:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88a414 // smmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45c // smmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + "112:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" ".inst 0x4e86a415 // smmla 
v21.4s, v0.16b, v6.16b\n" ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" - ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" - ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" - "116:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n" + ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n" + ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n" + "113:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 100b\n" + "bne 97b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v0.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" "uzp1 v23.2d, v24.2d, v28.2d\n" "uzp2 v24.2d, v24.2d, v28.2d\n" - "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x24, #0x0]\n" "uzp1 v28.2d, 
v25.2d, v29.2d\n" "uzp2 v25.2d, v25.2d, v29.2d\n" "uzp1 v29.2d, v26.2d, v30.2d\n" "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v0.16b\n" - "tbnz %x[flags], #31, 117f\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 114f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "ld1r { v0.4s }, [x20]\n" - "neg v0.4s, v0.4s\n" + "ld1r { v4.4s }, [x20]\n" + "neg v4.4s, v4.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v14.4s, v13.s[3]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "117:" // Height 4: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q4, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "114:" // Height 4: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q3, [x10, #0x20]\n" - "ldr q2, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v23.4s, v23.4s, v13.4s\n" "add v28.4s, v28.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v29.4s, v29.4s, v13.4s\n" "add v30.4s, v30.4s, v13.4s\n" "add v24.4s, v24.4s, v14.4s\n" "add v25.4s, v25.4s, v14.4s\n" + 
"cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" "add v26.4s, v26.4s, v14.4s\n" "add v27.4s, v27.4s, v14.4s\n" + "add x28, x28, #0x40\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v2.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v2.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v23.4s, v23.4s, v0.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v2.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v2.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "tbz %x[flags], #5, 118f\n" - "and v2.16b, v31.16b, v0.16b\n" - "and v1.16b, v20.16b, v0.16b\n" - "and v7.16b, v21.16b, v0.16b\n" - "and v6.16b, v22.16b, v0.16b\n" - "and v5.16b, v16.16b, v0.16b\n" - "and v4.16b, v17.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v3.16b, v18.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd 
v31.4s, v31.4s, v2.4s\n" - "sqadd v20.4s, v20.4s, v1.4s\n" - "and v2.16b, v19.16b, v0.16b\n" - "and v1.16b, v23.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v7.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v5.4s\n" - "sqadd v17.4s, v17.4s, v4.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "and v7.16b, v28.16b, v0.16b\n" - "sqadd v19.4s, v19.4s, v2.4s\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "and v6.16b, v29.16b, v0.16b\n" - "and v5.16b, v30.16b, v0.16b\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v3.16b, v25.16b, v0.16b\n" - "and v2.16b, v26.16b, v0.16b\n" - "and v1.16b, v27.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v7.4s\n" - "sqadd v29.4s, v29.4s, v6.4s\n" - "sqadd v30.4s, v30.4s, v5.4s\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v3.4s\n" - "sqadd v26.4s, v26.4s, v2.4s\n" - "sqadd v27.4s, v27.4s, v1.4s\n" - "118:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl 
v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v3.4s }, [x21]\n" - "ld1r { v2.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v1.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" @@ -1917,178 +1781,178 @@ void a64_hybrid_s8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "smin v31.4s, v31.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - "smin v29.4s, v29.4s, v2.4s\n" - "smin v30.4s, v30.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" - "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smax v31.4s, v31.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - "smax v29.4s, v29.4s, 
v1.4s\n" - "smax v30.4s, v30.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v18.8h, v29.8h, v30.8h\n" + "uzp1 v28.8h, v29.8h, 
v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v23.16b, v23.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 127f\n" - "tbz x9, #3, 122f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 123f\n" + "tbz x10, #3, 118f\n" "str d31, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "str d23, [x23], #0x8\n" - "str d24, [x22], #0x8\n" - "tbz x9, #2, 120f\n" + "str d16, [x26], #0x8\n" + "str d23, [x25], #0x8\n" + "str d24, [x24], #0x8\n" + "tbz x10, #2, 116f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v23.s }[2], [x23], #0x4\n" - "st1 { v24.s }[2], [x22], #0x4\n" - "tbz x9, #1, 119f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x24], #0x4\n" + "tbz x10, #1, 115f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v23.h }[6], [x23], #0x2\n" - "st1 { v24.h }[6], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v23.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v23.b }[14], [x23]\n" - "st1 { v24.b }[14], [x22]\n" - "b 126f\n" - "119:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v23.b }[14], [x25]\n" + "st1 { v24.b }[14], [x24]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 122f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v23.b }[12], [x23]\n" - "st1 { v24.b }[12], [x22]\n" - "b 126f\n" - "120:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x9, #1, 121f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v23.b }[12], [x25]\n" + "st1 { v24.b }[12], [x24]\n" + "b 
122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 117f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v23.h }[4], [x23], #0x2\n" - "st1 { v24.h }[4], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v23.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v23.b }[10], [x23]\n" - "st1 { v24.b }[10], [x22]\n" - "b 126f\n" - "121:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v23.b }[10], [x25]\n" + "st1 { v24.b }[10], [x24]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 122f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v23.b }[8], [x23]\n" - "st1 { v24.b }[8], [x22]\n" - "b 126f\n" - "122:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x9, #2, 124f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v23.b }[8], [x25]\n" + "st1 { v24.b }[8], [x24]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 120f\n" "str s31, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "str s23, [x23], #0x4\n" - "str s24, [x22], #0x4\n" - "tbz x9, #1, 123f\n" + "str s16, [x26], #0x4\n" + "str s23, [x25], #0x4\n" + "str s24, [x24], #0x4\n" + "tbz x10, #1, 119f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v23.h }[2], [x23], #0x2\n" - "st1 { v24.h }[2], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v23.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v23.b }[6], [x23]\n" - "st1 { v24.b }[6], [x22]\n" - "b 126f\n" - "123:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v23.b }[6], 
[x25]\n" + "st1 { v24.b }[6], [x24]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 122f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v23.b }[4], [x23]\n" - "st1 { v24.b }[4], [x22]\n" - "b 126f\n" - "124:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x9, #1, 125f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v23.b }[4], [x25]\n" + "st1 { v24.b }[4], [x24]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 121f\n" "str h31, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "str h23, [x23], #0x2\n" - "str h24, [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "str h16, [x26], #0x2\n" + "str h23, [x25], #0x2\n" + "str h24, [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v23.b }[2], [x23]\n" - "st1 { v24.b }[2], [x22]\n" - "b 126f\n" - "125:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "st1 { v23.b }[2], [x25]\n" + "st1 { v24.b }[2], [x24]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b23, [x23, #0x0]\n" - "str b24, [x22, #0x0]\n" - "126:" // Height 4: Partial direct writeback: Done - "b 128f\n" - "127:" // Height 4: Full writeback + "str b16, [x26, #0x0]\n" + "str b23, [x25, #0x0]\n" + "str b24, [x24, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "str q23, [x23, #0x0]\n" - "str q24, [x22, #0x0]\n" - "128:" // Height 4: Writeback done - "subs x9, x9, #0x10\n" - "bgt 98b\n" + "str q16, [x26, #0x0]\n" + "str q23, [x25, #0x0]\n" + "str q24, [x24, #0x0]\n" + "124:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 95b\n" "subs %x[M], %x[M], #0x4\n" - "beq 130f\n" + "beq 126f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - 
"tbz %x[flags], #3, 129f\n" + "tbz %x[flags], #3, 125f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "129:" // Update direct input + "125:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "130:" // Exit + "126:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp index 559b492871..eddcf2cef9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include 
"../../utils.hpp" - #include #include @@ -81,21 +80,18 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ka.multiplier_ptr=qp->per_channel_muls + col_base; ka.shift_ptr=qp->per_channel_right_shifts + col_base; } - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 136f\n" + "bge 131f\n" "cmp %x[M], #0x4\n" - "bgt 109f\n" - "beq 82f\n" + "bgt 105f\n" + "beq 79f\n" "cmp %x[M], #0x2\n" - "bgt 55f\n" - "beq 28f\n" - "mov x6, %x[col_bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "bgt 53f\n" + "beq 27f\n" + "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x7, %x[col_bias]\n" "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" @@ -105,7 +101,6 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" "movi v11.4s, #0x0\n" - "3:" // Height 1: setup done "mov x14, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -131,71 +126,71 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "blt 8f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr d17, [x16, #0x20]\n" + "ldr d6, [x16, #0x20]\n" "ldr x21, [x16, #0x28]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr d16, [x16, #0x30]\n" + "ldr d7, [x16, #0x30]\n" "add x12, x12, #0x10\n" "ldr x20, [x16, #0x38]\n" "sub x13, x13, #0x10\n" - "mov v17.d[1], x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x48]\n" "ldr x22, [x12, #0x8]\n" "cmp x13, #0x20\n" - "mov v16.d[1], x20\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - "ldr d17, [x16, #0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - "ldr d16, [x16, #0x50]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr d6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x16, #0x50]\n" 
"ldr x20, [x16, #0x58]\n" - "mov v17.d[1], x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x68]\n" "prfm pldl1keep, [x12, #0x80]\n" - "mov v16.d[1], x20\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - "ldr d17, [x16, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - "ldr d16, [x16, #0x70]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x16, #0x70]\n" "ldr x20, [x16, #0x78]\n" - "mov v17.d[1], x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x88]\n" - "mov v16.d[1], x20\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - "ldr d17, [x16, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - "ldr d16, [x16, #0x90]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x16, #0x90]\n" "ldr x20, [x16, #0x98]\n" - "mov v17.d[1], x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0xa8]\n" - "mov v16.d[1], x20\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - "ldr d17, [x16, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - "ldr d16, [x16, #0xb0]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x16, #0xb0]\n" "ldr x20, [x16, #0xb8]\n" - "mov v17.d[1], x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0xc8]\n" - "mov v16.d[1], x20\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - "ldr d17, [x16, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - "ldr d16, [x16, #0xd0]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x16, #0xd0]\n" "ldr x20, [x16, #0xd8]\n" - "mov v17.d[1], 
x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0xe8]\n" - "mov v16.d[1], x20\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - "ldr d17, [x16, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - "ldr d16, [x16, #0xf0]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr d7, [x16, #0xf0]\n" "ldr x20, [x16, #0xf8]\n" "add x16, x16, #0x100\n" - "mov v17.d[1], x21\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x8]\n" - "mov v16.d[1], x20\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" "ldr d7, [x16, #0x10]\n" "ldr x20, [x16, #0x18]\n" @@ -205,56 +200,56 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "bge 7b\n" "8:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q17, [x16, #0x20]\n" + "ldr q6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q16, [x16, #0x30]\n" + "ldr q7, [x16, #0x30]\n" "add x12, x12, #0x10\n" "sub x13, x13, #0x10\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - "ldr q17, [x16, #0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - "ldr q16, [x16, #0x50]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x50]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - "ldr q17, [x16, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - "ldr q16, [x16, #0x70]\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - "ldr q17, [x16, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - 
"ldr q16, [x16, #0x90]\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - "ldr q17, [x16, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - "ldr q16, [x16, #0xb0]\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - "ldr q17, [x16, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - "ldr q16, [x16, #0xd0]\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - "ldr q17, [x16, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - "ldr q16, [x16, #0xf0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x16, #0xf0]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "9:" // Height 1: Multiply loop: Main loop skip "cbz x13, 14f\n" "cmp x13, #0x4\n" "blt 11f\n" "10:" // Height 1: Multiply loop: Odd block loop - "ldr s18, [x12], #0x4\n" + "ldr s0, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr q17, [x16, #0x0]\n" + "ldr q6, [x16, #0x0]\n" "cmp x13, #0x4\n" - "ldr q16, [x16, #0x10]\n" - ".inst 0x4f92e228 // sdot v8.4s, v17.16b, v18.4b[0]\n" - "ldr q17, 
[x16, #0x20]\n" - ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n" - "ldr q16, [x16, #0x30]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x30]\n" "add x16, x16, #0x40\n" - ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n" - ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "bge 10b\n" "11:" // Height 1: Multiply loop: Skip odd blocks "cbz x13, 14f\n" @@ -266,41 +261,41 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "13:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q17, [x16, #0x0]\n" - "ldr q16, [x16, #0x10]\n" - ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" - "ldr q17, [x16, #0x20]\n" - ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" - "ldr q16, [x16, #0x30]\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x30]\n" "add x16, x16, #0x40\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "14:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 4b\n" - "ldr q19, [x6, #0x0]\n" - "ldr q18, [x6, #0x10]\n" - "ldr q17, [x6, #0x20]\n" - "ldr q16, [x6, #0x30]\n" - "add v8.4s, v8.4s, v19.4s\n" + "ldr q0, [x7, #0x0]\n" + "ldr q1, [x7, #0x10]\n" + "ldr q2, [x7, #0x20]\n" + "ldr q3, [x7, #0x30]\n" + "add v8.4s, v8.4s, v0.4s\n" "prfm pstl1keep, [x15, #0x0]\n" - "add v9.4s, 
v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v17.4s\n" - "add x6, x6, #0x40\n" - "add v11.4s, v11.4s, v16.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add x7, x7, #0x40\n" + "add v11.4s, v11.4s, v3.4s\n" "tbz %x[flags], #4, 15f\n" "ldr q0, [x8, #0x0]\n" - "ldr q4, [x7, #0x0]\n" + "ldr q4, [x6, #0x0]\n" "ldr q1, [x8, #0x10]\n" - "ldr q5, [x7, #0x10]\n" + "ldr q5, [x6, #0x10]\n" "ldr q2, [x8, #0x20]\n" - "ldr q6, [x7, #0x20]\n" + "ldr q6, [x6, #0x20]\n" "ldr q3, [x8, #0x30]\n" "add x8, x8, #0x40\n" - "ldr q7, [x7, #0x30]\n" - "add x7, x7, #0x40\n" + "ldr q7, [x6, #0x30]\n" + "add x6, x6, #0x40\n" "b 16f\n" "15:" // Height 1: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" @@ -314,111 +309,97 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" "16:" // Height 1: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "tbz %x[flags], #5, 17f\n" - "and v19.16b, v8.16b, v0.16b\n" - "and v18.16b, v9.16b, v1.16b\n" - "and v17.16b, v10.16b, v2.16b\n" - "and v16.16b, v11.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v19.4s\n" - "sqadd v9.4s, v9.4s, v18.4s\n" - "sqadd v10.4s, v10.4s, v17.4s\n" - "sqadd v11.4s, v11.4s, v16.4s\n" - "17:" // Height 1: no shift correction - "srshl v8.4s, v8.4s, v0.4s\n" - "srshl v9.4s, v9.4s, v1.4s\n" - "srshl v10.4s, v10.4s, v2.4s\n" - "srshl v11.4s, v11.4s, v3.4s\n" + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[maxval]\n" - "ld1r { v18.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v17.4s }, [x21]\n" + "ld1r { v6.4s }, [x21]\n" "cmp x17, #0x10\n" - "ld1r { v16.4s 
}, [x20]\n" - "add v8.4s, v8.4s, v18.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v18.4s\n" - "add v11.4s, v11.4s, v18.4s\n" - "smin v8.4s, v8.4s, v17.4s\n" - "smin v9.4s, v9.4s, v17.4s\n" - "smin v10.4s, v10.4s, v17.4s\n" - "smin v11.4s, v11.4s, v17.4s\n" - "smax v8.4s, v8.4s, v16.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" + "ld1r { v5.4s }, [x20]\n" + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v16.8h, v10.8h, v11.8h\n" - "uzp1 v8.16b, v8.16b, v16.16b\n" - "bge 26f\n" - "tbz x17, #3, 21f\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "bge 25f\n" + "tbz x17, #3, 20f\n" "str d8, [x15], #0x8\n" - "tbz x17, #2, 19f\n" + "tbz x17, #2, 18f\n" "st1 { v8.s }[2], [x15], #0x4\n" - "tbz x17, #1, 18f\n" + "tbz x17, #1, 17f\n" "st1 { v8.h }[6], [x15], #0x2\n" - "tbz x17, #0, 25f\n" + "tbz x17, #0, 24f\n" "st1 { v8.b }[14], [x15]\n" - "b 25f\n" - "18:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x17, #0, 25f\n" + "b 24f\n" + "17:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x17, #0, 24f\n" "st1 { v8.b }[12], [x15]\n" - "b 25f\n" - "19:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x17, #1, 20f\n" + "b 24f\n" + "18:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x17, #1, 19f\n" "st1 { v8.h }[4], [x15], #0x2\n" - "tbz x17, #0, 25f\n" + "tbz x17, #0, 24f\n" "st1 { v8.b }[10], [x15]\n" - "b 25f\n" - "20:" // Height 1: Partial direct 
writeback: partial_1_8 - "tbz x17, #0, 25f\n" + "b 24f\n" + "19:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x17, #0, 24f\n" "st1 { v8.b }[8], [x15]\n" - "b 25f\n" - "21:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x17, #2, 23f\n" + "b 24f\n" + "20:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x17, #2, 22f\n" "str s8, [x15], #0x4\n" - "tbz x17, #1, 22f\n" + "tbz x17, #1, 21f\n" "st1 { v8.h }[2], [x15], #0x2\n" - "tbz x17, #0, 25f\n" + "tbz x17, #0, 24f\n" "st1 { v8.b }[6], [x15]\n" - "b 25f\n" - "22:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x17, #0, 25f\n" + "b 24f\n" + "21:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x17, #0, 24f\n" "st1 { v8.b }[4], [x15]\n" - "b 25f\n" - "23:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x17, #1, 24f\n" + "b 24f\n" + "22:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x17, #1, 23f\n" "str h8, [x15], #0x2\n" - "tbz x17, #0, 25f\n" + "tbz x17, #0, 24f\n" "st1 { v8.b }[2], [x15]\n" - "b 25f\n" - "24:" // Height 1: Partial direct writeback: partial_1_0 + "b 24f\n" + "23:" // Height 1: Partial direct writeback: partial_1_0 "str b8, [x15, #0x0]\n" - "25:" // Height 1: Partial direct writeback: Done - "b 27f\n" - "26:" // Height 1: Full writeback + "24:" // Height 1: Partial direct writeback: Done + "b 26f\n" + "25:" // Height 1: Full writeback "str q8, [x15, #0x0]\n" "add x15, x15, #0x10\n" - "27:" // Height 1: Writeback done + "26:" // Height 1: Writeback done "subs x17, x17, #0x10\n" "bgt 2b\n" - "b 164f\n" - "28:" // Height 2 - "mov x6, %x[col_bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "b 158f\n" + "27:" // Height 2 + "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x7, %x[col_bias]\n" "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "29:" // Height 2: 
Column loop + "28:" // Height 2: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -427,121 +408,120 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v13.4s, #0x0\n" "movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "30:" // Height 2: setup done "mov x14, #0x0\n" - "31:" // Height 2: String loop + "30:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "tbz %x[flags], #3, 32f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x12, [x20, #0x0]\n" "ldr x11, [x20, #0x8]\n" - "cbnz x14, 33f\n" + "cbnz x14, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" "add x11, x11, x20\n" - "b 33f\n" - "32:" // Height 2: setup direct input + "b 32f\n" + "31:" // Height 2: setup direct input "mov x12, %x[input_ptr]\n" "add x11, x12, x21\n" - "33:" // Height 2: input setup done + "32:" // Height 2: input setup done "cmp x13, #0x10\n" - "blt 36f\n" + "blt 35f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" "ldr q1, [x11, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" - "blt 35f\n" - "34:" // Height 2: Multiply loop: Main loop head + "blt 34f\n" + "33:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "ldr x20, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr d17, [x16, #0x20]\n" + "ldr d6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "ldr x21, [x16, #0x38]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr d16, [x16, #0x30]\n" - "mov v17.d[1], x20\n" + "ldr d7, [x16, #0x30]\n" + "mov v6.d[1], x20\n" "ldr x20, [x16, #0x48]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" - "mov v16.d[1], x21\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" - "ldr d17, [x16, 
#0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "mov v7.d[1], x21\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr d6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "ldr x21, [x16, #0x58]\n" - ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" - "ldr d16, [x16, #0x50]\n" - "mov v17.d[1], x20\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr d7, [x16, #0x50]\n" + "mov v6.d[1], x20\n" "ldr x20, [x16, #0x68]\n" "ldr x23, [x12, #0x8]\n" "sub x13, x13, #0x10\n" - "mov v16.d[1], x21\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" - "ldr d17, [x16, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr d6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" "ldr x21, [x16, #0x78]\n" - ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" - "ldr d16, [x16, #0x70]\n" - "mov v17.d[1], x20\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr d7, [x16, #0x70]\n" + "mov v6.d[1], x20\n" "ldr x20, [x16, #0x88]\n" "ldr x22, [x11, #0x8]\n" "cmp x13, #0x20\n" - "mov v16.d[1], x21\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" - "ldr d17, [x16, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr d6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" "ldr x21, [x16, #0x98]\n" - ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" - "ldr d16, [x16, #0x90]\n" - "mov v17.d[1], x20\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr d7, [x16, 
#0x90]\n" + "mov v6.d[1], x20\n" "ldr x20, [x16, #0xa8]\n" "prfm pldl1keep, [x12, #0x80]\n" - "mov v16.d[1], x21\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" - "ldr d17, [x16, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "mov v7.d[1], x21\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr d6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" "ldr x21, [x16, #0xb8]\n" - ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" - "ldr d16, [x16, #0xb0]\n" - "mov v17.d[1], x20\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr d7, [x16, #0xb0]\n" + "mov v6.d[1], x20\n" "ldr x20, [x16, #0xc8]\n" "prfm pldl1keep, [x11, #0x80]\n" - "mov v16.d[1], x21\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" - "ldr d17, [x16, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "mov v7.d[1], x21\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr d6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" "ldr x21, [x16, #0xd8]\n" - ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" - "ldr d16, [x16, #0xd0]\n" - "mov v17.d[1], x20\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr d7, [x16, #0xd0]\n" + "mov v6.d[1], x20\n" "ldr x20, [x16, #0xe8]\n" - "mov v16.d[1], x21\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" - "ldr d17, [x16, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr d6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" "ldr x21, [x16, #0xf8]\n" 
- ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" - "ldr d16, [x16, #0xf0]\n" - "mov v17.d[1], x20\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr d7, [x16, #0xf0]\n" + "mov v6.d[1], x20\n" "add x16, x16, #0x100\n" "ldr x20, [x16, #0x8]\n" - "mov v16.d[1], x21\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" "ldr d1, [x11, #0x0]\n" "ldr d7, [x16, #0x10]\n" "mov v6.d[1], x20\n" @@ -549,144 +529,144 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v0.d[1], x23\n" "mov v1.d[1], x22\n" "mov v7.d[1], x20\n" - "bge 34b\n" - "35:" // Height 2: Multiply loop: Single iteration only + "bge 33b\n" + "34:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q17, [x16, #0x20]\n" + "ldr q6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x11, x11, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q16, [x16, #0x30]\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "sub x13, x13, #0x10\n" - ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" - "ldr q17, [x16, #0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4f81e20f // sdot 
v15.4s, v16.16b, v1.4b[0]\n" - "ldr q16, [x16, #0x50]\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x16, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" "prfm pldl1keep, [x11, #0x80]\n" - ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" - "ldr q17, [x16, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" - "ldr q16, [x16, #0x70]\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" - "ldr q17, [x16, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" - "ldr q16, [x16, #0x90]\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" - "ldr q17, [x16, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" - "ldr q16, [x16, #0xb0]\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" - "ldr q17, [x16, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" - "ldr q16, [x16, #0xd0]\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" - "ldr q17, [x16, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" - "ldr q16, [x16, #0xf0]\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // 
sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x16, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" - ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" - "36:" // Height 2: Multiply loop: Main loop skip - "cbz x13, 41f\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "35:" // Height 2: Multiply loop: Main loop skip + "cbz x13, 40f\n" "cmp x13, #0x4\n" - "blt 38f\n" - "37:" // Height 2: Multiply loop: Odd block loop - "ldr s19, [x12], #0x4\n" + "blt 37f\n" + "36:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s18, [x11], #0x4\n" + "ldr s1, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr q17, [x16, #0x0]\n" - "ldr q16, [x16, #0x10]\n" - ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n" - 
".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n" - "ldr q17, [x16, #0x20]\n" - ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n" - ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n" - "ldr q16, [x16, #0x30]\n" - ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n" - ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n" - ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n" - "bge 37b\n" - "38:" // Height 2: Multiply loop: Skip odd blocks - "cbz x13, 41f\n" - "tbz x13, #1, 39f\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 36b\n" + "37:" // Height 2: Multiply loop: Skip odd blocks + "cbz x13, 40f\n" + "tbz x13, #1, 38f\n" "ldr h0, [x12], #0x2\n" "ldr h1, [x11], #0x2\n" - "tbz x13, #0, 40f\n" + "tbz x13, #0, 39f\n" "ld1 { v0.b }[2], [x12]\n" "ld1 { v1.b }[2], [x11]\n" - "b 40f\n" - "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 39f\n" + "38:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "ldr b1, [x11, #0x0]\n" - "40:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q17, [x16, #0x0]\n" - "ldr q16, [x16, #0x10]\n" - ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n" - "ldr q17, [x16, #0x20]\n" - ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" - ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n" - "ldr q16, [x16, 
#0x30]\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "39:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" - "41:" // Height 2: Multiply loop: No odd multiplies + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "40:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" - "bne 31b\n" - "ldr q19, [x6, #0x0]\n" - "ldr q18, [x6, #0x10]\n" - "ldr q17, [x6, #0x20]\n" - "ldr q16, [x6, #0x30]\n" - "add v8.4s, v8.4s, v19.4s\n" + "bne 30b\n" + "ldr q0, [x7, #0x0]\n" + "ldr q1, [x7, #0x10]\n" + "ldr q2, [x7, #0x20]\n" + "ldr q3, [x7, #0x30]\n" + "add v8.4s, v8.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add v9.4s, v9.4s, v18.4s\n" + "add v9.4s, v9.4s, v1.4s\n" "prfm pstl1keep, [x15, #0x0]\n" - "add v10.4s, v10.4s, v17.4s\n" - "add v11.4s, v11.4s, v16.4s\n" - "add v12.4s, v12.4s, v19.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v12.4s, v12.4s, v0.4s\n" "add x26, x15, x20\n" - "add v13.4s, v13.4s, v18.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "prfm pstl1keep, [x26, #0x0]\n" - "add v14.4s, v14.4s, v17.4s\n" - "add v15.4s, v15.4s, v16.4s\n" - "add x6, x6, #0x40\n" - "tbz %x[flags], #4, 42f\n" + "add v14.4s, v14.4s, v2.4s\n" + "add 
v15.4s, v15.4s, v3.4s\n" + "add x7, x7, #0x40\n" + "tbz %x[flags], #4, 41f\n" "ldr q0, [x8, #0x0]\n" - "ldr q4, [x7, #0x0]\n" + "ldr q4, [x6, #0x0]\n" "ldr q1, [x8, #0x10]\n" - "ldr q5, [x7, #0x10]\n" + "ldr q5, [x6, #0x10]\n" "ldr q2, [x8, #0x20]\n" - "ldr q6, [x7, #0x20]\n" + "ldr q6, [x6, #0x20]\n" "ldr q3, [x8, #0x30]\n" "add x8, x8, #0x40\n" - "ldr q7, [x7, #0x30]\n" - "add x7, x7, #0x40\n" - "b 43f\n" - "42:" // Height 2: per layer parameters + "ldr q7, [x6, #0x30]\n" + "add x6, x6, #0x40\n" + "b 42f\n" + "41:" // Height 2: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -697,42 +677,20 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "43:" // Height 2: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "tbz %x[flags], #5, 44f\n" - "and v19.16b, v8.16b, v0.16b\n" - "and v18.16b, v9.16b, v1.16b\n" - "and v17.16b, v10.16b, v2.16b\n" - "and v16.16b, v11.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v19.4s\n" - "sqadd v9.4s, v9.4s, v18.4s\n" - "sqadd v10.4s, v10.4s, v17.4s\n" - "sqadd v11.4s, v11.4s, v16.4s\n" - "and v19.16b, v12.16b, v0.16b\n" - "and v18.16b, v13.16b, v1.16b\n" - "and v17.16b, v14.16b, v2.16b\n" - "and v16.16b, v15.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v19.4s\n" - "sqadd v13.4s, v13.4s, v18.4s\n" - "sqadd v14.4s, v14.4s, v17.4s\n" - "sqadd v15.4s, v15.4s, v16.4s\n" - "44:" // Height 2: no 
shift correction + "42:" // Height 2: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v4.4s }, [x21]\n" "srshl v8.4s, v8.4s, v0.4s\n" + "ld1r { v6.4s }, [x20]\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" @@ -740,120 +698,116 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v18.4s }, [x20]\n" + "add v8.4s, v8.4s, v4.4s\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v17.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" "cmp x17, #0x10\n" - "ld1r { v16.4s }, [x20]\n" - "add v8.4s, v8.4s, v18.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v18.4s\n" - "add v11.4s, v11.4s, v18.4s\n" - "add v12.4s, v12.4s, v18.4s\n" - "add v13.4s, v13.4s, v18.4s\n" - "add v14.4s, v14.4s, v18.4s\n" - "add v15.4s, v15.4s, v18.4s\n" - "smin v8.4s, v8.4s, v17.4s\n" - "smin v9.4s, v9.4s, v17.4s\n" - "smin v10.4s, v10.4s, v17.4s\n" - "smin v11.4s, v11.4s, v17.4s\n" - "smin v12.4s, v12.4s, v17.4s\n" - "smin v13.4s, v13.4s, v17.4s\n" - "smin v14.4s, v14.4s, v17.4s\n" - "smin v15.4s, v15.4s, v17.4s\n" - "smax v8.4s, v8.4s, v16.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "smax v12.4s, v12.4s, v16.4s\n" - "smax v13.4s, v13.4s, v16.4s\n" - "smax 
v14.4s, v14.4s, v16.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v17.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v16.8h, v14.8h, v15.8h\n" - "uzp1 v8.16b, v8.16b, v17.16b\n" - "uzp1 v12.16b, v12.16b, v16.16b\n" - "bge 53f\n" - "tbz x17, #3, 48f\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "bge 51f\n" + "tbz x17, #3, 46f\n" "str d8, [x15], #0x8\n" "str d12, [x26], #0x8\n" - "tbz x17, #2, 46f\n" + "tbz x17, #2, 44f\n" "st1 { v8.s }[2], [x15], #0x4\n" "st1 { v12.s }[2], [x26], #0x4\n" - "tbz x17, #1, 45f\n" + "tbz x17, #1, 43f\n" "st1 { v8.h }[6], [x15], #0x2\n" "st1 { v12.h }[6], [x26], #0x2\n" - "tbz x17, #0, 52f\n" + "tbz x17, #0, 50f\n" "st1 { v8.b }[14], [x15]\n" "st1 { v12.b }[14], [x26]\n" - "b 52f\n" - "45:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x17, #0, 52f\n" + "b 50f\n" + "43:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x17, #0, 50f\n" "st1 { v8.b }[12], [x15]\n" "st1 { v12.b }[12], [x26]\n" - "b 52f\n" - "46:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x17, #1, 47f\n" + "b 50f\n" + "44:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x17, #1, 45f\n" "st1 { v8.h }[4], [x15], #0x2\n" "st1 { v12.h }[4], [x26], #0x2\n" - "tbz x17, #0, 52f\n" + "tbz x17, #0, 50f\n" "st1 { v8.b }[10], [x15]\n" "st1 { v12.b }[10], [x26]\n" - "b 52f\n" - "47:" // Height 2: Partial direct 
writeback: partial_1_8 - "tbz x17, #0, 52f\n" + "b 50f\n" + "45:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x17, #0, 50f\n" "st1 { v8.b }[8], [x15]\n" "st1 { v12.b }[8], [x26]\n" - "b 52f\n" - "48:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x17, #2, 50f\n" + "b 50f\n" + "46:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x17, #2, 48f\n" "str s8, [x15], #0x4\n" "str s12, [x26], #0x4\n" - "tbz x17, #1, 49f\n" + "tbz x17, #1, 47f\n" "st1 { v8.h }[2], [x15], #0x2\n" "st1 { v12.h }[2], [x26], #0x2\n" - "tbz x17, #0, 52f\n" + "tbz x17, #0, 50f\n" "st1 { v8.b }[6], [x15]\n" "st1 { v12.b }[6], [x26]\n" - "b 52f\n" - "49:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x17, #0, 52f\n" + "b 50f\n" + "47:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x17, #0, 50f\n" "st1 { v8.b }[4], [x15]\n" "st1 { v12.b }[4], [x26]\n" - "b 52f\n" - "50:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x17, #1, 51f\n" + "b 50f\n" + "48:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x17, #1, 49f\n" "str h8, [x15], #0x2\n" "str h12, [x26], #0x2\n" - "tbz x17, #0, 52f\n" + "tbz x17, #0, 50f\n" "st1 { v8.b }[2], [x15]\n" "st1 { v12.b }[2], [x26]\n" - "b 52f\n" - "51:" // Height 2: Partial direct writeback: partial_1_0 + "b 50f\n" + "49:" // Height 2: Partial direct writeback: partial_1_0 "str b8, [x15, #0x0]\n" "str b12, [x26, #0x0]\n" - "52:" // Height 2: Partial direct writeback: Done - "b 54f\n" - "53:" // Height 2: Full writeback + "50:" // Height 2: Partial direct writeback: Done + "b 52f\n" + "51:" // Height 2: Full writeback "str q8, [x15, #0x0]\n" "add x15, x15, #0x10\n" "str q12, [x26, #0x0]\n" - "54:" // Height 2: Writeback done + "52:" // Height 2: Writeback done "subs x17, x17, #0x10\n" - "bgt 29b\n" - "b 164f\n" - "55:" // Height 3 - "mov x6, %x[col_bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "bgt 28b\n" + "b 158f\n" + "53:" // Height 3 + "ldr x6, [%x[args_ptr], 
%[offsetof_multiplier_ptr]]\n" + "mov x7, %x[col_bias]\n" "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "56:" // Height 3: Column loop + "54:" // Height 3: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -866,146 +820,145 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "57:" // Height 3: setup done "mov x14, #0x0\n" - "58:" // Height 3: String loop + "56:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "tbz %x[flags], #3, 59f\n" + "tbz %x[flags], #3, 57f\n" "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x12, [x20, #0x0]\n" "ldr x11, [x20, #0x8]\n" "ldr x10, [x20, #0x10]\n" - "cbnz x14, 60f\n" + "cbnz x14, 58f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" "add x11, x11, x20\n" "add x10, x10, x20\n" - "b 60f\n" - "59:" // Height 3: setup direct input + "b 58f\n" + "57:" // Height 3: setup direct input "mov x12, %x[input_ptr]\n" "add x11, x12, x21\n" "add x10, x11, x21\n" - "60:" // Height 3: input setup done + "58:" // Height 3: input setup done "cmp x13, #0x10\n" - "blt 63f\n" + "blt 61f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" "ldr q1, [x11, #0x0]\n" "ldr q2, [x10, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" - "blt 62f\n" - "61:" // Height 3: Multiply loop: Main loop head + "blt 60f\n" + "59:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "ldr x20, [x16, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr d21, [x16, #0x20]\n" + "ldr d6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // 
sdot v9.4s, v7.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "add x11, x11, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr d20, [x16, #0x30]\n" - "mov v21.d[1], x21\n" + "ldr d7, [x16, #0x30]\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x48]\n" "add x10, x10, #0x10\n" "ldr x24, [x12, #0x8]\n" - "mov v20.d[1], x20\n" - ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "ldr x20, [x16, #0x58]\n" - ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" - "ldr d21, [x16, #0x40]\n" - ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr d6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "ldr x23, [x11, #0x8]\n" - ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" "ldr x22, [x10, #0x8]\n" - ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" - "ldr d20, [x16, #0x50]\n" - "mov v21.d[1], x21\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr d7, [x16, #0x50]\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x68]\n" "sub x13, x13, #0x10\n" "prfm pldl1keep, [x12, #0x80]\n" - "mov v20.d[1], x20\n" - ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" "ldr x20, [x16, #0x78]\n" - ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" - "ldr d21, [x16, #0x60]\n" - ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr d6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" "cmp x13, 
#0x20\n" - ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" "prfm pldl1keep, [x11, #0x80]\n" - ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" - "ldr d20, [x16, #0x70]\n" - "mov v21.d[1], x21\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x16, #0x70]\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0x88]\n" "prfm pldl1keep, [x10, #0x80]\n" - "mov v20.d[1], x20\n" - ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" "ldr x20, [x16, #0x98]\n" - ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" - "ldr d21, [x16, #0x80]\n" - ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" - "ldr d20, [x16, #0x90]\n" - "mov v21.d[1], x21\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr d6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x16, #0x90]\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0xa8]\n" - "mov v20.d[1], x20\n" - ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" "ldr x20, [x16, #0xb8]\n" - ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" - "ldr d21, [x16, #0xa0]\n" - ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" - "ldr d20, [x16, #0xb0]\n" - "mov v21.d[1], x21\n" + ".inst 0x4f82e8d0 // sdot 
v16.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x16, #0xb0]\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0xc8]\n" - "mov v20.d[1], x20\n" - ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" "ldr x20, [x16, #0xd8]\n" - ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" - "ldr d21, [x16, #0xc0]\n" - ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" - "ldr d20, [x16, #0xd0]\n" - "mov v21.d[1], x21\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x16, #0xd0]\n" + "mov v6.d[1], x21\n" "ldr x21, [x16, #0xe8]\n" - "mov v20.d[1], x20\n" - ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" "ldr x20, [x16, #0xf8]\n" - ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" - "ldr d21, [x16, #0xe0]\n" - ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" - ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" - ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" - "ldr d20, [x16, #0xf0]\n" - "mov v21.d[1], x21\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr d6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // 
sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr d7, [x16, #0xf0]\n" + "mov v6.d[1], x21\n" "add x16, x16, #0x100\n" "ldr x21, [x16, #0x8]\n" - "mov v20.d[1], x20\n" - ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" "ldr x20, [x16, #0x18]\n" - ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" "ldr d1, [x11, #0x0]\n" - ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" "ldr d2, [x10, #0x0]\n" "ldr d7, [x16, #0x10]\n" "mov v6.d[1], x21\n" @@ -1013,180 +966,180 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v1.d[1], x23\n" "mov v2.d[1], x22\n" "mov v7.d[1], x20\n" - "bge 61b\n" - "62:" // Height 3: Multiply loop: Single iteration only + "bge 59b\n" + "60:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "add x11, x11, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q21, [x16, #0x20]\n" + "ldr q6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x13, x13, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q20, [x16, #0x30]\n" - ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "prfm 
pldl1keep, [x12, #0x80]\n" - ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "prfm pldl1keep, [x11, #0x80]\n" - ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" - "ldr q21, [x16, #0x40]\n" - ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "prfm pldl1keep, [x10, #0x80]\n" - ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" - "ldr q20, [x16, #0x50]\n" - ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" - ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" - "ldr q21, [x16, #0x60]\n" - ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" - "ldr q20, [x16, #0x70]\n" - ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" - ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" - "ldr q21, [x16, #0x80]\n" - ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" - "ldr q20, [x16, #0x90]\n" - ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" - ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" - "ldr q21, [x16, #0xa0]\n" - ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" - "ldr q20, [x16, #0xb0]\n" - ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" - ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" - "ldr 
q21, [x16, #0xc0]\n" - ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" - "ldr q20, [x16, #0xd0]\n" - ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" - ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" - "ldr q21, [x16, #0xe0]\n" - ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" - ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" - ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" - "ldr q20, [x16, #0xf0]\n" - ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x16, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca 
// sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x16, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" - ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" - ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" - ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" - ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" - "63:" // Height 3: Multiply loop: Main loop skip - "cbz x13, 68f\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "61:" // Height 3: Multiply loop: Main loop skip + "cbz x13, 66f\n" "cmp x13, #0x4\n" - "blt 65f\n" - "64:" // Height 3: Multiply loop: Odd block loop - "ldr s24, [x12], #0x4\n" + "blt 63f\n" + "62:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s23, [x11], #0x4\n" + "ldr s1, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr s22, [x10], #0x4\n" - "ldr q21, [x16, #0x0]\n" - "ldr q20, [x16, #0x10]\n" - ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n" - ".inst 0x4f97e2ac 
// sdot v12.4s, v21.16b, v23.4b[0]\n" - ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n" - "ldr q21, [x16, #0x20]\n" - ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n" - ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n" - ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n" - "ldr q20, [x16, #0x30]\n" - ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n" + "ldr s2, [x10], #0x4\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n" - ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n" - ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n" - ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n" - ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n" - "bge 64b\n" - "65:" // Height 3: Multiply loop: Skip odd blocks - "cbz x13, 68f\n" - "tbz x13, #1, 66f\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 62b\n" + "63:" // Height 3: Multiply loop: Skip odd blocks + "cbz x13, 66f\n" + "tbz x13, #1, 64f\n" "ldr h0, [x12], #0x2\n" "ldr h1, [x11], #0x2\n" "ldr h2, [x10], #0x2\n" - "tbz x13, #0, 67f\n" + "tbz x13, #0, 65f\n" "ld1 { v0.b }[2], [x12]\n" "ld1 { v1.b }[2], [x11]\n" "ld1 { v2.b }[2], [x10]\n" - "b 67f\n" - "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + 
"b 65f\n" + "64:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "ldr b1, [x11, #0x0]\n" "ldr b2, [x10, #0x0]\n" - "67:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q21, [x16, #0x0]\n" - "ldr q20, [x16, #0x10]\n" - ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n" - ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n" - "ldr q21, [x16, #0x20]\n" - ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n" - ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n" - "ldr q20, [x16, #0x30]\n" - ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + "65:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" - ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" - ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" - ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" - "68:" // Height 3: Multiply loop: No odd multiplies + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "66:" // Height 3: Multiply loop: No odd multiplies "ldr w20, 
[%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" - "bne 58b\n" - "ldr q23, [x6, #0x0]\n" - "ldr q22, [x6, #0x10]\n" - "ldr q21, [x6, #0x20]\n" - "ldr q20, [x6, #0x30]\n" - "add v8.4s, v8.4s, v23.4s\n" + "bne 56b\n" + "ldr q0, [x7, #0x0]\n" + "ldr q1, [x7, #0x10]\n" + "ldr q2, [x7, #0x20]\n" + "ldr q3, [x7, #0x30]\n" + "add v8.4s, v8.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add v9.4s, v9.4s, v22.4s\n" + "add v9.4s, v9.4s, v1.4s\n" "prfm pstl1keep, [x15, #0x0]\n" - "add v10.4s, v10.4s, v21.4s\n" - "add v11.4s, v11.4s, v20.4s\n" - "add v12.4s, v12.4s, v23.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v12.4s, v12.4s, v0.4s\n" "add x26, x15, x20\n" - "add v13.4s, v13.4s, v22.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add x25, x26, x20\n" "prfm pstl1keep, [x26, #0x0]\n" "prfm pstl1keep, [x25, #0x0]\n" - "add v14.4s, v14.4s, v21.4s\n" - "add v15.4s, v15.4s, v20.4s\n" - "add v16.4s, v16.4s, v23.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v20.4s\n" - "add x6, x6, #0x40\n" - "tbz %x[flags], #4, 69f\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x7, x7, #0x40\n" + "tbz %x[flags], #4, 67f\n" "ldr q0, [x8, #0x0]\n" - "ldr q4, [x7, #0x0]\n" + "ldr q4, [x6, #0x0]\n" "ldr q1, [x8, #0x10]\n" - "ldr q5, [x7, #0x10]\n" + "ldr q5, [x6, #0x10]\n" "ldr q2, [x8, #0x20]\n" - "ldr q6, [x7, #0x20]\n" + "ldr q6, [x6, #0x20]\n" "ldr q3, [x8, #0x30]\n" "add x8, x8, #0x40\n" - "ldr q7, [x7, #0x30]\n" - "add x7, x7, #0x40\n" - "b 70f\n" - "69:" // Height 3: per layer parameters + "ldr q7, [x6, #0x30]\n" + "add x6, x6, #0x40\n" + "b 68f\n" + "67:" // Height 3: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -1197,57 
+1150,19 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "70:" // Height 3: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "tbz %x[flags], #5, 71f\n" - "and v23.16b, v8.16b, v0.16b\n" - "and v22.16b, v9.16b, v1.16b\n" - "and v21.16b, v10.16b, v2.16b\n" - "and v20.16b, v11.16b, v3.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v23.4s\n" - "sqadd v9.4s, v9.4s, v22.4s\n" - "sqadd v10.4s, v10.4s, v21.4s\n" - "sqadd v11.4s, v11.4s, v20.4s\n" - "and v23.16b, v12.16b, v0.16b\n" - "and v22.16b, v13.16b, v1.16b\n" - "and v21.16b, v14.16b, v2.16b\n" - "and v20.16b, v15.16b, v3.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v23.4s\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqadd v14.4s, v14.4s, v21.4s\n" - "sqadd v15.4s, v15.4s, v20.4s\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v22.16b, v17.16b, v1.16b\n" - "and v21.16b, v18.16b, v2.16b\n" - "and v20.16b, v19.16b, v3.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "71:" // Height 3: no shift correction + "68:" // Height 3: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, 
v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" @@ -1262,149 +1177,149 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v19.4s, v19.4s, v3.4s\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v21.4s }, [x21]\n" + "ld1r { v6.4s }, [x21]\n" "cmp x17, #0x10\n" - "ld1r { v20.4s }, [x20]\n" - "add v8.4s, v8.4s, v22.4s\n" - "add v9.4s, v9.4s, v22.4s\n" - "add v10.4s, v10.4s, v22.4s\n" - "add v11.4s, v11.4s, v22.4s\n" - "add v12.4s, v12.4s, v22.4s\n" - "add v13.4s, v13.4s, v22.4s\n" - "add v14.4s, v14.4s, v22.4s\n" - "add v15.4s, v15.4s, v22.4s\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v8.4s, v8.4s, v21.4s\n" - "smin v9.4s, v9.4s, v21.4s\n" - "smin v10.4s, v10.4s, v21.4s\n" - "smin v11.4s, v11.4s, v21.4s\n" - "smin v12.4s, v12.4s, v21.4s\n" - "smin v13.4s, v13.4s, v21.4s\n" - "smin v14.4s, v14.4s, v21.4s\n" - "smin v15.4s, v15.4s, v21.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v8.4s, v8.4s, v20.4s\n" - "smax v9.4s, v9.4s, v20.4s\n" - "smax v10.4s, v10.4s, v20.4s\n" - "smax v11.4s, v11.4s, v20.4s\n" - "smax v12.4s, v12.4s, v20.4s\n" - "smax v13.4s, v13.4s, v20.4s\n" - "smax v14.4s, v14.4s, v20.4s\n" - "smax v15.4s, v15.4s, v20.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, 
v19.4s, v20.4s\n" + "ld1r { v5.4s }, [x20]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v21.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v20.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v8.16b, v8.16b, v21.16b\n" - "uzp1 v12.16b, v12.16b, v20.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 80f\n" - "tbz x17, #3, 75f\n" + "bge 77f\n" + "tbz x17, #3, 72f\n" "str d8, [x15], #0x8\n" "str d12, [x26], #0x8\n" "str d16, [x25], #0x8\n" - "tbz x17, #2, 73f\n" + "tbz x17, #2, 70f\n" "st1 { v8.s }[2], [x15], #0x4\n" "st1 { v12.s }[2], [x26], #0x4\n" "st1 { v16.s }[2], [x25], #0x4\n" - "tbz x17, #1, 72f\n" + "tbz x17, #1, 69f\n" "st1 { 
v8.h }[6], [x15], #0x2\n" "st1 { v12.h }[6], [x26], #0x2\n" "st1 { v16.h }[6], [x25], #0x2\n" - "tbz x17, #0, 79f\n" + "tbz x17, #0, 76f\n" "st1 { v8.b }[14], [x15]\n" "st1 { v12.b }[14], [x26]\n" "st1 { v16.b }[14], [x25]\n" - "b 79f\n" - "72:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x17, #0, 79f\n" + "b 76f\n" + "69:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x17, #0, 76f\n" "st1 { v8.b }[12], [x15]\n" "st1 { v12.b }[12], [x26]\n" "st1 { v16.b }[12], [x25]\n" - "b 79f\n" - "73:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x17, #1, 74f\n" + "b 76f\n" + "70:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x17, #1, 71f\n" "st1 { v8.h }[4], [x15], #0x2\n" "st1 { v12.h }[4], [x26], #0x2\n" "st1 { v16.h }[4], [x25], #0x2\n" - "tbz x17, #0, 79f\n" + "tbz x17, #0, 76f\n" "st1 { v8.b }[10], [x15]\n" "st1 { v12.b }[10], [x26]\n" "st1 { v16.b }[10], [x25]\n" - "b 79f\n" - "74:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x17, #0, 79f\n" + "b 76f\n" + "71:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x17, #0, 76f\n" "st1 { v8.b }[8], [x15]\n" "st1 { v12.b }[8], [x26]\n" "st1 { v16.b }[8], [x25]\n" - "b 79f\n" - "75:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x17, #2, 77f\n" + "b 76f\n" + "72:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x17, #2, 74f\n" "str s8, [x15], #0x4\n" "str s12, [x26], #0x4\n" "str s16, [x25], #0x4\n" - "tbz x17, #1, 76f\n" + "tbz x17, #1, 73f\n" "st1 { v8.h }[2], [x15], #0x2\n" "st1 { v12.h }[2], [x26], #0x2\n" "st1 { v16.h }[2], [x25], #0x2\n" - "tbz x17, #0, 79f\n" + "tbz x17, #0, 76f\n" "st1 { v8.b }[6], [x15]\n" "st1 { v12.b }[6], [x26]\n" "st1 { v16.b }[6], [x25]\n" - "b 79f\n" - "76:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x17, #0, 79f\n" + "b 76f\n" + "73:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x17, #0, 76f\n" "st1 { v8.b }[4], [x15]\n" "st1 { v12.b }[4], [x26]\n" "st1 { v16.b }[4], 
[x25]\n" - "b 79f\n" - "77:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x17, #1, 78f\n" + "b 76f\n" + "74:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x17, #1, 75f\n" "str h8, [x15], #0x2\n" "str h12, [x26], #0x2\n" "str h16, [x25], #0x2\n" - "tbz x17, #0, 79f\n" + "tbz x17, #0, 76f\n" "st1 { v8.b }[2], [x15]\n" "st1 { v12.b }[2], [x26]\n" "st1 { v16.b }[2], [x25]\n" - "b 79f\n" - "78:" // Height 3: Partial direct writeback: partial_1_0 + "b 76f\n" + "75:" // Height 3: Partial direct writeback: partial_1_0 "str b8, [x15, #0x0]\n" "str b12, [x26, #0x0]\n" "str b16, [x25, #0x0]\n" - "79:" // Height 3: Partial direct writeback: Done - "b 81f\n" - "80:" // Height 3: Full writeback + "76:" // Height 3: Partial direct writeback: Done + "b 78f\n" + "77:" // Height 3: Full writeback "str q8, [x15, #0x0]\n" "add x15, x15, #0x10\n" "str q12, [x26, #0x0]\n" "str q16, [x25, #0x0]\n" - "81:" // Height 3: Writeback done + "78:" // Height 3: Writeback done "subs x17, x17, #0x10\n" - "bgt 56b\n" - "b 164f\n" - "82:" // Height 4 - "mov x6, %x[col_bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "bgt 54b\n" + "b 158f\n" + "79:" // Height 4 + "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x7, %x[col_bias]\n" "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "83:" // Height 4: Column loop + "80:" // Height 4: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1421,34 +1336,33 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "84:" // Height 4: setup done "mov x14, #0x0\n" - "85:" // Height 4: String loop + "82:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "tbz %x[flags], 
#3, 86f\n" + "tbz %x[flags], #3, 83f\n" "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x12, [x20, #0x0]\n" "ldr x11, [x20, #0x8]\n" "ldr x10, [x20, #0x10]\n" "ldr x9, [x20, #0x18]\n" - "cbnz x14, 87f\n" + "cbnz x14, 84f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" "add x11, x11, x20\n" "add x10, x10, x20\n" "add x9, x9, x20\n" - "b 87f\n" - "86:" // Height 4: setup direct input + "b 84f\n" + "83:" // Height 4: setup direct input "mov x12, %x[input_ptr]\n" "add x11, x12, x21\n" "add x10, x11, x21\n" "add x9, x10, x21\n" - "87:" // Height 4: input setup done + "84:" // Height 4: input setup done "cmp x13, #0x10\n" - "blt 90f\n" + "blt 87f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" "ldr q1, [x11, #0x0]\n" @@ -1456,8 +1370,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ldr q3, [x9, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" - "blt 89f\n" - "88:" // Height 4: Multiply loop: Main loop head + "blt 86f\n" + "85:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "ldr x20, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" @@ -1465,125 +1379,125 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr d25, [x16, #0x20]\n" + "ldr d6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x11, x11, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "mov v25.d[1], x20\n" + "mov v6.d[1], x20\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr d24, [x16, #0x30]\n" - ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + "ldr d7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "ldr x20, [x16, #0x48]\n" - ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 
0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "add x9, x9, #0x10\n" - "mov v24.d[1], x21\n" - ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" - "ldr d25, [x16, #0x40]\n" - ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + "mov v7.d[1], x21\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr d6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "ldr x21, [x16, #0x58]\n" - ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" "ldr x25, [x12, #0x8]\n" - ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" - "mov v25.d[1], x20\n" - ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" - "ldr d24, [x16, #0x50]\n" - ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "mov v6.d[1], x20\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr d7, [x16, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" "ldr x20, [x16, #0x68]\n" - ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" "ldr x24, [x11, #0x8]\n" - "mov v24.d[1], x21\n" - ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" - "ldr d25, [x16, #0x60]\n" - ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr d6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" "ldr x21, [x16, #0x78]\n" - ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" "ldr x23, [x10, #0x8]\n" - ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" - "mov v25.d[1], x20\n" - ".inst 0x4fa3e315 // sdot v21.4s, 
v24.16b, v3.4b[1]\n" - "ldr d24, [x16, #0x70]\n" - ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "mov v6.d[1], x20\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr d7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" "ldr x20, [x16, #0x88]\n" - ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" "ldr x22, [x9, #0x8]\n" - "mov v24.d[1], x21\n" - ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" - "ldr d25, [x16, #0x80]\n" - ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr d6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" "ldr x21, [x16, #0x98]\n" - ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" "sub x13, x13, #0x10\n" - ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" - "mov v25.d[1], x20\n" - ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" - "ldr d24, [x16, #0x90]\n" - ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "mov v6.d[1], x20\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr d7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" "ldr x20, [x16, #0xa8]\n" - ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" "cmp x13, #0x20\n" - "mov v24.d[1], x21\n" - ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" - "ldr d25, [x16, #0xa0]\n" - ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + "mov v7.d[1], x21\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 
0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr d6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" "ldr x21, [x16, #0xb8]\n" - ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" - "mov v25.d[1], x20\n" - ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" - "ldr d24, [x16, #0xb0]\n" - ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "mov v6.d[1], x20\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr d7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" "ldr x20, [x16, #0xc8]\n" - ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" "prfm pldl1keep, [x11, #0x80]\n" - "mov v24.d[1], x21\n" - ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" - "ldr d25, [x16, #0xc0]\n" - ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + "mov v7.d[1], x21\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr d6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" "ldr x21, [x16, #0xd8]\n" - ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" "prfm pldl1keep, [x10, #0x80]\n" - ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" - "mov v25.d[1], x20\n" - ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" - "ldr d24, [x16, #0xd0]\n" - ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "mov v6.d[1], x20\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr d7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" "ldr x20, [x16, #0xe8]\n" - 
".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" "prfm pldl1keep, [x9, #0x80]\n" - "mov v24.d[1], x21\n" - ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" - "ldr d25, [x16, #0xe0]\n" - ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr d6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" "ldr x21, [x16, #0xf8]\n" - ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" - ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" - "mov v25.d[1], x20\n" - ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" - "ldr d24, [x16, #0xf0]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "mov v6.d[1], x20\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr d7, [x16, #0xf0]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" "ldr x20, [x16, #0x8]\n" - "mov v24.d[1], x21\n" - ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" + "mov v7.d[1], x21\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" "ldr d1, [x11, #0x0]\n" - ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, 
v2.4b[3]\n" "ldr d2, [x10, #0x0]\n" - ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" "ldr d3, [x9, #0x0]\n" "ldr d7, [x16, #0x10]\n" "mov v6.d[1], x20\n" @@ -1593,8 +1507,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v2.d[1], x23\n" "mov v3.d[1], x22\n" "mov v7.d[1], x20\n" - "bge 88b\n" - "89:" // Height 4: Multiply loop: Single iteration only + "bge 85b\n" + "86:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" @@ -1602,7 +1516,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q25, [x16, #0x20]\n" + "ldr q6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1610,199 +1524,199 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q24, [x16, #0x30]\n" - ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "prfm pldl1keep, [x11, #0x80]\n" - ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "prfm pldl1keep, [x10, #0x80]\n" - ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" - "ldr q25, [x16, #0x40]\n" - ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" - 
"ldr q24, [x16, #0x50]\n" - ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" - ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" - "ldr q25, [x16, #0x60]\n" - ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" - ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" - ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" - "ldr q24, [x16, #0x70]\n" - ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" - ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" - "ldr q25, [x16, #0x80]\n" - ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" - ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" - ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" - "ldr q24, [x16, #0x90]\n" - ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" - ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" - "ldr q25, [x16, #0xa0]\n" - ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" - ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" - "ldr q24, [x16, #0xb0]\n" - ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" - ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" - "ldr q25, [x16, #0xc0]\n" - ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" - ".inst 
0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" - "ldr q24, [x16, #0xd0]\n" - ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" - "ldr q25, [x16, #0xe0]\n" - ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" - ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" - ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" - "ldr q24, [x16, #0xf0]\n" - ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x16, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 
// sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x16, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" - ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" - ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" - ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" - "90:" // Height 4: Multiply loop: 
Main loop skip - "cbz x13, 95f\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "87:" // Height 4: Multiply loop: Main loop skip + "cbz x13, 92f\n" "cmp x13, #0x4\n" - "blt 92f\n" - "91:" // Height 4: Multiply loop: Odd block loop - "ldr s29, [x12], #0x4\n" + "blt 89f\n" + "88:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s28, [x11], #0x4\n" + "ldr s1, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr s27, [x10], #0x4\n" - "ldr s26, [x9], #0x4\n" - "ldr q25, [x16, #0x0]\n" - "ldr q24, [x16, #0x10]\n" - ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n" - ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n" - ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n" - ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n" - "ldr q25, [x16, #0x20]\n" - ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n" - ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n" - ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n" - ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n" - "ldr q24, [x16, #0x30]\n" - ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x9], #0x4\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + 
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n" - ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n" - ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n" - ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n" - ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n" - ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n" - ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n" - "bge 91b\n" - "92:" // Height 4: Multiply loop: Skip odd blocks - "cbz x13, 95f\n" - "tbz x13, #1, 93f\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 88b\n" + "89:" // Height 4: Multiply loop: Skip odd blocks + "cbz x13, 92f\n" + "tbz x13, #1, 90f\n" "ldr h0, [x12], #0x2\n" "ldr h1, [x11], #0x2\n" "ldr h2, [x10], #0x2\n" "ldr h3, [x9], #0x2\n" - "tbz x13, #0, 94f\n" + "tbz x13, #0, 91f\n" "ld1 { v0.b }[2], [x12]\n" "ld1 { v1.b }[2], [x11]\n" "ld1 { v2.b }[2], [x10]\n" "ld1 { v3.b }[2], [x9]\n" - "b 94f\n" - "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 91f\n" + "90:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "ldr b1, [x11, #0x0]\n" "ldr b2, [x10, #0x0]\n" "ldr b3, [x9, #0x0]\n" - "94:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q25, [x16, #0x0]\n" - "ldr q24, [x16, #0x10]\n" - ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e334 // sdot 
v20.4s, v25.16b, v3.4b[0]\n" - "ldr q25, [x16, #0x20]\n" - ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n" - "ldr q24, [x16, #0x30]\n" - ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + "91:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" - ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" - "95:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "92:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" - "bne 85b\n" - "ldr q27, [x6, #0x0]\n" - "ldr q26, [x6, #0x10]\n" - "ldr q25, [x6, #0x20]\n" - "ldr q24, [x6, #0x30]\n" - "add v8.4s, v8.4s, v27.4s\n" + "bne 82b\n" + "ldr q0, [x7, #0x0]\n" + "ldr q1, [x7, #0x10]\n" + "ldr q2, [x7, #0x20]\n" + "ldr q3, [x7, #0x30]\n" + "add v8.4s, v8.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add v9.4s, v9.4s, v26.4s\n" + "add v9.4s, v9.4s, v1.4s\n" "prfm pstl1keep, [x15, #0x0]\n" - "add v10.4s, v10.4s, v25.4s\n" - "add v11.4s, v11.4s, v24.4s\n" - "add v12.4s, v12.4s, v27.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v12.4s, v12.4s, v0.4s\n" "add x26, x15, x20\n" - "add v13.4s, v13.4s, v26.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add x25, x26, x20\n" "prfm pstl1keep, [x26, #0x0]\n" "add x24, x25, x20\n" "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" - "add v14.4s, v14.4s, v25.4s\n" - "add v15.4s, v15.4s, v24.4s\n" - "add v16.4s, v16.4s, v27.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v25.4s\n" - "add v19.4s, v19.4s, v24.4s\n" - "add v20.4s, v20.4s, v27.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v25.4s\n" - "add v23.4s, v23.4s, v24.4s\n" - "add x6, x6, #0x40\n" - "tbz %x[flags], #4, 96f\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add x7, x7, #0x40\n" + "tbz %x[flags], #4, 93f\n" "ldr q0, [x8, #0x0]\n" - "ldr q4, [x7, #0x0]\n" + "ldr q4, [x6, #0x0]\n" "ldr q1, [x8, #0x10]\n" - "ldr q5, [x7, #0x10]\n" + "ldr q5, [x6, #0x10]\n" "ldr q2, [x8, #0x20]\n" - "ldr q6, [x7, #0x20]\n" + "ldr q6, [x6, #0x20]\n" "ldr q3, [x8, #0x30]\n" "add x8, x8, #0x40\n" - "ldr q7, [x7, #0x30]\n" - "add 
x7, x7, #0x40\n" - "b 97f\n" - "96:" // Height 4: per layer parameters + "ldr q7, [x6, #0x30]\n" + "add x6, x6, #0x40\n" + "b 94f\n" + "93:" // Height 4: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -1813,73 +1727,23 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "97:" // Height 4: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v5.4s\n" - "sqrdmulh v22.4s, v22.4s, v6.4s\n" - "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "tbz %x[flags], #5, 98f\n" - "and v27.16b, v8.16b, v0.16b\n" - "and v26.16b, v9.16b, v1.16b\n" - "and v25.16b, v10.16b, v2.16b\n" - "and v24.16b, v11.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "sqadd v9.4s, v9.4s, v26.4s\n" - "sqadd v10.4s, v10.4s, v25.4s\n" - "sqadd v11.4s, v11.4s, v24.4s\n" - "and v27.16b, v12.16b, v0.16b\n" - "and v26.16b, v13.16b, v1.16b\n" - "and v25.16b, v14.16b, v2.16b\n" - "and v24.16b, v15.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v27.4s\n" - "sqadd v13.4s, v13.4s, v26.4s\n" - "sqadd v14.4s, v14.4s, v25.4s\n" - "sqadd v15.4s, v15.4s, v24.4s\n" - "and v27.16b, v16.16b, v0.16b\n" - "and v26.16b, v17.16b, v1.16b\n" - "and v25.16b, v18.16b, v2.16b\n" - "and 
v24.16b, v19.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v27.4s\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "sqadd v18.4s, v18.4s, v25.4s\n" - "sqadd v19.4s, v19.4s, v24.4s\n" - "and v27.16b, v20.16b, v0.16b\n" - "and v26.16b, v21.16b, v1.16b\n" - "and v25.16b, v22.16b, v2.16b\n" - "and v24.16b, v23.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "98:" // Height 4: no shift correction + "94:" // Height 4: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v5.4s\n" + "sqdmulh v22.4s, v22.4s, v6.4s\n" + "sqdmulh v23.4s, v23.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" @@ -1898,180 +1762,180 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v23.4s, v23.4s, v3.4s\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v25.4s }, [x21]\n" + "ld1r { v6.4s }, [x21]\n" "cmp x17, #0x10\n" - "ld1r { v24.4s }, [x20]\n" - "add v8.4s, v8.4s, v26.4s\n" - "add v9.4s, v9.4s, v26.4s\n" - "add v10.4s, v10.4s, v26.4s\n" - "add v11.4s, v11.4s, v26.4s\n" - "add v12.4s, v12.4s, v26.4s\n" - "add v13.4s, v13.4s, 
v26.4s\n" - "add v14.4s, v14.4s, v26.4s\n" - "add v15.4s, v15.4s, v26.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v8.4s, v8.4s, v25.4s\n" - "smin v9.4s, v9.4s, v25.4s\n" - "smin v10.4s, v10.4s, v25.4s\n" - "smin v11.4s, v11.4s, v25.4s\n" - "smin v12.4s, v12.4s, v25.4s\n" - "smin v13.4s, v13.4s, v25.4s\n" - "smin v14.4s, v14.4s, v25.4s\n" - "smin v15.4s, v15.4s, v25.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v8.4s, v8.4s, v24.4s\n" - "smax v9.4s, v9.4s, v24.4s\n" - "smax v10.4s, v10.4s, v24.4s\n" - "smax v11.4s, v11.4s, v24.4s\n" - "smax v12.4s, v12.4s, v24.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smax v14.4s, v14.4s, v24.4s\n" - "smax v15.4s, v15.4s, v24.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "ld1r { v5.4s }, [x20]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin 
v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v25.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v24.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v8.16b, v8.16b, v25.16b\n" - "uzp1 v12.16b, v12.16b, v24.16b\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 107f\n" - "tbz x17, #3, 102f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 103f\n" + "tbz x17, #3, 98f\n" "str d8, [x15], #0x8\n" "str d12, [x26], #0x8\n" "str d16, [x25], #0x8\n" "str d20, [x24], #0x8\n" - "tbz x17, #2, 100f\n" + "tbz x17, #2, 96f\n" "st1 { v8.s }[2], [x15], #0x4\n" "st1 { v12.s }[2], [x26], #0x4\n" "st1 { v16.s 
}[2], [x25], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" - "tbz x17, #1, 99f\n" + "tbz x17, #1, 95f\n" "st1 { v8.h }[6], [x15], #0x2\n" "st1 { v12.h }[6], [x26], #0x2\n" "st1 { v16.h }[6], [x25], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x17, #0, 106f\n" + "tbz x17, #0, 102f\n" "st1 { v8.b }[14], [x15]\n" "st1 { v12.b }[14], [x26]\n" "st1 { v16.b }[14], [x25]\n" "st1 { v20.b }[14], [x24]\n" - "b 106f\n" - "99:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x17, #0, 106f\n" + "b 102f\n" + "95:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x17, #0, 102f\n" "st1 { v8.b }[12], [x15]\n" "st1 { v12.b }[12], [x26]\n" "st1 { v16.b }[12], [x25]\n" "st1 { v20.b }[12], [x24]\n" - "b 106f\n" - "100:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x17, #1, 101f\n" + "b 102f\n" + "96:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x17, #1, 97f\n" "st1 { v8.h }[4], [x15], #0x2\n" "st1 { v12.h }[4], [x26], #0x2\n" "st1 { v16.h }[4], [x25], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x17, #0, 106f\n" + "tbz x17, #0, 102f\n" "st1 { v8.b }[10], [x15]\n" "st1 { v12.b }[10], [x26]\n" "st1 { v16.b }[10], [x25]\n" "st1 { v20.b }[10], [x24]\n" - "b 106f\n" - "101:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x17, #0, 106f\n" + "b 102f\n" + "97:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x17, #0, 102f\n" "st1 { v8.b }[8], [x15]\n" "st1 { v12.b }[8], [x26]\n" "st1 { v16.b }[8], [x25]\n" "st1 { v20.b }[8], [x24]\n" - "b 106f\n" - "102:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x17, #2, 104f\n" + "b 102f\n" + "98:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x17, #2, 100f\n" "str s8, [x15], #0x4\n" "str s12, [x26], #0x4\n" "str s16, [x25], #0x4\n" "str s20, [x24], #0x4\n" - "tbz x17, #1, 103f\n" + "tbz x17, #1, 99f\n" "st1 { v8.h }[2], [x15], #0x2\n" "st1 { v12.h }[2], [x26], #0x2\n" "st1 { v16.h }[2], [x25], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x17, #0, 106f\n" 
+ "tbz x17, #0, 102f\n" "st1 { v8.b }[6], [x15]\n" "st1 { v12.b }[6], [x26]\n" "st1 { v16.b }[6], [x25]\n" "st1 { v20.b }[6], [x24]\n" - "b 106f\n" - "103:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x17, #0, 106f\n" + "b 102f\n" + "99:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x17, #0, 102f\n" "st1 { v8.b }[4], [x15]\n" "st1 { v12.b }[4], [x26]\n" "st1 { v16.b }[4], [x25]\n" "st1 { v20.b }[4], [x24]\n" - "b 106f\n" - "104:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x17, #1, 105f\n" + "b 102f\n" + "100:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x17, #1, 101f\n" "str h8, [x15], #0x2\n" "str h12, [x26], #0x2\n" "str h16, [x25], #0x2\n" "str h20, [x24], #0x2\n" - "tbz x17, #0, 106f\n" + "tbz x17, #0, 102f\n" "st1 { v8.b }[2], [x15]\n" "st1 { v12.b }[2], [x26]\n" "st1 { v16.b }[2], [x25]\n" "st1 { v20.b }[2], [x24]\n" - "b 106f\n" - "105:" // Height 4: Partial direct writeback: partial_1_0 + "b 102f\n" + "101:" // Height 4: Partial direct writeback: partial_1_0 "str b8, [x15, #0x0]\n" "str b12, [x26, #0x0]\n" "str b16, [x25, #0x0]\n" "str b20, [x24, #0x0]\n" - "106:" // Height 4: Partial direct writeback: Done - "b 108f\n" - "107:" // Height 4: Full writeback + "102:" // Height 4: Partial direct writeback: Done + "b 104f\n" + "103:" // Height 4: Full writeback "str q8, [x15, #0x0]\n" "add x15, x15, #0x10\n" "str q12, [x26, #0x0]\n" "str q16, [x25, #0x0]\n" "str q20, [x24, #0x0]\n" - "108:" // Height 4: Writeback done + "104:" // Height 4: Writeback done "subs x17, x17, #0x10\n" - "bgt 83b\n" - "b 164f\n" - "109:" // Height 5 - "mov x6, %x[col_bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "bgt 80b\n" + "b 158f\n" + "105:" // Height 5 + "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x7, %x[col_bias]\n" "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x15, [%x[args_ptr], 
%[offsetof_output_ptr]]\n" - "110:" // Height 5: Column loop + "106:" // Height 5: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2092,13 +1956,12 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "111:" // Height 5: setup done "mov x14, #0x0\n" - "112:" // Height 5: String loop + "108:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "tbz %x[flags], #3, 113f\n" + "tbz %x[flags], #3, 109f\n" "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x12, [x20, #0x0]\n" @@ -2106,23 +1969,23 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ldr x10, [x20, #0x10]\n" "ldr x9, [x20, #0x18]\n" "ldr x28, [x20, #0x20]\n" - "cbnz x14, 114f\n" + "cbnz x14, 110f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" "add x11, x11, x20\n" "add x10, x10, x20\n" "add x9, x9, x20\n" "add x28, x28, x20\n" - "b 114f\n" - "113:" // Height 5: setup direct input + "b 110f\n" + "109:" // Height 5: setup direct input "mov x12, %x[input_ptr]\n" "add x11, x12, x21\n" "add x10, x11, x21\n" "add x9, x10, x21\n" "add x28, x9, x21\n" - "114:" // Height 5: input setup done + "110:" // Height 5: input setup done "cmp x13, #0x10\n" - "blt 117f\n" + "blt 113f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" "ldr q1, [x11, #0x0]\n" @@ -2131,8 +1994,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ldr q4, [x28, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" - "blt 116f\n" - "115:" // Height 5: Multiply loop: Main loop head + "blt 112f\n" + "111:" // Height 5: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" @@ -2142,144 +2005,144 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" "add x11, x11, 
#0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr d29, [x16, #0x20]\n" + "ldr d6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "mov v29.d[1], x21\n" + "mov v6.d[1], x21\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" "ldr x21, [x16, #0x48]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr d28, [x16, #0x30]\n" - ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + "ldr d7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x28, x28, #0x10\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "ldr x26, [x12, #0x8]\n" - "mov v28.d[1], x20\n" - ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" "ldr x20, [x16, #0x58]\n" - ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" - "ldr d29, [x16, #0x40]\n" - ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr d6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "ldr x25, [x11, #0x8]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" "ldr x24, [x10, #0x8]\n" - ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" - "mov v29.d[1], x21\n" - ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "mov v6.d[1], x21\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" "ldr x21, [x16, #0x68]\n" - ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" - "ldr d28, [x16, #0x50]\n" - ".inst 0x4fa0e3a8 // sdot v8.4s, 
v29.16b, v0.4b[1]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr d7, [x16, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" "ldr x23, [x9, #0x8]\n" - ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" "ldr x22, [x28, #0x8]\n" - "mov v28.d[1], x20\n" - ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" "ldr x20, [x16, #0x78]\n" - ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" - "ldr d29, [x16, #0x60]\n" - ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr d6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" "sub x13, x13, #0x10\n" - ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" "cmp x13, #0x20\n" - ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" - "mov v29.d[1], x21\n" - ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "mov v6.d[1], x21\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" "ldr x21, [x16, #0x88]\n" - ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" - "ldr d28, [x16, #0x70]\n" - ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr d7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" "prfm pldl1keep, [x11, #0x80]\n" - "mov v28.d[1], x20\n" - ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa2e0d2 // 
sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" "ldr x20, [x16, #0x98]\n" - ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" - "ldr d29, [x16, #0x80]\n" - ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr d6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" "prfm pldl1keep, [x10, #0x80]\n" - ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" - "mov v29.d[1], x21\n" - ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "mov v6.d[1], x21\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" "ldr x21, [x16, #0xa8]\n" - ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" - "ldr d28, [x16, #0x90]\n" - ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr d7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" "prfm pldl1keep, [x28, #0x80]\n" - ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" - "mov v28.d[1], x20\n" - ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" - ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" "ldr x20, [x16, #0xb8]\n" - ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" - "ldr d29, [x16, #0xa0]\n" - ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" - "mov v29.d[1], x21\n" - ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr d6, 
[x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "mov v6.d[1], x21\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" "ldr x21, [x16, #0xc8]\n" - ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" - "ldr d28, [x16, #0xb0]\n" - ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" - "mov v28.d[1], x20\n" - ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" - ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr d7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x20\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" "ldr x20, [x16, #0xd8]\n" - ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" - "ldr d29, [x16, #0xc0]\n" - ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" - "mov v29.d[1], x21\n" - ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr d6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "mov v6.d[1], x21\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" "ldr x21, [x16, #0xe8]\n" - ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" - "ldr d28, [x16, #0xd0]\n" - ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v28.d[1], x20\n" - ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, 
v3.4b[3]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr d7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" "ldr x20, [x16, #0xf8]\n" - ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" - "ldr d29, [x16, #0xe0]\n" - ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" - "mov v29.d[1], x21\n" - ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" - ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" - "ldr d28, [x16, #0xf0]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr d6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "mov v6.d[1], x21\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr d7, [x16, #0xf0]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" "ldr x21, [x16, #0x8]\n" - "mov v28.d[1], x20\n" - ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + "mov v7.d[1], x20\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" "ldr x20, [x16, #0x18]\n" - ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot 
v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" "ldr d1, [x11, #0x0]\n" - ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" "ldr d2, [x10, #0x0]\n" - ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" "ldr d3, [x9, #0x0]\n" - ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" "ldr d4, [x28, #0x0]\n" "ldr d7, [x16, #0x10]\n" "mov v6.d[1], x21\n" @@ -2289,8 +2152,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v3.d[1], x23\n" "mov v4.d[1], x22\n" "mov v7.d[1], x20\n" - "bge 115b\n" - "116:" // Height 5: Multiply loop: Single iteration only + "bge 111b\n" + "112:" // Height 5: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" @@ -2300,7 +2163,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q29, [x16, #0x20]\n" + "ldr q6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x28, x28, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -2310,196 +2173,196 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q28, [x16, #0x30]\n" - ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "prfm pldl1keep, [x10, #0x80]\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f82e3b2 // sdot 
v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" "prfm pldl1keep, [x28, #0x80]\n" - ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" - ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" - "ldr q29, [x16, #0x40]\n" - ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" - ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" - "ldr q28, [x16, #0x50]\n" - ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" - ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" - "ldr q29, [x16, #0x60]\n" - ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" - ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" - ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" - ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" - ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" - "ldr q28, [x16, #0x70]\n" - ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" - ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" - "ldr q29, [x16, #0x80]\n" - ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" - ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" - ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" - ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" - ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" - "ldr q28, [x16, #0x90]\n" - ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" - 
".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" - ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" - "ldr q29, [x16, #0xa0]\n" - ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" - ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" - ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" - "ldr q28, [x16, #0xb0]\n" - ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" - ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" - ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" - "ldr q29, [x16, #0xc0]\n" - ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" - ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" - ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" - "ldr q28, [x16, #0xd0]\n" - ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" - ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" - "ldr q29, [x16, #0xe0]\n" - ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" - ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" - ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" - "ldr q28, [x16, #0xf0]\n" - ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x16, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, 
v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x16, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x16, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x16, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x16, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x16, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x16, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, 
v7.16b, v4.4b[2]\n" + "ldr q7, [x16, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x16, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x16, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x16, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x16, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" - ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" - ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" - ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" - ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" - "117:" // Height 5: Multiply loop: Main loop skip - "cbz x13, 122f\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + 
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "113:" // Height 5: Multiply loop: Main loop skip + "cbz x13, 118f\n" "cmp x13, #0x4\n" - "blt 119f\n" - "118:" // Height 5: Multiply loop: Odd block loop - "ldr s2, [x12], #0x4\n" + "blt 115f\n" + "114:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" "sub x13, x13, #0x4\n" "ldr s1, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr s0, [x10], #0x4\n" - "ldr s31, [x9], #0x4\n" - "ldr s30, [x28], #0x4\n" - "ldr q29, [x16, #0x0]\n" - "ldr q28, [x16, #0x10]\n" - ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n" - ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n" - "ldr q29, [x16, #0x20]\n" - ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n" - ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n" - "ldr q28, [x16, #0x30]\n" - ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x9], #0x4\n" + "ldr s4, [x28], #0x4\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, 
v6.16b, v4.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n" - ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n" - ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n" - ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n" - "bge 118b\n" - "119:" // Height 5: Multiply loop: Skip odd blocks - "cbz x13, 122f\n" - "tbz x13, #1, 120f\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 114b\n" + "115:" // Height 5: Multiply loop: Skip odd blocks + "cbz x13, 118f\n" + "tbz x13, #1, 116f\n" "ldr h0, [x12], #0x2\n" "ldr h1, [x11], #0x2\n" "ldr h2, [x10], #0x2\n" "ldr h3, [x9], #0x2\n" "ldr h4, [x28], #0x2\n" - "tbz x13, #0, 121f\n" + "tbz x13, #0, 117f\n" "ld1 { v0.b }[2], [x12]\n" "ld1 { v1.b }[2], [x11]\n" "ld1 { v2.b }[2], [x10]\n" "ld1 { v3.b }[2], [x9]\n" "ld1 { v4.b }[2], [x28]\n" - "b 121f\n" - 
"120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "b 117f\n" + "116:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "ldr b1, [x11, #0x0]\n" "ldr b2, [x10, #0x0]\n" "ldr b3, [x9, #0x0]\n" "ldr b4, [x28, #0x0]\n" - "121:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q29, [x16, #0x0]\n" - "ldr q28, [x16, #0x10]\n" - ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n" - ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n" - "ldr q29, [x16, #0x20]\n" - ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n" - ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n" - "ldr q28, [x16, #0x30]\n" - ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + "117:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" - ".inst 
0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" - ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" - ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" - ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" - "122:" // Height 5: Multiply loop: No odd multiplies + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "118:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" - "bne 112b\n" - "ldr q31, [x6, #0x0]\n" - "ldr q30, [x6, #0x10]\n" - "ldr q29, [x6, #0x20]\n" - "ldr q28, [x6, #0x30]\n" - "add v8.4s, v8.4s, v31.4s\n" + "bne 108b\n" + "ldr q0, [x7, #0x0]\n" + "ldr q1, [x7, #0x10]\n" + "ldr q2, [x7, #0x20]\n" + "ldr q3, [x7, #0x30]\n" + "add v8.4s, v8.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add v9.4s, v9.4s, v30.4s\n" + "add v9.4s, v9.4s, v1.4s\n" "prfm pstl1keep, [x15, #0x0]\n" - "add v10.4s, v10.4s, v29.4s\n" - "add v11.4s, v11.4s, v28.4s\n" - "add v12.4s, v12.4s, v31.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v12.4s, v12.4s, v0.4s\n" "add x26, x15, x20\n" - "add v13.4s, v13.4s, v30.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add x25, x26, x20\n" "prfm pstl1keep, [x26, #0x0]\n" "add x24, x25, x20\n" @@ -2507,34 +2370,34 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x23, x24, x20\n" "prfm 
pstl1keep, [x24, #0x0]\n" "prfm pstl1keep, [x23, #0x0]\n" - "add v14.4s, v14.4s, v29.4s\n" - "add v15.4s, v15.4s, v28.4s\n" - "add v16.4s, v16.4s, v31.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" - "add v20.4s, v20.4s, v31.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v24.4s, v24.4s, v31.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "add x6, x6, #0x40\n" - "tbz %x[flags], #4, 123f\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add x7, x7, #0x40\n" + "tbz %x[flags], #4, 119f\n" "ldr q0, [x8, #0x0]\n" - "ldr q4, [x7, #0x0]\n" + "ldr q4, [x6, #0x0]\n" "ldr q1, [x8, #0x10]\n" - "ldr q5, [x7, #0x10]\n" + "ldr q5, [x6, #0x10]\n" "ldr q2, [x8, #0x20]\n" - "ldr q6, [x7, #0x20]\n" + "ldr q6, [x6, #0x20]\n" "ldr q3, [x8, #0x30]\n" "add x8, x8, #0x40\n" - "ldr q7, [x7, #0x30]\n" - "add x7, x7, #0x40\n" - "b 124f\n" - "123:" // Height 5: per layer parameters + "ldr q7, [x6, #0x30]\n" + "add x6, x6, #0x40\n" + "b 120f\n" + "119:" // Height 5: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -2545,89 +2408,27 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "124:" // Height 5: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh 
v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v5.4s\n" - "sqrdmulh v22.4s, v22.4s, v6.4s\n" - "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v5.4s\n" - "sqrdmulh v26.4s, v26.4s, v6.4s\n" - "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "tbz %x[flags], #5, 125f\n" - "and v31.16b, v8.16b, v0.16b\n" - "and v30.16b, v9.16b, v1.16b\n" - "and v29.16b, v10.16b, v2.16b\n" - "and v28.16b, v11.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v31.4s\n" - "sqadd v9.4s, v9.4s, v30.4s\n" - "sqadd v10.4s, v10.4s, v29.4s\n" - "sqadd v11.4s, v11.4s, v28.4s\n" - "and v31.16b, v12.16b, v0.16b\n" - "and v30.16b, v13.16b, v1.16b\n" - "and v29.16b, v14.16b, v2.16b\n" - "and v28.16b, v15.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v31.4s\n" - "sqadd v13.4s, v13.4s, v30.4s\n" - "sqadd v14.4s, v14.4s, v29.4s\n" - "sqadd v15.4s, v15.4s, v28.4s\n" - "and v31.16b, v16.16b, v0.16b\n" - "and v30.16b, v17.16b, v1.16b\n" - "and v29.16b, v18.16b, v2.16b\n" - "and v28.16b, v19.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v31.4s\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "and v31.16b, v20.16b, v0.16b\n" - "and v30.16b, v21.16b, v1.16b\n" - "and v29.16b, v22.16b, v2.16b\n" - "and v28.16b, v23.16b, v3.16b\n" - "sshr v31.4s, 
v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v31.4s\n" - "sqadd v21.4s, v21.4s, v30.4s\n" - "sqadd v22.4s, v22.4s, v29.4s\n" - "sqadd v23.4s, v23.4s, v28.4s\n" - "and v31.16b, v24.16b, v0.16b\n" - "and v30.16b, v25.16b, v1.16b\n" - "and v29.16b, v26.16b, v2.16b\n" - "and v28.16b, v27.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "125:" // Height 5: no shift correction + "120:" // Height 5: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v5.4s\n" + "sqdmulh v22.4s, v22.4s, v6.4s\n" + "sqdmulh v23.4s, v23.4s, v7.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v5.4s\n" + "sqdmulh v26.4s, v26.4s, v6.4s\n" + "sqdmulh v27.4s, v27.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" @@ -2650,215 +2451,215 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v27.4s, v27.4s, v3.4s\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v29.4s }, [x21]\n" + "ld1r { v6.4s }, [x21]\n" "cmp x17, #0x10\n" - "ld1r { v28.4s }, [x20]\n" - "add v8.4s, v8.4s, v30.4s\n" - "add v9.4s, v9.4s, v30.4s\n" - "add v10.4s, v10.4s, 
v30.4s\n" - "add v11.4s, v11.4s, v30.4s\n" - "add v12.4s, v12.4s, v30.4s\n" - "add v13.4s, v13.4s, v30.4s\n" - "add v14.4s, v14.4s, v30.4s\n" - "add v15.4s, v15.4s, v30.4s\n" - "add v16.4s, v16.4s, v30.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v30.4s\n" - "smin v8.4s, v8.4s, v29.4s\n" - "smin v9.4s, v9.4s, v29.4s\n" - "smin v10.4s, v10.4s, v29.4s\n" - "smin v11.4s, v11.4s, v29.4s\n" - "smin v12.4s, v12.4s, v29.4s\n" - "smin v13.4s, v13.4s, v29.4s\n" - "smin v14.4s, v14.4s, v29.4s\n" - "smin v15.4s, v15.4s, v29.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v8.4s, v8.4s, v28.4s\n" - "smax v9.4s, v9.4s, v28.4s\n" - "smax v10.4s, v10.4s, v28.4s\n" - "smax v11.4s, v11.4s, v28.4s\n" - "smax v12.4s, v12.4s, v28.4s\n" - "smax v13.4s, v13.4s, v28.4s\n" - "smax v14.4s, v14.4s, v28.4s\n" - "smax v15.4s, v15.4s, v28.4s\n" - "smax v16.4s, v16.4s, v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - "smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "ld1r { v5.4s }, [x20]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, 
v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v29.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" 
"uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v28.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v8.16b, v8.16b, v29.16b\n" - "uzp1 v12.16b, v12.16b, v28.16b\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 134f\n" - "tbz x17, #3, 129f\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 129f\n" + "tbz x17, #3, 124f\n" "str d8, [x15], #0x8\n" "str d12, [x26], #0x8\n" "str d16, [x25], #0x8\n" "str d20, [x24], #0x8\n" "str d24, [x23], #0x8\n" - "tbz x17, #2, 127f\n" + "tbz x17, #2, 122f\n" "st1 { v8.s }[2], [x15], #0x4\n" "st1 { v12.s }[2], [x26], #0x4\n" "st1 { v16.s }[2], [x25], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x17, #1, 126f\n" + "tbz x17, #1, 121f\n" "st1 { v8.h }[6], [x15], #0x2\n" "st1 { v12.h }[6], [x26], #0x2\n" "st1 { v16.h }[6], [x25], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x17, #0, 133f\n" + "tbz x17, #0, 128f\n" "st1 { v8.b }[14], [x15]\n" "st1 { v12.b }[14], [x26]\n" "st1 { v16.b }[14], [x25]\n" "st1 { v20.b }[14], [x24]\n" "st1 { v24.b }[14], [x23]\n" - "b 133f\n" - "126:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x17, #0, 133f\n" + "b 128f\n" + "121:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x17, #0, 128f\n" "st1 { v8.b }[12], [x15]\n" "st1 { v12.b }[12], [x26]\n" "st1 { v16.b }[12], [x25]\n" "st1 { v20.b }[12], [x24]\n" "st1 { v24.b }[12], [x23]\n" - "b 133f\n" - "127:" // Height 5: Partial direct writeback: 
partial_2_8 - "tbz x17, #1, 128f\n" + "b 128f\n" + "122:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x17, #1, 123f\n" "st1 { v8.h }[4], [x15], #0x2\n" "st1 { v12.h }[4], [x26], #0x2\n" "st1 { v16.h }[4], [x25], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x17, #0, 133f\n" + "tbz x17, #0, 128f\n" "st1 { v8.b }[10], [x15]\n" "st1 { v12.b }[10], [x26]\n" "st1 { v16.b }[10], [x25]\n" "st1 { v20.b }[10], [x24]\n" "st1 { v24.b }[10], [x23]\n" - "b 133f\n" - "128:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x17, #0, 133f\n" + "b 128f\n" + "123:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x17, #0, 128f\n" "st1 { v8.b }[8], [x15]\n" "st1 { v12.b }[8], [x26]\n" "st1 { v16.b }[8], [x25]\n" "st1 { v20.b }[8], [x24]\n" "st1 { v24.b }[8], [x23]\n" - "b 133f\n" - "129:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x17, #2, 131f\n" + "b 128f\n" + "124:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x17, #2, 126f\n" "str s8, [x15], #0x4\n" "str s12, [x26], #0x4\n" "str s16, [x25], #0x4\n" "str s20, [x24], #0x4\n" "str s24, [x23], #0x4\n" - "tbz x17, #1, 130f\n" + "tbz x17, #1, 125f\n" "st1 { v8.h }[2], [x15], #0x2\n" "st1 { v12.h }[2], [x26], #0x2\n" "st1 { v16.h }[2], [x25], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x17, #0, 133f\n" + "tbz x17, #0, 128f\n" "st1 { v8.b }[6], [x15]\n" "st1 { v12.b }[6], [x26]\n" "st1 { v16.b }[6], [x25]\n" "st1 { v20.b }[6], [x24]\n" "st1 { v24.b }[6], [x23]\n" - "b 133f\n" - "130:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x17, #0, 133f\n" + "b 128f\n" + "125:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x17, #0, 128f\n" "st1 { v8.b }[4], [x15]\n" "st1 { v12.b }[4], [x26]\n" "st1 { v16.b }[4], [x25]\n" "st1 { v20.b }[4], [x24]\n" "st1 { v24.b }[4], [x23]\n" - "b 133f\n" - "131:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x17, #1, 132f\n" + "b 128f\n" + 
"126:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x17, #1, 127f\n" "str h8, [x15], #0x2\n" "str h12, [x26], #0x2\n" "str h16, [x25], #0x2\n" "str h20, [x24], #0x2\n" "str h24, [x23], #0x2\n" - "tbz x17, #0, 133f\n" + "tbz x17, #0, 128f\n" "st1 { v8.b }[2], [x15]\n" "st1 { v12.b }[2], [x26]\n" "st1 { v16.b }[2], [x25]\n" "st1 { v20.b }[2], [x24]\n" "st1 { v24.b }[2], [x23]\n" - "b 133f\n" - "132:" // Height 5: Partial direct writeback: partial_1_0 + "b 128f\n" + "127:" // Height 5: Partial direct writeback: partial_1_0 "str b8, [x15, #0x0]\n" "str b12, [x26, #0x0]\n" "str b16, [x25, #0x0]\n" "str b20, [x24, #0x0]\n" "str b24, [x23, #0x0]\n" - "133:" // Height 5: Partial direct writeback: Done - "b 135f\n" - "134:" // Height 5: Full writeback + "128:" // Height 5: Partial direct writeback: Done + "b 130f\n" + "129:" // Height 5: Full writeback "str q8, [x15, #0x0]\n" "add x15, x15, #0x10\n" "str q12, [x26, #0x0]\n" "str q16, [x25, #0x0]\n" "str q20, [x24, #0x0]\n" "str q24, [x23, #0x0]\n" - "135:" // Height 5: Writeback done + "130:" // Height 5: Writeback done "subs x17, x17, #0x10\n" - "bgt 110b\n" - "b 164f\n" - "136:" // Height 6 + "bgt 106b\n" + "b 158f\n" + "131:" // Height 6 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "mov x20, #0x6\n" "ldr x15, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "mov x6, %x[col_bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x7, %x[col_bias]\n" + "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "madd x20, x21, x20, x15\n" "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "137:" // Height 6: Column loop + "132:" // Height 6: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2883,13 +2684,12 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "138:" // 
Height 6: setup done "mov x14, #0x0\n" - "139:" // Height 6: String loop + "134:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "tbz %x[flags], #3, 140f\n" + "tbz %x[flags], #3, 135f\n" "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x12, [x20, #0x0]\n" @@ -2898,7 +2698,7 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ldr x9, [x20, #0x18]\n" "ldr x28, [x20, #0x20]\n" "ldr x27, [x20, #0x28]\n" - "cbnz x14, 141f\n" + "cbnz x14, 136f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" "add x11, x11, x20\n" @@ -2906,17 +2706,17 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x9, x9, x20\n" "add x28, x28, x20\n" "add x27, x27, x20\n" - "b 141f\n" - "140:" // Height 6: setup direct input + "b 136f\n" + "135:" // Height 6: setup direct input "mov x12, %x[input_ptr]\n" "add x11, x12, x21\n" "add x10, x11, x21\n" "add x9, x10, x21\n" "add x28, x9, x21\n" "add x27, x28, x21\n" - "141:" // Height 6: input setup done + "136:" // Height 6: input setup done "cmp x13, #0x10\n" - "blt 144f\n" + "blt 139f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" "ldr q1, [x11, #0x0]\n" @@ -2926,8 +2726,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ldr q5, [x27, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" - "blt 143f\n" - "142:" // Height 6: Multiply loop: Main loop head + "blt 138f\n" + "137:" // Height 6: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" @@ -3105,8 +2905,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v4.d[1], x22\n" "mov v5.d[1], x21\n" "mov v7.d[1], x20\n" - "bge 142b\n" - "143:" // Height 6: Multiply loop: Single iteration only + "bge 137b\n" + "138:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" 
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" @@ -3231,121 +3031,121 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "144:" // Height 6: Multiply loop: Main loop skip - "cbz x13, 149f\n" + "139:" // Height 6: Multiply loop: Main loop skip + "cbz x13, 144f\n" "cmp x13, #0x4\n" - "blt 146f\n" - "145:" // Height 6: Multiply loop: Odd block loop - "ldr s7, [x12], #0x4\n" + "blt 141f\n" + "140:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s6, [x11], #0x4\n" + "ldr s1, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr s5, [x10], #0x4\n" - "ldr s4, [x9], #0x4\n" - "ldr s3, [x28], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr q1, [x16, #0x0]\n" - "ldr q0, [x16, #0x10]\n" - ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n" - ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n" - ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n" - ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n" - ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n" - ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n" - "ldr q1, [x16, #0x20]\n" - ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n" - ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n" - ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n" - ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n" - ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n" - ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n" - "ldr q0, [x16, #0x30]\n" - ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x9], #0x4\n" + "ldr s4, [x28], #0x4\n" + "ldr s5, [x27], #0x4\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 
// sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n" - ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n" - ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n" - ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n" - ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n" - ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n" - ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n" - ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n" - ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n" - ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n" - ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n" - "bge 145b\n" - "146:" // Height 6: Multiply loop: Skip odd blocks - "cbz x13, 149f\n" - "tbz x13, #1, 147f\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 140b\n" + "141:" // Height 6: 
Multiply loop: Skip odd blocks + "cbz x13, 144f\n" + "tbz x13, #1, 142f\n" "ldr h0, [x12], #0x2\n" "ldr h1, [x11], #0x2\n" "ldr h2, [x10], #0x2\n" "ldr h3, [x9], #0x2\n" "ldr h4, [x28], #0x2\n" "ldr h5, [x27], #0x2\n" - "tbz x13, #0, 148f\n" + "tbz x13, #0, 143f\n" "ld1 { v0.b }[2], [x12]\n" "ld1 { v1.b }[2], [x11]\n" "ld1 { v2.b }[2], [x10]\n" "ld1 { v3.b }[2], [x9]\n" "ld1 { v4.b }[2], [x28]\n" "ld1 { v5.b }[2], [x27]\n" - "b 148f\n" - "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "b 143f\n" + "142:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "ldr b1, [x11, #0x0]\n" "ldr b2, [x10, #0x0]\n" "ldr b3, [x9, #0x0]\n" "ldr b4, [x28, #0x0]\n" "ldr b5, [x27, #0x0]\n" - "148:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x16, #0x0]\n" - "ldr q6, [x16, #0x10]\n" - ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x16, #0x20]\n" - ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x16, #0x30]\n" - ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" + "143:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot 
v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x16, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n" - ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n" - "149:" // Height 6: Multiply loop: No odd multiplies + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "144:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" - "bne 
139b\n" - "ldr q3, [x6, #0x0]\n" - "ldr q2, [x6, #0x10]\n" - "ldr q1, [x6, #0x20]\n" - "ldr q0, [x6, #0x30]\n" - "add v8.4s, v8.4s, v3.4s\n" + "bne 134b\n" + "ldr q0, [x7, #0x0]\n" + "ldr q1, [x7, #0x10]\n" + "ldr q2, [x7, #0x20]\n" + "ldr q3, [x7, #0x30]\n" + "add v8.4s, v8.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add v9.4s, v9.4s, v2.4s\n" + "add v9.4s, v9.4s, v1.4s\n" "prfm pstl1keep, [x15, #0x0]\n" - "add v10.4s, v10.4s, v1.4s\n" - "add v11.4s, v11.4s, v0.4s\n" - "add v12.4s, v12.4s, v3.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v12.4s, v12.4s, v0.4s\n" "add x26, x15, x20\n" - "add v13.4s, v13.4s, v2.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add x25, x26, x20\n" "prfm pstl1keep, [x26, #0x0]\n" "add x24, x25, x20\n" @@ -3355,38 +3155,38 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x22, x23, x20\n" "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "add v14.4s, v14.4s, v1.4s\n" - "add v15.4s, v15.4s, v0.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v1.4s\n" - "add v19.4s, v19.4s, v0.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v1.4s\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v1.4s\n" - "add v27.4s, v27.4s, v0.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v1.4s\n" - "add v31.4s, v31.4s, v0.4s\n" - "add x6, x6, #0x40\n" - "tbz %x[flags], #4, 150f\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, 
v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "add x7, x7, #0x40\n" + "tbz %x[flags], #4, 145f\n" "ldr q0, [x8, #0x0]\n" - "ldr q4, [x7, #0x0]\n" + "ldr q4, [x6, #0x0]\n" "ldr q1, [x8, #0x10]\n" - "ldr q5, [x7, #0x10]\n" + "ldr q5, [x6, #0x10]\n" "ldr q2, [x8, #0x20]\n" - "ldr q6, [x7, #0x20]\n" + "ldr q6, [x6, #0x20]\n" "ldr q3, [x8, #0x30]\n" "add x8, x8, #0x40\n" - "ldr q7, [x7, #0x30]\n" - "add x7, x7, #0x40\n" - "b 151f\n" - "150:" // Height 6: per layer parameters + "ldr q7, [x6, #0x30]\n" + "add x6, x6, #0x40\n" + "b 146f\n" + "145:" // Height 6: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -3397,105 +3197,31 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "151:" // Height 6: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v5.4s\n" - "sqrdmulh v22.4s, v22.4s, v6.4s\n" - "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v5.4s\n" - "sqrdmulh v26.4s, v26.4s, v6.4s\n" - "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v5.4s\n" - "sqrdmulh v30.4s, v30.4s, v6.4s\n" - "sqrdmulh v31.4s, v31.4s, v7.4s\n" - "tbz %x[flags], #5, 152f\n" - "and v7.16b, v8.16b, v0.16b\n" - "and v6.16b, v9.16b, v1.16b\n" - "and v5.16b, v10.16b, v2.16b\n" - "and v4.16b, v11.16b, 
v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v7.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "sqadd v10.4s, v10.4s, v5.4s\n" - "sqadd v11.4s, v11.4s, v4.4s\n" - "and v7.16b, v12.16b, v0.16b\n" - "and v6.16b, v13.16b, v1.16b\n" - "and v5.16b, v14.16b, v2.16b\n" - "and v4.16b, v15.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v7.4s\n" - "sqadd v13.4s, v13.4s, v6.4s\n" - "sqadd v14.4s, v14.4s, v5.4s\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "and v7.16b, v16.16b, v0.16b\n" - "and v6.16b, v17.16b, v1.16b\n" - "and v5.16b, v18.16b, v2.16b\n" - "and v4.16b, v19.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v7.4s\n" - "sqadd v17.4s, v17.4s, v6.4s\n" - "sqadd v18.4s, v18.4s, v5.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "and v7.16b, v20.16b, v0.16b\n" - "and v6.16b, v21.16b, v1.16b\n" - "and v5.16b, v22.16b, v2.16b\n" - "and v4.16b, v23.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v7.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v5.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "and v7.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, v1.16b\n" - "and v5.16b, v26.16b, v2.16b\n" - "and v4.16b, v27.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v7.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v5.4s\n" - "sqadd v27.4s, v27.4s, v4.4s\n" - "and v7.16b, v28.16b, v0.16b\n" - "and v6.16b, v29.16b, v1.16b\n" - "and v5.16b, v30.16b, v2.16b\n" - "and v4.16b, v31.16b, v3.16b\n" - "sshr 
v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v7.4s\n" - "sqadd v29.4s, v29.4s, v6.4s\n" - "sqadd v30.4s, v30.4s, v5.4s\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "152:" // Height 6: no shift correction + "146:" // Height 6: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v5.4s\n" + "sqdmulh v22.4s, v22.4s, v6.4s\n" + "sqdmulh v23.4s, v23.4s, v7.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v5.4s\n" + "sqdmulh v26.4s, v26.4s, v6.4s\n" + "sqdmulh v27.4s, v27.4s, v7.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v5.4s\n" + "sqdmulh v30.4s, v30.4s, v6.4s\n" + "sqdmulh v31.4s, v31.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" @@ -3522,223 +3248,223 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v31.4s, v31.4s, v3.4s\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[maxval]\n" - "ld1r { v2.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v1.4s }, [x21]\n" + "ld1r { v6.4s }, [x21]\n" "cmp x17, #0x10\n" - "ld1r { v0.4s }, [x20]\n" - "add v8.4s, v8.4s, v2.4s\n" - "add v9.4s, v9.4s, v2.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v2.4s\n" - "add v12.4s, v12.4s, v2.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v2.4s\n" - "add v16.4s, v16.4s, v2.4s\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, 
v19.4s, v2.4s\n" - "add v20.4s, v20.4s, v2.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v2.4s\n" - "add v24.4s, v24.4s, v2.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v2.4s\n" - "add v28.4s, v28.4s, v2.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v2.4s\n" - "smin v8.4s, v8.4s, v1.4s\n" - "smin v9.4s, v9.4s, v1.4s\n" - "smin v10.4s, v10.4s, v1.4s\n" - "smin v11.4s, v11.4s, v1.4s\n" - "smin v12.4s, v12.4s, v1.4s\n" - "smin v13.4s, v13.4s, v1.4s\n" - "smin v14.4s, v14.4s, v1.4s\n" - "smin v15.4s, v15.4s, v1.4s\n" - "smin v16.4s, v16.4s, v1.4s\n" - "smin v17.4s, v17.4s, v1.4s\n" - "smin v18.4s, v18.4s, v1.4s\n" - "smin v19.4s, v19.4s, v1.4s\n" - "smin v20.4s, v20.4s, v1.4s\n" - "smin v21.4s, v21.4s, v1.4s\n" - "smin v22.4s, v22.4s, v1.4s\n" - "smin v23.4s, v23.4s, v1.4s\n" - "smin v24.4s, v24.4s, v1.4s\n" - "smin v25.4s, v25.4s, v1.4s\n" - "smin v26.4s, v26.4s, v1.4s\n" - "smin v27.4s, v27.4s, v1.4s\n" - "smin v28.4s, v28.4s, v1.4s\n" - "smin v29.4s, v29.4s, v1.4s\n" - "smin v30.4s, v30.4s, v1.4s\n" - "smin v31.4s, v31.4s, v1.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smax v13.4s, v13.4s, v0.4s\n" - "smax v14.4s, v14.4s, v0.4s\n" - "smax v15.4s, v15.4s, v0.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smax v19.4s, v19.4s, v0.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smax v21.4s, v21.4s, v0.4s\n" - "smax v22.4s, v22.4s, v0.4s\n" - "smax v23.4s, v23.4s, v0.4s\n" - "smax v24.4s, v24.4s, v0.4s\n" - "smax v25.4s, v25.4s, v0.4s\n" - "smax v26.4s, v26.4s, v0.4s\n" - "smax v27.4s, v27.4s, v0.4s\n" - "smax v28.4s, v28.4s, v0.4s\n" - "smax v29.4s, v29.4s, v0.4s\n" - "smax v30.4s, v30.4s, v0.4s\n" - "smax v31.4s, v31.4s, v0.4s\n" + "ld1r { v5.4s }, 
[x20]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax 
v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v2.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v1.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v8.16b, v8.16b, v2.16b\n" - "uzp1 v12.16b, v12.16b, v1.16b\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - "uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 161f\n" - "tbz x17, #3, 156f\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 155f\n" + "tbz x17, #3, 150f\n" "str d8, [x15], #0x8\n" "str d12, [x26], #0x8\n" "str d16, [x25], #0x8\n" "str d20, [x24], #0x8\n" "str d24, [x23], #0x8\n" "str d28, [x22], #0x8\n" - "tbz x17, #2, 154f\n" + "tbz x17, #2, 148f\n" "st1 { v8.s }[2], [x15], #0x4\n" "st1 { v12.s }[2], [x26], #0x4\n" "st1 { v16.s }[2], [x25], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" "st1 { v24.s }[2], [x23], #0x4\n" "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x17, #1, 153f\n" + "tbz x17, #1, 147f\n" "st1 { v8.h }[6], [x15], #0x2\n" "st1 { v12.h }[6], [x26], #0x2\n" "st1 { v16.h }[6], 
[x25], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" "st1 { v24.h }[6], [x23], #0x2\n" "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x17, #0, 160f\n" + "tbz x17, #0, 154f\n" "st1 { v8.b }[14], [x15]\n" "st1 { v12.b }[14], [x26]\n" "st1 { v16.b }[14], [x25]\n" "st1 { v20.b }[14], [x24]\n" "st1 { v24.b }[14], [x23]\n" "st1 { v28.b }[14], [x22]\n" - "b 160f\n" - "153:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x17, #0, 160f\n" + "b 154f\n" + "147:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x17, #0, 154f\n" "st1 { v8.b }[12], [x15]\n" "st1 { v12.b }[12], [x26]\n" "st1 { v16.b }[12], [x25]\n" "st1 { v20.b }[12], [x24]\n" "st1 { v24.b }[12], [x23]\n" "st1 { v28.b }[12], [x22]\n" - "b 160f\n" - "154:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x17, #1, 155f\n" + "b 154f\n" + "148:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x17, #1, 149f\n" "st1 { v8.h }[4], [x15], #0x2\n" "st1 { v12.h }[4], [x26], #0x2\n" "st1 { v16.h }[4], [x25], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" "st1 { v24.h }[4], [x23], #0x2\n" "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x17, #0, 160f\n" + "tbz x17, #0, 154f\n" "st1 { v8.b }[10], [x15]\n" "st1 { v12.b }[10], [x26]\n" "st1 { v16.b }[10], [x25]\n" "st1 { v20.b }[10], [x24]\n" "st1 { v24.b }[10], [x23]\n" "st1 { v28.b }[10], [x22]\n" - "b 160f\n" - "155:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x17, #0, 160f\n" + "b 154f\n" + "149:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x17, #0, 154f\n" "st1 { v8.b }[8], [x15]\n" "st1 { v12.b }[8], [x26]\n" "st1 { v16.b }[8], [x25]\n" "st1 { v20.b }[8], [x24]\n" "st1 { v24.b }[8], [x23]\n" "st1 { v28.b }[8], [x22]\n" - "b 160f\n" - "156:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x17, #2, 158f\n" + "b 154f\n" + "150:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x17, #2, 152f\n" "str s8, [x15], #0x4\n" "str s12, [x26], #0x4\n" "str s16, [x25], #0x4\n" "str s20, [x24], #0x4\n" "str s24, [x23], 
#0x4\n" "str s28, [x22], #0x4\n" - "tbz x17, #1, 157f\n" + "tbz x17, #1, 151f\n" "st1 { v8.h }[2], [x15], #0x2\n" "st1 { v12.h }[2], [x26], #0x2\n" "st1 { v16.h }[2], [x25], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" "st1 { v24.h }[2], [x23], #0x2\n" "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x17, #0, 160f\n" + "tbz x17, #0, 154f\n" "st1 { v8.b }[6], [x15]\n" "st1 { v12.b }[6], [x26]\n" "st1 { v16.b }[6], [x25]\n" "st1 { v20.b }[6], [x24]\n" "st1 { v24.b }[6], [x23]\n" "st1 { v28.b }[6], [x22]\n" - "b 160f\n" - "157:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x17, #0, 160f\n" + "b 154f\n" + "151:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x17, #0, 154f\n" "st1 { v8.b }[4], [x15]\n" "st1 { v12.b }[4], [x26]\n" "st1 { v16.b }[4], [x25]\n" "st1 { v20.b }[4], [x24]\n" "st1 { v24.b }[4], [x23]\n" "st1 { v28.b }[4], [x22]\n" - "b 160f\n" - "158:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x17, #1, 159f\n" + "b 154f\n" + "152:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x17, #1, 153f\n" "str h8, [x15], #0x2\n" "str h12, [x26], #0x2\n" "str h16, [x25], #0x2\n" "str h20, [x24], #0x2\n" "str h24, [x23], #0x2\n" "str h28, [x22], #0x2\n" - "tbz x17, #0, 160f\n" + "tbz x17, #0, 154f\n" "st1 { v8.b }[2], [x15]\n" "st1 { v12.b }[2], [x26]\n" "st1 { v16.b }[2], [x25]\n" "st1 { v20.b }[2], [x24]\n" "st1 { v24.b }[2], [x23]\n" "st1 { v28.b }[2], [x22]\n" - "b 160f\n" - "159:" // Height 6: Partial direct writeback: partial_1_0 + "b 154f\n" + "153:" // Height 6: Partial direct writeback: partial_1_0 "str b8, [x15, #0x0]\n" "str b12, [x26, #0x0]\n" "str b16, [x25, #0x0]\n" "str b20, [x24, #0x0]\n" "str b24, [x23, #0x0]\n" "str b28, [x22, #0x0]\n" - "160:" // Height 6: Partial direct writeback: Done - "b 162f\n" - "161:" // Height 6: Full writeback + "154:" // Height 6: Partial direct writeback: Done + "b 156f\n" + "155:" // Height 6: Full writeback "str q8, [x15, #0x0]\n" "add x15, x15, #0x10\n" "str q12, [x26, #0x0]\n" @@ 
-3746,21 +3472,21 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "str q20, [x24, #0x0]\n" "str q24, [x23, #0x0]\n" "str q28, [x22, #0x0]\n" - "162:" // Height 6: Writeback done + "156:" // Height 6: Writeback done "subs x17, x17, #0x10\n" - "bgt 137b\n" + "bgt 132b\n" "subs %x[M], %x[M], #0x6\n" - "beq 164f\n" + "beq 158f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 163f\n" + "tbz %x[flags], #3, 157f\n" "add x21, x21, #0x6\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "163:" // Update direct input + "157:" // Update direct input "mov x20, #0x6\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "164:" // Exit + "158:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", 
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp index dbff7baee7..242c75eef4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -81,22 +80,19 @@ void a64_hybrid_s8qs_dot_6x16 ( ka.multiplier_ptr=qp->per_channel_muls + col_base; ka.shift_ptr=qp->per_channel_right_shifts + col_base; } - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 136f\n" + "bge 131f\n" "cmp %x[M], #0x4\n" - "bgt 109f\n" - "beq 82f\n" + "bgt 105f\n" + "beq 79f\n" "cmp %x[M], #0x2\n" - "bgt 55f\n" - "beq 28f\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 53f\n" + "beq 27f\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" @@ -105,7 +101,6 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" "movi v11.4s, #0x0\n" - "3:" // Height 1: setup done "mov x28, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -131,96 +126,96 @@ void a64_hybrid_s8qs_dot_6x16 ( "blt 8f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q17, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q16, [x10, 
#0x30]\n" + "ldr q7, [x10, #0x30]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - "ldr q16, [x10, #0xf0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // 
sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 7b\n" "8:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q17, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - "ldr q16, [x10, #0xf0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x10, 
#0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "9:" // Height 1: Multiply loop: Main loop skip "cbz x27, 14f\n" "cmp x27, #0x4\n" "blt 11f\n" "10:" // Height 1: Multiply loop: Odd block loop - "ldr s18, [x26], #0x4\n" - "ldr q17, [x10, #0x0]\n" + "ldr s0, [x26], #0x4\n" + "ldr q6, [x10, #0x0]\n" "sub x27, x27, #0x4\n" - "ldr q16, [x10, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "cmp x27, #0x4\n" - ".inst 0x4f92e228 // sdot v8.4s, v17.16b, v18.4b[0]\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n" - ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, 
v7.16b, v0.4b[0]\n" "bge 10b\n" "11:" // Height 1: Multiply loop: Skip odd blocks "cbz x27, 14f\n" @@ -232,41 +227,41 @@ void a64_hybrid_s8qs_dot_6x16 ( "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "13:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" "14:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 4b\n" - "ldr q19, [x14, #0x0]\n" - "ldr q18, [x14, #0x10]\n" - "ldr q17, [x14, #0x20]\n" - "ldr q16, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add v8.4s, v8.4s, v19.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v17.4s\n" - "add v11.4s, v11.4s, v16.4s\n" - "tbz %x[flags], #4, 15f\n" "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" "add x12, x12, #0x40\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 15f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" 
+ "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" + "add x14, x14, #0x40\n" "b 16f\n" "15:" // Height 1: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" @@ -280,111 +275,97 @@ void a64_hybrid_s8qs_dot_6x16 ( "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" "16:" // Height 1: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "tbz %x[flags], #5, 17f\n" - "and v19.16b, v8.16b, v0.16b\n" - "and v18.16b, v9.16b, v1.16b\n" - "and v17.16b, v10.16b, v2.16b\n" - "and v16.16b, v11.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v19.4s\n" - "sqadd v9.4s, v9.4s, v18.4s\n" - "sqadd v10.4s, v10.4s, v17.4s\n" - "sqadd v11.4s, v11.4s, v16.4s\n" - "17:" // Height 1: no shift correction + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v4.4s }, [x21]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "ld1r { v6.4s }, [x20]\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "ld1r { v5.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v18.4s }, [x21]\n" - "ld1r { v17.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v16.4s }, [x20]\n" - "add v8.4s, v8.4s, v18.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v18.4s\n" - "add v11.4s, v11.4s, v18.4s\n" - "smin v8.4s, v8.4s, v17.4s\n" - "smin v9.4s, v9.4s, v17.4s\n" - "smin v10.4s, v10.4s, v17.4s\n" - "smin v11.4s, v11.4s, v17.4s\n" - "smax v8.4s, v8.4s, v16.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "smax v10.4s, 
v10.4s, v16.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v16.8h, v10.8h, v11.8h\n" - "uzp1 v8.16b, v8.16b, v16.16b\n" - "bge 26f\n" - "tbz x11, #3, 21f\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "bge 25f\n" + "tbz x11, #3, 20f\n" "str d8, [x9], #0x8\n" - "tbz x11, #2, 19f\n" + "tbz x11, #2, 18f\n" "st1 { v8.s }[2], [x9], #0x4\n" - "tbz x11, #1, 18f\n" + "tbz x11, #1, 17f\n" "st1 { v8.h }[6], [x9], #0x2\n" - "tbz x11, #0, 25f\n" + "tbz x11, #0, 24f\n" "st1 { v8.b }[14], [x9]\n" - "b 25f\n" - "18:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x11, #0, 25f\n" + "b 24f\n" + "17:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 24f\n" "st1 { v8.b }[12], [x9]\n" - "b 25f\n" - "19:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x11, #1, 20f\n" + "b 24f\n" + "18:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 19f\n" "st1 { v8.h }[4], [x9], #0x2\n" - "tbz x11, #0, 25f\n" + "tbz x11, #0, 24f\n" "st1 { v8.b }[10], [x9]\n" - "b 25f\n" - "20:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x11, #0, 25f\n" + "b 24f\n" + "19:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 24f\n" "st1 { v8.b }[8], [x9]\n" - "b 25f\n" - "21:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x11, #2, 23f\n" + "b 24f\n" + "20:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 22f\n" "str s8, [x9], #0x4\n" - "tbz x11, #1, 22f\n" + "tbz x11, #1, 21f\n" "st1 { v8.h }[2], [x9], #0x2\n" - "tbz x11, #0, 25f\n" + "tbz x11, #0, 24f\n" "st1 { v8.b }[6], 
[x9]\n" - "b 25f\n" - "22:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x11, #0, 25f\n" + "b 24f\n" + "21:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 24f\n" "st1 { v8.b }[4], [x9]\n" - "b 25f\n" - "23:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x11, #1, 24f\n" + "b 24f\n" + "22:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 23f\n" "str h8, [x9], #0x2\n" - "tbz x11, #0, 25f\n" + "tbz x11, #0, 24f\n" "st1 { v8.b }[2], [x9]\n" - "b 25f\n" - "24:" // Height 1: Partial direct writeback: partial_1_0 + "b 24f\n" + "23:" // Height 1: Partial direct writeback: partial_1_0 "str b8, [x9, #0x0]\n" - "25:" // Height 1: Partial direct writeback: Done - "b 27f\n" - "26:" // Height 1: Full writeback + "24:" // Height 1: Partial direct writeback: Done + "b 26f\n" + "25:" // Height 1: Full writeback "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "27:" // Height 1: Writeback done + "26:" // Height 1: Writeback done "subs x11, x11, #0x10\n" "bgt 2b\n" - "b 164f\n" - "28:" // Height 2 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "b 158f\n" + "27:" // Height 2 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "29:" // Height 2: Column loop + "28:" // Height 2: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -393,230 +374,229 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v13.4s, #0x0\n" "movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "30:" // Height 2: setup done "mov x28, #0x0\n" - "31:" // Height 2: String loop + "30:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, 
[x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 32f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" - "cbnz x28, 33f\n" + "cbnz x28, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" - "b 33f\n" - "32:" // Height 2: setup direct input + "b 32f\n" + "31:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" - "33:" // Height 2: input setup done + "32:" // Height 2: input setup done "cmp x27, #0x10\n" - "blt 36f\n" + "blt 35f\n" "ldr q0, [x26, #0x0]\n" "ldr q1, [x25, #0x0]\n" "cmp x27, #0x20\n" "ldr q6, [x10, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "blt 35f\n" - "34:" // Height 2: Multiply loop: Main loop head + "blt 34f\n" + "33:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q17, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "sub x27, x27, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" - "ldr q17, [x10, #0x40]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x10, #0x40]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - ".inst 0x4fa1e20d // sdot 
v13.4s, v16.16b, v1.4b[1]\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" - "ldr q16, [x10, #0xf0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr 
q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "bge 34b\n" - "35:" // Height 2: Multiply loop: Single iteration only + "bge 33b\n" + "34:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q17, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "add x26, x26, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "add x25, x25, #0x10\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f81e22e // sdot v14.4s, v17.16b, 
v1.4b[0]\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" - ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" - ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" - ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" - ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" - ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" - "ldr q16, [x10, #0xf0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, 
[x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" - ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" - ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" - "36:" // Height 2: Multiply loop: Main loop skip - "cbz x27, 41f\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "35:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 40f\n" "cmp x27, #0x4\n" - "blt 38f\n" - "37:" // Height 2: Multiply loop: Odd block loop - "ldr s19, [x26], #0x4\n" - "ldr 
s18, [x25], #0x4\n" + "blt 37f\n" + "36:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" "cmp x27, #0x4\n" - ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n" - ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n" - ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n" - ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n" - ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n" - ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n" - "bge 37b\n" - "38:" // Height 2: Multiply loop: Skip odd blocks - "cbz x27, 41f\n" - "tbz x27, #1, 39f\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 36b\n" + "37:" // Height 2: Multiply loop: Skip odd blocks + "cbz x27, 40f\n" + "tbz x27, #1, 38f\n" "ldr h0, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" - "tbz x27, #0, 40f\n" + "tbz x27, #0, 39f\n" "ld1 { v0.b }[2], [x26]\n" "ld1 { v1.b }[2], [x25]\n" - "b 40f\n" - "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 39f\n" + "38:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" - "40:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q17, [x10, #0x0]\n" - "ldr 
q16, [x10, #0x10]\n" - ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" - ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n" - "ldr q16, [x10, #0x30]\n" + "39:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" - ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" - ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" - "41:" // Height 2: Multiply loop: No odd multiplies + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "40:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 31b\n" - "ldr q19, [x14, #0x0]\n" - "ldr q18, [x14, #0x10]\n" - "ldr q17, [x14, #0x20]\n" - "ldr q16, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add v8.4s, v8.4s, v19.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v12.4s, v12.4s, v19.4s\n" - "add v13.4s, v13.4s, v18.4s\n" - "add v10.4s, v10.4s, v17.4s\n" - "add v11.4s, v11.4s, v16.4s\n" - "add x26, x9, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add v14.4s, v14.4s, v17.4s\n" - "add v15.4s, v15.4s, v16.4s\n" - "tbz %x[flags], #4, 42f\n" + "bne 30b\n" "ldr q0, [x12, #0x0]\n" - 
"ldr q4, [x13, #0x0]\n" "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" "add x12, x12, #0x40\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add x27, x9, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "tbz %x[flags], #4, 41f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 43f\n" - "42:" // Height 2: per layer parameters + "add x14, x14, #0x40\n" + "b 42f\n" + "41:" // Height 2: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -627,163 +607,137 @@ void a64_hybrid_s8qs_dot_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "43:" // Height 2: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "tbz %x[flags], #5, 44f\n" - "and v19.16b, v8.16b, v0.16b\n" - "and v18.16b, v9.16b, v1.16b\n" - "and v17.16b, v10.16b, v2.16b\n" - "and v16.16b, v11.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v19.4s\n" - "and v19.16b, v12.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v18.4s\n" - "and v18.16b, v13.16b, 
v1.16b\n" - "sqadd v10.4s, v10.4s, v17.4s\n" - "sqadd v11.4s, v11.4s, v16.4s\n" - "and v17.16b, v14.16b, v2.16b\n" - "and v16.16b, v15.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v19.4s\n" - "sqadd v13.4s, v13.4s, v18.4s\n" - "sqadd v14.4s, v14.4s, v17.4s\n" - "sqadd v15.4s, v15.4s, v16.4s\n" - "44:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "42:" // Height 2: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v18.4s }, [x21]\n" - "ld1r { v17.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v16.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" - "add v8.4s, v8.4s, v18.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v18.4s\n" - "add v11.4s, v11.4s, v18.4s\n" - "add v12.4s, v12.4s, v18.4s\n" - "add v13.4s, v13.4s, v18.4s\n" - "add v14.4s, v14.4s, v18.4s\n" - "add v15.4s, v15.4s, v18.4s\n" - "smin v8.4s, v8.4s, v17.4s\n" - "smin v9.4s, v9.4s, v17.4s\n" - "smin v10.4s, v10.4s, v17.4s\n" - "smin v11.4s, v11.4s, v17.4s\n" - "smin v12.4s, v12.4s, v17.4s\n" - "smin v13.4s, v13.4s, v17.4s\n" - "smin v14.4s, v14.4s, v17.4s\n" - "smin v15.4s, v15.4s, v17.4s\n" - "smax v8.4s, 
v8.4s, v16.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "smax v12.4s, v12.4s, v16.4s\n" - "smax v13.4s, v13.4s, v16.4s\n" - "smax v14.4s, v14.4s, v16.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v17.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v16.8h, v14.8h, v15.8h\n" - "uzp1 v8.16b, v8.16b, v17.16b\n" - "uzp1 v12.16b, v12.16b, v16.16b\n" - "bge 53f\n" - "tbz x11, #3, 48f\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "bge 51f\n" + "tbz x11, #3, 46f\n" "str d8, [x9], #0x8\n" - "str d12, [x26], #0x8\n" - "tbz x11, #2, 46f\n" + "str d12, [x27], #0x8\n" + "tbz x11, #2, 44f\n" "st1 { v8.s }[2], [x9], #0x4\n" - "st1 { v12.s }[2], [x26], #0x4\n" - "tbz x11, #1, 45f\n" + "st1 { v12.s }[2], [x27], #0x4\n" + "tbz x11, #1, 43f\n" "st1 { v8.h }[6], [x9], #0x2\n" - "st1 { v12.h }[6], [x26], #0x2\n" - "tbz x11, #0, 52f\n" + "st1 { v12.h }[6], [x27], #0x2\n" + "tbz x11, #0, 50f\n" "st1 { v8.b }[14], [x9]\n" - "st1 { v12.b }[14], [x26]\n" - "b 52f\n" - "45:" // Height 2: Partial direct writeback: partial_1_12 - "tbz 
x11, #0, 52f\n" + "st1 { v12.b }[14], [x27]\n" + "b 50f\n" + "43:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 50f\n" "st1 { v8.b }[12], [x9]\n" - "st1 { v12.b }[12], [x26]\n" - "b 52f\n" - "46:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x11, #1, 47f\n" + "st1 { v12.b }[12], [x27]\n" + "b 50f\n" + "44:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 45f\n" "st1 { v8.h }[4], [x9], #0x2\n" - "st1 { v12.h }[4], [x26], #0x2\n" - "tbz x11, #0, 52f\n" + "st1 { v12.h }[4], [x27], #0x2\n" + "tbz x11, #0, 50f\n" "st1 { v8.b }[10], [x9]\n" - "st1 { v12.b }[10], [x26]\n" - "b 52f\n" - "47:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x11, #0, 52f\n" + "st1 { v12.b }[10], [x27]\n" + "b 50f\n" + "45:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 50f\n" "st1 { v8.b }[8], [x9]\n" - "st1 { v12.b }[8], [x26]\n" - "b 52f\n" - "48:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x11, #2, 50f\n" + "st1 { v12.b }[8], [x27]\n" + "b 50f\n" + "46:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 48f\n" "str s8, [x9], #0x4\n" - "str s12, [x26], #0x4\n" - "tbz x11, #1, 49f\n" + "str s12, [x27], #0x4\n" + "tbz x11, #1, 47f\n" "st1 { v8.h }[2], [x9], #0x2\n" - "st1 { v12.h }[2], [x26], #0x2\n" - "tbz x11, #0, 52f\n" + "st1 { v12.h }[2], [x27], #0x2\n" + "tbz x11, #0, 50f\n" "st1 { v8.b }[6], [x9]\n" - "st1 { v12.b }[6], [x26]\n" - "b 52f\n" - "49:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x11, #0, 52f\n" + "st1 { v12.b }[6], [x27]\n" + "b 50f\n" + "47:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 50f\n" "st1 { v8.b }[4], [x9]\n" - "st1 { v12.b }[4], [x26]\n" - "b 52f\n" - "50:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x11, #1, 51f\n" + "st1 { v12.b }[4], [x27]\n" + "b 50f\n" + "48:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 49f\n" "str h8, [x9], #0x2\n" - "str h12, [x26], #0x2\n" - "tbz 
x11, #0, 52f\n" + "str h12, [x27], #0x2\n" + "tbz x11, #0, 50f\n" "st1 { v8.b }[2], [x9]\n" - "st1 { v12.b }[2], [x26]\n" - "b 52f\n" - "51:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v12.b }[2], [x27]\n" + "b 50f\n" + "49:" // Height 2: Partial direct writeback: partial_1_0 "str b8, [x9, #0x0]\n" - "str b12, [x26, #0x0]\n" - "52:" // Height 2: Partial direct writeback: Done - "b 54f\n" - "53:" // Height 2: Full writeback + "str b12, [x27, #0x0]\n" + "50:" // Height 2: Partial direct writeback: Done + "b 52f\n" + "51:" // Height 2: Full writeback "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q12, [x26, #0x0]\n" - "54:" // Height 2: Writeback done + "str q12, [x27, #0x0]\n" + "52:" // Height 2: Writeback done "subs x11, x11, #0x10\n" - "bgt 29b\n" - "b 164f\n" - "55:" // Height 3 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 28b\n" + "b 158f\n" + "53:" // Height 3 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "56:" // Height 3: Column loop + "54:" // Height 3: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -796,289 +750,288 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "57:" // Height 3: setup done "mov x28, #0x0\n" - "58:" // Height 3: String loop + "56:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 59f\n" + "tbz %x[flags], #3, 57f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" 
- "cbnz x28, 60f\n" + "cbnz x28, 58f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" - "b 60f\n" - "59:" // Height 3: setup direct input + "b 58f\n" + "57:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" - "60:" // Height 3: input setup done + "58:" // Height 3: input setup done "cmp x27, #0x10\n" - "blt 63f\n" + "blt 61f\n" "ldr q0, [x26, #0x0]\n" "ldr q1, [x25, #0x0]\n" "cmp x27, #0x20\n" "ldr q2, [x24, #0x0]\n" "ldr q6, [x10, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "blt 62f\n" - "61:" // Height 3: Multiply loop: Main loop head + "blt 60f\n" + "59:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q21, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x25, x25, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q20, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "add x24, x24, #0x10\n" "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" - ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" - "ldr q21, [x10, #0x40]\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" - ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" - "ldr q20, [x10, #0x50]\n" - ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" - ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" - "ldr q21, [x10, #0x60]\n" - ".inst 0x4fa0e289 // 
sdot v9.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" - "ldr q20, [x10, #0x70]\n" - ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" - ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" - "ldr q21, [x10, #0x80]\n" - ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" - "ldr q20, [x10, #0x90]\n" - ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" - ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" - "ldr q21, [x10, #0xa0]\n" - ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" - "ldr q20, [x10, #0xb0]\n" - ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" - ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" - "ldr q21, [x10, #0xc0]\n" - ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" - "ldr q20, [x10, #0xd0]\n" - ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" - ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" - "ldr q21, [x10, #0xe0]\n" - ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" - ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" - ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" - "ldr q20, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" - ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" - "ldr q6, [x10, #0x0]\n" - ".inst 
0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" - "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" - "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" - "ldr q2, [x24, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "bge 61b\n" - "62:" // Height 3: Multiply loop: Single iteration only - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q21, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q20, [x10, #0x30]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" - ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" - "ldr q21, [x10, #0x40]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" - ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" - "ldr q20, [x10, #0x50]\n" - ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" - ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" - ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" - "ldr q21, [x10, #0x60]\n" - ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" - "ldr q20, [x10, #0x70]\n" - ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" - ".inst 
0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" - ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" - "ldr q21, [x10, #0x80]\n" - ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" - ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" - ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" - "ldr q20, [x10, #0x90]\n" - ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" - ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" - "ldr q21, [x10, #0xa0]\n" - ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" - "ldr q20, [x10, #0xb0]\n" - ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" - ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" - ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" - "ldr q21, [x10, #0xc0]\n" - ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" - ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" - ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" - "ldr q20, [x10, #0xd0]\n" - ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" - ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" - "ldr q21, [x10, #0xe0]\n" - ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" - ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" - ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" - "ldr q20, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" - ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" - ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" - ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" - ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" - "63:" // Height 3: Multiply loop: Main loop skip - "cbz x27, 68f\n" - "cmp x27, #0x4\n" - "blt 65f\n" - 
"64:" // Height 3: Multiply loop: Odd block loop - "ldr s24, [x26], #0x4\n" - "ldr s23, [x25], #0x4\n" - "sub x27, x27, #0x4\n" - "ldr s22, [x24], #0x4\n" - "ldr q21, [x10, #0x0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 
0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x10, #0x0]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x26, #0x0]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x25, #0x0]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x24, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 59b\n" + "60:" // Height 3: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x10, #0x30]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 
0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, 
v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "61:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 66f\n" "cmp x27, #0x4\n" - "ldr q20, [x10, #0x10]\n" - ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n" - ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n" - ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n" - "ldr q21, [x10, #0x20]\n" - ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n" - ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n" - ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n" - "ldr q20, [x10, #0x30]\n" + "blt 63f\n" + "62:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x25], #0x4\n" + "sub x27, x27, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr q6, [x10, #0x0]\n" + "cmp x27, #0x4\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n" - ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n" - ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n" - ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n" - ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n" - ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n" - "bge 64b\n" - "65:" // Height 3: Multiply loop: Skip odd blocks - "cbz x27, 68f\n" - "tbz x27, #1, 66f\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 
0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 62b\n" + "63:" // Height 3: Multiply loop: Skip odd blocks + "cbz x27, 66f\n" + "tbz x27, #1, 64f\n" "ldr h0, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "ldr h2, [x24], #0x2\n" - "tbz x27, #0, 67f\n" + "tbz x27, #0, 65f\n" "ld1 { v0.b }[2], [x26]\n" "ld1 { v1.b }[2], [x25]\n" "ld1 { v2.b }[2], [x24]\n" - "b 67f\n" - "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 65f\n" + "64:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" - "67:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q21, [x10, #0x0]\n" - "ldr q20, [x10, #0x10]\n" - ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n" - ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n" - "ldr q21, [x10, #0x20]\n" - ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n" - ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n" - "ldr q20, [x10, #0x30]\n" + "65:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" - ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" - ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" - ".inst 0x4f81e28f // sdot 
v15.4s, v20.16b, v1.4b[0]\n" - ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" - "68:" // Height 3: Multiply loop: No odd multiplies + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "66:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 58b\n" - "ldr q23, [x14, #0x0]\n" - "ldr q22, [x14, #0x10]\n" - "ldr q21, [x14, #0x20]\n" - "ldr q20, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add v8.4s, v8.4s, v23.4s\n" - "add v9.4s, v9.4s, v22.4s\n" - "add v12.4s, v12.4s, v23.4s\n" - "add v13.4s, v13.4s, v22.4s\n" - "add v10.4s, v10.4s, v21.4s\n" - "add v11.4s, v11.4s, v20.4s\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add v14.4s, v14.4s, v21.4s\n" - "add v15.4s, v15.4s, v20.4s\n" - "add v16.4s, v16.4s, v23.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v20.4s\n" - "tbz %x[flags], #4, 69f\n" + "bne 56b\n" "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" "add x12, x12, #0x40\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x27, 
#0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 67f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 70f\n" - "69:" // Height 3: per layer parameters + "add x14, x14, #0x40\n" + "b 68f\n" + "67:" // Height 3: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -1089,214 +1042,176 @@ void a64_hybrid_s8qs_dot_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "70:" // Height 3: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "tbz %x[flags], #5, 71f\n" - "and v23.16b, v8.16b, v0.16b\n" - "and v22.16b, v9.16b, v1.16b\n" - "and v21.16b, v10.16b, v2.16b\n" - "and v20.16b, v11.16b, v3.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v23.4s\n" - "and v23.16b, v12.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v22.4s\n" - "and v22.16b, v13.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v21.4s\n" - "sqadd v11.4s, v11.4s, v20.4s\n" - "and v21.16b, v14.16b, v2.16b\n" - "and v20.16b, v15.16b, v3.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - 
"sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v23.4s\n" - "and v23.16b, v16.16b, v0.16b\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "and v22.16b, v17.16b, v1.16b\n" - "sqadd v14.4s, v14.4s, v21.4s\n" - "sqadd v15.4s, v15.4s, v20.4s\n" - "and v21.16b, v18.16b, v2.16b\n" - "and v20.16b, v19.16b, v3.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "71:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "68:" // Height 3: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x21]\n" - "ld1r { v21.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v20.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add v8.4s, v8.4s, v22.4s\n" - "add v9.4s, v9.4s, v22.4s\n" - "add v10.4s, 
v10.4s, v22.4s\n" - "add v11.4s, v11.4s, v22.4s\n" - "add v12.4s, v12.4s, v22.4s\n" - "add v13.4s, v13.4s, v22.4s\n" - "add v14.4s, v14.4s, v22.4s\n" - "add v15.4s, v15.4s, v22.4s\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v8.4s, v8.4s, v21.4s\n" - "smin v9.4s, v9.4s, v21.4s\n" - "smin v10.4s, v10.4s, v21.4s\n" - "smin v11.4s, v11.4s, v21.4s\n" - "smin v12.4s, v12.4s, v21.4s\n" - "smin v13.4s, v13.4s, v21.4s\n" - "smin v14.4s, v14.4s, v21.4s\n" - "smin v15.4s, v15.4s, v21.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v8.4s, v8.4s, v20.4s\n" - "smax v9.4s, v9.4s, v20.4s\n" - "smax v10.4s, v10.4s, v20.4s\n" - "smax v11.4s, v11.4s, v20.4s\n" - "smax v12.4s, v12.4s, v20.4s\n" - "smax v13.4s, v13.4s, v20.4s\n" - "smax v14.4s, v14.4s, v20.4s\n" - "smax v15.4s, v15.4s, v20.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax 
v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v21.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v20.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v8.16b, v8.16b, v21.16b\n" - "uzp1 v12.16b, v12.16b, v20.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 80f\n" - "tbz x11, #3, 75f\n" + "bge 77f\n" + "tbz x11, #3, 72f\n" "str d8, [x9], #0x8\n" - "str d12, [x26], #0x8\n" - "str d16, [x25], #0x8\n" - "tbz x11, #2, 73f\n" + "str d12, [x27], #0x8\n" + "str d16, [x26], #0x8\n" + "tbz x11, #2, 70f\n" "st1 { v8.s }[2], [x9], #0x4\n" - "st1 { v12.s }[2], [x26], #0x4\n" - "st1 { v16.s }[2], [x25], #0x4\n" - "tbz x11, #1, 72f\n" + "st1 { v12.s }[2], [x27], #0x4\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz x11, #1, 69f\n" "st1 { v8.h }[6], [x9], #0x2\n" - "st1 { v12.h }[6], [x26], #0x2\n" - "st1 { v16.h }[6], [x25], #0x2\n" - "tbz x11, #0, 79f\n" + "st1 { v12.h }[6], [x27], #0x2\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x11, #0, 76f\n" "st1 { v8.b }[14], [x9]\n" - "st1 { v12.b }[14], [x26]\n" - "st1 { v16.b }[14], [x25]\n" - "b 79f\n" - "72:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x11, #0, 79f\n" + "st1 { v12.b }[14], [x27]\n" + "st1 { v16.b }[14], [x26]\n" + "b 76f\n" + "69:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 76f\n" "st1 { v8.b }[12], [x9]\n" - "st1 { v12.b }[12], [x26]\n" - "st1 { v16.b }[12], [x25]\n" - "b 79f\n" - "73:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x11, #1, 74f\n" + "st1 { v12.b }[12], [x27]\n" + "st1 { v16.b }[12], 
[x26]\n" + "b 76f\n" + "70:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 71f\n" "st1 { v8.h }[4], [x9], #0x2\n" - "st1 { v12.h }[4], [x26], #0x2\n" - "st1 { v16.h }[4], [x25], #0x2\n" - "tbz x11, #0, 79f\n" + "st1 { v12.h }[4], [x27], #0x2\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x11, #0, 76f\n" "st1 { v8.b }[10], [x9]\n" - "st1 { v12.b }[10], [x26]\n" - "st1 { v16.b }[10], [x25]\n" - "b 79f\n" - "74:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x11, #0, 79f\n" + "st1 { v12.b }[10], [x27]\n" + "st1 { v16.b }[10], [x26]\n" + "b 76f\n" + "71:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 76f\n" "st1 { v8.b }[8], [x9]\n" - "st1 { v12.b }[8], [x26]\n" - "st1 { v16.b }[8], [x25]\n" - "b 79f\n" - "75:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x11, #2, 77f\n" + "st1 { v12.b }[8], [x27]\n" + "st1 { v16.b }[8], [x26]\n" + "b 76f\n" + "72:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 74f\n" "str s8, [x9], #0x4\n" - "str s12, [x26], #0x4\n" - "str s16, [x25], #0x4\n" - "tbz x11, #1, 76f\n" + "str s12, [x27], #0x4\n" + "str s16, [x26], #0x4\n" + "tbz x11, #1, 73f\n" "st1 { v8.h }[2], [x9], #0x2\n" - "st1 { v12.h }[2], [x26], #0x2\n" - "st1 { v16.h }[2], [x25], #0x2\n" - "tbz x11, #0, 79f\n" + "st1 { v12.h }[2], [x27], #0x2\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x11, #0, 76f\n" "st1 { v8.b }[6], [x9]\n" - "st1 { v12.b }[6], [x26]\n" - "st1 { v16.b }[6], [x25]\n" - "b 79f\n" - "76:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x11, #0, 79f\n" + "st1 { v12.b }[6], [x27]\n" + "st1 { v16.b }[6], [x26]\n" + "b 76f\n" + "73:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 76f\n" "st1 { v8.b }[4], [x9]\n" - "st1 { v12.b }[4], [x26]\n" - "st1 { v16.b }[4], [x25]\n" - "b 79f\n" - "77:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x11, #1, 78f\n" + "st1 { v12.b }[4], [x27]\n" + "st1 { v16.b }[4], [x26]\n" + "b 76f\n" + "74:" // 
Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 75f\n" "str h8, [x9], #0x2\n" - "str h12, [x26], #0x2\n" - "str h16, [x25], #0x2\n" - "tbz x11, #0, 79f\n" + "str h12, [x27], #0x2\n" + "str h16, [x26], #0x2\n" + "tbz x11, #0, 76f\n" "st1 { v8.b }[2], [x9]\n" - "st1 { v12.b }[2], [x26]\n" - "st1 { v16.b }[2], [x25]\n" - "b 79f\n" - "78:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v12.b }[2], [x27]\n" + "st1 { v16.b }[2], [x26]\n" + "b 76f\n" + "75:" // Height 3: Partial direct writeback: partial_1_0 "str b8, [x9, #0x0]\n" - "str b12, [x26, #0x0]\n" - "str b16, [x25, #0x0]\n" - "79:" // Height 3: Partial direct writeback: Done - "b 81f\n" - "80:" // Height 3: Full writeback + "str b12, [x27, #0x0]\n" + "str b16, [x26, #0x0]\n" + "76:" // Height 3: Partial direct writeback: Done + "b 78f\n" + "77:" // Height 3: Full writeback "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q12, [x26, #0x0]\n" - "str q16, [x25, #0x0]\n" - "81:" // Height 3: Writeback done + "str q12, [x27, #0x0]\n" + "str q16, [x26, #0x0]\n" + "78:" // Height 3: Writeback done "subs x11, x11, #0x10\n" - "bgt 56b\n" - "b 164f\n" - "82:" // Height 4 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 54b\n" + "b 158f\n" + "79:" // Height 4 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "83:" // Height 4: Column loop + "80:" // Height 4: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1313,34 +1228,33 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "84:" // Height 4: setup done "mov x28, #0x0\n" - "85:" // Height 4: String loop + "82:" // Height 4: String loop 
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 86f\n" + "tbz %x[flags], #3, 83f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" - "cbnz x28, 87f\n" + "cbnz x28, 84f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 87f\n" - "86:" // Height 4: setup direct input + "b 84f\n" + "83:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" - "87:" // Height 4: input setup done + "84:" // Height 4: input setup done "cmp x27, #0x10\n" - "blt 90f\n" + "blt 87f\n" "ldr q0, [x26, #0x0]\n" "ldr q1, [x25, #0x0]\n" "cmp x27, #0x20\n" @@ -1348,15 +1262,15 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr q3, [x23, #0x0]\n" "ldr q6, [x10, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "blt 89f\n" - "88:" // Height 4: Multiply loop: Main loop head + "blt 86f\n" + "85:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q25, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1364,96 +1278,96 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x23, x23, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q24, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "cmp x27, #0x20\n" - ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e32e 
// sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" - "ldr q25, [x10, #0x40]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" - ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" - ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" - ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" - ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" - "ldr q25, [x10, #0x80]\n" - ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" - ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" - ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" - "ldr q24, [x10, #0x90]\n" - ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, 
v0.4b[2]\n" - ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" - ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" - "ldr q25, [x10, #0xa0]\n" - ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" - ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" - "ldr q24, [x10, #0xb0]\n" - ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" - ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" - "ldr q25, [x10, #0xc0]\n" - ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" - ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" - "ldr q24, [x10, #0xd0]\n" - ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" - "ldr q25, [x10, #0xe0]\n" - ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" - ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" - ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" - "ldr q24, [x10, #0xf0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 
0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + 
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "bge 88b\n" - "89:" // Height 4: Multiply loop: Single iteration only + "bge 85b\n" + "86:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q25, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1461,200 +1375,200 @@ void a64_hybrid_s8qs_dot_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, 
v3.4b[0]\n" - "ldr q24, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" - "ldr q25, [x10, #0x40]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" - ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" - ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" - ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" - ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" - ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" - "ldr q25, [x10, #0x80]\n" - ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" - ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" - ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" - "ldr q24, [x10, #0x90]\n" - ".inst 0x4f80eb28 // 
sdot v8.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" - ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" - "ldr q25, [x10, #0xa0]\n" - ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" - ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" - "ldr q24, [x10, #0xb0]\n" - ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" - ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" - ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" - "ldr q25, [x10, #0xc0]\n" - ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" - ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" - "ldr q24, [x10, #0xd0]\n" - ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" - "ldr q25, [x10, #0xe0]\n" - ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" - ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" - ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" - "ldr q24, [x10, #0xf0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x10, 
#0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, 
[x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" - ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" - ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" - ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" - ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" - "90:" // Height 4: Multiply loop: Main loop skip - "cbz x27, 95f\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "87:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 92f\n" "cmp x27, #0x4\n" - "blt 92f\n" - "91:" // Height 4: Multiply loop: Odd block loop - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" + "blt 89f\n" + "88:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" "cmp x27, #0x4\n" - "ldr q25, [x10, #0x0]\n" - "ldr q24, [x10, #0x10]\n" - ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n" - ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n" - ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n" - ".inst 0x4f9ae334 // sdot v20.4s, 
v25.16b, v26.4b[0]\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n" - ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n" - ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n" - ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n" - "ldr q24, [x10, #0x30]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n" - ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n" - ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n" - ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n" - ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n" - ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n" - ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n" - ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n" - "bge 91b\n" - "92:" // Height 4: Multiply loop: Skip odd blocks - "cbz x27, 95f\n" - "tbz x27, #1, 93f\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 88b\n" + "89:" // Height 4: Multiply loop: Skip odd blocks + "cbz x27, 92f\n" + "tbz 
x27, #1, 90f\n" "ldr h0, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "ldr h2, [x24], #0x2\n" "ldr h3, [x23], #0x2\n" - "tbz x27, #0, 94f\n" + "tbz x27, #0, 91f\n" "ld1 { v0.b }[2], [x26]\n" "ld1 { v1.b }[2], [x25]\n" "ld1 { v2.b }[2], [x24]\n" "ld1 { v3.b }[2], [x23]\n" - "b 94f\n" - "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 91f\n" + "90:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" "ldr b3, [x23, #0x0]\n" - "94:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q25, [x10, #0x0]\n" - "ldr q24, [x10, #0x10]\n" - ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n" - "ldr q24, [x10, #0x30]\n" + "91:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" - ".inst 0x4f83e336 // sdot v22.4s, 
v25.16b, v3.4b[0]\n" - ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" - ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" - "95:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "92:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 85b\n" - "ldr q27, [x14, #0x0]\n" - "ldr q26, [x14, #0x10]\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add v8.4s, v8.4s, v27.4s\n" - "add v9.4s, v9.4s, v26.4s\n" - "add v12.4s, v12.4s, v27.4s\n" - "add v13.4s, v13.4s, v26.4s\n" - "add v10.4s, v10.4s, v25.4s\n" - "add v11.4s, v11.4s, v24.4s\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add v14.4s, v14.4s, v25.4s\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add v15.4s, v15.4s, v24.4s\n" - "add v16.4s, v16.4s, v27.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v25.4s\n" - "add v19.4s, v19.4s, v24.4s\n" - "add v20.4s, v20.4s, v27.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v25.4s\n" - "add v23.4s, v23.4s, v24.4s\n" - "tbz %x[flags], #4, 96f\n" + "bne 82b\n" "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" "ldr q2, [x12, 
#0x20]\n" - "ldr q6, [x13, #0x20]\n" "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" "add x12, x12, #0x40\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add v14.4s, v14.4s, v2.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "tbz %x[flags], #4, 93f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 97f\n" - "96:" // Height 4: per layer parameters + "add x14, x14, #0x40\n" + "b 94f\n" + "93:" // Height 4: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -1665,86 +1579,36 @@ void a64_hybrid_s8qs_dot_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "97:" // Height 4: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" 
- "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v5.4s\n" - "sqrdmulh v22.4s, v22.4s, v6.4s\n" - "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "tbz %x[flags], #5, 98f\n" - "and v27.16b, v8.16b, v0.16b\n" - "and v26.16b, v9.16b, v1.16b\n" - "and v25.16b, v10.16b, v2.16b\n" - "and v24.16b, v11.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "and v27.16b, v12.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v26.4s\n" - "and v26.16b, v13.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v25.4s\n" - "sqadd v11.4s, v11.4s, v24.4s\n" - "and v25.16b, v14.16b, v2.16b\n" - "and v24.16b, v15.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v27.4s\n" - "and v27.16b, v16.16b, v0.16b\n" - "sqadd v13.4s, v13.4s, v26.4s\n" - "and v26.16b, v17.16b, v1.16b\n" - "sqadd v14.4s, v14.4s, v25.4s\n" - "sqadd v15.4s, v15.4s, v24.4s\n" - "and v25.16b, v18.16b, v2.16b\n" - "and v24.16b, v19.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v27.4s\n" - "and v27.16b, v20.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "and v26.16b, v21.16b, v1.16b\n" - "sqadd v18.4s, v18.4s, v25.4s\n" - "sqadd v19.4s, v19.4s, v24.4s\n" - "and v25.16b, v22.16b, v2.16b\n" - "and v24.16b, v23.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "98:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "94:" // Height 4: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, 
v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v21.4s, v21.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v22.4s, v22.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v23.4s, v23.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" @@ -1755,175 +1619,175 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v21.4s, v21.4s, v1.4s\n" "srshl v22.4s, v22.4s, v2.4s\n" "srshl v23.4s, v23.4s, v3.4s\n" - "add v8.4s, v8.4s, v26.4s\n" - "add v9.4s, v9.4s, v26.4s\n" - "add v10.4s, v10.4s, v26.4s\n" - "add v11.4s, v11.4s, v26.4s\n" - "add v12.4s, v12.4s, v26.4s\n" - "add v13.4s, v13.4s, v26.4s\n" - "add v14.4s, v14.4s, v26.4s\n" - "add v15.4s, v15.4s, v26.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v8.4s, v8.4s, v25.4s\n" - "smin v9.4s, v9.4s, v25.4s\n" - "smin v10.4s, v10.4s, v25.4s\n" - "smin v11.4s, v11.4s, v25.4s\n" - "smin 
v12.4s, v12.4s, v25.4s\n" - "smin v13.4s, v13.4s, v25.4s\n" - "smin v14.4s, v14.4s, v25.4s\n" - "smin v15.4s, v15.4s, v25.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v8.4s, v8.4s, v24.4s\n" - "smax v9.4s, v9.4s, v24.4s\n" - "smax v10.4s, v10.4s, v24.4s\n" - "smax v11.4s, v11.4s, v24.4s\n" - "smax v12.4s, v12.4s, v24.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smax v14.4s, v14.4s, v24.4s\n" - "smax v15.4s, v15.4s, v24.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v8.4s, v8.4s, 
v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v25.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v24.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v8.16b, v8.16b, v25.16b\n" - "uzp1 v12.16b, v12.16b, v24.16b\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 107f\n" - "tbz x11, #3, 102f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 103f\n" + "tbz x11, #3, 98f\n" "str d8, [x9], #0x8\n" - "str d12, [x26], #0x8\n" - "str d16, [x25], #0x8\n" - "str d20, [x24], #0x8\n" - "tbz x11, #2, 100f\n" + "str d12, [x27], #0x8\n" + "str d16, [x26], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x11, #2, 96f\n" "st1 { v8.s }[2], [x9], #0x4\n" - "st1 { v12.s }[2], [x26], #0x4\n" - "st1 { v16.s }[2], [x25], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "tbz x11, #1, 99f\n" + "st1 { v12.s }[2], [x27], #0x4\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x11, #1, 95f\n" "st1 { v8.h }[6], [x9], #0x2\n" - "st1 { v12.h }[6], [x26], #0x2\n" - "st1 { v16.h }[6], [x25], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x11, #0, 106f\n" + "st1 { v12.h }[6], [x27], #0x2\n" + "st1 
{ v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x11, #0, 102f\n" "st1 { v8.b }[14], [x9]\n" - "st1 { v12.b }[14], [x26]\n" - "st1 { v16.b }[14], [x25]\n" - "st1 { v20.b }[14], [x24]\n" - "b 106f\n" - "99:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x11, #0, 106f\n" + "st1 { v12.b }[14], [x27]\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x25]\n" + "b 102f\n" + "95:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 102f\n" "st1 { v8.b }[12], [x9]\n" - "st1 { v12.b }[12], [x26]\n" - "st1 { v16.b }[12], [x25]\n" - "st1 { v20.b }[12], [x24]\n" - "b 106f\n" - "100:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x11, #1, 101f\n" + "st1 { v12.b }[12], [x27]\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x25]\n" + "b 102f\n" + "96:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 97f\n" "st1 { v8.h }[4], [x9], #0x2\n" - "st1 { v12.h }[4], [x26], #0x2\n" - "st1 { v16.h }[4], [x25], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x11, #0, 106f\n" + "st1 { v12.h }[4], [x27], #0x2\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x11, #0, 102f\n" "st1 { v8.b }[10], [x9]\n" - "st1 { v12.b }[10], [x26]\n" - "st1 { v16.b }[10], [x25]\n" - "st1 { v20.b }[10], [x24]\n" - "b 106f\n" - "101:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x11, #0, 106f\n" + "st1 { v12.b }[10], [x27]\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x25]\n" + "b 102f\n" + "97:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 102f\n" "st1 { v8.b }[8], [x9]\n" - "st1 { v12.b }[8], [x26]\n" - "st1 { v16.b }[8], [x25]\n" - "st1 { v20.b }[8], [x24]\n" - "b 106f\n" - "102:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x11, #2, 104f\n" + "st1 { v12.b }[8], [x27]\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x25]\n" + "b 102f\n" + "98:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 
100f\n" "str s8, [x9], #0x4\n" - "str s12, [x26], #0x4\n" - "str s16, [x25], #0x4\n" - "str s20, [x24], #0x4\n" - "tbz x11, #1, 103f\n" + "str s12, [x27], #0x4\n" + "str s16, [x26], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x11, #1, 99f\n" "st1 { v8.h }[2], [x9], #0x2\n" - "st1 { v12.h }[2], [x26], #0x2\n" - "st1 { v16.h }[2], [x25], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x11, #0, 106f\n" + "st1 { v12.h }[2], [x27], #0x2\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x11, #0, 102f\n" "st1 { v8.b }[6], [x9]\n" - "st1 { v12.b }[6], [x26]\n" - "st1 { v16.b }[6], [x25]\n" - "st1 { v20.b }[6], [x24]\n" - "b 106f\n" - "103:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x11, #0, 106f\n" + "st1 { v12.b }[6], [x27]\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x25]\n" + "b 102f\n" + "99:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 102f\n" "st1 { v8.b }[4], [x9]\n" - "st1 { v12.b }[4], [x26]\n" - "st1 { v16.b }[4], [x25]\n" - "st1 { v20.b }[4], [x24]\n" - "b 106f\n" - "104:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x11, #1, 105f\n" + "st1 { v12.b }[4], [x27]\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x25]\n" + "b 102f\n" + "100:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 101f\n" "str h8, [x9], #0x2\n" - "str h12, [x26], #0x2\n" - "str h16, [x25], #0x2\n" - "str h20, [x24], #0x2\n" - "tbz x11, #0, 106f\n" + "str h12, [x27], #0x2\n" + "str h16, [x26], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x11, #0, 102f\n" "st1 { v8.b }[2], [x9]\n" - "st1 { v12.b }[2], [x26]\n" - "st1 { v16.b }[2], [x25]\n" - "st1 { v20.b }[2], [x24]\n" - "b 106f\n" - "105:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v12.b }[2], [x27]\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x25]\n" + "b 102f\n" + "101:" // Height 4: Partial direct writeback: partial_1_0 "str b8, [x9, #0x0]\n" - "str b12, [x26, #0x0]\n" - "str b16, [x25, #0x0]\n" - 
"str b20, [x24, #0x0]\n" - "106:" // Height 4: Partial direct writeback: Done - "b 108f\n" - "107:" // Height 4: Full writeback + "str b12, [x27, #0x0]\n" + "str b16, [x26, #0x0]\n" + "str b20, [x25, #0x0]\n" + "102:" // Height 4: Partial direct writeback: Done + "b 104f\n" + "103:" // Height 4: Full writeback "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q12, [x26, #0x0]\n" - "str q16, [x25, #0x0]\n" - "str q20, [x24, #0x0]\n" - "108:" // Height 4: Writeback done + "str q12, [x27, #0x0]\n" + "str q16, [x26, #0x0]\n" + "str q20, [x25, #0x0]\n" + "104:" // Height 4: Writeback done "subs x11, x11, #0x10\n" - "bgt 83b\n" - "b 164f\n" - "109:" // Height 5 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 80b\n" + "b 158f\n" + "105:" // Height 5 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "110:" // Height 5: Column loop + "106:" // Height 5: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1944,13 +1808,12 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "111:" // Height 5: setup done "mov x28, #0x0\n" - "112:" // Height 5: String loop + "108:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 113f\n" + "tbz %x[flags], #3, 109f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -1958,23 +1821,23 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" - "cbnz x28, 114f\n" + "cbnz x28, 110f\n" "ldr x20, 
[%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 114f\n" - "113:" // Height 5: setup direct input + "b 110f\n" + "109:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "114:" // Height 5: input setup done + "110:" // Height 5: input setup done "cmp x27, #0x10\n" - "blt 117f\n" + "blt 113f\n" "ldr q0, [x26, #0x0]\n" "ldr q1, [x25, #0x0]\n" "cmp x27, #0x20\n" @@ -1983,8 +1846,8 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr q4, [x22, #0x0]\n" "ldr q6, [x10, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "blt 116f\n" - "115:" // Height 5: Multiply loop: Main loop head + "blt 112f\n" + "111:" // Height 5: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "sub x27, x27, #0x10\n" @@ -1994,7 +1857,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q29, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -2003,104 +1866,104 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x27, #0x20\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q28, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 
0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" - "ldr q29, [x10, #0x40]\n" - ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" - ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" - "ldr q28, [x10, #0x50]\n" - ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" - ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" - "ldr q29, [x10, #0x60]\n" - ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" - ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" - ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" - ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" - ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" - "ldr q28, [x10, #0x70]\n" - ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" - ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" - "ldr q29, [x10, #0x80]\n" - ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" - ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" - ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" - ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" - ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" - "ldr q28, [x10, #0x90]\n" - ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" - ".inst 
0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" - ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" - "ldr q29, [x10, #0xa0]\n" - ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" - ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" - ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" - "ldr q28, [x10, #0xb0]\n" - ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" - ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" - ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" - "ldr q29, [x10, #0xc0]\n" - ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" - ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" - ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" - "ldr q28, [x10, #0xd0]\n" - ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" - ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" - "ldr q29, [x10, #0xe0]\n" - ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" - ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" - ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" - "ldr q28, [x10, #0xf0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, 
v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 
0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" - ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa1e8ef // sdot 
v15.4s, v7.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" - ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "bge 115b\n" - "116:" // Height 5: Multiply loop: Single iteration only + "bge 111b\n" + "112:" // Height 5: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "add x26, x26, #0x10\n" @@ -2110,7 +1973,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q29, [x10, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -2119,232 +1982,232 @@ void a64_hybrid_s8qs_dot_6x16 ( "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q28, [x10, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "sub x27, x27, #0x10\n" - ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" - 
"ldr q29, [x10, #0x40]\n" - ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" - ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" - "ldr q28, [x10, #0x50]\n" - ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" - ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" - "ldr q29, [x10, #0x60]\n" - ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" - ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" - ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" - ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" - ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" - "ldr q28, [x10, #0x70]\n" - ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" - ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" - ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" - "ldr q29, [x10, #0x80]\n" - ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" - ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" - ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" - ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" - ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" - "ldr q28, [x10, #0x90]\n" - ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" - ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" - ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" - "ldr q29, [x10, #0xa0]\n" - ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8d // sdot v13.4s, 
v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" - ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" - ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" - "ldr q28, [x10, #0xb0]\n" - ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" - ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" - ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" - "ldr q29, [x10, #0xc0]\n" - ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" - ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" - ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" - "ldr q28, [x10, #0xd0]\n" - ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" - ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" - "ldr q29, [x10, #0xe0]\n" - ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" - ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" - ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" - "ldr q28, [x10, #0xf0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot 
v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x10, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x10, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x10, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x10, 
#0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x10, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" - ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" - ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" - ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" - ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" - "117:" // Height 5: Multiply loop: Main loop skip - "cbz x27, 122f\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 
0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "113:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 118f\n" "cmp x27, #0x4\n" - "blt 119f\n" - "118:" // Height 5: Multiply loop: Odd block loop - "ldr s2, [x26], #0x4\n" + "blt 115f\n" + "114:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" - "ldr s0, [x24], #0x4\n" - "ldr s31, [x23], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" "cmp x27, #0x4\n" - "ldr s30, [x22], #0x4\n" - "ldr q29, [x10, #0x0]\n" - "ldr q28, [x10, #0x10]\n" - ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n" - ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n" - "ldr q29, [x10, #0x20]\n" - ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n" - ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n" - "ldr q28, [x10, #0x30]\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x10, 
#0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n" - ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n" - ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n" - ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n" - "bge 118b\n" - "119:" // Height 5: Multiply loop: Skip odd blocks - "cbz x27, 122f\n" - "tbz x27, #1, 120f\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 114b\n" + "115:" // Height 5: Multiply loop: Skip odd blocks + "cbz x27, 118f\n" + "tbz x27, #1, 116f\n" "ldr h0, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "ldr h2, [x24], #0x2\n" "ldr h3, [x23], #0x2\n" "ldr h4, [x22], #0x2\n" - "tbz x27, #0, 121f\n" + "tbz x27, #0, 117f\n" "ld1 { v0.b }[2], [x26]\n" "ld1 { v1.b }[2], [x25]\n" "ld1 { v2.b }[2], [x24]\n" "ld1 { v3.b }[2], [x23]\n" "ld1 { v4.b }[2], [x22]\n" - "b 121f\n" - "120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "b 117f\n" + "116:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" "ldr b3, [x23, #0x0]\n" "ldr b4, [x22, #0x0]\n" - "121:" // Height 
5: Multiply loop: Ragged operand read: Done - "ldr q29, [x10, #0x0]\n" - "ldr q28, [x10, #0x10]\n" - ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n" - ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n" - "ldr q29, [x10, #0x20]\n" - ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n" - ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n" - "ldr q28, [x10, #0x30]\n" + "117:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" - ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" - ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" - ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" - ".inst 
0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" - "122:" // Height 5: Multiply loop: No odd multiplies + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "118:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 112b\n" - "ldr q31, [x14, #0x0]\n" - "ldr q30, [x14, #0x10]\n" - "ldr q29, [x14, #0x20]\n" - "ldr q28, [x14, #0x30]\n" - "add x14, x14, #0x40\n" + "bne 108b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "add v8.4s, v8.4s, v31.4s\n" - "add v9.4s, v9.4s, v30.4s\n" - "add v12.4s, v12.4s, v31.4s\n" - "add v13.4s, v13.4s, v30.4s\n" - "add v10.4s, v10.4s, v29.4s\n" - "add v11.4s, v11.4s, v28.4s\n" - "add x26, x9, x20\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" "add x25, x26, x20\n" - "add x24, x25, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" "prfm pstl1keep, [x26, #0x0]\n" + "add v14.4s, v14.4s, v2.4s\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x25, #0x0]\n" - "add v14.4s, v14.4s, v29.4s\n" - "add x23, x24, x20\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" "prfm pstl1keep, [x24, 
#0x0]\n" - "add v15.4s, v15.4s, v28.4s\n" - "add v16.4s, v16.4s, v31.4s\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" - "add v20.4s, v20.4s, v31.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v24.4s, v24.4s, v31.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "tbz %x[flags], #4, 123f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 119f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 124f\n" - "123:" // Height 5: per layer parameters + "add x14, x14, #0x40\n" + "b 120f\n" + "119:" // Height 5: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -2355,102 +2218,40 @@ void a64_hybrid_s8qs_dot_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "124:" // Height 5: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - 
"sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v5.4s\n" - "sqrdmulh v22.4s, v22.4s, v6.4s\n" - "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v5.4s\n" - "sqrdmulh v26.4s, v26.4s, v6.4s\n" - "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "tbz %x[flags], #5, 125f\n" - "and v31.16b, v8.16b, v0.16b\n" - "and v30.16b, v9.16b, v1.16b\n" - "and v29.16b, v10.16b, v2.16b\n" - "and v28.16b, v11.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v31.4s\n" - "and v31.16b, v12.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v30.4s\n" - "and v30.16b, v13.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v29.4s\n" - "sqadd v11.4s, v11.4s, v28.4s\n" - "and v29.16b, v14.16b, v2.16b\n" - "and v28.16b, v15.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v31.4s\n" - "and v31.16b, v16.16b, v0.16b\n" - "sqadd v13.4s, v13.4s, v30.4s\n" - "and v30.16b, v17.16b, v1.16b\n" - "sqadd v14.4s, v14.4s, v29.4s\n" - "sqadd v15.4s, v15.4s, v28.4s\n" - "and v29.16b, v18.16b, v2.16b\n" - "and v28.16b, v19.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v31.4s\n" - "and v31.16b, v20.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "and v30.16b, v21.16b, v1.16b\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "and v29.16b, v22.16b, v2.16b\n" - "and v28.16b, v23.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, 
v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v31.4s\n" - "and v31.16b, v24.16b, v0.16b\n" - "sqadd v21.4s, v21.4s, v30.4s\n" - "and v30.16b, v25.16b, v1.16b\n" - "sqadd v22.4s, v22.4s, v29.4s\n" - "sqadd v23.4s, v23.4s, v28.4s\n" - "and v29.16b, v26.16b, v2.16b\n" - "and v28.16b, v27.16b, v3.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "125:" // Height 5: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "120:" // Height 5: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v5.4s\n" + "sqdmulh v22.4s, v22.4s, v6.4s\n" + "sqdmulh v23.4s, v23.4s, v7.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v25.4s, v25.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v26.4s, v26.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v27.4s, v27.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x21]\n" - "ld1r { v29.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v28.4s }, 
[x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" @@ -2465,210 +2266,210 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v25.4s, v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add v8.4s, v8.4s, v30.4s\n" - "add v9.4s, v9.4s, v30.4s\n" - "add v10.4s, v10.4s, v30.4s\n" - "add v11.4s, v11.4s, v30.4s\n" - "add v12.4s, v12.4s, v30.4s\n" - "add v13.4s, v13.4s, v30.4s\n" - "add v14.4s, v14.4s, v30.4s\n" - "add v15.4s, v15.4s, v30.4s\n" - "add v16.4s, v16.4s, v30.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v30.4s\n" - "smin v8.4s, v8.4s, v29.4s\n" - "smin v9.4s, v9.4s, v29.4s\n" - "smin v10.4s, v10.4s, v29.4s\n" - "smin v11.4s, v11.4s, v29.4s\n" - "smin v12.4s, v12.4s, v29.4s\n" - "smin v13.4s, v13.4s, v29.4s\n" - "smin v14.4s, v14.4s, v29.4s\n" - "smin v15.4s, v15.4s, v29.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v8.4s, v8.4s, v28.4s\n" - "smax v9.4s, v9.4s, v28.4s\n" - "smax v10.4s, v10.4s, v28.4s\n" - "smax v11.4s, v11.4s, v28.4s\n" - "smax v12.4s, v12.4s, v28.4s\n" - "smax v13.4s, v13.4s, v28.4s\n" - "smax v14.4s, v14.4s, v28.4s\n" - "smax v15.4s, v15.4s, v28.4s\n" - "smax v16.4s, v16.4s, v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - 
"smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax 
v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v29.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v28.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v8.16b, v8.16b, v29.16b\n" - "uzp1 v12.16b, v12.16b, v28.16b\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 134f\n" - "tbz x11, #3, 129f\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 129f\n" + "tbz x11, #3, 124f\n" "str d8, [x9], #0x8\n" - "str d12, [x26], #0x8\n" - "str d16, [x25], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x11, #2, 127f\n" + "str d12, [x27], #0x8\n" + "str d16, [x26], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x24], #0x8\n" + "tbz x11, #2, 122f\n" "st1 { v8.s }[2], [x9], #0x4\n" - "st1 { v12.s }[2], [x26], #0x4\n" - "st1 { v16.s }[2], [x25], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x11, #1, 126f\n" + "st1 { v12.s }[2], [x27], #0x4\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x24], #0x4\n" + "tbz x11, #1, 121f\n" "st1 { v8.h }[6], [x9], #0x2\n" - "st1 { v12.h }[6], [x26], #0x2\n" - "st1 { v16.h }[6], [x25], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz 
x11, #0, 133f\n" + "st1 { v12.h }[6], [x27], #0x2\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x24], #0x2\n" + "tbz x11, #0, 128f\n" "st1 { v8.b }[14], [x9]\n" - "st1 { v12.b }[14], [x26]\n" - "st1 { v16.b }[14], [x25]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 133f\n" - "126:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x11, #0, 133f\n" + "st1 { v12.b }[14], [x27]\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x24]\n" + "b 128f\n" + "121:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 128f\n" "st1 { v8.b }[12], [x9]\n" - "st1 { v12.b }[12], [x26]\n" - "st1 { v16.b }[12], [x25]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 133f\n" - "127:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x11, #1, 128f\n" + "st1 { v12.b }[12], [x27]\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x24]\n" + "b 128f\n" + "122:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 123f\n" "st1 { v8.h }[4], [x9], #0x2\n" - "st1 { v12.h }[4], [x26], #0x2\n" - "st1 { v16.h }[4], [x25], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x11, #0, 133f\n" + "st1 { v12.h }[4], [x27], #0x2\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x24], #0x2\n" + "tbz x11, #0, 128f\n" "st1 { v8.b }[10], [x9]\n" - "st1 { v12.b }[10], [x26]\n" - "st1 { v16.b }[10], [x25]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 133f\n" - "128:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x11, #0, 133f\n" + "st1 { v12.b }[10], [x27]\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x24]\n" + "b 128f\n" + "123:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 128f\n" "st1 { v8.b }[8], [x9]\n" - "st1 { 
v12.b }[8], [x26]\n" - "st1 { v16.b }[8], [x25]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 133f\n" - "129:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x11, #2, 131f\n" + "st1 { v12.b }[8], [x27]\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x24]\n" + "b 128f\n" + "124:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 126f\n" "str s8, [x9], #0x4\n" - "str s12, [x26], #0x4\n" - "str s16, [x25], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x11, #1, 130f\n" + "str s12, [x27], #0x4\n" + "str s16, [x26], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x24], #0x4\n" + "tbz x11, #1, 125f\n" "st1 { v8.h }[2], [x9], #0x2\n" - "st1 { v12.h }[2], [x26], #0x2\n" - "st1 { v16.h }[2], [x25], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x11, #0, 133f\n" + "st1 { v12.h }[2], [x27], #0x2\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x24], #0x2\n" + "tbz x11, #0, 128f\n" "st1 { v8.b }[6], [x9]\n" - "st1 { v12.b }[6], [x26]\n" - "st1 { v16.b }[6], [x25]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 133f\n" - "130:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x11, #0, 133f\n" + "st1 { v12.b }[6], [x27]\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x24]\n" + "b 128f\n" + "125:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 128f\n" "st1 { v8.b }[4], [x9]\n" - "st1 { v12.b }[4], [x26]\n" - "st1 { v16.b }[4], [x25]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 133f\n" - "131:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x11, #1, 132f\n" + "st1 { v12.b }[4], [x27]\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x24]\n" + "b 128f\n" + "126:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 127f\n" 
"str h8, [x9], #0x2\n" - "str h12, [x26], #0x2\n" - "str h16, [x25], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x11, #0, 133f\n" + "str h12, [x27], #0x2\n" + "str h16, [x26], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x24], #0x2\n" + "tbz x11, #0, 128f\n" "st1 { v8.b }[2], [x9]\n" - "st1 { v12.b }[2], [x26]\n" - "st1 { v16.b }[2], [x25]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 133f\n" - "132:" // Height 5: Partial direct writeback: partial_1_0 + "st1 { v12.b }[2], [x27]\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x24]\n" + "b 128f\n" + "127:" // Height 5: Partial direct writeback: partial_1_0 "str b8, [x9, #0x0]\n" - "str b12, [x26, #0x0]\n" - "str b16, [x25, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "133:" // Height 5: Partial direct writeback: Done - "b 135f\n" - "134:" // Height 5: Full writeback + "str b12, [x27, #0x0]\n" + "str b16, [x26, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x24, #0x0]\n" + "128:" // Height 5: Partial direct writeback: Done + "b 130f\n" + "129:" // Height 5: Full writeback "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q12, [x26, #0x0]\n" - "str q16, [x25, #0x0]\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "135:" // Height 5: Writeback done + "str q12, [x27, #0x0]\n" + "str q16, [x26, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x24, #0x0]\n" + "130:" // Height 5: Writeback done "subs x11, x11, #0x10\n" - "bgt 110b\n" - "b 164f\n" - "136:" // Height 6 + "bgt 106b\n" + "b 158f\n" + "131:" // Height 6 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x6\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], 
%[offsetof_shift_ptr]]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "madd x20, x21, x20, x9\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "137:" // Height 6: Column loop + "132:" // Height 6: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2693,13 +2494,12 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "138:" // Height 6: setup done "mov x28, #0x0\n" - "139:" // Height 6: String loop + "134:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 140f\n" + "tbz %x[flags], #3, 135f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -2708,7 +2508,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" "ldr x21, [x20, #0x28]\n" - "cbnz x28, 141f\n" + "cbnz x28, 136f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" @@ -2716,17 +2516,17 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 141f\n" - "140:" // Height 6: setup direct input + "b 136f\n" + "135:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "141:" // Height 6: input setup done + "136:" // Height 6: input setup done "cmp x27, #0x10\n" - "blt 144f\n" + "blt 139f\n" "ldr q0, [x26, #0x0]\n" "ldr q1, [x25, #0x0]\n" "cmp x27, #0x20\n" @@ -2736,8 +2536,8 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr q5, [x21, #0x0]\n" "ldr q6, [x10, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "blt 143f\n" - "142:" // Height 6: Multiply loop: Main loop head + "blt 138f\n" + "137:" // Height 6: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot 
v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "sub x27, x27, #0x10\n" @@ -2871,8 +2671,8 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" "ldr q5, [x21, #0x0]\n" "ldr q7, [x10, #0x10]\n" - "bge 142b\n" - "143:" // Height 6: Multiply loop: Single iteration only + "bge 137b\n" + "138:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" "add x26, x26, #0x10\n" @@ -2997,162 +2797,162 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "144:" // Height 6: Multiply loop: Main loop skip - "cbz x27, 149f\n" + "139:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 144f\n" "cmp x27, #0x4\n" - "blt 146f\n" - "145:" // Height 6: Multiply loop: Odd block loop - "ldr s7, [x26], #0x4\n" - "ldr s6, [x25], #0x4\n" + "blt 141f\n" + "140:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" - "ldr s5, [x24], #0x4\n" - "ldr s4, [x23], #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" "cmp x27, #0x4\n" - "ldr s3, [x22], #0x4\n" - "ldr s2, [x21], #0x4\n" - "ldr q1, [x10, #0x0]\n" - "ldr q0, [x10, #0x10]\n" - ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n" - ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n" - ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n" - ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n" - ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n" - ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n" - "ldr q1, [x10, #0x20]\n" - ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n" - ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n" - ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n" - ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n" - 
".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n" - ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n" - "ldr q0, [x10, #0x30]\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x21], #0x4\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n" - ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n" - ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n" - ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n" - ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n" - ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n" - ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n" - ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n" - ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n" - ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n" - ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n" - ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n" - "bge 145b\n" - "146:" // Height 6: Multiply loop: Skip odd blocks - "cbz x27, 149f\n" - "tbz x27, #1, 147f\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + 
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 140b\n" + "141:" // Height 6: Multiply loop: Skip odd blocks + "cbz x27, 144f\n" + "tbz x27, #1, 142f\n" "ldr h0, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "ldr h2, [x24], #0x2\n" "ldr h3, [x23], #0x2\n" "ldr h4, [x22], #0x2\n" "ldr h5, [x21], #0x2\n" - "tbz x27, #0, 148f\n" + "tbz x27, #0, 143f\n" "ld1 { v0.b }[2], [x26]\n" "ld1 { v1.b }[2], [x25]\n" "ld1 { v2.b }[2], [x24]\n" "ld1 { v3.b }[2], [x23]\n" "ld1 { v4.b }[2], [x22]\n" "ld1 { v5.b }[2], [x21]\n" - "b 148f\n" - "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "b 143f\n" + "142:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" "ldr b3, [x23, #0x0]\n" "ldr b4, [x22, #0x0]\n" "ldr b5, [x21, #0x0]\n" - "148:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dd 
// sdot v29.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x10, #0x30]\n" + "143:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n" - ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n" - "149:" // Height 6: Multiply loop: No odd multiplies + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 
0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "144:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 139b\n" - "ldr q3, [x14, #0x0]\n" - "ldr q2, [x14, #0x10]\n" - "ldr q1, [x14, #0x20]\n" - "ldr q0, [x14, #0x30]\n" - "add x14, x14, #0x40\n" + "bne 134b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "add v8.4s, v8.4s, v3.4s\n" - "add v9.4s, v9.4s, v2.4s\n" - "add v12.4s, v12.4s, v3.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v10.4s, v10.4s, v1.4s\n" - "add v11.4s, v11.4s, v0.4s\n" - "add x26, x9, x20\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" "add x25, x26, x20\n" - "add x24, x25, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" "prfm pstl1keep, [x26, #0x0]\n" + "add v14.4s, v14.4s, v2.4s\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x25, #0x0]\n" - "add v14.4s, v14.4s, v1.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" "add x23, x24, x20\n" "prfm pstl1keep, [x24, #0x0]\n" - "add v15.4s, v15.4s, v0.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add x22, x23, x20\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" "prfm pstl1keep, [x23, #0x0]\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v1.4s\n" - "prfm pstl1keep, [x22, #0x0]\n" - "add v19.4s, v19.4s, v0.4s\n" - "add v20.4s, v20.4s, v3.4s\n" 
- "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v1.4s\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v1.4s\n" - "add v27.4s, v27.4s, v0.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v1.4s\n" - "add v31.4s, v31.4s, v0.4s\n" - "tbz %x[flags], #4, 150f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "tbz %x[flags], #4, 145f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 151f\n" - "150:" // Height 6: per layer parameters + "add x14, x14, #0x40\n" + "b 146f\n" + "145:" // Height 6: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -3163,118 +2963,44 @@ void a64_hybrid_s8qs_dot_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "151:" // Height 6: parameters loaded - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v12.4s, v12.4s, v4.4s\n" - "sqrdmulh v13.4s, v13.4s, v5.4s\n" - "sqrdmulh v14.4s, v14.4s, v6.4s\n" - "sqrdmulh v15.4s, 
v15.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v5.4s\n" - "sqrdmulh v22.4s, v22.4s, v6.4s\n" - "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v5.4s\n" - "sqrdmulh v26.4s, v26.4s, v6.4s\n" - "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v5.4s\n" - "sqrdmulh v30.4s, v30.4s, v6.4s\n" - "sqrdmulh v31.4s, v31.4s, v7.4s\n" - "tbz %x[flags], #5, 152f\n" - "and v7.16b, v8.16b, v0.16b\n" - "and v6.16b, v9.16b, v1.16b\n" - "and v5.16b, v10.16b, v2.16b\n" - "and v4.16b, v11.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v7.4s\n" - "and v7.16b, v12.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "and v6.16b, v13.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v5.4s\n" - "sqadd v11.4s, v11.4s, v4.4s\n" - "and v5.16b, v14.16b, v2.16b\n" - "and v4.16b, v15.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v7.4s\n" - "and v7.16b, v16.16b, v0.16b\n" - "sqadd v13.4s, v13.4s, v6.4s\n" - "and v6.16b, v17.16b, v1.16b\n" - "sqadd v14.4s, v14.4s, v5.4s\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "and v5.16b, v18.16b, v2.16b\n" - "and v4.16b, v19.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v7.4s\n" - "and v7.16b, v20.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v6.4s\n" - "and v6.16b, v21.16b, v1.16b\n" - "sqadd v18.4s, v18.4s, v5.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "and v5.16b, v22.16b, v2.16b\n" - "and v4.16b, v23.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr 
v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v7.4s\n" - "and v7.16b, v24.16b, v0.16b\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "and v6.16b, v25.16b, v1.16b\n" - "sqadd v22.4s, v22.4s, v5.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "and v5.16b, v26.16b, v2.16b\n" - "and v4.16b, v27.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v7.4s\n" - "and v7.16b, v28.16b, v0.16b\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "and v6.16b, v29.16b, v1.16b\n" - "sqadd v26.4s, v26.4s, v5.4s\n" - "sqadd v27.4s, v27.4s, v4.4s\n" - "and v5.16b, v30.16b, v2.16b\n" - "and v4.16b, v31.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v7.4s\n" - "sqadd v29.4s, v29.4s, v6.4s\n" - "sqadd v30.4s, v30.4s, v5.4s\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "152:" // Height 6: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "146:" // Height 6: parameters loaded + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v12.4s, v12.4s, v4.4s\n" + "sqdmulh v13.4s, v13.4s, v5.4s\n" + "sqdmulh v14.4s, v14.4s, v6.4s\n" + "sqdmulh v15.4s, v15.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v5.4s\n" + "sqdmulh v22.4s, v22.4s, v6.4s\n" + "sqdmulh v23.4s, v23.4s, v7.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v5.4s\n" + "sqdmulh v26.4s, v26.4s, v6.4s\n" + "sqdmulh v27.4s, v27.4s, v7.4s\n" + "sqdmulh v28.4s, v28.4s, 
v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v29.4s, v29.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v30.4s, v30.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v31.4s, v31.4s, v7.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x21]\n" - "ld1r { v5.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v4.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" @@ -3293,240 +3019,240 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v29.4s, v29.4s, v1.4s\n" "srshl v30.4s, v30.4s, v2.4s\n" "srshl v31.4s, v31.4s, v3.4s\n" - "add v8.4s, v8.4s, v6.4s\n" - "add v9.4s, v9.4s, v6.4s\n" - "add v10.4s, v10.4s, v6.4s\n" - "add v11.4s, v11.4s, v6.4s\n" - "add v12.4s, v12.4s, v6.4s\n" - "add v13.4s, v13.4s, v6.4s\n" - "add v14.4s, v14.4s, v6.4s\n" - "add v15.4s, v15.4s, v6.4s\n" - "add v16.4s, v16.4s, v6.4s\n" - "add v17.4s, v17.4s, v6.4s\n" - "add v18.4s, v18.4s, v6.4s\n" - "add v19.4s, v19.4s, v6.4s\n" - "add v20.4s, v20.4s, v6.4s\n" - "add v21.4s, v21.4s, v6.4s\n" - "add v22.4s, v22.4s, v6.4s\n" - "add v23.4s, v23.4s, v6.4s\n" - "add v24.4s, v24.4s, v6.4s\n" - "add v25.4s, v25.4s, v6.4s\n" - "add v26.4s, v26.4s, v6.4s\n" - "add v27.4s, v27.4s, v6.4s\n" - "add v28.4s, v28.4s, v6.4s\n" - "add v29.4s, v29.4s, v6.4s\n" - "add v30.4s, v30.4s, v6.4s\n" - "add v31.4s, v31.4s, v6.4s\n" - "smin v8.4s, v8.4s, v5.4s\n" - "smin v9.4s, v9.4s, v5.4s\n" - "smin v10.4s, v10.4s, v5.4s\n" - "smin v11.4s, v11.4s, v5.4s\n" - "smin v12.4s, v12.4s, v5.4s\n" - "smin v13.4s, v13.4s, v5.4s\n" - "smin v14.4s, v14.4s, v5.4s\n" - "smin v15.4s, v15.4s, v5.4s\n" - "smin v16.4s, v16.4s, v5.4s\n" - "smin v17.4s, v17.4s, v5.4s\n" - "smin v18.4s, v18.4s, v5.4s\n" - "smin v19.4s, v19.4s, v5.4s\n" - "smin v20.4s, v20.4s, v5.4s\n" 
- "smin v21.4s, v21.4s, v5.4s\n" - "smin v22.4s, v22.4s, v5.4s\n" - "smin v23.4s, v23.4s, v5.4s\n" - "smin v24.4s, v24.4s, v5.4s\n" - "smin v25.4s, v25.4s, v5.4s\n" - "smin v26.4s, v26.4s, v5.4s\n" - "smin v27.4s, v27.4s, v5.4s\n" - "smin v28.4s, v28.4s, v5.4s\n" - "smin v29.4s, v29.4s, v5.4s\n" - "smin v30.4s, v30.4s, v5.4s\n" - "smin v31.4s, v31.4s, v5.4s\n" - "smax v8.4s, v8.4s, v4.4s\n" - "smax v9.4s, v9.4s, v4.4s\n" - "smax v10.4s, v10.4s, v4.4s\n" - "smax v11.4s, v11.4s, v4.4s\n" - "smax v12.4s, v12.4s, v4.4s\n" - "smax v13.4s, v13.4s, v4.4s\n" - "smax v14.4s, v14.4s, v4.4s\n" - "smax v15.4s, v15.4s, v4.4s\n" - "smax v16.4s, v16.4s, v4.4s\n" - "smax v17.4s, v17.4s, v4.4s\n" - "smax v18.4s, v18.4s, v4.4s\n" - "smax v19.4s, v19.4s, v4.4s\n" - "smax v20.4s, v20.4s, v4.4s\n" - "smax v21.4s, v21.4s, v4.4s\n" - "smax v22.4s, v22.4s, v4.4s\n" - "smax v23.4s, v23.4s, v4.4s\n" - "smax v24.4s, v24.4s, v4.4s\n" - "smax v25.4s, v25.4s, v4.4s\n" - "smax v26.4s, v26.4s, v4.4s\n" - "smax v27.4s, v27.4s, v4.4s\n" - "smax v28.4s, v28.4s, v4.4s\n" - "smax v29.4s, v29.4s, v4.4s\n" - "smax v30.4s, v30.4s, v4.4s\n" - "smax v31.4s, v31.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, 
v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v2.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v1.8h, v14.8h, v15.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 
v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v8.16b, v8.16b, v2.16b\n" - "uzp1 v12.16b, v12.16b, v1.16b\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - "uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 161f\n" - "tbz x11, #3, 156f\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 155f\n" + "tbz x11, #3, 150f\n" "str d8, [x9], #0x8\n" - "str d12, [x26], #0x8\n" - "str d16, [x25], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x22], #0x8\n" - "tbz x11, #2, 154f\n" + "str d12, [x27], #0x8\n" + "str d16, [x26], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "tbz x11, #2, 148f\n" "st1 { v8.s }[2], [x9], #0x4\n" - "st1 { v12.s }[2], [x26], #0x4\n" - "st1 { v16.s }[2], [x25], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x11, #1, 153f\n" + "st1 { v12.s }[2], [x27], #0x4\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x24], #0x4\n" + "st1 { v28.s }[2], [x23], #0x4\n" + "tbz x11, #1, 147f\n" "st1 { v8.h }[6], [x9], #0x2\n" - "st1 { v12.h }[6], [x26], #0x2\n" - "st1 { v16.h }[6], [x25], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x11, #0, 160f\n" + "st1 { v12.h }[6], [x27], #0x2\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x24], #0x2\n" + "st1 { v28.h }[6], [x23], #0x2\n" + "tbz x11, #0, 154f\n" "st1 { v8.b }[14], [x9]\n" - "st1 { v12.b }[14], [x26]\n" - "st1 { v16.b }[14], [x25]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b 
}[14], [x22]\n" - "b 160f\n" - "153:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x11, #0, 160f\n" + "st1 { v12.b }[14], [x27]\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x24]\n" + "st1 { v28.b }[14], [x23]\n" + "b 154f\n" + "147:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 154f\n" "st1 { v8.b }[12], [x9]\n" - "st1 { v12.b }[12], [x26]\n" - "st1 { v16.b }[12], [x25]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x22]\n" - "b 160f\n" - "154:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x11, #1, 155f\n" + "st1 { v12.b }[12], [x27]\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x24]\n" + "st1 { v28.b }[12], [x23]\n" + "b 154f\n" + "148:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 149f\n" "st1 { v8.h }[4], [x9], #0x2\n" - "st1 { v12.h }[4], [x26], #0x2\n" - "st1 { v16.h }[4], [x25], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x11, #0, 160f\n" + "st1 { v12.h }[4], [x27], #0x2\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x24], #0x2\n" + "st1 { v28.h }[4], [x23], #0x2\n" + "tbz x11, #0, 154f\n" "st1 { v8.b }[10], [x9]\n" - "st1 { v12.b }[10], [x26]\n" - "st1 { v16.b }[10], [x25]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x22]\n" - "b 160f\n" - "155:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x11, #0, 160f\n" + "st1 { v12.b }[10], [x27]\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x24]\n" + "st1 { v28.b }[10], [x23]\n" + "b 154f\n" + "149:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 154f\n" "st1 { v8.b }[8], [x9]\n" - "st1 { v12.b }[8], [x26]\n" - "st1 { v16.b }[8], [x25]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b 
}[8], [x23]\n" - "st1 { v28.b }[8], [x22]\n" - "b 160f\n" - "156:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x11, #2, 158f\n" + "st1 { v12.b }[8], [x27]\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x24]\n" + "st1 { v28.b }[8], [x23]\n" + "b 154f\n" + "150:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 152f\n" "str s8, [x9], #0x4\n" - "str s12, [x26], #0x4\n" - "str s16, [x25], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x22], #0x4\n" - "tbz x11, #1, 157f\n" + "str s12, [x27], #0x4\n" + "str s16, [x26], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x24], #0x4\n" + "str s28, [x23], #0x4\n" + "tbz x11, #1, 151f\n" "st1 { v8.h }[2], [x9], #0x2\n" - "st1 { v12.h }[2], [x26], #0x2\n" - "st1 { v16.h }[2], [x25], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x11, #0, 160f\n" + "st1 { v12.h }[2], [x27], #0x2\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x24], #0x2\n" + "st1 { v28.h }[2], [x23], #0x2\n" + "tbz x11, #0, 154f\n" "st1 { v8.b }[6], [x9]\n" - "st1 { v12.b }[6], [x26]\n" - "st1 { v16.b }[6], [x25]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "st1 { v28.b }[6], [x22]\n" - "b 160f\n" - "157:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x11, #0, 160f\n" + "st1 { v12.b }[6], [x27]\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x24]\n" + "st1 { v28.b }[6], [x23]\n" + "b 154f\n" + "151:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 154f\n" "st1 { v8.b }[4], [x9]\n" - "st1 { v12.b }[4], [x26]\n" - "st1 { v16.b }[4], [x25]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x22]\n" - "b 160f\n" - "158:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x11, #1, 159f\n" + "st1 { v12.b }[4], [x27]\n" + "st1 
{ v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x24]\n" + "st1 { v28.b }[4], [x23]\n" + "b 154f\n" + "152:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 153f\n" "str h8, [x9], #0x2\n" - "str h12, [x26], #0x2\n" - "str h16, [x25], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "str h28, [x22], #0x2\n" - "tbz x11, #0, 160f\n" + "str h12, [x27], #0x2\n" + "str h16, [x26], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x24], #0x2\n" + "str h28, [x23], #0x2\n" + "tbz x11, #0, 154f\n" "st1 { v8.b }[2], [x9]\n" - "st1 { v12.b }[2], [x26]\n" - "st1 { v16.b }[2], [x25]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x22]\n" - "b 160f\n" - "159:" // Height 6: Partial direct writeback: partial_1_0 + "st1 { v12.b }[2], [x27]\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x24]\n" + "st1 { v28.b }[2], [x23]\n" + "b 154f\n" + "153:" // Height 6: Partial direct writeback: partial_1_0 "str b8, [x9, #0x0]\n" - "str b12, [x26, #0x0]\n" - "str b16, [x25, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x22, #0x0]\n" - "160:" // Height 6: Partial direct writeback: Done - "b 162f\n" - "161:" // Height 6: Full writeback + "str b12, [x27, #0x0]\n" + "str b16, [x26, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x24, #0x0]\n" + "str b28, [x23, #0x0]\n" + "154:" // Height 6: Partial direct writeback: Done + "b 156f\n" + "155:" // Height 6: Full writeback "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q12, [x26, #0x0]\n" - "str q16, [x25, #0x0]\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x22, #0x0]\n" - "162:" // Height 6: Writeback done + "str q12, [x27, #0x0]\n" + "str q16, [x26, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x24, #0x0]\n" + "str q28, [x23, #0x0]\n" + "156:" // Height 6: Writeback done "subs x11, x11, #0x10\n" - "bgt 137b\n" + "bgt 132b\n" "subs %x[M], %x[M], #0x6\n" - 
"beq 164f\n" + "beq 158f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 163f\n" + "tbz %x[flags], #3, 157f\n" "add x21, x21, #0x6\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "163:" // Update direct input + "157:" // Update direct input "mov x20, #0x6\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "164:" // Exit + "158:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp index 867bcded1f..ceb1a39562 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -81,22 +80,19 @@ void a64_hybrid_s8qs_mmla_6x16 ( ka.multiplier_ptr=qp->per_channel_muls + col_base; ka.shift_ptr=qp->per_channel_right_shifts + col_base; } - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 146f\n" + "bge 141f\n" "cmp %x[M], #0x4\n" - "bgt 117f\n" - "beq 88f\n" + "bgt 113f\n" + "beq 85f\n" "cmp %x[M], #0x2\n" - "bgt 59f\n" - "beq 30f\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 57f\n" + "beq 29f\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" @@ -109,7 +105,6 @@ void a64_hybrid_s8qs_mmla_6x16 ( "movi v13.4s, #0x0\n" "movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "3:" // Height 1: setup done "mov x28, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -138,40 +133,40 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x26, x26, #0x10\n" "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" - "trn1 v18.2d, v1.2d, v21.2d\n" - "trn2 v1.2d, v1.2d, v21.2d\n" - ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" - "ldr q16, [x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" 
- ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xf0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 
0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 7b\n" @@ -179,65 +174,65 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" - "trn1 v18.2d, v1.2d, v16.2d\n" - "trn2 v1.2d, v1.2d, v16.2d\n" - ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" - "ldr q16, [x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xf0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 
0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" - ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "9:" // Height 1: Multiply loop: Main loop skip "cbz x27, 16f\n" "cmp x27, #0x8\n" "blt 11f\n" "10:" // Height 1: Multiply loop: Odd block loop - "ldr d18, [x26], #0x8\n" - "ldr q19, [x10, #0x0]\n" + "ldr d1, [x26], #0x8\n" + "ldr q6, [x10, #0x0]\n" "sub x27, x27, #0x8\n" - "ldr q16, [x10, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "cmp x27, #0x8\n" - "trn1 v18.2d, v18.2d, v17.2d\n" - ".inst 0x4e93a648 // smmla v8.4s, v18.16b, v19.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - 
"ldr q16, [x10, #0x70]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" "bge 10b\n" "11:" // Height 1: Multiply loop: Skip odd blocks "cbz x27, 16f\n" @@ -261,55 +256,55 @@ void a64_hybrid_s8qs_mmla_6x16 ( "14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "15:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q17, [x10, #0x0]\n" - "ldr q19, [x10, #0x10]\n" - "trn1 v18.2d, v1.2d, v16.2d\n" - ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e93a64c // smmla v12.4s, v18.16b, v19.16b\n" - "ldr q16, [x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x70]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, 
#0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" "16:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 4b\n" - "ldr q19, [x14, #0x0]\n" - "ldr q18, [x14, #0x10]\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" "uzp1 v8.2d, v8.2d, v12.2d\n" "uzp1 v9.2d, v9.2d, v13.2d\n" - "ldr q17, [x14, #0x20]\n" - "ldr q16, [x14, #0x30]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" "uzp1 v10.2d, v10.2d, v14.2d\n" "uzp1 v11.2d, v11.2d, v15.2d\n" "prfm pstl1keep, [x9, #0x0]\n" - "add x14, x14, #0x40\n" + "add x12, x12, #0x40\n" "mov v15.16b, v8.16b\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v15.4s, v15.4s, v19.4s\n" - "add v10.4s, v10.4s, v17.4s\n" - "add v11.4s, v11.4s, v16.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" "tbz %x[flags], #4, 17f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" + "add x14, x14, #0x40\n" "b 18f\n" "17:" // Height 1: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" @@ 
-323,111 +318,97 @@ void a64_hybrid_s8qs_mmla_6x16 ( "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" "18:" // Height 1: parameters loaded - "sqrdmulh v15.4s, v15.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "tbz %x[flags], #5, 19f\n" - "and v18.16b, v15.16b, v0.16b\n" - "and v17.16b, v9.16b, v1.16b\n" - "and v26.16b, v10.16b, v2.16b\n" - "and v16.16b, v11.16b, v3.16b\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v18.4s\n" - "sqadd v9.4s, v9.4s, v17.4s\n" - "sqadd v10.4s, v10.4s, v26.4s\n" - "sqadd v11.4s, v11.4s, v16.4s\n" - "19:" // Height 1: no shift correction + "sqdmulh v15.4s, v15.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v4.4s }, [x21]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "ld1r { v6.4s }, [x20]\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "ld1r { v5.4s }, [x20]\n" "srshl v15.4s, v15.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v18.4s }, [x21]\n" - "ld1r { v17.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v16.4s }, [x20]\n" - "add v15.4s, v15.4s, v18.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v18.4s\n" - "add v11.4s, v11.4s, v18.4s\n" - "smin v15.4s, v15.4s, v17.4s\n" - "smin v9.4s, v9.4s, v17.4s\n" - "smin v10.4s, v10.4s, v17.4s\n" - "smin v11.4s, v11.4s, v17.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + 
"smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" "uzp1 v15.8h, v15.8h, v9.8h\n" - "uzp1 v16.8h, v10.8h, v11.8h\n" - "uzp1 v15.16b, v15.16b, v16.16b\n" - "bge 28f\n" - "tbz x11, #3, 23f\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v15.16b, v15.16b, v9.16b\n" + "bge 27f\n" + "tbz x11, #3, 22f\n" "str d15, [x9], #0x8\n" - "tbz x11, #2, 21f\n" + "tbz x11, #2, 20f\n" "st1 { v15.s }[2], [x9], #0x4\n" - "tbz x11, #1, 20f\n" + "tbz x11, #1, 19f\n" "st1 { v15.h }[6], [x9], #0x2\n" - "tbz x11, #0, 27f\n" + "tbz x11, #0, 26f\n" "st1 { v15.b }[14], [x9]\n" - "b 27f\n" - "20:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x11, #0, 27f\n" + "b 26f\n" + "19:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 26f\n" "st1 { v15.b }[12], [x9]\n" - "b 27f\n" - "21:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x11, #1, 22f\n" + "b 26f\n" + "20:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 21f\n" "st1 { v15.h }[4], [x9], #0x2\n" - "tbz x11, #0, 27f\n" + "tbz x11, #0, 26f\n" "st1 { v15.b }[10], [x9]\n" - "b 27f\n" - "22:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x11, #0, 27f\n" + "b 26f\n" + "21:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 26f\n" "st1 { v15.b }[8], [x9]\n" - "b 27f\n" - "23:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x11, #2, 25f\n" + "b 26f\n" + "22:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 24f\n" "str s15, [x9], #0x4\n" - "tbz x11, #1, 24f\n" + "tbz x11, #1, 23f\n" "st1 { v15.h }[2], [x9], #0x2\n" - "tbz x11, #0, 27f\n" + "tbz x11, #0, 26f\n" "st1 { v15.b }[6], [x9]\n" - "b 27f\n" - "24:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x11, #0, 27f\n" + "b 26f\n" + "23:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 26f\n" "st1 { v15.b }[4], 
[x9]\n" - "b 27f\n" - "25:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x11, #1, 26f\n" + "b 26f\n" + "24:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 25f\n" "str h15, [x9], #0x2\n" - "tbz x11, #0, 27f\n" + "tbz x11, #0, 26f\n" "st1 { v15.b }[2], [x9]\n" - "b 27f\n" - "26:" // Height 1: Partial direct writeback: partial_1_0 + "b 26f\n" + "25:" // Height 1: Partial direct writeback: partial_1_0 "str b15, [x9, #0x0]\n" - "27:" // Height 1: Partial direct writeback: Done - "b 29f\n" - "28:" // Height 1: Full writeback + "26:" // Height 1: Partial direct writeback: Done + "b 28f\n" + "27:" // Height 1: Full writeback "str q15, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "29:" // Height 1: Writeback done + "28:" // Height 1: Writeback done "subs x11, x11, #0x10\n" "bgt 2b\n" - "b 176f\n" - "30:" // Height 2 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "b 170f\n" + "29:" // Height 2 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "31:" // Height 2: Column loop + "30:" // Height 2: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -436,36 +417,35 @@ void a64_hybrid_s8qs_mmla_6x16 ( "movi v13.4s, #0x0\n" "movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "32:" // Height 2: setup done "mov x28, #0x0\n" - "33:" // Height 2: String loop + "32:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 34f\n" + "tbz %x[flags], #3, 33f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" - "cbnz 
x28, 35f\n" + "cbnz x28, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" - "b 35f\n" - "34:" // Height 2: setup direct input + "b 34f\n" + "33:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" - "35:" // Height 2: input setup done + "34:" // Height 2: input setup done "cmp x27, #0x10\n" - "blt 38f\n" + "blt 37f\n" "ldr q1, [x26, #0x0]\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" "ldr q7, [x10, #0x0]\n" "ldr q6, [x10, #0x10]\n" - "blt 37f\n" - "36:" // Height 2: Multiply loop: Main loop head - "trn1 v18.2d, v1.2d, v2.2d\n" + "blt 36f\n" + "35:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" @@ -474,166 +454,166 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "ldr q2, [x25, #0x0]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" - "ldr q16, [x10, #0x20]\n" - ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" - "ldr q17, [x10, #0x30]\n" - ".inst 0x4e90a649 // smmla v9.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x40]\n" - ".inst 0x4e91a64d // smmla v13.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x50]\n" - ".inst 0x4e90a64a // smmla v10.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x60]\n" - ".inst 0x4e91a64e // smmla v14.4s, v18.16b, v17.16b\n" - "ldr q31, [x10, #0x70]\n" - ".inst 0x4e90a64b // smmla v11.4s, v18.16b, v16.16b\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4e9fa64f // smmla v15.4s, v18.16b, v31.16b\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4e91a42a // smmla v10.4s, v1.16b, 
v17.16b\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xf0]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x10, #0x10]\n" - "bge 36b\n" - "37:" // Height 2: Multiply loop: Single iteration only - "trn1 v18.2d, v1.2d, v2.2d\n" + "bge 35b\n" + "36:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a648 // smmla v8.4s, 
v18.16b, v7.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" - "ldr q16, [x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x70]\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x80]\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x90]\n" - ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xa0]\n" - ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xb0]\n" - ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xc0]\n" - ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xd0]\n" - ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" - "ldr q17, [x10, #0xe0]\n" - ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" - "ldr q16, [x10, #0xf0]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, 
v1.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" - ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" - "38:" // Height 2: Multiply loop: Main loop skip - "cbz x27, 45f\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + "37:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 44f\n" "cmp x27, #0x8\n" - "blt 40f\n" - "39:" // Height 2: Multiply loop: Odd block loop - "ldr d19, [x26], #0x8\n" - "ldr d18, [x25], #0x8\n" + "blt 39f\n" + "38:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" "sub x27, x27, #0x8\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" "cmp x27, #0x8\n" - "trn1 v18.2d, v19.2d, v18.2d\n" - ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x70]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4e86a40a 
// smmla v10.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" - "bge 39b\n" - "40:" // Height 2: Multiply loop: Skip odd blocks - "cbz x27, 45f\n" - "tbz x27, #2, 42f\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + "bge 38b\n" + "39:" // Height 2: Multiply loop: Skip odd blocks + "cbz x27, 44f\n" + "tbz x27, #2, 41f\n" "ldr s1, [x26], #0x4\n" "ldr s2, [x25], #0x4\n" - "tbz x27, #1, 41f\n" + "tbz x27, #1, 40f\n" "ld1 { v1.h }[2], [x26], #0x2\n" "ld1 { v2.h }[2], [x25], #0x2\n" - "tbz x27, #0, 44f\n" + "tbz x27, #0, 43f\n" "ld1 { v1.b }[6], [x26]\n" "ld1 { v2.b }[6], [x25]\n" - "b 44f\n" - "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 - "tbz x27, #0, 44f\n" + "b 43f\n" + "40:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 43f\n" "ld1 { v1.b }[4], [x26]\n" "ld1 { v2.b }[4], [x25]\n" - "b 44f\n" - "42:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 - "tbz x27, #1, 43f\n" + "b 43f\n" + "41:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 42f\n" "ldr h1, [x26], #0x2\n" "ldr h2, [x25], #0x2\n" - "tbz x27, #0, 44f\n" + "tbz x27, #0, 43f\n" "ld1 { v1.b }[2], [x26]\n" "ld1 { v2.b }[2], [x25]\n" - "b 44f\n" - "43:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 43f\n" + "42:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" - "44:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - "trn1 v18.2d, v1.2d, v2.2d\n" - ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x20]\n" - ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n" - "ldr q16, 
[x10, #0x30]\n" - ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x40]\n" - ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x50]\n" - ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" - "ldr q17, [x10, #0x60]\n" - ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" - "ldr q16, [x10, #0x70]\n" + "43:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" - ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" - "45:" // Height 2: Multiply loop: No odd multiplies + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "44:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 33b\n" - "ldr q19, [x14, #0x0]\n" - "ldr q18, [x14, #0x10]\n" - "uzp1 v17.2d, v8.2d, v12.2d\n" + "bne 32b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q5, [x14, #0x20]\n" - "ldr q16, [x14, #0x30]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -642,31 +622,31 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "uzp1 v14.2d, 
v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" - "add x14, x14, #0x40\n" - "mov v15.16b, v17.16b\n" - "add v12.4s, v12.4s, v18.4s\n" - "add x26, x9, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add v13.4s, v13.4s, v5.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "add v15.4s, v15.4s, v19.4s\n" - "add v14.4s, v14.4s, v16.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v5.4s\n" - "add v11.4s, v11.4s, v16.4s\n" - "tbz %x[flags], #4, 46f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" "add x12, x12, #0x40\n" + "mov v15.16b, v7.16b\n" + "add v12.4s, v12.4s, v1.4s\n" + "add x27, x9, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 45f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 47f\n" - "46:" // Height 2: per layer parameters + "add x14, x14, #0x40\n" + "b 46f\n" + "45:" // Height 2: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -677,163 +657,137 @@ void a64_hybrid_s8qs_mmla_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "47:" // Height 2: parameters loaded - "sqrdmulh v15.4s, v15.4s, v4.4s\n" - "sqrdmulh v12.4s, v12.4s, v5.4s\n" - "sqrdmulh v13.4s, v13.4s, v6.4s\n" - "sqrdmulh v14.4s, v14.4s, v7.4s\n" - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "tbz 
%x[flags], #5, 48f\n" - "and v19.16b, v15.16b, v0.16b\n" - "and v18.16b, v12.16b, v1.16b\n" - "and v17.16b, v13.16b, v2.16b\n" - "and v16.16b, v14.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v19.4s\n" - "and v19.16b, v8.16b, v0.16b\n" - "sqadd v12.4s, v12.4s, v18.4s\n" - "and v18.16b, v9.16b, v1.16b\n" - "sqadd v13.4s, v13.4s, v17.4s\n" - "sqadd v14.4s, v14.4s, v16.4s\n" - "and v17.16b, v10.16b, v2.16b\n" - "and v16.16b, v11.16b, v3.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v18.4s, v18.4s, #0x1f\n" - "sshr v17.4s, v17.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v19.4s\n" - "sqadd v9.4s, v9.4s, v18.4s\n" - "sqadd v10.4s, v10.4s, v17.4s\n" - "sqadd v11.4s, v11.4s, v16.4s\n" - "48:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "46:" // Height 2: parameters loaded + "sqdmulh v15.4s, v15.4s, v4.4s\n" + "sqdmulh v12.4s, v12.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v13.4s, v13.4s, v6.4s\n" + "sqdmulh v14.4s, v14.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" "srshl v15.4s, v15.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v18.4s }, [x21]\n" - "ld1r { v17.4s }, [x20]\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v16.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add v15.4s, v15.4s, v18.4s\n" - "add v12.4s, v12.4s, v18.4s\n" - "add v13.4s, v13.4s, v18.4s\n" - "add v14.4s, v14.4s, 
v18.4s\n" - "add v8.4s, v8.4s, v18.4s\n" - "add v9.4s, v9.4s, v18.4s\n" - "add v10.4s, v10.4s, v18.4s\n" - "add v11.4s, v11.4s, v18.4s\n" - "smin v15.4s, v15.4s, v17.4s\n" - "smin v12.4s, v12.4s, v17.4s\n" - "smin v13.4s, v13.4s, v17.4s\n" - "smin v14.4s, v14.4s, v17.4s\n" - "smin v8.4s, v8.4s, v17.4s\n" - "smin v9.4s, v9.4s, v17.4s\n" - "smin v10.4s, v10.4s, v17.4s\n" - "smin v11.4s, v11.4s, v17.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smax v12.4s, v12.4s, v16.4s\n" - "smax v13.4s, v13.4s, v16.4s\n" - "smax v14.4s, v14.4s, v16.4s\n" - "smax v8.4s, v8.4s, v16.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" "uzp1 v15.8h, v15.8h, v12.8h\n" - "uzp1 v17.8h, v13.8h, v14.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v16.8h, v10.8h, v11.8h\n" - "uzp1 v15.16b, v15.16b, v17.16b\n" - "uzp1 v8.16b, v8.16b, v16.16b\n" - "bge 57f\n" - "tbz x11, #3, 52f\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v15.16b, v15.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "bge 55f\n" + "tbz x11, #3, 50f\n" "str d15, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "tbz x11, #2, 50f\n" + "str d8, [x27], #0x8\n" + "tbz x11, #2, 48f\n" "st1 { v15.s }[2], 
[x9], #0x4\n" - "st1 { v8.s }[2], [x26], #0x4\n" - "tbz x11, #1, 49f\n" + "st1 { v8.s }[2], [x27], #0x4\n" + "tbz x11, #1, 47f\n" "st1 { v15.h }[6], [x9], #0x2\n" - "st1 { v8.h }[6], [x26], #0x2\n" - "tbz x11, #0, 56f\n" + "st1 { v8.h }[6], [x27], #0x2\n" + "tbz x11, #0, 54f\n" "st1 { v15.b }[14], [x9]\n" - "st1 { v8.b }[14], [x26]\n" - "b 56f\n" - "49:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x11, #0, 56f\n" + "st1 { v8.b }[14], [x27]\n" + "b 54f\n" + "47:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 54f\n" "st1 { v15.b }[12], [x9]\n" - "st1 { v8.b }[12], [x26]\n" - "b 56f\n" - "50:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x11, #1, 51f\n" + "st1 { v8.b }[12], [x27]\n" + "b 54f\n" + "48:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 49f\n" "st1 { v15.h }[4], [x9], #0x2\n" - "st1 { v8.h }[4], [x26], #0x2\n" - "tbz x11, #0, 56f\n" + "st1 { v8.h }[4], [x27], #0x2\n" + "tbz x11, #0, 54f\n" "st1 { v15.b }[10], [x9]\n" - "st1 { v8.b }[10], [x26]\n" - "b 56f\n" - "51:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x11, #0, 56f\n" + "st1 { v8.b }[10], [x27]\n" + "b 54f\n" + "49:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 54f\n" "st1 { v15.b }[8], [x9]\n" - "st1 { v8.b }[8], [x26]\n" - "b 56f\n" - "52:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x11, #2, 54f\n" + "st1 { v8.b }[8], [x27]\n" + "b 54f\n" + "50:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 52f\n" "str s15, [x9], #0x4\n" - "str s8, [x26], #0x4\n" - "tbz x11, #1, 53f\n" + "str s8, [x27], #0x4\n" + "tbz x11, #1, 51f\n" "st1 { v15.h }[2], [x9], #0x2\n" - "st1 { v8.h }[2], [x26], #0x2\n" - "tbz x11, #0, 56f\n" + "st1 { v8.h }[2], [x27], #0x2\n" + "tbz x11, #0, 54f\n" "st1 { v15.b }[6], [x9]\n" - "st1 { v8.b }[6], [x26]\n" - "b 56f\n" - "53:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x11, #0, 56f\n" + "st1 { v8.b }[6], [x27]\n" + "b 54f\n" + 
"51:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 54f\n" "st1 { v15.b }[4], [x9]\n" - "st1 { v8.b }[4], [x26]\n" - "b 56f\n" - "54:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x11, #1, 55f\n" + "st1 { v8.b }[4], [x27]\n" + "b 54f\n" + "52:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 53f\n" "str h15, [x9], #0x2\n" - "str h8, [x26], #0x2\n" - "tbz x11, #0, 56f\n" + "str h8, [x27], #0x2\n" + "tbz x11, #0, 54f\n" "st1 { v15.b }[2], [x9]\n" - "st1 { v8.b }[2], [x26]\n" - "b 56f\n" - "55:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v8.b }[2], [x27]\n" + "b 54f\n" + "53:" // Height 2: Partial direct writeback: partial_1_0 "str b15, [x9, #0x0]\n" - "str b8, [x26, #0x0]\n" - "56:" // Height 2: Partial direct writeback: Done - "b 58f\n" - "57:" // Height 2: Full writeback + "str b8, [x27, #0x0]\n" + "54:" // Height 2: Partial direct writeback: Done + "b 56f\n" + "55:" // Height 2: Full writeback "str q15, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q8, [x26, #0x0]\n" - "58:" // Height 2: Writeback done + "str q8, [x27, #0x0]\n" + "56:" // Height 2: Writeback done "subs x11, x11, #0x10\n" - "bgt 31b\n" - "b 176f\n" - "59:" // Height 3 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 30b\n" + "b 170f\n" + "57:" // Height 3 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "60:" // Height 3: Column loop + "58:" // Height 3: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -850,40 +804,39 @@ void a64_hybrid_s8qs_mmla_6x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "61:" // Height 3: setup done "mov x28, #0x0\n" - "62:" // 
Height 3: String loop + "60:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 63f\n" + "tbz %x[flags], #3, 61f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" - "cbnz x28, 64f\n" + "cbnz x28, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" - "b 64f\n" - "63:" // Height 3: setup direct input + "b 62f\n" + "61:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" - "64:" // Height 3: input setup done + "62:" // Height 3: input setup done "cmp x27, #0x10\n" - "blt 67f\n" + "blt 65f\n" "ldr q1, [x26, #0x0]\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" "ldr q3, [x24, #0x0]\n" "ldr q7, [x10, #0x0]\n" "ldr q6, [x10, #0x10]\n" - "blt 66f\n" - "65:" // Height 3: Multiply loop: Main loop head - "trn1 v27.2d, v1.2d, v2.2d\n" + "blt 64f\n" + "63:" // Height 3: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" @@ -892,64 +845,64 @@ void a64_hybrid_s8qs_mmla_6x16 ( "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - "trn1 v26.2d, v3.2d, v24.2d\n" - ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" - ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - "trn2 v3.2d, v3.2d, v24.2d\n" - ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" - "ldr q24, [x10, #0x30]\n" - ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" - 
".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x80]\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x90]\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xa0]\n" - ".inst 0x4e98a42c // 
smmla v12.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xb0]\n" - ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xc0]\n" - ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xd0]\n" - ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xe0]\n" - ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xf0]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, 
v3.16b, v6.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" - "bge 65b\n" - "66:" // Height 3: Multiply loop: Single iteration only - "trn1 v27.2d, v1.2d, v2.2d\n" + "bge 63b\n" + "64:" // Height 3: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" @@ -958,167 +911,167 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sub x27, x27, #0x10\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - "trn1 v26.2d, v3.2d, v24.2d\n" - ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" - ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" - "trn2 v3.2d, v3.2d, v24.2d\n" - ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" - "ldr q24, [x10, #0x20]\n" - ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" - "ldr q4, [x10, #0x30]\n" - ".inst 0x4e98a769 // smmla v9.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a751 // smmla v17.4s, v26.16b, v24.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e84a76d // smmla v13.4s, v27.16b, v4.16b\n" - ".inst 0x4e84a755 // smmla v21.4s, v26.16b, v4.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x80]\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x90]\n" - ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xa0]\n" - ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xb0]\n" - ".inst 0x4e99a429 // smmla 
v9.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xc0]\n" - ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xd0]\n" - ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xe0]\n" - ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xf0]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, 
v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" - ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" - "67:" // Height 3: Multiply loop: Main loop skip - "cbz x27, 74f\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "65:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 72f\n" "cmp x27, #0x8\n" - "blt 69f\n" - "68:" // Height 3: Multiply loop: Odd block loop - "ldr d29, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" + "blt 67f\n" + "66:" // Height 3: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" "sub x27, x27, #0x8\n" - "ldr d26, [x24], #0x8\n" - "ldr q25, [x10, #0x0]\n" + "ldr d3, [x24], #0x8\n" + "ldr q6, [x10, #0x0]\n" "cmp x27, #0x8\n" - "ldr q24, [x10, #0x10]\n" - "trn1 v27.2d, v29.2d, v27.2d\n" - "trn1 v26.2d, v26.2d, v28.2d\n" - ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x30]\n" - ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a751 // smmla v17.4s, 
v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" + "ldr q7, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "bge 68b\n" - "69:" // Height 3: Multiply loop: Skip odd blocks - "cbz x27, 74f\n" - "tbz x27, #2, 71f\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "bge 66b\n" + "67:" // Height 3: 
Multiply loop: Skip odd blocks + "cbz x27, 72f\n" + "tbz x27, #2, 69f\n" "ldr s1, [x26], #0x4\n" "ldr s2, [x25], #0x4\n" "ldr s3, [x24], #0x4\n" - "tbz x27, #1, 70f\n" + "tbz x27, #1, 68f\n" "ld1 { v1.h }[2], [x26], #0x2\n" "ld1 { v2.h }[2], [x25], #0x2\n" "ld1 { v3.h }[2], [x24], #0x2\n" - "tbz x27, #0, 73f\n" + "tbz x27, #0, 71f\n" "ld1 { v1.b }[6], [x26]\n" "ld1 { v2.b }[6], [x25]\n" "ld1 { v3.b }[6], [x24]\n" - "b 73f\n" - "70:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 - "tbz x27, #0, 73f\n" + "b 71f\n" + "68:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 71f\n" "ld1 { v1.b }[4], [x26]\n" "ld1 { v2.b }[4], [x25]\n" "ld1 { v3.b }[4], [x24]\n" - "b 73f\n" - "71:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 - "tbz x27, #1, 72f\n" + "b 71f\n" + "69:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 70f\n" "ldr h1, [x26], #0x2\n" "ldr h2, [x25], #0x2\n" "ldr h3, [x24], #0x2\n" - "tbz x27, #0, 73f\n" + "tbz x27, #0, 71f\n" "ld1 { v1.b }[2], [x26]\n" "ld1 { v2.b }[2], [x25]\n" "ld1 { v3.b }[2], [x24]\n" - "b 73f\n" - "72:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 71f\n" + "70:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" - "73:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q25, [x10, #0x0]\n" - "ldr q28, [x10, #0x10]\n" - "trn1 v27.2d, v1.2d, v2.2d\n" - "trn1 v26.2d, v3.2d, v24.2d\n" - ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" - ".inst 0x4e9ca76c // smmla v12.4s, v27.16b, v28.16b\n" - ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e9ca754 // smmla v20.4s, v26.16b, v28.16b\n" - "ldr q24, [x10, #0x30]\n" - ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, 
v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" + "71:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "74:" // Height 3: Multiply loop: No odd multiplies + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "72:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 62b\n" - "ldr q28, [x14, #0x0]\n" - "ldr q27, [x14, #0x10]\n" - "uzp1 v26.2d, v8.2d, v12.2d\n" + "bne 60b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1127,41 +1080,41 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" - "add x14, x14, #0x40\n" + "add x12, x12, #0x40\n" "uzp1 v16.2d, v16.2d, v20.2d\n" "uzp1 v17.2d, v17.2d, v21.2d\n" "uzp1 v18.2d, v18.2d, v22.2d\n" "uzp1 v19.2d, v19.2d, v23.2d\n" - "add x26, x9, x20\n" - "mov v23.16b, v26.16b\n" - "add x25, x26, x20\n" + "add x27, x9, x20\n" + "mov v23.16b, v7.16b\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add v12.4s, v12.4s, v1.4s\n" "prfm pstl1keep, [x26, #0x0]\n" - "add v12.4s, v12.4s, v27.4s\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add v13.4s, v13.4s, v25.4s\n" - "add v14.4s, v14.4s, v24.4s\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v8.4s, v8.4s, v28.4s\n" - "add v9.4s, v9.4s, v27.4s\n" - "add v10.4s, v10.4s, v25.4s\n" - "add v11.4s, v11.4s, v24.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v25.4s\n" - "add v19.4s, v19.4s, v24.4s\n" - "tbz %x[flags], #4, 75f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, 
v11.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 73f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 76f\n" - "75:" // Height 3: per layer parameters + "add x14, x14, #0x40\n" + "b 74f\n" + "73:" // Height 3: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -1172,214 +1125,176 @@ void a64_hybrid_s8qs_mmla_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "76:" // Height 3: parameters loaded - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v12.4s, v12.4s, v5.4s\n" - "sqrdmulh v13.4s, v13.4s, v6.4s\n" - "sqrdmulh v14.4s, v14.4s, v7.4s\n" - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "tbz %x[flags], #5, 77f\n" - "and v24.16b, v23.16b, v0.16b\n" - "and v22.16b, v12.16b, v1.16b\n" - "and v21.16b, v13.16b, v2.16b\n" - "and v20.16b, v14.16b, v3.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "and v24.16b, v8.16b, v0.16b\n" - "sqadd v12.4s, v12.4s, v22.4s\n" - "and v22.16b, v9.16b, v1.16b\n" - "sqadd v13.4s, v13.4s, v21.4s\n" - "sqadd v14.4s, v14.4s, v20.4s\n" - "and v21.16b, v10.16b, v2.16b\n" - "and v20.16b, v11.16b, v3.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, 
v24.4s\n" - "and v24.16b, v16.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v22.4s\n" - "and v22.16b, v17.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v21.4s\n" - "sqadd v11.4s, v11.4s, v20.4s\n" - "and v21.16b, v18.16b, v2.16b\n" - "and v20.16b, v19.16b, v3.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v24.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "77:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "74:" // Height 3: parameters loaded + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v12.4s, v12.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v13.4s, v13.4s, v6.4s\n" + "sqdmulh v14.4s, v14.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x21]\n" - "ld1r { v21.4s }, [x20]\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v20.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v22.4s\n" - "add v12.4s, v12.4s, v22.4s\n" - "add v13.4s, v13.4s, v22.4s\n" - "add v14.4s, v14.4s, v22.4s\n" - "add v8.4s, v8.4s, v22.4s\n" - "add 
v9.4s, v9.4s, v22.4s\n" - "add v10.4s, v10.4s, v22.4s\n" - "add v11.4s, v11.4s, v22.4s\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v23.4s, v23.4s, v21.4s\n" - "smin v12.4s, v12.4s, v21.4s\n" - "smin v13.4s, v13.4s, v21.4s\n" - "smin v14.4s, v14.4s, v21.4s\n" - "smin v8.4s, v8.4s, v21.4s\n" - "smin v9.4s, v9.4s, v21.4s\n" - "smin v10.4s, v10.4s, v21.4s\n" - "smin v11.4s, v11.4s, v21.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v23.4s, v23.4s, v20.4s\n" - "smax v12.4s, v12.4s, v20.4s\n" - "smax v13.4s, v13.4s, v20.4s\n" - "smax v14.4s, v14.4s, v20.4s\n" - "smax v8.4s, v8.4s, v20.4s\n" - "smax v9.4s, v9.4s, v20.4s\n" - "smax v10.4s, v10.4s, v20.4s\n" - "smax v11.4s, v11.4s, v20.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + 
"smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v12.8h\n" - "uzp1 v21.8h, v13.8h, v14.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v20.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v21.16b\n" - "uzp1 v8.16b, v8.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 86f\n" - "tbz x11, #3, 81f\n" + "bge 83f\n" + "tbz x11, #3, 78f\n" "str d23, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d16, [x25], #0x8\n" - "tbz x11, #2, 79f\n" + "str d8, [x27], #0x8\n" + "str d16, [x26], #0x8\n" + "tbz x11, #2, 76f\n" "st1 { v23.s }[2], [x9], #0x4\n" - "st1 { v8.s }[2], [x26], #0x4\n" - "st1 { v16.s }[2], [x25], #0x4\n" - "tbz x11, #1, 78f\n" + "st1 { v8.s }[2], [x27], #0x4\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz x11, #1, 75f\n" "st1 { v23.h }[6], [x9], #0x2\n" - "st1 { v8.h }[6], [x26], #0x2\n" - "st1 { v16.h }[6], [x25], #0x2\n" - "tbz x11, #0, 85f\n" + "st1 { v8.h }[6], [x27], #0x2\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x11, #0, 82f\n" "st1 { v23.b }[14], [x9]\n" - "st1 { v8.b }[14], [x26]\n" - "st1 { v16.b }[14], [x25]\n" - "b 85f\n" - "78:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x11, #0, 85f\n" + "st1 { v8.b }[14], [x27]\n" + "st1 { v16.b }[14], [x26]\n" + "b 82f\n" + "75:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 82f\n" "st1 { v23.b }[12], [x9]\n" - "st1 { v8.b }[12], [x26]\n" - "st1 { v16.b }[12], [x25]\n" - "b 85f\n" - "79:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x11, #1, 80f\n" + "st1 { v8.b }[12], [x27]\n" + "st1 { v16.b }[12], [x26]\n" + "b 82f\n" + "76:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 
77f\n" "st1 { v23.h }[4], [x9], #0x2\n" - "st1 { v8.h }[4], [x26], #0x2\n" - "st1 { v16.h }[4], [x25], #0x2\n" - "tbz x11, #0, 85f\n" + "st1 { v8.h }[4], [x27], #0x2\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x11, #0, 82f\n" "st1 { v23.b }[10], [x9]\n" - "st1 { v8.b }[10], [x26]\n" - "st1 { v16.b }[10], [x25]\n" - "b 85f\n" - "80:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x11, #0, 85f\n" + "st1 { v8.b }[10], [x27]\n" + "st1 { v16.b }[10], [x26]\n" + "b 82f\n" + "77:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 82f\n" "st1 { v23.b }[8], [x9]\n" - "st1 { v8.b }[8], [x26]\n" - "st1 { v16.b }[8], [x25]\n" - "b 85f\n" - "81:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x11, #2, 83f\n" + "st1 { v8.b }[8], [x27]\n" + "st1 { v16.b }[8], [x26]\n" + "b 82f\n" + "78:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 80f\n" "str s23, [x9], #0x4\n" - "str s8, [x26], #0x4\n" - "str s16, [x25], #0x4\n" - "tbz x11, #1, 82f\n" + "str s8, [x27], #0x4\n" + "str s16, [x26], #0x4\n" + "tbz x11, #1, 79f\n" "st1 { v23.h }[2], [x9], #0x2\n" - "st1 { v8.h }[2], [x26], #0x2\n" - "st1 { v16.h }[2], [x25], #0x2\n" - "tbz x11, #0, 85f\n" + "st1 { v8.h }[2], [x27], #0x2\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x11, #0, 82f\n" "st1 { v23.b }[6], [x9]\n" - "st1 { v8.b }[6], [x26]\n" - "st1 { v16.b }[6], [x25]\n" - "b 85f\n" - "82:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x11, #0, 85f\n" + "st1 { v8.b }[6], [x27]\n" + "st1 { v16.b }[6], [x26]\n" + "b 82f\n" + "79:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 82f\n" "st1 { v23.b }[4], [x9]\n" - "st1 { v8.b }[4], [x26]\n" - "st1 { v16.b }[4], [x25]\n" - "b 85f\n" - "83:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x11, #1, 84f\n" + "st1 { v8.b }[4], [x27]\n" + "st1 { v16.b }[4], [x26]\n" + "b 82f\n" + "80:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 81f\n" "str h23, [x9], #0x2\n" - "str h8, 
[x26], #0x2\n" - "str h16, [x25], #0x2\n" - "tbz x11, #0, 85f\n" + "str h8, [x27], #0x2\n" + "str h16, [x26], #0x2\n" + "tbz x11, #0, 82f\n" "st1 { v23.b }[2], [x9]\n" - "st1 { v8.b }[2], [x26]\n" - "st1 { v16.b }[2], [x25]\n" - "b 85f\n" - "84:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v8.b }[2], [x27]\n" + "st1 { v16.b }[2], [x26]\n" + "b 82f\n" + "81:" // Height 3: Partial direct writeback: partial_1_0 "str b23, [x9, #0x0]\n" - "str b8, [x26, #0x0]\n" - "str b16, [x25, #0x0]\n" - "85:" // Height 3: Partial direct writeback: Done - "b 87f\n" - "86:" // Height 3: Full writeback + "str b8, [x27, #0x0]\n" + "str b16, [x26, #0x0]\n" + "82:" // Height 3: Partial direct writeback: Done + "b 84f\n" + "83:" // Height 3: Full writeback "str q23, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q8, [x26, #0x0]\n" - "str q16, [x25, #0x0]\n" - "87:" // Height 3: Writeback done + "str q8, [x27, #0x0]\n" + "str q16, [x26, #0x0]\n" + "84:" // Height 3: Writeback done "subs x11, x11, #0x10\n" - "bgt 60b\n" - "b 176f\n" - "88:" // Height 4 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 58b\n" + "b 170f\n" + "85:" // Height 4 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "89:" // Height 4: Column loop + "86:" // Height 4: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1396,34 +1311,33 @@ void a64_hybrid_s8qs_mmla_6x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "90:" // Height 4: setup done "mov x28, #0x0\n" - "91:" // Height 4: String loop + "88:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" 
"ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 92f\n" + "tbz %x[flags], #3, 89f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" - "cbnz x28, 93f\n" + "cbnz x28, 90f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 93f\n" - "92:" // Height 4: setup direct input + "b 90f\n" + "89:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" - "93:" // Height 4: input setup done + "90:" // Height 4: input setup done "cmp x27, #0x10\n" - "blt 96f\n" + "blt 93f\n" "ldr q1, [x26, #0x0]\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" @@ -1431,13 +1345,13 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q4, [x23, #0x0]\n" "ldr q7, [x10, #0x0]\n" "ldr q6, [x10, #0x10]\n" - "blt 95f\n" - "94:" // Height 4: Multiply loop: Main loop head - "trn1 v27.2d, v1.2d, v2.2d\n" + "blt 92f\n" + "91:" // Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" - "trn1 v26.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" @@ -1446,242 +1360,242 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "ldr q4, [x23, #0x0]\n" - ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" - ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" - "ldr q24, [x10, #0x30]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, 
v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x80]\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x90]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xa0]\n" - ".inst 
0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xb0]\n" - ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xc0]\n" - ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xd0]\n" - ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xe0]\n" - ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xf0]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" + ".inst 0x4e86a477 // 
smmla v23.4s, v3.16b, v6.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" - "bge 94b\n" - "95:" // Height 4: Multiply loop: Single iteration only - "trn1 v27.2d, v1.2d, v2.2d\n" + "bge 91b\n" + "92:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "trn1 v26.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" - ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" - "ldr q24, [x10, #0x30]\n" - ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x80]\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x90]\n" - ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a470 // smmla v16.4s, v3.16b, 
v25.16b\n" - "ldr q25, [x10, #0xa0]\n" - ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xb0]\n" - ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xc0]\n" - ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xd0]\n" - ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" - "ldr q25, [x10, #0xe0]\n" - ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" - "ldr q24, [x10, #0xf0]\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + 
".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" - ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" - ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" - ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" - "96:" // Height 4: Multiply loop: Main loop skip - "cbz x27, 103f\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + "93:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 100f\n" "cmp x27, #0x8\n" - "blt 98f\n" - "97:" // Height 4: Multiply loop: Odd block loop - "ldr d29, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" + "blt 95f\n" + "94:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" "sub x27, x27, #0x8\n" - "ldr d28, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" + "ldr d3, [x24], #0x8\n" + "ldr d4, [x23], #0x8\n" "cmp x27, #0x8\n" - "ldr q25, [x10, #0x0]\n" - "ldr q24, [x10, #0x10]\n" - "trn1 v27.2d, v29.2d, v27.2d\n" - "trn1 v26.2d, v28.2d, v26.2d\n" - ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x30]\n" - ".inst 0x4e99a769 // smmla v9.4s, 
v27.16b, v25.16b\n" - ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "bge 97b\n" - "98:" // Height 4: Multiply loop: Skip odd blocks - "cbz x27, 103f\n" - "tbz x27, #2, 100f\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + 
".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "bge 94b\n" + "95:" // Height 4: Multiply loop: Skip odd blocks + "cbz x27, 100f\n" + "tbz x27, #2, 97f\n" "ldr s1, [x26], #0x4\n" "ldr s2, [x25], #0x4\n" "ldr s3, [x24], #0x4\n" "ldr s4, [x23], #0x4\n" - "tbz x27, #1, 99f\n" + "tbz x27, #1, 96f\n" "ld1 { v1.h }[2], [x26], #0x2\n" "ld1 { v2.h }[2], [x25], #0x2\n" "ld1 { v3.h }[2], [x24], #0x2\n" "ld1 { v4.h }[2], [x23], #0x2\n" - "tbz x27, #0, 102f\n" + "tbz x27, #0, 99f\n" "ld1 { v1.b }[6], [x26]\n" "ld1 { v2.b }[6], [x25]\n" "ld1 { v3.b }[6], [x24]\n" "ld1 { v4.b }[6], [x23]\n" - "b 102f\n" - "99:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 - "tbz x27, #0, 102f\n" + "b 99f\n" + "96:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 99f\n" "ld1 { v1.b }[4], [x26]\n" "ld1 { v2.b }[4], [x25]\n" "ld1 { v3.b }[4], [x24]\n" "ld1 { v4.b }[4], [x23]\n" - "b 102f\n" - "100:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 - "tbz x27, #1, 101f\n" + "b 99f\n" + "97:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 98f\n" "ldr h1, [x26], #0x2\n" "ldr h2, [x25], #0x2\n" "ldr h3, [x24], #0x2\n" "ldr h4, [x23], #0x2\n" - "tbz x27, #0, 102f\n" + "tbz x27, #0, 99f\n" "ld1 { v1.b }[2], [x26]\n" "ld1 { v2.b }[2], [x25]\n" "ld1 { v3.b }[2], [x24]\n" "ld1 { v4.b }[2], [x23]\n" - "b 102f\n" - "101:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 99f\n" + "98:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" "ldr b4, [x23, #0x0]\n" - "102:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q25, [x10, #0x0]\n" - "ldr q24, [x10, #0x10]\n" - "trn1 v27.2d, v1.2d, v2.2d\n" - "trn1 v26.2d, v3.2d, v4.2d\n" - ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x20]\n" - ".inst 0x4e98a76c // smmla v12.4s, v27.16b, 
v24.16b\n" - ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x30]\n" - ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x40]\n" - ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x50]\n" - ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" - "ldr q25, [x10, #0x60]\n" - ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" - "ldr q24, [x10, #0x70]\n" + "99:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" - ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" - ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" - ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" - "103:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4e87a40b // smmla 
v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + "100:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 91b\n" - "ldr q28, [x14, #0x0]\n" - "ldr q27, [x14, #0x10]\n" - "uzp1 v26.2d, v8.2d, v12.2d\n" + "bne 88b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1690,51 +1604,51 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" - "add x14, x14, #0x40\n" + "add x12, x12, #0x40\n" "uzp1 v15.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" - "add x26, x9, x20\n" + "add x27, x9, x20\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x25, x26, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "add x24, x25, x20\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "mov v23.16b, v7.16b\n" "prfm pstl1keep, [x25, #0x0]\n" - "mov v23.16b, v26.16b\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add v12.4s, v12.4s, v27.4s\n" - "add v13.4s, v13.4s, v25.4s\n" - "add v14.4s, v14.4s, v24.4s\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v8.4s, v8.4s, v28.4s\n" - "add v9.4s, v9.4s, v27.4s\n" - "add v10.4s, v10.4s, v25.4s\n" - "add v11.4s, v11.4s, v24.4s\n" - "add v15.4s, v15.4s, v28.4s\n" - "add v20.4s, v20.4s, v27.4s\n" - "add v21.4s, v21.4s, v25.4s\n" - "add v22.4s, 
v22.4s, v24.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v25.4s\n" - "add v19.4s, v19.4s, v24.4s\n" - "tbz %x[flags], #4, 104f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 101f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 105f\n" - "104:" // Height 4: per layer parameters + "add x14, x14, #0x40\n" + "b 102f\n" + "101:" // Height 4: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -1745,86 +1659,36 @@ void a64_hybrid_s8qs_mmla_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "105:" // Height 4: parameters loaded - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v12.4s, v12.4s, v5.4s\n" - "sqrdmulh v13.4s, v13.4s, v6.4s\n" - "sqrdmulh v14.4s, v14.4s, v7.4s\n" - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v15.4s, v15.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v5.4s\n" - "sqrdmulh v21.4s, 
v21.4s, v6.4s\n" - "sqrdmulh v22.4s, v22.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "tbz %x[flags], #5, 106f\n" - "and v27.16b, v23.16b, v0.16b\n" - "and v26.16b, v12.16b, v1.16b\n" - "and v25.16b, v13.16b, v2.16b\n" - "and v24.16b, v14.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v27.4s\n" - "and v27.16b, v8.16b, v0.16b\n" - "sqadd v12.4s, v12.4s, v26.4s\n" - "and v26.16b, v9.16b, v1.16b\n" - "sqadd v13.4s, v13.4s, v25.4s\n" - "sqadd v14.4s, v14.4s, v24.4s\n" - "and v25.16b, v10.16b, v2.16b\n" - "and v24.16b, v11.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "and v27.16b, v15.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v26.4s\n" - "and v26.16b, v20.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v25.4s\n" - "sqadd v11.4s, v11.4s, v24.4s\n" - "and v25.16b, v21.16b, v2.16b\n" - "and v24.16b, v22.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v27.4s\n" - "and v27.16b, v16.16b, v0.16b\n" - "sqadd v20.4s, v20.4s, v26.4s\n" - "and v26.16b, v17.16b, v1.16b\n" - "sqadd v21.4s, v21.4s, v25.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v25.16b, v18.16b, v2.16b\n" - "and v24.16b, v19.16b, v3.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v27.4s\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "sqadd v18.4s, v18.4s, v25.4s\n" - "sqadd v19.4s, v19.4s, v24.4s\n" - "106:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "102:" // Height 4: parameters loaded + 
"sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v12.4s, v12.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v13.4s, v13.4s, v6.4s\n" + "sqdmulh v14.4s, v14.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v15.4s, v15.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v5.4s\n" + "sqdmulh v21.4s, v21.4s, v6.4s\n" + "sqdmulh v22.4s, v22.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v15.4s, v15.4s, v0.4s\n" @@ -1835,175 +1699,175 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "add v12.4s, v12.4s, v26.4s\n" - "add v13.4s, v13.4s, v26.4s\n" - "add v14.4s, v14.4s, v26.4s\n" - "add v8.4s, v8.4s, v26.4s\n" - "add v9.4s, v9.4s, v26.4s\n" - "add v10.4s, v10.4s, v26.4s\n" - "add v11.4s, v11.4s, v26.4s\n" - "add v15.4s, v15.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smin v12.4s, v12.4s, v25.4s\n" - "smin v13.4s, 
v13.4s, v25.4s\n" - "smin v14.4s, v14.4s, v25.4s\n" - "smin v8.4s, v8.4s, v25.4s\n" - "smin v9.4s, v9.4s, v25.4s\n" - "smin v10.4s, v10.4s, v25.4s\n" - "smin v11.4s, v11.4s, v25.4s\n" - "smin v15.4s, v15.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "smax v12.4s, v12.4s, v24.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smax v14.4s, v14.4s, v24.4s\n" - "smax v8.4s, v8.4s, v24.4s\n" - "smax v9.4s, v9.4s, v24.4s\n" - "smax v10.4s, v10.4s, v24.4s\n" - "smax v11.4s, v11.4s, v24.4s\n" - "smax v15.4s, v15.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + 
"smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v12.8h\n" - "uzp1 v25.8h, v13.8h, v14.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v24.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v15.8h, v15.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v25.16b\n" - "uzp1 v8.16b, v8.16b, v24.16b\n" + "uzp1 v23.16b, v23.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v15.16b, v15.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 115f\n" - "tbz x11, #3, 110f\n" + "bge 111f\n" + "tbz x11, #3, 106f\n" "str d23, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d15, [x25], #0x8\n" - "str d16, [x24], #0x8\n" - "tbz x11, #2, 108f\n" + "str d8, [x27], #0x8\n" + "str d15, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "tbz x11, #2, 104f\n" "st1 { v23.s }[2], [x9], #0x4\n" - "st1 { v8.s }[2], [x26], #0x4\n" - "st1 { v15.s }[2], [x25], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "tbz x11, #1, 107f\n" + "st1 { v8.s }[2], [x27], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "tbz x11, #1, 103f\n" "st1 { v23.h }[6], [x9], #0x2\n" - "st1 { v8.h }[6], [x26], #0x2\n" - "st1 { v15.h }[6], [x25], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "tbz x11, #0, 114f\n" + "st1 { v8.h }[6], [x27], #0x2\n" + "st1 { v15.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x25], #0x2\n" + "tbz x11, #0, 110f\n" "st1 { 
v23.b }[14], [x9]\n" - "st1 { v8.b }[14], [x26]\n" - "st1 { v15.b }[14], [x25]\n" - "st1 { v16.b }[14], [x24]\n" - "b 114f\n" - "107:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x11, #0, 114f\n" + "st1 { v8.b }[14], [x27]\n" + "st1 { v15.b }[14], [x26]\n" + "st1 { v16.b }[14], [x25]\n" + "b 110f\n" + "103:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 110f\n" "st1 { v23.b }[12], [x9]\n" - "st1 { v8.b }[12], [x26]\n" - "st1 { v15.b }[12], [x25]\n" - "st1 { v16.b }[12], [x24]\n" - "b 114f\n" - "108:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x11, #1, 109f\n" + "st1 { v8.b }[12], [x27]\n" + "st1 { v15.b }[12], [x26]\n" + "st1 { v16.b }[12], [x25]\n" + "b 110f\n" + "104:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 105f\n" "st1 { v23.h }[4], [x9], #0x2\n" - "st1 { v8.h }[4], [x26], #0x2\n" - "st1 { v15.h }[4], [x25], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "tbz x11, #0, 114f\n" + "st1 { v8.h }[4], [x27], #0x2\n" + "st1 { v15.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], [x25], #0x2\n" + "tbz x11, #0, 110f\n" "st1 { v23.b }[10], [x9]\n" - "st1 { v8.b }[10], [x26]\n" - "st1 { v15.b }[10], [x25]\n" - "st1 { v16.b }[10], [x24]\n" - "b 114f\n" - "109:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x11, #0, 114f\n" + "st1 { v8.b }[10], [x27]\n" + "st1 { v15.b }[10], [x26]\n" + "st1 { v16.b }[10], [x25]\n" + "b 110f\n" + "105:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 110f\n" "st1 { v23.b }[8], [x9]\n" - "st1 { v8.b }[8], [x26]\n" - "st1 { v15.b }[8], [x25]\n" - "st1 { v16.b }[8], [x24]\n" - "b 114f\n" - "110:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x11, #2, 112f\n" + "st1 { v8.b }[8], [x27]\n" + "st1 { v15.b }[8], [x26]\n" + "st1 { v16.b }[8], [x25]\n" + "b 110f\n" + "106:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 108f\n" "str s23, [x9], #0x4\n" - "str s8, [x26], #0x4\n" - "str s15, [x25], #0x4\n" - "str s16, 
[x24], #0x4\n" - "tbz x11, #1, 111f\n" + "str s8, [x27], #0x4\n" + "str s15, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "tbz x11, #1, 107f\n" "st1 { v23.h }[2], [x9], #0x2\n" - "st1 { v8.h }[2], [x26], #0x2\n" - "st1 { v15.h }[2], [x25], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "tbz x11, #0, 114f\n" + "st1 { v8.h }[2], [x27], #0x2\n" + "st1 { v15.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x25], #0x2\n" + "tbz x11, #0, 110f\n" "st1 { v23.b }[6], [x9]\n" - "st1 { v8.b }[6], [x26]\n" - "st1 { v15.b }[6], [x25]\n" - "st1 { v16.b }[6], [x24]\n" - "b 114f\n" - "111:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x11, #0, 114f\n" + "st1 { v8.b }[6], [x27]\n" + "st1 { v15.b }[6], [x26]\n" + "st1 { v16.b }[6], [x25]\n" + "b 110f\n" + "107:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 110f\n" "st1 { v23.b }[4], [x9]\n" - "st1 { v8.b }[4], [x26]\n" - "st1 { v15.b }[4], [x25]\n" - "st1 { v16.b }[4], [x24]\n" - "b 114f\n" - "112:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x11, #1, 113f\n" + "st1 { v8.b }[4], [x27]\n" + "st1 { v15.b }[4], [x26]\n" + "st1 { v16.b }[4], [x25]\n" + "b 110f\n" + "108:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 109f\n" "str h23, [x9], #0x2\n" - "str h8, [x26], #0x2\n" - "str h15, [x25], #0x2\n" - "str h16, [x24], #0x2\n" - "tbz x11, #0, 114f\n" + "str h8, [x27], #0x2\n" + "str h15, [x26], #0x2\n" + "str h16, [x25], #0x2\n" + "tbz x11, #0, 110f\n" "st1 { v23.b }[2], [x9]\n" - "st1 { v8.b }[2], [x26]\n" - "st1 { v15.b }[2], [x25]\n" - "st1 { v16.b }[2], [x24]\n" - "b 114f\n" - "113:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v8.b }[2], [x27]\n" + "st1 { v15.b }[2], [x26]\n" + "st1 { v16.b }[2], [x25]\n" + "b 110f\n" + "109:" // Height 4: Partial direct writeback: partial_1_0 "str b23, [x9, #0x0]\n" - "str b8, [x26, #0x0]\n" - "str b15, [x25, #0x0]\n" - "str b16, [x24, #0x0]\n" - "114:" // Height 4: Partial direct writeback: Done - "b 116f\n" - "115:" 
// Height 4: Full writeback + "str b8, [x27, #0x0]\n" + "str b15, [x26, #0x0]\n" + "str b16, [x25, #0x0]\n" + "110:" // Height 4: Partial direct writeback: Done + "b 112f\n" + "111:" // Height 4: Full writeback "str q23, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q8, [x26, #0x0]\n" - "str q15, [x25, #0x0]\n" - "str q16, [x24, #0x0]\n" - "116:" // Height 4: Writeback done + "str q8, [x27, #0x0]\n" + "str q15, [x26, #0x0]\n" + "str q16, [x25, #0x0]\n" + "112:" // Height 4: Writeback done "subs x11, x11, #0x10\n" - "bgt 89b\n" - "b 176f\n" - "117:" // Height 5 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 86b\n" + "b 170f\n" + "113:" // Height 5 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "118:" // Height 5: Column loop + "114:" // Height 5: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2028,13 +1892,12 @@ void a64_hybrid_s8qs_mmla_6x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "119:" // Height 5: setup done "mov x28, #0x0\n" - "120:" // Height 5: String loop + "116:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 121f\n" + "tbz %x[flags], #3, 117f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -2042,23 +1905,23 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" - "cbnz x28, 122f\n" + "cbnz x28, 118f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, 
x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 122f\n" - "121:" // Height 5: setup direct input + "b 118f\n" + "117:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "122:" // Height 5: input setup done + "118:" // Height 5: input setup done "cmp x27, #0x10\n" - "blt 125f\n" + "blt 121f\n" "ldr q1, [x26, #0x0]\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" @@ -2066,8 +1929,8 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q4, [x23, #0x0]\n" "ldr q5, [x22, #0x0]\n" "ldr q7, [x10, #0x0]\n" - "blt 124f\n" - "123:" // Height 5: Multiply loop: Main loop head + "blt 120f\n" + "119:" // Height 5: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x27, x27, #0x10\n" @@ -2119,45 +1982,45 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" "ldr q2, [x25, #0x0]\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q0, [x10, #0x90]\n" + "ldr q6, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xb0]\n" - ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xd0]\n" - ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, 
v6.16b\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xf0]\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" "ldr q5, [x22, #0x0]\n" - 
"bge 123b\n" - "124:" // Height 5: Multiply loop: Single iteration only + "bge 119b\n" + "120:" // Height 5: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x26, x26, #0x10\n" @@ -2207,184 +2070,184 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q0, [x10, #0x90]\n" + "ldr q6, [x10, #0x90]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q2, [x10, #0xa0]\n" - ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xb0]\n" - ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n" - ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n" - ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n" - "ldr q2, [x10, #0xc0]\n" - ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xd0]\n" - ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n" - ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n" - ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n" - "ldr q2, [x10, #0xe0]\n" - ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xf0]\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // 
smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n" - ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n" - ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n" - ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" - "125:" // Height 5: Multiply loop: Main loop skip - "cbz x27, 132f\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "121:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 128f\n" "cmp x27, #0x8\n" - "blt 127f\n" - "126:" // Height 5: Multiply loop: Odd block loop - "ldr d6, [x26], #0x8\n" - "ldr d4, [x25], #0x8\n" + "blt 123f\n" + "122:" // Height 5: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" "sub x27, x27, #0x8\n" "ldr d3, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" + "ldr d4, [x23], #0x8\n" "cmp x27, #0x8\n" - "ldr d0, [x22], #0x8\n" - "ldr q1, [x10, #0x0]\n" - "trn1 v4.2d, v6.2d, v4.2d\n" - "trn1 v3.2d, v3.2d, v2.2d\n" - "trn1 v2.2d, v0.2d, v5.2d\n" - "ldr q0, [x10, 
#0x10]\n" - ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x20]\n" - ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x30]\n" - ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x40]\n" - ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x50]\n" - ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x60]\n" - ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x70]\n" + "ldr d5, [x22], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, 
v2.16b, v7.16b\n" + ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n" - ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" - "bge 126b\n" - "127:" // Height 5: Multiply loop: Skip odd blocks - "cbz x27, 132f\n" - "tbz x27, #2, 129f\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + "bge 122b\n" + "123:" // Height 5: Multiply loop: Skip odd blocks + "cbz x27, 128f\n" + "tbz x27, #2, 125f\n" "ldr s1, [x26], #0x4\n" "ldr s2, [x25], #0x4\n" "ldr s3, [x24], #0x4\n" "ldr s4, [x23], #0x4\n" "ldr s5, [x22], #0x4\n" - "tbz x27, #1, 128f\n" + "tbz x27, #1, 124f\n" "ld1 { v1.h }[2], [x26], #0x2\n" "ld1 { v2.h }[2], [x25], #0x2\n" "ld1 { v3.h }[2], [x24], #0x2\n" "ld1 { v4.h }[2], [x23], #0x2\n" "ld1 { v5.h }[2], [x22], #0x2\n" - "tbz x27, #0, 131f\n" + "tbz x27, #0, 127f\n" "ld1 { v1.b }[6], [x26]\n" "ld1 { v2.b }[6], [x25]\n" "ld1 { v3.b }[6], [x24]\n" "ld1 { v4.b }[6], [x23]\n" "ld1 { v5.b }[6], [x22]\n" - "b 131f\n" - "128:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 - "tbz 
x27, #0, 131f\n" + "b 127f\n" + "124:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 127f\n" "ld1 { v1.b }[4], [x26]\n" "ld1 { v2.b }[4], [x25]\n" "ld1 { v3.b }[4], [x24]\n" "ld1 { v4.b }[4], [x23]\n" "ld1 { v5.b }[4], [x22]\n" - "b 131f\n" - "129:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 - "tbz x27, #1, 130f\n" + "b 127f\n" + "125:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 126f\n" "ldr h1, [x26], #0x2\n" "ldr h2, [x25], #0x2\n" "ldr h3, [x24], #0x2\n" "ldr h4, [x23], #0x2\n" "ldr h5, [x22], #0x2\n" - "tbz x27, #0, 131f\n" + "tbz x27, #0, 127f\n" "ld1 { v1.b }[2], [x26]\n" "ld1 { v2.b }[2], [x25]\n" "ld1 { v3.b }[2], [x24]\n" "ld1 { v4.b }[2], [x23]\n" "ld1 { v5.b }[2], [x22]\n" - "b 131f\n" - "130:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "b 127f\n" + "126:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" "ldr b4, [x23, #0x0]\n" "ldr b5, [x22, #0x0]\n" - "131:" // Height 5: Multiply loop: Ragged operand read: Done + "127:" // Height 5: Multiply loop: Ragged operand read: Done "ldr q7, [x10, #0x0]\n" - "trn1 v6.2d, v1.2d, v2.2d\n" - "trn1 v3.2d, v3.2d, v4.2d\n" - "trn1 v2.2d, v5.2d, v0.2d\n" - "ldr q0, [x10, #0x10]\n" - ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" - "ldr q1, [x10, #0x20]\n" - ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x30]\n" - ".inst 0x4e81a4c9 // smmla v9.4s, v6.16b, v1.16b\n" - ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x40]\n" - ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, 
v3.16b, v0.16b\n" - ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x50]\n" - ".inst 0x4e81a4ca // smmla v10.4s, v6.16b, v1.16b\n" - ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x60]\n" - ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x70]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e81a4cb // smmla v11.4s, v6.16b, v1.16b\n" - ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n" - ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n" - 
".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" - "132:" // Height 5: Multiply loop: No odd multiplies + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "128:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 120b\n" - "ldr q4, [x14, #0x0]\n" - "ldr q3, [x14, #0x10]\n" - "uzp1 v2.2d, v8.2d, v12.2d\n" + "bne 116b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q1, [x14, #0x20]\n" - "ldr q0, [x14, #0x30]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -2393,61 +2256,61 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" - "add x14, x14, #0x40\n" + "add x12, x12, #0x40\n" "uzp1 v15.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" - "add x26, x9, x20\n" + "add x27, x9, x20\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" - "add x25, x26, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x25, x20\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" + "add x24, x25, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" "uzp1 v24.2d, v24.2d, v28.2d\n" "uzp1 v25.2d, 
v25.2d, v29.2d\n" - "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x24, #0x0]\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v2.16b\n" - "add v12.4s, v12.4s, v3.4s\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v0.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v3.4s\n" - "add v10.4s, v10.4s, v1.4s\n" - "add v11.4s, v11.4s, v0.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v0.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v1.4s\n" - "add v19.4s, v19.4s, v0.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v1.4s\n" - "add v27.4s, v27.4s, v0.4s\n" - "tbz %x[flags], #4, 133f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "mov v31.16b, v7.16b\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 129f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, 
x13, #0x40\n" - "b 134f\n" - "133:" // Height 5: per layer parameters + "add x14, x14, #0x40\n" + "b 130f\n" + "129:" // Height 5: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -2458,102 +2321,40 @@ void a64_hybrid_s8qs_mmla_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "134:" // Height 5: parameters loaded - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "sqrdmulh v12.4s, v12.4s, v5.4s\n" - "sqrdmulh v13.4s, v13.4s, v6.4s\n" - "sqrdmulh v14.4s, v14.4s, v7.4s\n" - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v15.4s, v15.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v5.4s\n" - "sqrdmulh v21.4s, v21.4s, v6.4s\n" - "sqrdmulh v22.4s, v22.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v5.4s\n" - "sqrdmulh v26.4s, v26.4s, v6.4s\n" - "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "tbz %x[flags], #5, 135f\n" - "and v30.16b, v31.16b, v0.16b\n" - "and v29.16b, v12.16b, v1.16b\n" - "and v28.16b, v13.16b, v2.16b\n" - "and v23.16b, v14.16b, v3.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v30.4s\n" - "and v30.16b, v8.16b, v0.16b\n" - "sqadd v12.4s, v12.4s, v29.4s\n" - "and v29.16b, v9.16b, v1.16b\n" - "sqadd v13.4s, v13.4s, v28.4s\n" - "sqadd v14.4s, v14.4s, v23.4s\n" - "and v28.16b, v10.16b, v2.16b\n" - "and v23.16b, v11.16b, v3.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v30.4s\n" - "and v30.16b, v15.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v29.4s\n" - "and v29.16b, 
v20.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v28.4s\n" - "sqadd v11.4s, v11.4s, v23.4s\n" - "and v28.16b, v21.16b, v2.16b\n" - "and v23.16b, v22.16b, v3.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v30.4s\n" - "and v30.16b, v16.16b, v0.16b\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "and v29.16b, v17.16b, v1.16b\n" - "sqadd v21.4s, v21.4s, v28.4s\n" - "sqadd v22.4s, v22.4s, v23.4s\n" - "and v28.16b, v18.16b, v2.16b\n" - "and v23.16b, v19.16b, v3.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v30.4s\n" - "and v30.16b, v24.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v29.4s\n" - "and v29.16b, v25.16b, v1.16b\n" - "sqadd v18.4s, v18.4s, v28.4s\n" - "sqadd v19.4s, v19.4s, v23.4s\n" - "and v28.16b, v26.16b, v2.16b\n" - "and v23.16b, v27.16b, v3.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v30.4s\n" - "sqadd v25.4s, v25.4s, v29.4s\n" - "sqadd v26.4s, v26.4s, v28.4s\n" - "sqadd v27.4s, v27.4s, v23.4s\n" - "135:" // Height 5: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "130:" // Height 5: parameters loaded + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v12.4s, v12.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v13.4s, v13.4s, v6.4s\n" + "sqdmulh v14.4s, v14.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v15.4s, v15.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v5.4s\n" + "sqdmulh v21.4s, v21.4s, v6.4s\n" + "sqdmulh v22.4s, v22.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + 
"sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v25.4s, v25.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v26.4s, v26.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v27.4s, v27.4s, v7.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v29.4s }, [x21]\n" - "ld1r { v28.4s }, [x20]\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v23.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v15.4s, v15.4s, v0.4s\n" @@ -2568,210 +2369,210 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v25.4s, v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add v31.4s, v31.4s, v29.4s\n" - "add v12.4s, v12.4s, v29.4s\n" - "add v13.4s, v13.4s, v29.4s\n" - "add v14.4s, v14.4s, v29.4s\n" - "add v8.4s, v8.4s, v29.4s\n" - "add v9.4s, v9.4s, v29.4s\n" - "add v10.4s, v10.4s, v29.4s\n" - "add v11.4s, v11.4s, v29.4s\n" - "add v15.4s, v15.4s, v29.4s\n" - "add v20.4s, v20.4s, v29.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v16.4s, v16.4s, v29.4s\n" - "add v17.4s, v17.4s, v29.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v29.4s\n" - "add v24.4s, v24.4s, v29.4s\n" - "add v25.4s, v25.4s, v29.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "smin v31.4s, v31.4s, v28.4s\n" - "smin v12.4s, v12.4s, v28.4s\n" - "smin v13.4s, v13.4s, v28.4s\n" - "smin v14.4s, v14.4s, v28.4s\n" - "smin v8.4s, v8.4s, v28.4s\n" - "smin v9.4s, v9.4s, v28.4s\n" - "smin v10.4s, v10.4s, v28.4s\n" - "smin v11.4s, v11.4s, v28.4s\n" - "smin v15.4s, v15.4s, v28.4s\n" - "smin v20.4s, v20.4s, v28.4s\n" - "smin v21.4s, v21.4s, v28.4s\n" - "smin v22.4s, v22.4s, v28.4s\n" - "smin v16.4s, v16.4s, 
v28.4s\n" - "smin v17.4s, v17.4s, v28.4s\n" - "smin v18.4s, v18.4s, v28.4s\n" - "smin v19.4s, v19.4s, v28.4s\n" - "smin v24.4s, v24.4s, v28.4s\n" - "smin v25.4s, v25.4s, v28.4s\n" - "smin v26.4s, v26.4s, v28.4s\n" - "smin v27.4s, v27.4s, v28.4s\n" - "smax v31.4s, v31.4s, v23.4s\n" - "smax v12.4s, v12.4s, v23.4s\n" - "smax v13.4s, v13.4s, v23.4s\n" - "smax v14.4s, v14.4s, v23.4s\n" - "smax v8.4s, v8.4s, v23.4s\n" - "smax v9.4s, v9.4s, v23.4s\n" - "smax v10.4s, v10.4s, v23.4s\n" - "smax v11.4s, v11.4s, v23.4s\n" - "smax v15.4s, v15.4s, v23.4s\n" - "smax v20.4s, v20.4s, v23.4s\n" - "smax v21.4s, v21.4s, v23.4s\n" - "smax v22.4s, v22.4s, v23.4s\n" - "smax v16.4s, v16.4s, v23.4s\n" - "smax v17.4s, v17.4s, v23.4s\n" - "smax v18.4s, v18.4s, v23.4s\n" - "smax v19.4s, v19.4s, v23.4s\n" - "smax v24.4s, v24.4s, v23.4s\n" - "smax v25.4s, v25.4s, v23.4s\n" - "smax v26.4s, v26.4s, v23.4s\n" - "smax v27.4s, v27.4s, v23.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin 
v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v12.8h\n" - "uzp1 v28.8h, v13.8h, v14.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v23.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v15.8h, v15.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v31.16b, v31.16b, v28.16b\n" - "uzp1 v8.16b, v8.16b, v23.16b\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v15.16b, v15.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 144f\n" - "tbz x11, #3, 139f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 139f\n" + "tbz x11, #3, 134f\n" "str d31, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d15, [x25], #0x8\n" - "str d16, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x11, #2, 137f\n" + "str d8, [x27], #0x8\n" + "str d15, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "str d24, [x24], 
#0x8\n" + "tbz x11, #2, 132f\n" "st1 { v31.s }[2], [x9], #0x4\n" - "st1 { v8.s }[2], [x26], #0x4\n" - "st1 { v15.s }[2], [x25], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x11, #1, 136f\n" + "st1 { v8.s }[2], [x27], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x24], #0x4\n" + "tbz x11, #1, 131f\n" "st1 { v31.h }[6], [x9], #0x2\n" - "st1 { v8.h }[6], [x26], #0x2\n" - "st1 { v15.h }[6], [x25], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x11, #0, 143f\n" + "st1 { v8.h }[6], [x27], #0x2\n" + "st1 { v15.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x24], #0x2\n" + "tbz x11, #0, 138f\n" "st1 { v31.b }[14], [x9]\n" - "st1 { v8.b }[14], [x26]\n" - "st1 { v15.b }[14], [x25]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 143f\n" - "136:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x11, #0, 143f\n" + "st1 { v8.b }[14], [x27]\n" + "st1 { v15.b }[14], [x26]\n" + "st1 { v16.b }[14], [x25]\n" + "st1 { v24.b }[14], [x24]\n" + "b 138f\n" + "131:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 138f\n" "st1 { v31.b }[12], [x9]\n" - "st1 { v8.b }[12], [x26]\n" - "st1 { v15.b }[12], [x25]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 143f\n" - "137:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x11, #1, 138f\n" + "st1 { v8.b }[12], [x27]\n" + "st1 { v15.b }[12], [x26]\n" + "st1 { v16.b }[12], [x25]\n" + "st1 { v24.b }[12], [x24]\n" + "b 138f\n" + "132:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 133f\n" "st1 { v31.h }[4], [x9], #0x2\n" - "st1 { v8.h }[4], [x26], #0x2\n" - "st1 { v15.h }[4], [x25], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x11, #0, 143f\n" + "st1 { v8.h }[4], [x27], #0x2\n" + "st1 { v15.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], 
[x25], #0x2\n" + "st1 { v24.h }[4], [x24], #0x2\n" + "tbz x11, #0, 138f\n" "st1 { v31.b }[10], [x9]\n" - "st1 { v8.b }[10], [x26]\n" - "st1 { v15.b }[10], [x25]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 143f\n" - "138:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x11, #0, 143f\n" + "st1 { v8.b }[10], [x27]\n" + "st1 { v15.b }[10], [x26]\n" + "st1 { v16.b }[10], [x25]\n" + "st1 { v24.b }[10], [x24]\n" + "b 138f\n" + "133:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 138f\n" "st1 { v31.b }[8], [x9]\n" - "st1 { v8.b }[8], [x26]\n" - "st1 { v15.b }[8], [x25]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 143f\n" - "139:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x11, #2, 141f\n" + "st1 { v8.b }[8], [x27]\n" + "st1 { v15.b }[8], [x26]\n" + "st1 { v16.b }[8], [x25]\n" + "st1 { v24.b }[8], [x24]\n" + "b 138f\n" + "134:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 136f\n" "str s31, [x9], #0x4\n" - "str s8, [x26], #0x4\n" - "str s15, [x25], #0x4\n" - "str s16, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x11, #1, 140f\n" + "str s8, [x27], #0x4\n" + "str s15, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "str s24, [x24], #0x4\n" + "tbz x11, #1, 135f\n" "st1 { v31.h }[2], [x9], #0x2\n" - "st1 { v8.h }[2], [x26], #0x2\n" - "st1 { v15.h }[2], [x25], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x11, #0, 143f\n" + "st1 { v8.h }[2], [x27], #0x2\n" + "st1 { v15.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x24], #0x2\n" + "tbz x11, #0, 138f\n" "st1 { v31.b }[6], [x9]\n" - "st1 { v8.b }[6], [x26]\n" - "st1 { v15.b }[6], [x25]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 143f\n" - "140:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x11, #0, 143f\n" + "st1 { v8.b }[6], [x27]\n" + "st1 { v15.b }[6], [x26]\n" + "st1 { v16.b }[6], [x25]\n" + "st1 { 
v24.b }[6], [x24]\n" + "b 138f\n" + "135:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 138f\n" "st1 { v31.b }[4], [x9]\n" - "st1 { v8.b }[4], [x26]\n" - "st1 { v15.b }[4], [x25]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 143f\n" - "141:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x11, #1, 142f\n" + "st1 { v8.b }[4], [x27]\n" + "st1 { v15.b }[4], [x26]\n" + "st1 { v16.b }[4], [x25]\n" + "st1 { v24.b }[4], [x24]\n" + "b 138f\n" + "136:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 137f\n" "str h31, [x9], #0x2\n" - "str h8, [x26], #0x2\n" - "str h15, [x25], #0x2\n" - "str h16, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x11, #0, 143f\n" + "str h8, [x27], #0x2\n" + "str h15, [x26], #0x2\n" + "str h16, [x25], #0x2\n" + "str h24, [x24], #0x2\n" + "tbz x11, #0, 138f\n" "st1 { v31.b }[2], [x9]\n" - "st1 { v8.b }[2], [x26]\n" - "st1 { v15.b }[2], [x25]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 143f\n" - "142:" // Height 5: Partial direct writeback: partial_1_0 + "st1 { v8.b }[2], [x27]\n" + "st1 { v15.b }[2], [x26]\n" + "st1 { v16.b }[2], [x25]\n" + "st1 { v24.b }[2], [x24]\n" + "b 138f\n" + "137:" // Height 5: Partial direct writeback: partial_1_0 "str b31, [x9, #0x0]\n" - "str b8, [x26, #0x0]\n" - "str b15, [x25, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "143:" // Height 5: Partial direct writeback: Done - "b 145f\n" - "144:" // Height 5: Full writeback + "str b8, [x27, #0x0]\n" + "str b15, [x26, #0x0]\n" + "str b16, [x25, #0x0]\n" + "str b24, [x24, #0x0]\n" + "138:" // Height 5: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 5: Full writeback "str q31, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q8, [x26, #0x0]\n" - "str q15, [x25, #0x0]\n" - "str q16, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "145:" // Height 5: Writeback done + "str q8, [x27, #0x0]\n" + "str q15, [x26, #0x0]\n" + "str q16, [x25, #0x0]\n" + "str q24, 
[x24, #0x0]\n" + "140:" // Height 5: Writeback done "subs x11, x11, #0x10\n" - "bgt 118b\n" - "b 176f\n" - "146:" // Height 6 + "bgt 114b\n" + "b 170f\n" + "141:" // Height 6 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x6\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "madd x20, x21, x20, x9\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "147:" // Height 6: Column loop + "142:" // Height 6: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2796,13 +2597,12 @@ void a64_hybrid_s8qs_mmla_6x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "148:" // Height 6: setup done "mov x28, #0x0\n" - "149:" // Height 6: String loop + "144:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 150f\n" + "tbz %x[flags], #3, 145f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -2811,7 +2611,7 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" "ldr x21, [x20, #0x28]\n" - "cbnz x28, 151f\n" + "cbnz x28, 146f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" @@ -2819,17 +2619,17 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 151f\n" - "150:" // Height 6: setup direct input + "b 146f\n" + "145:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, 
x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "151:" // Height 6: input setup done + "146:" // Height 6: input setup done "cmp x27, #0x10\n" - "blt 154f\n" + "blt 149f\n" "ldr q1, [x26, #0x0]\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" @@ -2838,8 +2638,8 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" "ldr q7, [x10, #0x0]\n" - "blt 153f\n" - "152:" // Height 6: Multiply loop: Main loop head + "blt 148f\n" + "147:" // Height 6: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x27, x27, #0x10\n" @@ -2893,46 +2693,46 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" "ldr q2, [x25, #0x0]\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q0, [x10, #0x90]\n" + "ldr q6, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xb0]\n" - ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xd0]\n" - ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" - "ldr 
q0, [x10, #0xf0]\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" - "bge 152b\n" - "153:" // Height 6: Multiply loop: Single iteration only + "bge 147b\n" + "148:" // Height 6: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 
v1.2d, v1.2d, v2.2d\n" "add x26, x26, #0x10\n" @@ -2984,192 +2784,192 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q0, [x10, #0x90]\n" + "ldr q6, [x10, #0x90]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q2, [x10, #0xa0]\n" - ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xb0]\n" - ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n" - ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n" - ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n" - "ldr q2, [x10, #0xc0]\n" - ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xd0]\n" - ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n" - ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n" - ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n" - "ldr q2, [x10, #0xe0]\n" - ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" - "ldr q0, [x10, #0xf0]\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" + ".inst 
0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n" - ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n" - ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n" - ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" - ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" - "154:" // Height 6: Multiply loop: Main loop skip - "cbz x27, 161f\n" + ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + "149:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 156f\n" "cmp x27, #0x8\n" - "blt 156f\n" - "155:" // Height 6: Multiply loop: Odd block loop - "ldr d6, [x26], #0x8\n" - "ldr d4, [x25], #0x8\n" + "blt 151f\n" + "150:" // Height 6: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" "sub x27, x27, #0x8\n" - "ldr d5, [x24], #0x8\n" - "ldr d3, [x23], #0x8\n" + "ldr d3, [x24], #0x8\n" + "ldr d4, [x23], #0x8\n" "cmp x27, #0x8\n" - "ldr d2, [x22], #0x8\n" - "ldr d0, [x21], #0x8\n" - "ldr q1, [x10, #0x0]\n" - "trn1 v4.2d, v6.2d, v4.2d\n" - "trn1 v3.2d, v5.2d, v3.2d\n" - "trn1 v2.2d, v2.2d, v0.2d\n" - "ldr q0, [x10, #0x10]\n" - ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n" - 
".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x20]\n" - ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x30]\n" - ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x40]\n" - ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x50]\n" - ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" - "ldr q1, [x10, #0x60]\n" - ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" - "ldr q0, [x10, #0x70]\n" + "ldr d5, [x22], #0x8\n" + "ldr d7, [x21], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, 
#0x50]\n" + ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n" - ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n" - ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" - ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" - "bge 155b\n" - "156:" // Height 6: Multiply loop: Skip odd blocks - "cbz x27, 161f\n" - "tbz x27, #2, 158f\n" + ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" + ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + "bge 150b\n" + "151:" // Height 6: Multiply loop: Skip odd blocks + "cbz x27, 156f\n" + "tbz x27, #2, 153f\n" "ldr s1, [x26], #0x4\n" "ldr s2, [x25], #0x4\n" "ldr s3, [x24], #0x4\n" "ldr s4, [x23], #0x4\n" "ldr s5, [x22], #0x4\n" "ldr s6, [x21], #0x4\n" - "tbz x27, #1, 157f\n" + "tbz x27, #1, 152f\n" "ld1 { v1.h }[2], [x26], #0x2\n" "ld1 { v2.h }[2], [x25], #0x2\n" "ld1 { v3.h }[2], [x24], #0x2\n" "ld1 { v4.h }[2], [x23], #0x2\n" "ld1 { v5.h }[2], [x22], #0x2\n" "ld1 { v6.h }[2], [x21], #0x2\n" - "tbz x27, #0, 160f\n" + "tbz x27, #0, 155f\n" "ld1 { v1.b }[6], [x26]\n" "ld1 { v2.b }[6], [x25]\n" "ld1 { v3.b }[6], [x24]\n" "ld1 { v4.b }[6], [x23]\n" "ld1 { v5.b }[6], [x22]\n" "ld1 { v6.b }[6], [x21]\n" - "b 160f\n" - "157:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 - "tbz x27, 
#0, 160f\n" + "b 155f\n" + "152:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 155f\n" "ld1 { v1.b }[4], [x26]\n" "ld1 { v2.b }[4], [x25]\n" "ld1 { v3.b }[4], [x24]\n" "ld1 { v4.b }[4], [x23]\n" "ld1 { v5.b }[4], [x22]\n" "ld1 { v6.b }[4], [x21]\n" - "b 160f\n" - "158:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 - "tbz x27, #1, 159f\n" + "b 155f\n" + "153:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 154f\n" "ldr h1, [x26], #0x2\n" "ldr h2, [x25], #0x2\n" "ldr h3, [x24], #0x2\n" "ldr h4, [x23], #0x2\n" "ldr h5, [x22], #0x2\n" "ldr h6, [x21], #0x2\n" - "tbz x27, #0, 160f\n" + "tbz x27, #0, 155f\n" "ld1 { v1.b }[2], [x26]\n" "ld1 { v2.b }[2], [x25]\n" "ld1 { v3.b }[2], [x24]\n" "ld1 { v4.b }[2], [x23]\n" "ld1 { v5.b }[2], [x22]\n" "ld1 { v6.b }[2], [x21]\n" - "b 160f\n" - "159:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "b 155f\n" + "154:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" "ldr b4, [x23, #0x0]\n" "ldr b5, [x22, #0x0]\n" "ldr b6, [x21, #0x0]\n" - "160:" // Height 6: Multiply loop: Ragged operand read: Done + "155:" // Height 6: Multiply loop: Ragged operand read: Done "ldr q7, [x10, #0x0]\n" - "trn1 v2.2d, v1.2d, v2.2d\n" - "trn1 v4.2d, v3.2d, v4.2d\n" - "trn1 v3.2d, v5.2d, v6.2d\n" - "ldr q0, [x10, #0x10]\n" - ".inst 0x4e87a448 // smmla v8.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a490 // smmla v16.4s, v4.16b, v7.16b\n" - ".inst 0x4e87a478 // smmla v24.4s, v3.16b, v7.16b\n" - "ldr q1, [x10, #0x20]\n" - ".inst 0x4e80a44c // smmla v12.4s, v2.16b, v0.16b\n" - ".inst 0x4e80a494 // smmla v20.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a47c // smmla v28.4s, v3.16b, v0.16b\n" - "ldr q0, [x10, #0x30]\n" - ".inst 0x4e81a449 // smmla v9.4s, v2.16b, v1.16b\n" - ".inst 0x4e81a491 // smmla v17.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a479 // smmla v25.4s, v3.16b, v1.16b\n" - "ldr q1, [x10, 
#0x40]\n" - ".inst 0x4e80a44d // smmla v13.4s, v2.16b, v0.16b\n" - ".inst 0x4e80a495 // smmla v21.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a47d // smmla v29.4s, v3.16b, v0.16b\n" - "ldr q0, [x10, #0x50]\n" - ".inst 0x4e81a44a // smmla v10.4s, v2.16b, v1.16b\n" - ".inst 0x4e81a492 // smmla v18.4s, v4.16b, v1.16b\n" - ".inst 0x4e81a47a // smmla v26.4s, v3.16b, v1.16b\n" - "ldr q1, [x10, #0x60]\n" - ".inst 0x4e80a44e // smmla v14.4s, v2.16b, v0.16b\n" - ".inst 0x4e80a496 // smmla v22.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a47e // smmla v30.4s, v3.16b, v0.16b\n" - "ldr q0, [x10, #0x70]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e81a44b // smmla v11.4s, v2.16b, v1.16b\n" - ".inst 0x4e81a493 // smmla v19.4s, v4.16b, v1.16b\n" - ".inst 
0x4e81a47b // smmla v27.4s, v3.16b, v1.16b\n" - ".inst 0x4e80a44f // smmla v15.4s, v2.16b, v0.16b\n" - ".inst 0x4e80a497 // smmla v23.4s, v4.16b, v0.16b\n" - ".inst 0x4e80a47f // smmla v31.4s, v3.16b, v0.16b\n" - "161:" // Height 6: Multiply loop: No odd multiplies + ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + "156:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 149b\n" - "ldr q4, [x14, #0x0]\n" - "ldr q3, [x14, #0x10]\n" - "uzp1 v2.2d, v8.2d, v12.2d\n" + "bne 144b\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q1, [x14, #0x20]\n" - "ldr q0, [x14, #0x30]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q3, [x12, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -3178,71 +2978,71 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" - "add x14, x14, #0x40\n" + "add x12, x12, #0x40\n" "uzp1 v15.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" - "add x24, x25, x20\n" - "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "add x22, 
x23, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" "uzp1 v23.2d, v24.2d, v28.2d\n" "uzp2 v24.2d, v24.2d, v28.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" "uzp1 v28.2d, v25.2d, v29.2d\n" "uzp2 v25.2d, v25.2d, v29.2d\n" "uzp1 v29.2d, v26.2d, v30.2d\n" "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v2.16b\n" - "add v12.4s, v12.4s, v3.4s\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v0.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v3.4s\n" - "add v10.4s, v10.4s, v1.4s\n" - "add v11.4s, v11.4s, v0.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v0.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v1.4s\n" - "add v19.4s, v19.4s, v0.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v0.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v1.4s\n" - "add v27.4s, v27.4s, v0.4s\n" - "tbz %x[flags], #4, 162f\n" - "ldr q0, [x12, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q1, [x12, #0x10]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q2, [x12, #0x20]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q3, [x12, #0x30]\n" - "ldr q7, [x13, #0x30]\n" - "add x12, x12, #0x40\n" + "mov v31.16b, v7.16b\n" + "add v12.4s, v12.4s, v1.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v3.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add 
v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 157f\n" + "ldr q0, [x13, #0x0]\n" + "ldr q4, [x14, #0x0]\n" + "ldr q1, [x13, #0x10]\n" + "ldr q5, [x14, #0x10]\n" + "ldr q2, [x13, #0x20]\n" + "ldr q6, [x14, #0x20]\n" + "ldr q3, [x13, #0x30]\n" + "ldr q7, [x14, #0x30]\n" "add x13, x13, #0x40\n" - "b 163f\n" - "162:" // Height 6: per layer parameters + "add x14, x14, #0x40\n" + "b 158f\n" + "157:" // Height 6: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1r { v0.4s }, [x21]\n" @@ -3253,118 +3053,44 @@ void a64_hybrid_s8qs_mmla_6x16 ( "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" - "163:" // Height 6: parameters loaded - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "sqrdmulh v12.4s, v12.4s, v5.4s\n" - "sqrdmulh v13.4s, v13.4s, v6.4s\n" - "sqrdmulh v14.4s, v14.4s, v7.4s\n" - "sqrdmulh v8.4s, v8.4s, v4.4s\n" - "sqrdmulh v9.4s, v9.4s, v5.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "sqrdmulh v15.4s, v15.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v5.4s\n" - "sqrdmulh v21.4s, v21.4s, v6.4s\n" - "sqrdmulh v22.4s, v22.4s, v7.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v5.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v5.4s\n" - "sqrdmulh v29.4s, v29.4s, v6.4s\n" - "sqrdmulh v30.4s, v30.4s, v7.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v5.4s\n" - "sqrdmulh v26.4s, v26.4s, v6.4s\n" - "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "tbz %x[flags], #5, 164f\n" - "and v7.16b, v31.16b, v0.16b\n" - "and v6.16b, v12.16b, v1.16b\n" - "and v5.16b, 
v13.16b, v2.16b\n" - "and v4.16b, v14.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v7.4s\n" - "and v7.16b, v8.16b, v0.16b\n" - "sqadd v12.4s, v12.4s, v6.4s\n" - "and v6.16b, v9.16b, v1.16b\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v4.4s\n" - "and v5.16b, v10.16b, v2.16b\n" - "and v4.16b, v11.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v7.4s\n" - "and v7.16b, v15.16b, v0.16b\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "and v6.16b, v20.16b, v1.16b\n" - "sqadd v10.4s, v10.4s, v5.4s\n" - "sqadd v11.4s, v11.4s, v4.4s\n" - "and v5.16b, v21.16b, v2.16b\n" - "and v4.16b, v22.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v7.16b, v16.16b, v0.16b\n" - "sqadd v20.4s, v20.4s, v6.4s\n" - "and v6.16b, v17.16b, v1.16b\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v4.4s\n" - "and v5.16b, v18.16b, v2.16b\n" - "and v4.16b, v19.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v7.4s\n" - "and v7.16b, v23.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v6.4s\n" - "and v6.16b, v28.16b, v1.16b\n" - "sqadd v18.4s, v18.4s, v5.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "and v5.16b, v29.16b, v2.16b\n" - "and v4.16b, v30.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v7.4s\n" - "and v7.16b, v24.16b, v0.16b\n" - "sqadd v28.4s, v28.4s, v6.4s\n" - "and v6.16b, v25.16b, v1.16b\n" - "sqadd v29.4s, v29.4s, v5.4s\n" - "sqadd v30.4s, v30.4s, v4.4s\n" - "and v5.16b, v26.16b, v2.16b\n" - "and 
v4.16b, v27.16b, v3.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v7.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v5.4s\n" - "sqadd v27.4s, v27.4s, v4.4s\n" - "164:" // Height 6: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "158:" // Height 6: parameters loaded + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v12.4s, v12.4s, v5.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + "sqdmulh v13.4s, v13.4s, v6.4s\n" + "sqdmulh v14.4s, v14.4s, v7.4s\n" + "add x20, %x[qp], %[minval]\n" + "cmp x11, #0x10\n" + "sqdmulh v8.4s, v8.4s, v4.4s\n" + "sqdmulh v9.4s, v9.4s, v5.4s\n" + "sqdmulh v10.4s, v10.4s, v6.4s\n" + "sqdmulh v11.4s, v11.4s, v7.4s\n" + "sqdmulh v15.4s, v15.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v5.4s\n" + "sqdmulh v21.4s, v21.4s, v6.4s\n" + "sqdmulh v22.4s, v22.4s, v7.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v5.4s\n" + "sqdmulh v18.4s, v18.4s, v6.4s\n" + "sqdmulh v19.4s, v19.4s, v7.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v5.4s\n" + "sqdmulh v29.4s, v29.4s, v6.4s\n" + "sqdmulh v30.4s, v30.4s, v7.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" + "sqdmulh v25.4s, v25.4s, v5.4s\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v26.4s, v26.4s, v6.4s\n" + "ld1r { v6.4s }, [x21]\n" + "sqdmulh v27.4s, v27.4s, v7.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x21]\n" - "ld1r { v5.4s }, [x20]\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x11, #0x10\n" - "ld1r { v4.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "srshl v15.4s, v15.4s, v0.4s\n" @@ -3383,240 +3109,240 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v25.4s, 
v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add v31.4s, v31.4s, v6.4s\n" - "add v12.4s, v12.4s, v6.4s\n" - "add v13.4s, v13.4s, v6.4s\n" - "add v14.4s, v14.4s, v6.4s\n" - "add v8.4s, v8.4s, v6.4s\n" - "add v9.4s, v9.4s, v6.4s\n" - "add v10.4s, v10.4s, v6.4s\n" - "add v11.4s, v11.4s, v6.4s\n" - "add v15.4s, v15.4s, v6.4s\n" - "add v20.4s, v20.4s, v6.4s\n" - "add v21.4s, v21.4s, v6.4s\n" - "add v22.4s, v22.4s, v6.4s\n" - "add v16.4s, v16.4s, v6.4s\n" - "add v17.4s, v17.4s, v6.4s\n" - "add v18.4s, v18.4s, v6.4s\n" - "add v19.4s, v19.4s, v6.4s\n" - "add v23.4s, v23.4s, v6.4s\n" - "add v28.4s, v28.4s, v6.4s\n" - "add v29.4s, v29.4s, v6.4s\n" - "add v30.4s, v30.4s, v6.4s\n" - "add v24.4s, v24.4s, v6.4s\n" - "add v25.4s, v25.4s, v6.4s\n" - "add v26.4s, v26.4s, v6.4s\n" - "add v27.4s, v27.4s, v6.4s\n" - "smin v31.4s, v31.4s, v5.4s\n" - "smin v12.4s, v12.4s, v5.4s\n" - "smin v13.4s, v13.4s, v5.4s\n" - "smin v14.4s, v14.4s, v5.4s\n" - "smin v8.4s, v8.4s, v5.4s\n" - "smin v9.4s, v9.4s, v5.4s\n" - "smin v10.4s, v10.4s, v5.4s\n" - "smin v11.4s, v11.4s, v5.4s\n" - "smin v15.4s, v15.4s, v5.4s\n" - "smin v20.4s, v20.4s, v5.4s\n" - "smin v21.4s, v21.4s, v5.4s\n" - "smin v22.4s, v22.4s, v5.4s\n" - "smin v16.4s, v16.4s, v5.4s\n" - "smin v17.4s, v17.4s, v5.4s\n" - "smin v18.4s, v18.4s, v5.4s\n" - "smin v19.4s, v19.4s, v5.4s\n" - "smin v23.4s, v23.4s, v5.4s\n" - "smin v28.4s, v28.4s, v5.4s\n" - "smin v29.4s, v29.4s, v5.4s\n" - "smin v30.4s, v30.4s, v5.4s\n" - "smin v24.4s, v24.4s, v5.4s\n" - "smin v25.4s, v25.4s, v5.4s\n" - "smin v26.4s, v26.4s, v5.4s\n" - "smin v27.4s, v27.4s, v5.4s\n" - "smax v31.4s, v31.4s, v4.4s\n" - "smax v12.4s, v12.4s, v4.4s\n" - "smax v13.4s, v13.4s, v4.4s\n" - "smax v14.4s, v14.4s, v4.4s\n" - "smax v8.4s, v8.4s, v4.4s\n" - "smax v9.4s, v9.4s, v4.4s\n" - "smax v10.4s, v10.4s, v4.4s\n" - "smax v11.4s, v11.4s, v4.4s\n" - "smax v15.4s, v15.4s, v4.4s\n" - "smax v20.4s, v20.4s, v4.4s\n" - "smax v21.4s, v21.4s, v4.4s\n" - "smax 
v22.4s, v22.4s, v4.4s\n" - "smax v16.4s, v16.4s, v4.4s\n" - "smax v17.4s, v17.4s, v4.4s\n" - "smax v18.4s, v18.4s, v4.4s\n" - "smax v19.4s, v19.4s, v4.4s\n" - "smax v23.4s, v23.4s, v4.4s\n" - "smax v28.4s, v28.4s, v4.4s\n" - "smax v29.4s, v29.4s, v4.4s\n" - "smax v30.4s, v30.4s, v4.4s\n" - "smax v24.4s, v24.4s, v4.4s\n" - "smax v25.4s, v25.4s, v4.4s\n" - "smax v26.4s, v26.4s, v4.4s\n" - "smax v27.4s, v27.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, 
v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v12.8h\n" - "uzp1 v1.8h, v13.8h, v14.8h\n" + "uzp1 v12.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v0.8h, v10.8h, v11.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v15.8h, v15.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v18.8h, v29.8h, v30.8h\n" + "uzp1 v28.8h, v29.8h, v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v31.16b, v31.16b, v1.16b\n" - "uzp1 v8.16b, v8.16b, v0.16b\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v12.16b\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v15.16b, v15.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v23.16b, v23.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 173f\n" - "tbz x11, #3, 168f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 167f\n" + "tbz x11, #3, 162f\n" "str d31, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d15, [x25], #0x8\n" - "str d16, [x24], #0x8\n" - "str d23, [x23], #0x8\n" - "str d24, [x22], #0x8\n" - "tbz x11, #2, 166f\n" + "str d8, [x27], 
#0x8\n" + "str d15, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "str d23, [x24], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x11, #2, 160f\n" "st1 { v31.s }[2], [x9], #0x4\n" - "st1 { v8.s }[2], [x26], #0x4\n" - "st1 { v15.s }[2], [x25], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v23.s }[2], [x23], #0x4\n" - "st1 { v24.s }[2], [x22], #0x4\n" - "tbz x11, #1, 165f\n" + "st1 { v8.s }[2], [x27], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "st1 { v23.s }[2], [x24], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x11, #1, 159f\n" "st1 { v31.h }[6], [x9], #0x2\n" - "st1 { v8.h }[6], [x26], #0x2\n" - "st1 { v15.h }[6], [x25], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v23.h }[6], [x23], #0x2\n" - "st1 { v24.h }[6], [x22], #0x2\n" - "tbz x11, #0, 172f\n" + "st1 { v8.h }[6], [x27], #0x2\n" + "st1 { v15.h }[6], [x26], #0x2\n" + "st1 { v16.h }[6], [x25], #0x2\n" + "st1 { v23.h }[6], [x24], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x11, #0, 166f\n" "st1 { v31.b }[14], [x9]\n" - "st1 { v8.b }[14], [x26]\n" - "st1 { v15.b }[14], [x25]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v23.b }[14], [x23]\n" - "st1 { v24.b }[14], [x22]\n" - "b 172f\n" - "165:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x11, #0, 172f\n" + "st1 { v8.b }[14], [x27]\n" + "st1 { v15.b }[14], [x26]\n" + "st1 { v16.b }[14], [x25]\n" + "st1 { v23.b }[14], [x24]\n" + "st1 { v24.b }[14], [x23]\n" + "b 166f\n" + "159:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 166f\n" "st1 { v31.b }[12], [x9]\n" - "st1 { v8.b }[12], [x26]\n" - "st1 { v15.b }[12], [x25]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v23.b }[12], [x23]\n" - "st1 { v24.b }[12], [x22]\n" - "b 172f\n" - "166:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x11, #1, 167f\n" + "st1 { v8.b }[12], [x27]\n" + "st1 { v15.b }[12], [x26]\n" + "st1 { v16.b }[12], [x25]\n" + "st1 { v23.b }[12], [x24]\n" + "st1 { v24.b }[12], [x23]\n" + "b 166f\n" + 
"160:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 161f\n" "st1 { v31.h }[4], [x9], #0x2\n" - "st1 { v8.h }[4], [x26], #0x2\n" - "st1 { v15.h }[4], [x25], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v23.h }[4], [x23], #0x2\n" - "st1 { v24.h }[4], [x22], #0x2\n" - "tbz x11, #0, 172f\n" + "st1 { v8.h }[4], [x27], #0x2\n" + "st1 { v15.h }[4], [x26], #0x2\n" + "st1 { v16.h }[4], [x25], #0x2\n" + "st1 { v23.h }[4], [x24], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x11, #0, 166f\n" "st1 { v31.b }[10], [x9]\n" - "st1 { v8.b }[10], [x26]\n" - "st1 { v15.b }[10], [x25]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v23.b }[10], [x23]\n" - "st1 { v24.b }[10], [x22]\n" - "b 172f\n" - "167:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x11, #0, 172f\n" + "st1 { v8.b }[10], [x27]\n" + "st1 { v15.b }[10], [x26]\n" + "st1 { v16.b }[10], [x25]\n" + "st1 { v23.b }[10], [x24]\n" + "st1 { v24.b }[10], [x23]\n" + "b 166f\n" + "161:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 166f\n" "st1 { v31.b }[8], [x9]\n" - "st1 { v8.b }[8], [x26]\n" - "st1 { v15.b }[8], [x25]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v23.b }[8], [x23]\n" - "st1 { v24.b }[8], [x22]\n" - "b 172f\n" - "168:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x11, #2, 170f\n" + "st1 { v8.b }[8], [x27]\n" + "st1 { v15.b }[8], [x26]\n" + "st1 { v16.b }[8], [x25]\n" + "st1 { v23.b }[8], [x24]\n" + "st1 { v24.b }[8], [x23]\n" + "b 166f\n" + "162:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 164f\n" "str s31, [x9], #0x4\n" - "str s8, [x26], #0x4\n" - "str s15, [x25], #0x4\n" - "str s16, [x24], #0x4\n" - "str s23, [x23], #0x4\n" - "str s24, [x22], #0x4\n" - "tbz x11, #1, 169f\n" + "str s8, [x27], #0x4\n" + "str s15, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "str s23, [x24], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x11, #1, 163f\n" "st1 { v31.h }[2], [x9], #0x2\n" - "st1 { v8.h }[2], [x26], #0x2\n" - "st1 { v15.h }[2], 
[x25], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v23.h }[2], [x23], #0x2\n" - "st1 { v24.h }[2], [x22], #0x2\n" - "tbz x11, #0, 172f\n" + "st1 { v8.h }[2], [x27], #0x2\n" + "st1 { v15.h }[2], [x26], #0x2\n" + "st1 { v16.h }[2], [x25], #0x2\n" + "st1 { v23.h }[2], [x24], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x11, #0, 166f\n" "st1 { v31.b }[6], [x9]\n" - "st1 { v8.b }[6], [x26]\n" - "st1 { v15.b }[6], [x25]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v23.b }[6], [x23]\n" - "st1 { v24.b }[6], [x22]\n" - "b 172f\n" - "169:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x11, #0, 172f\n" + "st1 { v8.b }[6], [x27]\n" + "st1 { v15.b }[6], [x26]\n" + "st1 { v16.b }[6], [x25]\n" + "st1 { v23.b }[6], [x24]\n" + "st1 { v24.b }[6], [x23]\n" + "b 166f\n" + "163:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 166f\n" "st1 { v31.b }[4], [x9]\n" - "st1 { v8.b }[4], [x26]\n" - "st1 { v15.b }[4], [x25]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v23.b }[4], [x23]\n" - "st1 { v24.b }[4], [x22]\n" - "b 172f\n" - "170:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x11, #1, 171f\n" + "st1 { v8.b }[4], [x27]\n" + "st1 { v15.b }[4], [x26]\n" + "st1 { v16.b }[4], [x25]\n" + "st1 { v23.b }[4], [x24]\n" + "st1 { v24.b }[4], [x23]\n" + "b 166f\n" + "164:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 165f\n" "str h31, [x9], #0x2\n" - "str h8, [x26], #0x2\n" - "str h15, [x25], #0x2\n" - "str h16, [x24], #0x2\n" - "str h23, [x23], #0x2\n" - "str h24, [x22], #0x2\n" - "tbz x11, #0, 172f\n" + "str h8, [x27], #0x2\n" + "str h15, [x26], #0x2\n" + "str h16, [x25], #0x2\n" + "str h23, [x24], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x11, #0, 166f\n" "st1 { v31.b }[2], [x9]\n" - "st1 { v8.b }[2], [x26]\n" - "st1 { v15.b }[2], [x25]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v23.b }[2], [x23]\n" - "st1 { v24.b }[2], [x22]\n" - "b 172f\n" - "171:" // Height 6: Partial direct writeback: partial_1_0 + "st1 { v8.b }[2], 
[x27]\n" + "st1 { v15.b }[2], [x26]\n" + "st1 { v16.b }[2], [x25]\n" + "st1 { v23.b }[2], [x24]\n" + "st1 { v24.b }[2], [x23]\n" + "b 166f\n" + "165:" // Height 6: Partial direct writeback: partial_1_0 "str b31, [x9, #0x0]\n" - "str b8, [x26, #0x0]\n" - "str b15, [x25, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b23, [x23, #0x0]\n" - "str b24, [x22, #0x0]\n" - "172:" // Height 6: Partial direct writeback: Done - "b 174f\n" - "173:" // Height 6: Full writeback + "str b8, [x27, #0x0]\n" + "str b15, [x26, #0x0]\n" + "str b16, [x25, #0x0]\n" + "str b23, [x24, #0x0]\n" + "str b24, [x23, #0x0]\n" + "166:" // Height 6: Partial direct writeback: Done + "b 168f\n" + "167:" // Height 6: Full writeback "str q31, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "str q8, [x26, #0x0]\n" - "str q15, [x25, #0x0]\n" - "str q16, [x24, #0x0]\n" - "str q23, [x23, #0x0]\n" - "str q24, [x22, #0x0]\n" - "174:" // Height 6: Writeback done + "str q8, [x27, #0x0]\n" + "str q15, [x26, #0x0]\n" + "str q16, [x25, #0x0]\n" + "str q23, [x24, #0x0]\n" + "str q24, [x23, #0x0]\n" + "168:" // Height 6: Writeback done "subs x11, x11, #0x10\n" - "bgt 147b\n" + "bgt 142b\n" "subs %x[M], %x[M], #0x6\n" - "beq 176f\n" + "beq 170f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 175f\n" + "tbz %x[flags], #3, 169f\n" "add x21, x21, #0x6\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "175:" // Update direct input + "169:" // Update direct input "mov x20, #0x6\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "176:" // Exit + "170:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, 
input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp index 90b196735a..2cd72bb4f3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,29 +73,25 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 91f\n" + "bge 88f\n" "cmp %x[M], #0x2\n" - "bgt 61f\n" - "beq 31f\n" - "mov x16, %x[col_bias]\n" + "bgt 59f\n" + "beq 30f\n" "movi v11.4s, #0x0\n" - "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "movi v15.16b, #0x1\n" + "ldr x16, 
[%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "3:" // Height 1: setup done "mov x12, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -117,104 +112,104 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "blt 11f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr d21, [x14, #0x70]\n" - "ldr x20, [x14, #0x78]\n" + "ldr d4, [x15, #0x70]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr d20, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr d26, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr d25, [x14, #0xa0]\n" - "mov v21.d[1], x20\n" - "ldr x20, [x14, #0x88]\n" + "ldr d7, [x15, #0xa0]\n" + "mov v4.d[1], x20\n" + "ldr x20, [x15, #0x88]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr d24, [x14, #0xb0]\n" + "ldr d8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr d23, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr d22, [x14, #0xd0]\n" - ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" - "ldr d21, [x14, #0xe0]\n" - "mov v20.d[1], x20\n" - "ldr x22, [x14, #0x98]\n" 
+ "ldr d10, [x15, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr d4, [x15, #0xe0]\n" + "mov v5.d[1], x20\n" + "ldr x22, [x15, #0x98]\n" "add x10, x10, #0x10\n" - "ldr x21, [x14, #0xa8]\n" - ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" - "ldr d20, [x14, #0xf0]\n" - "ldr x20, [x14, #0xb8]\n" - "mov v26.d[1], x22\n" - "mov v25.d[1], x21\n" - "ldr x23, [x14, #0xc8]\n" - "ldr x22, [x14, #0xd8]\n" - ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" - "mov v24.d[1], x20\n" - "ldr x21, [x14, #0xe8]\n" - "ldr x20, [x14, #0xf8]\n" - ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" - "mov v23.d[1], x23\n" - "mov v22.d[1], x22\n" - "add x14, x14, #0x100\n" - "mov v21.d[1], x21\n" - ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" - "mov v20.d[1], x20\n" - ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" + "ldr x21, [x15, #0xa8]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr d5, [x15, #0xf0]\n" + "ldr x20, [x15, #0xb8]\n" + "mov v6.d[1], x22\n" + "mov v7.d[1], x21\n" + "ldr x23, [x15, #0xc8]\n" + "ldr x22, [x15, #0xd8]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "mov v8.d[1], x20\n" + "ldr x21, [x15, #0xe8]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "mov v9.d[1], x23\n" + "mov v10.d[1], x22\n" + "add x15, x15, #0x100\n" + "mov v4.d[1], x21\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + "mov v5.d[1], x20\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row 
sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" - "ldr q4, [x14, #0x0]\n" + "ldr q4, [x15, #0x0]\n" "cmp x11, #0x20\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x14, #0xd0]\n" - ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x14, #0xe0]\n" - ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x14, #0xf0]\n" - ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" + "ldr q10, [x15, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x15, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x15, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" "sub x11, x11, #0x10\n" - ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" "add x10, x10, #0x10\n" - ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" - "add x14, x14, 
#0x100\n" - ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum @@ -228,17 +223,17 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q23, [x14, #0x0]\n" + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q22, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q21, [x14, #0x20]\n" - "ldr q20, [x14, #0x30]\n" - ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x11, 18f\n" @@ -253,15 +248,15 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q23, [x14, #0x0]\n" - "ldr q22, [x14, #0x10]\n" - "ldr q21, [x14, 
#0x20]\n" - "ldr q20, [x14, #0x30]\n" - ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" @@ -271,136 +266,122 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "tbnz %x[flags], #31, 19f\n" "addp v11.4s, v11.4s, v11.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v20.4s }, [x20]\n" - "neg v20.4s, v20.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v20.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q24, [x16, #0x0]\n" + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q23, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q22, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q21, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v24.4s\n" - "add v17.4s, v17.4s, v23.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x21]\n" + "add v16.4s, v16.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "add x16, x16, #0x40\n" - "sqrdmulh v16.4s, v16.4s, v20.4s\n" - 
"sqrdmulh v17.4s, v17.4s, v20.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "sqrdmulh v19.4s, v19.4s, v20.4s\n" - "tbz %x[flags], #5, 20f\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v22.16b, v17.16b, v0.16b\n" - "and v21.16b, v18.16b, v0.16b\n" - "and v20.16b, v19.16b, v0.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "20:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add x21, %x[qp], %[c_offset]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "cmp x16, #0x10\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" + "add x14, x14, #0x40\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x20]\n" - "add x20, %x[qp], %[minval]\n" - "ld1r { v21.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, 
v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 29f\n" - "tbz x15, #3, 24f\n" + "bge 28f\n" + "tbz x16, #3, 23f\n" "str d16, [x13], #0x8\n" - "tbz x15, #2, 22f\n" + "tbz x16, #2, 21f\n" "st1 { v16.s }[2], [x13], #0x4\n" - "tbz x15, #1, 21f\n" + "tbz x16, #1, 20f\n" "st1 { v16.h }[6], [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[14], [x13]\n" - "b 28f\n" - "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 28f\n" + "b 27f\n" + "20:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 27f\n" "st1 { v16.b }[12], [x13]\n" - "b 28f\n" - "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 23f\n" + "b 27f\n" + "21:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 22f\n" "st1 { v16.h }[4], [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[10], [x13]\n" - "b 28f\n" - "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 28f\n" + "b 27f\n" + "22:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 27f\n" "st1 { v16.b }[8], [x13]\n" - "b 28f\n" - "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, #2, 26f\n" + "b 27f\n" + "23:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 25f\n" "str s16, [x13], #0x4\n" - "tbz x15, #1, 25f\n" + "tbz x16, #1, 24f\n" "st1 { v16.h }[2], [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[6], [x13]\n" - "b 28f\n" - "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 28f\n" + "b 27f\n" + "24:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 27f\n" "st1 { v16.b }[4], [x13]\n" - 
"b 28f\n" - "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 27f\n" + "b 27f\n" + "25:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 26f\n" "str h16, [x13], #0x2\n" - "tbz x15, #0, 28f\n" + "tbz x16, #0, 27f\n" "st1 { v16.b }[2], [x13]\n" - "b 28f\n" - "27:" // Height 1: Partial direct writeback: partial_1_0 + "b 27f\n" + "26:" // Height 1: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" - "28:" // Height 1: Partial direct writeback: Done - "b 30f\n" - "29:" // Height 1: Full writeback + "27:" // Height 1: Partial direct writeback: Done + "b 29f\n" + "28:" // Height 1: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" - "30:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" + "29:" // Height 1: Writeback done + "subs x16, x16, #0x10\n" "bgt 2b\n" - "b 122f\n" - "31:" // Height 2 - "mov x16, %x[col_bias]\n" + "b 118f\n" + "30:" // Height 2 "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "movi v12.4s, #0x0\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "32:" // Height 2: Column loop + "31:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -409,434 +390,407 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "33:" // Height 2: setup done "mov x12, #0x0\n" - "34:" // Height 2: String loop + "33:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 35f\n" + "tbz %x[flags], #3, 34f\n" "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x10, 
[x20, #0x0]\n" "ldr x9, [x20, #0x8]\n" - "cbnz x12, 36f\n" + "cbnz x12, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x10, x10, x20\n" "add x9, x9, x20\n" - "b 36f\n" - "35:" // Height 2: setup direct input + "b 35f\n" + "34:" // Height 2: setup direct input "mov x10, %x[input_ptr]\n" "add x9, x10, x21\n" - "36:" // Height 2: input setup done + "35:" // Height 2: input setup done "cmp x11, #0x10\n" - "blt 41f\n" + "blt 40f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" "ldr q1, [x9, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" - "blt 39f\n" - "37:" // Height 2: Multiply loop: Main loop head + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" + "blt 38f\n" + "36:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x20, [x14, #0x78]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr d25, [x14, #0x70]\n" + "ldr d4, [x15, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr x23, [x14, #0x88]\n" + "ldr x23, [x15, #0x88]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr d24, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "mov v25.d[1], x20\n" + "mov v4.d[1], x20\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr d30, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr x22, [x14, #0x98]\n" + "ldr x22, [x15, #0x98]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr d29, [x14, #0xa0]\n" - "ldr x21, [x14, #0xa8]\n" + "ldr d7, [x15, #0xa0]\n" + "ldr x21, [x15, #0xa8]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, 
v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr d28, [x14, #0xb0]\n" - "ldr x20, [x14, #0xb8]\n" + "ldr d8, [x15, #0xb0]\n" + "ldr x20, [x15, #0xb8]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr d27, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "mov v24.d[1], x23\n" + "mov v5.d[1], x23\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr d26, [x14, #0xd0]\n" - ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" - "mov v30.d[1], x22\n" - ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" - "ldr d25, [x14, #0xe0]\n" - "mov v29.d[1], x21\n" - "ldr x23, [x14, #0xc8]\n" - "mov v28.d[1], x20\n" - "ldr x22, [x14, #0xd8]\n" - "ldr x21, [x14, #0xe8]\n" - ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" - "ldr d24, [x14, #0xf0]\n" - "ldr x20, [x14, #0xf8]\n" - ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" - "mov v27.d[1], x23\n" - ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" - "mov v26.d[1], x22\n" - ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" - "mov v25.d[1], x21\n" - ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" - "mov v24.d[1], x20\n" - ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" + "ldr d10, [x15, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "mov v6.d[1], x22\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr d4, [x15, #0xe0]\n" + "mov v7.d[1], x21\n" + "ldr x23, [x15, #0xc8]\n" + "mov v8.d[1], x20\n" + "ldr x22, [x15, #0xd8]\n" + "ldr x21, [x15, #0xe8]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr d5, [x15, #0xf0]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // 
udot v21.4s, v6.16b, v1.4b[2]\n" + "mov v9.d[1], x23\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + "mov v10.d[1], x22\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + "mov v4.d[1], x21\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "mov v5.d[1], x20\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" "add x10, x10, #0x10\n" "add x9, x9, #0x10\n" - "add x14, x14, #0x100\n" - ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 38f\n" + "add x15, x15, #0x100\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 37f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "38:" // Height 2: Multiply loop: unique 5: skip row sum + "37:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" "ldr q1, [x9, #0x0]\n" "cmp x11, #0x20\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, 
#0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" - "bge 37b\n" - "39:" // Height 2: Multiply loop: Single iteration only + "bge 36b\n" + "38:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" "sub x11, x11, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x14, #0xd0]\n" - ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x14, #0xe0]\n" - ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x14, #0xf0]\n" - ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" - 
".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q10, [x15, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x15, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x15, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 6: skip row sum + "39:" // Height 2: 
Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" - "41:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 48f\n" + "40:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 47f\n" "cmp x11, #0x4\n" - "blt 44f\n" - "42:" // Height 2: Multiply loop: Odd block loop + "blt 43f\n" + "41:" // Height 2: Multiply loop: Odd block loop "ldr s0, [x10], #0x4\n" "ldr s1, [x9], #0x4\n" - "tbnz %x[flags], #31, 43f\n" + "tbnz %x[flags], #31, 42f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q27, [x14, #0x0]\n" + "42:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q26, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" - ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" - "bge 42b\n" - "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x11, 48f\n" - "tbz x11, #1, 45f\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 
0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "bge 41b\n" + "43:" // Height 2: Multiply loop: Skip odd blocks + "cbz x11, 47f\n" + "tbz x11, #1, 44f\n" "ldr h0, [x10], #0x2\n" "ldr h1, [x9], #0x2\n" - "tbz x11, #0, 46f\n" + "tbz x11, #0, 45f\n" "ld1 { v0.b }[2], [x10]\n" "ld1 { v1.b }[2], [x9]\n" - "b 46f\n" - "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 45f\n" + "44:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x10, #0x0]\n" "ldr b1, [x9, #0x0]\n" - "46:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 47f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 46f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q27, [x14, #0x0]\n" - "ldr q26, [x14, #0x10]\n" - "ldr q25, [x14, #0x20]\n" - "ldr q24, [x14, #0x30]\n" - ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" - "48:" // Height 2: Multiply loop: No odd multiplies + "46:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 
0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + "47:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "bne 34b\n" + "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x13, #0x0]\n" "add x24, x13, x20\n" "prfm pstl1keep, [x24, #0x0]\n" - "tbnz %x[flags], #31, 49f\n" + "tbnz %x[flags], #31, 48f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "49:" // Height 2: skip row sum fixup - "ldr q28, [x16, #0x0]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "48:" // Height 2: skip row sum fixup + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q27, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q26, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q25, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v26.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1r { v24.4s }, [x20]\n" - "add v19.4s, v19.4s, v25.4s\n" - "add v20.4s, v20.4s, v28.4s\n" - "add v21.4s, v21.4s, v27.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v25.4s\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "ld1r { v4.4s }, [x20]\n" + "add v19.4s, 
v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "ld1r { v0.4s }, [x20]\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "add x16, x16, #0x40\n" - "tbz %x[flags], #5, 50f\n" - "and v24.16b, v16.16b, v0.16b\n" - "and v30.16b, v17.16b, v0.16b\n" - "and v29.16b, v18.16b, v0.16b\n" - "and v28.16b, v19.16b, v0.16b\n" - "and v27.16b, v20.16b, v0.16b\n" - "and v26.16b, v21.16b, v0.16b\n" - "and v25.16b, v22.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v24.4s\n" - "and v24.16b, v23.16b, v0.16b\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "50:" // Height 2: no shift correction + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x20]\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, 
v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x20]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" "add x20, %x[qp], %[minval]\n" - "ld1r { v25.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v24.4s }, [x20]\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "ld1r { v5.4s }, [x20]\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "cmp x16, #0x10\n" + "smax v19.4s, v19.4s, v5.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, 
v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "add x14, x14, #0x40\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 59f\n" - "tbz x15, #3, 54f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 57f\n" + "tbz x16, #3, 52f\n" "str d16, [x13], #0x8\n" "str d20, [x24], #0x8\n" - "tbz x15, #2, 52f\n" + "tbz x16, #2, 50f\n" "st1 { v16.s }[2], [x13], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" - "tbz x15, #1, 51f\n" + "tbz x16, #1, 49f\n" "st1 { v16.h }[6], [x13], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[14], [x13]\n" "st1 { v20.b }[14], [x24]\n" - "b 58f\n" - "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 58f\n" + "b 56f\n" + "49:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 56f\n" "st1 { v16.b }[12], [x13]\n" "st1 { v20.b }[12], [x24]\n" - "b 58f\n" - "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 53f\n" + "b 56f\n" + "50:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 51f\n" "st1 { v16.h }[4], [x13], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[10], [x13]\n" "st1 { v20.b }[10], [x24]\n" - "b 58f\n" - "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 58f\n" + "b 56f\n" + "51:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 56f\n" "st1 { v16.b }[8], [x13]\n" "st1 { v20.b }[8], [x24]\n" - "b 58f\n" - "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 56f\n" + "b 56f\n" + "52:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 54f\n" "str s16, [x13], #0x4\n" "str s20, [x24], #0x4\n" - "tbz x15, #1, 55f\n" + "tbz x16, #1, 53f\n" "st1 { v16.h }[2], 
[x13], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[6], [x13]\n" "st1 { v20.b }[6], [x24]\n" - "b 58f\n" - "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 58f\n" + "b 56f\n" + "53:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 56f\n" "st1 { v16.b }[4], [x13]\n" "st1 { v20.b }[4], [x24]\n" - "b 58f\n" - "56:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 57f\n" + "b 56f\n" + "54:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 55f\n" "str h16, [x13], #0x2\n" "str h20, [x24], #0x2\n" - "tbz x15, #0, 58f\n" + "tbz x16, #0, 56f\n" "st1 { v16.b }[2], [x13]\n" "st1 { v20.b }[2], [x24]\n" - "b 58f\n" - "57:" // Height 2: Partial direct writeback: partial_1_0 + "b 56f\n" + "55:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" "str b20, [x24, #0x0]\n" - "58:" // Height 2: Partial direct writeback: Done - "b 60f\n" - "59:" // Height 2: Full writeback + "56:" // Height 2: Partial direct writeback: Done + "b 58f\n" + "57:" // Height 2: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" "str q20, [x24, #0x0]\n" - "60:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" - "bgt 32b\n" - "b 122f\n" - "61:" // Height 3 - "mov x16, %x[col_bias]\n" + "58:" // Height 2: Writeback done + "subs x16, x16, #0x10\n" + "bgt 31b\n" + "b 118f\n" + "59:" // Height 3 "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "movi v12.4s, #0x0\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v15.16b, #0x1\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "62:" // Height 3: Column loop + "60:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, 
#0x0\n" @@ -849,318 +803,317 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "63:" // Height 3: setup done "mov x12, #0x0\n" - "64:" // Height 3: String loop + "62:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 65f\n" + "tbz %x[flags], #3, 63f\n" "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x10, [x20, #0x0]\n" "ldr x9, [x20, #0x8]\n" "ldr x28, [x20, #0x10]\n" - "cbnz x12, 66f\n" + "cbnz x12, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x10, x10, x20\n" "add x9, x9, x20\n" "add x28, x28, x20\n" - "b 66f\n" - "65:" // Height 3: setup direct input + "b 64f\n" + "63:" // Height 3: setup direct input "mov x10, %x[input_ptr]\n" "add x9, x10, x21\n" "add x28, x9, x21\n" - "66:" // Height 3: input setup done + "64:" // Height 3: input setup done "cmp x11, #0x10\n" - "blt 71f\n" + "blt 69f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" "ldr q1, [x9, #0x0]\n" "ldr q2, [x28, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" - "blt 69f\n" - "67:" // Height 3: Multiply loop: Main loop head + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" + "blt 67f\n" + "65:" // Height 3: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x20, [x14, #0x78]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x23, [x14, #0x88]\n" + "ldr x23, [x15, #0x88]\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr d29, [x14, #0x70]\n" + "ldr d4, [x15, #0x70]\n" ".inst 
0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr x22, [x14, #0x98]\n" + "ldr x22, [x15, #0x98]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x21, [x14, #0xa8]\n" + "ldr x21, [x15, #0xa8]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr d28, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "mov v29.d[1], x20\n" + "mov v4.d[1], x20\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x20, [x14, #0xb8]\n" + "ldr x20, [x15, #0xb8]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr d5, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "mov v28.d[1], x23\n" + "mov v5.d[1], x23\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr x23, [x14, #0xc8]\n" + "ldr x23, [x15, #0xc8]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr d4, [x14, #0xa0]\n" + "ldr d7, [x15, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "mov v5.d[1], x22\n" + "mov v6.d[1], x22\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr x22, [x14, #0xd8]\n" + "ldr x22, [x15, #0xd8]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr d3, [x14, #0xb0]\n" + "ldr d8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "mov v4.d[1], x21\n" + "mov v7.d[1], x21\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr x21, [x14, #0xe8]\n" + "ldr x21, [x15, #0xe8]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr d31, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "mov v3.d[1], x20\n" + "mov v8.d[1], x20\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr x20, [x14, #0xf8]\n" + "ldr x20, [x15, #0xf8]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr d30, [x14, #0xd0]\n" - ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" - "mov v31.d[1], x23\n" - ".inst 0x6fa1e3b7 // 
udot v23.4s, v29.16b, v1.4b[1]\n" + "ldr d10, [x15, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "mov v9.d[1], x23\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" "add x10, x10, #0x10\n" - ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" - "ldr d29, [x14, #0xe0]\n" - ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" - "mov v30.d[1], x22\n" - ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr d4, [x15, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "mov v10.d[1], x22\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" "add x9, x9, #0x10\n" - ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" - "ldr d28, [x14, #0xf0]\n" - ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" - "mov v29.d[1], x21\n" - ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr d5, [x15, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "mov v4.d[1], x21\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" "add x28, x28, #0x10\n" - ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" - "mov v28.d[1], x20\n" - ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 
0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 68f\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + "mov v5.d[1], x20\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 66f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "68:" // Height 3: Multiply loop: unique 9: skip row sum + "66:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" "ldr q1, [x9, #0x0]\n" "cmp x11, #0x20\n" "ldr q2, [x28, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, 
#0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" - "bge 67b\n" - "69:" // Height 3: Multiply loop: Single iteration only + "bge 65b\n" + "67:" // Height 3: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" "sub x11, x11, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" "add x10, x10, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" "add x28, x28, #0x10\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot 
v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x14, #0xd0]\n" - ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x14, #0xe0]\n" - ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x14, #0xf0]\n" - ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" - ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 70f\n" + "ldr q10, [x15, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x15, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 
// udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x15, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x15, x15, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "70:" // Height 3: Multiply loop: unique 10: skip row sum + "68:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" - "71:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 78f\n" + "69:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 76f\n" "cmp x11, #0x4\n" - "blt 74f\n" - "72:" // Height 3: Multiply loop: Odd block loop + "blt 72f\n" + "70:" // Height 3: Multiply 
loop: Odd block loop "ldr s0, [x10], #0x4\n" "ldr s1, [x9], #0x4\n" "ldr s2, [x28], #0x4\n" - "tbnz %x[flags], #31, 73f\n" + "tbnz %x[flags], #31, 71f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q31, [x14, #0x0]\n" + "71:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q30, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q29, [x14, #0x20]\n" - "ldr q28, [x14, #0x30]\n" - ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" - "bge 72b\n" - "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x11, 78f\n" - "tbz x11, #1, 75f\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 
0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Skip odd blocks + "cbz x11, 76f\n" + "tbz x11, #1, 73f\n" "ldr h0, [x10], #0x2\n" "ldr h1, [x9], #0x2\n" "ldr h2, [x28], #0x2\n" - "tbz x11, #0, 76f\n" + "tbz x11, #0, 74f\n" "ld1 { v0.b }[2], [x10]\n" "ld1 { v1.b }[2], [x9]\n" "ld1 { v2.b }[2], [x28]\n" - "b 76f\n" - "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 74f\n" + "73:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x10, #0x0]\n" "ldr b1, [x9, #0x0]\n" "ldr b2, [x28, #0x0]\n" - "76:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 77f\n" + "74:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 75f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q31, [x14, #0x0]\n" - "ldr q30, [x14, #0x10]\n" - "ldr q29, [x14, #0x20]\n" - "ldr q28, [x14, #0x30]\n" - ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" - "78:" // Height 3: 
Multiply loop: No odd multiplies + "75:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + "76:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "bne 64b\n" + "bne 62b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x13, #0x0]\n" "add x24, x13, x20\n" "add x23, x24, x20\n" "prfm pstl1keep, [x24, #0x0]\n" "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 79f\n" + "tbnz %x[flags], #31, 77f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v28.4s }, [x20]\n" - "neg v28.4s, v28.4s\n" + "ld1r { v3.4s }, [x20]\n" + "neg v3.4s, v3.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v28.4s\n" - "mul v12.4s, v12.4s, v28.4s\n" - "mul v13.4s, v13.4s, v28.4s\n" - "79:" // Height 3: skip row sum fixup - "ldr q31, [x16, #0x0]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "77:" // Height 3: skip row sum fixup + "ldr q0, [x14, #0x0]\n" "add 
v16.4s, v16.4s, v11.4s\n" - "ldr q30, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q29, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q28, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1170,74 +1123,40 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" - "add v16.4s, v16.4s, v31.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" - "add v20.4s, v20.4s, v31.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v24.4s, v24.4s, v31.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v28.4s }, [x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x21]\n" + "add x21, %x[qp], %[c_offset]\n" "ld1r { v0.4s }, [x20]\n" - "add x16, x16, #0x40\n" - "sqrdmulh v16.4s, v16.4s, v28.4s\n" - "sqrdmulh v17.4s, v17.4s, v28.4s\n" - "sqrdmulh v18.4s, v18.4s, v28.4s\n" - "sqrdmulh v19.4s, v19.4s, v28.4s\n" - "sqrdmulh v20.4s, v20.4s, v28.4s\n" - "sqrdmulh v21.4s, v21.4s, v28.4s\n" - "sqrdmulh v22.4s, v22.4s, v28.4s\n" - "sqrdmulh v23.4s, v23.4s, v28.4s\n" - "sqrdmulh v24.4s, v24.4s, v28.4s\n" - "sqrdmulh v25.4s, v25.4s, v28.4s\n" - "sqrdmulh v26.4s, 
v26.4s, v28.4s\n" - "sqrdmulh v27.4s, v27.4s, v28.4s\n" - "tbz %x[flags], #5, 80f\n" - "and v1.16b, v16.16b, v0.16b\n" - "and v31.16b, v17.16b, v0.16b\n" - "and v30.16b, v18.16b, v0.16b\n" - "and v29.16b, v19.16b, v0.16b\n" - "and v28.16b, v20.16b, v0.16b\n" - "and v3.16b, v21.16b, v0.16b\n" - "and v2.16b, v22.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "sqadd v18.4s, v18.4s, v30.4s\n" - "sqadd v19.4s, v19.4s, v29.4s\n" - "sqadd v20.4s, v20.4s, v28.4s\n" - "and v1.16b, v23.16b, v0.16b\n" - "and v31.16b, v24.16b, v0.16b\n" - "and v30.16b, v25.16b, v0.16b\n" - "and v29.16b, v26.16b, v0.16b\n" - "and v28.16b, v27.16b, v0.16b\n" - "sqadd v21.4s, v21.4s, v3.4s\n" - "sqadd v22.4s, v22.4s, v2.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "80:" // Height 3: no shift correction + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl 
v18.4s, v18.4s, v0.4s\n" @@ -1250,159 +1169,155 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x20]\n" - "add x20, %x[qp], %[minval]\n" - "ld1r { v29.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v28.4s }, [x20]\n" - "add v16.4s, v16.4s, v30.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v30.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v16.4s, v16.4s, v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - "smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin 
v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "cmp x16, #0x10\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "orr %x[flags], %x[flags], #0x80000000\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 89f\n" - "tbz x15, #3, 84f\n" + "add x14, x14, #0x40\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 86f\n" + "tbz x16, #3, 81f\n" "str d16, [x13], #0x8\n" "str d20, [x24], #0x8\n" "str d24, [x23], #0x8\n" - "tbz x15, #2, 82f\n" + "tbz x16, #2, 79f\n" "st1 { v16.s }[2], [x13], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x15, #1, 81f\n" + "tbz x16, #1, 78f\n" "st1 { v16.h }[6], [x13], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[14], [x13]\n" "st1 { v20.b }[14], [x24]\n" "st1 { v24.b }[14], [x23]\n" - "b 88f\n" - "81:" // 
Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 88f\n" + "b 85f\n" + "78:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 85f\n" "st1 { v16.b }[12], [x13]\n" "st1 { v20.b }[12], [x24]\n" "st1 { v24.b }[12], [x23]\n" - "b 88f\n" - "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 83f\n" + "b 85f\n" + "79:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 80f\n" "st1 { v16.h }[4], [x13], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[10], [x13]\n" "st1 { v20.b }[10], [x24]\n" "st1 { v24.b }[10], [x23]\n" - "b 88f\n" - "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 88f\n" + "b 85f\n" + "80:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 85f\n" "st1 { v16.b }[8], [x13]\n" "st1 { v20.b }[8], [x24]\n" "st1 { v24.b }[8], [x23]\n" - "b 88f\n" - "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 86f\n" + "b 85f\n" + "81:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 83f\n" "str s16, [x13], #0x4\n" "str s20, [x24], #0x4\n" "str s24, [x23], #0x4\n" - "tbz x15, #1, 85f\n" + "tbz x16, #1, 82f\n" "st1 { v16.h }[2], [x13], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[6], [x13]\n" "st1 { v20.b }[6], [x24]\n" "st1 { v24.b }[6], [x23]\n" - "b 88f\n" - "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 88f\n" + "b 85f\n" + "82:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 85f\n" "st1 { v16.b }[4], [x13]\n" "st1 { v20.b }[4], [x24]\n" "st1 { v24.b }[4], [x23]\n" - "b 88f\n" - "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 87f\n" + "b 85f\n" + "83:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 84f\n" "str h16, [x13], #0x2\n" "str h20, [x24], #0x2\n" 
"str h24, [x23], #0x2\n" - "tbz x15, #0, 88f\n" + "tbz x16, #0, 85f\n" "st1 { v16.b }[2], [x13]\n" "st1 { v20.b }[2], [x24]\n" "st1 { v24.b }[2], [x23]\n" - "b 88f\n" - "87:" // Height 3: Partial direct writeback: partial_1_0 + "b 85f\n" + "84:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" "str b20, [x24, #0x0]\n" "str b24, [x23, #0x0]\n" - "88:" // Height 3: Partial direct writeback: Done - "b 90f\n" - "89:" // Height 3: Full writeback + "85:" // Height 3: Partial direct writeback: Done + "b 87f\n" + "86:" // Height 3: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" "str q20, [x24, #0x0]\n" "str q24, [x23, #0x0]\n" - "90:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" - "bgt 62b\n" - "b 122f\n" - "91:" // Height 4 + "87:" // Height 3: Writeback done + "subs x16, x16, #0x10\n" + "bgt 60b\n" + "b 118f\n" + "88:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "mov x20, #0x4\n" "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "mov x16, %x[col_bias]\n" "movi v11.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" "movi v12.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "madd x20, x21, x20, x13\n" + "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "madd x20, x21, x20, x13\n" "movi v14.4s, #0x0\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v15.16b, #0x1\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[col_bias]\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "92:" // Height 4: Column loop + "89:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1419,80 +1334,79 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "93:" // Height 4: setup done "mov x12, #0x0\n" - "94:" // Height 4: String loop + "91:" // Height 4: String loop "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 95f\n" + "tbz %x[flags], #3, 92f\n" "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x10, [x20, #0x0]\n" "ldr x9, [x20, #0x8]\n" "ldr x28, [x20, #0x10]\n" "ldr x27, [x20, #0x18]\n" - "cbnz x12, 96f\n" + "cbnz x12, 93f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x10, x10, x20\n" "add x9, x9, x20\n" "add x28, x28, x20\n" "add x27, x27, x20\n" - "b 96f\n" - "95:" // Height 4: setup direct input + "b 93f\n" + "92:" // Height 4: setup direct input "mov x10, %x[input_ptr]\n" "add x9, x10, x21\n" "add x28, x9, x21\n" "add x27, x28, x21\n" - "96:" // Height 4: input setup done + "93:" // Height 4: input setup done "cmp x11, #0x10\n" - "blt 101f\n" + "blt 98f\n" "ldr q0, [x10, #0x0]\n" "cmp x11, #0x20\n" "ldr q1, [x9, #0x0]\n" "ldr q2, [x28, #0x0]\n" "ldr q3, [x27, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" - "blt 99f\n" - "97:" // Height 4: Multiply loop: Main loop head + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" + "blt 96f\n" + "94:" // Height 4: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x21, [x14, #0x78]\n" + "ldr x21, [x15, #0x78]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x20, [x14, #0x88]\n" + "ldr x20, [x15, #0x88]\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr x26, [x14, #0x98]\n" + "ldr x26, [x15, #0x98]\n" ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr d4, [x14, #0x70]\n" + "ldr d4, [x15, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr x25, [x14, #0xa8]\n" + 
"ldr x25, [x15, #0xa8]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x24, [x14, #0xb8]\n" + "ldr x24, [x15, #0xb8]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" "mov v4.d[1], x21\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr d5, [x14, #0x80]\n" + "ldr d5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr x23, [x14, #0xc8]\n" + "ldr x23, [x15, #0xc8]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x22, [x14, #0xd8]\n" + "ldr x22, [x15, #0xd8]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" "mov v5.d[1], x20\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x14, #0x90]\n" + "ldr d6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr x21, [x14, #0xe8]\n" + "ldr x21, [x15, #0xe8]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x14, #0xf8]\n" + "ldr x20, [x15, #0xf8]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" "mov v6.d[1], x26\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x14, #0xa0]\n" + "ldr d7, [x15, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" "add x10, x10, #0x10\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" @@ -1500,7 +1414,7 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" "mov v7.d[1], x25\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr d8, [x14, #0xb0]\n" + "ldr d8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" "add x28, x28, #0x10\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" @@ -1508,27 +1422,27 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" "mov v8.d[1], x24\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr d9, [x14, #0xc0]\n" + "ldr d9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 
0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" "mov v9.d[1], x23\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr d10, [x14, #0xd0]\n" + "ldr d10, [x15, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" "mov v10.d[1], x22\n" ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" - "ldr d4, [x14, #0xe0]\n" + "ldr d4, [x15, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" "mov v4.d[1], x21\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr d5, [x14, #0xf0]\n" + "ldr d5, [x15, #0xf0]\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" + "add x15, x15, #0x100\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" "mov v5.d[1], x20\n" @@ -1557,31 +1471,31 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 98f\n" + "tbnz %x[flags], #31, 95f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "98:" // Height 4: Multiply loop: unique 13: skip row sum + "95:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q0, [x10, #0x0]\n" "sub x11, x11, #0x10\n" "ldr q1, [x9, #0x0]\n" "cmp x11, #0x20\n" "ldr q2, [x28, #0x0]\n" "ldr q3, [x27, #0x0]\n" - "ldr q4, [x14, #0x0]\n" - "ldr q5, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" - "ldr q7, [x14, #0x30]\n" - "ldr q8, [x14, #0x40]\n" - "ldr q9, [x14, #0x50]\n" - "ldr q10, [x14, #0x60]\n" + "ldr q4, [x15, #0x0]\n" + "ldr q5, [x15, #0x10]\n" + 
"ldr q6, [x15, #0x20]\n" + "ldr q7, [x15, #0x30]\n" + "ldr q8, [x15, #0x40]\n" + "ldr q9, [x15, #0x50]\n" + "ldr q10, [x15, #0x60]\n" "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" "prfm pldl1keep, [x27, #0x80]\n" - "bge 97b\n" - "99:" // Height 4: Multiply loop: Single iteration only + "bge 94b\n" + "96:" // Height 4: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" "sub x11, x11, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" @@ -1589,51 +1503,51 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" "add x9, x9, #0x10\n" ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x14, #0x70]\n" + "ldr q4, [x15, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x28, x28, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" "add x27, x27, #0x10\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x14, #0x80]\n" + "ldr q5, [x15, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x90]\n" + "ldr q6, [x15, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0xa0]\n" + "ldr q7, [x15, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x14, #0xb0]\n" + "ldr q8, [x15, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot 
v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x14, #0xc0]\n" + "ldr q9, [x15, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x14, #0xd0]\n" + "ldr q10, [x15, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x14, #0xe0]\n" + "ldr q4, [x15, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x14, #0xf0]\n" + "ldr q5, [x15, #0xf0]\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x14, x14, #0x100\n" + "add x15, x15, #0x100\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" @@ -1661,106 +1575,106 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 100f\n" + "tbnz %x[flags], #31, 97f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "100:" // Height 4: Multiply loop: unique 14: skip row sum + "97:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x10, #0x80]\n" "prfm pldl1keep, [x9, #0x80]\n" "prfm pldl1keep, [x28, #0x80]\n" "prfm 
pldl1keep, [x27, #0x80]\n" - "101:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 108f\n" + "98:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 105f\n" "cmp x11, #0x4\n" - "blt 104f\n" - "102:" // Height 4: Multiply loop: Odd block loop + "blt 101f\n" + "99:" // Height 4: Multiply loop: Odd block loop "ldr s0, [x10], #0x4\n" "ldr s1, [x9], #0x4\n" "ldr s2, [x28], #0x4\n" "ldr s3, [x27], #0x4\n" - "tbnz %x[flags], #31, 103f\n" + "tbnz %x[flags], #31, 100f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q7, [x14, #0x0]\n" + "100:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x15, #0x0]\n" "sub x11, x11, #0x4\n" - "ldr q6, [x14, #0x10]\n" + "ldr q7, [x15, #0x10]\n" "cmp x11, #0x4\n" - "ldr q5, [x14, #0x20]\n" - "ldr q4, [x14, #0x30]\n" - ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" - "bge 102b\n" - "104:" // Height 4: Multiply loop: Skip odd 
blocks - "cbz x11, 108f\n" - "tbz x11, #1, 105f\n" + "ldr q8, [x15, #0x20]\n" + "ldr q9, [x15, #0x30]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + "bge 99b\n" + "101:" // Height 4: Multiply loop: Skip odd blocks + "cbz x11, 105f\n" + "tbz x11, #1, 102f\n" "ldr h0, [x10], #0x2\n" "ldr h1, [x9], #0x2\n" "ldr h2, [x28], #0x2\n" "ldr h3, [x27], #0x2\n" - "tbz x11, #0, 106f\n" + "tbz x11, #0, 103f\n" "ld1 { v0.b }[2], [x10]\n" "ld1 { v1.b }[2], [x9]\n" "ld1 { v2.b }[2], [x28]\n" "ld1 { v3.b }[2], [x27]\n" - "b 106f\n" - "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 103f\n" + "102:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x10, #0x0]\n" "ldr b1, [x9, #0x0]\n" "ldr b2, [x28, #0x0]\n" "ldr b3, [x27, #0x0]\n" - "106:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 107f\n" + "103:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 104f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" 
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q7, [x14, #0x0]\n" - "ldr q6, [x14, #0x10]\n" - "ldr q5, [x14, #0x20]\n" - "ldr q4, [x14, #0x30]\n" - ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" - "add x14, x14, #0x40\n" - ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" + "104:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x15, #0x0]\n" + "ldr q4, [x15, #0x10]\n" + "ldr q5, [x15, #0x20]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n" ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" - "108:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, 
v2.4b[0]\n" + ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n" + "105:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "bne 94b\n" + "bne 91b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x13, #0x0]\n" "add x24, x13, x20\n" @@ -1769,30 +1683,30 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "add x22, x23, x20\n" "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "tbnz %x[flags], #31, 109f\n" + "tbnz %x[flags], #31, 106f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" "add x20, %x[qp], %[b_offset]\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "neg v0.4s, v0.4s\n" + "neg v4.4s, v4.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "109:" // Height 4: skip row sum fixup - "ldr q3, [x16, #0x0]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "106:" // Height 4: skip row sum fixup + "ldr q0, [x14, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q2, [x16, #0x10]\n" + "ldr q1, [x14, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q1, [x16, #0x20]\n" + "ldr q2, [x14, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q0, [x16, #0x30]\n" + "ldr q3, [x14, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1806,94 +1720,48 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "add v29.4s, v29.4s, v14.4s\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v1.4s\n" - "add v19.4s, v19.4s, v0.4s\n" - "add v20.4s, v20.4s, 
v3.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v1.4s\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v1.4s\n" - "add v27.4s, v27.4s, v0.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v1.4s\n" - "add v31.4s, v31.4s, v0.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x21]\n" + "add x21, %x[qp], %[c_offset]\n" "ld1r { v0.4s }, [x20]\n" - "add x16, x16, #0x40\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "tbz %x[flags], #5, 110f\n" - "and v2.16b, v16.16b, v0.16b\n" - "and v1.16b, v17.16b, v0.16b\n" - "and v7.16b, v18.16b, v0.16b\n" - "and v6.16b, v19.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v4.16b, 
v21.16b, v0.16b\n" - "and v3.16b, v22.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v2.4s\n" - "sqadd v17.4s, v17.4s, v1.4s\n" - "and v2.16b, v23.16b, v0.16b\n" - "and v1.16b, v24.16b, v0.16b\n" - "sqadd v18.4s, v18.4s, v7.4s\n" - "sqadd v19.4s, v19.4s, v6.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "sqadd v22.4s, v22.4s, v3.4s\n" - "and v7.16b, v25.16b, v0.16b\n" - "and v6.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v3.16b, v29.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "sqadd v24.4s, v24.4s, v1.4s\n" - "and v2.16b, v30.16b, v0.16b\n" - "and v1.16b, v31.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v7.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v3.4s\n" - "sqadd v30.4s, v30.4s, v2.4s\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "110:" // Height 4: no shift correction + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + 
"sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" @@ -1910,185 +1778,181 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add x20, %x[qp], %[c_offset]\n" - "add x21, %x[qp], %[maxval]\n" - "ld1r { v2.4s }, [x20]\n" - "add x20, %x[qp], %[minval]\n" - "ld1r { v1.4s }, [x21]\n" - "cmp x15, #0x10\n" - "ld1r { v0.4s }, [x20]\n" - "add v16.4s, v16.4s, v2.4s\n" - "add v17.4s, v17.4s, v2.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v2.4s\n" - "add v20.4s, v20.4s, v2.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v2.4s\n" - "add v24.4s, v24.4s, v2.4s\n" - "add v25.4s, v25.4s, v2.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v2.4s\n" - "add v28.4s, v28.4s, v2.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v2.4s\n" - "smin v16.4s, v16.4s, v1.4s\n" - "smin v17.4s, v17.4s, v1.4s\n" - "smin v18.4s, v18.4s, v1.4s\n" - "smin v19.4s, v19.4s, v1.4s\n" - "smin v20.4s, v20.4s, v1.4s\n" - "smin v21.4s, v21.4s, v1.4s\n" - "smin v22.4s, v22.4s, v1.4s\n" - "smin v23.4s, v23.4s, v1.4s\n" - "smin v24.4s, v24.4s, v1.4s\n" - "smin v25.4s, v25.4s, v1.4s\n" - "smin v26.4s, v26.4s, v1.4s\n" - "smin v27.4s, v27.4s, v1.4s\n" - "smin v28.4s, v28.4s, v1.4s\n" - "smin v29.4s, v29.4s, v1.4s\n" - "smin v30.4s, v30.4s, v1.4s\n" - "smin v31.4s, v31.4s, v1.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smax v19.4s, v19.4s, v0.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smax v21.4s, v21.4s, v0.4s\n" - "smax v22.4s, v22.4s, v0.4s\n" - "smax v23.4s, v23.4s, v0.4s\n" - "smax v24.4s, v24.4s, v0.4s\n" - "smax v25.4s, v25.4s, v0.4s\n" 
- "smax v26.4s, v26.4s, v0.4s\n" - "smax v27.4s, v27.4s, v0.4s\n" - "smax v28.4s, v28.4s, v0.4s\n" - "smax v29.4s, v29.4s, v0.4s\n" - "smax v30.4s, v30.4s, v0.4s\n" - "smax v31.4s, v31.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - 
"uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - "uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 119f\n" - "tbz x15, #3, 114f\n" + "cmp x16, #0x10\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "add x14, x14, #0x40\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 115f\n" + "tbz x16, #3, 110f\n" "str d16, [x13], #0x8\n" "str d20, [x24], #0x8\n" "str d24, [x23], #0x8\n" "str d28, [x22], #0x8\n" - "tbz x15, #2, 112f\n" + "tbz x16, #2, 108f\n" "st1 { v16.s }[2], [x13], #0x4\n" "st1 { v20.s }[2], [x24], #0x4\n" "st1 { v24.s }[2], [x23], #0x4\n" "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x15, #1, 111f\n" + "tbz x16, #1, 107f\n" "st1 { v16.h }[6], [x13], #0x2\n" "st1 { v20.h }[6], [x24], #0x2\n" "st1 { v24.h }[6], [x23], #0x2\n" "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[14], [x13]\n" "st1 { v20.b }[14], [x24]\n" "st1 { v24.b }[14], [x23]\n" "st1 { v28.b }[14], [x22]\n" - "b 118f\n" - "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 118f\n" + "b 114f\n" + "107:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 114f\n" "st1 { v16.b }[12], [x13]\n" "st1 { v20.b }[12], [x24]\n" "st1 { v24.b }[12], [x23]\n" "st1 { v28.b }[12], [x22]\n" - "b 118f\n" - "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 113f\n" + "b 114f\n" + "108:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 109f\n" "st1 { v16.h }[4], [x13], #0x2\n" "st1 { v20.h }[4], [x24], #0x2\n" "st1 { v24.h }[4], [x23], #0x2\n" "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[10], [x13]\n" "st1 { v20.b 
}[10], [x24]\n" "st1 { v24.b }[10], [x23]\n" "st1 { v28.b }[10], [x22]\n" - "b 118f\n" - "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 118f\n" + "b 114f\n" + "109:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 114f\n" "st1 { v16.b }[8], [x13]\n" "st1 { v20.b }[8], [x24]\n" "st1 { v24.b }[8], [x23]\n" "st1 { v28.b }[8], [x22]\n" - "b 118f\n" - "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, #2, 116f\n" + "b 114f\n" + "110:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 112f\n" "str s16, [x13], #0x4\n" "str s20, [x24], #0x4\n" "str s24, [x23], #0x4\n" "str s28, [x22], #0x4\n" - "tbz x15, #1, 115f\n" + "tbz x16, #1, 111f\n" "st1 { v16.h }[2], [x13], #0x2\n" "st1 { v20.h }[2], [x24], #0x2\n" "st1 { v24.h }[2], [x23], #0x2\n" "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[6], [x13]\n" "st1 { v20.b }[6], [x24]\n" "st1 { v24.b }[6], [x23]\n" "st1 { v28.b }[6], [x22]\n" - "b 118f\n" - "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 118f\n" + "b 114f\n" + "111:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 114f\n" "st1 { v16.b }[4], [x13]\n" "st1 { v20.b }[4], [x24]\n" "st1 { v24.b }[4], [x23]\n" "st1 { v28.b }[4], [x22]\n" - "b 118f\n" - "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 117f\n" + "b 114f\n" + "112:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 113f\n" "str h16, [x13], #0x2\n" "str h20, [x24], #0x2\n" "str h24, [x23], #0x2\n" "str h28, [x22], #0x2\n" - "tbz x15, #0, 118f\n" + "tbz x16, #0, 114f\n" "st1 { v16.b }[2], [x13]\n" "st1 { v20.b }[2], [x24]\n" "st1 { v24.b }[2], [x23]\n" "st1 { v28.b }[2], [x22]\n" - "b 118f\n" - "117:" // Height 4: Partial direct writeback: partial_1_0 + "b 114f\n" + "113:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x13, #0x0]\n" "str b20, [x24, #0x0]\n" "str b24, [x23, #0x0]\n" 
"str b28, [x22, #0x0]\n" - "118:" // Height 4: Partial direct writeback: Done - "b 120f\n" - "119:" // Height 4: Full writeback + "114:" // Height 4: Partial direct writeback: Done + "b 116f\n" + "115:" // Height 4: Full writeback "str q16, [x13, #0x0]\n" "add x13, x13, #0x10\n" "str q20, [x24, #0x0]\n" "str q24, [x23, #0x0]\n" "str q28, [x22, #0x0]\n" - "120:" // Height 4: Writeback done - "subs x15, x15, #0x10\n" - "bgt 92b\n" + "116:" // Height 4: Writeback done + "subs x16, x16, #0x10\n" + "bgt 89b\n" "subs %x[M], %x[M], #0x4\n" - "beq 122f\n" + "beq 118f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 121f\n" + "tbz %x[flags], #3, 117f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "121:" // Update direct input + "117:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "122:" // Exit + "118:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp index 23315f3c0c..15036891c9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,29 +73,25 @@ void a64_hybrid_u8qa_dot_4x16 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 91f\n" + "bge 88f\n" "cmp %x[M], #0x2\n" - "bgt 61f\n" - "beq 31f\n" - "mov x10, %x[col_bias]\n" + "bgt 59f\n" + "beq 30f\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -116,87 +111,87 @@ void a64_hybrid_u8qa_dot_4x16 ( "cmp x25, #0x10\n" "blt 11f\n" "ldr q0, [x24, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, 
#0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" "add x24, x24, #0x10\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x28, #0xd0]\n" - ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x28, #0xe0]\n" - ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" - ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" - ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 
0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q0, [x24, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" "cmp x25, #0x20\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x28, #0xd0]\n" - ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x28, #0xe0]\n" - ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" - ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x6f80eb13 
// udot v19.4s, v24.16b, v0.4b[2]\n" - ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum @@ -210,17 +205,17 @@ void a64_hybrid_u8qa_dot_4x16 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q23, [x28, #0x0]\n" - "ldr q22, [x28, #0x10]\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q21, [x28, #0x20]\n" - "ldr q20, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n" - ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: 
Multiply loop: Skip odd blocks "cbz x25, 18f\n" @@ -235,15 +230,15 @@ void a64_hybrid_u8qa_dot_4x16 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q23, [x28, #0x0]\n" - "ldr q22, [x28, #0x10]\n" - "ldr q21, [x28, #0x20]\n" - "ldr q20, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n" - ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -253,136 +248,122 @@ void a64_hybrid_u8qa_dot_4x16 ( "tbnz %x[flags], #31, 19f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v20.4s }, [x20]\n" - "neg v20.4s, v20.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v20.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q24, [x10, #0x0]\n" - "ldr q23, [x10, #0x10]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q22, [x10, #0x20]\n" - "ldr q21, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v24.4s\n" - "add v17.4s, v17.4s, v23.4s\n" + 
"add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add x10, x10, #0x40\n" + "ld1r { v4.4s }, [x21]\n" + "add v16.4s, v16.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "sqrdmulh v16.4s, v16.4s, v20.4s\n" - "sqrdmulh v17.4s, v17.4s, v20.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "sqrdmulh v19.4s, v19.4s, v20.4s\n" - "tbz %x[flags], #5, 20f\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v22.16b, v17.16b, v0.16b\n" - "and v21.16b, v18.16b, v0.16b\n" - "and v20.16b, v19.16b, v0.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "20:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "add x28, x28, #0x40\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x21]\n" - "ld1r { v21.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v16.4s, 
v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 29f\n" - "tbz x9, #3, 24f\n" + "bge 28f\n" + "tbz x10, #3, 23f\n" "str d16, [x27], #0x8\n" - "tbz x9, #2, 22f\n" + "tbz x10, #2, 21f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "tbz x9, #1, 21f\n" + "tbz x10, #1, 20f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[14], [x27]\n" - "b 28f\n" - "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x9, #0, 28f\n" + "b 27f\n" + "20:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 27f\n" "st1 { v16.b }[12], [x27]\n" - "b 28f\n" - "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x9, #1, 23f\n" + "b 27f\n" + "21:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 22f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[10], [x27]\n" - "b 28f\n" - "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x9, #0, 28f\n" + "b 27f\n" + "22:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 27f\n" "st1 { v16.b }[8], [x27]\n" - "b 28f\n" - "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x9, #2, 26f\n" + "b 27f\n" + "23:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 25f\n" "str s16, [x27], #0x4\n" - "tbz x9, #1, 25f\n" + "tbz x10, #1, 24f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 
27f\n" "st1 { v16.b }[6], [x27]\n" - "b 28f\n" - "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x9, #0, 28f\n" + "b 27f\n" + "24:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 27f\n" "st1 { v16.b }[4], [x27]\n" - "b 28f\n" - "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x9, #1, 27f\n" + "b 27f\n" + "25:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 26f\n" "str h16, [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[2], [x27]\n" - "b 28f\n" - "27:" // Height 1: Partial direct writeback: partial_1_0 + "b 27f\n" + "26:" // Height 1: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "28:" // Height 1: Partial direct writeback: Done - "b 30f\n" - "29:" // Height 1: Full writeback + "27:" // Height 1: Partial direct writeback: Done + "b 29f\n" + "28:" // Height 1: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "30:" // Height 1: Writeback done - "subs x9, x9, #0x10\n" + "29:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" "bgt 2b\n" - "b 122f\n" - "31:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 118f\n" + "30:" // Height 2 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "32:" // Height 2: Column loop + "31:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -391,416 +372,389 @@ void a64_hybrid_u8qa_dot_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "33:" // Height 2: setup done "mov x26, #0x0\n" - "34:" // Height 2: String loop + "33:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, 
[%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 35f\n" + "tbz %x[flags], #3, 34f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 36f\n" + "cbnz x26, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 36f\n" - "35:" // Height 2: setup direct input + "b 35f\n" + "34:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "36:" // Height 2: input setup done + "35:" // Height 2: input setup done "cmp x25, #0x10\n" - "blt 41f\n" + "blt 40f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 39f\n" - "37:" // Height 2: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 38f\n" + "36:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x24, x24, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x23, x23, #0x10\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" 
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 38f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 
0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 37f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "38:" // Height 2: Multiply loop: unique 5: skip row sum + "37:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "bge 37b\n" - "39:" // Height 2: Multiply loop: Single iteration only + "bge 36b\n" + "38:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "sub x25, x25, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x24, x24, #0x10\n" ".inst 0x6f80e0d2 // udot v18.4s, 
v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" "add x23, x23, #0x10\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" 
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 6: skip row sum + "39:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "41:" // Height 2: Multiply loop: Main loop skip - "cbz x25, 48f\n" + "40:" // Height 2: Multiply loop: Main loop skip + "cbz x25, 47f\n" "cmp x25, #0x4\n" - "blt 44f\n" - "42:" // Height 2: Multiply loop: Odd block loop + "blt 43f\n" + "41:" // Height 2: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" - "tbnz %x[flags], #31, 43f\n" + "tbnz %x[flags], #31, 42f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q27, [x28, #0x0]\n" - "ldr q26, [x28, #0x10]\n" + "42:" // Height 2: Multiply 
loop: unique 7: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q25, [x28, #0x20]\n" - "ldr q24, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n" - ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" - "bge 42b\n" - "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x25, 48f\n" - "tbz x25, #1, 45f\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "bge 41b\n" + "43:" // Height 2: Multiply loop: Skip odd blocks + "cbz x25, 47f\n" + "tbz x25, #1, 44f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" - "tbz x25, #0, 46f\n" + "tbz x25, #0, 45f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" - "b 46f\n" - "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 45f\n" + "44:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" - "46:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 47f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 46f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" 
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q27, [x28, #0x0]\n" - "ldr q26, [x28, #0x10]\n" - "ldr q25, [x28, #0x20]\n" - "ldr q24, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n" - ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" - "48:" // Height 2: Multiply loop: No odd multiplies + "46:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + "47:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 34b\n" + "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" - "tbnz %x[flags], #31, 49f\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbnz %x[flags], #31, 48f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { 
v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "49:" // Height 2: skip row sum fixup - "ldr q28, [x10, #0x0]\n" - "ldr q27, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "48:" // Height 2: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q26, [x10, #0x20]\n" - "ldr q25, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v24.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add x10, x10, #0x40\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v25.4s\n" - "add v20.4s, v20.4s, v28.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v21.4s, v21.4s, v27.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v25.4s\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "tbz %x[flags], #5, 50f\n" - "and v24.16b, v16.16b, v0.16b\n" - "and v30.16b, v17.16b, v0.16b\n" - "and v29.16b, v18.16b, v0.16b\n" - "and v28.16b, v19.16b, v0.16b\n" - "and v27.16b, v20.16b, v0.16b\n" - "and v26.16b, v21.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v25.16b, v22.16b, v0.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, 
v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v24.4s\n" - "and v24.16b, v23.16b, v0.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "50:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "add x22, %x[qp], %[c_offset]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v21.4s, v21.4s, v1.4s\n" + "cmp x10, #0x10\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add x28, x28, #0x40\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, 
v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 59f\n" - "tbz x9, #3, 54f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 57f\n" + "tbz x10, #3, 52f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "tbz x9, #2, 52f\n" + "str d20, [x26], #0x8\n" + "tbz x10, #2, 50f\n" "st1 { 
v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "tbz x9, #1, 51f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "tbz x10, #1, 49f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "b 58f\n" - "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[14], [x26]\n" + "b 56f\n" + "49:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 56f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "b 58f\n" - "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x9, #1, 53f\n" + "st1 { v20.b }[12], [x26]\n" + "b 56f\n" + "50:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 51f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "b 58f\n" - "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[10], [x26]\n" + "b 56f\n" + "51:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 56f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "b 58f\n" - "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x9, #2, 56f\n" + "st1 { v20.b }[8], [x26]\n" + "b 56f\n" + "52:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 54f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "tbz x9, #1, 55f\n" + "str s20, [x26], #0x4\n" + "tbz x10, #1, 53f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "b 58f\n" - "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x9, #0, 58f\n" + "st1 { v20.b 
}[6], [x26]\n" + "b 56f\n" + "53:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 56f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "b 58f\n" - "56:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x9, #1, 57f\n" + "st1 { v20.b }[4], [x26]\n" + "b 56f\n" + "54:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 55f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "str h20, [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "b 58f\n" - "57:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "b 56f\n" + "55:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "58:" // Height 2: Partial direct writeback: Done - "b 60f\n" - "59:" // Height 2: Full writeback + "str b20, [x26, #0x0]\n" + "56:" // Height 2: Partial direct writeback: Done + "b 58f\n" + "57:" // Height 2: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "60:" // Height 2: Writeback done - "subs x9, x9, #0x10\n" - "bgt 32b\n" - "b 122f\n" - "61:" // Height 3 - "mov x10, %x[col_bias]\n" + "str q20, [x26, #0x0]\n" + "58:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 31b\n" + "b 118f\n" + "59:" // Height 3 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "62:" // Height 3: Column loop + "60:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -813,542 +767,503 @@ void a64_hybrid_u8qa_dot_4x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" 
"movi v27.4s, #0x0\n" - "63:" // Height 3: setup done "mov x26, #0x0\n" - "64:" // Height 3: String loop + "62:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 65f\n" + "tbz %x[flags], #3, 63f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 66f\n" + "cbnz x26, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 66f\n" - "65:" // Height 3: setup direct input + "b 64f\n" + "63:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "66:" // Height 3: input setup done + "64:" // Height 3: input setup done "cmp x25, #0x10\n" - "blt 71f\n" + "blt 69f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q2, [x22, #0x0]\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 69f\n" - "67:" // Height 3: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 67f\n" + "65:" // Height 3: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - 
"ldr q28, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x28, #0xd0]\n" - ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x28, #0xe0]\n" - ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x28, #0xf0]\n" - ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" - "add x28, x28, #0x100\n" - ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6f82e87b 
// udot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 68f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x9, x9, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // 
udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 66f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "68:" // Height 3: Multiply loop: unique 9: skip row sum + "66:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q2, [x22, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 67b\n" - "69:" // Height 3: Multiply loop: Single iteration only + "bge 65b\n" + "67:" // Height 3: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x22, x22, #0x10\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 
0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x28, #0xd0]\n" - ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x28, #0xe0]\n" - ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x28, #0xf0]\n" - ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" - "add x28, x28, #0x100\n" - ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x6fa1ebf4 
// udot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 70f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x9, x9, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // 
udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "70:" // Height 3: Multiply loop: unique 10: skip row sum + "68:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "71:" // Height 3: Multiply loop: Main loop skip - "cbz x25, 78f\n" + "69:" // Height 3: Multiply loop: Main loop skip + "cbz x25, 76f\n" "cmp x25, #0x4\n" - "blt 74f\n" - "72:" // Height 3: Multiply loop: Odd block loop + "blt 72f\n" + "70:" // Height 3: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" "ldr s2, [x22], #0x4\n" - "tbnz %x[flags], #31, 73f\n" + "tbnz %x[flags], #31, 71f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q31, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "71:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" - ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 
0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" - "bge 72b\n" - "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x25, 78f\n" - "tbz x25, #1, 75f\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Skip odd blocks + "cbz x25, 76f\n" + "tbz x25, #1, 73f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" "ldr h2, [x22], #0x2\n" - "tbz x25, #0, 76f\n" + "tbz x25, #0, 74f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" "ld1 { v2.b }[2], [x22]\n" - "b 76f\n" - "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 74f\n" + "73:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" "ldr b2, [x22, #0x0]\n" - "76:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 77f\n" + "74:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 75f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot 
v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q31, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" - ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" - "78:" // Height 3: Multiply loop: No odd multiplies + "75:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + "76:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 64b\n" + "bne 62b\n" 
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 79f\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 77f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v28.4s }, [x20]\n" + "ld1r { v3.4s }, [x20]\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v28.4s, v28.4s\n" + "neg v3.4s, v3.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v28.4s\n" - "mul v12.4s, v12.4s, v28.4s\n" - "mul v13.4s, v13.4s, v28.4s\n" - "79:" // Height 3: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q31, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "77:" // Height 3: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q30, [x10, #0x20]\n" - "ldr q29, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v28.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add 
v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v31.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v29.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x28, x28, #0x40\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v31.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v29.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v31.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "sqrdmulh v16.4s, v16.4s, v28.4s\n" - "sqrdmulh v17.4s, v17.4s, v28.4s\n" - "sqrdmulh v18.4s, v18.4s, v28.4s\n" - "sqrdmulh v19.4s, v19.4s, v28.4s\n" - "sqrdmulh v20.4s, v20.4s, v28.4s\n" - "sqrdmulh v21.4s, v21.4s, v28.4s\n" - "sqrdmulh v22.4s, v22.4s, v28.4s\n" - "sqrdmulh v23.4s, v23.4s, v28.4s\n" - "sqrdmulh v24.4s, v24.4s, v28.4s\n" - "sqrdmulh v25.4s, v25.4s, v28.4s\n" - "sqrdmulh v26.4s, v26.4s, v28.4s\n" - "sqrdmulh v27.4s, v27.4s, v28.4s\n" - "tbz %x[flags], #5, 80f\n" - "and v1.16b, v16.16b, v0.16b\n" - "and v31.16b, v17.16b, v0.16b\n" - "and v30.16b, v18.16b, v0.16b\n" - "and v29.16b, v19.16b, v0.16b\n" - "and v28.16b, v20.16b, v0.16b\n" - "and v3.16b, v21.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v2.16b, v22.16b, v0.16b\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "sqadd v18.4s, v18.4s, v30.4s\n" - "sqadd v19.4s, v19.4s, v29.4s\n" - "sqadd v20.4s, v20.4s, v28.4s\n" - "and v1.16b, v23.16b, v0.16b\n" - "and v31.16b, v24.16b, v0.16b\n" - "and v30.16b, v25.16b, v0.16b\n" - "and v29.16b, v26.16b, v0.16b\n" - "and v28.16b, v27.16b, v0.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - 
"sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v3.4s\n" - "sqadd v22.4s, v22.4s, v2.4s\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "80:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x21]\n" - "ld1r { v29.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v28.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v16.4s, v16.4s, v30.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, 
v27.4s, v30.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v16.4s, v16.4s, v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - "smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, 
v27.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 89f\n" - "tbz x9, #3, 84f\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 86f\n" + "tbz x10, #3, 81f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x9, #2, 82f\n" + "str d20, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x10, #2, 79f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x9, #1, 81f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x10, #1, 78f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 88f\n" - "81:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "b 85f\n" + "78:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 85f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 88f\n" - "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x9, #1, 83f\n" + "st1 { v20.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "b 85f\n" + "79:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 80f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { 
v24.h }[4], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 88f\n" - "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "b 85f\n" + "80:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 85f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 88f\n" - "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x9, #2, 86f\n" + "st1 { v20.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "b 85f\n" + "81:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 83f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x9, #1, 85f\n" + "str s20, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x10, #1, 82f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 88f\n" - "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "b 85f\n" + "82:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 85f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 88f\n" - "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x9, #1, 87f\n" + "st1 { v20.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "b 85f\n" + "83:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 84f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "str 
h20, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 88f\n" - "87:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "b 85f\n" + "84:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "88:" // Height 3: Partial direct writeback: Done - "b 90f\n" - "89:" // Height 3: Full writeback + "str b20, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "85:" // Height 3: Partial direct writeback: Done + "b 87f\n" + "86:" // Height 3: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "90:" // Height 3: Writeback done - "subs x9, x9, #0x10\n" - "bgt 62b\n" - "b 122f\n" - "91:" // Height 4 + "str q20, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "87:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 60b\n" + "b 118f\n" + "88:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "movi v15.16b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "92:" // Height 4: Column loop + "89:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1365,98 +1280,97 @@ void a64_hybrid_u8qa_dot_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" 
- "93:" // Height 4: setup done "mov x26, #0x0\n" - "94:" // Height 4: String loop + "91:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 95f\n" + "tbz %x[flags], #3, 92f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 96f\n" + "cbnz x26, 93f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 96f\n" - "95:" // Height 4: setup direct input + "b 93f\n" + "92:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "96:" // Height 4: input setup done + "93:" // Height 4: input setup done "cmp x25, #0x10\n" - "blt 101f\n" + "blt 98f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q2, [x22, #0x0]\n" "ldr q3, [x21, #0x0]\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 99f\n" - "97:" // Height 4: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 96f\n" + "94:" // Height 4: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x22, x22, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, 
v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" "add x21, x21, #0x10\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" + "ldr q10, [x9, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" + "ldr q4, [x9, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 
0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" @@ -1485,38 +1399,38 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 98f\n" + "tbnz %x[flags], #31, 95f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "98:" // Height 4: Multiply loop: unique 13: skip row sum + "95:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q2, [x22, #0x0]\n" "ldr q3, [x21, #0x0]\n" "cmp x25, #0x20\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "bge 97b\n" - "99:" // Height 4: Multiply loop: Single iteration only + "bge 94b\n" + "96:" // Height 4: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" ".inst 0x6f83e09c // 
udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x23, x23, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" @@ -1524,43 +1438,43 @@ void a64_hybrid_u8qa_dot_4x16 ( "add x21, x21, #0x10\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" + "ldr q10, [x9, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x6fa3e09f // udot 
v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" + "ldr q4, [x9, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" @@ -1589,252 +1503,202 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 100f\n" + "tbnz %x[flags], #31, 97f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "100:" // Height 4: Multiply loop: unique 14: skip row sum + "97:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "101:" // Height 4: Multiply loop: Main loop skip - "cbz x25, 108f\n" + "98:" // Height 4: Multiply loop: Main loop skip + "cbz x25, 105f\n" "cmp x25, #0x4\n" - "blt 104f\n" - "102:" // Height 4: Multiply loop: Odd block loop + "blt 101f\n" + "99:" // Height 4: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" "ldr s2, [x22], #0x4\n" "ldr s3, [x21], #0x4\n" - "tbnz %x[flags], #31, 103f\n" + "tbnz %x[flags], #31, 100f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "103:" // Height 4: 
Multiply loop: unique 15: skip row sum - "ldr q7, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" + "100:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q5, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" - "bge 102b\n" - "104:" // Height 4: Multiply loop: Skip odd blocks - "cbz x25, 108f\n" - "tbz x25, #1, 105f\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11a // udot 
v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + "bge 99b\n" + "101:" // Height 4: Multiply loop: Skip odd blocks + "cbz x25, 105f\n" + "tbz x25, #1, 102f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" "ldr h2, [x22], #0x2\n" "ldr h3, [x21], #0x2\n" - "tbz x25, #0, 106f\n" + "tbz x25, #0, 103f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" "ld1 { v2.b }[2], [x22]\n" "ld1 { v3.b }[2], [x21]\n" - "b 106f\n" - "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 103f\n" + "102:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" "ldr b2, [x22, #0x0]\n" "ldr b3, [x21, #0x0]\n" - "106:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 107f\n" + "103:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 104f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q7, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" + "104:" // Height 4: 
Multiply loop: unique 16: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n" ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" - "108:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n" + "105:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 94b\n" + "bne 91b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add x22, x23, x20\n" - "prfm pstl1keep, [x22, #0x0]\n" - "tbnz %x[flags], #31, 109f\n" + "tbnz %x[flags], #31, 106f\n" "add x20, %x[qp], 
%[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "neg v0.4s, v0.4s\n" + "neg v4.4s, v4.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "109:" // Height 4: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q4, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "106:" // Height 4: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q3, [x10, #0x20]\n" - "ldr q2, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v28.4s, v28.4s, v14.4s\n" "add v29.4s, v29.4s, v14.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" + "add x28, x28, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add 
v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v2.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v23.4s, v23.4s, v2.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v2.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v31.4s, v31.4s, v2.4s\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "tbz %x[flags], #5, 110f\n" - "and v2.16b, v16.16b, v0.16b\n" - "and v1.16b, v17.16b, v0.16b\n" - "and v7.16b, v18.16b, v0.16b\n" - "and v6.16b, v19.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v4.16b, v21.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v3.16b, v22.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v2.4s\n" - "sqadd v17.4s, v17.4s, v1.4s\n" - "and v2.16b, v23.16b, v0.16b\n" - "and v1.16b, v24.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v7.4s\n" - 
"sqadd v19.4s, v19.4s, v6.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "sqadd v22.4s, v22.4s, v3.4s\n" - "and v7.16b, v25.16b, v0.16b\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "sqadd v24.4s, v24.4s, v1.4s\n" - "and v6.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v3.16b, v29.16b, v0.16b\n" - "and v2.16b, v30.16b, v0.16b\n" - "and v1.16b, v31.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v7.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v3.4s\n" - "sqadd v30.4s, v30.4s, v2.4s\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "110:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v3.4s }, [x21]\n" - "ld1r { v2.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, 
v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v1.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" @@ -1845,178 +1709,178 @@ void a64_hybrid_u8qa_dot_4x16 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" - "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - "smin v29.4s, v29.4s, v2.4s\n" - "smin v30.4s, v30.4s, v2.4s\n" - "smin v31.4s, v31.4s, v2.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - "smax v29.4s, v29.4s, v1.4s\n" - "smax v30.4s, v30.4s, v1.4s\n" - "smax v31.4s, v31.4s, v1.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, 
v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - 
"uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 119f\n" - "tbz x9, #3, 114f\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 115f\n" + "tbz x10, #3, 110f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x22], #0x8\n" - "tbz x9, #2, 112f\n" + "str d20, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "tbz x10, #2, 108f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x9, #1, 111f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "st1 { v28.s }[2], [x24], #0x4\n" + "tbz x10, #1, 107f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "st1 { v28.h }[6], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b }[14], [x22]\n" - "b 118f\n" - "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "st1 { v28.b }[14], [x24]\n" + "b 114f\n" + "107:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 114f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x22]\n" - "b 118f\n" - "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x9, #1, 113f\n" + "st1 { v20.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "st1 { v28.b }[12], [x24]\n" + "b 114f\n" + "108:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 109f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], 
[x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "st1 { v28.h }[4], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x22]\n" - "b 118f\n" - "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "st1 { v28.b }[10], [x24]\n" + "b 114f\n" + "109:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 114f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "st1 { v28.b }[8], [x22]\n" - "b 118f\n" - "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x9, #2, 116f\n" + "st1 { v20.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "st1 { v28.b }[8], [x24]\n" + "b 114f\n" + "110:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 112f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x22], #0x4\n" - "tbz x9, #1, 115f\n" + "str s20, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "str s28, [x24], #0x4\n" + "tbz x10, #1, 111f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "st1 { v28.h }[2], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "st1 { v28.b }[6], [x22]\n" - "b 118f\n" - "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "st1 { v28.b }[6], [x24]\n" + "b 114f\n" + "111:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 114f\n" "st1 { v16.b 
}[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x22]\n" - "b 118f\n" - "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x9, #1, 117f\n" + "st1 { v20.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "st1 { v28.b }[4], [x24]\n" + "b 114f\n" + "112:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 113f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "str h28, [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "str h20, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "str h28, [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x22]\n" - "b 118f\n" - "117:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "st1 { v28.b }[2], [x24]\n" + "b 114f\n" + "113:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x22, #0x0]\n" - "118:" // Height 4: Partial direct writeback: Done - "b 120f\n" - "119:" // Height 4: Full writeback + "str b20, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "str b28, [x24, #0x0]\n" + "114:" // Height 4: Partial direct writeback: Done + "b 116f\n" + "115:" // Height 4: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x22, #0x0]\n" - "120:" // Height 4: Writeback done - "subs x9, x9, #0x10\n" - "bgt 92b\n" + "str q20, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "116:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 89b\n" "subs %x[M], %x[M], #0x4\n" - "beq 122f\n" + "beq 118f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 121f\n" + "tbz %x[flags], #3, 117f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - 
"121:" // Update direct input + "117:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "122:" // Exit + "118:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp index 2d3af7f9c3..ae67025d64 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,22 +73,19 @@ void a64_hybrid_u8qa_mmla_4x16 ( ka.string_lengths = string_lengths; 
ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 97f\n" + "bge 94f\n" "cmp %x[M], #0x2\n" - "bgt 65f\n" - "beq 33f\n" - "mov x10, %x[col_bias]\n" + "bgt 63f\n" + "beq 32f\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" @@ -100,7 +96,6 @@ void a64_hybrid_u8qa_mmla_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -120,92 +115,92 @@ void a64_hybrid_u8qa_mmla_4x16 ( "cmp x25, #0x10\n" "blt 11f\n" "ldr q1, [x24, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head "add x24, x24, #0x10\n" - "trn1 v0.2d, v1.2d, v27.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v27.2d\n" + "ldr q5, [x9, #0x70]\n" + "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, 
v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" - ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" - ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" - ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" - ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" - ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q1, [x24, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" "cmp x25, #0x20\n" - "ldr q8, [x28, #0x30]\n" - 
"ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "trn1 v0.2d, v1.2d, v24.2d\n" - "trn2 v1.2d, v1.2d, v24.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" - ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" - ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" - ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" - ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" - ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x6e87a434 // ummla v20.4s, 
v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" @@ -216,30 +211,30 @@ void a64_hybrid_u8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr d25, [x24], #0x8\n" - "trn1 v0.2d, v25.2d, v24.2d\n" + "ldr d1, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n" - ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n" - ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n" - ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n" - ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" - ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, 
v10.16b\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x25, 20f\n" @@ -263,27 +258,27 @@ void a64_hybrid_u8qa_mmla_4x16 ( "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "18:" // Height 1: Multiply loop: Ragged operand read: Done - "trn1 v0.2d, v1.2d, v24.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 19f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "19:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n" - ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n" - ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n" - ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n" - ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" - ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, 
v7.16b\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" "20:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -298,136 +293,122 @@ void a64_hybrid_u8qa_mmla_4x16 ( "tbnz %x[flags], #31, 21f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v16.4s }, [x20]\n" - "neg v16.4s, v16.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v16.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "21:" // Height 1: skip row sum fixup - "ldr q24, [x10, #0x0]\n" - "ldr q22, [x10, #0x10]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q21, [x10, #0x20]\n" - "ldr q20, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v16.4s }, [x20]\n" - "add v23.4s, v23.4s, v24.4s\n" - "add v17.4s, v17.4s, v22.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add x10, x10, #0x40\n" + "ld1r { v4.4s }, [x21]\n" + "add v23.4s, v23.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v20.4s\n" - "sqrdmulh v23.4s, v23.4s, v16.4s\n" - "sqrdmulh v17.4s, v17.4s, v16.4s\n" - "sqrdmulh v18.4s, v18.4s, v16.4s\n" - "sqrdmulh v19.4s, v19.4s, v16.4s\n" - "tbz %x[flags], #5, 22f\n" - "and v22.16b, v23.16b, v0.16b\n" - "and v21.16b, v17.16b, v0.16b\n" - "and v20.16b, v18.16b, v0.16b\n" - "and v16.16b, v19.16b, v0.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v22.4s\n" - "sqadd v17.4s, v17.4s, v21.4s\n" - 
"sqadd v18.4s, v18.4s, v20.4s\n" - "sqadd v19.4s, v19.4s, v16.4s\n" - "22:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "add x28, x28, #0x40\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v21.4s }, [x21]\n" - "ld1r { v20.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v16.4s }, [x20]\n" - "add v23.4s, v23.4s, v21.4s\n" - "add v17.4s, v17.4s, v21.4s\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "smin v23.4s, v23.4s, v20.4s\n" - "smin v17.4s, v17.4s, v20.4s\n" - "smin v18.4s, v18.4s, v20.4s\n" - "smin v19.4s, v19.4s, v20.4s\n" - "smax v23.4s, v23.4s, v16.4s\n" - "smax v17.4s, v17.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smax v19.4s, v19.4s, v16.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v17.8h\n" - "uzp1 v16.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v16.16b\n" - "bge 31f\n" - "tbz x9, #3, 26f\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v17.16b\n" + "bge 30f\n" + "tbz x10, #3, 25f\n" "str d23, [x27], #0x8\n" - "tbz 
x9, #2, 24f\n" + "tbz x10, #2, 23f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "tbz x9, #1, 23f\n" + "tbz x10, #1, 22f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[14], [x27]\n" - "b 30f\n" - "23:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x9, #0, 30f\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 29f\n" "st1 { v23.b }[12], [x27]\n" - "b 30f\n" - "24:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x9, #1, 25f\n" + "b 29f\n" + "23:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 24f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[10], [x27]\n" - "b 30f\n" - "25:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x9, #0, 30f\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 29f\n" "st1 { v23.b }[8], [x27]\n" - "b 30f\n" - "26:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x9, #2, 28f\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 27f\n" "str s23, [x27], #0x4\n" - "tbz x9, #1, 27f\n" + "tbz x10, #1, 26f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[6], [x27]\n" - "b 30f\n" - "27:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x9, #0, 30f\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 29f\n" "st1 { v23.b }[4], [x27]\n" - "b 30f\n" - "28:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x9, #1, 29f\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 28f\n" "str h23, [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[2], [x27]\n" - "b 30f\n" - "29:" // Height 1: Partial direct writeback: partial_1_0 + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "30:" // 
Height 1: Partial direct writeback: Done - "b 32f\n" - "31:" // Height 1: Full writeback + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "32:" // Height 1: Writeback done - "subs x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" "bgt 2b\n" - "b 130f\n" - "33:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 126f\n" + "32:" // Height 2 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "34:" // Height 2: Column loop + "33:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -436,420 +417,393 @@ void a64_hybrid_u8qa_mmla_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "35:" // Height 2: setup done "mov x26, #0x0\n" - "36:" // Height 2: String loop + "35:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 37f\n" + "tbz %x[flags], #3, 36f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 38f\n" + "cbnz x26, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 38f\n" - "37:" // Height 2: setup direct input + "b 37f\n" + "36:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "38:" // Height 2: input setup done + "37:" // Height 2: input setup done "cmp x25, #0x10\n" - "blt 43f\n" + "blt 42f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, 
#0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" - "blt 41f\n" - "39:" // Height 2: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" + "blt 40f\n" + "38:" // Height 2: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" - ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" - ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" - ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" - ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" - ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q4, [x9, 
#0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 5: skip row sum + "39:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" "cmp x25, #0x20\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "bge 39b\n" - "41:" // Height 2: Multiply loop: Single iteration only + "bge 38b\n" + "40:" // Height 2: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, 
v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" - ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" - ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" - ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" - ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" - ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" - "tbnz %x[flags], #31, 42f\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 41f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" - "42:" // Height 2: Multiply loop: unique 6: skip row sum + "41:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "43:" // Height 2: Multiply loop: Main loop skip - "cbz x25, 52f\n" + "42:" // Height 2: 
Multiply loop: Main loop skip + "cbz x25, 51f\n" "cmp x25, #0x8\n" - "blt 46f\n" - "44:" // Height 2: Multiply loop: Odd block loop - "ldr d25, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "trn1 v0.2d, v25.2d, v24.2d\n" - "tbnz %x[flags], #31, 45f\n" + "blt 45f\n" + "43:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 44f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "44:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n" - ".inst 0x6e9da411 // ummla v17.4s, v0.16b, v29.16b\n" - ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n" - ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n" - ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" - ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" - "bge 44b\n" - "46:" // Height 2: Multiply loop: Skip odd blocks - "cbz x25, 52f\n" - "tbz x25, #2, 48f\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, 
v6.16b\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + "bge 43b\n" + "45:" // Height 2: Multiply loop: Skip odd blocks + "cbz x25, 51f\n" + "tbz x25, #2, 47f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" - "tbz x25, #1, 47f\n" + "tbz x25, #1, 46f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" - "tbz x25, #0, 50f\n" + "tbz x25, #0, 49f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" - "b 50f\n" - "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 50f\n" + "b 49f\n" + "46:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 49f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" - "b 50f\n" - "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 49f\n" + "b 49f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 48f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" - "tbz x25, #0, 50f\n" + "tbz x25, #0, 49f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" - "b 50f\n" - "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 49f\n" + "48:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" - "50:" // Height 2: Multiply loop: Ragged operand read: Done + "49:" // Height 2: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "tbnz %x[flags], #31, 51f\n" + "tbnz %x[flags], #31, 50f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "51:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x6e9ea414 // ummla v20.4s, v0.16b, v30.16b\n" - ".inst 0x6e9da411 
// ummla v17.4s, v0.16b, v29.16b\n" - ".inst 0x6e9ca415 // ummla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n" - ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n" - ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" - ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" - "52:" // Height 2: Multiply loop: No odd multiplies + "50:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + "51:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 36b\n" + "bne 35b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v24.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" + "add x26, x27, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "mov v23.16b, v24.16b\n" - "tbnz %x[flags], #31, 53f\n" + "prfm pstl1keep, [x26, #0x0]\n" + "mov v23.16b, v4.16b\n" + "tbnz %x[flags], #31, 52f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v24.4s }, 
[x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "53:" // Height 2: skip row sum fixup - "ldr q28, [x10, #0x0]\n" - "ldr q27, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "52:" // Height 2: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q26, [x10, #0x20]\n" - "ldr q25, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v24.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v20.4s, v20.4s, v27.4s\n" - "add x10, x10, #0x40\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v25.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v25.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "tbz %x[flags], #5, 54f\n" - "and v24.16b, v23.16b, v0.16b\n" - "and v30.16b, v20.16b, v0.16b\n" - "and v29.16b, v21.16b, v0.16b\n" - "and v28.16b, v22.16b, v0.16b\n" - "and v27.16b, v16.16b, v0.16b\n" - "and v26.16b, v17.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v25.16b, v18.16b, v0.16b\n" - "sshr v30.4s, v30.4s, 
#0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "and v24.16b, v19.16b, v0.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v22.4s, v22.4s, v28.4s\n" - "sqadd v16.4s, v16.4s, v27.4s\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "sqadd v18.4s, v18.4s, v25.4s\n" - "sqadd v19.4s, v19.4s, v24.4s\n" - "54:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "add x22, %x[qp], %[c_offset]\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v17.4s, v17.4s, v1.4s\n" + "cmp x10, #0x10\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add x28, x28, #0x40\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, 
v26.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.16b, v23.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 63f\n" - "tbz x9, #3, 58f\n" + "bge 61f\n" + "tbz x10, #3, 56f\n" "str d23, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "tbz x9, #2, 56f\n" + "str d16, [x26], #0x8\n" + "tbz x10, #2, 54f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "tbz x9, #1, 55f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz 
x10, #1, 53f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "b 62f\n" - "55:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[14], [x26]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 60f\n" "st1 { v23.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "b 62f\n" - "56:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x9, #1, 57f\n" + "st1 { v16.b }[12], [x26]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 55f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "b 62f\n" - "57:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[10], [x26]\n" + "b 60f\n" + "55:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 60f\n" "st1 { v23.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "b 62f\n" - "58:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x9, #2, 60f\n" + "st1 { v16.b }[8], [x26]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 58f\n" "str s23, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "tbz x9, #1, 59f\n" + "str s16, [x26], #0x4\n" + "tbz x10, #1, 57f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "b 62f\n" - "59:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[6], [x26]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 60f\n" "st1 { v23.b }[4], 
[x27]\n" - "st1 { v16.b }[4], [x24]\n" - "b 62f\n" - "60:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x9, #1, 61f\n" + "st1 { v16.b }[4], [x26]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 59f\n" "str h23, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "str h16, [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "b 62f\n" - "61:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "62:" // Height 2: Partial direct writeback: Done - "b 64f\n" - "63:" // Height 2: Full writeback + "str b16, [x26, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "64:" // Height 2: Writeback done - "subs x9, x9, #0x10\n" - "bgt 34b\n" - "b 130f\n" - "65:" // Height 3 - "mov x10, %x[col_bias]\n" + "str q16, [x26, #0x0]\n" + "62:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 33b\n" + "b 126f\n" + "63:" // Height 3 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "66:" // Height 3: Column loop + "64:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -866,43 +820,42 @@ void a64_hybrid_u8qa_mmla_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "67:" // Height 3: setup done "mov x26, #0x0\n" - "68:" // Height 3: String loop + "66:" // Height 3: String 
loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 69f\n" + "tbz %x[flags], #3, 67f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 70f\n" + "cbnz x26, 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 70f\n" - "69:" // Height 3: setup direct input + "b 68f\n" + "67:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "70:" // Height 3: input setup done + "68:" // Height 3: input setup done "cmp x25, #0x10\n" - "blt 75f\n" + "blt 73f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q3, [x22, #0x0]\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "blt 73f\n" - "71:" // Height 3: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "blt 71f\n" + "69:" // Height 3: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" @@ -910,35 +863,35 @@ void a64_hybrid_u8qa_mmla_4x16 ( "add x22, x22, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q14, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a45c // ummla v28.4s, v2.16b, 
v6.16b\n" - "ldr q4, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" - ".inst 0x6e8ea413 // ummla v19.4s, v0.16b, v14.16b\n" - ".inst 0x6e8ea45b // ummla v27.4s, v2.16b, v14.16b\n" - "ldr q6, [x28, #0xd0]\n" + "ldr q10, [x9, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" - ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n" - "ldr q4, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" @@ -947,34 +900,34 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" - ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" ".inst 0x6e85a433 // ummla v19.4s, v1.16b, 
v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" - ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n" - ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n" - "tbnz %x[flags], #31, 72f\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 70f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "72:" // Height 3: Multiply loop: unique 9: skip row sum + "70:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q3, [x22, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 71b\n" - "73:" // Height 3: Multiply loop: Single iteration only + "bge 69b\n" + "71:" // Height 3: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" @@ -983,35 +936,35 @@ void a64_hybrid_u8qa_mmla_4x16 ( "add x22, x22, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q14, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" - 
"ldr q4, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" - ".inst 0x6e8ea413 // ummla v19.4s, v0.16b, v14.16b\n" - ".inst 0x6e8ea45b // ummla v27.4s, v2.16b, v14.16b\n" - "ldr q6, [x28, #0xd0]\n" + "ldr q10, [x9, #0xc0]\n" + ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" - ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n" - "ldr q4, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" @@ -1020,416 +973,378 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" - ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n" + ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" 
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" - ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n" - ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n" - "tbnz %x[flags], #31, 74f\n" + ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 72f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "74:" // Height 3: Multiply loop: unique 10: skip row sum + "72:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "75:" // Height 3: Multiply loop: Main loop skip - "cbz x25, 84f\n" + "73:" // Height 3: Multiply loop: Main loop skip + "cbz x25, 82f\n" "cmp x25, #0x8\n" - "blt 78f\n" - "76:" // Height 3: Multiply loop: Odd block loop - "ldr d3, [x24], #0x8\n" - "ldr d0, [x23], #0x8\n" - "ldr d1, [x22], #0x8\n" - "trn1 v0.2d, v3.2d, v0.2d\n" - "trn1 v2.2d, v1.2d, v2.2d\n" - "tbnz %x[flags], #31, 77f\n" + "blt 76f\n" + "74:" // Height 3: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 75f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" + "75:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x6e81a410 // ummla v16.4s, v0.16b, 
v1.16b\n" - ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" + ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n" ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" - ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" - ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" - ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" - "bge 76b\n" - "78:" // Height 3: Multiply loop: Skip odd blocks - "cbz x25, 84f\n" - "tbz x25, #2, 80f\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n" + "bge 74b\n" + "76:" // Height 3: Multiply loop: Skip odd blocks + "cbz x25, 82f\n" + "tbz x25, #2, 78f\n" 
"ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" "ldr s3, [x22], #0x4\n" - "tbz x25, #1, 79f\n" + "tbz x25, #1, 77f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" "ld1 { v3.h }[2], [x22], #0x2\n" - "tbz x25, #0, 82f\n" + "tbz x25, #0, 80f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" "ld1 { v3.b }[6], [x22]\n" - "b 82f\n" - "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 82f\n" + "b 80f\n" + "77:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 80f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" "ld1 { v3.b }[4], [x22]\n" - "b 82f\n" - "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 81f\n" + "b 80f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 79f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" "ldr h3, [x22], #0x2\n" - "tbz x25, #0, 82f\n" + "tbz x25, #0, 80f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" "ld1 { v3.b }[2], [x22]\n" - "b 82f\n" - "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 80f\n" + "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" "ldr b3, [x22, #0x0]\n" - "82:" // Height 3: Multiply loop: Ragged operand read: Done + "80:" // Height 3: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "tbnz %x[flags], #31, 83f\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 81f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "83:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a458 // ummla v24.4s, 
v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "81:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" - ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" - ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" - ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" - ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" - ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" - "84:" // Height 3: Multiply loop: No odd multiplies + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n" + "82:" // Height 3: Multiply loop: No 
odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 68b\n" + "bne 66b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v0.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" "uzp1 v24.2d, v24.2d, v28.2d\n" "uzp1 v25.2d, v25.2d, v29.2d\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v0.16b\n" - "tbnz %x[flags], #31, 85f\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 83f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "ld1r { v23.4s }, [x20]\n" - "neg v23.4s, v23.4s\n" + "ld1r { v3.4s }, [x20]\n" + "neg v3.4s, v3.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v23.4s\n" - "mul v12.4s, v12.4s, v23.4s\n" - "mul v13.4s, v13.4s, v23.4s\n" - "85:" // Height 3: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q30, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "83:" // Height 3: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q29, [x10, #0x20]\n" - "ldr q28, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], 
%x[flags], #0x80000000\n" - "ld1r { v23.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v28.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add x28, x28, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "sqrdmulh v31.4s, v31.4s, v23.4s\n" - "sqrdmulh v20.4s, v20.4s, v23.4s\n" - "sqrdmulh v21.4s, v21.4s, v23.4s\n" - "sqrdmulh v22.4s, v22.4s, v23.4s\n" - "sqrdmulh v16.4s, v16.4s, v23.4s\n" - "sqrdmulh v17.4s, v17.4s, v23.4s\n" - "sqrdmulh v18.4s, v18.4s, v23.4s\n" - "sqrdmulh v19.4s, v19.4s, v23.4s\n" - "sqrdmulh v24.4s, v24.4s, v23.4s\n" - "sqrdmulh v25.4s, v25.4s, v23.4s\n" - "sqrdmulh v26.4s, v26.4s, v23.4s\n" - "sqrdmulh v27.4s, v27.4s, v23.4s\n" - "tbz %x[flags], #5, 86f\n" - "and v1.16b, v31.16b, v0.16b\n" - "and v30.16b, v20.16b, v0.16b\n" - "and v29.16b, v21.16b, v0.16b\n" - "and v28.16b, v22.16b, v0.16b\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v3.16b, v17.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v30.4s, v30.4s, 
#0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "and v2.16b, v18.16b, v0.16b\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "sqadd v20.4s, v20.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "sqadd v22.4s, v22.4s, v28.4s\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "and v1.16b, v19.16b, v0.16b\n" - "and v30.16b, v24.16b, v0.16b\n" - "and v29.16b, v25.16b, v0.16b\n" - "and v28.16b, v26.16b, v0.16b\n" - "and v23.16b, v27.16b, v0.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v18.4s, v18.4s, v2.4s\n" - "sqadd v19.4s, v19.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v30.4s\n" - "sqadd v25.4s, v25.4s, v29.4s\n" - "sqadd v26.4s, v26.4s, v28.4s\n" - "sqadd v27.4s, v27.4s, v23.4s\n" - "86:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v29.4s }, [x21]\n" - "ld1r { v28.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v23.4s }, [x20]\n" "srshl v18.4s, 
v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v29.4s\n" - "add v20.4s, v20.4s, v29.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v16.4s, v16.4s, v29.4s\n" - "add v17.4s, v17.4s, v29.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v29.4s\n" - "add v24.4s, v24.4s, v29.4s\n" - "add v25.4s, v25.4s, v29.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "smin v31.4s, v31.4s, v28.4s\n" - "smin v20.4s, v20.4s, v28.4s\n" - "smin v21.4s, v21.4s, v28.4s\n" - "smin v22.4s, v22.4s, v28.4s\n" - "smin v16.4s, v16.4s, v28.4s\n" - "smin v17.4s, v17.4s, v28.4s\n" - "smin v18.4s, v18.4s, v28.4s\n" - "smin v19.4s, v19.4s, v28.4s\n" - "smin v24.4s, v24.4s, v28.4s\n" - "smin v25.4s, v25.4s, v28.4s\n" - "smin v26.4s, v26.4s, v28.4s\n" - "smin v27.4s, v27.4s, v28.4s\n" - "smax v31.4s, v31.4s, v23.4s\n" - "smax v20.4s, v20.4s, v23.4s\n" - "smax v21.4s, v21.4s, v23.4s\n" - "smax v22.4s, v22.4s, v23.4s\n" - "smax v16.4s, v16.4s, v23.4s\n" - "smax v17.4s, v17.4s, v23.4s\n" - "smax v18.4s, v18.4s, v23.4s\n" - "smax v19.4s, v19.4s, v23.4s\n" - "smax v24.4s, v24.4s, v23.4s\n" - "smax v25.4s, v25.4s, v23.4s\n" - "smax v26.4s, v26.4s, v23.4s\n" - "smax v27.4s, v27.4s, v23.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, 
v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 95f\n" - "tbz x9, #3, 90f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x10, #3, 87f\n" "str d31, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x9, #2, 88f\n" + "str d16, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x10, #2, 85f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x9, #1, 87f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x10, #1, 84f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 94f\n" - "87:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "b 91f\n" + "84:" // Height 3: 
Partial direct writeback: partial_1_12 + "tbz x10, #0, 91f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 94f\n" - "88:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x9, #1, 89f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 86f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 94f\n" - "89:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "b 91f\n" + "86:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 91f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 94f\n" - "90:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x9, #2, 92f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 89f\n" "str s31, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x9, #1, 91f\n" + "str s16, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x10, #1, 88f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 94f\n" - "91:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "b 91f\n" + "88:" // Height 3: Partial direct 
writeback: partial_1_4 + "tbz x10, #0, 91f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 94f\n" - "92:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x9, #1, 93f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 90f\n" "str h31, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "str h16, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 94f\n" - "93:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "94:" // Height 3: Partial direct writeback: Done - "b 96f\n" - "95:" // Height 3: Full writeback + "str b16, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "96:" // Height 3: Writeback done - "subs x9, x9, #0x10\n" - "bgt 66b\n" - "b 130f\n" - "97:" // Height 4 + "str q16, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "93:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 64b\n" + "b 126f\n" + "94:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x28, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "movi v15.16b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "98:" // Height 4: Column loop + "95:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1446,54 +1361,53 @@ void a64_hybrid_u8qa_mmla_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "99:" // Height 4: setup done "mov x26, #0x0\n" - "100:" // Height 4: String loop + "97:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 101f\n" + "tbz %x[flags], #3, 98f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 102f\n" + "cbnz x26, 99f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 102f\n" - "101:" // Height 4: setup direct input + "b 99f\n" + "98:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "102:" // Height 4: input setup done + "99:" // Height 4: input setup done "cmp x25, #0x10\n" - "blt 107f\n" + "blt 104f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "blt 105f\n" - "103:" // Height 4: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, 
[x9, #0x50]\n" + "blt 102f\n" + "100:" // Height 4: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" @@ -1501,29 +1415,29 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" + "ldr q5, [x9, #0xe0]\n" ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" @@ 
-1538,37 +1452,37 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" - "tbnz %x[flags], #31, 104f\n" + "tbnz %x[flags], #31, 101f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "104:" // Height 4: Multiply loop: unique 13: skip row sum + "101:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "bge 103b\n" - "105:" // Height 4: Multiply loop: Single iteration only + "bge 100b\n" + "102:" // Height 4: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" @@ -1577,29 +1491,29 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q6, 
[x9, #0x80]\n" ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" + "ldr q5, [x9, #0xe0]\n" ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" @@ -1614,299 +1528,249 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" - "tbnz %x[flags], #31, 106f\n" + "tbnz %x[flags], #31, 103f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "106:" // Height 4: Multiply loop: unique 14: skip row sum + "103:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "107:" // 
Height 4: Multiply loop: Main loop skip - "cbz x25, 116f\n" + "104:" // Height 4: Multiply loop: Main loop skip + "cbz x25, 113f\n" "cmp x25, #0x8\n" - "blt 110f\n" - "108:" // Height 4: Multiply loop: Odd block loop - "ldr d3, [x24], #0x8\n" - "ldr d0, [x23], #0x8\n" - "ldr d2, [x22], #0x8\n" - "ldr d1, [x21], #0x8\n" - "trn1 v0.2d, v3.2d, v0.2d\n" - "trn1 v2.2d, v2.2d, v1.2d\n" - "tbnz %x[flags], #31, 109f\n" + "blt 107f\n" + "105:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "ldr d7, [x21], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 106f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "109:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" + "106:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x6e89a414 // ummla v20.4s, v0.16b, 
v9.16b\n" + ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n" + ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n" ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" - ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" - ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" - ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" - "bge 108b\n" - "110:" // Height 4: Multiply loop: Skip odd blocks - "cbz x25, 116f\n" - "tbz x25, #2, 112f\n" + ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n" + "bge 105b\n" + "107:" // Height 4: Multiply loop: Skip odd blocks + "cbz x25, 113f\n" + "tbz x25, #2, 109f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" "ldr s3, [x22], #0x4\n" "ldr s9, [x21], #0x4\n" - "tbz x25, #1, 111f\n" + "tbz x25, #1, 108f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" "ld1 { v3.h }[2], [x22], #0x2\n" "ld1 { v9.h }[2], [x21], #0x2\n" - "tbz x25, #0, 114f\n" + "tbz x25, #0, 111f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" "ld1 { v3.b }[6], [x22]\n" "ld1 { v9.b }[6], [x21]\n" - "b 114f\n" - "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 114f\n" + "b 111f\n" + "108:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 111f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], 
[x23]\n" "ld1 { v3.b }[4], [x22]\n" "ld1 { v9.b }[4], [x21]\n" - "b 114f\n" - "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 113f\n" + "b 111f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 110f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" "ldr h3, [x22], #0x2\n" "ldr h9, [x21], #0x2\n" - "tbz x25, #0, 114f\n" + "tbz x25, #0, 111f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" "ld1 { v3.b }[2], [x22]\n" "ld1 { v9.b }[2], [x21]\n" - "b 114f\n" - "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 111f\n" + "110:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" "ldr b3, [x22, #0x0]\n" "ldr b9, [x21, #0x0]\n" - "114:" // Height 4: Multiply loop: Ragged operand read: Done + "111:" // Height 4: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" "trn1 v2.2d, v3.2d, v9.2d\n" - "tbnz %x[flags], #31, 115f\n" + "tbnz %x[flags], #31, 112f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "115:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x6e88a414 // ummla v20.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45c // ummla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + "112:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + 
"ldr q9, [x9, #0x60]\n" + ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" - ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" - ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" - ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" - ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" - ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" - ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" - "116:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n" + ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" + ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n" + ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" + ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n" + ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n" + "113:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 100b\n" + "bne 97b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v0.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "add x22, 
x23, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" "uzp1 v23.2d, v24.2d, v28.2d\n" "uzp2 v24.2d, v24.2d, v28.2d\n" - "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x24, #0x0]\n" "uzp1 v28.2d, v25.2d, v29.2d\n" "uzp2 v25.2d, v25.2d, v29.2d\n" "uzp1 v29.2d, v26.2d, v30.2d\n" "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v0.16b\n" - "tbnz %x[flags], #31, 117f\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 114f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "ld1r { v0.4s }, [x20]\n" - "neg v0.4s, v0.4s\n" + "ld1r { v4.4s }, [x20]\n" + "neg v4.4s, v4.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v14.4s, v13.s[3]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "117:" // Height 4: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q4, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "114:" // Height 4: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q3, [x10, #0x20]\n" - "ldr q2, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, 
v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v23.4s, v23.4s, v13.4s\n" "add v28.4s, v28.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v29.4s, v29.4s, v13.4s\n" "add v30.4s, v30.4s, v13.4s\n" "add v24.4s, v24.4s, v14.4s\n" "add v25.4s, v25.4s, v14.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" "add v26.4s, v26.4s, v14.4s\n" "add v27.4s, v27.4s, v14.4s\n" + "add x28, x28, #0x40\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v2.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v2.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v23.4s, v23.4s, v0.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v2.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v2.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "tbz 
%x[flags], #5, 118f\n" - "and v2.16b, v31.16b, v0.16b\n" - "and v1.16b, v20.16b, v0.16b\n" - "and v7.16b, v21.16b, v0.16b\n" - "and v6.16b, v22.16b, v0.16b\n" - "and v5.16b, v16.16b, v0.16b\n" - "and v4.16b, v17.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v3.16b, v18.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v2.4s\n" - "sqadd v20.4s, v20.4s, v1.4s\n" - "and v2.16b, v19.16b, v0.16b\n" - "and v1.16b, v23.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v7.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v5.4s\n" - "sqadd v17.4s, v17.4s, v4.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "and v7.16b, v28.16b, v0.16b\n" - "sqadd v19.4s, v19.4s, v2.4s\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "and v6.16b, v29.16b, v0.16b\n" - "and v5.16b, v30.16b, v0.16b\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v3.16b, v25.16b, v0.16b\n" - "and v2.16b, v26.16b, v0.16b\n" - "and v1.16b, v27.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v7.4s\n" - "sqadd v29.4s, v29.4s, v6.4s\n" - "sqadd v30.4s, v30.4s, v5.4s\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v3.4s\n" - "sqadd v26.4s, v26.4s, v2.4s\n" - "sqadd v27.4s, v27.4s, v1.4s\n" - "118:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh 
v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v3.4s }, [x21]\n" - "ld1r { v2.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v1.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" @@ -1917,178 +1781,178 @@ void a64_hybrid_u8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "smin v31.4s, v31.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - "smin v29.4s, v29.4s, v2.4s\n" - "smin v30.4s, v30.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" 
- "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smax v31.4s, v31.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - "smax v29.4s, v29.4s, v1.4s\n" - "smax v30.4s, v30.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + 
"smax v30.4s, v30.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v18.8h, v29.8h, v30.8h\n" + "uzp1 v28.8h, v29.8h, v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v23.16b, v23.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 127f\n" - "tbz x9, #3, 122f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 123f\n" + "tbz x10, #3, 118f\n" "str d31, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "str d23, [x23], #0x8\n" - "str d24, [x22], #0x8\n" - "tbz x9, #2, 120f\n" + "str d16, [x26], #0x8\n" + "str d23, [x25], #0x8\n" + "str d24, [x24], #0x8\n" + "tbz x10, #2, 116f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v23.s }[2], [x23], #0x4\n" - "st1 { v24.s }[2], [x22], #0x4\n" - "tbz x9, #1, 119f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x24], #0x4\n" + "tbz x10, #1, 115f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v23.h }[6], [x23], #0x2\n" - "st1 { v24.h }[6], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v23.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v23.b }[14], [x23]\n" - "st1 { v24.b }[14], [x22]\n" - "b 126f\n" - "119:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v23.b }[14], [x25]\n" + "st1 { v24.b }[14], 
[x24]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 122f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v23.b }[12], [x23]\n" - "st1 { v24.b }[12], [x22]\n" - "b 126f\n" - "120:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x9, #1, 121f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v23.b }[12], [x25]\n" + "st1 { v24.b }[12], [x24]\n" + "b 122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 117f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v23.h }[4], [x23], #0x2\n" - "st1 { v24.h }[4], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v23.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v23.b }[10], [x23]\n" - "st1 { v24.b }[10], [x22]\n" - "b 126f\n" - "121:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v23.b }[10], [x25]\n" + "st1 { v24.b }[10], [x24]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 122f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v23.b }[8], [x23]\n" - "st1 { v24.b }[8], [x22]\n" - "b 126f\n" - "122:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x9, #2, 124f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v23.b }[8], [x25]\n" + "st1 { v24.b }[8], [x24]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 120f\n" "str s31, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "str s23, [x23], #0x4\n" - "str s24, [x22], #0x4\n" - "tbz x9, #1, 123f\n" + "str s16, [x26], #0x4\n" + "str s23, [x25], #0x4\n" + "str s24, [x24], #0x4\n" + "tbz x10, #1, 119f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v23.h }[2], [x23], #0x2\n" - "st1 { v24.h }[2], [x22], #0x2\n" - 
"tbz x9, #0, 126f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v23.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v23.b }[6], [x23]\n" - "st1 { v24.b }[6], [x22]\n" - "b 126f\n" - "123:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v23.b }[6], [x25]\n" + "st1 { v24.b }[6], [x24]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 122f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v23.b }[4], [x23]\n" - "st1 { v24.b }[4], [x22]\n" - "b 126f\n" - "124:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x9, #1, 125f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v23.b }[4], [x25]\n" + "st1 { v24.b }[4], [x24]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 121f\n" "str h31, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "str h23, [x23], #0x2\n" - "str h24, [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "str h16, [x26], #0x2\n" + "str h23, [x25], #0x2\n" + "str h24, [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v23.b }[2], [x23]\n" - "st1 { v24.b }[2], [x22]\n" - "b 126f\n" - "125:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "st1 { v23.b }[2], [x25]\n" + "st1 { v24.b }[2], [x24]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b23, [x23, #0x0]\n" - "str b24, [x22, #0x0]\n" - "126:" // Height 4: Partial direct writeback: Done - "b 128f\n" - "127:" // Height 4: Full writeback + "str b16, [x26, #0x0]\n" + "str b23, [x25, #0x0]\n" + "str b24, [x24, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, 
[x24, #0x0]\n" - "str q23, [x23, #0x0]\n" - "str q24, [x22, #0x0]\n" - "128:" // Height 4: Writeback done - "subs x9, x9, #0x10\n" - "bgt 98b\n" + "str q16, [x26, #0x0]\n" + "str q23, [x25, #0x0]\n" + "str q24, [x24, #0x0]\n" + "124:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 95b\n" "subs %x[M], %x[M], #0x4\n" - "beq 130f\n" + "beq 126f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 129f\n" + "tbz %x[flags], #3, 125f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "129:" // Update direct input + "125:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "130:" // Exit + "126:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", 
"x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp index e5ca848fb9..52ab4ad248 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_dot_4x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,29 +73,25 @@ void a64_hybrid_u8s8qa_dot_4x16 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 91f\n" + "bge 88f\n" "cmp %x[M], #0x2\n" - "bgt 61f\n" - "beq 31f\n" - "mov x10, %x[col_bias]\n" + "bgt 59f\n" + "beq 30f\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -116,87 +111,87 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "cmp x25, #0x10\n" "blt 11f\n" "ldr q0, [x24, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main 
loop head ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" "add x24, x24, #0x10\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x28, #0xd0]\n" - ".inst 0x4f20f2b3 // sudot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x28, #0xe0]\n" - ".inst 0x4f00fa90 // sudot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f00fb51 // sudot v17.4s, v26.16b, v0.4b[2]\n" - ".inst 0x4f00fb32 // sudot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f00fb13 // sudot v19.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f20faf0 // sudot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x4f20fad1 // sudot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x4f20fab2 // sudot v18.4s, v21.16b, v0.4b[3]\n" - ".inst 0x4f20fa93 // sudot v19.4s, v20.16b, v0.4b[3]\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n" 
"tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q0, [x24, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" "cmp x25, #0x20\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q21, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q20, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q26, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q25, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q24, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q23, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q22, [x28, #0xd0]\n" - ".inst 0x4f20f2b3 // sudot v19.4s, v21.16b, v0.4b[1]\n" - "ldr q21, [x28, #0xe0]\n" - ".inst 0x4f00fa90 // sudot v16.4s, v20.16b, v0.4b[2]\n" - "ldr q20, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f00fb51 // sudot v17.4s, v26.16b, v0.4b[2]\n" - ".inst 0x4f00fb32 // sudot v18.4s, v25.16b, v0.4b[2]\n" - ".inst 0x4f00fb13 // sudot v19.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f20faf0 // sudot v16.4s, v23.16b, v0.4b[3]\n" - ".inst 0x4f20fad1 // sudot v17.4s, v22.16b, v0.4b[3]\n" - ".inst 0x4f20fab2 // sudot v18.4s, v21.16b, 
v0.4b[3]\n" - ".inst 0x4f20fa93 // sudot v19.4s, v20.16b, v0.4b[3]\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum @@ -210,17 +205,17 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q23, [x28, #0x0]\n" - "ldr q22, [x28, #0x10]\n" + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q21, [x28, #0x20]\n" - "ldr q20, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f2f0 // sudot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x4f00f2d1 // sudot v17.4s, v22.16b, v0.4b[0]\n" - ".inst 0x4f00f2b2 // sudot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f00f293 // sudot v19.4s, v20.16b, v0.4b[0]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f0d0 // sudot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f00f0f1 // sudot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f00f112 // sudot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f00f133 // sudot v19.4s, v9.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x25, 18f\n" @@ -235,15 +230,15 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, 
v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q23, [x28, #0x0]\n" - "ldr q22, [x28, #0x10]\n" - "ldr q21, [x28, #0x20]\n" - "ldr q20, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f2f0 // sudot v16.4s, v23.16b, v0.4b[0]\n" - ".inst 0x4f00f2d1 // sudot v17.4s, v22.16b, v0.4b[0]\n" - ".inst 0x4f00f2b2 // sudot v18.4s, v21.16b, v0.4b[0]\n" - ".inst 0x4f00f293 // sudot v19.4s, v20.16b, v0.4b[0]\n" + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f150 // sudot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f00f091 // sudot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f00f0d3 // sudot v19.4s, v6.16b, v0.4b[0]\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -253,136 +248,122 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "tbnz %x[flags], #31, 19f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v20.4s }, [x20]\n" - "neg v20.4s, v20.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v20.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q24, [x10, #0x0]\n" - "ldr q23, [x10, #0x10]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q22, [x10, #0x20]\n" - "ldr q21, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v24.4s\n" - "add v17.4s, v17.4s, v23.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add x10, x10, #0x40\n" + "ld1r { v4.4s }, [x21]\n" + "add v16.4s, v16.4s, 
v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "sqrdmulh v16.4s, v16.4s, v20.4s\n" - "sqrdmulh v17.4s, v17.4s, v20.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "sqrdmulh v19.4s, v19.4s, v20.4s\n" - "tbz %x[flags], #5, 20f\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v22.16b, v17.16b, v0.16b\n" - "and v21.16b, v18.16b, v0.16b\n" - "and v20.16b, v19.16b, v0.16b\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v22.4s\n" - "sqadd v18.4s, v18.4s, v21.4s\n" - "sqadd v19.4s, v19.4s, v20.4s\n" - "20:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "add x28, x28, #0x40\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v22.4s }, [x21]\n" - "ld1r { v21.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v20.4s }, [x20]\n" - "add v16.4s, v16.4s, v22.4s\n" - "add v17.4s, v17.4s, v22.4s\n" - "add v18.4s, v18.4s, v22.4s\n" - "add v19.4s, v19.4s, v22.4s\n" - "smin v16.4s, v16.4s, v21.4s\n" - "smin v17.4s, v17.4s, v21.4s\n" - "smin v18.4s, v18.4s, v21.4s\n" - "smin v19.4s, v19.4s, v21.4s\n" - "smax v16.4s, v16.4s, v20.4s\n" - "smax v17.4s, v17.4s, v20.4s\n" - "smax v18.4s, v18.4s, v20.4s\n" - "smax v19.4s, v19.4s, v20.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add 
v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 29f\n" - "tbz x9, #3, 24f\n" + "bge 28f\n" + "tbz x10, #3, 23f\n" "str d16, [x27], #0x8\n" - "tbz x9, #2, 22f\n" + "tbz x10, #2, 21f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "tbz x9, #1, 21f\n" + "tbz x10, #1, 20f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[14], [x27]\n" - "b 28f\n" - "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x9, #0, 28f\n" + "b 27f\n" + "20:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 27f\n" "st1 { v16.b }[12], [x27]\n" - "b 28f\n" - "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x9, #1, 23f\n" + "b 27f\n" + "21:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 22f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[10], [x27]\n" - "b 28f\n" - "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x9, #0, 28f\n" + "b 27f\n" + "22:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 27f\n" "st1 { v16.b }[8], [x27]\n" - "b 28f\n" - "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x9, #2, 26f\n" + "b 27f\n" + "23:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 25f\n" "str s16, [x27], #0x4\n" - "tbz x9, #1, 25f\n" + "tbz x10, #1, 24f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[6], [x27]\n" - "b 28f\n" - "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x9, #0, 28f\n" + "b 27f\n" + "24:" // Height 1: 
Partial direct writeback: partial_1_4 + "tbz x10, #0, 27f\n" "st1 { v16.b }[4], [x27]\n" - "b 28f\n" - "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x9, #1, 27f\n" + "b 27f\n" + "25:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 26f\n" "str h16, [x27], #0x2\n" - "tbz x9, #0, 28f\n" + "tbz x10, #0, 27f\n" "st1 { v16.b }[2], [x27]\n" - "b 28f\n" - "27:" // Height 1: Partial direct writeback: partial_1_0 + "b 27f\n" + "26:" // Height 1: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "28:" // Height 1: Partial direct writeback: Done - "b 30f\n" - "29:" // Height 1: Full writeback + "27:" // Height 1: Partial direct writeback: Done + "b 29f\n" + "28:" // Height 1: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "30:" // Height 1: Writeback done - "subs x9, x9, #0x10\n" + "29:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" "bgt 2b\n" - "b 122f\n" - "31:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 118f\n" + "30:" // Height 2 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "32:" // Height 2: Column loop + "31:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -391,416 +372,389 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "33:" // Height 2: setup done "mov x26, #0x0\n" - "34:" // Height 2: String loop + "33:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 35f\n" + "tbz %x[flags], #3, 34f\n" "ldr x20, [%x[input_ptr], x26, 
LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 36f\n" + "cbnz x26, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 36f\n" - "35:" // Height 2: setup direct input + "b 35f\n" + "34:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "36:" // Height 2: input setup done + "35:" // Height 2: input setup done "cmp x25, #0x10\n" - "blt 41f\n" + "blt 40f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 39f\n" - "37:" // Height 2: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 38f\n" + "36:" // Height 2: Multiply loop: Main loop head ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x24, x24, #0x10\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x23, x23, #0x10\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, 
v0.4b[1]\n" ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4f20f333 // sudot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4f21f337 // sudot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4f00fb10 // sudot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f01fb14 // sudot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f00fbd1 // sudot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x4f01fbd5 // sudot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x4f00fbb2 // sudot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f01fbb6 // sudot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f00fb93 // sudot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f01fb97 // sudot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f20fb70 // sudot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x4f21fb74 // sudot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x4f20fb51 // sudot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x4f21fb55 // sudot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x4f20fb32 // sudot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4f21fb36 // sudot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4f20fb13 // sudot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4f21fb17 // sudot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 38f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f01f8f6 // sudot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f00f913 
// sudot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f01f917 // sudot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4f21f934 // sudot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f21f955 // sudot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f21f896 // sudot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 37f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "38:" // Height 2: Multiply loop: unique 5: skip row sum + "37:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "bge 37b\n" - "39:" // Height 2: Multiply loop: Single iteration only + "bge 36b\n" + "38:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q25, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "sub x25, x25, #0x10\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q24, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x24, x24, #0x10\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q30, [x28, #0x90]\n" + "ldr 
q6, [x9, #0x90]\n" "add x23, x23, #0x10\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4f20f333 // sudot v19.4s, v25.16b, v0.4b[1]\n" - ".inst 0x4f21f337 // sudot v23.4s, v25.16b, v1.4b[1]\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4f00fb10 // sudot v16.4s, v24.16b, v0.4b[2]\n" - ".inst 0x4f01fb14 // sudot v20.4s, v24.16b, v1.4b[2]\n" - "ldr q24, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f00fbd1 // sudot v17.4s, v30.16b, v0.4b[2]\n" - ".inst 0x4f01fbd5 // sudot v21.4s, v30.16b, v1.4b[2]\n" - ".inst 0x4f00fbb2 // sudot v18.4s, v29.16b, v0.4b[2]\n" - ".inst 0x4f01fbb6 // sudot v22.4s, v29.16b, v1.4b[2]\n" - ".inst 0x4f00fb93 // sudot v19.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f01fb97 // sudot v23.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f20fb70 // sudot v16.4s, v27.16b, v0.4b[3]\n" - ".inst 0x4f21fb74 // sudot v20.4s, v27.16b, v1.4b[3]\n" - ".inst 0x4f20fb51 // sudot v17.4s, v26.16b, v0.4b[3]\n" - ".inst 0x4f21fb55 // sudot v21.4s, v26.16b, v1.4b[3]\n" - ".inst 0x4f20fb32 // sudot v18.4s, v25.16b, v0.4b[3]\n" - ".inst 0x4f21fb36 // sudot v22.4s, v25.16b, v1.4b[3]\n" - ".inst 0x4f20fb13 // sudot v19.4s, v24.16b, v0.4b[3]\n" - ".inst 0x4f21fb17 // sudot v23.4s, v24.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" 
+ ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f01f8f6 // sudot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f01f917 // sudot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4f21f934 // sudot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f21f955 // sudot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f21f896 // sudot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 6: skip row sum + "39:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "41:" // Height 2: Multiply loop: Main loop skip - "cbz x25, 48f\n" + "40:" // Height 2: Multiply loop: Main loop skip + "cbz x25, 47f\n" "cmp x25, #0x4\n" - "blt 44f\n" - "42:" // Height 2: Multiply loop: Odd block loop + "blt 43f\n" + "41:" // Height 2: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" - "tbnz %x[flags], #31, 43f\n" + "tbnz %x[flags], #31, 42f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q27, [x28, #0x0]\n" - "ldr q26, [x28, #0x10]\n" + "42:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, 
[x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q25, [x28, #0x20]\n" - "ldr q24, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f370 // sudot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x4f01f374 // sudot v20.4s, v27.16b, v1.4b[0]\n" - ".inst 0x4f00f351 // sudot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x4f01f355 // sudot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x4f00f332 // sudot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f01f336 // sudot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f00f313 // sudot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f01f317 // sudot v23.4s, v24.16b, v1.4b[0]\n" - "bge 42b\n" - "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x25, 48f\n" - "tbz x25, #1, 45f\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f0d0 // sudot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f01f0d4 // sudot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f00f0f1 // sudot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f01f0f5 // sudot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f00f112 // sudot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f01f116 // sudot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f00f133 // sudot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f01f137 // sudot v23.4s, v9.16b, v1.4b[0]\n" + "bge 41b\n" + "43:" // Height 2: Multiply loop: Skip odd blocks + "cbz x25, 47f\n" + "tbz x25, #1, 44f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" - "tbz x25, #0, 46f\n" + "tbz x25, #0, 45f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" - "b 46f\n" - "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 45f\n" + "44:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" - "46:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 47f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 46f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" 
- "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q27, [x28, #0x0]\n" - "ldr q26, [x28, #0x10]\n" - "ldr q25, [x28, #0x20]\n" - "ldr q24, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f370 // sudot v16.4s, v27.16b, v0.4b[0]\n" - ".inst 0x4f01f374 // sudot v20.4s, v27.16b, v1.4b[0]\n" - ".inst 0x4f00f351 // sudot v17.4s, v26.16b, v0.4b[0]\n" - ".inst 0x4f01f355 // sudot v21.4s, v26.16b, v1.4b[0]\n" - ".inst 0x4f00f332 // sudot v18.4s, v25.16b, v0.4b[0]\n" - ".inst 0x4f01f336 // sudot v22.4s, v25.16b, v1.4b[0]\n" - ".inst 0x4f00f313 // sudot v19.4s, v24.16b, v0.4b[0]\n" - ".inst 0x4f01f317 // sudot v23.4s, v24.16b, v1.4b[0]\n" - "48:" // Height 2: Multiply loop: No odd multiplies + "46:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f150 // sudot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f01f154 // sudot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f00f091 // sudot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f01f095 // sudot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f01f0b6 // sudot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f00f0d3 // sudot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f01f0d7 // sudot v23.4s, v6.16b, v1.4b[0]\n" + "47:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 34b\n" + "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" - "tbnz %x[flags], #31, 49f\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbnz %x[flags], #31, 48f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, 
v2.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "49:" // Height 2: skip row sum fixup - "ldr q28, [x10, #0x0]\n" - "ldr q27, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "48:" // Height 2: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q26, [x10, #0x20]\n" - "ldr q25, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v24.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add v16.4s, v16.4s, v28.4s\n" - "add v17.4s, v17.4s, v27.4s\n" - "add x10, x10, #0x40\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v25.4s\n" - "add v20.4s, v20.4s, v28.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v21.4s, v21.4s, v27.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v25.4s\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "tbz %x[flags], #5, 50f\n" - "and v24.16b, v16.16b, v0.16b\n" - "and v30.16b, v17.16b, v0.16b\n" - "and v29.16b, v18.16b, v0.16b\n" - "and v28.16b, v19.16b, v0.16b\n" - "and v27.16b, v20.16b, v0.16b\n" - "and v26.16b, v21.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v25.16b, v22.16b, v0.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, 
v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v24.4s\n" - "and v24.16b, v23.16b, v0.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v29.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v20.4s, v20.4s, v27.4s\n" - "sqadd v21.4s, v21.4s, v26.4s\n" - "sqadd v22.4s, v22.4s, v25.4s\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "50:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "add x22, %x[qp], %[c_offset]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v21.4s, v21.4s, v1.4s\n" + "cmp x10, #0x10\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add x28, x28, #0x40\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, 
v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v17.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v20.16b, v20.16b, v17.16b\n" - "bge 59f\n" - "tbz x9, #3, 54f\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 57f\n" + "tbz x10, #3, 52f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "tbz x9, #2, 52f\n" + "str d20, [x26], #0x8\n" + "tbz x10, #2, 50f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { 
v20.s }[2], [x24], #0x4\n" - "tbz x9, #1, 51f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "tbz x10, #1, 49f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "b 58f\n" - "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[14], [x26]\n" + "b 56f\n" + "49:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 56f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "b 58f\n" - "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x9, #1, 53f\n" + "st1 { v20.b }[12], [x26]\n" + "b 56f\n" + "50:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 51f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "b 58f\n" - "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[10], [x26]\n" + "b 56f\n" + "51:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 56f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "b 58f\n" - "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x9, #2, 56f\n" + "st1 { v20.b }[8], [x26]\n" + "b 56f\n" + "52:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 54f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "tbz x9, #1, 55f\n" + "str s20, [x26], #0x4\n" + "tbz x10, #1, 53f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "b 58f\n" - "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x9, #0, 58f\n" + "st1 { v20.b }[6], [x26]\n" + "b 56f\n" + "53:" // 
Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 56f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "b 58f\n" - "56:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x9, #1, 57f\n" + "st1 { v20.b }[4], [x26]\n" + "b 56f\n" + "54:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 55f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "tbz x9, #0, 58f\n" + "str h20, [x26], #0x2\n" + "tbz x10, #0, 56f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "b 58f\n" - "57:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "b 56f\n" + "55:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "58:" // Height 2: Partial direct writeback: Done - "b 60f\n" - "59:" // Height 2: Full writeback + "str b20, [x26, #0x0]\n" + "56:" // Height 2: Partial direct writeback: Done + "b 58f\n" + "57:" // Height 2: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "60:" // Height 2: Writeback done - "subs x9, x9, #0x10\n" - "bgt 32b\n" - "b 122f\n" - "61:" // Height 3 - "mov x10, %x[col_bias]\n" + "str q20, [x26, #0x0]\n" + "58:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 31b\n" + "b 118f\n" + "59:" // Height 3 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "62:" // Height 3: Column loop + "60:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -813,542 +767,503 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "63:" // Height 
3: setup done "mov x26, #0x0\n" - "64:" // Height 3: String loop + "62:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 65f\n" + "tbz %x[flags], #3, 63f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 66f\n" + "cbnz x26, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 66f\n" - "65:" // Height 3: setup direct input + "b 64f\n" + "63:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "66:" // Height 3: input setup done + "64:" // Height 3: input setup done "cmp x25, #0x10\n" - "blt 71f\n" + "blt 69f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q2, [x22, #0x0]\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 69f\n" - "67:" // Height 3: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 67f\n" + "65:" // Height 3: Multiply loop: Main loop head ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x28, #0x80]\n" + "ldr q5, 
[x9, #0x80]\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x28, #0xd0]\n" - ".inst 0x4f20f3b3 // sudot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4f21f3b7 // sudot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4f22f3bb // sudot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x28, #0xe0]\n" - ".inst 0x4f00fb90 // sudot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f01fb94 // sudot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f02fb98 // sudot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x28, #0xf0]\n" - ".inst 0x4f00f8b1 // sudot v17.4s, v5.16b, v0.4b[2]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f01f8b5 // sudot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f02f8b9 // sudot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4f00f892 // sudot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4f01f896 // sudot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4f02f89a // sudot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4f00f873 // sudot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4f01f877 // sudot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4f02f87b // sudot 
v27.4s, v3.16b, v2.4b[2]\n" - ".inst 0x4f20fbf0 // sudot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x4f21fbf4 // sudot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x4f22fbf8 // sudot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x4f20fbd1 // sudot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x4f21fbd5 // sudot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x4f22fbd9 // sudot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x4f20fbb2 // sudot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4f21fbb6 // sudot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4f22fbba // sudot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4f20fb93 // sudot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4f21fb97 // sudot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4f22fb9b // sudot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 68f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f22f09b // sudot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f02f8b8 // sudot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f02f8d9 // sudot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f01f8f6 // sudot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f02f8fa // sudot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f01f917 // sudot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f02f91b // sudot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4f21f934 // sudot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4f22f938 // sudot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f21f955 // sudot v21.4s, v10.16b, 
v1.4b[3]\n" + ".inst 0x4f22f959 // sudot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f21f896 // sudot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f22f89a // sudot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f22f8bb // sudot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 66f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "68:" // Height 3: Multiply loop: unique 9: skip row sum + "66:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q2, [x22, #0x0]\n" - "ldr q4, [x28, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 67b\n" - "69:" // Height 3: Multiply loop: Single iteration only + "bge 65b\n" + "67:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q29, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q28, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" "add x22, x22, #0x10\n" ".inst 
0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q4, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q3, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q31, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q30, [x28, #0xd0]\n" - ".inst 0x4f20f3b3 // sudot v19.4s, v29.16b, v0.4b[1]\n" - ".inst 0x4f21f3b7 // sudot v23.4s, v29.16b, v1.4b[1]\n" - ".inst 0x4f22f3bb // sudot v27.4s, v29.16b, v2.4b[1]\n" - "ldr q29, [x28, #0xe0]\n" - ".inst 0x4f00fb90 // sudot v16.4s, v28.16b, v0.4b[2]\n" - ".inst 0x4f01fb94 // sudot v20.4s, v28.16b, v1.4b[2]\n" - ".inst 0x4f02fb98 // sudot v24.4s, v28.16b, v2.4b[2]\n" - "ldr q28, [x28, #0xf0]\n" - ".inst 0x4f00f8b1 // sudot v17.4s, v5.16b, v0.4b[2]\n" - "add x28, x28, #0x100\n" - ".inst 0x4f01f8b5 // sudot v21.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f02f8b9 // sudot v25.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4f00f892 // sudot v18.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4f01f896 // sudot v22.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4f02f89a // sudot v26.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4f00f873 // sudot v19.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4f01f877 // sudot v23.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4f02f87b // sudot v27.4s, v3.16b, 
v2.4b[2]\n" - ".inst 0x4f20fbf0 // sudot v16.4s, v31.16b, v0.4b[3]\n" - ".inst 0x4f21fbf4 // sudot v20.4s, v31.16b, v1.4b[3]\n" - ".inst 0x4f22fbf8 // sudot v24.4s, v31.16b, v2.4b[3]\n" - ".inst 0x4f20fbd1 // sudot v17.4s, v30.16b, v0.4b[3]\n" - ".inst 0x4f21fbd5 // sudot v21.4s, v30.16b, v1.4b[3]\n" - ".inst 0x4f22fbd9 // sudot v25.4s, v30.16b, v2.4b[3]\n" - ".inst 0x4f20fbb2 // sudot v18.4s, v29.16b, v0.4b[3]\n" - ".inst 0x4f21fbb6 // sudot v22.4s, v29.16b, v1.4b[3]\n" - ".inst 0x4f22fbba // sudot v26.4s, v29.16b, v2.4b[3]\n" - ".inst 0x4f20fb93 // sudot v19.4s, v28.16b, v0.4b[3]\n" - ".inst 0x4f21fb97 // sudot v23.4s, v28.16b, v1.4b[3]\n" - ".inst 0x4f22fb9b // sudot v27.4s, v28.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 70f\n" + "ldr q10, [x9, #0xd0]\n" + ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f22f09b // sudot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x9, #0xe0]\n" + ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f02f8b8 // sudot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x9, #0xf0]\n" + ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" + "add x9, x9, #0x100\n" + ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f02f8d9 // sudot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f00f8f2 // sudot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f01f8f6 // sudot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f02f8fa // sudot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f00f913 // sudot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f01f917 // sudot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f02f91b // sudot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f20f930 // sudot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4f21f934 // sudot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4f22f938 // sudot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4f20f951 // sudot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f21f955 // sudot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 
0x4f22f959 // sudot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4f20f892 // sudot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4f21f896 // sudot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4f22f89a // sudot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4f20f8b3 // sudot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f22f8bb // sudot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "70:" // Height 3: Multiply loop: unique 10: skip row sum + "68:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "71:" // Height 3: Multiply loop: Main loop skip - "cbz x25, 78f\n" + "69:" // Height 3: Multiply loop: Main loop skip + "cbz x25, 76f\n" "cmp x25, #0x4\n" - "blt 74f\n" - "72:" // Height 3: Multiply loop: Odd block loop + "blt 72f\n" + "70:" // Height 3: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" "ldr s2, [x22], #0x4\n" - "tbnz %x[flags], #31, 73f\n" + "tbnz %x[flags], #31, 71f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q31, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "71:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f3f0 // sudot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x4f01f3f4 // sudot v20.4s, v31.16b, v1.4b[0]\n" - ".inst 0x4f02f3f8 // sudot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x4f00f3d1 // sudot v17.4s, 
v30.16b, v0.4b[0]\n" - ".inst 0x4f01f3d5 // sudot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x4f02f3d9 // sudot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x4f00f3b2 // sudot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f01f3b6 // sudot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f02f3ba // sudot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f00f393 // sudot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f01f397 // sudot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f02f39b // sudot v27.4s, v28.16b, v2.4b[0]\n" - "bge 72b\n" - "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x25, 78f\n" - "tbz x25, #1, 75f\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f0d0 // sudot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f01f0d4 // sudot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f02f0d8 // sudot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f00f0f1 // sudot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f01f0f5 // sudot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f02f0f9 // sudot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f00f112 // sudot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f01f116 // sudot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f02f11a // sudot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f00f133 // sudot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f01f137 // sudot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f02f13b // sudot v27.4s, v9.16b, v2.4b[0]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Skip odd blocks + "cbz x25, 76f\n" + "tbz x25, #1, 73f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" "ldr h2, [x22], #0x2\n" - "tbz x25, #0, 76f\n" + "tbz x25, #0, 74f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" "ld1 { v2.b }[2], [x22]\n" - "b 76f\n" - "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 74f\n" + "73:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" "ldr b2, [x22, #0x0]\n" - "76:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 77f\n" + "74:" // Height 3: Multiply loop: Ragged operand read: Done + 
"tbnz %x[flags], #31, 75f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q31, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f3f0 // sudot v16.4s, v31.16b, v0.4b[0]\n" - ".inst 0x4f01f3f4 // sudot v20.4s, v31.16b, v1.4b[0]\n" - ".inst 0x4f02f3f8 // sudot v24.4s, v31.16b, v2.4b[0]\n" - ".inst 0x4f00f3d1 // sudot v17.4s, v30.16b, v0.4b[0]\n" - ".inst 0x4f01f3d5 // sudot v21.4s, v30.16b, v1.4b[0]\n" - ".inst 0x4f02f3d9 // sudot v25.4s, v30.16b, v2.4b[0]\n" - ".inst 0x4f00f3b2 // sudot v18.4s, v29.16b, v0.4b[0]\n" - ".inst 0x4f01f3b6 // sudot v22.4s, v29.16b, v1.4b[0]\n" - ".inst 0x4f02f3ba // sudot v26.4s, v29.16b, v2.4b[0]\n" - ".inst 0x4f00f393 // sudot v19.4s, v28.16b, v0.4b[0]\n" - ".inst 0x4f01f397 // sudot v23.4s, v28.16b, v1.4b[0]\n" - ".inst 0x4f02f39b // sudot v27.4s, v28.16b, v2.4b[0]\n" - "78:" // Height 3: Multiply loop: No odd multiplies + "75:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f150 // sudot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f01f154 // sudot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f02f158 // sudot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f00f091 // sudot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f01f095 // sudot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f02f099 // sudot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f01f0b6 // sudot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f02f0ba // sudot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f00f0d3 // sudot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f01f0d7 // sudot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f02f0db // sudot v27.4s, v6.16b, 
v2.4b[0]\n" + "76:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 64b\n" + "bne 62b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 79f\n" + "add x26, x27, x20\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 77f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v28.4s }, [x20]\n" + "ld1r { v3.4s }, [x20]\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v28.4s, v28.4s\n" + "neg v3.4s, v3.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v28.4s\n" - "mul v12.4s, v12.4s, v28.4s\n" - "mul v13.4s, v13.4s, v28.4s\n" - "79:" // Height 3: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q31, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "77:" // Height 3: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q30, [x10, #0x20]\n" - "ldr q29, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v28.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + 
"add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v31.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v29.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x28, x28, #0x40\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v31.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v29.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v31.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "sqrdmulh v16.4s, v16.4s, v28.4s\n" - "sqrdmulh v17.4s, v17.4s, v28.4s\n" - "sqrdmulh v18.4s, v18.4s, v28.4s\n" - "sqrdmulh v19.4s, v19.4s, v28.4s\n" - "sqrdmulh v20.4s, v20.4s, v28.4s\n" - "sqrdmulh v21.4s, v21.4s, v28.4s\n" - "sqrdmulh v22.4s, v22.4s, v28.4s\n" - "sqrdmulh v23.4s, v23.4s, v28.4s\n" - "sqrdmulh v24.4s, v24.4s, v28.4s\n" - "sqrdmulh v25.4s, v25.4s, v28.4s\n" - "sqrdmulh v26.4s, v26.4s, v28.4s\n" - "sqrdmulh v27.4s, v27.4s, v28.4s\n" - "tbz %x[flags], #5, 80f\n" - "and v1.16b, v16.16b, v0.16b\n" - "and v31.16b, v17.16b, v0.16b\n" - "and v30.16b, v18.16b, v0.16b\n" - "and v29.16b, v19.16b, v0.16b\n" - "and v28.16b, v20.16b, v0.16b\n" - "and v3.16b, v21.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v2.16b, v22.16b, v0.16b\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "sqadd v18.4s, v18.4s, v30.4s\n" - "sqadd v19.4s, v19.4s, v29.4s\n" - "sqadd v20.4s, v20.4s, v28.4s\n" - "and v1.16b, v23.16b, v0.16b\n" - "and v31.16b, 
v24.16b, v0.16b\n" - "and v30.16b, v25.16b, v0.16b\n" - "and v29.16b, v26.16b, v0.16b\n" - "and v28.16b, v27.16b, v0.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v3.4s\n" - "sqadd v22.4s, v22.4s, v2.4s\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v31.4s\n" - "sqadd v25.4s, v25.4s, v30.4s\n" - "sqadd v26.4s, v26.4s, v29.4s\n" - "sqadd v27.4s, v27.4s, v28.4s\n" - "80:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v30.4s }, [x21]\n" - "ld1r { v29.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v28.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v16.4s, v16.4s, v30.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v30.4s\n" - "add v19.4s, v19.4s, v30.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, 
v30.4s\n" - "add v22.4s, v22.4s, v30.4s\n" - "add v23.4s, v23.4s, v30.4s\n" - "add v24.4s, v24.4s, v30.4s\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v30.4s\n" - "add v27.4s, v27.4s, v30.4s\n" - "smin v16.4s, v16.4s, v29.4s\n" - "smin v17.4s, v17.4s, v29.4s\n" - "smin v18.4s, v18.4s, v29.4s\n" - "smin v19.4s, v19.4s, v29.4s\n" - "smin v20.4s, v20.4s, v29.4s\n" - "smin v21.4s, v21.4s, v29.4s\n" - "smin v22.4s, v22.4s, v29.4s\n" - "smin v23.4s, v23.4s, v29.4s\n" - "smin v24.4s, v24.4s, v29.4s\n" - "smin v25.4s, v25.4s, v29.4s\n" - "smin v26.4s, v26.4s, v29.4s\n" - "smin v27.4s, v27.4s, v29.4s\n" - "smax v16.4s, v16.4s, v28.4s\n" - "smax v17.4s, v17.4s, v28.4s\n" - "smax v18.4s, v18.4s, v28.4s\n" - "smax v19.4s, v19.4s, v28.4s\n" - "smax v20.4s, v20.4s, v28.4s\n" - "smax v21.4s, v21.4s, v28.4s\n" - "smax v22.4s, v22.4s, v28.4s\n" - "smax v23.4s, v23.4s, v28.4s\n" - "smax v24.4s, v24.4s, v28.4s\n" - "smax v25.4s, v25.4s, v28.4s\n" - "smax v26.4s, v26.4s, v28.4s\n" - "smax v27.4s, v27.4s, v28.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, 
v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v18.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v20.16b, v20.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 89f\n" - "tbz x9, #3, 84f\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 86f\n" + "tbz x10, #3, 81f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x9, #2, 82f\n" + "str d20, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x10, #2, 79f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x9, #1, 81f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x10, #1, 78f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 88f\n" - "81:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "b 85f\n" + "78:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 85f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 88f\n" - "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x9, #1, 83f\n" + "st1 { v20.b }[12], [x26]\n" + "st1 { v24.b 
}[12], [x25]\n" + "b 85f\n" + "79:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 80f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 88f\n" - "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "b 85f\n" + "80:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 85f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 88f\n" - "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x9, #2, 86f\n" + "st1 { v20.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "b 85f\n" + "81:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 83f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x9, #1, 85f\n" + "str s20, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x10, #1, 82f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 88f\n" - "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x9, #0, 88f\n" + "st1 { v20.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "b 85f\n" + "82:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 85f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 88f\n" - "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x9, #1, 87f\n" + "st1 { v20.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "b 85f\n" + 
"83:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 84f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x9, #0, 88f\n" + "str h20, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "tbz x10, #0, 85f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 88f\n" - "87:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "b 85f\n" + "84:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "88:" // Height 3: Partial direct writeback: Done - "b 90f\n" - "89:" // Height 3: Full writeback + "str b20, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "85:" // Height 3: Partial direct writeback: Done + "b 87f\n" + "86:" // Height 3: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "90:" // Height 3: Writeback done - "subs x9, x9, #0x10\n" - "bgt 62b\n" - "b 122f\n" - "91:" // Height 4 + "str q20, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "87:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 60b\n" + "b 118f\n" + "88:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "movi v15.16b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "92:" // Height 4: Column loop + "89:" // Height 4: Column loop 
"movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1365,98 +1280,97 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "93:" // Height 4: setup done "mov x26, #0x0\n" - "94:" // Height 4: String loop + "91:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 95f\n" + "tbz %x[flags], #3, 92f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 96f\n" + "cbnz x26, 93f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 96f\n" - "95:" // Height 4: setup direct input + "b 93f\n" + "92:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "96:" // Height 4: input setup done + "93:" // Height 4: input setup done "cmp x25, #0x10\n" - "blt 101f\n" + "blt 98f\n" "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q2, [x22, #0x0]\n" "ldr q3, [x21, #0x0]\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" - "blt 99f\n" - "97:" // Height 4: Multiply loop: Main loop head + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" + "blt 96f\n" + "94:" // Height 4: Multiply loop: Main loop head ".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f02f098 // sudot 
v24.4s, v4.16b, v2.4b[0]\n" ".inst 0x4f03f09c // sudot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x22, x22, #0x10\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n" "add x21, x21, #0x10\n" ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f03f0bd // sudot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f03f0de // sudot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f03f0ff // sudot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4f23f11c // sudot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4f23f13d // sudot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4f23f15e // sudot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" + "ldr q10, [x9, #0xd0]\n" ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4f22f09b // sudot v27.4s, v4.16b, v2.4b[1]\n" ".inst 
0x4f23f09f // sudot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" + "ldr q4, [x9, #0xe0]\n" ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f02f8b8 // sudot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f03f8bc // sudot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f02f8d9 // sudot v25.4s, v6.16b, v2.4b[2]\n" @@ -1485,38 +1399,38 @@ void a64_hybrid_u8s8qa_dot_4x16 ( ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4f22f8bb // sudot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4f23f8bf // sudot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 98f\n" + "tbnz %x[flags], #31, 95f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "98:" // Height 4: Multiply loop: unique 13: skip row sum + "95:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q0, [x24, #0x0]\n" "ldr q1, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q2, [x22, #0x0]\n" "ldr q3, [x21, #0x0]\n" "cmp x25, #0x20\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q8, [x28, #0x40]\n" - "ldr q9, [x28, #0x50]\n" - "ldr q10, [x28, #0x60]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" + "ldr q8, [x9, #0x40]\n" + "ldr q9, [x9, #0x50]\n" + "ldr q10, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "bge 97b\n" - "99:" // Height 4: Multiply loop: Single iteration only + "bge 94b\n" + "96:" // Height 4: Multiply loop: Single iteration only 
".inst 0x4f00f090 // sudot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f01f094 // sudot v20.4s, v4.16b, v1.4b[0]\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f02f098 // sudot v24.4s, v4.16b, v2.4b[0]\n" ".inst 0x4f03f09c // sudot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q4, [x9, #0x70]\n" "add x23, x23, #0x10\n" ".inst 0x4f00f0b1 // sudot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f01f0b5 // sudot v21.4s, v5.16b, v1.4b[0]\n" @@ -1524,43 +1438,43 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "add x21, x21, #0x10\n" ".inst 0x4f02f0b9 // sudot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f03f0bd // sudot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q5, [x9, #0x80]\n" ".inst 0x4f00f0d2 // sudot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f01f0d6 // sudot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f02f0da // sudot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f03f0de // sudot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q6, [x9, #0x90]\n" ".inst 0x4f00f0f3 // sudot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f01f0f7 // sudot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f02f0fb // sudot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f03f0ff // sudot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q7, [x9, #0xa0]\n" ".inst 0x4f20f110 // sudot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4f21f114 // sudot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4f22f118 // sudot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4f23f11c // sudot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q8, [x9, #0xb0]\n" ".inst 0x4f20f131 // sudot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4f21f135 // sudot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4f22f139 // sudot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4f23f13d // sudot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q9, [x9, #0xc0]\n" ".inst 0x4f20f152 // sudot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4f21f156 // sudot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4f22f15a // sudot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4f23f15e // sudot v30.4s, 
v10.16b, v3.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" + "ldr q10, [x9, #0xd0]\n" ".inst 0x4f20f093 // sudot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4f21f097 // sudot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4f22f09b // sudot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4f23f09f // sudot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" + "ldr q4, [x9, #0xe0]\n" ".inst 0x4f00f8b0 // sudot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f01f8b4 // sudot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f02f8b8 // sudot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f03f8bc // sudot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4f00f8d1 // sudot v17.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f01f8d5 // sudot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f02f8d9 // sudot v25.4s, v6.16b, v2.4b[2]\n" @@ -1589,252 +1503,202 @@ void a64_hybrid_u8s8qa_dot_4x16 ( ".inst 0x4f21f8b7 // sudot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4f22f8bb // sudot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4f23f8bf // sudot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 100f\n" + "tbnz %x[flags], #31, 97f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "100:" // Height 4: Multiply loop: unique 14: skip row sum + "97:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "101:" // Height 4: Multiply loop: Main loop skip - "cbz x25, 108f\n" + "98:" // Height 4: Multiply loop: Main loop skip + "cbz x25, 105f\n" "cmp x25, #0x4\n" - "blt 104f\n" - "102:" // Height 4: Multiply loop: Odd block loop + "blt 101f\n" + "99:" // Height 4: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" "ldr s1, [x23], #0x4\n" "ldr s2, [x22], #0x4\n" "ldr s3, [x21], #0x4\n" - "tbnz 
%x[flags], #31, 103f\n" + "tbnz %x[flags], #31, 100f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q7, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" + "100:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x9, #0x0]\n" + "ldr q7, [x9, #0x10]\n" "sub x25, x25, #0x4\n" - "ldr q5, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q8, [x9, #0x20]\n" + "ldr q9, [x9, #0x30]\n" "cmp x25, #0x4\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f0f0 // sudot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f01f0f4 // sudot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f02f0f8 // sudot v24.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f03f0fc // sudot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f00f0d1 // sudot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f01f0d5 // sudot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f02f0d9 // sudot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f03f0dd // sudot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f01f0b6 // sudot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4f02f0ba // sudot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x4f03f0be // sudot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f00f093 // sudot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f01f097 // sudot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f02f09b // sudot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f03f09f // sudot v31.4s, v4.16b, v3.4b[0]\n" - "bge 102b\n" - "104:" // Height 4: Multiply loop: Skip odd blocks - "cbz x25, 108f\n" - "tbz x25, #1, 105f\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f0d0 // sudot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f01f0d4 // sudot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f02f0d8 // sudot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f03f0dc // sudot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f00f0f1 // sudot v17.4s, v7.16b, v0.4b[0]\n" 
+ ".inst 0x4f01f0f5 // sudot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f02f0f9 // sudot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f03f0fd // sudot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f00f112 // sudot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f01f116 // sudot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f02f11a // sudot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f03f11e // sudot v30.4s, v8.16b, v3.4b[0]\n" + ".inst 0x4f00f133 // sudot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f01f137 // sudot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f02f13b // sudot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f03f13f // sudot v31.4s, v9.16b, v3.4b[0]\n" + "bge 99b\n" + "101:" // Height 4: Multiply loop: Skip odd blocks + "cbz x25, 105f\n" + "tbz x25, #1, 102f\n" "ldr h0, [x24], #0x2\n" "ldr h1, [x23], #0x2\n" "ldr h2, [x22], #0x2\n" "ldr h3, [x21], #0x2\n" - "tbz x25, #0, 106f\n" + "tbz x25, #0, 103f\n" "ld1 { v0.b }[2], [x24]\n" "ld1 { v1.b }[2], [x23]\n" "ld1 { v2.b }[2], [x22]\n" "ld1 { v3.b }[2], [x21]\n" - "b 106f\n" - "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 103f\n" + "102:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x24, #0x0]\n" "ldr b1, [x23, #0x0]\n" "ldr b2, [x22, #0x0]\n" "ldr b3, [x21, #0x0]\n" - "106:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 107f\n" + "103:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 104f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q7, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" - "add x28, x28, #0x40\n" - ".inst 0x4f00f0f0 // sudot v16.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f01f0f4 // sudot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f02f0f8 // sudot v24.4s, 
v7.16b, v2.4b[0]\n" - ".inst 0x4f03f0fc // sudot v28.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f00f0d1 // sudot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f01f0d5 // sudot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f02f0d9 // sudot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f03f0dd // sudot v29.4s, v6.16b, v3.4b[0]\n" + "104:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + ".inst 0x4f00f150 // sudot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f01f154 // sudot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f02f158 // sudot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f03f15c // sudot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f00f091 // sudot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f01f095 // sudot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f02f099 // sudot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f03f09d // sudot v29.4s, v4.16b, v3.4b[0]\n" ".inst 0x4f00f0b2 // sudot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f01f0b6 // sudot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f02f0ba // sudot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f03f0be // sudot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f00f093 // sudot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f01f097 // sudot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f02f09b // sudot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f03f09f // sudot v31.4s, v4.16b, v3.4b[0]\n" - "108:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4f00f0d3 // sudot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f01f0d7 // sudot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f02f0db // sudot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f03f0df // sudot v31.4s, v6.16b, v3.4b[0]\n" + "105:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 94b\n" + "bne 91b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x24, x27, x20\n" + "add x26, x27, x20\n" + 
"prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add x22, x23, x20\n" - "prfm pstl1keep, [x22, #0x0]\n" - "tbnz %x[flags], #31, 109f\n" + "tbnz %x[flags], #31, 106f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v4.4s }, [x20]\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "neg v0.4s, v0.4s\n" + "neg v4.4s, v4.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "109:" // Height 4: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q4, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "106:" // Height 4: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q3, [x10, #0x20]\n" - "ldr q2, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" 
"add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v28.4s, v28.4s, v14.4s\n" "add v29.4s, v29.4s, v14.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" + "add x28, x28, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v2.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v23.4s, v23.4s, v2.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v2.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v31.4s, v31.4s, v2.4s\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "tbz %x[flags], #5, 110f\n" - "and v2.16b, v16.16b, v0.16b\n" - "and v1.16b, v17.16b, v0.16b\n" - "and v7.16b, v18.16b, v0.16b\n" - "and v6.16b, v19.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v4.16b, v21.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v3.16b, 
v22.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v2.4s\n" - "sqadd v17.4s, v17.4s, v1.4s\n" - "and v2.16b, v23.16b, v0.16b\n" - "and v1.16b, v24.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v7.4s\n" - "sqadd v19.4s, v19.4s, v6.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "sqadd v22.4s, v22.4s, v3.4s\n" - "and v7.16b, v25.16b, v0.16b\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "sqadd v24.4s, v24.4s, v1.4s\n" - "and v6.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v3.16b, v29.16b, v0.16b\n" - "and v2.16b, v30.16b, v0.16b\n" - "and v1.16b, v31.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v7.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v3.4s\n" - "sqadd v30.4s, v30.4s, v2.4s\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "110:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, 
v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v3.4s }, [x21]\n" - "ld1r { v2.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v1.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" @@ -1845,178 +1709,178 @@ void a64_hybrid_u8s8qa_dot_4x16 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" - "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - "smin v29.4s, v29.4s, v2.4s\n" - "smin v30.4s, v30.4s, v2.4s\n" - "smin v31.4s, v31.4s, v2.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - 
"smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - "smax v29.4s, v29.4s, v1.4s\n" - "smax v30.4s, v30.4s, v1.4s\n" - "smax v31.4s, v31.4s, v1.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v0.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - 
"uzp1 v19.8h, v22.8h, v23.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v18.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v17.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v0.16b\n" - "uzp1 v20.16b, v20.16b, v19.16b\n" - "uzp1 v24.16b, v24.16b, v18.16b\n" - "uzp1 v28.16b, v28.16b, v17.16b\n" - "bge 119f\n" - "tbz x9, #3, 114f\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 115f\n" + "tbz x10, #3, 110f\n" "str d16, [x27], #0x8\n" - "str d20, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x22], #0x8\n" - "tbz x9, #2, 112f\n" + "str d20, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "tbz x10, #2, 108f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x22], #0x4\n" - "tbz x9, #1, 111f\n" + "st1 { v20.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "st1 { v28.s }[2], [x24], #0x4\n" + "tbz x10, #1, 107f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "st1 { v28.h }[6], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b }[14], [x22]\n" - "b 118f\n" - "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "st1 { v28.b }[14], [x24]\n" + "b 114f\n" + "107:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 114f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x22]\n" - 
"b 118f\n" - "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x9, #1, 113f\n" + "st1 { v20.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "st1 { v28.b }[12], [x24]\n" + "b 114f\n" + "108:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 109f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "st1 { v28.h }[4], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x22]\n" - "b 118f\n" - "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "st1 { v28.b }[10], [x24]\n" + "b 114f\n" + "109:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 114f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "st1 { v28.b }[8], [x22]\n" - "b 118f\n" - "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x9, #2, 116f\n" + "st1 { v20.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "st1 { v28.b }[8], [x24]\n" + "b 114f\n" + "110:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 112f\n" "str s16, [x27], #0x4\n" - "str s20, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x22], #0x4\n" - "tbz x9, #1, 115f\n" + "str s20, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "str s28, [x24], #0x4\n" + "tbz x10, #1, 111f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "st1 { v20.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "st1 { v28.h }[2], [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x24]\n" - "st1 { v24.b }[6], 
[x23]\n" - "st1 { v28.b }[6], [x22]\n" - "b 118f\n" - "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x9, #0, 118f\n" + "st1 { v20.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "st1 { v28.b }[6], [x24]\n" + "b 114f\n" + "111:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 114f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x22]\n" - "b 118f\n" - "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x9, #1, 117f\n" + "st1 { v20.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "st1 { v28.b }[4], [x24]\n" + "b 114f\n" + "112:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 113f\n" "str h16, [x27], #0x2\n" - "str h20, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "str h28, [x22], #0x2\n" - "tbz x9, #0, 118f\n" + "str h20, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "str h28, [x24], #0x2\n" + "tbz x10, #0, 114f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x22]\n" - "b 118f\n" - "117:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v20.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "st1 { v28.b }[2], [x24]\n" + "b 114f\n" + "113:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x22, #0x0]\n" - "118:" // Height 4: Partial direct writeback: Done - "b 120f\n" - "119:" // Height 4: Full writeback + "str b20, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "str b28, [x24, #0x0]\n" + "114:" // Height 4: Partial direct writeback: Done + "b 116f\n" + "115:" // Height 4: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x22, #0x0]\n" - "120:" // Height 4: Writeback done - "subs x9, x9, #0x10\n" - "bgt 92b\n" + "str q20, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "116:" 
// Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 89b\n" "subs %x[M], %x[M], #0x4\n" - "beq 122f\n" + "beq 118f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 121f\n" + "tbz %x[flags], #3, 117f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "121:" // Update direct input + "117:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "122:" // Exit + "118:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp index 00b9db05c0..97183d5cfd 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8s8qa_mmla_4x16/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,22 +73,19 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 97f\n" + "bge 94f\n" "cmp %x[M], #0x2\n" - "bgt 65f\n" - "beq 33f\n" - "mov x10, %x[col_bias]\n" + "bgt 63f\n" + "beq 32f\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" @@ -100,7 +96,6 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -120,92 +115,92 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "cmp x25, #0x10\n" "blt 11f\n" "ldr q1, [x24, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head "add x24, x24, #0x10\n" - "trn1 v0.2d, v1.2d, v27.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, 
[x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v27.2d\n" + "ldr q5, [x9, #0x70]\n" + "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, 
v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum "ldr q1, [x24, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" "cmp x25, #0x20\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "trn1 v0.2d, v1.2d, v24.2d\n" - "trn2 v1.2d, v1.2d, v24.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n" - 
".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" @@ -216,30 +211,30 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr d25, [x24], #0x8\n" - "trn1 v0.2d, v25.2d, v24.2d\n" + "ldr d1, [x24], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n" - ".inst 
0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88ac10 // usmmla v16.4s, v0.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89ac14 // usmmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aac11 // usmmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e84ac15 // usmmla v21.4s, v0.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e86ac16 // usmmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e87ac13 // usmmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e88ac17 // usmmla v23.4s, v0.16b, v8.16b\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks "cbz x25, 20f\n" @@ -263,27 +258,27 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "18:" // Height 1: Multiply loop: Ragged operand read: Done - "trn1 v0.2d, v1.2d, v24.2d\n" + "trn1 v0.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 19f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "19:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n" + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, 
[x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aac10 // usmmla v16.4s, v0.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84ac14 // usmmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e85ac11 // usmmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87ac12 // usmmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e88ac16 // usmmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e89ac13 // usmmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aac17 // usmmla v23.4s, v0.16b, v10.16b\n" "20:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -298,136 +293,122 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "tbnz %x[flags], #31, 21f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v16.4s }, [x20]\n" - "neg v16.4s, v16.4s\n" + "ld1r { v1.4s }, [x20]\n" + "neg v1.4s, v1.4s\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v16.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" "21:" // Height 1: skip row sum fixup - "ldr q24, [x10, #0x0]\n" - "ldr q22, [x10, #0x10]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q21, [x10, #0x20]\n" - "ldr q20, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v16.4s }, [x20]\n" - "add v23.4s, v23.4s, v24.4s\n" - "add v17.4s, v17.4s, v22.4s\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add x10, x10, #0x40\n" + "ld1r { v4.4s }, [x21]\n" + "add v23.4s, v23.4s, v0.4s\n" "ld1r { v0.4s }, [x20]\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v20.4s\n" - "sqrdmulh v23.4s, v23.4s, v16.4s\n" - "sqrdmulh v17.4s, v17.4s, v16.4s\n" - 
"sqrdmulh v18.4s, v18.4s, v16.4s\n" - "sqrdmulh v19.4s, v19.4s, v16.4s\n" - "tbz %x[flags], #5, 22f\n" - "and v22.16b, v23.16b, v0.16b\n" - "and v21.16b, v17.16b, v0.16b\n" - "and v20.16b, v18.16b, v0.16b\n" - "and v16.16b, v19.16b, v0.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "sshr v21.4s, v21.4s, #0x1f\n" - "sshr v20.4s, v20.4s, #0x1f\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v22.4s\n" - "sqadd v17.4s, v17.4s, v21.4s\n" - "sqadd v18.4s, v18.4s, v20.4s\n" - "sqadd v19.4s, v19.4s, v16.4s\n" - "22:" // Height 1: no shift correction + "add v17.4s, v17.4s, v1.4s\n" "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v6.4s }, [x20]\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v5.4s }, [x20]\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "add x28, x28, #0x40\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x21]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v21.4s }, [x21]\n" - "ld1r { v20.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v16.4s }, [x20]\n" - "add v23.4s, v23.4s, v21.4s\n" - "add v17.4s, v17.4s, v21.4s\n" - "add v18.4s, v18.4s, v21.4s\n" - "add v19.4s, v19.4s, v21.4s\n" - "smin v23.4s, v23.4s, v20.4s\n" - "smin v17.4s, v17.4s, v20.4s\n" - "smin v18.4s, v18.4s, v20.4s\n" - "smin v19.4s, v19.4s, v20.4s\n" - "smax v23.4s, v23.4s, v16.4s\n" - "smax v17.4s, v17.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smax v19.4s, v19.4s, v16.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, 
v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v23.8h, v23.8h, v17.8h\n" - "uzp1 v16.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v16.16b\n" - "bge 31f\n" - "tbz x9, #3, 26f\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v17.16b\n" + "bge 30f\n" + "tbz x10, #3, 25f\n" "str d23, [x27], #0x8\n" - "tbz x9, #2, 24f\n" + "tbz x10, #2, 23f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "tbz x9, #1, 23f\n" + "tbz x10, #1, 22f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[14], [x27]\n" - "b 30f\n" - "23:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x9, #0, 30f\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 29f\n" "st1 { v23.b }[12], [x27]\n" - "b 30f\n" - "24:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x9, #1, 25f\n" + "b 29f\n" + "23:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 24f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[10], [x27]\n" - "b 30f\n" - "25:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x9, #0, 30f\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 29f\n" "st1 { v23.b }[8], [x27]\n" - "b 30f\n" - "26:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x9, #2, 28f\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 27f\n" "str s23, [x27], #0x4\n" - "tbz x9, #1, 27f\n" + "tbz x10, #1, 26f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[6], [x27]\n" - "b 30f\n" - "27:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x9, #0, 30f\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 29f\n" "st1 { v23.b }[4], [x27]\n" - "b 30f\n" - 
"28:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x9, #1, 29f\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 28f\n" "str h23, [x27], #0x2\n" - "tbz x9, #0, 30f\n" + "tbz x10, #0, 29f\n" "st1 { v23.b }[2], [x27]\n" - "b 30f\n" - "29:" // Height 1: Partial direct writeback: partial_1_0 + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "30:" // Height 1: Partial direct writeback: Done - "b 32f\n" - "31:" // Height 1: Full writeback + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "32:" // Height 1: Writeback done - "subs x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" "bgt 2b\n" - "b 130f\n" - "33:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 126f\n" + "32:" // Height 2 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "34:" // Height 2: Column loop + "33:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -436,420 +417,393 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "35:" // Height 2: setup done "mov x26, #0x0\n" - "36:" // Height 2: String loop + "35:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 37f\n" + "tbz %x[flags], #3, 36f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz 
x26, 38f\n" + "cbnz x26, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 38f\n" - "37:" // Height 2: setup direct input + "b 37f\n" + "36:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "38:" // Height 2: input setup done + "37:" // Height 2: input setup done "cmp x25, #0x10\n" - "blt 43f\n" + "blt 42f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" - "blt 41f\n" - "39:" // Height 2: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" + "blt 40f\n" + "38:" // Height 2: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 
0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n" - "tbnz %x[flags], #31, 40f\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 39f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 5: skip row sum + "39:" // Height 2: Multiply loop: unique 5: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" "cmp x25, #0x20\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "ldr q4, [x28, #0x60]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "ldr q4, [x9, #0x60]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "bge 39b\n" - "41:" // Height 2: Multiply loop: Single iteration only + "bge 38b\n" + "40:" // Height 2: Multiply loop: Single iteration only "trn1 v0.2d, 
v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" - "ldr q25, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n" - "ldr q24, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - "ldr q30, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" - "ldr q29, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" - "ldr q28, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" - "ldr q27, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" - "ldr q26, [x28, #0xd0]\n" - ".inst 0x4e99ac17 // usmmla v23.4s, v0.16b, v25.16b\n" - "ldr q25, [x28, #0xe0]\n" - ".inst 0x4e98ac30 // usmmla v16.4s, v1.16b, v24.16b\n" - "ldr q24, [x28, #0xf0]\n" - ".inst 0x4e9eac34 // usmmla v20.4s, v1.16b, v30.16b\n" - "add x28, x28, #0x100\n" - ".inst 0x4e9dac31 // usmmla v17.4s, v1.16b, v29.16b\n" - ".inst 0x4e9cac35 // usmmla v21.4s, v1.16b, v28.16b\n" - ".inst 0x4e9bac32 // usmmla v18.4s, v1.16b, v27.16b\n" - ".inst 0x4e9aac36 // usmmla v22.4s, v1.16b, v26.16b\n" - ".inst 0x4e99ac33 // usmmla v19.4s, v1.16b, v25.16b\n" - ".inst 0x4e98ac37 // usmmla v23.4s, v1.16b, v24.16b\n" - "tbnz %x[flags], #31, 42f\n" + "ldr q4, [x9, #0xd0]\n" + ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" + "add x9, x9, #0x100\n" + ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e89ac35 // usmmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e85ac33 // 
usmmla v19.4s, v1.16b, v5.16b\n" + ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" + "tbnz %x[flags], #31, 41f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" - "42:" // Height 2: Multiply loop: unique 6: skip row sum + "41:" // Height 2: Multiply loop: unique 6: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "43:" // Height 2: Multiply loop: Main loop skip - "cbz x25, 52f\n" + "42:" // Height 2: Multiply loop: Main loop skip + "cbz x25, 51f\n" "cmp x25, #0x8\n" - "blt 46f\n" - "44:" // Height 2: Multiply loop: Odd block loop - "ldr d25, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "trn1 v0.2d, v25.2d, v24.2d\n" - "tbnz %x[flags], #31, 45f\n" + "blt 45f\n" + "43:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "tbnz %x[flags], #31, 44f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" + "44:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n" - "bge 44b\n" - "46:" // Height 2: Multiply 
loop: Skip odd blocks - "cbz x25, 52f\n" - "tbz x25, #2, 48f\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88ac10 // usmmla v16.4s, v0.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89ac14 // usmmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aac11 // usmmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e84ac15 // usmmla v21.4s, v0.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e86ac16 // usmmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e87ac13 // usmmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e88ac17 // usmmla v23.4s, v0.16b, v8.16b\n" + "bge 43b\n" + "45:" // Height 2: Multiply loop: Skip odd blocks + "cbz x25, 51f\n" + "tbz x25, #2, 47f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" - "tbz x25, #1, 47f\n" + "tbz x25, #1, 46f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" - "tbz x25, #0, 50f\n" + "tbz x25, #0, 49f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" - "b 50f\n" - "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 50f\n" + "b 49f\n" + "46:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 49f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" - "b 50f\n" - "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 49f\n" + "b 49f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 48f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" - "tbz x25, #0, 50f\n" + "tbz x25, #0, 49f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" - "b 50f\n" - "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "b 49f\n" + "48:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" - "50:" // Height 2: Multiply loop: Ragged operand read: Done + "49:" // Height 2: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "tbnz 
%x[flags], #31, 51f\n" + "tbnz %x[flags], #31, 50f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "51:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q24, [x28, #0x0]\n" - "ldr q30, [x28, #0x10]\n" - "ldr q29, [x28, #0x20]\n" - "ldr q28, [x28, #0x30]\n" - "ldr q27, [x28, #0x40]\n" - "ldr q26, [x28, #0x50]\n" - "ldr q25, [x28, #0x60]\n" - ".inst 0x4e98ac10 // usmmla v16.4s, v0.16b, v24.16b\n" - "ldr q24, [x28, #0x70]\n" - ".inst 0x4e9eac14 // usmmla v20.4s, v0.16b, v30.16b\n" - ".inst 0x4e9dac11 // usmmla v17.4s, v0.16b, v29.16b\n" - ".inst 0x4e9cac15 // usmmla v21.4s, v0.16b, v28.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e9bac12 // usmmla v18.4s, v0.16b, v27.16b\n" - ".inst 0x4e9aac16 // usmmla v22.4s, v0.16b, v26.16b\n" - ".inst 0x4e99ac13 // usmmla v19.4s, v0.16b, v25.16b\n" - ".inst 0x4e98ac17 // usmmla v23.4s, v0.16b, v24.16b\n" - "52:" // Height 2: Multiply loop: No odd multiplies + "50:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aac10 // usmmla v16.4s, v0.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84ac14 // usmmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e85ac11 // usmmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e87ac12 // usmmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e88ac16 // usmmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e89ac13 // usmmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e8aac17 // usmmla v23.4s, v0.16b, v10.16b\n" + "51:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 36b\n" + "bne 35b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v24.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" 
"prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" + "add x26, x27, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "mov v23.16b, v24.16b\n" - "tbnz %x[flags], #31, 53f\n" + "prfm pstl1keep, [x26, #0x0]\n" + "mov v23.16b, v4.16b\n" + "tbnz %x[flags], #31, 52f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v24.4s }, [x20]\n" - "neg v24.4s, v24.4s\n" + "ld1r { v2.4s }, [x20]\n" + "neg v2.4s, v2.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v24.4s\n" - "mul v12.4s, v12.4s, v24.4s\n" - "53:" // Height 2: skip row sum fixup - "ldr q28, [x10, #0x0]\n" - "ldr q27, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "52:" // Height 2: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q26, [x10, #0x20]\n" - "ldr q25, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v24.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add v23.4s, v23.4s, v28.4s\n" - "add v20.4s, v20.4s, v27.4s\n" - "add x10, x10, #0x40\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v25.4s\n" - "add v16.4s, v16.4s, v28.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v17.4s, v17.4s, v27.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v25.4s\n" - "sqrdmulh v23.4s, v23.4s, v24.4s\n" - "sqrdmulh v20.4s, v20.4s, v24.4s\n" - "sqrdmulh v21.4s, 
v21.4s, v24.4s\n" - "sqrdmulh v22.4s, v22.4s, v24.4s\n" - "sqrdmulh v16.4s, v16.4s, v24.4s\n" - "sqrdmulh v17.4s, v17.4s, v24.4s\n" - "sqrdmulh v18.4s, v18.4s, v24.4s\n" - "sqrdmulh v19.4s, v19.4s, v24.4s\n" - "tbz %x[flags], #5, 54f\n" - "and v24.16b, v23.16b, v0.16b\n" - "and v30.16b, v20.16b, v0.16b\n" - "and v29.16b, v21.16b, v0.16b\n" - "and v28.16b, v22.16b, v0.16b\n" - "and v27.16b, v16.16b, v0.16b\n" - "and v26.16b, v17.16b, v0.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v25.16b, v18.16b, v0.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v24.4s\n" - "and v24.16b, v19.16b, v0.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "sqadd v22.4s, v22.4s, v28.4s\n" - "sqadd v16.4s, v16.4s, v27.4s\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "sqadd v18.4s, v18.4s, v25.4s\n" - "sqadd v19.4s, v19.4s, v24.4s\n" - "54:" // Height 2: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "add x22, %x[qp], %[c_offset]\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v17.4s, v17.4s, v1.4s\n" + "cmp x10, #0x10\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add x28, x28, #0x40\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v23.4s, 
v23.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v24.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add v23.4s, v23.4s, v26.4s\n" - "add v20.4s, v20.4s, v26.4s\n" - "add v21.4s, v21.4s, v26.4s\n" - "add v22.4s, v22.4s, v26.4s\n" - "add v16.4s, v16.4s, v26.4s\n" - "add v17.4s, v17.4s, v26.4s\n" - "add v18.4s, v18.4s, v26.4s\n" - "add v19.4s, v19.4s, v26.4s\n" - "smin v23.4s, v23.4s, v25.4s\n" - "smin v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v25.4s\n" - "smin v22.4s, v22.4s, v25.4s\n" - "smin v16.4s, v16.4s, v25.4s\n" - "smin v17.4s, v17.4s, v25.4s\n" - "smin v18.4s, v18.4s, v25.4s\n" - "smin v19.4s, v19.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smax v22.4s, v22.4s, v24.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smax v18.4s, v18.4s, v24.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, 
v5.4s\n" "uzp1 v23.8h, v23.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.16b, v23.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 63f\n" - "tbz x9, #3, 58f\n" + "bge 61f\n" + "tbz x10, #3, 56f\n" "str d23, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "tbz x9, #2, 56f\n" + "str d16, [x26], #0x8\n" + "tbz x10, #2, 54f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "tbz x9, #1, 55f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz x10, #1, 53f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "b 62f\n" - "55:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[14], [x26]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 60f\n" "st1 { v23.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "b 62f\n" - "56:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x9, #1, 57f\n" + "st1 { v16.b }[12], [x26]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 55f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "b 62f\n" - "57:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[10], [x26]\n" + "b 60f\n" + "55:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 60f\n" "st1 { v23.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "b 62f\n" - "58:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x9, #2, 60f\n" + "st1 { v16.b }[8], [x26]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 58f\n" "str s23, [x27], #0x4\n" - "str 
s16, [x24], #0x4\n" - "tbz x9, #1, 59f\n" + "str s16, [x26], #0x4\n" + "tbz x10, #1, 57f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "b 62f\n" - "59:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x9, #0, 62f\n" + "st1 { v16.b }[6], [x26]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 60f\n" "st1 { v23.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "b 62f\n" - "60:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x9, #1, 61f\n" + "st1 { v16.b }[4], [x26]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 59f\n" "str h23, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "tbz x9, #0, 62f\n" + "str h16, [x26], #0x2\n" + "tbz x10, #0, 60f\n" "st1 { v23.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "b 62f\n" - "61:" // Height 2: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "62:" // Height 2: Partial direct writeback: Done - "b 64f\n" - "63:" // Height 2: Full writeback + "str b16, [x26, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "64:" // Height 2: Writeback done - "subs x9, x9, #0x10\n" - "bgt 34b\n" - "b 130f\n" - "65:" // Height 3 - "mov x10, %x[col_bias]\n" + "str q16, [x26, #0x0]\n" + "62:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 33b\n" + "b 126f\n" + "63:" // Height 3 "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" "movi v15.16b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - 
"ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "66:" // Height 3: Column loop + "64:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -866,43 +820,42 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "67:" // Height 3: setup done "mov x26, #0x0\n" - "68:" // Height 3: String loop + "66:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 69f\n" + "tbz %x[flags], #3, 67f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 70f\n" + "cbnz x26, 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 70f\n" - "69:" // Height 3: setup direct input + "b 68f\n" + "67:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "70:" // Height 3: input setup done + "68:" // Height 3: input setup done "cmp x25, #0x10\n" - "blt 75f\n" + "blt 73f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q3, [x22, #0x0]\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "blt 73f\n" - "71:" // Height 3: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "blt 71f\n" + "69:" // Height 3: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" @@ -910,35 
+863,35 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "add x22, x22, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q14, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n" ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n" - "ldr q4, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" - ".inst 0x4e8eac13 // usmmla v19.4s, v0.16b, v14.16b\n" - ".inst 0x4e8eac5b // usmmla v27.4s, v2.16b, v14.16b\n" - "ldr q6, [x28, #0xd0]\n" + "ldr q10, [x9, #0xc0]\n" + ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84ac5b // usmmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e84ac30 // usmmla v16.4s, v1.16b, v4.16b\n" - ".inst 0x4e84ac78 // usmmla v24.4s, v3.16b, v4.16b\n" - "ldr q4, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86ac78 // usmmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87ac7c // 
usmmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" @@ -947,34 +900,34 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( ".inst 0x4e89ac7d // usmmla v29.4s, v3.16b, v9.16b\n" ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n" ".inst 0x4e8aac7a // usmmla v26.4s, v3.16b, v10.16b\n" - ".inst 0x4e86ac36 // usmmla v22.4s, v1.16b, v6.16b\n" - ".inst 0x4e86ac7e // usmmla v30.4s, v3.16b, v6.16b\n" + ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84ac7e // usmmla v30.4s, v3.16b, v4.16b\n" ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n" ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n" - ".inst 0x4e84ac37 // usmmla v23.4s, v1.16b, v4.16b\n" - ".inst 0x4e84ac7f // usmmla v31.4s, v3.16b, v4.16b\n" - "tbnz %x[flags], #31, 72f\n" + ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86ac7f // usmmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 70f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "72:" // Height 3: Multiply loop: unique 9: skip row sum + "70:" // Height 3: Multiply loop: unique 9: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q3, [x22, #0x0]\n" - "ldr q5, [x28, #0x0]\n" + "ldr q5, [x9, #0x0]\n" "cmp x25, #0x20\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 71b\n" - "73:" // Height 3: Multiply loop: Single iteration only + "bge 69b\n" + "71:" // Height 3: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, 
x25, #0x10\n" @@ -983,35 +936,35 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "add x22, x22, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q14, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" ".inst 0x4e86ac14 // usmmla v20.4s, v0.16b, v6.16b\n" ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n" - "ldr q4, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" - ".inst 0x4e8eac13 // usmmla v19.4s, v0.16b, v14.16b\n" - ".inst 0x4e8eac5b // usmmla v27.4s, v2.16b, v14.16b\n" - "ldr q6, [x28, #0xd0]\n" + "ldr q10, [x9, #0xc0]\n" + ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" + ".inst 0x4e84ac5b // usmmla v27.4s, v2.16b, v4.16b\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e84ac30 // usmmla v16.4s, v1.16b, v4.16b\n" - ".inst 0x4e84ac78 // usmmla v24.4s, v3.16b, v4.16b\n" - "ldr q4, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q5, [x9, #0xe0]\n" + ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" + ".inst 0x4e86ac78 // usmmla v24.4s, v3.16b, v6.16b\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" 
".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" @@ -1020,416 +973,378 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( ".inst 0x4e89ac7d // usmmla v29.4s, v3.16b, v9.16b\n" ".inst 0x4e8aac32 // usmmla v18.4s, v1.16b, v10.16b\n" ".inst 0x4e8aac7a // usmmla v26.4s, v3.16b, v10.16b\n" - ".inst 0x4e86ac36 // usmmla v22.4s, v1.16b, v6.16b\n" - ".inst 0x4e86ac7e // usmmla v30.4s, v3.16b, v6.16b\n" + ".inst 0x4e84ac36 // usmmla v22.4s, v1.16b, v4.16b\n" + ".inst 0x4e84ac7e // usmmla v30.4s, v3.16b, v4.16b\n" ".inst 0x4e85ac33 // usmmla v19.4s, v1.16b, v5.16b\n" ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n" - ".inst 0x4e84ac37 // usmmla v23.4s, v1.16b, v4.16b\n" - ".inst 0x4e84ac7f // usmmla v31.4s, v3.16b, v4.16b\n" - "tbnz %x[flags], #31, 74f\n" + ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e86ac7f // usmmla v31.4s, v3.16b, v6.16b\n" + "tbnz %x[flags], #31, 72f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "74:" // Height 3: Multiply loop: unique 10: skip row sum + "72:" // Height 3: Multiply loop: unique 10: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "75:" // Height 3: Multiply loop: Main loop skip - "cbz x25, 84f\n" + "73:" // Height 3: Multiply loop: Main loop skip + "cbz x25, 82f\n" "cmp x25, #0x8\n" - "blt 78f\n" - "76:" // Height 3: Multiply loop: Odd block loop - "ldr d3, [x24], #0x8\n" - "ldr d0, [x23], #0x8\n" - "ldr d1, [x22], #0x8\n" - "trn1 v0.2d, v3.2d, v0.2d\n" - "trn1 v2.2d, v1.2d, v2.2d\n" - "tbnz %x[flags], #31, 77f\n" + "blt 76f\n" + "74:" // Height 3: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], 
#31, 75f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "77:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" + "75:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" - ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n" - ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88ac10 // usmmla v16.4s, v0.16b, v8.16b\n" + ".inst 0x4e88ac58 // usmmla v24.4s, v2.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89ac14 // usmmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e89ac5c // usmmla v28.4s, v2.16b, v9.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e8aac11 // usmmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aac59 // usmmla v25.4s, v2.16b, v10.16b\n" + ".inst 0x4e84ac15 // usmmla v21.4s, v0.16b, v4.16b\n" + ".inst 0x4e84ac5d // usmmla v29.4s, v2.16b, v4.16b\n" ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n" ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n" - ".inst 
0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n" - "bge 76b\n" - "78:" // Height 3: Multiply loop: Skip odd blocks - "cbz x25, 84f\n" - "tbz x25, #2, 80f\n" + ".inst 0x4e86ac16 // usmmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e86ac5e // usmmla v30.4s, v2.16b, v6.16b\n" + ".inst 0x4e87ac13 // usmmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e87ac5b // usmmla v27.4s, v2.16b, v7.16b\n" + ".inst 0x4e88ac17 // usmmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e88ac5f // usmmla v31.4s, v2.16b, v8.16b\n" + "bge 74b\n" + "76:" // Height 3: Multiply loop: Skip odd blocks + "cbz x25, 82f\n" + "tbz x25, #2, 78f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" "ldr s3, [x22], #0x4\n" - "tbz x25, #1, 79f\n" + "tbz x25, #1, 77f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" "ld1 { v3.h }[2], [x22], #0x2\n" - "tbz x25, #0, 82f\n" + "tbz x25, #0, 80f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" "ld1 { v3.b }[6], [x22]\n" - "b 82f\n" - "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 82f\n" + "b 80f\n" + "77:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 80f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" "ld1 { v3.b }[4], [x22]\n" - "b 82f\n" - "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 81f\n" + "b 80f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 79f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" "ldr h3, [x22], #0x2\n" - "tbz x25, #0, 82f\n" + "tbz x25, #0, 80f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" "ld1 { v3.b }[2], [x22]\n" - "b 82f\n" - "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "b 80f\n" + "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" "ldr b3, [x22, #0x0]\n" - "82:" // Height 3: Multiply loop: Ragged operand read: Done + "80:" 
// Height 3: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "tbnz %x[flags], #31, 83f\n" + "trn1 v2.2d, v3.2d, v9.2d\n" + "tbnz %x[flags], #31, 81f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "83:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" + "81:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aac10 // usmmla v16.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aac58 // usmmla v24.4s, v2.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84ac14 // usmmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e84ac5c // usmmla v28.4s, v2.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85ac11 // usmmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e85ac59 // usmmla v25.4s, v2.16b, v5.16b\n" ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n" ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n" - ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n" - ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83ac5b // usmmla 
v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n" - "84:" // Height 3: Multiply loop: No odd multiplies + ".inst 0x4e87ac12 // usmmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e87ac5a // usmmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e88ac16 // usmmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e88ac5e // usmmla v30.4s, v2.16b, v8.16b\n" + ".inst 0x4e89ac13 // usmmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e89ac5b // usmmla v27.4s, v2.16b, v9.16b\n" + ".inst 0x4e8aac17 // usmmla v23.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aac5f // usmmla v31.4s, v2.16b, v10.16b\n" + "82:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 68b\n" + "bne 66b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v0.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" "uzp1 v24.2d, v24.2d, v28.2d\n" "uzp1 v25.2d, v25.2d, v29.2d\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v0.16b\n" - "tbnz %x[flags], #31, 85f\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 83f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "ld1r { v23.4s }, [x20]\n" - "neg v23.4s, v23.4s\n" + "ld1r { v3.4s }, [x20]\n" + "neg v3.4s, v3.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v23.4s\n" - "mul v12.4s, 
v12.4s, v23.4s\n" - "mul v13.4s, v13.4s, v23.4s\n" - "85:" // Height 3: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q30, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "83:" // Height 3: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q29, [x10, #0x20]\n" - "ldr q28, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v23.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v30.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v28.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add x28, x28, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v30.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v28.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v30.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v28.4s\n" - "sqrdmulh v31.4s, v31.4s, v23.4s\n" - "sqrdmulh v20.4s, v20.4s, 
v23.4s\n" - "sqrdmulh v21.4s, v21.4s, v23.4s\n" - "sqrdmulh v22.4s, v22.4s, v23.4s\n" - "sqrdmulh v16.4s, v16.4s, v23.4s\n" - "sqrdmulh v17.4s, v17.4s, v23.4s\n" - "sqrdmulh v18.4s, v18.4s, v23.4s\n" - "sqrdmulh v19.4s, v19.4s, v23.4s\n" - "sqrdmulh v24.4s, v24.4s, v23.4s\n" - "sqrdmulh v25.4s, v25.4s, v23.4s\n" - "sqrdmulh v26.4s, v26.4s, v23.4s\n" - "sqrdmulh v27.4s, v27.4s, v23.4s\n" - "tbz %x[flags], #5, 86f\n" - "and v1.16b, v31.16b, v0.16b\n" - "and v30.16b, v20.16b, v0.16b\n" - "and v29.16b, v21.16b, v0.16b\n" - "and v28.16b, v22.16b, v0.16b\n" - "and v23.16b, v16.16b, v0.16b\n" - "and v3.16b, v17.16b, v0.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "and v2.16b, v18.16b, v0.16b\n" - "sqadd v31.4s, v31.4s, v1.4s\n" - "sqadd v20.4s, v20.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "sqadd v22.4s, v22.4s, v28.4s\n" - "sqadd v16.4s, v16.4s, v23.4s\n" - "and v1.16b, v19.16b, v0.16b\n" - "and v30.16b, v24.16b, v0.16b\n" - "and v29.16b, v25.16b, v0.16b\n" - "and v28.16b, v26.16b, v0.16b\n" - "and v23.16b, v27.16b, v0.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v23.4s, v23.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v18.4s, v18.4s, v2.4s\n" - "sqadd v19.4s, v19.4s, v1.4s\n" - "sqadd v24.4s, v24.4s, v30.4s\n" - "sqadd v25.4s, v25.4s, v29.4s\n" - "sqadd v26.4s, v26.4s, v28.4s\n" - "sqadd v27.4s, v27.4s, v23.4s\n" - "86:" // Height 3: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh 
v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v29.4s }, [x21]\n" - "ld1r { v28.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v23.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v24.4s, v24.4s, v0.4s\n" "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v29.4s\n" - "add v20.4s, v20.4s, v29.4s\n" - "add v21.4s, v21.4s, v29.4s\n" - "add v22.4s, v22.4s, v29.4s\n" - "add v16.4s, v16.4s, v29.4s\n" - "add v17.4s, v17.4s, v29.4s\n" - "add v18.4s, v18.4s, v29.4s\n" - "add v19.4s, v19.4s, v29.4s\n" - "add v24.4s, v24.4s, v29.4s\n" - "add v25.4s, v25.4s, v29.4s\n" - "add v26.4s, v26.4s, v29.4s\n" - "add v27.4s, v27.4s, v29.4s\n" - "smin v31.4s, v31.4s, v28.4s\n" - "smin v20.4s, v20.4s, v28.4s\n" - "smin v21.4s, v21.4s, v28.4s\n" - "smin v22.4s, v22.4s, v28.4s\n" - "smin v16.4s, v16.4s, v28.4s\n" - "smin v17.4s, v17.4s, v28.4s\n" - "smin v18.4s, v18.4s, v28.4s\n" - "smin v19.4s, v19.4s, v28.4s\n" - "smin v24.4s, v24.4s, v28.4s\n" - "smin v25.4s, v25.4s, v28.4s\n" - "smin v26.4s, v26.4s, v28.4s\n" - "smin v27.4s, v27.4s, v28.4s\n" - "smax v31.4s, v31.4s, v23.4s\n" - "smax v20.4s, v20.4s, v23.4s\n" - "smax v21.4s, v21.4s, v23.4s\n" - "smax v22.4s, v22.4s, v23.4s\n" - "smax v16.4s, v16.4s, v23.4s\n" - "smax v17.4s, v17.4s, v23.4s\n" - "smax v18.4s, v18.4s, v23.4s\n" - "smax v19.4s, v19.4s, v23.4s\n" - "smax v24.4s, v24.4s, v23.4s\n" - "smax v25.4s, v25.4s, 
v23.4s\n" - "smax v26.4s, v26.4s, v23.4s\n" - "smax v27.4s, v27.4s, v23.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v18.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 95f\n" - "tbz x9, #3, 90f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x10, #3, 87f\n" "str d31, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x9, #2, 88f\n" + "str d16, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x10, #2, 85f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { 
v16.s }[2], [x24], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x9, #1, 87f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x10, #1, 84f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v24.h }[6], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v24.b }[14], [x23]\n" - "b 94f\n" - "87:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v24.b }[14], [x25]\n" + "b 91f\n" + "84:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 91f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v24.b }[12], [x23]\n" - "b 94f\n" - "88:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x9, #1, 89f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v24.b }[12], [x25]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 86f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v24.h }[4], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v24.b }[10], [x23]\n" - "b 94f\n" - "89:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v24.b }[10], [x25]\n" + "b 91f\n" + "86:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 91f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v24.b }[8], [x23]\n" - "b 94f\n" - "90:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x9, #2, 92f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v24.b }[8], [x25]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 89f\n" "str s31, [x27], 
#0x4\n" - "str s16, [x24], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x9, #1, 91f\n" + "str s16, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x10, #1, 88f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v24.h }[2], [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v24.b }[6], [x23]\n" - "b 94f\n" - "91:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x9, #0, 94f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v24.b }[6], [x25]\n" + "b 91f\n" + "88:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 91f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v24.b }[4], [x23]\n" - "b 94f\n" - "92:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x9, #1, 93f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v24.b }[4], [x25]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 90f\n" "str h31, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x9, #0, 94f\n" + "str h16, [x26], #0x2\n" + "str h24, [x25], #0x2\n" + "tbz x10, #0, 91f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v24.b }[2], [x23]\n" - "b 94f\n" - "93:" // Height 3: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "st1 { v24.b }[2], [x25]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b24, [x23, #0x0]\n" - "94:" // Height 3: Partial direct writeback: Done - "b 96f\n" - "95:" // Height 3: Full writeback + "str b16, [x26, #0x0]\n" + "str b24, [x25, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "str q24, [x23, #0x0]\n" - "96:" // Height 3: Writeback done - 
"subs x9, x9, #0x10\n" - "bgt 66b\n" - "b 130f\n" - "97:" // Height 4 + "str q16, [x26, #0x0]\n" + "str q24, [x25, #0x0]\n" + "93:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 64b\n" + "b 126f\n" + "94:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v13.4s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "movi v15.16b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "98:" // Height 4: Column loop + "95:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1446,54 +1361,53 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "99:" // Height 4: setup done "mov x26, #0x0\n" - "100:" // Height 4: String loop + "97:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 101f\n" + "tbz %x[flags], #3, 98f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 102f\n" + "cbnz x26, 99f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 102f\n" - "101:" // Height 4: setup direct input + "b 99f\n" + "98:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, 
x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "102:" // Height 4: input setup done + "99:" // Height 4: input setup done "cmp x25, #0x10\n" - "blt 107f\n" + "blt 104f\n" "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "cmp x25, #0x20\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" - "blt 105f\n" - "103:" // Height 4: Multiply loop: Main loop head + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" + "blt 102f\n" + "100:" // Height 4: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" ".inst 0x4e85ac10 // usmmla v16.4s, v0.16b, v5.16b\n" @@ -1501,29 +1415,29 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 
0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" ".inst 0x4e84ac5b // usmmla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" + "ldr q5, [x9, #0xe0]\n" ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" ".inst 0x4e86ac78 // usmmla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" @@ -1538,37 +1452,37 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n" ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" ".inst 0x4e86ac7f // usmmla v31.4s, v3.16b, v6.16b\n" - "tbnz %x[flags], #31, 104f\n" + "tbnz %x[flags], #31, 101f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "104:" // Height 4: Multiply loop: unique 13: skip row sum + "101:" // Height 4: Multiply loop: unique 13: skip row sum "ldr q1, [x24, #0x0]\n" "ldr q2, [x23, #0x0]\n" "sub x25, x25, #0x10\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" "cmp x25, #0x20\n" - "ldr q5, [x28, #0x0]\n" - "ldr q6, [x28, #0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q8, [x28, #0x30]\n" - "ldr q9, [x28, #0x40]\n" - "ldr q10, [x28, #0x50]\n" + "ldr q5, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + "ldr q7, [x9, #0x20]\n" + "ldr q8, [x9, #0x30]\n" + "ldr q9, [x9, #0x40]\n" + "ldr q10, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "bge 103b\n" - "105:" // Height 4: Multiply loop: Single iteration only + "bge 100b\n" + "102:" // 
Height 4: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q4, [x9, #0x60]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" @@ -1577,29 +1491,29 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e88ac15 // usmmla v21.4s, v0.16b, v8.16b\n" ".inst 0x4e85ac58 // usmmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q5, [x9, #0x70]\n" ".inst 0x4e86ac5c // usmmla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4e88ac5d // usmmla v29.4s, v2.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q8, [x9, #0xa0]\n" ".inst 0x4e89ac12 // usmmla v18.4s, v0.16b, v9.16b\n" ".inst 0x4e89ac5a // usmmla v26.4s, v2.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q9, [x9, #0xb0]\n" ".inst 0x4e8aac16 // usmmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aac5e // usmmla v30.4s, v2.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q10, [x9, #0xc0]\n" ".inst 0x4e84ac13 // usmmla v19.4s, v0.16b, v4.16b\n" ".inst 0x4e84ac5b // usmmla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" + "ldr q4, [x9, #0xd0]\n" ".inst 0x4e85ac17 // usmmla v23.4s, v0.16b, v5.16b\n" ".inst 0x4e85ac5f // usmmla v31.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" + "ldr q5, [x9, #0xe0]\n" ".inst 0x4e86ac30 // usmmla v16.4s, v1.16b, v6.16b\n" ".inst 0x4e86ac78 // usmmla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - "add x28, x28, #0x100\n" + "ldr q6, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4e87ac34 // usmmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87ac7c // usmmla v28.4s, v3.16b, v7.16b\n" ".inst 0x4e88ac31 // usmmla v17.4s, v1.16b, v8.16b\n" @@ -1614,299 +1528,249 @@ void 
a64_hybrid_u8s8qa_mmla_4x16 ( ".inst 0x4e85ac7b // usmmla v27.4s, v3.16b, v5.16b\n" ".inst 0x4e86ac37 // usmmla v23.4s, v1.16b, v6.16b\n" ".inst 0x4e86ac7f // usmmla v31.4s, v3.16b, v6.16b\n" - "tbnz %x[flags], #31, 106f\n" + "tbnz %x[flags], #31, 103f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n" - "106:" // Height 4: Multiply loop: unique 14: skip row sum + "103:" // Height 4: Multiply loop: unique 14: skip row sum "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" "prfm pldl1keep, [x21, #0x80]\n" - "107:" // Height 4: Multiply loop: Main loop skip - "cbz x25, 116f\n" + "104:" // Height 4: Multiply loop: Main loop skip + "cbz x25, 113f\n" "cmp x25, #0x8\n" - "blt 110f\n" - "108:" // Height 4: Multiply loop: Odd block loop - "ldr d3, [x24], #0x8\n" - "ldr d0, [x23], #0x8\n" - "ldr d2, [x22], #0x8\n" - "ldr d1, [x21], #0x8\n" - "trn1 v0.2d, v3.2d, v0.2d\n" - "trn1 v2.2d, v2.2d, v1.2d\n" - "tbnz %x[flags], #31, 109f\n" + "blt 107f\n" + "105:" // Height 4: Multiply loop: Odd block loop + "ldr d1, [x24], #0x8\n" + "ldr d2, [x23], #0x8\n" + "ldr d3, [x22], #0x8\n" + "ldr d7, [x21], #0x8\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v7.2d\n" + "tbnz %x[flags], #31, 106f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "109:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, #0x10]\n" + "106:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" "sub x25, x25, #0x8\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q4, [x9, #0x30]\n" "cmp x25, #0x8\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81ac10 // 
usmmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" - ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n" - ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n" + "ldr q5, [x9, #0x40]\n" + "ldr q6, [x9, #0x50]\n" + "ldr q7, [x9, #0x60]\n" + ".inst 0x4e88ac10 // usmmla v16.4s, v0.16b, v8.16b\n" + ".inst 0x4e88ac58 // usmmla v24.4s, v2.16b, v8.16b\n" + "ldr q8, [x9, #0x70]\n" + ".inst 0x4e89ac14 // usmmla v20.4s, v0.16b, v9.16b\n" + ".inst 0x4e89ac5c // usmmla v28.4s, v2.16b, v9.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e8aac11 // usmmla v17.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aac59 // usmmla v25.4s, v2.16b, v10.16b\n" + ".inst 0x4e84ac15 // usmmla v21.4s, v0.16b, v4.16b\n" + ".inst 0x4e84ac5d // usmmla v29.4s, v2.16b, v4.16b\n" ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n" ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n" - "bge 108b\n" - "110:" // Height 4: Multiply loop: Skip odd blocks - "cbz x25, 116f\n" - "tbz x25, #2, 112f\n" + ".inst 0x4e86ac16 // usmmla v22.4s, v0.16b, v6.16b\n" + ".inst 0x4e86ac5e // usmmla v30.4s, v2.16b, v6.16b\n" + ".inst 0x4e87ac13 // usmmla v19.4s, v0.16b, v7.16b\n" + ".inst 0x4e87ac5b // usmmla v27.4s, v2.16b, v7.16b\n" + ".inst 0x4e88ac17 // usmmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e88ac5f // usmmla v31.4s, v2.16b, v8.16b\n" + "bge 105b\n" + "107:" // Height 4: Multiply loop: 
Skip odd blocks + "cbz x25, 113f\n" + "tbz x25, #2, 109f\n" "ldr s1, [x24], #0x4\n" "ldr s2, [x23], #0x4\n" "ldr s3, [x22], #0x4\n" "ldr s9, [x21], #0x4\n" - "tbz x25, #1, 111f\n" + "tbz x25, #1, 108f\n" "ld1 { v1.h }[2], [x24], #0x2\n" "ld1 { v2.h }[2], [x23], #0x2\n" "ld1 { v3.h }[2], [x22], #0x2\n" "ld1 { v9.h }[2], [x21], #0x2\n" - "tbz x25, #0, 114f\n" + "tbz x25, #0, 111f\n" "ld1 { v1.b }[6], [x24]\n" "ld1 { v2.b }[6], [x23]\n" "ld1 { v3.b }[6], [x22]\n" "ld1 { v9.b }[6], [x21]\n" - "b 114f\n" - "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 - "tbz x25, #0, 114f\n" + "b 111f\n" + "108:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x25, #0, 111f\n" "ld1 { v1.b }[4], [x24]\n" "ld1 { v2.b }[4], [x23]\n" "ld1 { v3.b }[4], [x22]\n" "ld1 { v9.b }[4], [x21]\n" - "b 114f\n" - "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 - "tbz x25, #1, 113f\n" + "b 111f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x25, #1, 110f\n" "ldr h1, [x24], #0x2\n" "ldr h2, [x23], #0x2\n" "ldr h3, [x22], #0x2\n" "ldr h9, [x21], #0x2\n" - "tbz x25, #0, 114f\n" + "tbz x25, #0, 111f\n" "ld1 { v1.b }[2], [x24]\n" "ld1 { v2.b }[2], [x23]\n" "ld1 { v3.b }[2], [x22]\n" "ld1 { v9.b }[2], [x21]\n" - "b 114f\n" - "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "b 111f\n" + "110:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "ldr b2, [x23, #0x0]\n" "ldr b3, [x22, #0x0]\n" "ldr b9, [x21, #0x0]\n" - "114:" // Height 4: Multiply loop: Ragged operand read: Done + "111:" // Height 4: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" "trn1 v2.2d, v3.2d, v9.2d\n" - "tbnz %x[flags], #31, 115f\n" + "tbnz %x[flags], #31, 112f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "115:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q1, [x28, #0x0]\n" - "ldr q8, [x28, 
#0x10]\n" - "ldr q7, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - "ldr q5, [x28, #0x40]\n" - "ldr q4, [x28, #0x50]\n" - "ldr q3, [x28, #0x60]\n" - ".inst 0x4e81ac10 // usmmla v16.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac58 // usmmla v24.4s, v2.16b, v1.16b\n" - "ldr q1, [x28, #0x70]\n" - ".inst 0x4e88ac14 // usmmla v20.4s, v0.16b, v8.16b\n" - ".inst 0x4e88ac5c // usmmla v28.4s, v2.16b, v8.16b\n" - "add x28, x28, #0x80\n" - ".inst 0x4e87ac11 // usmmla v17.4s, v0.16b, v7.16b\n" - ".inst 0x4e87ac59 // usmmla v25.4s, v2.16b, v7.16b\n" + "112:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x9, #0x0]\n" + "ldr q4, [x9, #0x10]\n" + "ldr q5, [x9, #0x20]\n" + "ldr q6, [x9, #0x30]\n" + "ldr q7, [x9, #0x40]\n" + "ldr q8, [x9, #0x50]\n" + "ldr q9, [x9, #0x60]\n" + ".inst 0x4e8aac10 // usmmla v16.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aac58 // usmmla v24.4s, v2.16b, v10.16b\n" + "ldr q10, [x9, #0x70]\n" + ".inst 0x4e84ac14 // usmmla v20.4s, v0.16b, v4.16b\n" + ".inst 0x4e84ac5c // usmmla v28.4s, v2.16b, v4.16b\n" + "add x9, x9, #0x80\n" + ".inst 0x4e85ac11 // usmmla v17.4s, v0.16b, v5.16b\n" + ".inst 0x4e85ac59 // usmmla v25.4s, v2.16b, v5.16b\n" ".inst 0x4e86ac15 // usmmla v21.4s, v0.16b, v6.16b\n" ".inst 0x4e86ac5d // usmmla v29.4s, v2.16b, v6.16b\n" - ".inst 0x4e85ac12 // usmmla v18.4s, v0.16b, v5.16b\n" - ".inst 0x4e85ac5a // usmmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e84ac16 // usmmla v22.4s, v0.16b, v4.16b\n" - ".inst 0x4e84ac5e // usmmla v30.4s, v2.16b, v4.16b\n" - ".inst 0x4e83ac13 // usmmla v19.4s, v0.16b, v3.16b\n" - ".inst 0x4e83ac5b // usmmla v27.4s, v2.16b, v3.16b\n" - ".inst 0x4e81ac17 // usmmla v23.4s, v0.16b, v1.16b\n" - ".inst 0x4e81ac5f // usmmla v31.4s, v2.16b, v1.16b\n" - "116:" // Height 4: Multiply loop: No odd multiplies + ".inst 0x4e87ac12 // usmmla v18.4s, v0.16b, v7.16b\n" + ".inst 0x4e87ac5a // usmmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e88ac16 // usmmla v22.4s, v0.16b, v8.16b\n" + ".inst 0x4e88ac5e // usmmla v30.4s, v2.16b, 
v8.16b\n" + ".inst 0x4e89ac13 // usmmla v19.4s, v0.16b, v9.16b\n" + ".inst 0x4e89ac5b // usmmla v27.4s, v2.16b, v9.16b\n" + ".inst 0x4e8aac17 // usmmla v23.4s, v0.16b, v10.16b\n" + ".inst 0x4e8aac5f // usmmla v31.4s, v2.16b, v10.16b\n" + "113:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 100b\n" + "bne 97b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v0.2d, v16.2d, v20.2d\n" + "uzp1 v4.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" + "prfm pstl1keep, [x26, #0x0]\n" "uzp1 v23.2d, v24.2d, v28.2d\n" "uzp2 v24.2d, v24.2d, v28.2d\n" - "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x24, #0x0]\n" "uzp1 v28.2d, v25.2d, v29.2d\n" "uzp2 v25.2d, v25.2d, v29.2d\n" "uzp1 v29.2d, v26.2d, v30.2d\n" "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v0.16b\n" - "tbnz %x[flags], #31, 117f\n" + "mov v31.16b, v4.16b\n" + "tbnz %x[flags], #31, 114f\n" "add x20, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "ld1r { v0.4s }, [x20]\n" - "neg v0.4s, v0.4s\n" + "ld1r { v4.4s }, [x20]\n" + "neg v4.4s, v4.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v14.4s, v13.s[3]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v0.4s\n" - "mul v12.4s, v12.4s, v0.4s\n" - "mul v14.4s, v14.4s, v0.4s\n" - "mul v13.4s, v13.4s, v0.4s\n" - "117:" // Height 4: skip row sum fixup - "ldr q0, [x10, 
#0x0]\n" - "ldr q4, [x10, #0x10]\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "114:" // Height 4: skip row sum fixup + "ldr q0, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q3, [x10, #0x20]\n" - "ldr q2, [x10, #0x30]\n" + "ldr q2, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "ld1r { v1.4s }, [x20]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v4.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "add x22, %x[qp], %[c_offset]\n" "add v23.4s, v23.4s, v13.4s\n" "add v28.4s, v28.4s, v13.4s\n" - "add x10, x10, #0x40\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v6.4s }, [x21]\n" + "ld1r { v5.4s }, [x20]\n" "add v29.4s, v29.4s, v13.4s\n" "add v30.4s, v30.4s, v13.4s\n" "add v24.4s, v24.4s, v14.4s\n" "add v25.4s, v25.4s, v14.4s\n" + "cmp x10, #0x10\n" + "orr %x[flags], %x[flags], #0x80000000\n" "add v26.4s, v26.4s, v14.4s\n" "add v27.4s, v27.4s, v14.4s\n" + "add x28, x28, #0x40\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v2.4s\n" + "add v20.4s, v20.4s, v1.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v3.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v2.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" "add v23.4s, v23.4s, v0.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v2.4s\n" + "add v28.4s, v28.4s, v1.4s\n" + "add v29.4s, v29.4s, 
v2.4s\n" + "add v30.4s, v30.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v2.4s\n" - "sqrdmulh v31.4s, v31.4s, v1.4s\n" - "sqrdmulh v20.4s, v20.4s, v1.4s\n" - "sqrdmulh v21.4s, v21.4s, v1.4s\n" - "sqrdmulh v22.4s, v22.4s, v1.4s\n" - "sqrdmulh v16.4s, v16.4s, v1.4s\n" - "sqrdmulh v17.4s, v17.4s, v1.4s\n" - "sqrdmulh v18.4s, v18.4s, v1.4s\n" - "sqrdmulh v19.4s, v19.4s, v1.4s\n" - "sqrdmulh v23.4s, v23.4s, v1.4s\n" - "sqrdmulh v28.4s, v28.4s, v1.4s\n" - "sqrdmulh v29.4s, v29.4s, v1.4s\n" - "sqrdmulh v30.4s, v30.4s, v1.4s\n" - "sqrdmulh v24.4s, v24.4s, v1.4s\n" - "sqrdmulh v25.4s, v25.4s, v1.4s\n" - "sqrdmulh v26.4s, v26.4s, v1.4s\n" - "sqrdmulh v27.4s, v27.4s, v1.4s\n" - "tbz %x[flags], #5, 118f\n" - "and v2.16b, v31.16b, v0.16b\n" - "and v1.16b, v20.16b, v0.16b\n" - "and v7.16b, v21.16b, v0.16b\n" - "and v6.16b, v22.16b, v0.16b\n" - "and v5.16b, v16.16b, v0.16b\n" - "and v4.16b, v17.16b, v0.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v3.16b, v18.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v2.4s\n" - "sqadd v20.4s, v20.4s, v1.4s\n" - "and v2.16b, v19.16b, v0.16b\n" - "and v1.16b, v23.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v7.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v5.4s\n" - "sqadd v17.4s, v17.4s, v4.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "and v7.16b, v28.16b, v0.16b\n" - "sqadd v19.4s, v19.4s, v2.4s\n" - "sqadd v23.4s, v23.4s, v1.4s\n" - "and v6.16b, v29.16b, v0.16b\n" - "and v5.16b, v30.16b, v0.16b\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v3.16b, v25.16b, v0.16b\n" - "and v2.16b, v26.16b, v0.16b\n" - "and v1.16b, v27.16b, v0.16b\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, 
#0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v7.4s\n" - "sqadd v29.4s, v29.4s, v6.4s\n" - "sqadd v30.4s, v30.4s, v5.4s\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v3.4s\n" - "sqadd v26.4s, v26.4s, v2.4s\n" - "sqadd v27.4s, v27.4s, v1.4s\n" - "118:" // Height 4: no shift correction - "add x21, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x23]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqdmulh v31.4s, v31.4s, v4.4s\n" + "sqdmulh v20.4s, v20.4s, v4.4s\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "sqdmulh v22.4s, v22.4s, v4.4s\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "sqdmulh v17.4s, v17.4s, v4.4s\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sqdmulh v19.4s, v19.4s, v4.4s\n" + "sqdmulh v23.4s, v23.4s, v4.4s\n" + "sqdmulh v28.4s, v28.4s, v4.4s\n" + "sqdmulh v29.4s, v29.4s, v4.4s\n" + "sqdmulh v30.4s, v30.4s, v4.4s\n" + "sqdmulh v24.4s, v24.4s, v4.4s\n" + "sqdmulh v25.4s, v25.4s, v4.4s\n" + "sqdmulh v26.4s, v26.4s, v4.4s\n" + "sqdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v4.4s }, [x22]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1r { v3.4s }, [x21]\n" - "ld1r { v2.4s }, [x20]\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x20, %x[qp], %[minval]\n" - "cmp x9, #0x10\n" - "ld1r { v1.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" @@ -1917,178 +1781,178 @@ void a64_hybrid_u8s8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "add v20.4s, v20.4s, v3.4s\n" - "add v21.4s, v21.4s, v3.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v3.4s\n" - "add v17.4s, 
v17.4s, v3.4s\n" - "add v18.4s, v18.4s, v3.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v28.4s, v28.4s, v3.4s\n" - "add v29.4s, v29.4s, v3.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v24.4s, v24.4s, v3.4s\n" - "add v25.4s, v25.4s, v3.4s\n" - "add v26.4s, v26.4s, v3.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "smin v31.4s, v31.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - "smin v29.4s, v29.4s, v2.4s\n" - "smin v30.4s, v30.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" - "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smax v31.4s, v31.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - "smax v29.4s, v29.4s, v1.4s\n" - "smax v30.4s, v30.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, 
v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v19.8h, v18.8h, v19.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v18.8h, v29.8h, v30.8h\n" + "uzp1 v28.8h, v29.8h, v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v19.16b\n" - "uzp1 v23.16b, v23.16b, v18.16b\n" - "uzp1 v24.16b, v24.16b, v17.16b\n" - "bge 127f\n" - "tbz x9, #3, 122f\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v23.16b, v23.16b, v28.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 123f\n" + "tbz x10, #3, 118f\n" "str d31, [x27], #0x8\n" - "str d16, [x24], #0x8\n" - "str d23, [x23], #0x8\n" - "str d24, [x22], #0x8\n" - "tbz x9, #2, 120f\n" + "str d16, [x26], #0x8\n" + "str d23, [x25], #0x8\n" + "str d24, [x24], #0x8\n" + "tbz x10, #2, 116f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x24], #0x4\n" - "st1 { v23.s }[2], [x23], 
#0x4\n" - "st1 { v24.s }[2], [x22], #0x4\n" - "tbz x9, #1, 119f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x24], #0x4\n" + "tbz x10, #1, 115f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x24], #0x2\n" - "st1 { v23.h }[6], [x23], #0x2\n" - "st1 { v24.h }[6], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v23.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x24]\n" - "st1 { v23.b }[14], [x23]\n" - "st1 { v24.b }[14], [x22]\n" - "b 126f\n" - "119:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v23.b }[14], [x25]\n" + "st1 { v24.b }[14], [x24]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 122f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x24]\n" - "st1 { v23.b }[12], [x23]\n" - "st1 { v24.b }[12], [x22]\n" - "b 126f\n" - "120:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x9, #1, 121f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v23.b }[12], [x25]\n" + "st1 { v24.b }[12], [x24]\n" + "b 122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 117f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x24], #0x2\n" - "st1 { v23.h }[4], [x23], #0x2\n" - "st1 { v24.h }[4], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v23.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x24]\n" - "st1 { v23.b }[10], [x23]\n" - "st1 { v24.b }[10], [x22]\n" - "b 126f\n" - "121:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v23.b }[10], [x25]\n" + "st1 { v24.b }[10], [x24]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + 
"tbz x10, #0, 122f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x24]\n" - "st1 { v23.b }[8], [x23]\n" - "st1 { v24.b }[8], [x22]\n" - "b 126f\n" - "122:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x9, #2, 124f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v23.b }[8], [x25]\n" + "st1 { v24.b }[8], [x24]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 120f\n" "str s31, [x27], #0x4\n" - "str s16, [x24], #0x4\n" - "str s23, [x23], #0x4\n" - "str s24, [x22], #0x4\n" - "tbz x9, #1, 123f\n" + "str s16, [x26], #0x4\n" + "str s23, [x25], #0x4\n" + "str s24, [x24], #0x4\n" + "tbz x10, #1, 119f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x24], #0x2\n" - "st1 { v23.h }[2], [x23], #0x2\n" - "st1 { v24.h }[2], [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v23.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x24]\n" - "st1 { v23.b }[6], [x23]\n" - "st1 { v24.b }[6], [x22]\n" - "b 126f\n" - "123:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x9, #0, 126f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v23.b }[6], [x25]\n" + "st1 { v24.b }[6], [x24]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 122f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x24]\n" - "st1 { v23.b }[4], [x23]\n" - "st1 { v24.b }[4], [x22]\n" - "b 126f\n" - "124:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x9, #1, 125f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v23.b }[4], [x25]\n" + "st1 { v24.b }[4], [x24]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 121f\n" "str h31, [x27], #0x2\n" - "str h16, [x24], #0x2\n" - "str h23, [x23], #0x2\n" - "str h24, [x22], #0x2\n" - "tbz x9, #0, 126f\n" + "str h16, [x26], #0x2\n" + "str h23, [x25], #0x2\n" + "str h24, [x24], #0x2\n" + "tbz x10, #0, 122f\n" "st1 { 
v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x24]\n" - "st1 { v23.b }[2], [x23]\n" - "st1 { v24.b }[2], [x22]\n" - "b 126f\n" - "125:" // Height 4: Partial direct writeback: partial_1_0 + "st1 { v16.b }[2], [x26]\n" + "st1 { v23.b }[2], [x25]\n" + "st1 { v24.b }[2], [x24]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x24, #0x0]\n" - "str b23, [x23, #0x0]\n" - "str b24, [x22, #0x0]\n" - "126:" // Height 4: Partial direct writeback: Done - "b 128f\n" - "127:" // Height 4: Full writeback + "str b16, [x26, #0x0]\n" + "str b23, [x25, #0x0]\n" + "str b24, [x24, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x24, #0x0]\n" - "str q23, [x23, #0x0]\n" - "str q24, [x22, #0x0]\n" - "128:" // Height 4: Writeback done - "subs x9, x9, #0x10\n" - "bgt 98b\n" + "str q16, [x26, #0x0]\n" + "str q23, [x25, #0x0]\n" + "str q24, [x24, #0x0]\n" + "124:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 95b\n" "subs %x[M], %x[M], #0x4\n" - "beq 130f\n" + "beq 126f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 129f\n" + "tbz %x[flags], #3, 125f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "129:" // Update direct input + "125:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "130:" // Exit + "126:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" 
(offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp index 1a7cc1e70e..f00cc43f1d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp @@ -46,9 +46,6 @@ void sme2_gemv_s8qa_dot_16VL ( unsigned long flags=0; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p8.b\n" ".inst 0xd503477f // SMSTART ZA\n" @@ -189,7 +186,6 @@ void sme2_gemv_s8qa_dot_16VL ( "uzp1 z12.b, z12.b, z19.b\n" "st1b { z12.b }, p1, [x25]\n" "addvl x25, x25, #1\n" - "13:" // Width 1: Output done "b 44f\n" "14:" // Width 2 "mov x23, %x[A_ptr]\n" @@ -317,7 +313,6 @@ void sme2_gemv_s8qa_dot_16VL ( "st1b { z20.b }, p2, [x25]\n" "st1b { z12.b }, p1, [x25, #1, MUL VL]\n" "addvl x25, x25, #2\n" - "23:" // Width 2: Output done "b 44f\n" "24:" // Width 3 "mov x20, #0x2\n" @@ -473,7 +468,6 @@ void sme2_gemv_s8qa_dot_16VL ( "st1b { z12.b }, p2, [x25, #1, MUL VL]\n" "st1b { z16.b }, p1, [x25, #2, MUL VL]\n" "addvl x25, 
x25, #3\n" - "33:" // Width 3: Output done "b 44f\n" "34:" // Width 4 "mov x20, #0x3\n" @@ -657,7 +651,6 @@ void sme2_gemv_s8qa_dot_16VL ( "st1b { z24.b }, p2, [x25, #2, MUL VL]\n" "st1b { z20.b }, p1, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" - "43:" // Width 4: Output done "subs x26, x26, #0x4\n" "sub %x[N], %x[N], x28, LSL #2\n" "bgt 4b\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp index 1cbaf00052..843ceb32f6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp @@ -46,9 +46,6 @@ void sme2_gemv_u8qa_dot_16VL ( unsigned long flags=0; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p8.b\n" ".inst 0xd503477f // SMSTART ZA\n" @@ -189,7 +186,6 @@ void sme2_gemv_u8qa_dot_16VL ( "uzp1 z12.b, z12.b, z19.b\n" "st1b { z12.b }, p1, [x25]\n" "addvl x25, x25, #1\n" - "13:" // Width 1: Output done "b 44f\n" "14:" // Width 2 "mov x23, %x[A_ptr]\n" @@ -317,7 +313,6 @@ void sme2_gemv_u8qa_dot_16VL ( "st1b { z20.b }, p2, [x25]\n" "st1b { z12.b }, p1, [x25, #1, MUL VL]\n" "addvl x25, x25, #2\n" - "23:" // Width 2: Output done "b 44f\n" "24:" // Width 3 "mov x20, #0x2\n" @@ -473,7 +468,6 @@ void sme2_gemv_u8qa_dot_16VL ( "st1b { z12.b }, p2, [x25, #1, MUL VL]\n" "st1b { z16.b }, p1, [x25, #2, MUL VL]\n" "addvl x25, x25, #3\n" - "33:" // Width 3: Output done "b 44f\n" "34:" // Width 4 "mov x20, #0x3\n" @@ -657,7 +651,6 @@ void sme2_gemv_u8qa_dot_16VL ( "st1b { z24.b }, p2, [x25, #2, MUL VL]\n" "st1b { z20.b }, p1, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" - "43:" // Width 4: Output done "subs x26, x26, #0x4\n" "sub %x[N], %x[N], x28, LSL #2\n" "bgt 4b\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp index 7de5e09bd5..eca689bd7f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,23 +73,20 @@ void sve_hybrid_s8qa_dot_4x4VL ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 43f\n" + "bge 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "mov x10, %x[col_bias]\n" + "bgt 27f\n" + "beq 14f\n" "mov z11.s, #0x0\n" "mov z15.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "mov x20, #0x0\n" @@ -98,8 +94,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "whilelt p1.b, x20, x9\n" - "3:" // Height 1: setup done + "whilelt p1.b, x20, x10\n" "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -120,41 +115,41 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ble 9f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z21.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z23.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, 
[x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z22.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" "add x24, x24, #0x10\n" - "sdot z16.s, z21.b, z0.b[0]\n" - "ld1b { z21.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "sdot z17.s, z26.b, z0.b[0]\n" - "sdot z18.s, z25.b, z0.b[0]\n" - "sdot z19.s, z24.b, z0.b[0]\n" - "sdot z16.s, z20.b, z0.b[1]\n" - "ld1b { z20.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" - "sdot z17.s, z23.b, z0.b[1]\n" - "ld1b { z23.b }, p2/Z, [x28, #-4, MUL VL]\n" - "sdot z18.s, z22.b, z0.b[1]\n" - "ld1b { z22.b }, p2/Z, [x28, #-3, MUL VL]\n" - "sdot z19.s, z21.b, z0.b[1]\n" - "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z20.b, z0.b[2]\n" - "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z17.s, z26.b, z0.b[2]\n" - "sdot z18.s, z25.b, z0.b[2]\n" - "sdot z19.s, z24.b, z0.b[2]\n" - "sdot z16.s, z23.b, z0.b[3]\n" - "sdot z17.s, z22.b, z0.b[3]\n" - "sdot z18.s, z21.b, z0.b[3]\n" - "sdot z19.s, z20.b, z0.b[3]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z18.s, 
z7.b, z0.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" "tbnz %x[flags], #31, 8f\n" "sdot z11.s, z0.b, z15.b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum @@ -163,49 +158,49 @@ void sve_hybrid_s8qa_dot_4x4VL ( "bgt 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "sdot z16.s, z23.b, z0.b[0]\n" - "sdot z17.s, z22.b, z0.b[0]\n" - "sdot z18.s, z21.b, z0.b[0]\n" - "sdot z19.s, z20.b, z0.b[0]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z23.b, z0.b[1]\n" - "sdot z17.s, z22.b, z0.b[1]\n" - "sdot z18.s, z21.b, z0.b[1]\n" - "sdot z19.s, z20.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z19.s, z4.b, z0.b[1]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, 
x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z23.b, z0.b[2]\n" - "sdot z17.s, z22.b, z0.b[2]\n" - "sdot z18.s, z21.b, z0.b[2]\n" - "sdot z19.s, z20.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z23.b, z0.b[3]\n" - "sdot z17.s, z22.b, z0.b[3]\n" - "sdot z18.s, z21.b, z0.b[3]\n" - "sdot z19.s, z20.b, z0.b[3]\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" "sdot z11.s, z0.b, z15.b\n" @@ -217,91 +212,76 @@ void sve_hybrid_s8qa_dot_4x4VL ( "tbnz %x[flags], #31, 12f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z20.s, p2/M, z20.s\n" + "neg z1.s, p2/M, z1.s\n" "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z20.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z23.s }, p2/Z, [x10]\n" - "ld1w { z20.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z22.s }, 
p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "add z16.s, z16.s, z23.s\n" - "add z17.s, z17.s, z20.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z22.s\n" - "add z19.s, z19.s, z21.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n" - ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n" - ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n" - ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n" - "tbz %x[flags], #5, 13f\n" - "and z23.d, z16.d, z0.d\n" - "and z22.d, z17.d, z0.d\n" - "and z21.d, z18.d, z0.d\n" - "and z20.d, z19.d, z0.d\n" - "asr z23.s, z23.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z16.s, z16.s, z23.s\n" - "sqadd z17.s, z17.s, z22.s\n" - "sqadd z18.s, z18.s, z21.s\n" - "sqadd z19.s, z19.s, z20.s\n" - "13:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z22.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, 
z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" - "add z16.s, z16.s, z22.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z22.s\n" - "add z18.s, z18.s, z22.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z22.s\n" - "smin z16.s, p2/M, z16.s, z21.s\n" - "smin z17.s, p2/M, z17.s, z21.s\n" - "smin z18.s, p2/M, z18.s, z21.s\n" - "smin z19.s, p2/M, z19.s, z21.s\n" - "smax z16.s, p2/M, z16.s, z20.s\n" - "smax z17.s, p2/M, z17.s, z20.s\n" - "smax z18.s, p2/M, z18.s, z20.s\n" - "smax z19.s, p2/M, z19.s, z20.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "14:" // Height 1: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" "bgt 2b\n" - "b 58f\n" - "15:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 54f\n" + "14:" // Height 2 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "16:" // Height 2: Column loop + "15:" // Height 2: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -309,302 +289,274 @@ void sve_hybrid_s8qa_dot_4x4VL 
( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "17:" // Height 2: setup done "mov x26, #0x0\n" - "18:" // Height 2: String loop + "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" + "tbz %x[flags], #3, 18f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 20f\n" + "cbnz x26, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 20f\n" - "19:" // Height 2: setup direct input + "b 19f\n" + "18:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "20:" // Height 2: input setup done + "19:" // Height 2: input setup done "cmp x25, #0x10\n" - "ble 23f\n" - "21:" // Height 2: Multiply loop: Main loop head + "ble 22f\n" + "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z25.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" - "sdot z16.s, z25.b, z0.b[0]\n" - "sdot z20.s, z25.b, z1.b[0]\n" - "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "sdot z17.s, z30.b, z0.b[0]\n" - 
"sdot z21.s, z30.b, z1.b[0]\n" - "sdot z18.s, z29.b, z0.b[0]\n" - "sdot z22.s, z29.b, z1.b[0]\n" - "sdot z19.s, z28.b, z0.b[0]\n" - "sdot z23.s, z28.b, z1.b[0]\n" - "sdot z16.s, z24.b, z0.b[1]\n" - "sdot z20.s, z24.b, z1.b[1]\n" - "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "sdot z17.s, z27.b, z0.b[1]\n" - "sdot z21.s, z27.b, z1.b[1]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "sdot z18.s, z26.b, z0.b[1]\n" - "sdot z22.s, z26.b, z1.b[1]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "sdot z19.s, z25.b, z0.b[1]\n" - "sdot z23.s, z25.b, z1.b[1]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z24.b, z0.b[2]\n" - "sdot z20.s, z24.b, z1.b[2]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z17.s, z30.b, z0.b[2]\n" - "sdot z21.s, z30.b, z1.b[2]\n" - "sdot z18.s, z29.b, z0.b[2]\n" - "sdot z22.s, z29.b, z1.b[2]\n" - "sdot z19.s, z28.b, z0.b[2]\n" - "sdot z23.s, z28.b, z1.b[2]\n" - "sdot z16.s, z27.b, z0.b[3]\n" - "sdot z20.s, z27.b, z1.b[3]\n" - "sdot z17.s, z26.b, z0.b[3]\n" - "sdot z21.s, z26.b, z1.b[3]\n" - "sdot z18.s, z25.b, z0.b[3]\n" - "sdot z22.s, z25.b, z1.b[3]\n" - "sdot z19.s, z24.b, z0.b[3]\n" - "sdot z23.s, z24.b, z1.b[3]\n" - "tbnz %x[flags], #31, 22f\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + 
"ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 21f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" - "22:" // Height 2: Multiply loop: unique 3: skip row sum + "21:" // Height 2: Multiply loop: unique 3: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 21b\n" - "23:" // Height 2: Multiply loop: Single iteration only + "bgt 20b\n" + "22:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" - "sdot z16.s, z27.b, z0.b[0]\n" - "sdot z20.s, z27.b, z1.b[0]\n" - "sdot z17.s, z26.b, z0.b[0]\n" - "sdot z21.s, z26.b, z1.b[0]\n" - "sdot z18.s, z25.b, z0.b[0]\n" - "sdot z22.s, z25.b, z1.b[0]\n" - "sdot z19.s, z24.b, z0.b[0]\n" - "sdot 
z23.s, z24.b, z1.b[0]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z27.b, z0.b[1]\n" - "sdot z20.s, z27.b, z1.b[1]\n" - "sdot z17.s, z26.b, z0.b[1]\n" - "sdot z21.s, z26.b, z1.b[1]\n" - "sdot z18.s, z25.b, z0.b[1]\n" - "sdot z22.s, z25.b, z1.b[1]\n" - "sdot z19.s, z24.b, z0.b[1]\n" - "sdot z23.s, z24.b, z1.b[1]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z27.b, z0.b[2]\n" - "sdot z20.s, z27.b, z1.b[2]\n" - "sdot z17.s, z26.b, z0.b[2]\n" - "sdot z21.s, z26.b, z1.b[2]\n" - "sdot z18.s, z25.b, z0.b[2]\n" - "sdot z22.s, z25.b, z1.b[2]\n" - "sdot z19.s, z24.b, z0.b[2]\n" - "sdot z23.s, z24.b, z1.b[2]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, 
#3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z27.b, z0.b[3]\n" - "sdot z20.s, z27.b, z1.b[3]\n" - "sdot z17.s, z26.b, z0.b[3]\n" - "sdot z21.s, z26.b, z1.b[3]\n" - "sdot z18.s, z25.b, z0.b[3]\n" - "sdot z22.s, z25.b, z1.b[3]\n" - "sdot z19.s, z24.b, z0.b[3]\n" - "sdot z23.s, z24.b, z1.b[3]\n" - "24:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 25f\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 24f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" - "25:" // Height 2: Multiply loop: unique 4: skip row sum + "24:" // Height 2: Multiply loop: unique 4: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 18b\n" + "bne 17b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "tbnz %x[flags], #31, 26f\n" + "add x26, x27, x20\n" + "tbnz %x[flags], #31, 25f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z24.s, p2/M, z24.s\n" + "neg z2.s, p2/M, z2.s\n" "saddv d11, p0, z11.s\n" "saddv d12, p0, 
z12.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z11.s, p2/M, z11.s, z2.s\n" "mov z12.s, z12.s[0]\n" - "mul z12.s, p2/M, z12.s, z24.s\n" - "26:" // Height 2: skip row sum fixup + "mul z12.s, p2/M, z12.s, z2.s\n" + "25:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z28.s }, p2/Z, [x10]\n" - "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z16.s, z16.s, z28.s\n" - "add z17.s, z17.s, z27.s\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z26.s\n" - "add z19.s, z19.s, z25.s\n" - "add z20.s, z20.s, z28.s\n" - "add z21.s, z21.s, z27.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z26.s\n" - "add z23.s, z23.s, z25.s\n" - ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" - ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" - ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" - ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" - ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" - ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" - ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" - ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" - "tbz %x[flags], #5, 27f\n" - "and z24.d, z16.d, z0.d\n" - "and z30.d, z17.d, z0.d\n" - "and z29.d, z18.d, z0.d\n" - "and z28.d, z19.d, z0.d\n" - "and z27.d, z20.d, z0.d\n" - "and z26.d, z21.d, z0.d\n" - 
"asr z24.s, z24.s, #0x1f\n" - "and z25.d, z22.d, z0.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z27.s, z27.s, #0x1f\n" - "sqadd z16.s, z16.s, z24.s\n" - "and z24.d, z23.d, z0.d\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "sqadd z17.s, z17.s, z30.s\n" - "sqadd z18.s, z18.s, z29.s\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z19.s, z19.s, z28.s\n" - "sqadd z20.s, z20.s, z27.s\n" - "sqadd z21.s, z21.s, z26.s\n" - "sqadd z22.s, z22.s, z25.s\n" - "sqadd z23.s, z23.s, z24.s\n" - "27:" // Height 2: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "addvl x28, x28, #4\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 
0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z26.s\n" + "add z16.s, z16.s, z4.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z26.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z26.s\n" - "add z20.s, z20.s, z26.s\n" - "add x20, %x[qp], %[minval]\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z26.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z26.s\n" - "smin z16.s, p2/M, z16.s, z25.s\n" - "smin z17.s, p2/M, z17.s, z25.s\n" - "smin z18.s, p2/M, z18.s, z25.s\n" - "smin z19.s, p2/M, z19.s, z25.s\n" - "smin z20.s, p2/M, z20.s, z25.s\n" - "smin z21.s, p2/M, z21.s, z25.s\n" - "smin z22.s, p2/M, z22.s, z25.s\n" - "smin z23.s, p2/M, z23.s, z25.s\n" - "smax z16.s, p2/M, z16.s, z24.s\n" - "smax z17.s, p2/M, z17.s, z24.s\n" - "smax z18.s, p2/M, z18.s, z24.s\n" - "smax z19.s, p2/M, z19.s, z24.s\n" - "smax z20.s, p2/M, z20.s, z24.s\n" - "smax z21.s, p2/M, z21.s, z24.s\n" - "smax z22.s, p2/M, z22.s, z24.s\n" - "smax z23.s, p2/M, z23.s, z24.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z20.h, z20.h, z21.h\n" 
- "uzp1 z17.h, z22.h, z23.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z20.b, z20.b, z17.b\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z20.b, z20.b, z21.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "st1b { z20.b }, p1, [x24]\n" - "28:" // Height 2: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 16b\n" - "b 58f\n" - "29:" // Height 3 - "mov x10, %x[col_bias]\n" + "st1b { z20.b }, p1, [x26]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "30:" // Height 3: Column loop + "28:" // Height 3: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -612,399 +564,359 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "31:" // Height 3: setup done "mov x26, #0x0\n" - "32:" // Height 3: String loop + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 34f\n" + "cbnz x26, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 
34f\n" - "33:" // Height 3: setup direct input + "b 32f\n" + "31:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "34:" // Height 3: input setup done + "32:" // Height 3: input setup done "cmp x25, #0x10\n" - "ble 37f\n" - "35:" // Height 3: Multiply loop: Main loop head + "ble 35f\n" + "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" "add x22, x22, #0x10\n" - "sdot z16.s, z5.b, z0.b[0]\n" - "sdot z20.s, z5.b, z1.b[0]\n" - "sdot z17.s, z29.b, z0.b[0]\n" - "sdot z21.s, z29.b, z1.b[0]\n" - "sdot z18.s, z4.b, z0.b[0]\n" - "sdot z24.s, z5.b, z2.b[0]\n" - "sdot z25.s, z29.b, z2.b[0]\n" - "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "sdot z22.s, z4.b, z1.b[0]\n" - "sdot z26.s, z4.b, z2.b[0]\n" - "sdot z19.s, z28.b, z0.b[0]\n" - "sdot z23.s, z28.b, z1.b[0]\n" - "sdot z27.s, z28.b, z2.b[0]\n" - "sdot z16.s, z3.b, z0.b[1]\n" - "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n" - "sdot z20.s, z3.b, z1.b[1]\n" - "sdot z24.s, z3.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n" - "sdot z17.s, z31.b, z0.b[1]\n" - "sdot z21.s, z31.b, 
z1.b[1]\n" - "sdot z25.s, z31.b, z2.b[1]\n" - "sdot z18.s, z30.b, z0.b[1]\n" - "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n" - "sdot z22.s, z30.b, z1.b[1]\n" - "sdot z26.s, z30.b, z2.b[1]\n" - "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n" - "sdot z19.s, z29.b, z0.b[1]\n" - "sdot z23.s, z29.b, z1.b[1]\n" - "sdot z27.s, z29.b, z2.b[1]\n" - "sdot z16.s, z28.b, z0.b[2]\n" - "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z20.s, z28.b, z1.b[2]\n" - "sdot z24.s, z28.b, z2.b[2]\n" - "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z17.s, z5.b, z0.b[2]\n" - "sdot z21.s, z5.b, z1.b[2]\n" - "sdot z25.s, z5.b, z2.b[2]\n" - "sdot z18.s, z4.b, z0.b[2]\n" - "sdot z22.s, z4.b, z1.b[2]\n" - "sdot z26.s, z4.b, z2.b[2]\n" - "sdot z19.s, z3.b, z0.b[2]\n" - "sdot z23.s, z3.b, z1.b[2]\n" - "sdot z27.s, z3.b, z2.b[2]\n" - "sdot z16.s, z31.b, z0.b[3]\n" - "sdot z20.s, z31.b, z1.b[3]\n" - "sdot z24.s, z31.b, z2.b[3]\n" - "sdot z17.s, z30.b, z0.b[3]\n" - "sdot z21.s, z30.b, z1.b[3]\n" - "sdot z25.s, z30.b, z2.b[3]\n" - "sdot z18.s, z29.b, z0.b[3]\n" - "sdot z22.s, z29.b, z1.b[3]\n" - "sdot z26.s, z29.b, z2.b[3]\n" - "sdot z19.s, z28.b, z0.b[3]\n" - "sdot z23.s, z28.b, z1.b[3]\n" - "sdot z27.s, z28.b, z2.b[3]\n" - "tbnz %x[flags], #31, 36f\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z17.s, z9.b, 
z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 34f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" - "36:" // Height 3: Multiply loop: unique 5: skip row sum + "34:" // Height 3: Multiply loop: unique 5: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 35b\n" - "37:" // Height 3: Multiply loop: Single iteration only + "bgt 33b\n" + "35:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, 
MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "sdot z16.s, z31.b, z0.b[0]\n" - "sdot z20.s, z31.b, z1.b[0]\n" - "sdot z17.s, z30.b, z0.b[0]\n" - "sdot z21.s, z30.b, z1.b[0]\n" - "sdot z18.s, z29.b, z0.b[0]\n" - "sdot z22.s, z29.b, z1.b[0]\n" - "sdot z24.s, z31.b, z2.b[0]\n" - "sdot z25.s, z30.b, z2.b[0]\n" - "sdot z26.s, z29.b, z2.b[0]\n" - "sdot z19.s, z28.b, z0.b[0]\n" - "sdot z23.s, z28.b, z1.b[0]\n" - "sdot z27.s, z28.b, z2.b[0]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "ble 36f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z31.b, z0.b[1]\n" - "sdot z20.s, z31.b, z1.b[1]\n" - "sdot z24.s, z31.b, z2.b[1]\n" - "sdot z17.s, z30.b, z0.b[1]\n" - "sdot z21.s, z30.b, z1.b[1]\n" - "sdot z25.s, z30.b, z2.b[1]\n" - "sdot z18.s, z29.b, z0.b[1]\n" - "sdot z22.s, z29.b, z1.b[1]\n" - "sdot z26.s, z29.b, z2.b[1]\n" - "sdot z19.s, z28.b, z0.b[1]\n" - "sdot z23.s, z28.b, z1.b[1]\n" - "sdot z27.s, z28.b, z2.b[1]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "sdot z17.s, 
z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "ble 36f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z31.b, z0.b[2]\n" - "sdot z20.s, z31.b, z1.b[2]\n" - "sdot z24.s, z31.b, z2.b[2]\n" - "sdot z17.s, z30.b, z0.b[2]\n" - "sdot z21.s, z30.b, z1.b[2]\n" - "sdot z25.s, z30.b, z2.b[2]\n" - "sdot z18.s, z29.b, z0.b[2]\n" - "sdot z22.s, z29.b, z1.b[2]\n" - "sdot z26.s, z29.b, z2.b[2]\n" - "sdot z19.s, z28.b, z0.b[2]\n" - "sdot z23.s, z28.b, z1.b[2]\n" - "sdot z27.s, z28.b, z2.b[2]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z31.b, z0.b[3]\n" - "sdot z20.s, z31.b, z1.b[3]\n" - "sdot z24.s, z31.b, z2.b[3]\n" - "sdot z17.s, z30.b, z0.b[3]\n" - "sdot z21.s, z30.b, z1.b[3]\n" - "sdot z25.s, z30.b, z2.b[3]\n" - "sdot z18.s, z29.b, z0.b[3]\n" - "sdot z22.s, z29.b, z1.b[3]\n" - "sdot z26.s, z29.b, z2.b[3]\n" - "sdot z19.s, z28.b, z0.b[3]\n" - "sdot z23.s, z28.b, z1.b[3]\n" - "sdot z27.s, z28.b, z2.b[3]\n" - "38:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 39f\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot 
z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "ble 36f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "36:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 37f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 6: skip row sum + "37:" // Height 3: Multiply loop: unique 6: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 32b\n" + "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "tbnz %x[flags], #31, 40f\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "tbnz %x[flags], #31, 38f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z28.s, p2/M, z28.s\n" + "neg z3.s, p2/M, z3.s\n" "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" "saddv d13, p0, z13.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z28.s\n" - "mul z12.s, p2/M, z12.s, z28.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" "mov z13.s, z13.s[0]\n" - "mul z13.s, p2/M, z13.s, z28.s\n" - "40:" // Height 3: skip row sum fixup + "mul z13.s, p2/M, z13.s, z3.s\n" + "38:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - 
"ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z31.s\n" - "add z18.s, z18.s, z30.s\n" - "add z19.s, z19.s, z29.s\n" + "add z17.s, z17.s, z1.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z31.s\n" - "add z22.s, z22.s, z30.s\n" - "add z23.s, z23.s, z29.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z31.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z30.s\n" - "add z27.s, z27.s, z29.s\n" - ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n" - ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n" - ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n" - ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n" - ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n" - ".inst 0x04bc76b5 // sqrdmulh z21.s, 
z21.s, z28.s\n" - ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n" - ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n" - ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n" - ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n" - ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n" - ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n" - "tbz %x[flags], #5, 41f\n" - "and z1.d, z16.d, z0.d\n" - "and z31.d, z17.d, z0.d\n" - "and z30.d, z18.d, z0.d\n" - "and z29.d, z19.d, z0.d\n" - "and z28.d, z20.d, z0.d\n" - "and z3.d, z21.d, z0.d\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "and z2.d, z22.d, z0.d\n" - "sqadd z16.s, z16.s, z1.s\n" - "sqadd z17.s, z17.s, z31.s\n" - "sqadd z18.s, z18.s, z30.s\n" - "sqadd z19.s, z19.s, z29.s\n" - "sqadd z20.s, z20.s, z28.s\n" - "and z1.d, z23.d, z0.d\n" - "and z31.d, z24.d, z0.d\n" - "and z30.d, z25.d, z0.d\n" - "and z29.d, z26.d, z0.d\n" - "and z28.d, z27.d, z0.d\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z21.s, z21.s, z3.s\n" - "sqadd z22.s, z22.s, z2.s\n" - "sqadd z23.s, z23.s, z1.s\n" - "sqadd z24.s, z24.s, z31.s\n" - "sqadd z25.s, z25.s, z30.s\n" - "sqadd z26.s, z26.s, z29.s\n" - "sqadd z27.s, z27.s, z28.s\n" - "41:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh 
z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z30.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z30.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z30.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z30.s\n" - "add z20.s, z20.s, z30.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z21.s, z21.s, z30.s\n" - "add z22.s, z22.s, z30.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z30.s\n" - "add z24.s, z24.s, z30.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z30.s\n" - "add z26.s, z26.s, z30.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z30.s\n" - "smin z16.s, p2/M, z16.s, z29.s\n" - "smin z17.s, p2/M, z17.s, z29.s\n" - "smin z18.s, p2/M, z18.s, z29.s\n" - "smin z19.s, p2/M, z19.s, z29.s\n" - "smin z20.s, p2/M, z20.s, z29.s\n" - "smin z21.s, p2/M, z21.s, z29.s\n" - "smin z22.s, p2/M, z22.s, z29.s\n" - "smin z23.s, p2/M, z23.s, z29.s\n" - "smin z24.s, p2/M, z24.s, z29.s\n" - "smin z25.s, p2/M, z25.s, z29.s\n" - "smin z26.s, p2/M, z26.s, z29.s\n" - "smin z27.s, p2/M, 
z27.s, z29.s\n" - "smax z16.s, p2/M, z16.s, z28.s\n" - "smax z17.s, p2/M, z17.s, z28.s\n" - "smax z18.s, p2/M, z18.s, z28.s\n" - "smax z19.s, p2/M, z19.s, z28.s\n" - "smax z20.s, p2/M, z20.s, z28.s\n" - "smax z21.s, p2/M, z21.s, z28.s\n" - "smax z22.s, p2/M, z22.s, z28.s\n" - "smax z23.s, p2/M, z23.s, z28.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z28.s\n" - "smax z25.s, p2/M, z25.s, z28.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z28.s\n" - "smax z27.s, p2/M, z27.s, z28.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z18.h, z22.h, z23.h\n" + "uzp1 z21.h, z22.h, z23.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "uzp1 z20.b, z20.b, z18.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z20.b, z20.b, z21.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - 
"uzp1 z24.b, z24.b, z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "42:" // Height 3: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 30b\n" - "b 58f\n" - "43:" // Height 4 + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z20.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z14.s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "mov z15.b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "44:" // Height 4: Column loop + "41:" // Height 4: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -1012,7 +924,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -1023,42 +935,41 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "45:" // Height 4: setup done "mov x26, #0x0\n" - "46:" // Height 4: String loop + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 47f\n" + "tbz %x[flags], #3, 44f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, 
[x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 48f\n" + "cbnz x26, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 48f\n" - "47:" // Height 4: setup direct input + "b 45f\n" + "44:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "48:" // Height 4: input setup done + "45:" // Height 4: input setup done "cmp x25, #0x10\n" - "ble 51f\n" - "49:" // Height 4: Multiply loop: Main loop head + "ble 48f\n" + "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" @@ -1067,431 +978,380 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" - "sdot z16.s, z5.b, z0.b[0]\n" - "sdot z20.s, z5.b, z1.b[0]\n" - "sdot z17.s, z10.b, z0.b[0]\n" - "sdot z21.s, z10.b, z1.b[0]\n" - "sdot z24.s, z5.b, z2.b[0]\n" - "sdot z28.s, z5.b, z3.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "sdot z25.s, z10.b, z2.b[0]\n" - "sdot z29.s, z10.b, z3.b[0]\n" - "sdot z18.s, z4.b, z0.b[0]\n" - "sdot z22.s, z4.b, z1.b[0]\n" - "sdot z26.s, z4.b, z2.b[0]\n" - "sdot z30.s, z4.b, z3.b[0]\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, 
#-7, MUL VL]\n" - "sdot z19.s, z9.b, z0.b[0]\n" - "sdot z23.s, z9.b, z1.b[0]\n" - "sdot z27.s, z9.b, z2.b[0]\n" - "sdot z31.s, z9.b, z3.b[0]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "sdot z28.s, z4.b, z3.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "sdot z29.s, z5.b, z3.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z30.s, z6.b, z3.b[0]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" "sdot z16.s, z8.b, z0.b[1]\n" "sdot z20.s, z8.b, z1.b[1]\n" "sdot z24.s, z8.b, z2.b[1]\n" "sdot z28.s, z8.b, z3.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "sdot z17.s, z7.b, z0.b[1]\n" - "sdot z21.s, z7.b, z1.b[1]\n" - "sdot z25.s, z7.b, z2.b[1]\n" - "sdot z29.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" - "sdot z18.s, z6.b, z0.b[1]\n" - "sdot z22.s, z6.b, z1.b[1]\n" - "sdot z26.s, z6.b, z2.b[1]\n" - "sdot z30.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "sdot z19.s, z5.b, z0.b[1]\n" - "sdot z23.s, z5.b, z1.b[1]\n" - "sdot z27.s, z5.b, z2.b[1]\n" - "sdot z31.s, z5.b, z3.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z4.b, z0.b[2]\n" - "sdot z20.s, z4.b, z1.b[2]\n" - "sdot z24.s, z4.b, z2.b[2]\n" - "sdot z28.s, z4.b, z3.b[2]\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z17.s, z10.b, z0.b[2]\n" - "sdot z21.s, z10.b, z1.b[2]\n" - "sdot z25.s, z10.b, z2.b[2]\n" - "sdot z29.s, z10.b, z3.b[2]\n" - "sdot z18.s, z9.b, z0.b[2]\n" - "sdot z22.s, z9.b, 
z1.b[2]\n" - "sdot z26.s, z9.b, z2.b[2]\n" - "sdot z30.s, z9.b, z3.b[2]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z30.s, z10.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "sdot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "sdot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z29.s, z6.b, z3.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z30.s, z7.b, z3.b[2]\n" "sdot z19.s, z8.b, z0.b[2]\n" "sdot z23.s, z8.b, z1.b[2]\n" "sdot z27.s, z8.b, z2.b[2]\n" "sdot z31.s, z8.b, z3.b[2]\n" - "sdot z16.s, z7.b, z0.b[3]\n" - "sdot z20.s, z7.b, z1.b[3]\n" - "sdot z24.s, z7.b, z2.b[3]\n" - "sdot z28.s, z7.b, z3.b[3]\n" - "sdot z17.s, z6.b, z0.b[3]\n" - "sdot z21.s, z6.b, z1.b[3]\n" - "sdot z25.s, z6.b, z2.b[3]\n" - "sdot z29.s, z6.b, z3.b[3]\n" - "sdot z18.s, z5.b, z0.b[3]\n" - "sdot z22.s, z5.b, z1.b[3]\n" - "sdot z26.s, z5.b, z2.b[3]\n" - "sdot z30.s, z5.b, z3.b[3]\n" - "sdot z19.s, z4.b, z0.b[3]\n" - "sdot z23.s, z4.b, z1.b[3]\n" - "sdot z27.s, z4.b, z2.b[3]\n" - "sdot z31.s, z4.b, z3.b[3]\n" - "tbnz %x[flags], #31, 50f\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z28.s, z9.b, z3.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z29.s, z10.b, z3.b[3]\n" + 
"sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z30.s, z4.b, z3.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 47f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z14.s, z3.b, z15.b\n" - "50:" // Height 4: Multiply loop: unique 7: skip row sum + "47:" // Height 4: Multiply loop: unique 7: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 49b\n" - "51:" // Height 4: Multiply loop: Single iteration only + "bgt 46b\n" + "48:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" "ld1rqb { z3.b }, p0/Z, [x21]\n" - "sdot z16.s, z7.b, z0.b[0]\n" - "sdot z20.s, z7.b, z1.b[0]\n" - "sdot z17.s, z6.b, z0.b[0]\n" - "sdot z21.s, z6.b, z1.b[0]\n" - "sdot z18.s, z5.b, z0.b[0]\n" - "sdot z22.s, z5.b, z1.b[0]\n" - "sdot z24.s, z7.b, z2.b[0]\n" - "sdot z28.s, z7.b, z3.b[0]\n" - "sdot z25.s, z6.b, z2.b[0]\n" - "sdot z29.s, z6.b, z3.b[0]\n" - "sdot z26.s, z5.b, z2.b[0]\n" - "sdot z30.s, z5.b, z3.b[0]\n" - "sdot z19.s, z4.b, z0.b[0]\n" - "sdot z23.s, z4.b, z1.b[0]\n" - "sdot z27.s, z4.b, z2.b[0]\n" - "sdot z31.s, z4.b, z3.b[0]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + 
"sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "sdot z28.s, z4.b, z3.b[0]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "sdot z29.s, z5.b, z3.b[0]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z30.s, z6.b, z3.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z31.s, z7.b, z3.b[0]\n" + "ble 49f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z7.b, z0.b[1]\n" - "sdot z20.s, z7.b, z1.b[1]\n" - "sdot z24.s, z7.b, z2.b[1]\n" - "sdot z28.s, z7.b, z3.b[1]\n" - "sdot z17.s, z6.b, z0.b[1]\n" - "sdot z21.s, z6.b, z1.b[1]\n" - "sdot z25.s, z6.b, z2.b[1]\n" - "sdot z29.s, z6.b, z3.b[1]\n" - "sdot z18.s, z5.b, z0.b[1]\n" - "sdot z22.s, z5.b, z1.b[1]\n" - "sdot z26.s, z5.b, z2.b[1]\n" - "sdot z30.s, z5.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "sdot z28.s, z8.b, z3.b[1]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z29.s, z9.b, z3.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z30.s, z10.b, z3.b[1]\n" "sdot z19.s, z4.b, z0.b[1]\n" "sdot z23.s, z4.b, z1.b[1]\n" "sdot z27.s, z4.b, z2.b[1]\n" "sdot z31.s, z4.b, z3.b[1]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ble 49f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z7.b, z0.b[2]\n" - "sdot z20.s, z7.b, z1.b[2]\n" 
- "sdot z24.s, z7.b, z2.b[2]\n" - "sdot z28.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "sdot z28.s, z5.b, z3.b[2]\n" "sdot z17.s, z6.b, z0.b[2]\n" "sdot z21.s, z6.b, z1.b[2]\n" "sdot z25.s, z6.b, z2.b[2]\n" "sdot z29.s, z6.b, z3.b[2]\n" - "sdot z18.s, z5.b, z0.b[2]\n" - "sdot z22.s, z5.b, z1.b[2]\n" - "sdot z26.s, z5.b, z2.b[2]\n" - "sdot z30.s, z5.b, z3.b[2]\n" - "sdot z19.s, z4.b, z0.b[2]\n" - "sdot z23.s, z4.b, z1.b[2]\n" - "sdot z27.s, z4.b, z2.b[2]\n" - "sdot z31.s, z4.b, z3.b[2]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "sdot z16.s, z7.b, z0.b[3]\n" - "sdot z20.s, z7.b, z1.b[3]\n" - "sdot z24.s, z7.b, z2.b[3]\n" - "sdot z28.s, z7.b, z3.b[3]\n" - "sdot z17.s, z6.b, z0.b[3]\n" - "sdot z21.s, z6.b, z1.b[3]\n" - "sdot z25.s, z6.b, z2.b[3]\n" - "sdot z29.s, z6.b, z3.b[3]\n" - "sdot z18.s, z5.b, z0.b[3]\n" - "sdot z22.s, z5.b, z1.b[3]\n" - "sdot z26.s, z5.b, z2.b[3]\n" - "sdot z30.s, z5.b, z3.b[3]\n" - "sdot z19.s, z4.b, z0.b[3]\n" - "sdot z23.s, z4.b, z1.b[3]\n" - "sdot z27.s, z4.b, z2.b[3]\n" - "sdot z31.s, z4.b, z3.b[3]\n" - "52:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 53f\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z30.s, z7.b, z3.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z31.s, z8.b, z3.b[2]\n" + "ble 49f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, 
z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z28.s, z9.b, z3.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z29.s, z10.b, z3.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z30.s, z4.b, z3.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z31.s, z5.b, z3.b[3]\n" + "49:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 50f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z14.s, z3.b, z15.b\n" - "53:" // Height 4: Multiply loop: unique 8: skip row sum + "50:" // Height 4: Multiply loop: unique 8: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 46b\n" + "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "tbnz %x[flags], #31, 54f\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" + "tbnz %x[flags], #31, 51f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z0.s, p2/M, z0.s\n" + "neg z4.s, p2/M, z4.s\n" "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" "saddv d13, p0, z13.s\n" "saddv d14, p0, z14.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z0.s\n" - "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" "mov z13.s, z13.s[0]\n" "mov z14.s, z14.s[0]\n" - "mul z13.s, p2/M, z13.s, z0.s\n" - "mul z14.s, p2/M, z14.s, z0.s\n" - "54:" // Height 4: skip row sum fixup + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "51:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z4.s }, p2/Z, 
[x10]\n" - "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z28.s, z28.s, z14.s\n" "add z29.s, z29.s, z14.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" "add z30.s, z30.s, z14.s\n" "add z31.s, z31.s, z14.s\n" - "add z16.s, z16.s, z4.s\n" - "add z17.s, z17.s, z0.s\n" - "add z18.s, z18.s, z3.s\n" - "add z19.s, z19.s, z2.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z0.s\n" - "add z22.s, z22.s, z3.s\n" - "add z23.s, z23.s, z2.s\n" - "add z24.s, z24.s, z4.s\n" - "add z25.s, z25.s, z0.s\n" - "add z26.s, z26.s, z3.s\n" - "add z27.s, z27.s, z2.s\n" - "add z28.s, z28.s, z4.s\n" - "add z29.s, z29.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z3.s\n" - "add z31.s, z31.s, z2.s\n" - ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" - ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" - ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" - ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" - ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" - ".inst 
0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" - ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" - ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" - ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" - ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" - ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" - ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" - ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" - ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" - ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" - ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" - "tbz %x[flags], #5, 55f\n" - "and z2.d, z16.d, z0.d\n" - "and z1.d, z17.d, z0.d\n" - "and z7.d, z18.d, z0.d\n" - "and z6.d, z19.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "and z4.d, z21.d, z0.d\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "and z3.d, z22.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z16.s, z16.s, z2.s\n" - "sqadd z17.s, z17.s, z1.s\n" - "and z2.d, z23.d, z0.d\n" - "and z1.d, z24.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "sqadd z18.s, z18.s, z7.s\n" - "sqadd z19.s, z19.s, z6.s\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z4.s\n" - "sqadd z22.s, z22.s, z3.s\n" - "and z7.d, z25.d, z0.d\n" - "sqadd z23.s, z23.s, z2.s\n" - "sqadd z24.s, z24.s, z1.s\n" - "and z6.d, z26.d, z0.d\n" - "and z5.d, z27.d, z0.d\n" - "and z4.d, z28.d, z0.d\n" - "and z3.d, z29.d, z0.d\n" - "and z2.d, z30.d, z0.d\n" - "and z1.d, z31.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z25.s, z25.s, z7.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z5.s\n" - "sqadd z28.s, z28.s, z4.s\n" - "sqadd z29.s, z29.s, z3.s\n" - "sqadd z30.s, z30.s, z2.s\n" - "sqadd z31.s, z31.s, 
z1.s\n" - "55:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a473bd // sqdmulh z29.s, z29.s, z4.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z2.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x04a473de // sqdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828818 // srshl 
z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z2.s\n" - "add z20.s, z20.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z2.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" - "add z23.s, z23.s, z2.s\n" - "add z24.s, z24.s, z2.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z2.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z2.s\n" - "add z28.s, z28.s, z2.s\n" - "add x20, %x[qp], %[minval]\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z2.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z31.s, z31.s, z2.s\n" - "smin z16.s, p2/M, z16.s, z1.s\n" - "smin z17.s, p2/M, z17.s, z1.s\n" - "smin z18.s, p2/M, z18.s, z1.s\n" - "smin z19.s, p2/M, z19.s, z1.s\n" - "smin z20.s, p2/M, z20.s, z1.s\n" - "smin z21.s, p2/M, z21.s, z1.s\n" - "smin z22.s, p2/M, z22.s, z1.s\n" - "smin z23.s, p2/M, z23.s, z1.s\n" - "smin z24.s, p2/M, z24.s, z1.s\n" - "smin z25.s, p2/M, z25.s, z1.s\n" - "smin z26.s, p2/M, z26.s, z1.s\n" - "smin z27.s, p2/M, z27.s, z1.s\n" - "smin z28.s, p2/M, z28.s, z1.s\n" - "smin z29.s, p2/M, z29.s, z1.s\n" - "smin z30.s, p2/M, z30.s, z1.s\n" - "smin z31.s, p2/M, z31.s, z1.s\n" - "smax z16.s, p2/M, z16.s, z0.s\n" - "smax z17.s, p2/M, z17.s, z0.s\n" - "smax z18.s, p2/M, z18.s, z0.s\n" - "smax z19.s, p2/M, z19.s, z0.s\n" - "smax z20.s, p2/M, z20.s, z0.s\n" - "smax z21.s, p2/M, z21.s, z0.s\n" - "smax z22.s, p2/M, z22.s, z0.s\n" - "smax z23.s, p2/M, z23.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add 
z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z4.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "add z30.s, z30.s, z4.s\n" + "add z31.s, z31.s, z4.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z0.s\n" - "smax z25.s, p2/M, z25.s, z0.s\n" - "uzp1 z18.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z0.s\n" - "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "smax z28.s, p2/M, z28.s, z0.s\n" - "smax z29.s, p2/M, z29.s, z0.s\n" - "uzp1 z17.h, z22.h, z23.h\n" - "smax z30.s, p2/M, z30.s, z0.s\n" - "smax z31.s, p2/M, z31.s, z0.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - 
"uzp1 z18.h, z26.h, z27.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z25.h, z26.h, z27.h\n" "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z20.b, z20.b, z17.b\n" - "uzp1 z17.h, z30.h, z31.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "uzp1 z29.h, z30.h, z31.h\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z18.b\n" - "uzp1 z28.b, z28.b, z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "st1b { z28.b }, p1, [x22]\n" - "56:" // Height 4: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 44b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z20.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "st1b { z28.b }, p1, [x24]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" "subs %x[M], %x[M], #0x4\n" - "beq 58f\n" + "beq 54f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "57:" // Update direct input + "53:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "58:" // Exit + "54:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, 
string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp index b8e65e6999..a88891e720 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,23 +73,20 @@ void sve_hybrid_s8qa_mmla_4x4VL ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 43f\n" + "bge 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "mov x10, %x[col_bias]\n" + "bgt 27f\n" + "beq 14f\n" "mov z11.s, #0x0\n" "mov z15.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "mov x20, #0x0\n" @@ -100,10 +96,9 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" 
// Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -124,43 +119,43 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "ble 9f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z30.b }, p2/Z, [x28]\n" - "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" "add x24, x24, #0x10\n" - "trn1 z0.d, z1.d, z31.d\n" - ".inst 0x451e9810 // smmla z16.s, z0.b, z30.b\n" - "ld1b { z8.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "trn2 z1.d, z1.d, z31.d\n" - ".inst 0x451d9814 // smmla z20.s, z0.b, z29.b\n" - ".inst 0x451c9811 // smmla z17.s, z0.b, z28.b\n" - ".inst 0x451b9815 // smmla z21.s, z0.b, z27.b\n" - ".inst 0x451a9812 // smmla z18.s, z0.b, z26.b\n" - "ld1b { z31.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45199816 // smmla z22.s, z0.b, z25.b\n" - ".inst 0x45189813 // smmla z19.s, z0.b, z24.b\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x45089817 // smmla z23.s, z0.b, z8.b\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x451f9830 // smmla z16.s, z1.b, z31.b\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n" - ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n" - ".inst 0x451c9835 // smmla z21.s, z1.b, 
z28.b\n" - ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n" - ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n" - ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" - ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" "tbnz %x[flags], #31, 8f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" @@ -170,45 +165,45 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "bgt 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, 
p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z31.d\n" - ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #8\n" - "trn2 z1.d, z1.d, z31.d\n" - ".inst 0x451e9814 // smmla z20.s, z0.b, z30.b\n" - ".inst 0x451d9811 // smmla z17.s, z0.b, z29.b\n" - ".inst 0x451c9815 // smmla z21.s, z0.b, z28.b\n" - ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n" - ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n" - ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n" - ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" "ble 10f\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n" - "addvl x28, x28, #8\n" - ".inst 0x451d9831 // smmla z17.s, z1.b, 
z29.b\n" - ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n" - ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n" - ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n" - ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" - ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" "sdot z11.s, z0.b, z15.b\n" @@ -226,89 +221,74 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "tbnz %x[flags], #31, 12f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "ld1rw { z9.s }, p2/Z, [x20]\n" - "neg z9.s, p2/M, z9.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "neg z1.s, p2/M, z1.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z9.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" "12:" // Height 1: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z22.s }, p2/Z, [x10]\n" - "ld1w { z24.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], 
#0x80000000\n" - "add z23.s, z23.s, z22.s\n" - "add z17.s, z17.s, z24.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z21.s\n" - "add z19.s, z19.s, z20.s\n" + "add z23.s, z23.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n" - ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n" - ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n" - ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n" - "tbz %x[flags], #5, 13f\n" - "and z22.d, z23.d, z0.d\n" - "and z21.d, z17.d, z0.d\n" - "and z20.d, z18.d, z0.d\n" - "and z16.d, z19.d, z0.d\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z23.s, z23.s, z22.s\n" - "sqadd z17.s, z17.s, z21.s\n" - "sqadd z18.s, z18.s, z20.s\n" - "sqadd z19.s, z19.s, z16.s\n" - "13:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, 
%x[qp], %[maxval]\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z21.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z21.s\n" - "add z18.s, z18.s, z21.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z21.s\n" - "smin z23.s, p2/M, z23.s, z20.s\n" - "smin z17.s, p2/M, z17.s, z20.s\n" - "smin z18.s, p2/M, z18.s, z20.s\n" - "smin z19.s, p2/M, z19.s, z20.s\n" - "smax z23.s, p2/M, z23.s, z16.s\n" - "smax z17.s, p2/M, z17.s, z16.s\n" - "smax z18.s, p2/M, z18.s, z16.s\n" - "smax z19.s, p2/M, z19.s, z16.s\n" + "add z23.s, z23.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z23.h, z23.h, z17.h\n" - "uzp1 z16.h, z18.h, z19.h\n" - "uzp1 z23.b, z23.b, z16.b\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z17.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "14:" // Height 1: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" "bgt 2b\n" - "b 58f\n" - "15:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 54f\n" + "14:" // Height 2 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "16:" // Height 2: Column loop + "15:" // Height 2: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -316,131 +296,130 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - 
"whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "17:" // Height 2: setup done "mov x26, #0x0\n" - "18:" // Height 2: String loop + "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" + "tbz %x[flags], #3, 18f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 20f\n" + "cbnz x26, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 20f\n" - "19:" // Height 2: setup direct input + "b 19f\n" + "18:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "20:" // Height 2: input setup done + "19:" // Height 2: input setup done "cmp x25, #0x10\n" - "ble 23f\n" - "21:" // Height 2: Multiply loop: Main loop head + "ble 22f\n" + "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z25.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z25.d\n" - "trn2 z1.d, z1.d, z25.d\n" - ".inst 0x451f9810 // smmla z16.s, z0.b, z31.b\n" - "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 
0x451e9814 // smmla z20.s, z0.b, z30.b\n" - ".inst 0x451d9811 // smmla z17.s, z0.b, z29.b\n" - ".inst 0x451c9815 // smmla z21.s, z0.b, z28.b\n" - ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n" - ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n" - ".inst 0x45189813 // smmla z19.s, z0.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n" - ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n" - ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n" - ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n" - ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n" - ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" - ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" - "tbnz %x[flags], #31, 22f\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, 
z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 21f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" - "22:" // Height 2: Multiply loop: unique 3: skip row sum + "21:" // Height 2: Multiply loop: unique 3: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 21b\n" - "23:" // Height 2: Multiply loop: Single iteration only + "bgt 20b\n" + "22:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z29.b }, p2/Z, [x28]\n" - "ld1b { z28.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z24.b }, p0/Z, [x23]\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z24.d\n" - "trn2 z1.d, z1.d, z24.d\n" - ".inst 0x451d9810 // smmla z16.s, z0.b, z29.b\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #8\n" - ".inst 0x451c9814 // smmla z20.s, z0.b, z28.b\n" - ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n" - ".inst 0x451b9815 // smmla z21.s, z0.b, z27.b\n" - ".inst 0x451a9812 // smmla z18.s, z0.b, z26.b\n" - ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n" 
- ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n" - ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n" - "ble 24f\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x451e9834 // smmla z20.s, z1.b, z30.b\n" - "addvl x28, x28, #8\n" - ".inst 0x451d9831 // smmla z17.s, z1.b, z29.b\n" - ".inst 0x451c9835 // smmla z21.s, z1.b, z28.b\n" - ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n" - ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n" - ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" - ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" - "24:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 25f\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ble 23f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + 
".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "23:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 24f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" - "25:" // Height 2: Multiply loop: unique 4: skip row sum + "24:" // Height 2: Multiply loop: unique 4: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 18b\n" - "uzp1 z24.d, z16.d, z20.d\n" + "bne 17b\n" + "uzp1 z7.d, z16.d, z20.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" @@ -449,116 +428,90 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x23, x27, x20\n" - "mov z23.d, z24.d\n" - "tbnz %x[flags], #31, 26f\n" + "add x26, x27, x20\n" + "mov z23.d, z7.d\n" + "tbnz %x[flags], #31, 25f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "neg z24.s, p2/M, z24.s\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" + "neg z2.s, p2/M, z2.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z24.s\n" - "mul z12.s, p2/M, z12.s, z24.s\n" - "26:" // Height 2: skip row sum fixup + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "25:" // Height 2: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z28.s }, p2/Z, [x10]\n" - "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z26.s }, p2/Z, 
[x10, #2, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z23.s, z23.s, z28.s\n" - "add z20.s, z20.s, z27.s\n" - "addvl x10, x10, #4\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z25.s\n" - "add z16.s, z16.s, z28.s\n" - "add z17.s, z17.s, z27.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z18.s, z18.s, z26.s\n" - "add z19.s, z19.s, z25.s\n" - ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" - ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" - ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" - ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" - ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" - ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" - ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" - ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" - "tbz %x[flags], #5, 27f\n" - "and z24.d, z23.d, z0.d\n" - "and z30.d, z20.d, z0.d\n" - "and z29.d, z21.d, z0.d\n" - "and z28.d, z22.d, z0.d\n" - "and z27.d, z16.d, z0.d\n" - "and z26.d, z17.d, z0.d\n" - "asr z24.s, z24.s, #0x1f\n" - "and z25.d, z18.d, z0.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z27.s, z27.s, #0x1f\n" - "sqadd z23.s, z23.s, z24.s\n" - "and z24.d, z19.d, z0.d\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "sqadd z20.s, z20.s, z30.s\n" - "sqadd z21.s, z21.s, z29.s\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z22.s, z22.s, z28.s\n" - "sqadd z16.s, z16.s, z27.s\n" - "sqadd z17.s, z17.s, z26.s\n" - "sqadd z18.s, z18.s, z25.s\n" - "sqadd z19.s, z19.s, z24.s\n" - "27:" // Height 2: 
no shift correction - "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" + "add z23.s, z23.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "addvl x28, x28, #4\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z23.s, z23.s, z26.s\n" + "add z23.s, z23.s, z4.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z20.s, z20.s, z26.s\n" - "add z21.s, z21.s, z26.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z26.s\n" - "add z16.s, z16.s, z26.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z26.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z26.s\n" - "smin z23.s, p2/M, z23.s, z25.s\n" - "smin z20.s, p2/M, z20.s, 
z25.s\n" - "smin z21.s, p2/M, z21.s, z25.s\n" - "smin z22.s, p2/M, z22.s, z25.s\n" - "smin z16.s, p2/M, z16.s, z25.s\n" - "smin z17.s, p2/M, z17.s, z25.s\n" - "smin z18.s, p2/M, z18.s, z25.s\n" - "smin z19.s, p2/M, z19.s, z25.s\n" - "smax z23.s, p2/M, z23.s, z24.s\n" - "smax z20.s, p2/M, z20.s, z24.s\n" - "smax z21.s, p2/M, z21.s, z24.s\n" - "smax z22.s, p2/M, z22.s, z24.s\n" - "smax z16.s, p2/M, z16.s, z24.s\n" - "smax z17.s, p2/M, z17.s, z24.s\n" - "smax z18.s, p2/M, z18.s, z24.s\n" - "smax z19.s, p2/M, z19.s, z24.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z23.h, z23.h, z20.h\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z16.h, z16.h, z17.h\n" @@ -567,23 +520,22 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "uzp1 z16.b, z16.b, z17.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "st1b { z16.b }, p1, [x23]\n" - "28:" // Height 2: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 16b\n" - "b 58f\n" - "29:" // Height 3 - "mov x10, %x[col_bias]\n" + "st1b { z16.b }, p1, [x26]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" "mov 
z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "30:" // Height 3: Column loop + "28:" // Height 3: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -591,7 +543,7 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -602,39 +554,38 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "31:" // Height 3: setup done "mov x26, #0x0\n" - "32:" // Height 3: String loop + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 34f\n" + "cbnz x26, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 34f\n" - "33:" // Height 3: setup direct input + "b 32f\n" + "31:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "34:" // Height 3: input setup done + "32:" // Height 3: input setup done "cmp x25, #0x10\n" - "ble 37f\n" - "35:" // Height 3: Multiply loop: Main loop head + "ble 35f\n" + "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z7.b }, 
p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" @@ -643,333 +594,294 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "add x22, x22, #0x10\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z6.d\n" - "trn2 z3.d, z3.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - ".inst 0x450a9814 // smmla z20.s, z0.b, z10.b\n" - ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - ".inst 0x45049812 // smmla z18.s, z0.b, z4.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x450a985c // smmla z28.s, z2.b, z10.b\n" - ".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" + ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" - ".inst 0x4504985a // smmla z26.s, z2.b, z4.b\n" - ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n" - ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" + ".inst 0x450a9816 // 
smmla z22.s, z0.b, z10.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" - ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n" - ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n" - ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n" - ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n" - ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" - ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" - ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" - ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" - ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" - ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla 
z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" - ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" - "tbnz %x[flags], #31, 36f\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 34f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" "sdot z13.s, z3.b, z15.b\n" - "36:" // Height 3: Multiply loop: unique 5: skip row sum + "34:" // Height 3: Multiply loop: unique 5: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 35b\n" - "37:" // Height 3: Multiply loop: Single iteration only + "bgt 33b\n" + "35:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z5.d\n" - "trn2 z3.d, z3.d, z5.d\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n" - ".inst 0x450a9814 // smmla z20.s, z0.b, z10.b\n" - ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, 
z0.b, z6.b\n" + ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n" - ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x450a985c // smmla z28.s, z2.b, z10.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" - ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n" - ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n" - ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n" - ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n" - ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n" - ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n" - ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n" - "ble 38f\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" - ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n" - ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n" - ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n" - ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" - ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" - ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" - ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" - ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" - ".inst 
0x4506987e // smmla z30.s, z3.b, z6.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" + ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" + ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + "ble 36f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" - ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" - "38:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 39f\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "36:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 37f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" "sdot z13.s, z3.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 6: skip row 
sum + "37:" // Height 3: Multiply loop: unique 6: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 32b\n" + "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z0.d, z16.d, z20.d\n" + "uzp1 z7.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x27, x20\n" + "add x26, x27, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x22, x23, x20\n" + "add x25, x26, x20\n" "uzp1 z24.d, z24.d, z28.d\n" "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z0.d\n" - "tbnz %x[flags], #31, 40f\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 38f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "neg z23.s, p2/M, z23.s\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" + "neg z3.s, p2/M, z3.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" "mov z13.s, z13.s[0]\n" - "mul z11.s, p2/M, z11.s, z23.s\n" - "mul z12.s, p2/M, z12.s, z23.s\n" - "mul z13.s, p2/M, z13.s, z23.s\n" - "40:" // Height 3: skip row sum fixup + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "38:" // Height 3: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - 
"orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z31.s, z31.s, z0.s\n" - "add z20.s, z20.s, z30.s\n" - "add z21.s, z21.s, z29.s\n" - "add z22.s, z22.s, z28.s\n" + "add z20.s, z20.s, z1.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z29.s\n" - "add z19.s, z19.s, z28.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z30.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z29.s\n" - "add z27.s, z27.s, z28.s\n" - ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" - ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" - ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n" - ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" - ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n" - ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n" - ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n" - ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" - ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n" - ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n" - ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n" - ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n" - "tbz %x[flags], #5, 41f\n" - "and z1.d, z31.d, z0.d\n" - "and z30.d, z20.d, z0.d\n" - "and z29.d, z21.d, z0.d\n" - "and z28.d, z22.d, 
z0.d\n" - "and z23.d, z16.d, z0.d\n" - "and z3.d, z17.d, z0.d\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "and z2.d, z18.d, z0.d\n" - "sqadd z31.s, z31.s, z1.s\n" - "sqadd z20.s, z20.s, z30.s\n" - "sqadd z21.s, z21.s, z29.s\n" - "sqadd z22.s, z22.s, z28.s\n" - "sqadd z16.s, z16.s, z23.s\n" - "and z1.d, z19.d, z0.d\n" - "and z30.d, z24.d, z0.d\n" - "and z29.d, z25.d, z0.d\n" - "and z28.d, z26.d, z0.d\n" - "and z23.d, z27.d, z0.d\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z17.s, z17.s, z3.s\n" - "sqadd z18.s, z18.s, z2.s\n" - "sqadd z19.s, z19.s, z1.s\n" - "sqadd z24.s, z24.s, z30.s\n" - "sqadd z25.s, z25.s, z29.s\n" - "sqadd z26.s, z26.s, z28.s\n" - "sqadd z27.s, z27.s, z23.s\n" - "41:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, 
z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z31.s, z31.s, z29.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z20.s, z20.s, z29.s\n" - "add z21.s, z21.s, z29.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z22.s, z22.s, z29.s\n" - "add z16.s, z16.s, z29.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z29.s\n" - "add z18.s, z18.s, z29.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z29.s\n" - "add z24.s, z24.s, z29.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z29.s\n" - "add z26.s, z26.s, z29.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z29.s\n" - "smin z31.s, p2/M, z31.s, z28.s\n" - "smin z20.s, p2/M, z20.s, z28.s\n" - "smin z21.s, p2/M, z21.s, z28.s\n" - "smin z22.s, p2/M, z22.s, z28.s\n" - "smin z16.s, p2/M, z16.s, z28.s\n" - "smin z17.s, p2/M, z17.s, z28.s\n" - "smin z18.s, p2/M, z18.s, z28.s\n" - "smin z19.s, p2/M, z19.s, z28.s\n" - "smin z24.s, p2/M, z24.s, z28.s\n" - "smin z25.s, p2/M, z25.s, z28.s\n" - "smin z26.s, p2/M, z26.s, z28.s\n" - "smin z27.s, p2/M, z27.s, z28.s\n" - "smax z31.s, p2/M, z31.s, z23.s\n" - "smax z20.s, p2/M, z20.s, z23.s\n" - "smax z21.s, p2/M, z21.s, z23.s\n" - "smax z22.s, p2/M, z22.s, z23.s\n" - "smax z16.s, p2/M, z16.s, z23.s\n" - "smax z17.s, p2/M, z17.s, z23.s\n" - "smax z18.s, p2/M, z18.s, z23.s\n" - "smax z19.s, p2/M, z19.s, z23.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, 
z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z24.s, p2/M, z24.s, z23.s\n" - "smax z25.s, p2/M, z25.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z26.s, p2/M, z26.s, z23.s\n" - "smax z27.s, p2/M, z27.s, z23.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "uzp1 z16.b, z16.b, z18.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z16.b, z16.b, z17.b\n" "st1b { z31.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" - "st1b { z24.b }, p1, [x22]\n" - "42:" // Height 3: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 30b\n" - "b 58f\n" - "43:" // Height 4 + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, 
[%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z14.s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "mov z15.b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "44:" // Height 4: Column loop + "41:" // Height 4: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -977,7 +889,7 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -988,191 +900,190 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "45:" // Height 4: setup done "mov x26, #0x0\n" - "46:" // Height 4: String loop + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 47f\n" + "tbz %x[flags], #3, 44f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 48f\n" + "cbnz x26, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 48f\n" - "47:" // Height 4: setup direct input + "b 45f\n" + "44:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "48:" // Height 4: input setup done + "45:" 
// Height 4: input setup done "cmp x25, #0x10\n" - "ble 51f\n" - "49:" // Height 4: Multiply loop: Main loop head + "ble 48f\n" + "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z6.d\n" - "trn2 z3.d, z3.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n" - ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" - ".inst 0x4509985d // smmla 
z29.s, z2.b, z9.b\n" - ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n" + ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" - ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" - ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n" - ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n" - ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n" - ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n" - ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" - ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" - ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" - ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" - ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" - ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + 
".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" - ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" - "tbnz %x[flags], #31, 50f\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 47f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" "sdot z13.s, z3.b, z15.b\n" - "50:" // Height 4: Multiply loop: unique 7: skip row sum + "47:" // Height 4: Multiply loop: unique 7: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 49b\n" - "51:" // Height 4: Multiply loop: Single iteration only + "bgt 46b\n" + "48:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z5.b }, p0/Z, [x21]\n" + "ld1rqb { z4.b }, p0/Z, 
[x21]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z5.d\n" - "trn2 z3.d, z3.d, z5.d\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45069810 // smmla z16.s, z0.b, z6.b\n" - ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" + ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n" - ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n" - ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n" - "addvl x28, x28, #8\n" + ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" + ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" + "addvl x9, x9, #8\n" ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" - ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n" - ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n" + ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" - ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n" - ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n" - ".inst 0x45069817 // smmla z23.s, z0.b, z6.b\n" - ".inst 0x4506985f // smmla z31.s, z2.b, z6.b\n" - "ble 52f\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" - ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, 
MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n" - ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n" - ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n" - ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" - ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" - ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" - ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" - ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" - ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" + ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" + ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + "ble 49f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" + ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" + ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" + ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" + ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" + ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" - ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" - "52:" // Height 4: Multiply loop: 
multiply skip - "tbnz %x[flags], #31, 53f\n" + ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + "49:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 50f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z11.s, z1.b, z15.b\n" "sdot z13.s, z3.b, z15.b\n" - "53:" // Height 4: Multiply loop: unique 8: skip row sum + "50:" // Height 4: Multiply loop: unique 8: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 46b\n" + "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z0.d, z16.d, z20.d\n" + "uzp1 z7.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x27, x20\n" - "add x22, x23, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x21, x22, x20\n" + "add x24, x25, x20\n" "uzp1 z23.d, z24.d, z28.d\n" "uzp2 z24.d, z24.d, z28.d\n" "uzp1 z28.d, z25.d, z29.d\n" @@ -1181,233 +1092,182 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z0.d\n" - "tbnz %x[flags], #31, 54f\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 51f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "neg z0.s, p2/M, z0.s\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "neg z4.s, p2/M, z4.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" "mov z14.s, z13.s[3]\n" "mov z13.s, z13.s[0]\n" - "mul z11.s, p2/M, z11.s, z0.s\n" - "mul z12.s, p2/M, z12.s, z0.s\n" - "mul z13.s, p2/M, z13.s, z0.s\n" - "mul z14.s, p2/M, z14.s, z0.s\n" - "54:" // Height 4: skip row sum fixup + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mul 
z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "51:" // Height 4: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z4.s }, p2/Z, [x10]\n" - "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z23.s, z23.s, z13.s\n" "add z28.s, z28.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z29.s, z29.s, z13.s\n" "add z30.s, z30.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z24.s, z24.s, z14.s\n" "add z25.s, z25.s, z14.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" "add z26.s, z26.s, z14.s\n" "add z27.s, z27.s, z14.s\n" - "add z31.s, z31.s, z4.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z3.s\n" - "add z22.s, z22.s, z2.s\n" - "add z16.s, z16.s, z4.s\n" - "add z17.s, z17.s, z0.s\n" - "add z18.s, z18.s, z3.s\n" - "add z19.s, z19.s, z2.s\n" - "add z23.s, z23.s, z4.s\n" - "add z28.s, z28.s, z0.s\n" - "add z29.s, z29.s, z3.s\n" - "add z30.s, z30.s, z2.s\n" - "add z24.s, z24.s, z4.s\n" - "add z25.s, z25.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z3.s\n" - "add z27.s, z27.s, z2.s\n" - ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" - ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, 
z1.s\n" - ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" - ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" - ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" - ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" - ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" - ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" - ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" - ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" - ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" - ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" - ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" - ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" - ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" - ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" - "tbz %x[flags], #5, 55f\n" - "and z2.d, z31.d, z0.d\n" - "and z1.d, z20.d, z0.d\n" - "and z7.d, z21.d, z0.d\n" - "and z6.d, z22.d, z0.d\n" - "and z5.d, z16.d, z0.d\n" - "and z4.d, z17.d, z0.d\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "and z3.d, z18.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z31.s, z31.s, z2.s\n" - "sqadd z20.s, z20.s, z1.s\n" - "and z2.d, z19.d, z0.d\n" - "and z1.d, z23.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "sqadd z21.s, z21.s, z7.s\n" - "sqadd z22.s, z22.s, z6.s\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z16.s, z16.s, z5.s\n" - "sqadd z17.s, z17.s, z4.s\n" - "sqadd z18.s, z18.s, z3.s\n" - "and z7.d, z28.d, z0.d\n" - "sqadd z19.s, z19.s, z2.s\n" - "sqadd z23.s, z23.s, z1.s\n" - "and z6.d, z29.d, z0.d\n" - "and z5.d, z30.d, z0.d\n" - "and z4.d, z24.d, z0.d\n" - "and z3.d, z25.d, z0.d\n" - "and z2.d, z26.d, z0.d\n" - "and z1.d, z27.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z28.s, z28.s, z7.s\n" 
- "sqadd z29.s, z29.s, z6.s\n" - "sqadd z30.s, z30.s, z5.s\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z3.s\n" - "sqadd z26.s, z26.s, z2.s\n" - "sqadd z27.s, z27.s, z1.s\n" - "55:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z31.s, z31.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x04a473bd // sqdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a473de // sqdmulh z30.s, z30.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z31.s, z31.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, 
z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z20.s, z20.s, z2.s\n" - "add z21.s, z21.s, z2.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" - "add z22.s, z22.s, z2.s\n" - "add z16.s, z16.s, z2.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z2.s\n" - "add z23.s, z23.s, z2.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z28.s, z28.s, z2.s\n" - "add z29.s, z29.s, z2.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z2.s\n" - "add z24.s, z24.s, z2.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z2.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z2.s\n" - "smin z31.s, p2/M, z31.s, z1.s\n" - "smin z20.s, p2/M, z20.s, z1.s\n" - "smin z21.s, p2/M, z21.s, z1.s\n" - "smin z22.s, p2/M, z22.s, z1.s\n" - "smin z16.s, p2/M, z16.s, z1.s\n" - "smin z17.s, p2/M, z17.s, z1.s\n" - "smin z18.s, p2/M, z18.s, z1.s\n" - "smin z19.s, p2/M, z19.s, z1.s\n" - "smin z23.s, p2/M, z23.s, z1.s\n" - "smin z28.s, p2/M, z28.s, z1.s\n" - "smin z29.s, p2/M, z29.s, z1.s\n" - "smin z30.s, p2/M, z30.s, z1.s\n" - "smin z24.s, p2/M, z24.s, z1.s\n" - "smin z25.s, p2/M, z25.s, z1.s\n" - "smin z26.s, p2/M, z26.s, z1.s\n" - "smin z27.s, p2/M, z27.s, z1.s\n" - "smax z31.s, p2/M, z31.s, z0.s\n" - "smax z20.s, p2/M, z20.s, z0.s\n" - "smax z21.s, p2/M, z21.s, z0.s\n" - "smax z22.s, p2/M, z22.s, z0.s\n" - "smax z16.s, p2/M, z16.s, z0.s\n" - "smax z17.s, p2/M, z17.s, 
z0.s\n" - "smax z18.s, p2/M, z18.s, z0.s\n" - "smax z19.s, p2/M, z19.s, z0.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z23.s, p2/M, z23.s, z0.s\n" - "smax z28.s, p2/M, z28.s, z0.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z29.s, p2/M, z29.s, z0.s\n" - "smax z30.s, p2/M, z30.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z0.s\n" - "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z0.s\n" - "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z26.s, p2/M, z26.s, 
z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z23.h, z23.h, z28.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "uzp1 z18.h, z29.h, z30.h\n" + "uzp1 z28.h, z29.h, z30.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" + "uzp1 z25.h, z26.h, z27.h\n" "st1b { z31.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z23.b, z23.b, z18.b\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" - "st1b { z23.b }, p1, [x22]\n" - "st1b { z24.b }, p1, [x21]\n" - "56:" // Height 4: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 44b\n" + "uzp1 z23.b, z23.b, z28.b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z23.b }, p1, [x25]\n" + "st1b { z24.b }, p1, [x24]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" "subs %x[M], %x[M], #0x4\n" - "beq 58f\n" + "beq 54f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "57:" // Update direct input + "53:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "58:" // Exit + "54:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), 
[offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp index 49930b57f7..c58e587e64 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -81,23 +80,20 @@ void sve_hybrid_s8qs_dot_6x4VL ( ka.multiplier_ptr=qp->per_channel_muls + col_base; ka.shift_ptr=qp->per_channel_right_shifts + col_base; } - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 66f\n" + "bge 61f\n" "cmp %x[M], #0x4\n" - "bgt 53f\n" - "beq 40f\n" + "bgt 49f\n" + "beq 37f\n" "cmp %x[M], #0x2\n" - "bgt 27f\n" - "beq 14f\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 25f\n" + "beq 13f\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" @@ -108,7 +104,6 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" "whilelt 
p1.b, x20, x11\n" - "3:" // Height 1: setup done "mov x28, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -129,114 +124,114 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ble 8f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "sdot z8.s, z17.b, z0.b[0]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[0]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z17.b, z0.b[0]\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z16.b, z0.b[0]\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z17.b, z0.b[1]\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[1]\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z17.b, z0.b[1]\n" - "sdot z11.s, z16.b, z0.b[1]\n" - "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z17.b, z0.b[2]\n" - "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[2]\n" - "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z17.b, z0.b[2]\n" - "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z16.b, z0.b[2]\n" - "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, 
z17.b, z0.b[3]\n" - "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[3]\n" - "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z17.b, z0.b[3]\n" - "sdot z11.s, z16.b, z0.b[3]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" "bgt 7b\n" "8:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" - "sdot z8.s, z17.b, z0.b[0]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[0]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[0]\n" - "sdot z11.s, z16.b, z0.b[0]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" "ble 9f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z17.b, z0.b[1]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[1]\n" - "ld1b { z16.b }, p2/Z, [x10, 
#3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[1]\n" - "sdot z11.s, z16.b, z0.b[1]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" "ble 9f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z17.b, z0.b[2]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[2]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[2]\n" - "sdot z11.s, z16.b, z0.b[2]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" "ble 9f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z17.b, z0.b[3]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[3]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[3]\n" - "sdot z11.s, z16.b, z0.b[3]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" "9:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 4b\n" - "ld1w { z19.s }, p2/Z, [x14]\n" - "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add z8.s, z8.s, z19.s\n" - "add z9.s, z9.s, 
z18.s\n" - "add z10.s, z10.s, z17.s\n" - "add z11.s, z11.s, z16.s\n" - "tbz %x[flags], #4, 10f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 10f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" + "addvl x14, x14, #4\n" "b 11f\n" "10:" // Height 1: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" @@ -250,64 +245,49 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" "11:" // Height 1: parameters loaded - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - "tbz %x[flags], #5, 12f\n" - "and z19.d, z8.d, z0.d\n" - "and z18.d, z9.d, z1.d\n" - "and z17.d, z10.d, z2.d\n" - "and z16.d, z11.d, z3.d\n" - "asr z19.s, z19.s, #0x1f\n" - "asr z18.s, z18.s, #0x1f\n" - "asr z17.s, z17.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z8.s, z8.s, z19.s\n" - "sqadd z9.s, z9.s, z18.s\n" - "sqadd z10.s, z10.s, z17.s\n" - "sqadd z11.s, z11.s, z16.s\n" - "12:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x21, %x[qp], 
%[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z17.s }, p2/Z, [x20]\n" - "add z8.s, z8.s, z18.s\n" - "add x20, %x[qp], %[minval]\n" - "add z9.s, z9.s, z18.s\n" - "add z10.s, z10.s, z18.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z11.s, z11.s, z18.s\n" - "smin z8.s, p2/M, z8.s, z17.s\n" - "smin z9.s, p2/M, z9.s, z17.s\n" - "smin z10.s, p2/M, z10.s, z17.s\n" - "smin z11.s, p2/M, z11.s, z17.s\n" - "smax z8.s, p2/M, z8.s, z16.s\n" - "smax z9.s, p2/M, z9.s, z16.s\n" - "smax z10.s, p2/M, z10.s, z16.s\n" - "smax z11.s, p2/M, z11.s, z16.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "uzp1 z16.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z16.b\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z9.b\n" "st1b { z8.b }, p1, [x9]\n" "addvl x9, x9, #1\n" - "13:" // Height 1: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" "bgt 2b\n" - "b 80f\n" - "14:" // Height 2 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "b 74f\n" + "13:" // Height 2 + "ldr x14, [%x[args_ptr], 
%[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "15:" // Height 2: Column loop + "14:" // Height 2: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -318,181 +298,180 @@ void sve_hybrid_s8qs_dot_6x4VL ( "whilelt p1.b, x20, x11\n" "mov z14.s, #0x0\n" "mov z15.s, #0x0\n" - "16:" // Height 2: setup done "mov x28, #0x0\n" - "17:" // Height 2: String loop + "16:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 18f\n" + "tbz %x[flags], #3, 17f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" - "cbnz x28, 19f\n" + "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" - "b 19f\n" - "18:" // Height 2: setup direct input + "b 18f\n" + "17:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" - "19:" // Height 2: input setup done + "18:" // Height 2: input setup done "cmp x27, #0x10\n" - "ble 21f\n" - "20:" // Height 2: Multiply loop: Main loop head + "ble 20f\n" + "19:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "sdot z8.s, z17.b, z1.b[0]\n" - "sdot z12.s, z17.b, z0.b[0]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot 
z9.s, z16.b, z1.b[0]\n" - "sdot z13.s, z16.b, z0.b[0]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z17.b, z1.b[0]\n" - "sdot z14.s, z17.b, z0.b[0]\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z16.b, z1.b[0]\n" - "sdot z15.s, z16.b, z0.b[0]\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z17.b, z1.b[1]\n" - "sdot z12.s, z17.b, z0.b[1]\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z16.b, z1.b[1]\n" - "sdot z13.s, z16.b, z0.b[1]\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z17.b, z1.b[1]\n" - "sdot z14.s, z17.b, z0.b[1]\n" - "sdot z11.s, z16.b, z1.b[1]\n" - "sdot z15.s, z16.b, z0.b[1]\n" - "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z17.b, z1.b[2]\n" - "sdot z12.s, z17.b, z0.b[2]\n" - "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z16.b, z1.b[2]\n" - "sdot z13.s, z16.b, z0.b[2]\n" - "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z17.b, z1.b[2]\n" - "sdot z14.s, z17.b, z0.b[2]\n" - "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z16.b, z1.b[2]\n" - "sdot z15.s, z16.b, z0.b[2]\n" - "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z17.b, z1.b[3]\n" - "sdot z12.s, z17.b, z0.b[3]\n" - "ld1b { z17.b }, p2/Z, [x10, #-2, 
MUL VL]\n" - "sdot z9.s, z16.b, z1.b[3]\n" - "sdot z13.s, z16.b, z0.b[3]\n" - "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z17.b, z1.b[3]\n" - "sdot z14.s, z17.b, z0.b[3]\n" - "sdot z11.s, z16.b, z1.b[3]\n" - "sdot z15.s, z16.b, z0.b[3]\n" - "bgt 20b\n" - "21:" // Height 2: Multiply loop: Single iteration only + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "bgt 19b\n" + "20:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" - "sdot z8.s, z17.b, z0.b[0]\n" - "sdot z12.s, z17.b, z1.b[0]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[0]\n" - "sdot z13.s, z16.b, z1.b[0]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, 
MUL VL]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[0]\n" - "sdot z14.s, z17.b, z1.b[0]\n" - "sdot z11.s, z16.b, z0.b[0]\n" - "sdot z15.s, z16.b, z1.b[0]\n" - "ble 22f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ble 21f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z17.b, z0.b[1]\n" - "sdot z12.s, z17.b, z1.b[1]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[1]\n" - "sdot z13.s, z16.b, z1.b[1]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[1]\n" - "sdot z14.s, z17.b, z1.b[1]\n" - "sdot z11.s, z16.b, z0.b[1]\n" - "sdot z15.s, z16.b, z1.b[1]\n" - "ble 22f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ble 21f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z17.b, z0.b[2]\n" - "sdot z12.s, z17.b, z1.b[2]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[2]\n" - "sdot z13.s, z16.b, z1.b[2]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, 
#4\n" - "sdot z10.s, z17.b, z0.b[2]\n" - "sdot z14.s, z17.b, z1.b[2]\n" - "sdot z11.s, z16.b, z0.b[2]\n" - "sdot z15.s, z16.b, z1.b[2]\n" - "ble 22f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z17.b, z0.b[3]\n" - "sdot z12.s, z17.b, z1.b[3]\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z16.b, z0.b[3]\n" - "sdot z13.s, z16.b, z1.b[3]\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ble 21f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z17.b, z0.b[3]\n" - "sdot z14.s, z17.b, z1.b[3]\n" - "sdot z11.s, z16.b, z0.b[3]\n" - "sdot z15.s, z16.b, z1.b[3]\n" - "22:" // Height 2: Multiply loop: multiply skip + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "21:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 17b\n" + "bne 16b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z19.s }, p2/Z, [x14]\n" - "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add x26, x9, x20\n" - "add z8.s, z8.s, z19.s\n" - "add z12.s, z12.s, z19.s\n" - "add z9.s, z9.s, z18.s\n" - "add z10.s, z10.s, z17.s\n" - "add z13.s, z13.s, z18.s\n" - "add z11.s, z11.s, z16.s\n" - "add z14.s, z14.s, z17.s\n" - "add z15.s, z15.s, z16.s\n" - "tbz %x[flags], #4, 23f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" 
"ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "add x27, x9, x20\n" + "add z8.s, z8.s, z0.s\n" + "add z12.s, z12.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z13.s, z13.s, z1.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "tbz %x[flags], #4, 22f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 24f\n" - "23:" // Height 2: per layer parameters + "addvl x14, x14, #4\n" + "b 23f\n" + "22:" // Height 2: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -503,101 +482,74 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "24:" // Height 2: parameters loaded - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" - ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" - ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" - ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" - "tbz %x[flags], #5, 25f\n" - "and z19.d, z8.d, z0.d\n" - "and z18.d, z9.d, z1.d\n" - "and z17.d, z10.d, z2.d\n" - "and z16.d, z11.d, z3.d\n" - "asr z19.s, z19.s, #0x1f\n" - "asr z18.s, z18.s, #0x1f\n" - "asr z17.s, z17.s, #0x1f\n" - "asr 
z16.s, z16.s, #0x1f\n" - "sqadd z8.s, z8.s, z19.s\n" - "and z19.d, z12.d, z0.d\n" - "sqadd z9.s, z9.s, z18.s\n" - "and z18.d, z13.d, z1.d\n" - "sqadd z10.s, z10.s, z17.s\n" - "sqadd z11.s, z11.s, z16.s\n" - "and z17.d, z14.d, z2.d\n" - "and z16.d, z15.d, z3.d\n" - "asr z19.s, z19.s, #0x1f\n" - "asr z18.s, z18.s, #0x1f\n" - "asr z17.s, z17.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z12.s, z12.s, z19.s\n" - "sqadd z13.s, z13.s, z18.s\n" - "sqadd z14.s, z14.s, z17.s\n" - "sqadd z15.s, z15.s, z16.s\n" - "25:" // Height 2: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "23:" // Height 2: parameters loaded + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a4718c // sqdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a571ad // sqdmulh z13.s, z13.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + ".inst 0x04a671ce // sqdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a771ef // sqdmulh z15.s, z15.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z8.s, z8.s, z18.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z9.s, z9.s, z18.s\n" - "add z10.s, z10.s, z18.s\n" - "ld1rw { z17.s }, p2/Z, [x20]\n" - "add z11.s, z11.s, z18.s\n" - "add z12.s, z12.s, z18.s\n" - "add x20, %x[qp], %[minval]\n" - "add z13.s, z13.s, z18.s\n" - 
"add z14.s, z14.s, z18.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z15.s, z15.s, z18.s\n" - "smin z8.s, p2/M, z8.s, z17.s\n" - "smin z9.s, p2/M, z9.s, z17.s\n" - "smin z10.s, p2/M, z10.s, z17.s\n" - "smin z11.s, p2/M, z11.s, z17.s\n" - "smin z12.s, p2/M, z12.s, z17.s\n" - "smin z13.s, p2/M, z13.s, z17.s\n" - "smin z14.s, p2/M, z14.s, z17.s\n" - "smin z15.s, p2/M, z15.s, z17.s\n" - "smax z8.s, p2/M, z8.s, z16.s\n" - "smax z9.s, p2/M, z9.s, z16.s\n" - "smax z10.s, p2/M, z10.s, z16.s\n" - "smax z11.s, p2/M, z11.s, z16.s\n" - "smax z12.s, p2/M, z12.s, z16.s\n" - "smax z13.s, p2/M, z13.s, z16.s\n" - "smax z14.s, p2/M, z14.s, z16.s\n" - "smax z15.s, p2/M, z15.s, z16.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "uzp1 z17.h, z10.h, z11.h\n" + "uzp1 z9.h, z10.h, z11.h\n" "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z16.h, z14.h, z15.h\n" - "uzp1 z8.b, z8.b, z17.b\n" - "uzp1 z12.b, z12.b, z16.b\n" + "uzp1 z13.h, z14.h, z15.h\n" + "uzp1 z8.b, z8.b, z9.b\n" + "uzp1 z12.b, z12.b, z13.b\n" "st1b { z8.b }, p1, [x9]\n" "addvl x9, x9, #1\n" - "st1b { z12.b }, p1, [x26]\n" - "26:" // Height 2: Writeback done + "st1b { z12.b }, p1, [x27]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 15b\n" - "b 80f\n" - "27:" // 
Height 3 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 14b\n" + "b 74f\n" + "25:" // Height 3 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "28:" // Height 3: Column loop + "26:" // Height 3: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -612,224 +564,223 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "29:" // Height 3: setup done "mov x28, #0x0\n" - "30:" // Height 3: String loop + "28:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 31f\n" + "tbz %x[flags], #3, 29f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" - "cbnz x28, 32f\n" + "cbnz x28, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" - "b 32f\n" - "31:" // Height 3: setup direct input + "b 30f\n" + "29:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" - "32:" // Height 3: input setup done + "30:" // Height 3: input setup done "cmp x27, #0x10\n" - "ble 34f\n" - "33:" // Height 3: Multiply loop: Main loop head + "ble 32f\n" + "31:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z21.b }, p2/Z, [x10]\n" - "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, 
[x26]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x24]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "sdot z8.s, z21.b, z2.b[0]\n" - "sdot z12.s, z21.b, z1.b[0]\n" - "sdot z9.s, z20.b, z2.b[0]\n" - "sdot z13.s, z20.b, z1.b[0]\n" - "sdot z16.s, z21.b, z0.b[0]\n" - "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z17.s, z20.b, z0.b[0]\n" - "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z21.b, z2.b[0]\n" - "sdot z14.s, z21.b, z1.b[0]\n" - "sdot z18.s, z21.b, z0.b[0]\n" - "ld1b { z21.b }, p2/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z20.b, z2.b[0]\n" - "sdot z15.s, z20.b, z1.b[0]\n" - "sdot z19.s, z20.b, z0.b[0]\n" - "ld1b { z20.b }, p2/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z21.b, z2.b[1]\n" - "sdot z12.s, z21.b, z1.b[1]\n" - "sdot z16.s, z21.b, z0.b[1]\n" - "ld1b { z21.b }, p2/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z20.b, z2.b[1]\n" - "sdot z13.s, z20.b, z1.b[1]\n" - "sdot z17.s, z20.b, z0.b[1]\n" - "ld1b { z20.b }, p2/Z, [x10, #7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z21.b, z2.b[1]\n" - "sdot z14.s, 
z21.b, z1.b[1]\n" - "sdot z18.s, z21.b, z0.b[1]\n" - "sdot z11.s, z20.b, z2.b[1]\n" - "sdot z15.s, z20.b, z1.b[1]\n" - "sdot z19.s, z20.b, z0.b[1]\n" - "ld1b { z21.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z21.b, z2.b[2]\n" - "sdot z12.s, z21.b, z1.b[2]\n" - "sdot z16.s, z21.b, z0.b[2]\n" - "ld1b { z21.b }, p2/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z20.b, z2.b[2]\n" - "sdot z13.s, z20.b, z1.b[2]\n" - "sdot z17.s, z20.b, z0.b[2]\n" - "ld1b { z20.b }, p2/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z21.b, z2.b[2]\n" - "sdot z14.s, z21.b, z1.b[2]\n" - "sdot z18.s, z21.b, z0.b[2]\n" - "ld1b { z21.b }, p2/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z20.b, z2.b[2]\n" - "sdot z15.s, z20.b, z1.b[2]\n" - "sdot z19.s, z20.b, z0.b[2]\n" - "ld1b { z20.b }, p2/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z21.b, z2.b[3]\n" - "sdot z12.s, z21.b, z1.b[3]\n" - "sdot z16.s, z21.b, z0.b[3]\n" - "ld1b { z21.b }, p2/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z20.b, z2.b[3]\n" - "sdot z13.s, z20.b, z1.b[3]\n" - "sdot z17.s, z20.b, z0.b[3]\n" - "ld1b { z20.b }, p2/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z21.b, z2.b[3]\n" - "sdot z14.s, z21.b, z1.b[3]\n" - "sdot z18.s, z21.b, z0.b[3]\n" - "sdot z11.s, z20.b, z2.b[3]\n" - "sdot z15.s, z20.b, z1.b[3]\n" - "sdot z19.s, z20.b, z0.b[3]\n" - "bgt 33b\n" - "34:" // Height 3: Multiply loop: Single iteration only + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, 
z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "bgt 31b\n" + "32:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z21.b }, p2/Z, [x10]\n" - "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "sdot z8.s, z21.b, z0.b[0]\n" - "sdot z12.s, z21.b, z1.b[0]\n" - "sdot z9.s, z20.b, z0.b[0]\n" - "sdot z13.s, z20.b, z1.b[0]\n" - "sdot z16.s, z21.b, z2.b[0]\n" - "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z17.s, z20.b, z2.b[0]\n" - "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z21.b, z0.b[0]\n" - "sdot z14.s, z21.b, z1.b[0]\n" - "sdot z18.s, z21.b, z2.b[0]\n" - "sdot z11.s, z20.b, z0.b[0]\n" - "sdot z15.s, z20.b, z1.b[0]\n" - "sdot z19.s, z20.b, z2.b[0]\n" - "ble 35f\n" - "ld1b { z21.b }, p2/Z, [x10]\n" - "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, 
z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ble 33f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z21.b, z0.b[1]\n" - "sdot z12.s, z21.b, z1.b[1]\n" - "sdot z16.s, z21.b, z2.b[1]\n" - "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z20.b, z0.b[1]\n" - "sdot z13.s, z20.b, z1.b[1]\n" - "sdot z17.s, z20.b, z2.b[1]\n" - "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z21.b, z0.b[1]\n" - "sdot z14.s, z21.b, z1.b[1]\n" - "sdot z18.s, z21.b, z2.b[1]\n" - "sdot z11.s, z20.b, z0.b[1]\n" - "sdot z15.s, z20.b, z1.b[1]\n" - "sdot z19.s, z20.b, z2.b[1]\n" - "ble 35f\n" - "ld1b { z21.b }, p2/Z, [x10]\n" - "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ble 33f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z21.b, z0.b[2]\n" - "sdot z12.s, z21.b, z1.b[2]\n" - "sdot z16.s, z21.b, z2.b[2]\n" - "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z20.b, z0.b[2]\n" - "sdot z13.s, z20.b, z1.b[2]\n" - "sdot z17.s, z20.b, z2.b[2]\n" - "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, 
z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z21.b, z0.b[2]\n" - "sdot z14.s, z21.b, z1.b[2]\n" - "sdot z18.s, z21.b, z2.b[2]\n" - "sdot z11.s, z20.b, z0.b[2]\n" - "sdot z15.s, z20.b, z1.b[2]\n" - "sdot z19.s, z20.b, z2.b[2]\n" - "ble 35f\n" - "ld1b { z21.b }, p2/Z, [x10]\n" - "ld1b { z20.b }, p2/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z21.b, z0.b[3]\n" - "sdot z12.s, z21.b, z1.b[3]\n" - "sdot z16.s, z21.b, z2.b[3]\n" - "ld1b { z21.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z20.b, z0.b[3]\n" - "sdot z13.s, z20.b, z1.b[3]\n" - "sdot z17.s, z20.b, z2.b[3]\n" - "ld1b { z20.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ble 33f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z21.b, z0.b[3]\n" - "sdot z14.s, z21.b, z1.b[3]\n" - "sdot z18.s, z21.b, z2.b[3]\n" - "sdot z11.s, z20.b, z0.b[3]\n" - "sdot z15.s, z20.b, z1.b[3]\n" - "sdot z19.s, z20.b, z2.b[3]\n" - "35:" // Height 3: Multiply loop: multiply skip + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "33:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 30b\n" + "bne 28b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z23.s }, p2/Z, [x14]\n" - "ld1w { z22.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w 
{ z21.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" - "add z8.s, z8.s, z23.s\n" - "add z12.s, z12.s, z23.s\n" - "add z9.s, z9.s, z22.s\n" - "add z10.s, z10.s, z21.s\n" - "add z11.s, z11.s, z20.s\n" - "add z13.s, z13.s, z22.s\n" - "add z14.s, z14.s, z21.s\n" - "add z15.s, z15.s, z20.s\n" - "add z16.s, z16.s, z23.s\n" - "add z17.s, z17.s, z22.s\n" - "add z18.s, z18.s, z21.s\n" - "add z19.s, z19.s, z20.s\n" - "tbz %x[flags], #4, 36f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" + "add z8.s, z8.s, z0.s\n" + "add z12.s, z12.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z13.s, z13.s, z1.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 34f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 37f\n" - "36:" // Height 3: per layer parameters + "addvl x14, x14, #4\n" + "b 35f\n" + "34:" // Height 3: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -840,137 +791,98 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z6.d, z4.d\n" "mov 
z3.d, z0.d\n" "mov z7.d, z4.d\n" - "37:" // Height 3: parameters loaded - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" - ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" - ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" - ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - "tbz %x[flags], #5, 38f\n" - "and z23.d, z8.d, z0.d\n" - "and z22.d, z9.d, z1.d\n" - "and z21.d, z10.d, z2.d\n" - "and z20.d, z11.d, z3.d\n" - "asr z23.s, z23.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z8.s, z8.s, z23.s\n" - "and z23.d, z12.d, z0.d\n" - "sqadd z9.s, z9.s, z22.s\n" - "and z22.d, z13.d, z1.d\n" - "sqadd z10.s, z10.s, z21.s\n" - "sqadd z11.s, z11.s, z20.s\n" - "and z21.d, z14.d, z2.d\n" - "and z20.d, z15.d, z3.d\n" - "asr z23.s, z23.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z12.s, z12.s, z23.s\n" - "and z23.d, z16.d, z0.d\n" - "sqadd z13.s, z13.s, z22.s\n" - "and z22.d, z17.d, z1.d\n" - "sqadd z14.s, z14.s, z21.s\n" - "sqadd z15.s, z15.s, z20.s\n" - "and z21.d, z18.d, z2.d\n" - "and z20.d, z19.d, z3.d\n" - "asr z23.s, z23.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z16.s, z16.s, z23.s\n" - "sqadd z17.s, z17.s, z22.s\n" - "sqadd z18.s, z18.s, z21.s\n" - "sqadd z19.s, z19.s, z20.s\n" - "38:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "35:" // Height 3: parameters loaded + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + 
".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a4718c // sqdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a571ad // sqdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a671ce // sqdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a771ef // sqdmulh z15.s, z15.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "ld1rw { z22.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z8.s, z8.s, z22.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z22.s\n" - "add z10.s, z10.s, z22.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z22.s\n" - "add z12.s, z12.s, z22.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z13.s, z13.s, z22.s\n" - "add z14.s, z14.s, z22.s\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" - "add z15.s, z15.s, z22.s\n" - "add z16.s, z16.s, z22.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z22.s\n" - "add z18.s, 
z18.s, z22.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z22.s\n" - "smin z8.s, p2/M, z8.s, z21.s\n" - "smin z9.s, p2/M, z9.s, z21.s\n" - "smin z10.s, p2/M, z10.s, z21.s\n" - "smin z11.s, p2/M, z11.s, z21.s\n" - "smin z12.s, p2/M, z12.s, z21.s\n" - "smin z13.s, p2/M, z13.s, z21.s\n" - "smin z14.s, p2/M, z14.s, z21.s\n" - "smin z15.s, p2/M, z15.s, z21.s\n" - "smin z16.s, p2/M, z16.s, z21.s\n" - "smin z17.s, p2/M, z17.s, z21.s\n" - "smin z18.s, p2/M, z18.s, z21.s\n" - "smin z19.s, p2/M, z19.s, z21.s\n" - "smax z8.s, p2/M, z8.s, z20.s\n" - "smax z9.s, p2/M, z9.s, z20.s\n" - "smax z10.s, p2/M, z10.s, z20.s\n" - "smax z11.s, p2/M, z11.s, z20.s\n" - "smax z12.s, p2/M, z12.s, z20.s\n" - "smax z13.s, p2/M, z13.s, z20.s\n" - "smax z14.s, p2/M, z14.s, z20.s\n" - "smax z15.s, p2/M, z15.s, z20.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z20.s\n" - "smax z17.s, p2/M, z17.s, z20.s\n" - "uzp1 z21.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z20.s\n" - "smax z19.s, p2/M, z19.s, z20.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + 
"smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z20.h, z14.h, z15.h\n" + "uzp1 z13.h, z14.h, z15.h\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z21.b\n" + "uzp1 z8.b, z8.b, z9.b\n" "uzp1 z17.h, z18.h, z19.h\n" - "uzp1 z12.b, z12.b, z20.b\n" + "uzp1 z12.b, z12.b, z13.b\n" "st1b { z8.b }, p1, [x9]\n" "addvl x9, x9, #1\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z12.b }, p1, [x26]\n" - "st1b { z16.b }, p1, [x25]\n" - "39:" // Height 3: Writeback done + "st1b { z12.b }, p1, [x27]\n" + "st1b { z16.b }, p1, [x26]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 28b\n" - "b 80f\n" - "40:" // Height 4 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 26b\n" + "b 74f\n" + "37:" // Height 4 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "41:" // Height 4: Column loop + "38:" // Height 4: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -989,267 +901,266 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "42:" // Height 4: setup done "mov x28, #0x0\n" - "43:" // Height 4: String loop + "40:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 44f\n" + "tbz %x[flags], #3, 41f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" - "cbnz x28, 45f\n" + "cbnz x28, 42f\n" "ldr x20, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 45f\n" - "44:" // Height 4: setup direct input + "b 42f\n" + "41:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" - "45:" // Height 4: input setup done + "42:" // Height 4: input setup done "cmp x27, #0x10\n" - "ble 47f\n" - "46:" // Height 4: Multiply loop: Main loop head + "ble 44f\n" + "43:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z3.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x23]\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" - "sdot z8.s, z25.b, z3.b[0]\n" - "sdot z12.s, z25.b, z2.b[0]\n" - "sdot z9.s, z24.b, z3.b[0]\n" - "sdot z13.s, z24.b, z2.b[0]\n" - "sdot z16.s, z25.b, z1.b[0]\n" - "sdot z20.s, z25.b, z0.b[0]\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z17.s, z24.b, z1.b[0]\n" - "sdot z21.s, z24.b, z0.b[0]\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z25.b, z3.b[0]\n" - "sdot z14.s, z25.b, z2.b[0]\n" - "sdot z18.s, z25.b, z1.b[0]\n" - "sdot z22.s, z25.b, z0.b[0]\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z24.b, z3.b[0]\n" - "sdot z15.s, z24.b, z2.b[0]\n" - "sdot z19.s, z24.b, z1.b[0]\n" - "sdot z23.s, z24.b, z0.b[0]\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z25.b, z3.b[1]\n" - "sdot z12.s, z25.b, z2.b[1]\n" - "sdot z16.s, z25.b, z1.b[1]\n" - "sdot z20.s, z25.b, z0.b[1]\n" - "ld1b { 
z25.b }, p2/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z24.b, z3.b[1]\n" - "sdot z13.s, z24.b, z2.b[1]\n" - "sdot z17.s, z24.b, z1.b[1]\n" - "sdot z21.s, z24.b, z0.b[1]\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z25.b, z3.b[1]\n" - "sdot z14.s, z25.b, z2.b[1]\n" - "sdot z18.s, z25.b, z1.b[1]\n" - "sdot z22.s, z25.b, z0.b[1]\n" - "sdot z11.s, z24.b, z3.b[1]\n" - "sdot z15.s, z24.b, z2.b[1]\n" - "ld1b { z25.b }, p2/Z, [x10, #-8, MUL VL]\n" - "sdot z19.s, z24.b, z1.b[1]\n" - "sdot z23.s, z24.b, z0.b[1]\n" - "ld1b { z24.b }, p2/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z25.b, z3.b[2]\n" - "sdot z12.s, z25.b, z2.b[2]\n" - "sdot z16.s, z25.b, z1.b[2]\n" - "sdot z20.s, z25.b, z0.b[2]\n" - "ld1b { z25.b }, p2/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z24.b, z3.b[2]\n" - "sdot z13.s, z24.b, z2.b[2]\n" - "sdot z17.s, z24.b, z1.b[2]\n" - "sdot z21.s, z24.b, z0.b[2]\n" - "ld1b { z24.b }, p2/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, 
z25.b, z3.b[2]\n" - "sdot z14.s, z25.b, z2.b[2]\n" - "sdot z18.s, z25.b, z1.b[2]\n" - "sdot z22.s, z25.b, z0.b[2]\n" - "ld1b { z25.b }, p2/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z24.b, z3.b[2]\n" - "sdot z15.s, z24.b, z2.b[2]\n" - "sdot z19.s, z24.b, z1.b[2]\n" - "sdot z23.s, z24.b, z0.b[2]\n" - "ld1b { z24.b }, p2/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z25.b, z3.b[3]\n" - "sdot z12.s, z25.b, z2.b[3]\n" - "sdot z16.s, z25.b, z1.b[3]\n" - "sdot z20.s, z25.b, z0.b[3]\n" - "ld1b { z25.b }, p2/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z24.b, z3.b[3]\n" - "sdot z13.s, z24.b, z2.b[3]\n" - "sdot z17.s, z24.b, z1.b[3]\n" - "sdot z21.s, z24.b, z0.b[3]\n" - "ld1b { z24.b }, p2/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z25.b, z3.b[3]\n" - "sdot z14.s, z25.b, z2.b[3]\n" - "sdot z18.s, z25.b, z1.b[3]\n" - "sdot z22.s, z25.b, z0.b[3]\n" - "sdot z11.s, z24.b, z3.b[3]\n" - "sdot z15.s, z24.b, z2.b[3]\n" - "sdot z19.s, z24.b, z1.b[3]\n" - "sdot z23.s, z24.b, z0.b[3]\n" - "bgt 46b\n" - "47:" // Height 4: Multiply loop: Single iteration only + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #-8, MUL VL]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" 
+ "sdot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "bgt 43b\n" + "44:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" - "sdot z8.s, z25.b, z0.b[0]\n" - "sdot z12.s, z25.b, z1.b[0]\n" - "sdot z9.s, z24.b, z0.b[0]\n" - "sdot z13.s, z24.b, z1.b[0]\n" - "sdot z16.s, z25.b, z2.b[0]\n" - "sdot z20.s, z25.b, z3.b[0]\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z17.s, z24.b, z2.b[0]\n" - "sdot z21.s, z24.b, z3.b[0]\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z25.b, z0.b[0]\n" - "sdot z14.s, z25.b, z1.b[0]\n" - "sdot z18.s, z25.b, z2.b[0]\n" - "sdot z22.s, z25.b, z3.b[0]\n" - "sdot z11.s, z24.b, z0.b[0]\n" - "sdot z15.s, 
z24.b, z1.b[0]\n" - "sdot z19.s, z24.b, z2.b[0]\n" - "sdot z23.s, z24.b, z3.b[0]\n" - "ble 48f\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ble 45f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z25.b, z0.b[1]\n" - "sdot z12.s, z25.b, z1.b[1]\n" - "sdot z16.s, z25.b, z2.b[1]\n" - "sdot z20.s, z25.b, z3.b[1]\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z24.b, z0.b[1]\n" - "sdot z13.s, z24.b, z1.b[1]\n" - "sdot z17.s, z24.b, z2.b[1]\n" - "sdot z21.s, z24.b, z3.b[1]\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z25.b, z0.b[1]\n" - "sdot z14.s, z25.b, z1.b[1]\n" - "sdot z18.s, z25.b, z2.b[1]\n" - "sdot z22.s, z25.b, z3.b[1]\n" - "sdot z11.s, z24.b, z0.b[1]\n" - "sdot z15.s, z24.b, z1.b[1]\n" - "sdot z19.s, z24.b, z2.b[1]\n" - "sdot z23.s, z24.b, z3.b[1]\n" - "ble 48f\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ble 45f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, 
#0x4\n" - "sdot z8.s, z25.b, z0.b[2]\n" - "sdot z12.s, z25.b, z1.b[2]\n" - "sdot z16.s, z25.b, z2.b[2]\n" - "sdot z20.s, z25.b, z3.b[2]\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z24.b, z0.b[2]\n" - "sdot z13.s, z24.b, z1.b[2]\n" - "sdot z17.s, z24.b, z2.b[2]\n" - "sdot z21.s, z24.b, z3.b[2]\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z25.b, z0.b[2]\n" - "sdot z14.s, z25.b, z1.b[2]\n" - "sdot z18.s, z25.b, z2.b[2]\n" - "sdot z22.s, z25.b, z3.b[2]\n" - "sdot z11.s, z24.b, z0.b[2]\n" - "sdot z15.s, z24.b, z1.b[2]\n" - "sdot z19.s, z24.b, z2.b[2]\n" - "sdot z23.s, z24.b, z3.b[2]\n" - "ble 48f\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z25.b, z0.b[3]\n" - "sdot z12.s, z25.b, z1.b[3]\n" - "sdot z16.s, z25.b, z2.b[3]\n" - "sdot z20.s, z25.b, z3.b[3]\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z24.b, z0.b[3]\n" - "sdot z13.s, z24.b, z1.b[3]\n" - "sdot z17.s, z24.b, z2.b[3]\n" - "sdot z21.s, z24.b, z3.b[3]\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ble 45f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot 
z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z25.b, z0.b[3]\n" - "sdot z14.s, z25.b, z1.b[3]\n" - "sdot z18.s, z25.b, z2.b[3]\n" - "sdot z22.s, z25.b, z3.b[3]\n" - "sdot z11.s, z24.b, z0.b[3]\n" - "sdot z15.s, z24.b, z1.b[3]\n" - "sdot z19.s, z24.b, z2.b[3]\n" - "sdot z23.s, z24.b, z3.b[3]\n" - "48:" // Height 4: Multiply loop: multiply skip + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "45:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 43b\n" + "bne 40b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z27.s }, p2/Z, [x14]\n" - "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" - "add z8.s, z8.s, z27.s\n" - "add z12.s, z12.s, z27.s\n" - "add x24, x25, x20\n" - "add z9.s, z9.s, z26.s\n" - "add z10.s, z10.s, z25.s\n" - "add z11.s, z11.s, z24.s\n" - "add z13.s, z13.s, z26.s\n" - "add z14.s, z14.s, z25.s\n" - "add z15.s, z15.s, z24.s\n" - "add z16.s, z16.s, z27.s\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z25.s\n" - "add z19.s, z19.s, z24.s\n" - "add z20.s, z20.s, z27.s\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z25.s\n" - "add z23.s, z23.s, z24.s\n" - "tbz %x[flags], #4, 49f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, 
MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" + "add z8.s, z8.s, z0.s\n" + "add z12.s, z12.s, z0.s\n" + "add x25, x26, x20\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z13.s, z13.s, z1.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "tbz %x[flags], #4, 46f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 50f\n" - "49:" // Height 4: per layer parameters + "addvl x14, x14, #4\n" + "b 47f\n" + "46:" // Height 4: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -1260,173 +1171,122 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "50:" // Height 4: parameters loaded - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" - ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" - ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" - ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - 
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" - ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" - ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" - "tbz %x[flags], #5, 51f\n" - "and z27.d, z8.d, z0.d\n" - "and z26.d, z9.d, z1.d\n" - "and z25.d, z10.d, z2.d\n" - "and z24.d, z11.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z8.s, z8.s, z27.s\n" - "and z27.d, z12.d, z0.d\n" - "sqadd z9.s, z9.s, z26.s\n" - "and z26.d, z13.d, z1.d\n" - "sqadd z10.s, z10.s, z25.s\n" - "sqadd z11.s, z11.s, z24.s\n" - "and z25.d, z14.d, z2.d\n" - "and z24.d, z15.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z12.s, z12.s, z27.s\n" - "and z27.d, z16.d, z0.d\n" - "sqadd z13.s, z13.s, z26.s\n" - "and z26.d, z17.d, z1.d\n" - "sqadd z14.s, z14.s, z25.s\n" - "sqadd z15.s, z15.s, z24.s\n" - "and z25.d, z18.d, z2.d\n" - "and z24.d, z19.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z16.s, z16.s, z27.s\n" - "and z27.d, z20.d, z0.d\n" - "sqadd z17.s, z17.s, z26.s\n" - "and z26.d, z21.d, z1.d\n" - "sqadd z18.s, z18.s, z25.s\n" - "sqadd z19.s, z19.s, z24.s\n" - "and z25.d, z22.d, z2.d\n" - "and z24.d, z23.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z20.s, z20.s, z27.s\n" - "sqadd z21.s, z21.s, z26.s\n" - "sqadd z22.s, z22.s, z25.s\n" - "sqadd z23.s, z23.s, z24.s\n" - "51:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "47:" // Height 4: parameters loaded + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" 
+ "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a4718c // sqdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a571ad // sqdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a671ce // sqdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a771ef // sqdmulh z15.s, z15.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a572b5 // sqdmulh z21.s, z21.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z8.s, z8.s, z25.s\n" + ".inst 0x04a672d6 // sqdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a772f7 // sqdmulh z23.s, z23.s, z7.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z25.s\n" - "add z10.s, z10.s, z25.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z25.s\n" - "add z12.s, z12.s, z25.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z13.s, z13.s, z25.s\n" - "add z14.s, z14.s, z25.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl 
z22.s, p2/M, z22.s, z2.s\n" - "add z15.s, z15.s, z25.s\n" - "add z16.s, z16.s, z25.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z25.s\n" - "add z18.s, z18.s, z25.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z25.s\n" - "add z20.s, z20.s, z25.s\n" - "add x20, %x[qp], %[minval]\n" - "add z21.s, z21.s, z25.s\n" - "add z22.s, z22.s, z25.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z25.s\n" - "smin z8.s, p2/M, z8.s, z24.s\n" - "smin z9.s, p2/M, z9.s, z24.s\n" - "smin z10.s, p2/M, z10.s, z24.s\n" - "smin z11.s, p2/M, z11.s, z24.s\n" - "smin z12.s, p2/M, z12.s, z24.s\n" - "smin z13.s, p2/M, z13.s, z24.s\n" - "smin z14.s, p2/M, z14.s, z24.s\n" - "smin z15.s, p2/M, z15.s, z24.s\n" - "smin z16.s, p2/M, z16.s, z24.s\n" - "smin z17.s, p2/M, z17.s, z24.s\n" - "smin z18.s, p2/M, z18.s, z24.s\n" - "smin z19.s, p2/M, z19.s, z24.s\n" - "smin z20.s, p2/M, z20.s, z24.s\n" - "smin z21.s, p2/M, z21.s, z24.s\n" - "smin z22.s, p2/M, z22.s, z24.s\n" - "smin z23.s, p2/M, z23.s, z24.s\n" - "smax z8.s, p2/M, z8.s, z26.s\n" - "smax z9.s, p2/M, z9.s, z26.s\n" - "smax z10.s, p2/M, z10.s, z26.s\n" - "smax z11.s, p2/M, z11.s, z26.s\n" - "smax z12.s, p2/M, z12.s, z26.s\n" - "smax z13.s, p2/M, z13.s, z26.s\n" - "smax z14.s, p2/M, z14.s, z26.s\n" - "smax z15.s, p2/M, z15.s, z26.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, 
z6.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z26.s\n" - "smax z17.s, p2/M, z17.s, z26.s\n" - "uzp1 z25.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z26.s\n" - "smax z19.s, p2/M, z19.s, z26.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z12.h, z12.h, z13.h\n" - "smax z20.s, p2/M, z20.s, z26.s\n" - "smax z21.s, p2/M, z21.s, z26.s\n" - "uzp1 z24.h, z14.h, z15.h\n" - "smax z22.s, p2/M, z22.s, z26.s\n" - "smax z23.s, p2/M, z23.s, z26.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z25.b\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z8.b, z8.b, z9.b\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z12.b, z12.b, z24.b\n" - "uzp1 z17.h, z22.h, z23.h\n" + "uzp1 z12.b, z12.b, z13.b\n" + "uzp1 z21.h, z22.h, z23.h\n" "st1b { z8.b }, p1, [x9]\n" "addvl x9, x9, #1\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z20.b, z20.b, z17.b\n" - "st1b { z12.b }, p1, [x26]\n" - "st1b { z16.b }, p1, [x25]\n" - "st1b { z20.b }, p1, [x24]\n" - "52:" // Height 4: Writeback done + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z20.b, z20.b, 
z21.b\n" + "st1b { z12.b }, p1, [x27]\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z20.b }, p1, [x25]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 41b\n" - "b 80f\n" - "53:" // Height 5 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 38b\n" + "b 74f\n" + "49:" // Height 5 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "54:" // Height 5: Column loop + "50:" // Height 5: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -1449,13 +1309,12 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "55:" // Height 5: setup done "mov x28, #0x0\n" - "56:" // Height 5: String loop + "52:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -1463,296 +1322,296 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" - "cbnz x28, 58f\n" + "cbnz x28, 54f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 58f\n" - "57:" // Height 5: setup direct input + "b 54f\n" + "53:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "58:" // Height 5: input setup done + "54:" // Height 5: input setup done "cmp x27, #0x10\n" - "ble 60f\n" - "59:" // 
Height 5: Multiply loop: Main loop head + "ble 56f\n" + "55:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z29.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z3.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "ld1rqb { z1.b }, p0/Z, [x23]\n" - "ld1rqb { z0.b }, p0/Z, [x22]\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "sdot z8.s, z29.b, z4.b[0]\n" - "sdot z12.s, z29.b, z3.b[0]\n" - "sdot z9.s, z28.b, z4.b[0]\n" - "sdot z16.s, z29.b, z2.b[0]\n" - "sdot z20.s, z29.b, z1.b[0]\n" - "sdot z24.s, z29.b, z0.b[0]\n" - "sdot z13.s, z28.b, z3.b[0]\n" - "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z17.s, z28.b, z2.b[0]\n" - "sdot z21.s, z28.b, z1.b[0]\n" - "sdot z25.s, z28.b, z0.b[0]\n" - "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z29.b, z4.b[0]\n" - "sdot z14.s, z29.b, z3.b[0]\n" - "sdot z18.s, z29.b, z2.b[0]\n" - "sdot z22.s, z29.b, z1.b[0]\n" - "sdot z26.s, z29.b, z0.b[0]\n" - "ld1b { z29.b }, p2/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z28.b, z4.b[0]\n" - "sdot z15.s, z28.b, z3.b[0]\n" - "sdot z19.s, z28.b, z2.b[0]\n" - "sdot z23.s, z28.b, z1.b[0]\n" - "sdot z27.s, z28.b, z0.b[0]\n" - "ld1b { z28.b }, p2/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z29.b, z4.b[1]\n" - "sdot z12.s, z29.b, z3.b[1]\n" - "sdot z16.s, z29.b, z2.b[1]\n" - "sdot z20.s, z29.b, z1.b[1]\n" - "sdot z24.s, z29.b, z0.b[1]\n" - "ld1b { z29.b }, p2/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z28.b, z4.b[1]\n" - "sdot z13.s, z28.b, z3.b[1]\n" - "sdot z17.s, z28.b, z2.b[1]\n" - "sdot z21.s, z28.b, z1.b[1]\n" - "sdot z25.s, z28.b, z0.b[1]\n" - "ld1b { z28.b 
}, p2/Z, [x10, #7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z29.b, z4.b[1]\n" - "sdot z14.s, z29.b, z3.b[1]\n" - "sdot z18.s, z29.b, z2.b[1]\n" - "sdot z22.s, z29.b, z1.b[1]\n" - "sdot z26.s, z29.b, z0.b[1]\n" - "sdot z11.s, z28.b, z4.b[1]\n" - "sdot z15.s, z28.b, z3.b[1]\n" - "ld1b { z29.b }, p2/Z, [x10, #-8, MUL VL]\n" - "sdot z19.s, z28.b, z2.b[1]\n" - "sdot z23.s, z28.b, z1.b[1]\n" - "sdot z27.s, z28.b, z0.b[1]\n" - "ld1b { z28.b }, p2/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z29.b, z4.b[2]\n" - "sdot z12.s, z29.b, z3.b[2]\n" - "sdot z16.s, z29.b, z2.b[2]\n" - "sdot z20.s, z29.b, z1.b[2]\n" - "sdot z24.s, z29.b, z0.b[2]\n" - "ld1b { z29.b }, p2/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z28.b, z4.b[2]\n" - "sdot z13.s, z28.b, z3.b[2]\n" - "sdot z17.s, 
z28.b, z2.b[2]\n" - "sdot z21.s, z28.b, z1.b[2]\n" - "sdot z25.s, z28.b, z0.b[2]\n" - "ld1b { z28.b }, p2/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z29.b, z4.b[2]\n" - "sdot z14.s, z29.b, z3.b[2]\n" - "sdot z18.s, z29.b, z2.b[2]\n" - "sdot z22.s, z29.b, z1.b[2]\n" - "sdot z26.s, z29.b, z0.b[2]\n" - "ld1b { z29.b }, p2/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z28.b, z4.b[2]\n" - "sdot z15.s, z28.b, z3.b[2]\n" - "sdot z19.s, z28.b, z2.b[2]\n" - "sdot z23.s, z28.b, z1.b[2]\n" - "sdot z27.s, z28.b, z0.b[2]\n" - "ld1b { z28.b }, p2/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z29.b, z4.b[3]\n" - "sdot z12.s, z29.b, z3.b[3]\n" - "sdot z16.s, z29.b, z2.b[3]\n" - "sdot z20.s, z29.b, z1.b[3]\n" - "sdot z24.s, z29.b, z0.b[3]\n" - "ld1b { z29.b }, p2/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z28.b, z4.b[3]\n" - "sdot z13.s, z28.b, z3.b[3]\n" - "sdot z17.s, z28.b, z2.b[3]\n" - "sdot z21.s, z28.b, z1.b[3]\n" - "sdot z25.s, z28.b, z0.b[3]\n" - "ld1b { z28.b }, p2/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z29.b, z4.b[3]\n" - "sdot z14.s, z29.b, z3.b[3]\n" - "sdot z18.s, z29.b, z2.b[3]\n" - "sdot z22.s, z29.b, z1.b[3]\n" - "sdot z26.s, z29.b, z0.b[3]\n" - "sdot z11.s, z28.b, z4.b[3]\n" - "sdot z15.s, z28.b, z3.b[3]\n" - "sdot z19.s, z28.b, z2.b[3]\n" - "sdot z23.s, z28.b, z1.b[3]\n" - "sdot z27.s, z28.b, z0.b[3]\n" - "bgt 59b\n" - "60:" // Height 5: Multiply loop: Single iteration only + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #-8, MUL VL]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-6, MUL VL]\n" + 
"sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "bgt 55b\n" + "56:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z29.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" - "sdot z8.s, z29.b, z0.b[0]\n" - "sdot z12.s, z29.b, z1.b[0]\n" - "sdot z9.s, z28.b, z0.b[0]\n" - "sdot z13.s, z28.b, z1.b[0]\n" - "sdot z16.s, z29.b, z2.b[0]\n" - "sdot z20.s, z29.b, z3.b[0]\n" - "sdot 
z24.s, z29.b, z4.b[0]\n" - "sdot z17.s, z28.b, z2.b[0]\n" - "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z21.s, z28.b, z3.b[0]\n" - "sdot z25.s, z28.b, z4.b[0]\n" - "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z29.b, z0.b[0]\n" - "sdot z14.s, z29.b, z1.b[0]\n" - "sdot z18.s, z29.b, z2.b[0]\n" - "sdot z22.s, z29.b, z3.b[0]\n" - "sdot z26.s, z29.b, z4.b[0]\n" - "sdot z11.s, z28.b, z0.b[0]\n" - "sdot z15.s, z28.b, z1.b[0]\n" - "sdot z19.s, z28.b, z2.b[0]\n" - "sdot z23.s, z28.b, z3.b[0]\n" - "sdot z27.s, z28.b, z4.b[0]\n" - "ble 61f\n" - "ld1b { z29.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ble 57f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z29.b, z0.b[1]\n" - "sdot z12.s, z29.b, z1.b[1]\n" - "sdot z16.s, z29.b, z2.b[1]\n" - "sdot z20.s, z29.b, z3.b[1]\n" - "sdot z24.s, z29.b, z4.b[1]\n" - "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z28.b, z0.b[1]\n" - "sdot z13.s, z28.b, z1.b[1]\n" - "sdot z17.s, z28.b, z2.b[1]\n" - "sdot z21.s, z28.b, z3.b[1]\n" - "sdot z25.s, z28.b, z4.b[1]\n" - "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z29.b, z0.b[1]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot 
z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" "addvl x10, x10, #4\n" - "sdot z14.s, z29.b, z1.b[1]\n" - "sdot z18.s, z29.b, z2.b[1]\n" - "sdot z22.s, z29.b, z3.b[1]\n" - "sdot z26.s, z29.b, z4.b[1]\n" - "sdot z11.s, z28.b, z0.b[1]\n" - "sdot z15.s, z28.b, z1.b[1]\n" - "sdot z19.s, z28.b, z2.b[1]\n" - "sdot z23.s, z28.b, z3.b[1]\n" - "sdot z27.s, z28.b, z4.b[1]\n" - "ble 61f\n" - "ld1b { z29.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ble 57f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z29.b, z0.b[2]\n" - "sdot z12.s, z29.b, z1.b[2]\n" - "sdot z16.s, z29.b, z2.b[2]\n" - "sdot z20.s, z29.b, z3.b[2]\n" - "sdot z24.s, z29.b, z4.b[2]\n" - "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z28.b, z0.b[2]\n" - "sdot z13.s, z28.b, z1.b[2]\n" - "sdot z17.s, z28.b, z2.b[2]\n" - "sdot z21.s, z28.b, z3.b[2]\n" - "sdot z25.s, z28.b, z4.b[2]\n" - "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z29.b, z0.b[2]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, 
z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" "addvl x10, x10, #4\n" - "sdot z14.s, z29.b, z1.b[2]\n" - "sdot z18.s, z29.b, z2.b[2]\n" - "sdot z22.s, z29.b, z3.b[2]\n" - "sdot z26.s, z29.b, z4.b[2]\n" - "sdot z11.s, z28.b, z0.b[2]\n" - "sdot z15.s, z28.b, z1.b[2]\n" - "sdot z19.s, z28.b, z2.b[2]\n" - "sdot z23.s, z28.b, z3.b[2]\n" - "sdot z27.s, z28.b, z4.b[2]\n" - "ble 61f\n" - "ld1b { z29.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z29.b, z0.b[3]\n" - "sdot z12.s, z29.b, z1.b[3]\n" - "sdot z16.s, z29.b, z2.b[3]\n" - "sdot z20.s, z29.b, z3.b[3]\n" - "sdot z24.s, z29.b, z4.b[3]\n" - "ld1b { z29.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z28.b, z0.b[3]\n" - "sdot z13.s, z28.b, z1.b[3]\n" - "sdot z17.s, z28.b, z2.b[3]\n" - "sdot z21.s, z28.b, z3.b[3]\n" - "sdot z25.s, z28.b, z4.b[3]\n" - "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ble 57f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z29.b, z0.b[3]\n" - "sdot z14.s, z29.b, z1.b[3]\n" - "sdot z18.s, z29.b, z2.b[3]\n" - "sdot z22.s, z29.b, z3.b[3]\n" - "sdot z26.s, z29.b, z4.b[3]\n" - "sdot z11.s, z28.b, z0.b[3]\n" - "sdot z15.s, z28.b, z1.b[3]\n" - 
"sdot z19.s, z28.b, z2.b[3]\n" - "sdot z23.s, z28.b, z3.b[3]\n" - "sdot z27.s, z28.b, z4.b[3]\n" - "61:" // Height 5: Multiply loop: multiply skip + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "57:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 56b\n" + "bne 52b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z31.s }, p2/Z, [x14]\n" - "ld1w { z30.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" - "add z8.s, z8.s, z31.s\n" - "add z12.s, z12.s, z31.s\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add z9.s, z9.s, z30.s\n" - "add z10.s, z10.s, z29.s\n" - "add z11.s, z11.s, z28.s\n" - "add z13.s, z13.s, z30.s\n" - "add z14.s, z14.s, z29.s\n" - "add z15.s, z15.s, z28.s\n" - "add z16.s, z16.s, z31.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z29.s\n" - "add z19.s, z19.s, z28.s\n" - "add z20.s, z20.s, z31.s\n" - "add z21.s, z21.s, z30.s\n" - "add z22.s, z22.s, z29.s\n" - "add z23.s, z23.s, z28.s\n" - "add z24.s, z24.s, z31.s\n" - "add z25.s, z25.s, z30.s\n" - "add z26.s, z26.s, z29.s\n" - "add z27.s, z27.s, z28.s\n" - "tbz %x[flags], #4, 62f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "add x27, x9, x20\n" + "add x26, 
x27, x20\n" + "add z8.s, z8.s, z0.s\n" + "add z12.s, z12.s, z0.s\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z13.s, z13.s, z1.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 58f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 63f\n" - "62:" // Height 5: per layer parameters + "addvl x14, x14, #4\n" + "b 59f\n" + "58:" // Height 5: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -1763,213 +1622,150 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "63:" // Height 5: parameters loaded - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" - ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" - ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" - ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh 
z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" - ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" - ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" - ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" - ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" - "tbz %x[flags], #5, 64f\n" - "and z31.d, z8.d, z0.d\n" - "and z30.d, z9.d, z1.d\n" - "and z29.d, z10.d, z2.d\n" - "and z28.d, z11.d, z3.d\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z8.s, z8.s, z31.s\n" - "and z31.d, z12.d, z0.d\n" - "sqadd z9.s, z9.s, z30.s\n" - "and z30.d, z13.d, z1.d\n" - "sqadd z10.s, z10.s, z29.s\n" - "sqadd z11.s, z11.s, z28.s\n" - "and z29.d, z14.d, z2.d\n" - "and z28.d, z15.d, z3.d\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z12.s, z12.s, z31.s\n" - "and z31.d, z16.d, z0.d\n" - "sqadd z13.s, z13.s, z30.s\n" - "and z30.d, z17.d, z1.d\n" - "sqadd z14.s, z14.s, z29.s\n" - "sqadd z15.s, z15.s, z28.s\n" - "and z29.d, z18.d, z2.d\n" - "and z28.d, z19.d, z3.d\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z16.s, z16.s, z31.s\n" - "and z31.d, z20.d, z0.d\n" - "sqadd z17.s, z17.s, z30.s\n" - "and z30.d, z21.d, z1.d\n" - "sqadd z18.s, z18.s, z29.s\n" - "sqadd z19.s, z19.s, z28.s\n" - "and z29.d, z22.d, z2.d\n" - "and z28.d, z23.d, z3.d\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z20.s, z20.s, z31.s\n" - "and z31.d, z24.d, z0.d\n" - "sqadd z21.s, z21.s, z30.s\n" - "and z30.d, z25.d, z1.d\n" - "sqadd z22.s, 
z22.s, z29.s\n" - "sqadd z23.s, z23.s, z28.s\n" - "and z29.d, z26.d, z2.d\n" - "and z28.d, z27.d, z3.d\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z24.s, z24.s, z31.s\n" - "sqadd z25.s, z25.s, z30.s\n" - "sqadd z26.s, z26.s, z29.s\n" - "sqadd z27.s, z27.s, z28.s\n" - "64:" // Height 5: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "59:" // Height 5: parameters loaded + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a4718c // sqdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a571ad // sqdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a671ce // sqdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a771ef // sqdmulh z15.s, z15.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a572b5 // sqdmulh z21.s, z21.s, z5.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z8.s, z8.s, z29.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x04a672d6 // sqdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a772f7 // sqdmulh z23.s, z23.s, z7.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z29.s\n" - 
"add z10.s, z10.s, z29.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57339 // sqdmulh z25.s, z25.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z29.s\n" - "add z12.s, z12.s, z29.s\n" + ".inst 0x04a6735a // sqdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7737b // sqdmulh z27.s, z27.s, z7.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z13.s, z13.s, z29.s\n" - "add z14.s, z14.s, z29.s\n" ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" - "add z15.s, z15.s, z29.s\n" - "add z16.s, z16.s, z29.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z29.s\n" - "add z18.s, z18.s, z29.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" - "add z19.s, z19.s, z29.s\n" - "add z20.s, z20.s, z29.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z21.s, z21.s, z29.s\n" - "add z22.s, z22.s, z29.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z29.s\n" - "add z24.s, z24.s, z29.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z29.s\n" - "add z26.s, z26.s, z29.s\n" - "ld1rw { z30.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z29.s\n" - "smin z8.s, p2/M, z8.s, z28.s\n" - "smin z9.s, p2/M, z9.s, z28.s\n" - "smin z10.s, p2/M, z10.s, z28.s\n" - "smin z11.s, p2/M, z11.s, z28.s\n" - "smin z12.s, p2/M, z12.s, z28.s\n" - "smin z13.s, p2/M, z13.s, z28.s\n" - "smin z14.s, p2/M, z14.s, z28.s\n" - "smin z15.s, p2/M, z15.s, z28.s\n" - 
"smin z16.s, p2/M, z16.s, z28.s\n" - "smin z17.s, p2/M, z17.s, z28.s\n" - "smin z18.s, p2/M, z18.s, z28.s\n" - "smin z19.s, p2/M, z19.s, z28.s\n" - "smin z20.s, p2/M, z20.s, z28.s\n" - "smin z21.s, p2/M, z21.s, z28.s\n" - "smin z22.s, p2/M, z22.s, z28.s\n" - "smin z23.s, p2/M, z23.s, z28.s\n" - "smin z24.s, p2/M, z24.s, z28.s\n" - "smin z25.s, p2/M, z25.s, z28.s\n" - "smin z26.s, p2/M, z26.s, z28.s\n" - "smin z27.s, p2/M, z27.s, z28.s\n" - "smax z8.s, p2/M, z8.s, z30.s\n" - "smax z9.s, p2/M, z9.s, z30.s\n" - "smax z10.s, p2/M, z10.s, z30.s\n" - "smax z11.s, p2/M, z11.s, z30.s\n" - "smax z12.s, p2/M, z12.s, z30.s\n" - "smax z13.s, p2/M, z13.s, z30.s\n" - "smax z14.s, p2/M, z14.s, z30.s\n" - "smax z15.s, p2/M, z15.s, z30.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, 
p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z30.s\n" - "smax z17.s, p2/M, z17.s, z30.s\n" - "uzp1 z29.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z30.s\n" - "smax z19.s, p2/M, z19.s, z30.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z12.h, z12.h, z13.h\n" - "smax z20.s, p2/M, z20.s, z30.s\n" - "smax z21.s, p2/M, z21.s, z30.s\n" - "uzp1 z28.h, z14.h, z15.h\n" - "smax z22.s, p2/M, z22.s, z30.s\n" - "smax z23.s, p2/M, z23.s, z30.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z29.b\n" - "smax z24.s, p2/M, z24.s, z30.s\n" - "smax z25.s, p2/M, z25.s, z30.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z30.s\n" - "smax z27.s, p2/M, z27.s, z30.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z12.b, z12.b, z28.b\n" - "uzp1 z18.h, z22.h, z23.h\n" + "uzp1 z12.b, z12.b, z13.b\n" + "uzp1 z21.h, z22.h, z23.h\n" "st1b { z8.b }, p1, [x9]\n" "addvl x9, x9, #1\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "st1b { z12.b }, p1, [x26]\n" - "uzp1 z20.b, z20.b, z18.b\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x25]\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "65:" // Height 5: Writeback done + "uzp1 z25.h, z26.h, z27.h\n" + "st1b { z12.b }, p1, [x27]\n" + "uzp1 z20.b, z20.b, z21.b\n" + "uzp1 z24.b, 
z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z20.b }, p1, [x25]\n" + "st1b { z24.b }, p1, [x24]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 54b\n" - "b 80f\n" - "66:" // Height 6 + "bgt 50b\n" + "b 74f\n" + "61:" // Height 6 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x6\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "madd x20, x21, x20, x9\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "67:" // Height 6: Column loop + "62:" // Height 6: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -1996,13 +1792,12 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "68:" // Height 6: setup done "mov x28, #0x0\n" - "69:" // Height 6: String loop + "64:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 70f\n" + "tbz %x[flags], #3, 65f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -2011,7 +1806,7 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" "ldr x21, [x20, #0x28]\n" - "cbnz x28, 71f\n" + "cbnz x28, 66f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" @@ -2019,151 +1814,151 @@ void sve_hybrid_s8qs_dot_6x4VL ( "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 71f\n" - "70:" // Height 6: setup direct input + "b 66f\n" + "65:" // Height 6: setup direct input "mov x26, 
%x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "71:" // Height 6: input setup done + "66:" // Height 6: input setup done "cmp x27, #0x10\n" - "ble 73f\n" - "72:" // Height 6: Multiply loop: Main loop head + "ble 68f\n" + "67:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z1.b }, p2/Z, [x10]\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z6.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z2.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z2.b }, p0/Z, [x21]\n" + "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "sdot z8.s, z1.b, z7.b[0]\n" - "sdot z12.s, z1.b, z6.b[0]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" "add x21, x21, #0x10\n" - "sdot z16.s, z1.b, z5.b[0]\n" - "sdot z20.s, z1.b, z4.b[0]\n" - "sdot z24.s, z1.b, z3.b[0]\n" - "sdot z28.s, z1.b, z2.b[0]\n" - "ld1b { z1.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z0.b, z7.b[0]\n" - "sdot z13.s, z0.b, z6.b[0]\n" - "sdot z17.s, z0.b, z5.b[0]\n" - "sdot z21.s, z0.b, z4.b[0]\n" - "sdot z25.s, z0.b, z3.b[0]\n" - "sdot z29.s, z0.b, z2.b[0]\n" - "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z1.b, z7.b[0]\n" - "sdot z14.s, z1.b, z6.b[0]\n" - "sdot z18.s, z1.b, z5.b[0]\n" - "sdot z22.s, z1.b, z4.b[0]\n" - "sdot z26.s, z1.b, z3.b[0]\n" - "sdot z30.s, z1.b, z2.b[0]\n" - "ld1b { z1.b }, p2/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z0.b, z7.b[0]\n" - "sdot z15.s, z0.b, z6.b[0]\n" - "sdot z19.s, z0.b, 
z5.b[0]\n" - "sdot z23.s, z0.b, z4.b[0]\n" - "sdot z27.s, z0.b, z3.b[0]\n" - "sdot z31.s, z0.b, z2.b[0]\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z1.b, z7.b[1]\n" - "sdot z12.s, z1.b, z6.b[1]\n" - "sdot z16.s, z1.b, z5.b[1]\n" - "sdot z20.s, z1.b, z4.b[1]\n" - "sdot z24.s, z1.b, z3.b[1]\n" - "sdot z28.s, z1.b, z2.b[1]\n" - "ld1b { z1.b }, p2/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z0.b, z7.b[1]\n" - "sdot z13.s, z0.b, z6.b[1]\n" - "sdot z17.s, z0.b, z5.b[1]\n" - "sdot z21.s, z0.b, z4.b[1]\n" - "sdot z25.s, z0.b, z3.b[1]\n" - "sdot z29.s, z0.b, z2.b[1]\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, 
#16\n" - "sdot z10.s, z1.b, z7.b[1]\n" - "sdot z14.s, z1.b, z6.b[1]\n" - "sdot z18.s, z1.b, z5.b[1]\n" - "sdot z22.s, z1.b, z4.b[1]\n" - "sdot z26.s, z1.b, z3.b[1]\n" - "sdot z30.s, z1.b, z2.b[1]\n" - "ld1b { z1.b }, p2/Z, [x10, #-8, MUL VL]\n" - "sdot z11.s, z0.b, z7.b[1]\n" - "sdot z15.s, z0.b, z6.b[1]\n" - "sdot z19.s, z0.b, z5.b[1]\n" - "sdot z23.s, z0.b, z4.b[1]\n" - "sdot z27.s, z0.b, z3.b[1]\n" - "sdot z31.s, z0.b, z2.b[1]\n" - "ld1b { z0.b }, p2/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z1.b, z7.b[2]\n" - "sdot z12.s, z1.b, z6.b[2]\n" - "sdot z16.s, z1.b, z5.b[2]\n" - "sdot z20.s, z1.b, z4.b[2]\n" - "sdot z24.s, z1.b, z3.b[2]\n" - "sdot z28.s, z1.b, z2.b[2]\n" - "ld1b { z1.b }, p2/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z0.b, z7.b[2]\n" - "sdot z13.s, z0.b, z6.b[2]\n" - "sdot z17.s, z0.b, z5.b[2]\n" - "sdot z21.s, z0.b, z4.b[2]\n" - "sdot z25.s, z0.b, z3.b[2]\n" - "sdot z29.s, z0.b, z2.b[2]\n" - "ld1b { z0.b }, p2/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z1.b, z7.b[2]\n" - "sdot z14.s, z1.b, z6.b[2]\n" - "sdot z18.s, z1.b, z5.b[2]\n" - "sdot z22.s, z1.b, z4.b[2]\n" - "sdot z26.s, z1.b, z3.b[2]\n" - "sdot z30.s, z1.b, z2.b[2]\n" - "ld1b { z1.b }, p2/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z0.b, z7.b[2]\n" - "sdot z15.s, z0.b, z6.b[2]\n" - "sdot z19.s, z0.b, z5.b[2]\n" - "sdot z23.s, z0.b, z4.b[2]\n" - "sdot z27.s, z0.b, z3.b[2]\n" - "sdot z31.s, z0.b, z2.b[2]\n" - "ld1b { z0.b }, p2/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z1.b, z7.b[3]\n" - "sdot z12.s, z1.b, z6.b[3]\n" - "sdot z16.s, z1.b, z5.b[3]\n" - "sdot z20.s, z1.b, z4.b[3]\n" - "sdot z24.s, z1.b, z3.b[3]\n" - "sdot z28.s, z1.b, z2.b[3]\n" - "ld1b { z1.b }, p2/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z0.b, z7.b[3]\n" - "sdot z13.s, z0.b, z6.b[3]\n" - "sdot z17.s, z0.b, z5.b[3]\n" - "sdot z21.s, z0.b, z4.b[3]\n" - "sdot z25.s, z0.b, z3.b[3]\n" - "sdot z29.s, z0.b, z2.b[3]\n" - "ld1b { z0.b }, p2/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z1.b, z7.b[3]\n" - "sdot z14.s, z1.b, z6.b[3]\n" - "sdot z18.s, z1.b, 
z5.b[3]\n" - "sdot z22.s, z1.b, z4.b[3]\n" - "sdot z26.s, z1.b, z3.b[3]\n" - "sdot z30.s, z1.b, z2.b[3]\n" - "sdot z11.s, z0.b, z7.b[3]\n" - "sdot z15.s, z0.b, z6.b[3]\n" - "sdot z19.s, z0.b, z5.b[3]\n" - "sdot z23.s, z0.b, z4.b[3]\n" - "sdot z27.s, z0.b, z3.b[3]\n" - "sdot z31.s, z0.b, z2.b[3]\n" - "bgt 72b\n" - "73:" // Height 6: Multiply loop: Single iteration only + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { 
z6.b }, p2/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "bgt 67b\n" + "68:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x10]\n" - "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" @@ -2171,178 +1966,178 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" "ld1rqb { z5.b }, p0/Z, [x21]\n" - "sdot z8.s, z7.b, z0.b[0]\n" - "sdot z12.s, z7.b, z1.b[0]\n" - "sdot z9.s, z6.b, z0.b[0]\n" - "sdot z13.s, z6.b, z1.b[0]\n" - "sdot z16.s, z7.b, z2.b[0]\n" - "sdot z20.s, z7.b, z3.b[0]\n" - "sdot z24.s, z7.b, z4.b[0]\n" - "sdot z28.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z17.s, z6.b, z2.b[0]\n" - "sdot z21.s, z6.b, z3.b[0]\n" - "sdot z25.s, z6.b, z4.b[0]\n" - "sdot z29.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + 
"sdot z25.s, z7.b, z4.b[0]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z7.b, z0.b[0]\n" - "sdot z14.s, z7.b, z1.b[0]\n" - "sdot z18.s, z7.b, z2.b[0]\n" - "sdot z22.s, z7.b, z3.b[0]\n" - "sdot z26.s, z7.b, z4.b[0]\n" - "sdot z30.s, z7.b, z5.b[0]\n" - "sdot z11.s, z6.b, z0.b[0]\n" - "sdot z15.s, z6.b, z1.b[0]\n" - "sdot z19.s, z6.b, z2.b[0]\n" - "sdot z23.s, z6.b, z3.b[0]\n" - "sdot z27.s, z6.b, z4.b[0]\n" - "sdot z31.s, z6.b, z5.b[0]\n" - "ble 74f\n" - "ld1b { z7.b }, p2/Z, [x10]\n" - "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ble 69f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z7.b, z0.b[1]\n" - "sdot z12.s, z7.b, z1.b[1]\n" - "sdot z16.s, z7.b, z2.b[1]\n" - "sdot z20.s, z7.b, z3.b[1]\n" - "sdot z24.s, z7.b, z4.b[1]\n" - "sdot z28.s, z7.b, z5.b[1]\n" - "sdot z9.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z6.b, z1.b[1]\n" - "sdot z17.s, z6.b, z2.b[1]\n" - "sdot z21.s, z6.b, z3.b[1]\n" - "sdot z25.s, z6.b, z4.b[1]\n" - "sdot z29.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z7.b, z0.b[1]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, 
z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" "addvl x10, x10, #4\n" - "sdot z14.s, z7.b, z1.b[1]\n" - "sdot z18.s, z7.b, z2.b[1]\n" - "sdot z22.s, z7.b, z3.b[1]\n" - "sdot z26.s, z7.b, z4.b[1]\n" - "sdot z30.s, z7.b, z5.b[1]\n" - "sdot z11.s, z6.b, z0.b[1]\n" - "sdot z15.s, z6.b, z1.b[1]\n" - "sdot z19.s, z6.b, z2.b[1]\n" - "sdot z23.s, z6.b, z3.b[1]\n" - "sdot z27.s, z6.b, z4.b[1]\n" - "sdot z31.s, z6.b, z5.b[1]\n" - "ble 74f\n" - "ld1b { z7.b }, p2/Z, [x10]\n" - "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ble 69f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z8.s, z7.b, z0.b[2]\n" - "sdot z12.s, z7.b, z1.b[2]\n" - "sdot z16.s, z7.b, z2.b[2]\n" - "sdot z20.s, z7.b, z3.b[2]\n" - "sdot z24.s, z7.b, z4.b[2]\n" - "sdot z28.s, z7.b, z5.b[2]\n" - "sdot z9.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z6.b, z1.b[2]\n" - "sdot z17.s, z6.b, z2.b[2]\n" - "sdot z21.s, z6.b, z3.b[2]\n" - "sdot z25.s, z6.b, z4.b[2]\n" - "sdot z29.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z7.b, z0.b[2]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { 
z7.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" "addvl x10, x10, #4\n" - "sdot z14.s, z7.b, z1.b[2]\n" - "sdot z18.s, z7.b, z2.b[2]\n" - "sdot z22.s, z7.b, z3.b[2]\n" - "sdot z26.s, z7.b, z4.b[2]\n" - "sdot z30.s, z7.b, z5.b[2]\n" - "sdot z11.s, z6.b, z0.b[2]\n" - "sdot z15.s, z6.b, z1.b[2]\n" - "sdot z19.s, z6.b, z2.b[2]\n" - "sdot z23.s, z6.b, z3.b[2]\n" - "sdot z27.s, z6.b, z4.b[2]\n" - "sdot z31.s, z6.b, z5.b[2]\n" - "ble 74f\n" - "ld1b { z7.b }, p2/Z, [x10]\n" - "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z7.b, z0.b[3]\n" - "sdot z12.s, z7.b, z1.b[3]\n" - "sdot z16.s, z7.b, z2.b[3]\n" - "sdot z20.s, z7.b, z3.b[3]\n" - "sdot z24.s, z7.b, z4.b[3]\n" - "sdot z28.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z6.b, z0.b[3]\n" - "sdot z13.s, z6.b, z1.b[3]\n" - "sdot z17.s, z6.b, z2.b[3]\n" - "sdot z21.s, z6.b, z3.b[3]\n" - "sdot z25.s, z6.b, z4.b[3]\n" - "sdot z29.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ble 69f\n" + "ld1b { z6.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p2/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z7.b, z0.b[3]\n" - "sdot z14.s, z7.b, 
z1.b[3]\n" - "sdot z18.s, z7.b, z2.b[3]\n" - "sdot z22.s, z7.b, z3.b[3]\n" - "sdot z26.s, z7.b, z4.b[3]\n" - "sdot z30.s, z7.b, z5.b[3]\n" - "sdot z11.s, z6.b, z0.b[3]\n" - "sdot z15.s, z6.b, z1.b[3]\n" - "sdot z19.s, z6.b, z2.b[3]\n" - "sdot z23.s, z6.b, z3.b[3]\n" - "sdot z27.s, z6.b, z4.b[3]\n" - "sdot z31.s, z6.b, z5.b[3]\n" - "74:" // Height 6: Multiply loop: multiply skip + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "69:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 69b\n" + "bne 64b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z3.s }, p2/Z, [x14]\n" - "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add x26, x9, x20\n" - "add x25, x26, x20\n" - "add z8.s, z8.s, z3.s\n" - "add z12.s, z12.s, z3.s\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add z9.s, z9.s, z2.s\n" - "add z10.s, z10.s, z1.s\n" - "add x22, x23, x20\n" - "add z11.s, z11.s, z0.s\n" - "add z13.s, z13.s, z2.s\n" - "add z14.s, z14.s, z1.s\n" - "add z15.s, z15.s, z0.s\n" - "add z16.s, z16.s, z3.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z1.s\n" - "add z19.s, z19.s, z0.s\n" - "add z20.s, z20.s, z3.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z1.s\n" - "add z23.s, z23.s, z0.s\n" - "add z24.s, z24.s, z3.s\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z1.s\n" - "add z27.s, z27.s, z0.s\n" - "add z28.s, z28.s, z3.s\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z1.s\n" - "add z31.s, z31.s, z0.s\n" - 
"tbz %x[flags], #4, 75f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "add x27, x9, x20\n" + "add x26, x27, x20\n" + "add z8.s, z8.s, z0.s\n" + "add z12.s, z12.s, z0.s\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add x23, x24, x20\n" + "add z11.s, z11.s, z3.s\n" + "add z13.s, z13.s, z1.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + "tbz %x[flags], #4, 70f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 76f\n" - "75:" // Height 6: per layer parameters + "addvl x14, x14, #4\n" + "b 71f\n" + "70:" // Height 6: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -2353,248 +2148,173 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "76:" // Height 6: parameters loaded - ".inst 
0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" - ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" - ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" - ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" - ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" - ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" - ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" - ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" - ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" - ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n" - ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n" - ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n" - "tbz %x[flags], #5, 77f\n" - "and z7.d, z8.d, z0.d\n" - "and z6.d, z9.d, z1.d\n" - "and z5.d, z10.d, z2.d\n" - "and z4.d, z11.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z8.s, z8.s, z7.s\n" - "and z7.d, z12.d, z0.d\n" - "sqadd z9.s, z9.s, z6.s\n" - "and z6.d, z13.d, z1.d\n" - "sqadd z10.s, z10.s, z5.s\n" - "sqadd z11.s, z11.s, z4.s\n" - "and z5.d, z14.d, z2.d\n" - "and z4.d, z15.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z12.s, z12.s, z7.s\n" - "and z7.d, z16.d, z0.d\n" - "sqadd z13.s, z13.s, z6.s\n" - "and z6.d, z17.d, z1.d\n" - 
"sqadd z14.s, z14.s, z5.s\n" - "sqadd z15.s, z15.s, z4.s\n" - "and z5.d, z18.d, z2.d\n" - "and z4.d, z19.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z16.s, z16.s, z7.s\n" - "and z7.d, z20.d, z0.d\n" - "sqadd z17.s, z17.s, z6.s\n" - "and z6.d, z21.d, z1.d\n" - "sqadd z18.s, z18.s, z5.s\n" - "sqadd z19.s, z19.s, z4.s\n" - "and z5.d, z22.d, z2.d\n" - "and z4.d, z23.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z20.s, z20.s, z7.s\n" - "and z7.d, z24.d, z0.d\n" - "sqadd z21.s, z21.s, z6.s\n" - "and z6.d, z25.d, z1.d\n" - "sqadd z22.s, z22.s, z5.s\n" - "sqadd z23.s, z23.s, z4.s\n" - "and z5.d, z26.d, z2.d\n" - "and z4.d, z27.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z24.s, z24.s, z7.s\n" - "and z7.d, z28.d, z0.d\n" - "sqadd z25.s, z25.s, z6.s\n" - "and z6.d, z29.d, z1.d\n" - "sqadd z26.s, z26.s, z5.s\n" - "sqadd z27.s, z27.s, z4.s\n" - "and z5.d, z30.d, z2.d\n" - "and z4.d, z31.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z28.s, z28.s, z7.s\n" - "sqadd z29.s, z29.s, z6.s\n" - "sqadd z30.s, z30.s, z5.s\n" - "sqadd z31.s, z31.s, z4.s\n" - "77:" // Height 6: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "71:" // Height 6: parameters loaded + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a4718c // sqdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a571ad // sqdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a671ce // sqdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a771ef 
// sqdmulh z15.s, z15.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a572b5 // sqdmulh z21.s, z21.s, z5.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z8.s, z8.s, z4.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x04a672d6 // sqdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a772f7 // sqdmulh z23.s, z23.s, z7.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57339 // sqdmulh z25.s, z25.s, z5.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z4.s\n" - "add z12.s, z12.s, z4.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + ".inst 0x04a6735a // sqdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7737b // sqdmulh z27.s, z27.s, z7.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z13.s, z13.s, z4.s\n" - "add z14.s, z14.s, z4.s\n" ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a573bd // sqdmulh z29.s, z29.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" - "add z15.s, z15.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + ".inst 0x04a673de // sqdmulh z30.s, z30.s, z6.s\n" + ".inst 0x04a773ff // 
sqdmulh z31.s, z31.s, z7.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n" ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "add z22.s, z22.s, z4.s\n" "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" "add z24.s, z24.s, z4.s\n" - ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" "add z26.s, z26.s, z4.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" "add z28.s, z28.s, z4.s\n" - "add x20, 
%x[qp], %[minval]\n" "add z29.s, z29.s, z4.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" "add z30.s, z30.s, z4.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" "add z31.s, z31.s, z4.s\n" - "smin z8.s, p2/M, z8.s, z0.s\n" - "smin z9.s, p2/M, z9.s, z0.s\n" - "smin z10.s, p2/M, z10.s, z0.s\n" - "smin z11.s, p2/M, z11.s, z0.s\n" - "smin z12.s, p2/M, z12.s, z0.s\n" - "smin z13.s, p2/M, z13.s, z0.s\n" - "smin z14.s, p2/M, z14.s, z0.s\n" - "smin z15.s, p2/M, z15.s, z0.s\n" - "smin z16.s, p2/M, z16.s, z0.s\n" - "smin z17.s, p2/M, z17.s, z0.s\n" - "smin z18.s, p2/M, z18.s, z0.s\n" - "smin z19.s, p2/M, z19.s, z0.s\n" - "smin z20.s, p2/M, z20.s, z0.s\n" - "smin z21.s, p2/M, z21.s, z0.s\n" - "smin z22.s, p2/M, z22.s, z0.s\n" - "smin z23.s, p2/M, z23.s, z0.s\n" - "smin z24.s, p2/M, z24.s, z0.s\n" - "smin z25.s, p2/M, z25.s, z0.s\n" - "smin z26.s, p2/M, z26.s, z0.s\n" - "smin z27.s, p2/M, z27.s, z0.s\n" - "smin z28.s, p2/M, z28.s, z0.s\n" - "smin z29.s, p2/M, z29.s, z0.s\n" - "smin z30.s, p2/M, z30.s, z0.s\n" - "smin z31.s, p2/M, z31.s, z0.s\n" - "smax z8.s, p2/M, z8.s, z2.s\n" - "smax z9.s, p2/M, z9.s, z2.s\n" - "smax z10.s, p2/M, z10.s, z2.s\n" - "smax z11.s, p2/M, z11.s, z2.s\n" - "smax z12.s, p2/M, z12.s, z2.s\n" - "smax z13.s, p2/M, z13.s, z2.s\n" - "smax z14.s, p2/M, z14.s, z2.s\n" - "smax z15.s, p2/M, z15.s, z2.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z2.s\n" - "smax 
z17.s, p2/M, z17.s, z2.s\n" - "uzp1 z1.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z2.s\n" - "smax z19.s, p2/M, z19.s, z2.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z12.h, z12.h, z13.h\n" - "smax z20.s, p2/M, z20.s, z2.s\n" - "smax z21.s, p2/M, z21.s, z2.s\n" - "uzp1 z0.h, z14.h, z15.h\n" - "smax z22.s, p2/M, z22.s, z2.s\n" - "smax z23.s, p2/M, z23.s, z2.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z1.b\n" - "smax z24.s, p2/M, z24.s, z2.s\n" - "smax z25.s, p2/M, z25.s, z2.s\n" - "uzp1 z18.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z2.s\n" - "smax z27.s, p2/M, z27.s, z2.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z12.b, z12.b, z0.b\n" - "smax z28.s, p2/M, z28.s, z2.s\n" - "smax z29.s, p2/M, z29.s, z2.s\n" - "uzp1 z17.h, z22.h, z23.h\n" + "uzp1 z12.b, z12.b, z13.b\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" "st1b { z8.b }, p1, [x9]\n" - "smax z30.s, p2/M, z30.s, z2.s\n" - "smax z31.s, p2/M, z31.s, z2.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z18.h, z26.h, z27.h\n" - "st1b { z12.b }, p1, [x26]\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "st1b { z12.b }, p1, [x27]\n" "addvl x9, x9, #1\n" "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z20.b, z20.b, z17.b\n" - "uzp1 z17.h, z30.h, z31.h\n" - "st1b { z16.b }, p1, [x25]\n" - "uzp1 z24.b, z24.b, z18.b\n" - "uzp1 z28.b, z28.b, 
z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "st1b { z28.b }, p1, [x22]\n" - "78:" // Height 6: Writeback done + "uzp1 z20.b, z20.b, z21.b\n" + "uzp1 z29.h, z30.h, z31.h\n" + "st1b { z16.b }, p1, [x26]\n" + "uzp1 z24.b, z24.b, z25.b\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z20.b }, p1, [x25]\n" + "st1b { z24.b }, p1, [x24]\n" + "st1b { z28.b }, p1, [x23]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 67b\n" + "bgt 62b\n" "subs %x[M], %x[M], #0x6\n" - "beq 80f\n" + "beq 74f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 79f\n" + "tbz %x[flags], #3, 73f\n" "add x21, x21, #0x6\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "79:" // Update direct input + "73:" // Update direct input "mov x20, #0x6\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "80:" // Exit + "74:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", 
"x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp index 759e3e2f3d..d06123cc4f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -81,23 +80,20 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ka.multiplier_ptr=qp->per_channel_muls + col_base; ka.shift_ptr=qp->per_channel_right_shifts + col_base; } - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 66f\n" + "bge 61f\n" "cmp %x[M], #0x4\n" - "bgt 53f\n" - "beq 40f\n" + "bgt 49f\n" + "beq 37f\n" "cmp %x[M], #0x2\n" - "bgt 27f\n" - "beq 14f\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 25f\n" + "beq 13f\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" @@ -112,7 +108,6 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "whilelt p1.b, x20, x11\n" "mov z14.s, #0x0\n" "mov z15.s, #0x0\n" - "3:" // Height 1: setup done "mov x28, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -133,87 +128,87 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "ble 8f\n" "7:" // Height 1: Multiply loop: 
Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z19.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "trn1 z18.d, z19.d, z22.d\n" - "trn2 z19.d, z19.d, z22.d\n" - ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" - ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45119a68 // smmla z8.s, z19.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n" - ".inst 0x45109a6c // smmla z12.s, z19.b, z16.b\n" - "ld1b { z16.b 
}, p2/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45119a69 // smmla z9.s, z19.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45109a6d // smmla z13.s, z19.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45119a6a // smmla z10.s, z19.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45109a6e // smmla z14.s, z19.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45119a6b // smmla z11.s, z19.b, z17.b\n" - ".inst 0x45109a6f // smmla z15.s, z19.b, z16.b\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" "bgt 7b\n" "8:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x8\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z18.d, z1.d, z19.d\n" - ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL 
VL]\n" - "trn2 z1.d, z1.d, z19.d\n" - ".inst 0x45079a49 // smmla z9.s, z18.b, z7.b\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" - "ld1b { z26.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x451a9a4b // smmla z11.s, z18.b, z26.b\n" - ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" "ble 9f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + 
".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n" - ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" "9:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -221,29 +216,29 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "bne 4b\n" "uzp1 z8.d, z8.d, z12.d\n" "uzp1 z9.d, z9.d, z13.d\n" - "ld1w { z19.s }, p2/Z, [x14]\n" - "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" - "uzp1 z10.d, z10.d, z14.d\n" - "uzp1 z11.d, z11.d, z15.d\n" - "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "mov z15.d, z8.d\n" - "add z9.s, z9.s, z18.s\n" - "add z15.s, z15.s, z19.s\n" - "add z10.s, z10.s, z17.s\n" - "add z11.s, z11.s, z16.s\n" - "tbz %x[flags], #4, 10f\n" "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "uzp1 z10.d, z10.d, z14.d\n" + "uzp1 z11.d, z11.d, z15.d\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "mov z15.d, z8.d\n" + "add z9.s, z9.s, z1.s\n" + "add z15.s, z15.s, z0.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 10f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, 
p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" + "addvl x14, x14, #4\n" "b 11f\n" "10:" // Height 1: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" @@ -257,64 +252,49 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" "11:" // Height 1: parameters loaded - ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - "tbz %x[flags], #5, 12f\n" - "and z19.d, z15.d, z0.d\n" - "and z18.d, z9.d, z1.d\n" - "and z17.d, z10.d, z2.d\n" - "and z16.d, z11.d, z3.d\n" - "asr z19.s, z19.s, #0x1f\n" - "asr z18.s, z18.s, #0x1f\n" - "asr z17.s, z17.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z15.s, z15.s, z19.s\n" - "sqadd z9.s, z9.s, z18.s\n" - "sqadd z10.s, z10.s, z17.s\n" - "sqadd z11.s, z11.s, z16.s\n" - "12:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + ".inst 0x04a471ef // sqdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "ld1rw { z17.s }, p2/Z, [x20]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z15.s, z15.s, 
z17.s\n" - "add x20, %x[qp], %[minval]\n" - "add z9.s, z9.s, z17.s\n" - "add z10.s, z10.s, z17.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z11.s, z11.s, z17.s\n" - "smin z15.s, p2/M, z15.s, z16.s\n" - "smin z9.s, p2/M, z9.s, z16.s\n" - "smin z10.s, p2/M, z10.s, z16.s\n" - "smin z11.s, p2/M, z11.s, z16.s\n" - "smax z15.s, p2/M, z15.s, z28.s\n" - "smax z9.s, p2/M, z9.s, z28.s\n" - "smax z10.s, p2/M, z10.s, z28.s\n" - "smax z11.s, p2/M, z11.s, z28.s\n" + "add z15.s, z15.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z15.h, z15.h, z9.h\n" - "uzp1 z16.h, z10.h, z11.h\n" - "uzp1 z15.b, z15.b, z16.b\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z15.b, z15.b, z9.b\n" "st1b { z15.b }, p1, [x9]\n" "addvl x9, x9, #1\n" - "13:" // Height 1: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" "bgt 2b\n" - "b 80f\n" - "14:" // Height 2 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "b 74f\n" + "13:" // Height 2 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "15:" // Height 2: Column loop + "14:" // Height 2: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -325,156 +305,155 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "whilelt p1.b, x20, x11\n" "mov z14.s, #0x0\n" "mov z15.s, #0x0\n" - "16:" // Height 2: setup done "mov x28, #0x0\n" - "17:" // Height 2: String loop + "16:" // Height 2: String loop "ldr 
x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 18f\n" + "tbz %x[flags], #3, 17f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" - "cbnz x28, 19f\n" + "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" - "b 19f\n" - "18:" // Height 2: setup direct input + "b 18f\n" + "17:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" - "19:" // Height 2: input setup done + "18:" // Height 2: input setup done "cmp x27, #0x10\n" - "ble 21f\n" - "20:" // Height 2: Multiply loop: Main loop head + "ble 20f\n" + "19:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z19.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z18.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z16.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "trn1 z2.d, z18.d, z16.d\n" - "trn2 z18.d, z18.d, z16.d\n" - ".inst 0x45119848 // smmla z8.s, z2.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4513984c // smmla z12.s, z2.b, z19.b\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45119849 // smmla z9.s, z2.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4510984d // smmla z13.s, z2.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4511984a // smmla z10.s, z2.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4510984e // smmla z14.s, z2.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, 
z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4511984b // smmla z11.s, z2.b, z17.b\n" - ".inst 0x4510984f // smmla z15.s, z2.b, z16.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z16.b }, p2/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-6, MUL VL]\n" - ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" - ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" - "bgt 20b\n" - "21:" // Height 2: Multiply loop: Single iteration only + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-4, MUL VL]\n" + 
".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "bgt 19b\n" + "20:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x8\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z19.b }, p0/Z, [x25]\n" - "trn1 z18.d, z1.d, z19.d\n" - ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - "trn2 z1.d, z1.d, z19.d\n" - ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + 
"ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" - ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" - "ble 22f\n" - "ld1b { z17.b }, p2/Z, [x10]\n" - "ld1b { z16.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n" - "ld1b { z17.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n" - "ld1b { z16.b }, p2/Z, [x10, #7, MUL VL]\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "ble 21f\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n" - ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n" - "22:" // Height 2: Multiply loop: multiply skip + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "21:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, 
#0x1\n" "cmp x28, x20\n" - "bne 17b\n" - "uzp1 z20.d, z8.d, z12.d\n" + "bne 16b\n" + "uzp1 z7.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z19.s }, p2/Z, [x14]\n" + "ld1w { z0.s }, p2/Z, [x12]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" - "uzp1 z13.d, z10.d, z14.d\n" - "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "uzp1 z14.d, z11.d, z15.d\n" - "uzp2 z11.d, z11.d, z15.d\n" - "add x26, x9, x20\n" - "mov z15.d, z20.d\n" - "add z12.s, z12.s, z18.s\n" - "add z13.s, z13.s, z17.s\n" - "add z8.s, z8.s, z19.s\n" - "add z15.s, z15.s, z19.s\n" - "add z14.s, z14.s, z16.s\n" - "add z9.s, z9.s, z18.s\n" - "add z10.s, z10.s, z17.s\n" - "add z11.s, z11.s, z16.s\n" - "tbz %x[flags], #4, 23f\n" - "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "uzp1 z13.d, z10.d, z14.d\n" + "uzp2 z10.d, z10.d, z14.d\n" "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" "addvl x12, x12, #4\n" + "uzp1 z14.d, z11.d, z15.d\n" + "uzp2 z11.d, z11.d, z15.d\n" + "add x27, x9, x20\n" + "mov z15.d, z7.d\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z8.s, z8.s, z0.s\n" + "add z15.s, z15.s, z0.s\n" + "add z14.s, z14.s, z3.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 22f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s 
}, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 24f\n" - "23:" // Height 2: per layer parameters + "addvl x14, x14, #4\n" + "b 23f\n" + "22:" // Height 2: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -485,101 +464,74 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "24:" // Height 2: parameters loaded - ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" - ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" - ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" - ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - "tbz %x[flags], #5, 25f\n" - "and z18.d, z15.d, z0.d\n" - "and z19.d, z12.d, z1.d\n" - "and z17.d, z13.d, z2.d\n" - "and z16.d, z14.d, z3.d\n" - "asr z18.s, z18.s, #0x1f\n" - "asr z19.s, z19.s, #0x1f\n" - "asr z17.s, z17.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z15.s, z15.s, z18.s\n" - "and z18.d, z8.d, z0.d\n" - "sqadd z12.s, z12.s, z19.s\n" - "and z19.d, z9.d, z1.d\n" - "sqadd z13.s, z13.s, z17.s\n" - "sqadd z14.s, z14.s, z16.s\n" - "and z17.d, z10.d, z2.d\n" - "and z16.d, z11.d, z3.d\n" - "asr z18.s, z18.s, #0x1f\n" - "asr z19.s, z19.s, #0x1f\n" - "asr z17.s, z17.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z8.s, z8.s, z18.s\n" - "sqadd z9.s, z9.s, z19.s\n" - "sqadd z10.s, z10.s, z17.s\n" - "sqadd z11.s, z11.s, z16.s\n" - "25:" // Height 2: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "23:" // Height 2: parameters loaded + ".inst 0x04a471ef // sqdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a5718c // sqdmulh z12.s, z12.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a671ad // sqdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a771ce // 
sqdmulh z14.s, z14.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z15.s, z15.s, z18.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z12.s, z12.s, z18.s\n" - "add z13.s, z13.s, z18.s\n" - "ld1rw { z17.s }, p2/Z, [x20]\n" - "add z14.s, z14.s, z18.s\n" - "add z8.s, z8.s, z18.s\n" - "add x20, %x[qp], %[minval]\n" - "add z9.s, z9.s, z18.s\n" - "add z10.s, z10.s, z18.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z11.s, z11.s, z18.s\n" - "smin z15.s, p2/M, z15.s, z17.s\n" - "smin z12.s, p2/M, z12.s, z17.s\n" - "smin z13.s, p2/M, z13.s, z17.s\n" - "smin z14.s, p2/M, z14.s, z17.s\n" - "smin z8.s, p2/M, z8.s, z17.s\n" - "smin z9.s, p2/M, z9.s, z17.s\n" - "smin z10.s, p2/M, z10.s, z17.s\n" - "smin z11.s, p2/M, z11.s, z17.s\n" - "smax z15.s, p2/M, z15.s, z16.s\n" - "smax z12.s, p2/M, z12.s, z16.s\n" - "smax z13.s, p2/M, z13.s, z16.s\n" - "smax z14.s, p2/M, z14.s, z16.s\n" - "smax z8.s, p2/M, z8.s, z16.s\n" - "smax z9.s, p2/M, z9.s, z16.s\n" - "smax z10.s, p2/M, z10.s, z16.s\n" - "smax z11.s, p2/M, z11.s, z16.s\n" + "add z15.s, z15.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" 
+ "add z11.s, z11.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z15.h, z15.h, z12.h\n" - "uzp1 z17.h, z13.h, z14.h\n" + "uzp1 z12.h, z13.h, z14.h\n" "uzp1 z8.h, z8.h, z9.h\n" - "uzp1 z16.h, z10.h, z11.h\n" - "uzp1 z15.b, z15.b, z17.b\n" - "uzp1 z8.b, z8.b, z16.b\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z15.b, z15.b, z12.b\n" + "uzp1 z8.b, z8.b, z9.b\n" "st1b { z15.b }, p1, [x9]\n" "addvl x9, x9, #1\n" - "st1b { z8.b }, p1, [x26]\n" - "26:" // Height 2: Writeback done + "st1b { z8.b }, p1, [x27]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 15b\n" - "b 80f\n" - "27:" // Height 3 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 14b\n" + "b 74f\n" + "25:" // Height 3 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "28:" // Height 3: Column loop + "26:" // Height 3: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -598,207 +550,206 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "29:" // Height 3: setup done "mov x28, #0x0\n" - "30:" // Height 3: String loop + "28:" // Height 3: String loop "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 31f\n" + "tbz %x[flags], #3, 29f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" - "cbnz x28, 32f\n" + "cbnz x28, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" - "b 32f\n" - "31:" // Height 3: setup direct input + "b 30f\n" + "29:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" - "32:" // Height 3: input setup done + "30:" // Height 3: input setup done "cmp x27, #0x10\n" - "ble 34f\n" - "33:" // Height 3: Multiply loop: Main loop head + "ble 32f\n" + "31:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z28.b }, p2/Z, [x10]\n" - "ld1b { z25.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z24.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z29.b }, p0/Z, [x24]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "trn1 z27.d, z30.d, z24.d\n" - "trn2 z30.d, z30.d, z24.d\n" - "trn1 z26.d, z29.d, z31.d\n" - "trn2 z29.d, z29.d, z31.d\n" - ".inst 0x451c9b68 // smmla z8.s, z27.b, z28.b\n" - ".inst 0x45199b6c // smmla z12.s, z27.b, z25.b\n" - ".inst 0x451c9b50 // smmla z16.s, z26.b, z28.b\n" - "ld1b { z4.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x45199b54 // smmla z20.s, z26.b, z25.b\n" - "ld1b { z28.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45049b69 // smmla z9.s, z27.b, z4.b\n" - ".inst 0x45049b51 // smmla z17.s, z26.b, z4.b\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x451c9b6d // 
smmla z13.s, z27.b, z28.b\n" - ".inst 0x451c9b55 // smmla z21.s, z26.b, z28.b\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" - ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" - ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" - ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" - ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" - ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n" - ".inst 0x45199bb0 // smmla z16.s, z29.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-6, MUL VL]\n" - ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n" - ".inst 0x45189bb4 // smmla z20.s, z29.b, z24.b\n" - "ld1b { 
z24.b }, p2/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n" - ".inst 0x45199bb1 // smmla z17.s, z29.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n" - ".inst 0x45189bb5 // smmla z21.s, z29.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n" - ".inst 0x45199bb2 // smmla z18.s, z29.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n" - ".inst 0x45189bb6 // smmla z22.s, z29.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n" - ".inst 0x45199bb3 // smmla z19.s, z29.b, z25.b\n" - ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n" - ".inst 0x45189bb7 // smmla z23.s, z29.b, z24.b\n" - "bgt 33b\n" - "34:" // Height 3: Multiply loop: Single iteration only + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 
0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "bgt 31b\n" + "32:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x8\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z24.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z27.d, z1.d, z24.d\n" - "trn2 z1.d, z1.d, z24.d\n" - "trn1 z26.d, z3.d, z29.d\n" - ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n" - ".inst 0x451c9b6c // smmla z12.s, z27.b, z28.b\n" - "trn2 z3.d, z3.d, z29.d\n" - ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x451c9b54 // smmla z20.s, z26.b, z28.b\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" - ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" - ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" - ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" - ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, 
p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" - ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" - ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" - ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" - "ble 35f\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n" - ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n" - ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n" - ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n" - ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n" - ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n" - ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, 
z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ble 33f\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n" - ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n" - ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n" - ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n" - "35:" // Height 3: Multiply loop: multiply skip + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "33:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 30b\n" - "uzp1 z28.d, z8.d, z12.d\n" + "bne 28b\n" + "uzp1 z7.d, z8.d, z12.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z8.d, z8.d, z12.d\n" - "ld1w { z27.s }, p2/Z, [x14]\n" + "ld1w { z0.s }, p2/Z, [x12]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { 
z26.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "add x26, x9, x20\n" + "add x27, x9, x20\n" "uzp1 z16.d, z16.d, z20.d\n" "uzp1 z17.d, z17.d, z21.d\n" - "add x25, x26, x20\n" + "add x26, x27, x20\n" "uzp1 z18.d, z18.d, z22.d\n" "uzp1 z19.d, z19.d, z23.d\n" - "mov z23.d, z28.d\n" - "add z12.s, z12.s, z26.s\n" - "add z13.s, z13.s, z25.s\n" - "add z14.s, z14.s, z24.s\n" - "add z23.s, z23.s, z27.s\n" - "add z8.s, z8.s, z27.s\n" - "add z9.s, z9.s, z26.s\n" - "add z10.s, z10.s, z25.s\n" - "add z11.s, z11.s, z24.s\n" - "add z16.s, z16.s, z27.s\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z25.s\n" - "add z19.s, z19.s, z24.s\n" - "tbz %x[flags], #4, 36f\n" - "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" - "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" - "addvl x12, x12, #4\n" + "mov z23.d, z7.d\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 34f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL 
VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 37f\n" - "36:" // Height 3: per layer parameters + "addvl x14, x14, #4\n" + "b 35f\n" + "34:" // Height 3: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -809,137 +760,98 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "37:" // Height 3: parameters loaded - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" - ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" - ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - "tbz %x[flags], #5, 38f\n" - "and z24.d, z23.d, z0.d\n" - "and z22.d, z12.d, z1.d\n" - "and z21.d, z13.d, z2.d\n" - "and z20.d, z14.d, z3.d\n" - "asr z24.s, z24.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z23.s, z23.s, z24.s\n" - "and z24.d, z8.d, z0.d\n" - "sqadd z12.s, z12.s, z22.s\n" - "and z22.d, z9.d, z1.d\n" - "sqadd z13.s, z13.s, z21.s\n" - "sqadd z14.s, z14.s, z20.s\n" - "and z21.d, z10.d, z2.d\n" - "and z20.d, z11.d, z3.d\n" - "asr z24.s, z24.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z8.s, z8.s, z24.s\n" - "and z24.d, z16.d, z0.d\n" - "sqadd z9.s, z9.s, z22.s\n" - "and z22.d, z17.d, z1.d\n" - "sqadd z10.s, z10.s, z21.s\n" - "sqadd 
z11.s, z11.s, z20.s\n" - "and z21.d, z18.d, z2.d\n" - "and z20.d, z19.d, z3.d\n" - "asr z24.s, z24.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z16.s, z16.s, z24.s\n" - "sqadd z17.s, z17.s, z22.s\n" - "sqadd z18.s, z18.s, z21.s\n" - "sqadd z19.s, z19.s, z20.s\n" - "38:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "35:" // Height 3: parameters loaded + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a5718c // sqdmulh z12.s, z12.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a671ad // sqdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a771ce // sqdmulh z14.s, z14.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z22.s }, p2/Z, [x20]\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z23.s, z23.s, z22.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z12.s, z12.s, z22.s\n" - "add z13.s, z13.s, z22.s\n" + "add z23.s, z23.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, 
z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z14.s, z14.s, z22.s\n" - "add z8.s, z8.s, z22.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z9.s, z9.s, z22.s\n" - "add z10.s, z10.s, z22.s\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" - "add z11.s, z11.s, z22.s\n" - "add z16.s, z16.s, z22.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z22.s\n" - "add z18.s, z18.s, z22.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z22.s\n" - "smin z23.s, p2/M, z23.s, z21.s\n" - "smin z12.s, p2/M, z12.s, z21.s\n" - "smin z13.s, p2/M, z13.s, z21.s\n" - "smin z14.s, p2/M, z14.s, z21.s\n" - "smin z8.s, p2/M, z8.s, z21.s\n" - "smin z9.s, p2/M, z9.s, z21.s\n" - "smin z10.s, p2/M, z10.s, z21.s\n" - "smin z11.s, p2/M, z11.s, z21.s\n" - "smin z16.s, p2/M, z16.s, z21.s\n" - "smin z17.s, p2/M, z17.s, z21.s\n" - "smin z18.s, p2/M, z18.s, z21.s\n" - "smin z19.s, p2/M, z19.s, z21.s\n" - "smax z23.s, p2/M, z23.s, z20.s\n" - "smax z12.s, p2/M, z12.s, z20.s\n" - "smax z13.s, p2/M, z13.s, z20.s\n" - "smax z14.s, p2/M, z14.s, z20.s\n" - "smax z8.s, p2/M, z8.s, z20.s\n" - "smax z9.s, p2/M, z9.s, z20.s\n" - "smax z10.s, p2/M, z10.s, z20.s\n" - "smax z11.s, p2/M, z11.s, z20.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, 
z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z23.h, z23.h, z12.h\n" - "smax z16.s, p2/M, z16.s, z20.s\n" - "smax z17.s, p2/M, z17.s, z20.s\n" - "uzp1 z21.h, z13.h, z14.h\n" - "smax z18.s, p2/M, z18.s, z20.s\n" - "smax z19.s, p2/M, z19.s, z20.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "uzp1 z20.h, z10.h, z11.h\n" + "uzp1 z9.h, z10.h, z11.h\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z23.b, z23.b, z21.b\n" + "uzp1 z23.b, z23.b, z12.b\n" "uzp1 z17.h, z18.h, z19.h\n" - "uzp1 z8.b, z8.b, z20.b\n" + "uzp1 z8.b, z8.b, z9.b\n" "st1b { z23.b }, p1, [x9]\n" "addvl x9, x9, #1\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z8.b }, p1, [x26]\n" - "st1b { z16.b }, p1, [x25]\n" - "39:" // Height 3: Writeback done + "st1b { z8.b }, p1, [x27]\n" + "st1b { z16.b }, p1, [x26]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 28b\n" - "b 80f\n" - "40:" // Height 4 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 26b\n" + "b 74f\n" + "37:" // Height 4 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "41:" // Height 4: Column loop + "38:" // Height 4: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -958,222 +870,221 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "42:" // Height 4: setup done "mov x28, #0x0\n" - "43:" // 
Height 4: String loop + "40:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 44f\n" + "tbz %x[flags], #3, 41f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" "ldr x25, [x20, #0x8]\n" "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" - "cbnz x28, 45f\n" + "cbnz x28, 42f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 45f\n" - "44:" // Height 4: setup direct input + "b 42f\n" + "41:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" - "45:" // Height 4: input setup done + "42:" // Height 4: input setup done "cmp x27, #0x10\n" - "ble 47f\n" - "46:" // Height 4: Multiply loop: Main loop head + "ble 44f\n" + "43:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z31.b }, p2/Z, [x10]\n" - "ld1b { z30.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z29.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z25.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z28.b }, p0/Z, [x24]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "ld1rqb { z24.b }, p0/Z, [x23]\n" + "ld1rqb { z4.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" - "trn1 z27.d, z29.d, z25.d\n" - "trn2 z29.d, z29.d, z25.d\n" - "trn1 z26.d, z28.d, z24.d\n" - "trn2 z28.d, z28.d, z24.d\n" - ".inst 0x451f9b68 // smmla z8.s, z27.b, z31.b\n" - ".inst 0x451e9b6c // smmla z12.s, z27.b, z30.b\n" - ".inst 0x451f9b50 // smmla z16.s, z26.b, z31.b\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x451e9b54 // 
smmla z20.s, z26.b, z30.b\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" - ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" - ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" - ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" - ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" - ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" - ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" - ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45199ba8 
// smmla z8.s, z29.b, z25.b\n" - ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-6, MUL VL]\n" - ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n" - ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n" - ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n" - ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n" - ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n" - ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n" - ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n" - ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n" - ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n" - "bgt 46b\n" - "47:" // Height 4: Multiply loop: Single iteration only + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b 
{ z6.b }, p2/Z, [x10, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "bgt 43b\n" + "44:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z29.b }, p2/Z, [x10]\n" - "ld1b { z28.b }, p2/Z, [x10, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" "subs x27, x27, #0x8\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z25.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z24.b }, p0/Z, [x23]\n" - "trn1 z27.d, z1.d, z25.d\n" - "trn2 z1.d, z1.d, z25.d\n" - "trn1 z26.d, z3.d, z24.d\n" - ".inst 0x451d9b68 // smmla z8.s, z27.b, z29.b\n" - ".inst 0x451c9b6c // smmla z12.s, z27.b, z28.b\n" - "trn2 z3.d, z3.d, z24.d\n" - ".inst 0x451d9b50 // smmla z16.s, z26.b, z29.b\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x451c9b54 // smmla z20.s, z26.b, z28.b\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" - ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" - ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" - ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" - ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" 
+ "ld1rqb { z4.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + "trn2 z3.d, z3.d, z4.d\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" - ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" - ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" - ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" - "ble 48f\n" - "ld1b { z25.b }, p2/Z, [x10]\n" - "ld1b { z24.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n" - ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n" - ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n" - ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n" - ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n" - ".inst 
0x45199872 // smmla z18.s, z3.b, z25.b\n" - "ld1b { z25.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n" - ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x10, #7, MUL VL]\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + "ble 45f\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n" - ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n" - ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n" - ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n" - "48:" // Height 4: Multiply loop: multiply skip + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + "45:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" 
"cmp x28, x20\n" - "bne 43b\n" - "uzp1 z28.d, z8.d, z12.d\n" + "bne 40b\n" + "uzp1 z7.d, z8.d, z12.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z8.d, z8.d, z12.d\n" - "ld1w { z27.s }, p2/Z, [x14]\n" + "ld1w { z0.s }, p2/Z, [x12]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "add x26, x9, x20\n" + "add x27, x9, x20\n" "uzp1 z15.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x25, x26, x20\n" + "add x26, x27, x20\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" - "add x24, x25, x20\n" + "add x25, x26, x20\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "mov z23.d, z28.d\n" - "add z12.s, z12.s, z26.s\n" - "add z13.s, z13.s, z25.s\n" - "add z14.s, z14.s, z24.s\n" - "add z23.s, z23.s, z27.s\n" - "add z8.s, z8.s, z27.s\n" - "add z9.s, z9.s, z26.s\n" - "add z10.s, z10.s, z25.s\n" - "add z11.s, z11.s, z24.s\n" - "add z15.s, z15.s, z27.s\n" - "add z20.s, z20.s, z26.s\n" - "add z21.s, z21.s, z25.s\n" - "add z22.s, z22.s, z24.s\n" - "add z16.s, z16.s, z27.s\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z25.s\n" - "add z19.s, z19.s, z24.s\n" - "tbz %x[flags], #4, 49f\n" - "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" - "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" - "addvl x12, 
x12, #4\n" + "mov z23.d, z7.d\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z15.s, z15.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 46f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 50f\n" - "49:" // Height 4: per layer parameters + "addvl x14, x14, #4\n" + "b 47f\n" + "46:" // Height 4: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -1184,173 +1095,122 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "50:" // Height 4: parameters loaded - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" - ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" - ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" - ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n" - ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n" - ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 
0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - "tbz %x[flags], #5, 51f\n" - "and z27.d, z23.d, z0.d\n" - "and z26.d, z12.d, z1.d\n" - "and z25.d, z13.d, z2.d\n" - "and z24.d, z14.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z23.s, z23.s, z27.s\n" - "and z27.d, z8.d, z0.d\n" - "sqadd z12.s, z12.s, z26.s\n" - "and z26.d, z9.d, z1.d\n" - "sqadd z13.s, z13.s, z25.s\n" - "sqadd z14.s, z14.s, z24.s\n" - "and z25.d, z10.d, z2.d\n" - "and z24.d, z11.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z8.s, z8.s, z27.s\n" - "and z27.d, z15.d, z0.d\n" - "sqadd z9.s, z9.s, z26.s\n" - "and z26.d, z20.d, z1.d\n" - "sqadd z10.s, z10.s, z25.s\n" - "sqadd z11.s, z11.s, z24.s\n" - "and z25.d, z21.d, z2.d\n" - "and z24.d, z22.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z15.s, z15.s, z27.s\n" - "and z27.d, z16.d, z0.d\n" - "sqadd z20.s, z20.s, z26.s\n" - "and z26.d, z17.d, z1.d\n" - "sqadd z21.s, z21.s, z25.s\n" - "sqadd z22.s, z22.s, z24.s\n" - "and z25.d, z18.d, z2.d\n" - "and z24.d, z19.d, z3.d\n" - "asr z27.s, z27.s, #0x1f\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z16.s, z16.s, z27.s\n" - "sqadd z17.s, z17.s, z26.s\n" - "sqadd z18.s, z18.s, z25.s\n" - "sqadd z19.s, z19.s, z24.s\n" - "51:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "47:" // Height 4: parameters loaded + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a5718c // sqdmulh z12.s, z12.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a671ad // sqdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a771ce // 
sqdmulh z14.s, z14.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + ".inst 0x04a471ef // sqdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57294 // sqdmulh z20.s, z20.s, z5.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + ".inst 0x04a672b5 // sqdmulh z21.s, z21.s, z6.s\n" + ".inst 0x04a772d6 // sqdmulh z22.s, z22.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z23.s, z23.s, z25.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z12.s, z12.s, z25.s\n" - "add z13.s, z13.s, z25.s\n" ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" - "add z14.s, z14.s, z25.s\n" - "add z8.s, z8.s, z25.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z25.s\n" - "add z10.s, z10.s, z25.s\n" + "add z23.s, z23.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z25.s\n" - "add z15.s, z15.s, z25.s\n" + "add z13.s, z13.s, 
z4.s\n" + "add z14.s, z14.s, z4.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z20.s, z20.s, z25.s\n" - "add z21.s, z21.s, z25.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z25.s\n" - "add z16.s, z16.s, z25.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z25.s\n" - "add z18.s, z18.s, z25.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z25.s\n" - "smin z23.s, p2/M, z23.s, z24.s\n" - "smin z12.s, p2/M, z12.s, z24.s\n" - "smin z13.s, p2/M, z13.s, z24.s\n" - "smin z14.s, p2/M, z14.s, z24.s\n" - "smin z8.s, p2/M, z8.s, z24.s\n" - "smin z9.s, p2/M, z9.s, z24.s\n" - "smin z10.s, p2/M, z10.s, z24.s\n" - "smin z11.s, p2/M, z11.s, z24.s\n" - "smin z15.s, p2/M, z15.s, z24.s\n" - "smin z20.s, p2/M, z20.s, z24.s\n" - "smin z21.s, p2/M, z21.s, z24.s\n" - "smin z22.s, p2/M, z22.s, z24.s\n" - "smin z16.s, p2/M, z16.s, z24.s\n" - "smin z17.s, p2/M, z17.s, z24.s\n" - "smin z18.s, p2/M, z18.s, z24.s\n" - "smin z19.s, p2/M, z19.s, z24.s\n" - "smax z23.s, p2/M, z23.s, z26.s\n" - "smax z12.s, p2/M, z12.s, z26.s\n" - "smax z13.s, p2/M, z13.s, z26.s\n" - "smax z14.s, p2/M, z14.s, z26.s\n" - "smax z8.s, p2/M, z8.s, z26.s\n" - "smax z9.s, p2/M, z9.s, z26.s\n" - "smax z10.s, p2/M, z10.s, z26.s\n" - "smax z11.s, p2/M, z11.s, z26.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "add z15.s, z15.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z20.s, 
p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z23.h, z23.h, z12.h\n" - "smax z15.s, p2/M, z15.s, z26.s\n" - "smax z20.s, p2/M, z20.s, z26.s\n" - "uzp1 z25.h, z13.h, z14.h\n" - "smax z21.s, p2/M, z21.s, z26.s\n" - "smax z22.s, p2/M, z22.s, z26.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z26.s\n" - "smax z17.s, p2/M, z17.s, z26.s\n" - "uzp1 z24.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z26.s\n" - "smax z19.s, p2/M, z19.s, z26.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z15.h, z15.h, z20.h\n" - "uzp1 z23.b, z23.b, z25.b\n" + "uzp1 z23.b, z23.b, z12.b\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z24.b\n" + "uzp1 z8.b, z8.b, z9.b\n" "uzp1 z17.h, z18.h, z19.h\n" "st1b { z23.b }, p1, [x9]\n" "addvl x9, x9, #1\n" "uzp1 z15.b, z15.b, z20.b\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z8.b }, p1, [x26]\n" - "st1b { z15.b }, p1, [x25]\n" - "st1b { z16.b }, p1, [x24]\n" - "52:" // Height 4: Writeback done + "st1b { z8.b }, p1, [x27]\n" + "st1b { z15.b }, p1, [x26]\n" + "st1b { z16.b }, p1, [x25]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 41b\n" - "b 80f\n" - "53:" // Height 5 - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], 
%[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "bgt 38b\n" + "b 74f\n" + "49:" // Height 5 + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "54:" // Height 5: Column loop + "50:" // Height 5: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -1377,13 +1237,12 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "55:" // Height 5: setup done "mov x28, #0x0\n" - "56:" // Height 5: String loop + "52:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -1391,259 +1250,259 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "ldr x24, [x20, #0x10]\n" "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" - "cbnz x28, 58f\n" + "cbnz x28, 54f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 58f\n" - "57:" // Height 5: setup direct input + "b 54f\n" + "53:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "58:" // Height 5: input setup done + "54:" // Height 5: input setup done "cmp x27, #0x10\n" - "ble 60f\n" - "59:" // Height 5: Multiply loop: Main loop head + "ble 56f\n" + "55:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z1.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - 
"ld1rqb { z6.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z3.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z7.b }, p0/Z, [x24]\n" - "ld1rqb { z2.b }, p0/Z, [x23]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "ld1rqb { z4.b }, p0/Z, [x23]\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "trn1 z4.d, z6.d, z3.d\n" - "trn2 z6.d, z6.d, z3.d\n" - "trn1 z3.d, z7.d, z2.d\n" - "trn2 z7.d, z7.d, z2.d\n" - "trn1 z2.d, z5.d, z0.d\n" - "trn2 z5.d, z5.d, z0.d\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45019888 // smmla z8.s, z4.b, z1.b\n" - ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n" - ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4500988c // smmla z12.s, z4.b, z0.b\n" - ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" - ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45019889 // smmla z9.s, z4.b, z1.b\n" - ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n" - ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4500988d // smmla z13.s, z4.b, z0.b\n" - ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" - ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4501988a // smmla z10.s, z4.b, z1.b\n" - ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n" - ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4500988e // smmla z14.s, z4.b, z0.b\n" - ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" - ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, 
p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4501988b // smmla z11.s, z4.b, z1.b\n" - ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n" - ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n" - ".inst 0x4500988f // smmla z15.s, z4.b, z0.b\n" - ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" - ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z0.b }, p2/Z, [x10, #-7, MUL VL]\n" - ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n" - ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n" - ".inst 0x450198b8 // smmla z24.s, z5.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-6, MUL VL]\n" - ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n" - ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n" - ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #-5, MUL 
VL]\n" - ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n" - ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n" - ".inst 0x450198b9 // smmla z25.s, z5.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-4, MUL VL]\n" - ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n" - ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n" - ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #-3, MUL VL]\n" - ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n" - ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n" - ".inst 0x450198ba // smmla z26.s, z5.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-2, MUL VL]\n" - ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n" - ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n" - ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #-1, MUL VL]\n" - ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n" - ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n" - ".inst 0x450198bb // smmla z27.s, z5.b, z1.b\n" - ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n" - ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n" - ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" - "bgt 59b\n" - "60:" // Height 5: Multiply loop: Single iteration only + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + 
".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "bgt 55b\n" + "56:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z2.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" "subs x27, x27, #0x8\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z6.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" "ld1rqb { z4.b }, p0/Z, [x23]\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z7.d, z1.d, z6.d\n" - "trn2 z1.d, z1.d, z6.d\n" - "trn1 z6.d, z3.d, z4.d\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z0.d\n" - "trn2 z5.d, z5.d, z0.d\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n" - ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n" - ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" - ".inst 0x450098d4 // smmla z20.s, 
z6.b, z0.b\n" - ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n" - ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n" - ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" - ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n" - ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n" - ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" - ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" - ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n" - ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, 
z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n" - ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n" - ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n" - ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" - ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n" - ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" - "ble 61f\n" - "ld1b { z2.b }, p2/Z, [x10]\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n" - ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n" - ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n" - ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" - ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n" - ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n" - ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n" - ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" - ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n" - ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n" - ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n" - ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" - ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + 
".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ble 57f\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n" - ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n" - ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n" - ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n" - ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" - ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" - "61:" // Height 5: Multiply loop: multiply skip + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "57:" // Height 
5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 56b\n" + "bne 52b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z4.d, z8.d, z12.d\n" + "uzp1 z7.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "ld1w { z3.s }, p2/Z, [x14]\n" + "ld1w { z0.s }, p2/Z, [x12]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "add x26, x9, x20\n" + "add x27, x9, x20\n" "uzp1 z15.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x25, x26, x20\n" + "add x26, x27, x20\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" - "add x24, x25, x20\n" + "add x25, x26, x20\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x24, x20\n" + "add x24, x25, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" "uzp1 z24.d, z24.d, z28.d\n" "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z4.d\n" - "add z12.s, z12.s, z2.s\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z0.s\n" - "add z31.s, z31.s, z3.s\n" - "add z8.s, z8.s, z3.s\n" - "add z9.s, z9.s, z2.s\n" - "add z10.s, z10.s, z1.s\n" - "add z11.s, z11.s, z0.s\n" - "add z15.s, z15.s, z3.s\n" - "add z20.s, z20.s, z2.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z0.s\n" - "add z16.s, z16.s, z3.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z1.s\n" - "add z19.s, z19.s, z0.s\n" - "add z24.s, z24.s, z3.s\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z1.s\n" - "add z27.s, z27.s, z0.s\n" - "tbz 
%x[flags], #4, 62f\n" - "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" - "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" - "addvl x12, x12, #4\n" + "mov z31.d, z7.d\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z31.s, z31.s, z0.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z15.s, z15.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 58f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 63f\n" - "62:" // Height 5: per layer parameters + "addvl x14, x14, #4\n" + "b 59f\n" + "58:" // Height 5: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -1654,213 +1513,150 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "63:" // Height 5: parameters loaded - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" - ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" - ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" - ".inst 
0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" - ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n" - ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n" - ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" - ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" - ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" - "tbz %x[flags], #5, 64f\n" - "and z30.d, z31.d, z0.d\n" - "and z29.d, z12.d, z1.d\n" - "and z28.d, z13.d, z2.d\n" - "and z23.d, z14.d, z3.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z31.s, z31.s, z30.s\n" - "and z30.d, z8.d, z0.d\n" - "sqadd z12.s, z12.s, z29.s\n" - "and z29.d, z9.d, z1.d\n" - "sqadd z13.s, z13.s, z28.s\n" - "sqadd z14.s, z14.s, z23.s\n" - "and z28.d, z10.d, z2.d\n" - "and z23.d, z11.d, z3.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z8.s, z8.s, z30.s\n" - "and z30.d, z15.d, z0.d\n" - "sqadd z9.s, z9.s, z29.s\n" - "and z29.d, z20.d, z1.d\n" - "sqadd z10.s, z10.s, z28.s\n" - "sqadd z11.s, z11.s, z23.s\n" - "and z28.d, z21.d, z2.d\n" - "and z23.d, z22.d, z3.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z15.s, z15.s, z30.s\n" - "and z30.d, z16.d, z0.d\n" - "sqadd z20.s, z20.s, z29.s\n" - "and z29.d, z17.d, z1.d\n" - "sqadd z21.s, z21.s, z28.s\n" - 
"sqadd z22.s, z22.s, z23.s\n" - "and z28.d, z18.d, z2.d\n" - "and z23.d, z19.d, z3.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z16.s, z16.s, z30.s\n" - "and z30.d, z24.d, z0.d\n" - "sqadd z17.s, z17.s, z29.s\n" - "and z29.d, z25.d, z1.d\n" - "sqadd z18.s, z18.s, z28.s\n" - "sqadd z19.s, z19.s, z23.s\n" - "and z28.d, z26.d, z2.d\n" - "and z23.d, z27.d, z3.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z24.s, z24.s, z30.s\n" - "sqadd z25.s, z25.s, z29.s\n" - "sqadd z26.s, z26.s, z28.s\n" - "sqadd z27.s, z27.s, z23.s\n" - "64:" // Height 5: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "59:" // Height 5: parameters loaded + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a5718c // sqdmulh z12.s, z12.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a671ad // sqdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a771ce // sqdmulh z14.s, z14.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + ".inst 0x04a471ef // sqdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57294 // sqdmulh z20.s, z20.s, z5.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + ".inst 0x04a672b5 // sqdmulh z21.s, z21.s, z6.s\n" + ".inst 0x04a772d6 // sqdmulh z22.s, z22.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, 
z17.s, z5.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z31.s, z31.s, z28.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z12.s, z12.s, z28.s\n" - "add z13.s, z13.s, z28.s\n" ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57339 // sqdmulh z25.s, z25.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" - "add z14.s, z14.s, z28.s\n" - "add z8.s, z8.s, z28.s\n" + ".inst 0x04a6735a // sqdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7737b // sqdmulh z27.s, z27.s, z7.s\n" ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z28.s\n" - "add z10.s, z10.s, z28.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z28.s\n" - "add z15.s, z15.s, z28.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z20.s, z20.s, z28.s\n" - "add z21.s, z21.s, z28.s\n" + "add z31.s, z31.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" - "add z22.s, z22.s, z28.s\n" - "add z16.s, z16.s, z28.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z28.s\n" - "add z18.s, z18.s, z28.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z28.s\n" - "add z24.s, z24.s, z28.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z28.s\n" - "add z26.s, z26.s, z28.s\n" - "ld1rw { 
z29.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z28.s\n" - "smin z31.s, p2/M, z31.s, z23.s\n" - "smin z12.s, p2/M, z12.s, z23.s\n" - "smin z13.s, p2/M, z13.s, z23.s\n" - "smin z14.s, p2/M, z14.s, z23.s\n" - "smin z8.s, p2/M, z8.s, z23.s\n" - "smin z9.s, p2/M, z9.s, z23.s\n" - "smin z10.s, p2/M, z10.s, z23.s\n" - "smin z11.s, p2/M, z11.s, z23.s\n" - "smin z15.s, p2/M, z15.s, z23.s\n" - "smin z20.s, p2/M, z20.s, z23.s\n" - "smin z21.s, p2/M, z21.s, z23.s\n" - "smin z22.s, p2/M, z22.s, z23.s\n" - "smin z16.s, p2/M, z16.s, z23.s\n" - "smin z17.s, p2/M, z17.s, z23.s\n" - "smin z18.s, p2/M, z18.s, z23.s\n" - "smin z19.s, p2/M, z19.s, z23.s\n" - "smin z24.s, p2/M, z24.s, z23.s\n" - "smin z25.s, p2/M, z25.s, z23.s\n" - "smin z26.s, p2/M, z26.s, z23.s\n" - "smin z27.s, p2/M, z27.s, z23.s\n" - "smax z31.s, p2/M, z31.s, z29.s\n" - "smax z12.s, p2/M, z12.s, z29.s\n" - "smax z13.s, p2/M, z13.s, z29.s\n" - "smax z14.s, p2/M, z14.s, z29.s\n" - "smax z8.s, p2/M, z8.s, z29.s\n" - "smax z9.s, p2/M, z9.s, z29.s\n" - "smax z10.s, p2/M, z10.s, z29.s\n" - "smax z11.s, p2/M, z11.s, z29.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "add z15.s, z15.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z16.s, p2/M, z16.s, 
z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z31.h, z31.h, z12.h\n" - "smax z15.s, p2/M, z15.s, z29.s\n" - "smax z20.s, p2/M, z20.s, z29.s\n" - "uzp1 z28.h, z13.h, z14.h\n" - "smax z21.s, p2/M, z21.s, z29.s\n" - "smax z22.s, p2/M, z22.s, z29.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z29.s\n" - "smax z17.s, p2/M, z17.s, z29.s\n" - "uzp1 z23.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z29.s\n" - "smax z19.s, p2/M, z19.s, z29.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z15.h, z15.h, z20.h\n" - "uzp1 z31.b, z31.b, z28.b\n" - "smax z24.s, p2/M, z24.s, z29.s\n" - "smax z25.s, p2/M, z25.s, z29.s\n" + "uzp1 z31.b, z31.b, z12.b\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z26.s, p2/M, z26.s, z29.s\n" - "smax z27.s, p2/M, z27.s, z29.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z23.b\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z8.b, z8.b, z9.b\n" + "uzp1 z17.h, z18.h, z19.h\n" "st1b { z31.b }, p1, [x9]\n" "addvl x9, x9, #1\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z15.b, z15.b, z20.b\n" - "uzp1 z17.h, z26.h, 
z27.h\n" - "st1b { z8.b }, p1, [x26]\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z15.b }, p1, [x25]\n" - "st1b { z16.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "65:" // Height 5: Writeback done + "uzp1 z25.h, z26.h, z27.h\n" + "st1b { z8.b }, p1, [x27]\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z15.b }, p1, [x26]\n" + "st1b { z16.b }, p1, [x25]\n" + "st1b { z24.b }, p1, [x24]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 54b\n" - "b 80f\n" - "66:" // Height 6 + "bgt 50b\n" + "b 74f\n" + "61:" // Height 6 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x6\n" - "mov x14, %x[col_bias]\n" - "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x12, %x[col_bias]\n" + "ldr x14, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_shift_ptr]]\n" "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" "madd x20, x21, x20, x9\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "67:" // Height 6: Column loop + "62:" // Height 6: Column loop "mov x20, #0x0\n" "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" @@ -1887,13 +1683,12 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "68:" // Height 6: setup done "mov x28, #0x0\n" - "69:" // Height 6: String loop + "64:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 70f\n" + "tbz %x[flags], #3, 65f\n" "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x26, [x20, #0x0]\n" @@ -1902,7 +1697,7 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "ldr x23, [x20, #0x18]\n" "ldr x22, [x20, #0x20]\n" "ldr x21, [x20, #0x28]\n" - "cbnz x28, 71f\n" + "cbnz x28, 66f\n" "ldr 
x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" "add x25, x25, x20\n" @@ -1910,219 +1705,219 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 71f\n" - "70:" // Height 6: setup direct input + "b 66f\n" + "65:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" "add x25, x26, x21\n" "add x24, x25, x21\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "71:" // Height 6: input setup done + "66:" // Height 6: input setup done "cmp x27, #0x10\n" - "ble 73f\n" - "72:" // Height 6: Multiply loop: Main loop head + "ble 68f\n" + "67:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1b { z1.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "ld1rqb { z6.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" "add x26, x26, #0x10\n" - "ld1rqb { z3.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "add x25, x25, #0x10\n" - "ld1rqb { z7.b }, p0/Z, [x24]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" "add x24, x24, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x23]\n" + "ld1rqb { z4.b }, p0/Z, [x23]\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z0.b }, p0/Z, [x21]\n" + "ld1rqb { z6.b }, p0/Z, [x21]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "trn1 z4.d, z6.d, z3.d\n" - "trn2 z6.d, z6.d, z3.d\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" "add x21, x21, #0x10\n" - "trn1 z3.d, z7.d, z2.d\n" - "trn2 z7.d, z7.d, z2.d\n" - "trn1 z2.d, z5.d, z0.d\n" - "trn2 z5.d, z5.d, z0.d\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45019888 // smmla z8.s, z4.b, z1.b\n" - ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n" - ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4500988c // smmla z12.s, z4.b, z0.b\n" - ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" - ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #3, 
MUL VL]\n" - ".inst 0x45019889 // smmla z9.s, z4.b, z1.b\n" - ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n" - ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4500988d // smmla z13.s, z4.b, z0.b\n" - ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" - ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4501988a // smmla z10.s, z4.b, z1.b\n" - ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n" - ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4500988e // smmla z14.s, z4.b, z0.b\n" - ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" - ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla 
z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4501988b // smmla z11.s, z4.b, z1.b\n" - ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n" - ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n" - ".inst 0x4500988f // smmla z15.s, z4.b, z0.b\n" - ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" - ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-8, MUL VL]\n" - "ld1b { z0.b }, p2/Z, [x10, #-7, MUL VL]\n" - ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n" - ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n" - ".inst 0x450198b8 // smmla z24.s, z5.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-6, MUL VL]\n" - ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n" - ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n" - ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #-5, MUL VL]\n" - ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n" - ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n" - ".inst 0x450198b9 // smmla z25.s, z5.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-4, MUL VL]\n" - ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n" - ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n" - ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #-3, MUL VL]\n" - ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n" - ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n" - ".inst 0x450198ba // smmla z26.s, z5.b, z1.b\n" - "ld1b { z1.b }, p2/Z, [x10, #-2, MUL VL]\n" - ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n" - ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n" - ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #-1, MUL VL]\n" - ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n" - ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n" - ".inst 0x450198bb // smmla z27.s, z5.b, z1.b\n" - ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n" - ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n" - ".inst 0x450098bf // smmla z31.s, z5.b, 
z0.b\n" - "bgt 72b\n" - "73:" // Height 6: Multiply loop: Single iteration only + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-6, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-3, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #-2, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #-1, MUL VL]\n" + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "bgt 67b\n" + "68:" // Height 6: Multiply loop: 
Single iteration only "whilelt p0.b, XZR, x27\n" - "ld1b { z2.b }, p2/Z, [x10]\n" + "ld1b { z7.b }, p2/Z, [x10]\n" "subs x27, x27, #0x8\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z6.b }, p0/Z, [x25]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" "ld1rqb { z4.b }, p0/Z, [x23]\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z0.b }, p0/Z, [x21]\n" - "trn1 z7.d, z1.d, z6.d\n" - "trn2 z1.d, z1.d, z6.d\n" - "trn1 z6.d, z3.d, z4.d\n" + "ld1rqb { z6.b }, p0/Z, [x21]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + "trn1 z2.d, z3.d, z4.d\n" "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z0.d\n" - "trn2 z5.d, z5.d, z0.d\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n" - ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n" - ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" - ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n" - ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n" - ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n" - ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" - ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n" - ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n" - ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" - ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" - ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n" - ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + "trn1 z4.d, z5.d, z6.d\n" + "trn2 z5.d, z5.d, z6.d\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079808 // 
smmla z8.s, z0.b, z7.b\n" + ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" + ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" + ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" + ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" + ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" + ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" + ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" + ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n" - ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n" - ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n" - ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" - ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n" - ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" - "ble 74f\n" - "ld1b { z2.b }, p2/Z, [x10]\n" - "ld1b { z0.b }, p2/Z, [x10, #1, MUL VL]\n" - ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n" - ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n" - ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #2, MUL VL]\n" - ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n" - ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" - ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #3, MUL VL]\n" - ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n" - 
".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n" - ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #4, MUL VL]\n" - ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n" - ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" - ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #5, MUL VL]\n" - ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n" - ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n" - ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n" - "ld1b { z2.b }, p2/Z, [x10, #6, MUL VL]\n" - ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n" - ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" - ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" - "ld1b { z0.b }, p2/Z, [x10, #7, MUL VL]\n" + ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" + ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" + ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + "ble 69f\n" + "ld1b { z7.b }, p2/Z, [x10]\n" + "ld1b { z6.b }, p2/Z, [x10, #1, MUL VL]\n" + ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" + ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" + ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #2, MUL VL]\n" + ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" + ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" + ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #3, MUL VL]\n" + ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" + ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #4, MUL VL]\n" + ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" + ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" + ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #5, MUL VL]\n" + ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" + ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" + 
".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x10, #6, MUL VL]\n" + ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" + ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n" - ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n" - ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n" - ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n" - ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" - ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" - "74:" // Height 6: Multiply loop: multiply skip + ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" + ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" + ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" + ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + "69:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "bne 69b\n" + "bne 64b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z4.d, z8.d, z12.d\n" + "uzp1 z7.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "ld1w { z3.s }, p2/Z, [x14]\n" + "ld1w { z0.s }, p2/Z, [x12]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "add x26, x9, x20\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "add x27, x9, x20\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" "uzp1 z15.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x25, x26, x20\n" + "add x26, x27, x20\n" "uzp1 z20.d, 
z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" - "add x24, x25, x20\n" + "add x25, x26, x20\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x24, x20\n" + "add x24, x25, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x22, x23, x20\n" + "add x23, x24, x20\n" "uzp1 z23.d, z24.d, z28.d\n" "uzp2 z24.d, z24.d, z28.d\n" "uzp1 z28.d, z25.d, z29.d\n" @@ -2131,44 +1926,44 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z4.d\n" - "add z12.s, z12.s, z2.s\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z0.s\n" - "add z31.s, z31.s, z3.s\n" - "add z8.s, z8.s, z3.s\n" - "add z9.s, z9.s, z2.s\n" - "add z10.s, z10.s, z1.s\n" - "add z11.s, z11.s, z0.s\n" - "add z15.s, z15.s, z3.s\n" - "add z20.s, z20.s, z2.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z0.s\n" - "add z16.s, z16.s, z3.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z1.s\n" - "add z19.s, z19.s, z0.s\n" - "add z23.s, z23.s, z3.s\n" - "add z28.s, z28.s, z2.s\n" - "add z29.s, z29.s, z1.s\n" - "add z30.s, z30.s, z0.s\n" - "add z24.s, z24.s, z3.s\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z1.s\n" - "add z27.s, z27.s, z0.s\n" - "tbz %x[flags], #4, 75f\n" - "ld1w { z0.s }, p2/Z, [x12]\n" - "ld1w { z4.s }, p2/Z, [x13]\n" - "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" - "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" - "addvl x12, x12, #4\n" + "mov z31.d, z7.d\n" + "add z12.s, z12.s, z1.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z3.s\n" + "add z31.s, z31.s, z0.s\n" + "add z8.s, z8.s, z0.s\n" + "add z9.s, z9.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z15.s, z15.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, 
z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 70f\n" + "ld1w { z0.s }, p2/Z, [x13]\n" + "ld1w { z4.s }, p2/Z, [x14]\n" + "ld1w { z1.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x13, #3, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [x14, #3, MUL VL]\n" "addvl x13, x13, #4\n" - "b 76f\n" - "75:" // Height 6: per layer parameters + "addvl x14, x14, #4\n" + "b 71f\n" + "70:" // Height 6: per layer parameters "add x21, %x[qp], %[per_layer_right_shift]\n" "add x20, %x[qp], %[per_layer_mul]\n" "ld1rw { z0.s }, p2/Z, [x21]\n" @@ -2179,248 +1974,173 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "mov z6.d, z4.d\n" "mov z3.d, z0.d\n" "mov z7.d, z4.d\n" - "76:" // Height 6: parameters loaded - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n" - ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n" - ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n" - ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" - ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" - ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" - ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n" - ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n" - ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n" - ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" - ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" - ".inst 0x04a77673 // sqrdmulh z19.s, 
z19.s, z7.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a5779c // sqrdmulh z28.s, z28.s, z5.s\n" - ".inst 0x04a677bd // sqrdmulh z29.s, z29.s, z6.s\n" - ".inst 0x04a777de // sqrdmulh z30.s, z30.s, z7.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" - ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" - ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" - "tbz %x[flags], #5, 77f\n" - "and z7.d, z31.d, z0.d\n" - "and z6.d, z12.d, z1.d\n" - "and z5.d, z13.d, z2.d\n" - "and z4.d, z14.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z31.s, z31.s, z7.s\n" - "and z7.d, z8.d, z0.d\n" - "sqadd z12.s, z12.s, z6.s\n" - "and z6.d, z9.d, z1.d\n" - "sqadd z13.s, z13.s, z5.s\n" - "sqadd z14.s, z14.s, z4.s\n" - "and z5.d, z10.d, z2.d\n" - "and z4.d, z11.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z8.s, z8.s, z7.s\n" - "and z7.d, z15.d, z0.d\n" - "sqadd z9.s, z9.s, z6.s\n" - "and z6.d, z20.d, z1.d\n" - "sqadd z10.s, z10.s, z5.s\n" - "sqadd z11.s, z11.s, z4.s\n" - "and z5.d, z21.d, z2.d\n" - "and z4.d, z22.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z15.s, z15.s, z7.s\n" - "and z7.d, z16.d, z0.d\n" - "sqadd z20.s, z20.s, z6.s\n" - "and z6.d, z17.d, z1.d\n" - "sqadd z21.s, z21.s, z5.s\n" - "sqadd z22.s, z22.s, z4.s\n" - "and z5.d, z18.d, z2.d\n" - "and z4.d, z19.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z16.s, z16.s, z7.s\n" - "and z7.d, z23.d, z0.d\n" - "sqadd z17.s, z17.s, z6.s\n" - "and z6.d, z28.d, z1.d\n" - "sqadd z18.s, z18.s, z5.s\n" - "sqadd z19.s, z19.s, z4.s\n" - "and z5.d, z29.d, z2.d\n" - "and z4.d, z30.d, z3.d\n" - "asr z7.s, z7.s, 
#0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z23.s, z23.s, z7.s\n" - "and z7.d, z24.d, z0.d\n" - "sqadd z28.s, z28.s, z6.s\n" - "and z6.d, z25.d, z1.d\n" - "sqadd z29.s, z29.s, z5.s\n" - "sqadd z30.s, z30.s, z4.s\n" - "and z5.d, z26.d, z2.d\n" - "and z4.d, z27.d, z3.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z24.s, z24.s, z7.s\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z5.s\n" - "sqadd z27.s, z27.s, z4.s\n" - "77:" // Height 6: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "71:" // Height 6: parameters loaded + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a5718c // sqdmulh z12.s, z12.s, z5.s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[maxval]\n" + ".inst 0x04a671ad // sqdmulh z13.s, z13.s, z6.s\n" + ".inst 0x04a771ce // sqdmulh z14.s, z14.s, z7.s\n" + "add x20, %x[qp], %[minval]\n" + ".inst 0x04a47108 // sqdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57129 // sqdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6714a // sqdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7716b // sqdmulh z11.s, z11.s, z7.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" + ".inst 0x04a471ef // sqdmulh z15.s, z15.s, z4.s\n" + ".inst 0x04a57294 // sqdmulh z20.s, z20.s, z5.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" + ".inst 0x04a672b5 // sqdmulh z21.s, z21.s, z6.s\n" + ".inst 0x04a772d6 // sqdmulh z22.s, z22.s, z7.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57231 // sqdmulh z17.s, z17.s, z5.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z31.s, z31.s, z4.s\n" ".inst 0x4482886b // srshl z11.s, 
p2/M, z11.s, z3.s\n" + ".inst 0x04a67252 // sqdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77273 // sqdmulh z19.s, z19.s, z7.s\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z12.s, z12.s, z4.s\n" - "add z13.s, z13.s, z4.s\n" ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a5739c // sqdmulh z28.s, z28.s, z5.s\n" ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" - "add z14.s, z14.s, z4.s\n" - "add z8.s, z8.s, z4.s\n" ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" + ".inst 0x04a673bd // sqdmulh z29.s, z29.s, z6.s\n" + ".inst 0x04a773de // sqdmulh z30.s, z30.s, z7.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57339 // sqdmulh z25.s, z25.s, z5.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z11.s, z11.s, z4.s\n" - "add z15.s, z15.s, z4.s\n" + ".inst 0x04a6735a // sqdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7737b // sqdmulh z27.s, z27.s, z7.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" ".inst 0x4482883c // srshl z28.s, p2/M, z28.s, z1.s\n" ".inst 0x4482885d // srshl z29.s, p2/M, z29.s, z2.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" ".inst 0x4482887e // srshl z30.s, p2/M, z30.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z31.s, z31.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "add z13.s, z13.s, z4.s\n" + "add z14.s, z14.s, z4.s\n" + 
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "add z15.s, z15.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "add z18.s, z18.s, z4.s\n" "add z19.s, z19.s, z4.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" "add z23.s, z23.s, z4.s\n" - ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" - "add x20, %x[qp], %[maxval]\n" "add z28.s, z28.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" "add z29.s, z29.s, z4.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" "add z30.s, z30.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" "add z24.s, z24.s, z4.s\n" - "add x20, %x[qp], %[minval]\n" "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" "add z26.s, z26.s, z4.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" "add z27.s, z27.s, z4.s\n" - "smin z31.s, p2/M, z31.s, z0.s\n" - "smin z12.s, p2/M, z12.s, z0.s\n" - "smin z13.s, p2/M, z13.s, z0.s\n" - "smin z14.s, p2/M, z14.s, z0.s\n" - "smin z8.s, p2/M, z8.s, z0.s\n" - "smin z9.s, p2/M, z9.s, z0.s\n" - "smin z10.s, p2/M, z10.s, z0.s\n" - "smin z11.s, p2/M, z11.s, z0.s\n" - "smin z15.s, p2/M, z15.s, z0.s\n" - "smin z20.s, p2/M, z20.s, z0.s\n" - "smin z21.s, p2/M, z21.s, z0.s\n" - "smin z22.s, p2/M, z22.s, z0.s\n" - "smin z16.s, p2/M, z16.s, z0.s\n" - "smin z17.s, p2/M, z17.s, z0.s\n" - "smin z18.s, p2/M, z18.s, z0.s\n" - "smin z19.s, p2/M, z19.s, z0.s\n" - "smin z23.s, p2/M, z23.s, z0.s\n" - "smin z28.s, 
p2/M, z28.s, z0.s\n" - "smin z29.s, p2/M, z29.s, z0.s\n" - "smin z30.s, p2/M, z30.s, z0.s\n" - "smin z24.s, p2/M, z24.s, z0.s\n" - "smin z25.s, p2/M, z25.s, z0.s\n" - "smin z26.s, p2/M, z26.s, z0.s\n" - "smin z27.s, p2/M, z27.s, z0.s\n" - "smax z31.s, p2/M, z31.s, z2.s\n" - "smax z12.s, p2/M, z12.s, z2.s\n" - "smax z13.s, p2/M, z13.s, z2.s\n" - "smax z14.s, p2/M, z14.s, z2.s\n" - "smax z8.s, p2/M, z8.s, z2.s\n" - "smax z9.s, p2/M, z9.s, z2.s\n" - "smax z10.s, p2/M, z10.s, z2.s\n" - "smax z11.s, p2/M, z11.s, z2.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" "uzp1 z31.h, z31.h, z12.h\n" - "smax z15.s, p2/M, z15.s, z2.s\n" - "smax z20.s, p2/M, z20.s, z2.s\n" - "uzp1 z1.h, z13.h, z14.h\n" - "smax z21.s, p2/M, z21.s, z2.s\n" - "smax z22.s, p2/M, z22.s, z2.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z12.h, z13.h, z14.h\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z16.s, p2/M, z16.s, z2.s\n" - "smax z17.s, p2/M, z17.s, z2.s\n" - "uzp1 z0.h, z10.h, z11.h\n" - "smax z18.s, p2/M, z18.s, z2.s\n" - "smax z19.s, p2/M, z19.s, z2.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z15.h, z15.h, z20.h\n" - "uzp1 z31.b, z31.b, z1.b\n" - "smax z23.s, p2/M, z23.s, z2.s\n" - "smax z28.s, p2/M, z28.s, z2.s\n" + "uzp1 z31.b, z31.b, 
z12.b\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z29.s, p2/M, z29.s, z2.s\n" - "smax z30.s, p2/M, z30.s, z2.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z8.b, z8.b, z0.b\n" - "smax z24.s, p2/M, z24.s, z2.s\n" - "smax z25.s, p2/M, z25.s, z2.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" "st1b { z31.b }, p1, [x9]\n" - "smax z26.s, p2/M, z26.s, z2.s\n" - "smax z27.s, p2/M, z27.s, z2.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z23.h, z23.h, z28.h\n" "uzp1 z15.b, z15.b, z20.b\n" - "uzp1 z18.h, z29.h, z30.h\n" - "st1b { z8.b }, p1, [x26]\n" + "uzp1 z28.h, z29.h, z30.h\n" + "st1b { z8.b }, p1, [x27]\n" "addvl x9, x9, #1\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "st1b { z15.b }, p1, [x25]\n" - "uzp1 z23.b, z23.b, z18.b\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x24]\n" - "st1b { z23.b }, p1, [x23]\n" - "st1b { z24.b }, p1, [x22]\n" - "78:" // Height 6: Writeback done + "uzp1 z25.h, z26.h, z27.h\n" + "st1b { z15.b }, p1, [x26]\n" + "uzp1 z23.b, z23.b, z28.b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x25]\n" + "st1b { z23.b }, p1, [x24]\n" + "st1b { z24.b }, p1, [x23]\n" "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" - "bgt 67b\n" + "bgt 62b\n" "subs %x[M], %x[M], #0x6\n" - "beq 80f\n" + "beq 74f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 79f\n" + "tbz %x[flags], #3, 73f\n" "add x21, x21, #0x6\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "79:" // Update direct input + "73:" // Update direct input "mov x20, #0x6\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "80:" // Exit + "74:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [c_offset] 
"I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp index 7b598bac57..1482088292 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,23 +73,20 @@ void sve_hybrid_u8qa_dot_4x4VL ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // 
Row loop "cmp %x[M], #0x4\n" - "bge 43f\n" + "bge 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "mov x10, %x[col_bias]\n" + "bgt 27f\n" + "beq 14f\n" "mov z11.s, #0x0\n" "mov z15.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "mov x20, #0x0\n" @@ -98,8 +94,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "whilelt p1.b, x20, x9\n" - "3:" // Height 1: setup done + "whilelt p1.b, x20, x10\n" "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -120,41 +115,41 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ble 9f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z21.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z23.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z22.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" "add x24, x24, #0x10\n" - "udot z16.s, z21.b, z0.b[0]\n" - "ld1b { z21.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "udot z17.s, z26.b, z0.b[0]\n" - "udot z18.s, z25.b, z0.b[0]\n" - "udot z19.s, z24.b, z0.b[0]\n" - "udot z16.s, z20.b, z0.b[1]\n" - "ld1b { z20.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" - 
"ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" - "udot z17.s, z23.b, z0.b[1]\n" - "ld1b { z23.b }, p2/Z, [x28, #-4, MUL VL]\n" - "udot z18.s, z22.b, z0.b[1]\n" - "ld1b { z22.b }, p2/Z, [x28, #-3, MUL VL]\n" - "udot z19.s, z21.b, z0.b[1]\n" - "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z20.b, z0.b[2]\n" - "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z17.s, z26.b, z0.b[2]\n" - "udot z18.s, z25.b, z0.b[2]\n" - "udot z19.s, z24.b, z0.b[2]\n" - "udot z16.s, z23.b, z0.b[3]\n" - "udot z17.s, z22.b, z0.b[3]\n" - "udot z18.s, z21.b, z0.b[3]\n" - "udot z19.s, z20.b, z0.b[3]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum @@ -163,49 +158,49 @@ void sve_hybrid_u8qa_dot_4x4VL ( "bgt 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL 
VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "udot z16.s, z23.b, z0.b[0]\n" - "udot z17.s, z22.b, z0.b[0]\n" - "udot z18.s, z21.b, z0.b[0]\n" - "udot z19.s, z20.b, z0.b[0]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z23.b, z0.b[1]\n" - "udot z17.s, z22.b, z0.b[1]\n" - "udot z18.s, z21.b, z0.b[1]\n" - "udot z19.s, z20.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z19.s, z4.b, z0.b[1]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z23.b, z0.b[2]\n" - "udot z17.s, z22.b, z0.b[2]\n" - "udot z18.s, z21.b, z0.b[2]\n" - "udot z19.s, z20.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL 
VL]\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z23.b, z0.b[3]\n" - "udot z17.s, z22.b, z0.b[3]\n" - "udot z18.s, z21.b, z0.b[3]\n" - "udot z19.s, z20.b, z0.b[3]\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" "udot z11.s, z0.b, z15.b\n" @@ -217,91 +212,76 @@ void sve_hybrid_u8qa_dot_4x4VL ( "tbnz %x[flags], #31, 12f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z20.s, p2/M, z20.s\n" + "neg z1.s, p2/M, z1.s\n" "uaddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z20.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z23.s }, p2/Z, [x10]\n" - "ld1w { z20.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z22.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "add z16.s, z16.s, z23.s\n" - "add z17.s, z17.s, z20.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z22.s\n" - "add z19.s, z19.s, z21.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s 
}, p2/Z, [x21]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n" - ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n" - ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n" - ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n" - "tbz %x[flags], #5, 13f\n" - "and z23.d, z16.d, z0.d\n" - "and z22.d, z17.d, z0.d\n" - "and z21.d, z18.d, z0.d\n" - "and z20.d, z19.d, z0.d\n" - "asr z23.s, z23.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z16.s, z16.s, z23.s\n" - "sqadd z17.s, z17.s, z22.s\n" - "sqadd z18.s, z18.s, z21.s\n" - "sqadd z19.s, z19.s, z20.s\n" - "13:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z22.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" - "add z16.s, z16.s, z22.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z22.s\n" - "add z18.s, z18.s, z22.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z22.s\n" - "smin z16.s, p2/M, z16.s, z21.s\n" - "smin z17.s, p2/M, z17.s, z21.s\n" - "smin z18.s, p2/M, z18.s, z21.s\n" - "smin z19.s, p2/M, z19.s, z21.s\n" - "smax z16.s, p2/M, z16.s, z20.s\n" - "smax z17.s, p2/M, 
z17.s, z20.s\n" - "smax z18.s, p2/M, z18.s, z20.s\n" - "smax z19.s, p2/M, z19.s, z20.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "14:" // Height 1: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" "bgt 2b\n" - "b 58f\n" - "15:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 54f\n" + "14:" // Height 2 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "16:" // Height 2: Column loop + "15:" // Height 2: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -309,302 +289,274 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "17:" // Height 2: setup done "mov x26, #0x0\n" - "18:" // Height 2: String loop + "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" + "tbz %x[flags], #3, 18f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, 
[x20, #0x8]\n" - "cbnz x26, 20f\n" + "cbnz x26, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 20f\n" - "19:" // Height 2: setup direct input + "b 19f\n" + "18:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "20:" // Height 2: input setup done + "19:" // Height 2: input setup done "cmp x25, #0x10\n" - "ble 23f\n" - "21:" // Height 2: Multiply loop: Main loop head + "ble 22f\n" + "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z25.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" - "udot z16.s, z25.b, z0.b[0]\n" - "udot z20.s, z25.b, z1.b[0]\n" - "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "udot z17.s, z30.b, z0.b[0]\n" - "udot z21.s, z30.b, z1.b[0]\n" - "udot z18.s, z29.b, z0.b[0]\n" - "udot z22.s, z29.b, z1.b[0]\n" - "udot z19.s, z28.b, z0.b[0]\n" - "udot z23.s, z28.b, z1.b[0]\n" - "udot z16.s, z24.b, z0.b[1]\n" - "udot z20.s, z24.b, z1.b[1]\n" - "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "udot z17.s, z27.b, z0.b[1]\n" - "udot z21.s, z27.b, z1.b[1]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "udot z18.s, z26.b, z0.b[1]\n" - "udot z22.s, z26.b, z1.b[1]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, 
MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "udot z19.s, z25.b, z0.b[1]\n" - "udot z23.s, z25.b, z1.b[1]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z24.b, z0.b[2]\n" - "udot z20.s, z24.b, z1.b[2]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z17.s, z30.b, z0.b[2]\n" - "udot z21.s, z30.b, z1.b[2]\n" - "udot z18.s, z29.b, z0.b[2]\n" - "udot z22.s, z29.b, z1.b[2]\n" - "udot z19.s, z28.b, z0.b[2]\n" - "udot z23.s, z28.b, z1.b[2]\n" - "udot z16.s, z27.b, z0.b[3]\n" - "udot z20.s, z27.b, z1.b[3]\n" - "udot z17.s, z26.b, z0.b[3]\n" - "udot z21.s, z26.b, z1.b[3]\n" - "udot z18.s, z25.b, z0.b[3]\n" - "udot z22.s, z25.b, z1.b[3]\n" - "udot z19.s, z24.b, z0.b[3]\n" - "udot z23.s, z24.b, z1.b[3]\n" - "tbnz %x[flags], #31, 22f\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot 
z23.s, z8.b, z1.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 21f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" - "22:" // Height 2: Multiply loop: unique 3: skip row sum + "21:" // Height 2: Multiply loop: unique 3: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 21b\n" - "23:" // Height 2: Multiply loop: Single iteration only + "bgt 20b\n" + "22:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" - "udot z16.s, z27.b, z0.b[0]\n" - "udot z20.s, z27.b, z1.b[0]\n" - "udot z17.s, z26.b, z0.b[0]\n" - "udot z21.s, z26.b, z1.b[0]\n" - "udot z18.s, z25.b, z0.b[0]\n" - "udot z22.s, z25.b, z1.b[0]\n" - "udot z19.s, z24.b, z0.b[0]\n" - "udot z23.s, z24.b, z1.b[0]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - 
"addvl x28, x28, #4\n" - "udot z16.s, z27.b, z0.b[1]\n" - "udot z20.s, z27.b, z1.b[1]\n" - "udot z17.s, z26.b, z0.b[1]\n" - "udot z21.s, z26.b, z1.b[1]\n" - "udot z18.s, z25.b, z0.b[1]\n" - "udot z22.s, z25.b, z1.b[1]\n" - "udot z19.s, z24.b, z0.b[1]\n" - "udot z23.s, z24.b, z1.b[1]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z27.b, z0.b[2]\n" - "udot z20.s, z27.b, z1.b[2]\n" - "udot z17.s, z26.b, z0.b[2]\n" - "udot z21.s, z26.b, z1.b[2]\n" - "udot z18.s, z25.b, z0.b[2]\n" - "udot z22.s, z25.b, z1.b[2]\n" - "udot z19.s, z24.b, z0.b[2]\n" - "udot z23.s, z24.b, z1.b[2]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z27.b, z0.b[3]\n" - "udot z20.s, z27.b, z1.b[3]\n" - "udot z17.s, z26.b, z0.b[3]\n" - "udot z21.s, z26.b, z1.b[3]\n" - "udot z18.s, z25.b, z0.b[3]\n" - "udot z22.s, z25.b, z1.b[3]\n" - "udot z19.s, z24.b, z0.b[3]\n" - "udot z23.s, z24.b, z1.b[3]\n" - "24:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 25f\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z17.s, z6.b, 
z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 24f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" - "25:" // Height 2: Multiply loop: unique 4: skip row sum + "24:" // Height 2: Multiply loop: unique 4: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 18b\n" + "bne 17b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "tbnz %x[flags], #31, 26f\n" + "add x26, x27, x20\n" + "tbnz %x[flags], #31, 25f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z24.s, p2/M, z24.s\n" + "neg z2.s, p2/M, z2.s\n" "uaddv d11, p0, z11.s\n" "uaddv d12, p0, z12.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z11.s, p2/M, z11.s, z2.s\n" "mov z12.s, z12.s[0]\n" - "mul z12.s, p2/M, z12.s, z24.s\n" - "26:" // Height 2: skip row sum fixup + "mul z12.s, p2/M, z12.s, z2.s\n" + "25:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z28.s }, p2/Z, [x10]\n" - "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z26.s }, p2/Z, [x10, #2, 
MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z16.s, z16.s, z28.s\n" - "add z17.s, z17.s, z27.s\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z26.s\n" - "add z19.s, z19.s, z25.s\n" - "add z20.s, z20.s, z28.s\n" - "add z21.s, z21.s, z27.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z26.s\n" - "add z23.s, z23.s, z25.s\n" - ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" - ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" - ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" - ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" - ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" - ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" - ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" - ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" - "tbz %x[flags], #5, 27f\n" - "and z24.d, z16.d, z0.d\n" - "and z30.d, z17.d, z0.d\n" - "and z29.d, z18.d, z0.d\n" - "and z28.d, z19.d, z0.d\n" - "and z27.d, z20.d, z0.d\n" - "and z26.d, z21.d, z0.d\n" - "asr z24.s, z24.s, #0x1f\n" - "and z25.d, z22.d, z0.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z27.s, z27.s, #0x1f\n" - "sqadd z16.s, z16.s, z24.s\n" - "and z24.d, z23.d, z0.d\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "sqadd z17.s, z17.s, z30.s\n" - "sqadd z18.s, z18.s, z29.s\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z19.s, z19.s, z28.s\n" - "sqadd z20.s, z20.s, z27.s\n" - "sqadd z21.s, z21.s, z26.s\n" - "sqadd z22.s, z22.s, z25.s\n" - "sqadd z23.s, z23.s, z24.s\n" - "27:" // Height 2: no shift 
correction - "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "addvl x28, x28, #4\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z26.s\n" + "add z16.s, z16.s, z4.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z26.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z26.s\n" - "add z20.s, z20.s, z26.s\n" - "add x20, %x[qp], %[minval]\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z26.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z26.s\n" - "smin z16.s, p2/M, z16.s, z25.s\n" - "smin z17.s, p2/M, z17.s, z25.s\n" - 
"smin z18.s, p2/M, z18.s, z25.s\n" - "smin z19.s, p2/M, z19.s, z25.s\n" - "smin z20.s, p2/M, z20.s, z25.s\n" - "smin z21.s, p2/M, z21.s, z25.s\n" - "smin z22.s, p2/M, z22.s, z25.s\n" - "smin z23.s, p2/M, z23.s, z25.s\n" - "smax z16.s, p2/M, z16.s, z24.s\n" - "smax z17.s, p2/M, z17.s, z24.s\n" - "smax z18.s, p2/M, z18.s, z24.s\n" - "smax z19.s, p2/M, z19.s, z24.s\n" - "smax z20.s, p2/M, z20.s, z24.s\n" - "smax z21.s, p2/M, z21.s, z24.s\n" - "smax z22.s, p2/M, z22.s, z24.s\n" - "smax z23.s, p2/M, z23.s, z24.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z17.h, z22.h, z23.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z20.b, z20.b, z17.b\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z20.b, z20.b, z21.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "st1b { z20.b }, p1, [x24]\n" - "28:" // Height 2: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 16b\n" - "b 58f\n" - "29:" // Height 3 - "mov x10, %x[col_bias]\n" + "st1b { z20.b }, p1, [x26]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 "mov z11.s, #0x0\n" "mov z12.s, 
#0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "30:" // Height 3: Column loop + "28:" // Height 3: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -612,399 +564,359 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "31:" // Height 3: setup done "mov x26, #0x0\n" - "32:" // Height 3: String loop + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 34f\n" + "cbnz x26, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 34f\n" - "33:" // Height 3: setup direct input + "b 32f\n" + "31:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "34:" // Height 3: input setup done + "32:" // Height 3: input setup done "cmp x25, #0x10\n" - "ble 37f\n" - "35:" // Height 3: Multiply loop: Main loop head + "ble 35f\n" + "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL 
VL]\n" - "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" "add x22, x22, #0x10\n" - "udot z16.s, z5.b, z0.b[0]\n" - "udot z20.s, z5.b, z1.b[0]\n" - "udot z17.s, z29.b, z0.b[0]\n" - "udot z21.s, z29.b, z1.b[0]\n" - "udot z18.s, z4.b, z0.b[0]\n" - "udot z24.s, z5.b, z2.b[0]\n" - "udot z25.s, z29.b, z2.b[0]\n" - "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "udot z22.s, z4.b, z1.b[0]\n" - "udot z26.s, z4.b, z2.b[0]\n" - "udot z19.s, z28.b, z0.b[0]\n" - "udot z23.s, z28.b, z1.b[0]\n" - "udot z27.s, z28.b, z2.b[0]\n" - "udot z16.s, z3.b, z0.b[1]\n" - "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n" - "udot z20.s, z3.b, z1.b[1]\n" - "udot z24.s, z3.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n" - "udot z17.s, z31.b, z0.b[1]\n" - "udot z21.s, z31.b, z1.b[1]\n" - "udot z25.s, z31.b, z2.b[1]\n" - "udot z18.s, z30.b, z0.b[1]\n" - "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n" - "udot z22.s, z30.b, z1.b[1]\n" - "udot z26.s, z30.b, z2.b[1]\n" - "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n" - "udot z19.s, z29.b, z0.b[1]\n" - "udot z23.s, z29.b, z1.b[1]\n" - "udot z27.s, z29.b, z2.b[1]\n" - "udot z16.s, z28.b, z0.b[2]\n" - "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z20.s, z28.b, z1.b[2]\n" - "udot z24.s, z28.b, z2.b[2]\n" - "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z17.s, z5.b, z0.b[2]\n" - "udot z21.s, z5.b, 
z1.b[2]\n" - "udot z25.s, z5.b, z2.b[2]\n" - "udot z18.s, z4.b, z0.b[2]\n" - "udot z22.s, z4.b, z1.b[2]\n" - "udot z26.s, z4.b, z2.b[2]\n" - "udot z19.s, z3.b, z0.b[2]\n" - "udot z23.s, z3.b, z1.b[2]\n" - "udot z27.s, z3.b, z2.b[2]\n" - "udot z16.s, z31.b, z0.b[3]\n" - "udot z20.s, z31.b, z1.b[3]\n" - "udot z24.s, z31.b, z2.b[3]\n" - "udot z17.s, z30.b, z0.b[3]\n" - "udot z21.s, z30.b, z1.b[3]\n" - "udot z25.s, z30.b, z2.b[3]\n" - "udot z18.s, z29.b, z0.b[3]\n" - "udot z22.s, z29.b, z1.b[3]\n" - "udot z26.s, z29.b, z2.b[3]\n" - "udot z19.s, z28.b, z0.b[3]\n" - "udot z23.s, z28.b, z1.b[3]\n" - "udot z27.s, z28.b, z2.b[3]\n" - "tbnz %x[flags], #31, 36f\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z24.s, z4.b, z2.b[0]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot 
z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 34f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" - "36:" // Height 3: Multiply loop: unique 5: skip row sum + "34:" // Height 3: Multiply loop: unique 5: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 35b\n" - "37:" // Height 3: Multiply loop: Single iteration only + "bgt 33b\n" + "35:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "udot z16.s, z31.b, z0.b[0]\n" - "udot z20.s, z31.b, z1.b[0]\n" - "udot z17.s, z30.b, z0.b[0]\n" - "udot z21.s, z30.b, z1.b[0]\n" - "udot z18.s, z29.b, z0.b[0]\n" - "udot z22.s, z29.b, z1.b[0]\n" - "udot z24.s, z31.b, z2.b[0]\n" - "udot z25.s, z30.b, z2.b[0]\n" - "udot z26.s, z29.b, z2.b[0]\n" - "udot z19.s, z28.b, z0.b[0]\n" - "udot z23.s, z28.b, z1.b[0]\n" - "udot z27.s, z28.b, z2.b[0]\n" - "ble 38f\n" 
- "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z24.s, z4.b, z2.b[0]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "ble 36f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z31.b, z0.b[1]\n" - "udot z20.s, z31.b, z1.b[1]\n" - "udot z24.s, z31.b, z2.b[1]\n" - "udot z17.s, z30.b, z0.b[1]\n" - "udot z21.s, z30.b, z1.b[1]\n" - "udot z25.s, z30.b, z2.b[1]\n" - "udot z18.s, z29.b, z0.b[1]\n" - "udot z22.s, z29.b, z1.b[1]\n" - "udot z26.s, z29.b, z2.b[1]\n" - "udot z19.s, z28.b, z0.b[1]\n" - "udot z23.s, z28.b, z1.b[1]\n" - "udot z27.s, z28.b, z2.b[1]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "ble 36f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z31.b, z0.b[2]\n" - "udot z20.s, z31.b, z1.b[2]\n" - "udot z24.s, 
z31.b, z2.b[2]\n" - "udot z17.s, z30.b, z0.b[2]\n" - "udot z21.s, z30.b, z1.b[2]\n" - "udot z25.s, z30.b, z2.b[2]\n" - "udot z18.s, z29.b, z0.b[2]\n" - "udot z22.s, z29.b, z1.b[2]\n" - "udot z26.s, z29.b, z2.b[2]\n" - "udot z19.s, z28.b, z0.b[2]\n" - "udot z23.s, z28.b, z1.b[2]\n" - "udot z27.s, z28.b, z2.b[2]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z31.b, z0.b[3]\n" - "udot z20.s, z31.b, z1.b[3]\n" - "udot z24.s, z31.b, z2.b[3]\n" - "udot z17.s, z30.b, z0.b[3]\n" - "udot z21.s, z30.b, z1.b[3]\n" - "udot z25.s, z30.b, z2.b[3]\n" - "udot z18.s, z29.b, z0.b[3]\n" - "udot z22.s, z29.b, z1.b[3]\n" - "udot z26.s, z29.b, z2.b[3]\n" - "udot z19.s, z28.b, z0.b[3]\n" - "udot z23.s, z28.b, z1.b[3]\n" - "udot z27.s, z28.b, z2.b[3]\n" - "38:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 39f\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "ble 36f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot 
z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "36:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 37f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 6: skip row sum + "37:" // Height 3: Multiply loop: unique 6: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 32b\n" + "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "tbnz %x[flags], #31, 40f\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "tbnz %x[flags], #31, 38f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z28.s, p2/M, z28.s\n" + "neg z3.s, p2/M, z3.s\n" "uaddv d11, p0, z11.s\n" "uaddv d12, p0, z12.s\n" "uaddv d13, p0, z13.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z28.s\n" - "mul z12.s, p2/M, z12.s, z28.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" "mov z13.s, z13.s[0]\n" - "mul z13.s, p2/M, z13.s, z28.s\n" - "40:" // Height 3: skip row sum fixup + "mul z13.s, p2/M, z13.s, z3.s\n" + "38:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, 
z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z31.s\n" - "add z18.s, z18.s, z30.s\n" - "add z19.s, z19.s, z29.s\n" + "add z17.s, z17.s, z1.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z31.s\n" - "add z22.s, z22.s, z30.s\n" - "add z23.s, z23.s, z29.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z31.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z30.s\n" - "add z27.s, z27.s, z29.s\n" - ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n" - ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n" - ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n" - ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n" - ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n" - ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n" - ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n" - ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n" - ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n" - ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n" - ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n" - ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n" - "tbz %x[flags], #5, 41f\n" - "and z1.d, z16.d, z0.d\n" - "and z31.d, z17.d, z0.d\n" - "and z30.d, z18.d, z0.d\n" - "and z29.d, z19.d, z0.d\n" - "and z28.d, z20.d, z0.d\n" - "and z3.d, z21.d, z0.d\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z31.s, 
z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "and z2.d, z22.d, z0.d\n" - "sqadd z16.s, z16.s, z1.s\n" - "sqadd z17.s, z17.s, z31.s\n" - "sqadd z18.s, z18.s, z30.s\n" - "sqadd z19.s, z19.s, z29.s\n" - "sqadd z20.s, z20.s, z28.s\n" - "and z1.d, z23.d, z0.d\n" - "and z31.d, z24.d, z0.d\n" - "and z30.d, z25.d, z0.d\n" - "and z29.d, z26.d, z0.d\n" - "and z28.d, z27.d, z0.d\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z21.s, z21.s, z3.s\n" - "sqadd z22.s, z22.s, z2.s\n" - "sqadd z23.s, z23.s, z1.s\n" - "sqadd z24.s, z24.s, z31.s\n" - "sqadd z25.s, z25.s, z30.s\n" - "sqadd z26.s, z26.s, z29.s\n" - "sqadd z27.s, z27.s, z28.s\n" - "41:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z30.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 
0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z30.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z30.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z30.s\n" - "add z20.s, z20.s, z30.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z21.s, z21.s, z30.s\n" - "add z22.s, z22.s, z30.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z30.s\n" - "add z24.s, z24.s, z30.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z30.s\n" - "add z26.s, z26.s, z30.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z30.s\n" - "smin z16.s, p2/M, z16.s, z29.s\n" - "smin z17.s, p2/M, z17.s, z29.s\n" - "smin z18.s, p2/M, z18.s, z29.s\n" - "smin z19.s, p2/M, z19.s, z29.s\n" - "smin z20.s, p2/M, z20.s, z29.s\n" - "smin z21.s, p2/M, z21.s, z29.s\n" - "smin z22.s, p2/M, z22.s, z29.s\n" - "smin z23.s, p2/M, z23.s, z29.s\n" - "smin z24.s, p2/M, z24.s, z29.s\n" - "smin z25.s, p2/M, z25.s, z29.s\n" - "smin z26.s, p2/M, z26.s, z29.s\n" - "smin z27.s, p2/M, z27.s, z29.s\n" - "smax z16.s, p2/M, z16.s, z28.s\n" - "smax z17.s, p2/M, z17.s, z28.s\n" - "smax z18.s, p2/M, z18.s, z28.s\n" - "smax z19.s, p2/M, z19.s, z28.s\n" - "smax z20.s, p2/M, z20.s, z28.s\n" - "smax z21.s, p2/M, z21.s, z28.s\n" - "smax z22.s, p2/M, z22.s, z28.s\n" - "smax z23.s, p2/M, z23.s, z28.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, 
z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z28.s\n" - "smax z25.s, p2/M, z25.s, z28.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z28.s\n" - "smax z27.s, p2/M, z27.s, z28.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z18.h, z22.h, z23.h\n" + "uzp1 z21.h, z22.h, z23.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "uzp1 z20.b, z20.b, z18.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z20.b, z20.b, z21.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "42:" // Height 3: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 30b\n" - "b 58f\n" - "43:" // Height 4 + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z20.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "mov z11.s, 
#0x0\n" "mov z12.s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z14.s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "mov z15.b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "44:" // Height 4: Column loop + "41:" // Height 4: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -1012,7 +924,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -1023,42 +935,41 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "45:" // Height 4: setup done "mov x26, #0x0\n" - "46:" // Height 4: String loop + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 47f\n" + "tbz %x[flags], #3, 44f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 48f\n" + "cbnz x26, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 48f\n" - "47:" // Height 4: setup direct input + "b 45f\n" + "44:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "48:" // Height 4: input setup done + "45:" // Height 4: input setup done "cmp x25, #0x10\n" - "ble 51f\n" - "49:" // Height 4: Multiply loop: 
Main loop head + "ble 48f\n" + "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" @@ -1067,431 +978,380 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" - "udot z16.s, z5.b, z0.b[0]\n" - "udot z20.s, z5.b, z1.b[0]\n" - "udot z17.s, z10.b, z0.b[0]\n" - "udot z21.s, z10.b, z1.b[0]\n" - "udot z24.s, z5.b, z2.b[0]\n" - "udot z28.s, z5.b, z3.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "udot z25.s, z10.b, z2.b[0]\n" - "udot z29.s, z10.b, z3.b[0]\n" - "udot z18.s, z4.b, z0.b[0]\n" - "udot z22.s, z4.b, z1.b[0]\n" - "udot z26.s, z4.b, z2.b[0]\n" - "udot z30.s, z4.b, z3.b[0]\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "udot z19.s, z9.b, z0.b[0]\n" - "udot z23.s, z9.b, z1.b[0]\n" - "udot z27.s, z9.b, z2.b[0]\n" - "udot z31.s, z9.b, z3.b[0]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z24.s, z4.b, z2.b[0]\n" + "udot z28.s, z4.b, z3.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "udot z25.s, z5.b, z2.b[0]\n" + "udot z29.s, z5.b, z3.b[0]\n" + "udot 
z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z30.s, z6.b, z3.b[0]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "udot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" "udot z16.s, z8.b, z0.b[1]\n" "udot z20.s, z8.b, z1.b[1]\n" "udot z24.s, z8.b, z2.b[1]\n" "udot z28.s, z8.b, z3.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "udot z17.s, z7.b, z0.b[1]\n" - "udot z21.s, z7.b, z1.b[1]\n" - "udot z25.s, z7.b, z2.b[1]\n" - "udot z29.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" - "udot z18.s, z6.b, z0.b[1]\n" - "udot z22.s, z6.b, z1.b[1]\n" - "udot z26.s, z6.b, z2.b[1]\n" - "udot z30.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "udot z19.s, z5.b, z0.b[1]\n" - "udot z23.s, z5.b, z1.b[1]\n" - "udot z27.s, z5.b, z2.b[1]\n" - "udot z31.s, z5.b, z3.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z4.b, z0.b[2]\n" - "udot z20.s, z4.b, z1.b[2]\n" - "udot z24.s, z4.b, z2.b[2]\n" - "udot z28.s, z4.b, z3.b[2]\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z17.s, z10.b, z0.b[2]\n" - "udot z21.s, z10.b, z1.b[2]\n" - "udot z25.s, z10.b, z2.b[2]\n" - "udot z29.s, z10.b, z3.b[2]\n" - "udot z18.s, z9.b, z0.b[2]\n" - "udot z22.s, z9.b, z1.b[2]\n" - "udot z26.s, z9.b, z2.b[2]\n" - "udot z30.s, z9.b, z3.b[2]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z30.s, z10.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + 
"udot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "udot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z29.s, z6.b, z3.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z30.s, z7.b, z3.b[2]\n" "udot z19.s, z8.b, z0.b[2]\n" "udot z23.s, z8.b, z1.b[2]\n" "udot z27.s, z8.b, z2.b[2]\n" "udot z31.s, z8.b, z3.b[2]\n" - "udot z16.s, z7.b, z0.b[3]\n" - "udot z20.s, z7.b, z1.b[3]\n" - "udot z24.s, z7.b, z2.b[3]\n" - "udot z28.s, z7.b, z3.b[3]\n" - "udot z17.s, z6.b, z0.b[3]\n" - "udot z21.s, z6.b, z1.b[3]\n" - "udot z25.s, z6.b, z2.b[3]\n" - "udot z29.s, z6.b, z3.b[3]\n" - "udot z18.s, z5.b, z0.b[3]\n" - "udot z22.s, z5.b, z1.b[3]\n" - "udot z26.s, z5.b, z2.b[3]\n" - "udot z30.s, z5.b, z3.b[3]\n" - "udot z19.s, z4.b, z0.b[3]\n" - "udot z23.s, z4.b, z1.b[3]\n" - "udot z27.s, z4.b, z2.b[3]\n" - "udot z31.s, z4.b, z3.b[3]\n" - "tbnz %x[flags], #31, 50f\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z28.s, z9.b, z3.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z29.s, z10.b, z3.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z30.s, z4.b, z3.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "udot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 47f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" - "50:" // Height 4: Multiply loop: unique 7: skip row sum + "47:" // Height 4: Multiply loop: unique 7: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 
49b\n" - "51:" // Height 4: Multiply loop: Single iteration only + "bgt 46b\n" + "48:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" "ld1rqb { z3.b }, p0/Z, [x21]\n" - "udot z16.s, z7.b, z0.b[0]\n" - "udot z20.s, z7.b, z1.b[0]\n" - "udot z17.s, z6.b, z0.b[0]\n" - "udot z21.s, z6.b, z1.b[0]\n" - "udot z18.s, z5.b, z0.b[0]\n" - "udot z22.s, z5.b, z1.b[0]\n" - "udot z24.s, z7.b, z2.b[0]\n" - "udot z28.s, z7.b, z3.b[0]\n" - "udot z25.s, z6.b, z2.b[0]\n" - "udot z29.s, z6.b, z3.b[0]\n" - "udot z26.s, z5.b, z2.b[0]\n" - "udot z30.s, z5.b, z3.b[0]\n" - "udot z19.s, z4.b, z0.b[0]\n" - "udot z23.s, z4.b, z1.b[0]\n" - "udot z27.s, z4.b, z2.b[0]\n" - "udot z31.s, z4.b, z3.b[0]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "udot z17.s, z5.b, z0.b[0]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z24.s, z4.b, z2.b[0]\n" + "udot z28.s, z4.b, z3.b[0]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "udot z29.s, z5.b, z3.b[0]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z30.s, z6.b, z3.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "udot z31.s, z7.b, z3.b[0]\n" + "ble 49f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, 
[x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z7.b, z0.b[1]\n" - "udot z20.s, z7.b, z1.b[1]\n" - "udot z24.s, z7.b, z2.b[1]\n" - "udot z28.s, z7.b, z3.b[1]\n" - "udot z17.s, z6.b, z0.b[1]\n" - "udot z21.s, z6.b, z1.b[1]\n" - "udot z25.s, z6.b, z2.b[1]\n" - "udot z29.s, z6.b, z3.b[1]\n" - "udot z18.s, z5.b, z0.b[1]\n" - "udot z22.s, z5.b, z1.b[1]\n" - "udot z26.s, z5.b, z2.b[1]\n" - "udot z30.s, z5.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "udot z28.s, z8.b, z3.b[1]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z29.s, z9.b, z3.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z30.s, z10.b, z3.b[1]\n" "udot z19.s, z4.b, z0.b[1]\n" "udot z23.s, z4.b, z1.b[1]\n" "udot z27.s, z4.b, z2.b[1]\n" "udot z31.s, z4.b, z3.b[1]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ble 49f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z7.b, z0.b[2]\n" - "udot z20.s, z7.b, z1.b[2]\n" - "udot z24.s, z7.b, z2.b[2]\n" - "udot z28.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "udot z28.s, z5.b, z3.b[2]\n" "udot z17.s, z6.b, z0.b[2]\n" "udot z21.s, z6.b, z1.b[2]\n" "udot z25.s, z6.b, z2.b[2]\n" "udot z29.s, z6.b, z3.b[2]\n" - "udot z18.s, z5.b, z0.b[2]\n" - "udot z22.s, z5.b, z1.b[2]\n" - "udot z26.s, z5.b, z2.b[2]\n" - "udot z30.s, z5.b, z3.b[2]\n" - "udot 
z19.s, z4.b, z0.b[2]\n" - "udot z23.s, z4.b, z1.b[2]\n" - "udot z27.s, z4.b, z2.b[2]\n" - "udot z31.s, z4.b, z3.b[2]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "udot z16.s, z7.b, z0.b[3]\n" - "udot z20.s, z7.b, z1.b[3]\n" - "udot z24.s, z7.b, z2.b[3]\n" - "udot z28.s, z7.b, z3.b[3]\n" - "udot z17.s, z6.b, z0.b[3]\n" - "udot z21.s, z6.b, z1.b[3]\n" - "udot z25.s, z6.b, z2.b[3]\n" - "udot z29.s, z6.b, z3.b[3]\n" - "udot z18.s, z5.b, z0.b[3]\n" - "udot z22.s, z5.b, z1.b[3]\n" - "udot z26.s, z5.b, z2.b[3]\n" - "udot z30.s, z5.b, z3.b[3]\n" - "udot z19.s, z4.b, z0.b[3]\n" - "udot z23.s, z4.b, z1.b[3]\n" - "udot z27.s, z4.b, z2.b[3]\n" - "udot z31.s, z4.b, z3.b[3]\n" - "52:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 53f\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z30.s, z7.b, z3.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z31.s, z8.b, z3.b[2]\n" + "ble 49f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z28.s, z9.b, z3.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z29.s, z10.b, z3.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z30.s, z4.b, z3.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "udot z31.s, z5.b, z3.b[3]\n" + "49:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, 
z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" - "53:" // Height 4: Multiply loop: unique 8: skip row sum + "50:" // Height 4: Multiply loop: unique 8: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 46b\n" + "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "tbnz %x[flags], #31, 54f\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" + "tbnz %x[flags], #31, 51f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z0.s, p2/M, z0.s\n" + "neg z4.s, p2/M, z4.s\n" "uaddv d11, p0, z11.s\n" "uaddv d12, p0, z12.s\n" "uaddv d13, p0, z13.s\n" "uaddv d14, p0, z14.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z0.s\n" - "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" "mov z13.s, z13.s[0]\n" "mov z14.s, z14.s[0]\n" - "mul z13.s, p2/M, z13.s, z0.s\n" - "mul z14.s, p2/M, z14.s, z0.s\n" - "54:" // Height 4: skip row sum fixup + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "51:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z4.s }, p2/Z, [x10]\n" - "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, 
z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z28.s, z28.s, z14.s\n" "add z29.s, z29.s, z14.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" "add z30.s, z30.s, z14.s\n" "add z31.s, z31.s, z14.s\n" - "add z16.s, z16.s, z4.s\n" - "add z17.s, z17.s, z0.s\n" - "add z18.s, z18.s, z3.s\n" - "add z19.s, z19.s, z2.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z0.s\n" - "add z22.s, z22.s, z3.s\n" - "add z23.s, z23.s, z2.s\n" - "add z24.s, z24.s, z4.s\n" - "add z25.s, z25.s, z0.s\n" - "add z26.s, z26.s, z3.s\n" - "add z27.s, z27.s, z2.s\n" - "add z28.s, z28.s, z4.s\n" - "add z29.s, z29.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z3.s\n" - "add z31.s, z31.s, z2.s\n" - ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" - ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" - ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" - ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" - ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" - ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" - ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" - ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" - ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" - ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" - ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" - ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" - ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" - ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" - ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" - ".inst 0x04a177ff // sqrdmulh z31.s, 
z31.s, z1.s\n" - "tbz %x[flags], #5, 55f\n" - "and z2.d, z16.d, z0.d\n" - "and z1.d, z17.d, z0.d\n" - "and z7.d, z18.d, z0.d\n" - "and z6.d, z19.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "and z4.d, z21.d, z0.d\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "and z3.d, z22.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z16.s, z16.s, z2.s\n" - "sqadd z17.s, z17.s, z1.s\n" - "and z2.d, z23.d, z0.d\n" - "and z1.d, z24.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "sqadd z18.s, z18.s, z7.s\n" - "sqadd z19.s, z19.s, z6.s\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z4.s\n" - "sqadd z22.s, z22.s, z3.s\n" - "and z7.d, z25.d, z0.d\n" - "sqadd z23.s, z23.s, z2.s\n" - "sqadd z24.s, z24.s, z1.s\n" - "and z6.d, z26.d, z0.d\n" - "and z5.d, z27.d, z0.d\n" - "and z4.d, z28.d, z0.d\n" - "and z3.d, z29.d, z0.d\n" - "and z2.d, z30.d, z0.d\n" - "and z1.d, z31.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z25.s, z25.s, z7.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z5.s\n" - "sqadd z28.s, z28.s, z4.s\n" - "sqadd z29.s, z29.s, z3.s\n" - "sqadd z30.s, z30.s, z2.s\n" - "sqadd z31.s, z31.s, z1.s\n" - "55:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, 
z31.s, z3.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a473bd // sqdmulh z29.s, z29.s, z4.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z2.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x04a473de // sqdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z2.s\n" - "add z20.s, z20.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z2.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" - "add z23.s, z23.s, z2.s\n" - "add 
z24.s, z24.s, z2.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z2.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z2.s\n" - "add z28.s, z28.s, z2.s\n" - "add x20, %x[qp], %[minval]\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z2.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z31.s, z31.s, z2.s\n" - "smin z16.s, p2/M, z16.s, z1.s\n" - "smin z17.s, p2/M, z17.s, z1.s\n" - "smin z18.s, p2/M, z18.s, z1.s\n" - "smin z19.s, p2/M, z19.s, z1.s\n" - "smin z20.s, p2/M, z20.s, z1.s\n" - "smin z21.s, p2/M, z21.s, z1.s\n" - "smin z22.s, p2/M, z22.s, z1.s\n" - "smin z23.s, p2/M, z23.s, z1.s\n" - "smin z24.s, p2/M, z24.s, z1.s\n" - "smin z25.s, p2/M, z25.s, z1.s\n" - "smin z26.s, p2/M, z26.s, z1.s\n" - "smin z27.s, p2/M, z27.s, z1.s\n" - "smin z28.s, p2/M, z28.s, z1.s\n" - "smin z29.s, p2/M, z29.s, z1.s\n" - "smin z30.s, p2/M, z30.s, z1.s\n" - "smin z31.s, p2/M, z31.s, z1.s\n" - "smax z16.s, p2/M, z16.s, z0.s\n" - "smax z17.s, p2/M, z17.s, z0.s\n" - "smax z18.s, p2/M, z18.s, z0.s\n" - "smax z19.s, p2/M, z19.s, z0.s\n" - "smax z20.s, p2/M, z20.s, z0.s\n" - "smax z21.s, p2/M, z21.s, z0.s\n" - "smax z22.s, p2/M, z22.s, z0.s\n" - "smax z23.s, p2/M, z23.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z4.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "add z30.s, z30.s, 
z4.s\n" + "add z31.s, z31.s, z4.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z0.s\n" - "smax z25.s, p2/M, z25.s, z0.s\n" - "uzp1 z18.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z0.s\n" - "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "smax z28.s, p2/M, z28.s, z0.s\n" - "smax z29.s, p2/M, z29.s, z0.s\n" - "uzp1 z17.h, z22.h, z23.h\n" - "smax z30.s, p2/M, z30.s, z0.s\n" - "smax z31.s, p2/M, z31.s, z0.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z18.h, z26.h, z27.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z25.h, z26.h, z27.h\n" "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z20.b, z20.b, z17.b\n" - "uzp1 z17.h, z30.h, z31.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "uzp1 z29.h, z30.h, z31.h\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z18.b\n" - "uzp1 z28.b, z28.b, z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "st1b { z28.b }, p1, [x22]\n" - "56:" // Height 4: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 44b\n" + "uzp1 z24.b, 
z24.b, z25.b\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z20.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "st1b { z28.b }, p1, [x24]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" "subs %x[M], %x[M], #0x4\n" - "beq 58f\n" + "beq 54f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "57:" // Update direct input + "53:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "58:" // Exit + "54:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp index 212e178065..c3fa2f5506 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,23 +73,20 @@ void sve_hybrid_u8qa_mmla_4x4VL ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 43f\n" + "bge 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "mov x10, %x[col_bias]\n" + "bgt 27f\n" + "beq 14f\n" "mov z11.s, #0x0\n" "mov z15.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "mov x20, #0x0\n" @@ -100,10 +96,9 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -124,43 +119,43 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "ble 9f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z30.b }, p2/Z, [x28]\n" - "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, 
p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" "add x24, x24, #0x10\n" - "trn1 z0.d, z1.d, z31.d\n" - ".inst 0x45de9810 // ummla z16.s, z0.b, z30.b\n" - "ld1b { z8.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "trn2 z1.d, z1.d, z31.d\n" - ".inst 0x45dd9814 // ummla z20.s, z0.b, z29.b\n" - ".inst 0x45dc9811 // ummla z17.s, z0.b, z28.b\n" - ".inst 0x45db9815 // ummla z21.s, z0.b, z27.b\n" - ".inst 0x45da9812 // ummla z18.s, z0.b, z26.b\n" - "ld1b { z31.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45d99816 // ummla z22.s, z0.b, z25.b\n" - ".inst 0x45d89813 // ummla z19.s, z0.b, z24.b\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x45c89817 // ummla z23.s, z0.b, z8.b\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x45df9830 // ummla z16.s, z1.b, z31.b\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n" - ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n" - ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n" - ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n" - ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n" - ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" - ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + ".inst 0x45c89815 // 
ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" @@ -170,45 +165,45 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "bgt 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z31.d\n" - ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #8\n" - 
"trn2 z1.d, z1.d, z31.d\n" - ".inst 0x45de9814 // ummla z20.s, z0.b, z30.b\n" - ".inst 0x45dd9811 // ummla z17.s, z0.b, z29.b\n" - ".inst 0x45dc9815 // ummla z21.s, z0.b, z28.b\n" - ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n" - ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n" - ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n" - ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" "ble 10f\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n" - ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n" - ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n" - ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n" - ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" - ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL 
VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" "udot z11.s, z0.b, z15.b\n" @@ -226,89 +221,74 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "tbnz %x[flags], #31, 12f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "ld1rw { z9.s }, p2/Z, [x20]\n" - "neg z9.s, p2/M, z9.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "neg z1.s, p2/M, z1.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z9.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" "12:" // Height 1: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z22.s }, p2/Z, [x10]\n" - "ld1w { z24.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "add z23.s, z23.s, z22.s\n" - "add z17.s, z17.s, z24.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z21.s\n" - "add z19.s, z19.s, z20.s\n" + "add z23.s, z23.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04b076f7 // 
sqrdmulh z23.s, z23.s, z16.s\n" - ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n" - ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n" - ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n" - "tbz %x[flags], #5, 13f\n" - "and z22.d, z23.d, z0.d\n" - "and z21.d, z17.d, z0.d\n" - "and z20.d, z18.d, z0.d\n" - "and z16.d, z19.d, z0.d\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z23.s, z23.s, z22.s\n" - "sqadd z17.s, z17.s, z21.s\n" - "sqadd z18.s, z18.s, z20.s\n" - "sqadd z19.s, z19.s, z16.s\n" - "13:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z21.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z21.s\n" - "add z18.s, z18.s, z21.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z21.s\n" - "smin z23.s, p2/M, z23.s, z20.s\n" - "smin z17.s, p2/M, z17.s, z20.s\n" - "smin z18.s, p2/M, z18.s, z20.s\n" - "smin z19.s, p2/M, z19.s, z20.s\n" - "smax z23.s, p2/M, z23.s, z16.s\n" - "smax z17.s, p2/M, z17.s, z16.s\n" - "smax z18.s, p2/M, z18.s, z16.s\n" - "smax z19.s, p2/M, 
z19.s, z16.s\n" + "add z23.s, z23.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z23.h, z23.h, z17.h\n" - "uzp1 z16.h, z18.h, z19.h\n" - "uzp1 z23.b, z23.b, z16.b\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z17.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "14:" // Height 1: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" "bgt 2b\n" - "b 58f\n" - "15:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 54f\n" + "14:" // Height 2 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "16:" // Height 2: Column loop + "15:" // Height 2: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -316,131 +296,130 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "17:" // Height 2: setup done "mov x26, #0x0\n" - "18:" // Height 2: String loop + "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" + "tbz %x[flags], #3, 18f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, 
#0x8]\n" - "cbnz x26, 20f\n" + "cbnz x26, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 20f\n" - "19:" // Height 2: setup direct input + "b 19f\n" + "18:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "20:" // Height 2: input setup done + "19:" // Height 2: input setup done "cmp x25, #0x10\n" - "ble 23f\n" - "21:" // Height 2: Multiply loop: Main loop head + "ble 22f\n" + "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z25.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z25.d\n" - "trn2 z1.d, z1.d, z25.d\n" - ".inst 0x45df9810 // ummla z16.s, z0.b, z31.b\n" - "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x45de9814 // ummla z20.s, z0.b, z30.b\n" - ".inst 0x45dd9811 // ummla z17.s, z0.b, z29.b\n" - ".inst 0x45dc9815 // ummla z21.s, z0.b, z28.b\n" - ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n" - ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n" - ".inst 0x45d89813 // ummla z19.s, z0.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - 
"ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45de9834 // ummla z20.s, z1.b, z30.b\n" - ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n" - ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n" - ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n" - ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n" - ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" - ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" - "tbnz %x[flags], #31, 22f\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 21f\n" "udot z11.s, 
z0.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" - "22:" // Height 2: Multiply loop: unique 3: skip row sum + "21:" // Height 2: Multiply loop: unique 3: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 21b\n" - "23:" // Height 2: Multiply loop: Single iteration only + "bgt 20b\n" + "22:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z29.b }, p2/Z, [x28]\n" - "ld1b { z28.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z24.b }, p0/Z, [x23]\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z24.d\n" - "trn2 z1.d, z1.d, z24.d\n" - ".inst 0x45dd9810 // ummla z16.s, z0.b, z29.b\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #8\n" - ".inst 0x45dc9814 // ummla z20.s, z0.b, z28.b\n" - ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n" - ".inst 0x45db9815 // ummla z21.s, z0.b, z27.b\n" - ".inst 0x45da9812 // ummla z18.s, z0.b, z26.b\n" - ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n" - ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n" - ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n" - "ble 24f\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 
0x45de9834 // ummla z20.s, z1.b, z30.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45dd9831 // ummla z17.s, z1.b, z29.b\n" - ".inst 0x45dc9835 // ummla z21.s, z1.b, z28.b\n" - ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n" - ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n" - ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" - ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" - "24:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 25f\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ble 23f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "23:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 24f\n" "udot z11.s, z0.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" - "25:" // Height 2: 
Multiply loop: unique 4: skip row sum + "24:" // Height 2: Multiply loop: unique 4: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 18b\n" - "uzp1 z24.d, z16.d, z20.d\n" + "bne 17b\n" + "uzp1 z7.d, z16.d, z20.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" @@ -449,116 +428,90 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x23, x27, x20\n" - "mov z23.d, z24.d\n" - "tbnz %x[flags], #31, 26f\n" + "add x26, x27, x20\n" + "mov z23.d, z7.d\n" + "tbnz %x[flags], #31, 25f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "neg z24.s, p2/M, z24.s\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" + "neg z2.s, p2/M, z2.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z24.s\n" - "mul z12.s, p2/M, z12.s, z24.s\n" - "26:" // Height 2: skip row sum fixup + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "25:" // Height 2: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z28.s }, p2/Z, [x10]\n" - "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z23.s, z23.s, z28.s\n" - "add z20.s, 
z20.s, z27.s\n" - "addvl x10, x10, #4\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z25.s\n" - "add z16.s, z16.s, z28.s\n" - "add z17.s, z17.s, z27.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z18.s, z18.s, z26.s\n" - "add z19.s, z19.s, z25.s\n" - ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" - ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" - ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" - ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" - ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" - ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" - ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" - ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" - "tbz %x[flags], #5, 27f\n" - "and z24.d, z23.d, z0.d\n" - "and z30.d, z20.d, z0.d\n" - "and z29.d, z21.d, z0.d\n" - "and z28.d, z22.d, z0.d\n" - "and z27.d, z16.d, z0.d\n" - "and z26.d, z17.d, z0.d\n" - "asr z24.s, z24.s, #0x1f\n" - "and z25.d, z18.d, z0.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z27.s, z27.s, #0x1f\n" - "sqadd z23.s, z23.s, z24.s\n" - "and z24.d, z19.d, z0.d\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "sqadd z20.s, z20.s, z30.s\n" - "sqadd z21.s, z21.s, z29.s\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z22.s, z22.s, z28.s\n" - "sqadd z16.s, z16.s, z27.s\n" - "sqadd z17.s, z17.s, z26.s\n" - "sqadd z18.s, z18.s, z25.s\n" - "sqadd z19.s, z19.s, z24.s\n" - "27:" // Height 2: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" + "add z23.s, z23.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z18.s, 
z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "addvl x28, x28, #4\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z23.s, z23.s, z26.s\n" + "add z23.s, z23.s, z4.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z20.s, z20.s, z26.s\n" - "add z21.s, z21.s, z26.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z26.s\n" - "add z16.s, z16.s, z26.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z26.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z26.s\n" - "smin z23.s, p2/M, z23.s, z25.s\n" - "smin z20.s, p2/M, z20.s, z25.s\n" - "smin z21.s, p2/M, z21.s, z25.s\n" - "smin z22.s, p2/M, z22.s, z25.s\n" - "smin z16.s, p2/M, z16.s, z25.s\n" - "smin z17.s, p2/M, z17.s, z25.s\n" - "smin z18.s, p2/M, z18.s, z25.s\n" - "smin z19.s, p2/M, z19.s, z25.s\n" - "smax z23.s, p2/M, z23.s, z24.s\n" - "smax z20.s, p2/M, z20.s, z24.s\n" - "smax z21.s, p2/M, z21.s, z24.s\n" - "smax z22.s, p2/M, z22.s, z24.s\n" - "smax z16.s, p2/M, z16.s, z24.s\n" - "smax z17.s, p2/M, z17.s, z24.s\n" - "smax z18.s, p2/M, z18.s, z24.s\n" - "smax z19.s, p2/M, z19.s, z24.s\n" 
+ "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z23.h, z23.h, z20.h\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z16.h, z16.h, z17.h\n" @@ -567,23 +520,22 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "uzp1 z16.b, z16.b, z17.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "st1b { z16.b }, p1, [x23]\n" - "28:" // Height 2: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 16b\n" - "b 58f\n" - "29:" // Height 3 - "mov x10, %x[col_bias]\n" + "st1b { z16.b }, p1, [x26]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "30:" // Height 3: Column loop + "28:" // Height 3: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -591,7 +543,7 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" 
"mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -602,39 +554,38 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "31:" // Height 3: setup done "mov x26, #0x0\n" - "32:" // Height 3: String loop + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 34f\n" + "cbnz x26, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 34f\n" - "33:" // Height 3: setup direct input + "b 32f\n" + "31:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "34:" // Height 3: input setup done + "32:" // Height 3: input setup done "cmp x25, #0x10\n" - "ble 37f\n" - "35:" // Height 3: Multiply loop: Main loop head + "ble 35f\n" + "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" @@ -643,333 +594,294 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "add x22, x22, #0x10\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z6.d\n" 
- "trn2 z3.d, z3.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - ".inst 0x45ca9814 // ummla z20.s, z0.b, z10.b\n" - ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - ".inst 0x45c49812 // ummla z18.s, z0.b, z4.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x45ca985c // ummla z28.s, z2.b, z10.b\n" - ".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" - ".inst 0x45c4985a // ummla z26.s, z2.b, z4.b\n" - ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n" - ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, 
#-4, MUL VL]\n" ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" - ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n" - ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n" - ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n" - ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n" - ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" - ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" - ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" - ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" - ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" - ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" - ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" - "tbnz %x[flags], #31, 36f\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 34f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, 
z3.b, z15.b\n" - "36:" // Height 3: Multiply loop: unique 5: skip row sum + "34:" // Height 3: Multiply loop: unique 5: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 35b\n" - "37:" // Height 3: Multiply loop: Single iteration only + "bgt 33b\n" + "35:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z5.d\n" - "trn2 z3.d, z3.d, z5.d\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n" - ".inst 0x45ca9814 // ummla z20.s, z0.b, z10.b\n" - ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n" - ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45ca985c // ummla z28.s, z2.b, z10.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" 
+ ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" - ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n" - ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n" - ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n" - ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n" - ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n" - ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n" - ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n" - "ble 38f\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" - ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n" - ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n" - ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n" - ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" - ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" - ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" - ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" - ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" - ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" + ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" + ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + "ble 36f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + 
"ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" - ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" - "38:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 39f\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "36:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 37f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 6: skip row sum + "37:" // Height 3: Multiply loop: unique 6: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 32b\n" + "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z0.d, z16.d, z20.d\n" + "uzp1 z7.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x27, x20\n" + "add x26, x27, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 
z19.d, z19.d, z23.d\n" - "add x22, x23, x20\n" + "add x25, x26, x20\n" "uzp1 z24.d, z24.d, z28.d\n" "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z0.d\n" - "tbnz %x[flags], #31, 40f\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 38f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "neg z23.s, p2/M, z23.s\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" + "neg z3.s, p2/M, z3.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" "mov z13.s, z13.s[0]\n" - "mul z11.s, p2/M, z11.s, z23.s\n" - "mul z12.s, p2/M, z12.s, z23.s\n" - "mul z13.s, p2/M, z13.s, z23.s\n" - "40:" // Height 3: skip row sum fixup + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "38:" // Height 3: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, 
[x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z31.s, z31.s, z0.s\n" - "add z20.s, z20.s, z30.s\n" - "add z21.s, z21.s, z29.s\n" - "add z22.s, z22.s, z28.s\n" + "add z20.s, z20.s, z1.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z29.s\n" - "add z19.s, z19.s, z28.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z30.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z29.s\n" - "add z27.s, z27.s, z28.s\n" - ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" - ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" - ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n" - ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" - ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n" - ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n" - ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n" - ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" - ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n" - ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n" - ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n" - ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n" - "tbz %x[flags], #5, 41f\n" - "and z1.d, z31.d, z0.d\n" - "and z30.d, z20.d, z0.d\n" - "and z29.d, z21.d, z0.d\n" - "and z28.d, z22.d, z0.d\n" - "and z23.d, z16.d, z0.d\n" - "and z3.d, z17.d, z0.d\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "and z2.d, z18.d, z0.d\n" - "sqadd z31.s, z31.s, z1.s\n" - "sqadd z20.s, z20.s, z30.s\n" - "sqadd z21.s, z21.s, z29.s\n" - "sqadd z22.s, z22.s, z28.s\n" - "sqadd z16.s, z16.s, z23.s\n" - "and z1.d, z19.d, z0.d\n" - "and z30.d, z24.d, z0.d\n" - "and z29.d, z25.d, z0.d\n" - "and z28.d, z26.d, z0.d\n" - "and z23.d, 
z27.d, z0.d\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z17.s, z17.s, z3.s\n" - "sqadd z18.s, z18.s, z2.s\n" - "sqadd z19.s, z19.s, z1.s\n" - "sqadd z24.s, z24.s, z30.s\n" - "sqadd z25.s, z25.s, z29.s\n" - "sqadd z26.s, z26.s, z28.s\n" - "sqadd z27.s, z27.s, z23.s\n" - "41:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z31.s, z31.s, z29.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z20.s, z20.s, z29.s\n" - "add z21.s, z21.s, z29.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a 
// srshl z26.s, p2/M, z26.s, z0.s\n" - "add z22.s, z22.s, z29.s\n" - "add z16.s, z16.s, z29.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z29.s\n" - "add z18.s, z18.s, z29.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z29.s\n" - "add z24.s, z24.s, z29.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z29.s\n" - "add z26.s, z26.s, z29.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z29.s\n" - "smin z31.s, p2/M, z31.s, z28.s\n" - "smin z20.s, p2/M, z20.s, z28.s\n" - "smin z21.s, p2/M, z21.s, z28.s\n" - "smin z22.s, p2/M, z22.s, z28.s\n" - "smin z16.s, p2/M, z16.s, z28.s\n" - "smin z17.s, p2/M, z17.s, z28.s\n" - "smin z18.s, p2/M, z18.s, z28.s\n" - "smin z19.s, p2/M, z19.s, z28.s\n" - "smin z24.s, p2/M, z24.s, z28.s\n" - "smin z25.s, p2/M, z25.s, z28.s\n" - "smin z26.s, p2/M, z26.s, z28.s\n" - "smin z27.s, p2/M, z27.s, z28.s\n" - "smax z31.s, p2/M, z31.s, z23.s\n" - "smax z20.s, p2/M, z20.s, z23.s\n" - "smax z21.s, p2/M, z21.s, z23.s\n" - "smax z22.s, p2/M, z22.s, z23.s\n" - "smax z16.s, p2/M, z16.s, z23.s\n" - "smax z17.s, p2/M, z17.s, z23.s\n" - "smax z18.s, p2/M, z18.s, z23.s\n" - "smax z19.s, p2/M, z19.s, z23.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, 
p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z24.s, p2/M, z24.s, z23.s\n" - "smax z25.s, p2/M, z25.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z26.s, p2/M, z26.s, z23.s\n" - "smax z27.s, p2/M, z27.s, z23.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "uzp1 z16.b, z16.b, z18.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z16.b, z16.b, z17.b\n" "st1b { z31.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" - "st1b { z24.b }, p1, [x22]\n" - "42:" // Height 3: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 30b\n" - "b 58f\n" - "43:" // Height 4 + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z14.s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "mov z15.b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, 
x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "44:" // Height 4: Column loop + "41:" // Height 4: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -977,7 +889,7 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -988,191 +900,190 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "45:" // Height 4: setup done "mov x26, #0x0\n" - "46:" // Height 4: String loop + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 47f\n" + "tbz %x[flags], #3, 44f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 48f\n" + "cbnz x26, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 48f\n" - "47:" // Height 4: setup direct input + "b 45f\n" + "44:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "48:" // Height 4: input setup done + "45:" // Height 4: input setup done "cmp x25, #0x10\n" - "ble 51f\n" - "49:" // Height 4: Multiply loop: Main loop head + "ble 48f\n" + "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL 
VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z6.d\n" - "trn2 z3.d, z3.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n" - ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" - ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n" - ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n" + ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" - ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x45c6985b // ummla 
z27.s, z2.b, z6.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" - ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n" - ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n" - ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n" - ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n" - ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" - ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" - ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" - ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" - ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" - ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" ".inst 0x45c59833 // ummla 
z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" - ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" - "tbnz %x[flags], #31, 50f\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 47f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "50:" // Height 4: Multiply loop: unique 7: skip row sum + "47:" // Height 4: Multiply loop: unique 7: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 49b\n" - "51:" // Height 4: Multiply loop: Single iteration only + "bgt 46b\n" + "48:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z5.b }, p0/Z, [x21]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z5.d\n" - "trn2 z3.d, z3.d, z5.d\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45c69810 // ummla z16.s, z0.b, z6.b\n" - ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" + ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - ".inst 
0x45c99815 // ummla z21.s, z0.b, z9.b\n" - ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n" - ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n" - "addvl x28, x28, #8\n" + ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" + ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" + "addvl x9, x9, #8\n" ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" - ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n" - ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n" + ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" - ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n" - ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n" - ".inst 0x45c69817 // ummla z23.s, z0.b, z6.b\n" - ".inst 0x45c6985f // ummla z31.s, z2.b, z6.b\n" - "ble 52f\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" - ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n" - ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n" - ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n" - ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" - ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" - ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" - ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" - ".inst 0x45c69836 // ummla z22.s, 
z1.b, z6.b\n" - ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" + ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" + ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + "ble 49f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" + ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" + ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" + ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" + ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" + ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" - ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" - "52:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 53f\n" + ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + "49:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "53:" // Height 4: Multiply loop: unique 8: skip row sum + "50:" // Height 4: Multiply loop: unique 8: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, 
#0x1\n" "cmp x26, x20\n" - "bne 46b\n" + "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z0.d, z16.d, z20.d\n" + "uzp1 z7.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x27, x20\n" - "add x22, x23, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x21, x22, x20\n" + "add x24, x25, x20\n" "uzp1 z23.d, z24.d, z28.d\n" "uzp2 z24.d, z24.d, z28.d\n" "uzp1 z28.d, z25.d, z29.d\n" @@ -1181,233 +1092,182 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z0.d\n" - "tbnz %x[flags], #31, 54f\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 51f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "neg z0.s, p2/M, z0.s\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "neg z4.s, p2/M, z4.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" "mov z14.s, z13.s[3]\n" "mov z13.s, z13.s[0]\n" - "mul z11.s, p2/M, z11.s, z0.s\n" - "mul z12.s, p2/M, z12.s, z0.s\n" - "mul z13.s, p2/M, z13.s, z0.s\n" - "mul z14.s, p2/M, z14.s, z0.s\n" - "54:" // Height 4: skip row sum fixup + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "51:" // Height 4: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z4.s }, p2/Z, [x10]\n" - "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, 
p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z23.s, z23.s, z13.s\n" "add z28.s, z28.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z29.s, z29.s, z13.s\n" "add z30.s, z30.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z24.s, z24.s, z14.s\n" "add z25.s, z25.s, z14.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" "add z26.s, z26.s, z14.s\n" "add z27.s, z27.s, z14.s\n" - "add z31.s, z31.s, z4.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z3.s\n" - "add z22.s, z22.s, z2.s\n" - "add z16.s, z16.s, z4.s\n" - "add z17.s, z17.s, z0.s\n" - "add z18.s, z18.s, z3.s\n" - "add z19.s, z19.s, z2.s\n" - "add z23.s, z23.s, z4.s\n" - "add z28.s, z28.s, z0.s\n" - "add z29.s, z29.s, z3.s\n" - "add z30.s, z30.s, z2.s\n" - "add z24.s, z24.s, z4.s\n" - "add z25.s, z25.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z3.s\n" - "add z27.s, z27.s, z2.s\n" - ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" - ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" - ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" - ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" - ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" - ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" - ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" - ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" - ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" - ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" - ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" - ".inst 0x04a177de // sqrdmulh 
z30.s, z30.s, z1.s\n" - ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" - ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" - ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" - ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" - "tbz %x[flags], #5, 55f\n" - "and z2.d, z31.d, z0.d\n" - "and z1.d, z20.d, z0.d\n" - "and z7.d, z21.d, z0.d\n" - "and z6.d, z22.d, z0.d\n" - "and z5.d, z16.d, z0.d\n" - "and z4.d, z17.d, z0.d\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "and z3.d, z18.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z31.s, z31.s, z2.s\n" - "sqadd z20.s, z20.s, z1.s\n" - "and z2.d, z19.d, z0.d\n" - "and z1.d, z23.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "sqadd z21.s, z21.s, z7.s\n" - "sqadd z22.s, z22.s, z6.s\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z16.s, z16.s, z5.s\n" - "sqadd z17.s, z17.s, z4.s\n" - "sqadd z18.s, z18.s, z3.s\n" - "and z7.d, z28.d, z0.d\n" - "sqadd z19.s, z19.s, z2.s\n" - "sqadd z23.s, z23.s, z1.s\n" - "and z6.d, z29.d, z0.d\n" - "and z5.d, z30.d, z0.d\n" - "and z4.d, z24.d, z0.d\n" - "and z3.d, z25.d, z0.d\n" - "and z2.d, z26.d, z0.d\n" - "and z1.d, z27.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z28.s, z28.s, z7.s\n" - "sqadd z29.s, z29.s, z6.s\n" - "sqadd z30.s, z30.s, z5.s\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z3.s\n" - "sqadd z26.s, z26.s, z2.s\n" - "sqadd z27.s, z27.s, z1.s\n" - "55:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z31.s, z31.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + 
"add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x04a473bd // sqdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a473de // sqdmulh z30.s, z30.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z31.s, z31.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z20.s, z20.s, z2.s\n" - "add z21.s, z21.s, z2.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" - "add z22.s, z22.s, z2.s\n" - "add z16.s, z16.s, z2.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, 
z24.s, z0.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z2.s\n" - "add z23.s, z23.s, z2.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z28.s, z28.s, z2.s\n" - "add z29.s, z29.s, z2.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z2.s\n" - "add z24.s, z24.s, z2.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z2.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z2.s\n" - "smin z31.s, p2/M, z31.s, z1.s\n" - "smin z20.s, p2/M, z20.s, z1.s\n" - "smin z21.s, p2/M, z21.s, z1.s\n" - "smin z22.s, p2/M, z22.s, z1.s\n" - "smin z16.s, p2/M, z16.s, z1.s\n" - "smin z17.s, p2/M, z17.s, z1.s\n" - "smin z18.s, p2/M, z18.s, z1.s\n" - "smin z19.s, p2/M, z19.s, z1.s\n" - "smin z23.s, p2/M, z23.s, z1.s\n" - "smin z28.s, p2/M, z28.s, z1.s\n" - "smin z29.s, p2/M, z29.s, z1.s\n" - "smin z30.s, p2/M, z30.s, z1.s\n" - "smin z24.s, p2/M, z24.s, z1.s\n" - "smin z25.s, p2/M, z25.s, z1.s\n" - "smin z26.s, p2/M, z26.s, z1.s\n" - "smin z27.s, p2/M, z27.s, z1.s\n" - "smax z31.s, p2/M, z31.s, z0.s\n" - "smax z20.s, p2/M, z20.s, z0.s\n" - "smax z21.s, p2/M, z21.s, z0.s\n" - "smax z22.s, p2/M, z22.s, z0.s\n" - "smax z16.s, p2/M, z16.s, z0.s\n" - "smax z17.s, p2/M, z17.s, z0.s\n" - "smax z18.s, p2/M, z18.s, z0.s\n" - "smax z19.s, p2/M, z19.s, z0.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "smin 
z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z23.s, p2/M, z23.s, z0.s\n" - "smax z28.s, p2/M, z28.s, z0.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z29.s, p2/M, z29.s, z0.s\n" - "smax z30.s, p2/M, z30.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z0.s\n" - "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z0.s\n" - "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z23.h, z23.h, z28.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "uzp1 z18.h, z29.h, z30.h\n" + "uzp1 z28.h, z29.h, z30.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" + "uzp1 z25.h, z26.h, z27.h\n" "st1b { z31.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z23.b, z23.b, z18.b\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" - "st1b { z23.b }, p1, [x22]\n" - "st1b { z24.b }, p1, [x21]\n" - "56:" // Height 4: 
Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 44b\n" + "uzp1 z23.b, z23.b, z28.b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z23.b }, p1, [x25]\n" + "st1b { z24.b }, p1, [x24]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" "subs %x[M], %x[M], #0x4\n" - "beq 58f\n" + "beq 54f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "57:" // Update direct input + "53:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "58:" // Exit + "54:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", 
"z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp index dc3b7ef3ec..7e78af961b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_dot_4x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,23 +73,20 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 43f\n" + "bge 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "mov x10, %x[col_bias]\n" + "bgt 27f\n" + "beq 14f\n" "mov z11.s, #0x0\n" "mov z15.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "mov x20, #0x0\n" @@ -98,8 +94,7 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "whilelt p1.b, x20, x9\n" - "3:" // Height 1: setup done + "whilelt p1.b, x20, x10\n" "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -120,41 +115,41 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "ble 9f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z21.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z23.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, 
[x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z22.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x44a01eb0 // sudot z16.s, z21.b, z0.b[0]\n" - "ld1b { z21.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x44a01f51 // sudot z17.s, z26.b, z0.b[0]\n" - ".inst 0x44a01f32 // sudot z18.s, z25.b, z0.b[0]\n" - ".inst 0x44a01f13 // sudot z19.s, z24.b, z0.b[0]\n" - ".inst 0x44a81e90 // sudot z16.s, z20.b, z0.b[1]\n" - "ld1b { z20.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x44a81ef1 // sudot z17.s, z23.b, z0.b[1]\n" - "ld1b { z23.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x44a81ed2 // sudot z18.s, z22.b, z0.b[1]\n" - "ld1b { z22.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x44a81eb3 // sudot z19.s, z21.b, z0.b[1]\n" - "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x44b01e90 // sudot z16.s, z20.b, z0.b[2]\n" - "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x44b01f51 // sudot z17.s, z26.b, z0.b[2]\n" - ".inst 0x44b01f32 // sudot z18.s, z25.b, z0.b[2]\n" - ".inst 0x44b01f13 // sudot z19.s, z24.b, z0.b[2]\n" - ".inst 0x44b81ef0 // sudot z16.s, z23.b, z0.b[3]\n" - ".inst 0x44b81ed1 // sudot z17.s, z22.b, z0.b[3]\n" - ".inst 0x44b81eb2 // sudot z18.s, z21.b, z0.b[3]\n" - ".inst 0x44b81e93 // sudot z19.s, z20.b, z0.b[3]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + ".inst 0x44a81d10 // sudot 
z16.s, z8.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum @@ -163,49 +158,49 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "bgt 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - ".inst 0x44a01ef0 // sudot z16.s, z23.b, z0.b[0]\n" - ".inst 0x44a01ed1 // sudot z17.s, z22.b, z0.b[0]\n" - ".inst 0x44a01eb2 // sudot z18.s, z21.b, z0.b[0]\n" - ".inst 0x44a01e93 // sudot z19.s, z20.b, z0.b[0]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + 
".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44a81ef0 // sudot z16.s, z23.b, z0.b[1]\n" - ".inst 0x44a81ed1 // sudot z17.s, z22.b, z0.b[1]\n" - ".inst 0x44a81eb2 // sudot z18.s, z21.b, z0.b[1]\n" - ".inst 0x44a81e93 // sudot z19.s, z20.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44b01ef0 // sudot z16.s, z23.b, z0.b[2]\n" - ".inst 0x44b01ed1 // sudot z17.s, z22.b, z0.b[2]\n" - ".inst 0x44b01eb2 // sudot z18.s, z21.b, z0.b[2]\n" - ".inst 0x44b01e93 // sudot z19.s, z20.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" "ble 10f\n" - "ld1b { z23.b }, p2/Z, [x28]\n" - "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, 
x28, #4\n" - ".inst 0x44b81ef0 // sudot z16.s, z23.b, z0.b[3]\n" - ".inst 0x44b81ed1 // sudot z17.s, z22.b, z0.b[3]\n" - ".inst 0x44b81eb2 // sudot z18.s, z21.b, z0.b[3]\n" - ".inst 0x44b81e93 // sudot z19.s, z20.b, z0.b[3]\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" "udot z11.s, z0.b, z15.b\n" @@ -217,91 +212,76 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "tbnz %x[flags], #31, 12f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z20.s, p2/M, z20.s\n" + "neg z1.s, p2/M, z1.s\n" "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z20.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z23.s }, p2/Z, [x10]\n" - "ld1w { z20.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z22.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "add z16.s, z16.s, z23.s\n" - "add z17.s, z17.s, z20.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z22.s\n" - "add z19.s, z19.s, z21.s\n" + "add z16.s, 
z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n" - ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n" - ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n" - ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n" - "tbz %x[flags], #5, 13f\n" - "and z23.d, z16.d, z0.d\n" - "and z22.d, z17.d, z0.d\n" - "and z21.d, z18.d, z0.d\n" - "and z20.d, z19.d, z0.d\n" - "asr z23.s, z23.s, #0x1f\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "sqadd z16.s, z16.s, z23.s\n" - "sqadd z17.s, z17.s, z22.s\n" - "sqadd z18.s, z18.s, z21.s\n" - "sqadd z19.s, z19.s, z20.s\n" - "13:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z22.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" - "add z16.s, z16.s, z22.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z22.s\n" - "add z18.s, z18.s, z22.s\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z22.s\n" - "smin z16.s, p2/M, z16.s, z21.s\n" - "smin z17.s, p2/M, z17.s, z21.s\n" - "smin z18.s, p2/M, z18.s, z21.s\n" - "smin z19.s, p2/M, z19.s, z21.s\n" 
- "smax z16.s, p2/M, z16.s, z20.s\n" - "smax z17.s, p2/M, z17.s, z20.s\n" - "smax z18.s, p2/M, z18.s, z20.s\n" - "smax z19.s, p2/M, z19.s, z20.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "14:" // Height 1: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" "bgt 2b\n" - "b 58f\n" - "15:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 54f\n" + "14:" // Height 2 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "16:" // Height 2: Column loop + "15:" // Height 2: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -309,302 +289,274 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "17:" // Height 2: setup done "mov x26, #0x0\n" - "18:" // Height 2: String loop + "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" + "tbz %x[flags], #3, 18f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add 
x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 20f\n" + "cbnz x26, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 20f\n" - "19:" // Height 2: setup direct input + "b 19f\n" + "18:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "20:" // Height 2: input setup done + "19:" // Height 2: input setup done "cmp x25, #0x10\n" - "ble 23f\n" - "21:" // Height 2: Multiply loop: Main loop head + "ble 22f\n" + "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z25.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x44a01f30 // sudot z16.s, z25.b, z0.b[0]\n" - ".inst 0x44a11f34 // sudot z20.s, z25.b, z1.b[0]\n" - "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x44a01fd1 // sudot z17.s, z30.b, z0.b[0]\n" - ".inst 0x44a11fd5 // sudot z21.s, z30.b, z1.b[0]\n" - ".inst 0x44a01fb2 // sudot z18.s, z29.b, z0.b[0]\n" - ".inst 0x44a11fb6 // sudot z22.s, z29.b, z1.b[0]\n" - ".inst 0x44a01f93 // sudot z19.s, z28.b, z0.b[0]\n" - ".inst 0x44a11f97 // sudot z23.s, z28.b, z1.b[0]\n" - ".inst 0x44a81f10 // sudot z16.s, z24.b, z0.b[1]\n" - ".inst 0x44a91f14 // sudot z20.s, z24.b, z1.b[1]\n" - "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL 
VL]\n" - ".inst 0x44a81f71 // sudot z17.s, z27.b, z0.b[1]\n" - ".inst 0x44a91f75 // sudot z21.s, z27.b, z1.b[1]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x44a81f52 // sudot z18.s, z26.b, z0.b[1]\n" - ".inst 0x44a91f56 // sudot z22.s, z26.b, z1.b[1]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x44a81f33 // sudot z19.s, z25.b, z0.b[1]\n" - ".inst 0x44a91f37 // sudot z23.s, z25.b, z1.b[1]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x44b01f10 // sudot z16.s, z24.b, z0.b[2]\n" - ".inst 0x44b11f14 // sudot z20.s, z24.b, z1.b[2]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x44b01fd1 // sudot z17.s, z30.b, z0.b[2]\n" - ".inst 0x44b11fd5 // sudot z21.s, z30.b, z1.b[2]\n" - ".inst 0x44b01fb2 // sudot z18.s, z29.b, z0.b[2]\n" - ".inst 0x44b11fb6 // sudot z22.s, z29.b, z1.b[2]\n" - ".inst 0x44b01f93 // sudot z19.s, z28.b, z0.b[2]\n" - ".inst 0x44b11f97 // sudot z23.s, z28.b, z1.b[2]\n" - ".inst 0x44b81f70 // sudot z16.s, z27.b, z0.b[3]\n" - ".inst 0x44b91f74 // sudot z20.s, z27.b, z1.b[3]\n" - ".inst 0x44b81f51 // sudot z17.s, z26.b, z0.b[3]\n" - ".inst 0x44b91f55 // sudot z21.s, z26.b, z1.b[3]\n" - ".inst 0x44b81f32 // sudot z18.s, z25.b, z0.b[3]\n" - ".inst 0x44b91f36 // sudot z22.s, z25.b, z1.b[3]\n" - ".inst 0x44b81f13 // sudot z19.s, z24.b, z0.b[3]\n" - ".inst 0x44b91f17 // sudot z23.s, z24.b, z1.b[3]\n" - "tbnz %x[flags], #31, 22f\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a11c94 // sudot z20.s, z4.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a11cb5 // sudot z21.s, z5.b, z1.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a11cd6 // sudot z22.s, z6.b, z1.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + 
".inst 0x44a11cf7 // sudot z23.s, z7.b, z1.b[0]\n" + ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" + ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a91d35 // sudot z21.s, z9.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + ".inst 0x44a91d56 // sudot z22.s, z10.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" + ".inst 0x44a91c97 // sudot z23.s, z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + ".inst 0x44b11cb4 // sudot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b11cf6 // sudot z22.s, z7.b, z1.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" + ".inst 0x44b11d17 // sudot z23.s, z8.b, z1.b[2]\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b91d34 // sudot z20.s, z9.b, z1.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b91d55 // sudot z21.s, z10.b, z1.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b91c96 // sudot z22.s, z4.b, z1.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" + ".inst 0x44b91cb7 // sudot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 21f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" - "22:" // Height 2: Multiply loop: unique 3: skip row sum + "21:" // Height 2: Multiply loop: unique 3: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 21b\n" - "23:" // Height 2: Multiply loop: Single 
iteration only + "bgt 20b\n" + "22:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" - ".inst 0x44a01f70 // sudot z16.s, z27.b, z0.b[0]\n" - ".inst 0x44a11f74 // sudot z20.s, z27.b, z1.b[0]\n" - ".inst 0x44a01f51 // sudot z17.s, z26.b, z0.b[0]\n" - ".inst 0x44a11f55 // sudot z21.s, z26.b, z1.b[0]\n" - ".inst 0x44a01f32 // sudot z18.s, z25.b, z0.b[0]\n" - ".inst 0x44a11f36 // sudot z22.s, z25.b, z1.b[0]\n" - ".inst 0x44a01f13 // sudot z19.s, z24.b, z0.b[0]\n" - ".inst 0x44a11f17 // sudot z23.s, z24.b, z1.b[0]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a11c94 // sudot z20.s, z4.b, z1.b[0]\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a11cb5 // sudot z21.s, z5.b, z1.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a11cd6 // sudot z22.s, z6.b, z1.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + ".inst 0x44a11cf7 // sudot z23.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44a81f70 // sudot z16.s, z27.b, z0.b[1]\n" - ".inst 0x44a91f74 // sudot z20.s, z27.b, z1.b[1]\n" - ".inst 0x44a81f51 // sudot z17.s, z26.b, z0.b[1]\n" - ".inst 0x44a91f55 // sudot z21.s, z26.b, z1.b[1]\n" - ".inst 
0x44a81f32 // sudot z18.s, z25.b, z0.b[1]\n" - ".inst 0x44a91f36 // sudot z22.s, z25.b, z1.b[1]\n" - ".inst 0x44a81f13 // sudot z19.s, z24.b, z0.b[1]\n" - ".inst 0x44a91f17 // sudot z23.s, z24.b, z1.b[1]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" + ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a91d35 // sudot z21.s, z9.b, z1.b[1]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + ".inst 0x44a91d56 // sudot z22.s, z10.b, z1.b[1]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" + ".inst 0x44a91c97 // sudot z23.s, z4.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44b01f70 // sudot z16.s, z27.b, z0.b[2]\n" - ".inst 0x44b11f74 // sudot z20.s, z27.b, z1.b[2]\n" - ".inst 0x44b01f51 // sudot z17.s, z26.b, z0.b[2]\n" - ".inst 0x44b11f55 // sudot z21.s, z26.b, z1.b[2]\n" - ".inst 0x44b01f32 // sudot z18.s, z25.b, z0.b[2]\n" - ".inst 0x44b11f36 // sudot z22.s, z25.b, z1.b[2]\n" - ".inst 0x44b01f13 // sudot z19.s, z24.b, z0.b[2]\n" - ".inst 0x44b11f17 // sudot z23.s, z24.b, z1.b[2]\n" - "ble 24f\n" - "ld1b { z27.b }, p2/Z, [x28]\n" - "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44b81f70 // sudot z16.s, z27.b, z0.b[3]\n" - ".inst 0x44b91f74 // sudot z20.s, z27.b, z1.b[3]\n" - ".inst 0x44b81f51 // sudot z17.s, z26.b, z0.b[3]\n" - ".inst 0x44b91f55 // sudot z21.s, z26.b, z1.b[3]\n" - ".inst 0x44b81f32 // sudot z18.s, z25.b, z0.b[3]\n" - ".inst 
0x44b91f36 // sudot z22.s, z25.b, z1.b[3]\n" - ".inst 0x44b81f13 // sudot z19.s, z24.b, z0.b[3]\n" - ".inst 0x44b91f17 // sudot z23.s, z24.b, z1.b[3]\n" - "24:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 25f\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + ".inst 0x44b11cb4 // sudot z20.s, z5.b, z1.b[2]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b11cf6 // sudot z22.s, z7.b, z1.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" + ".inst 0x44b11d17 // sudot z23.s, z8.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b91d34 // sudot z20.s, z9.b, z1.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b91d55 // sudot z21.s, z10.b, z1.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b91c96 // sudot z22.s, z4.b, z1.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" + ".inst 0x44b91cb7 // sudot z23.s, z5.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 24f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" - "25:" // Height 2: Multiply loop: unique 4: skip row sum + "24:" // Height 2: Multiply loop: unique 4: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 18b\n" + "bne 17b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "tbnz %x[flags], #31, 26f\n" + "add x26, x27, x20\n" + "tbnz %x[flags], #31, 25f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { 
z24.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z24.s, p2/M, z24.s\n" + "neg z2.s, p2/M, z2.s\n" "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z11.s, p2/M, z11.s, z2.s\n" "mov z12.s, z12.s[0]\n" - "mul z12.s, p2/M, z12.s, z24.s\n" - "26:" // Height 2: skip row sum fixup + "mul z12.s, p2/M, z12.s, z2.s\n" + "25:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z28.s }, p2/Z, [x10]\n" - "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z16.s, z16.s, z28.s\n" - "add z17.s, z17.s, z27.s\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z26.s\n" - "add z19.s, z19.s, z25.s\n" - "add z20.s, z20.s, z28.s\n" - "add z21.s, z21.s, z27.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z26.s\n" - "add z23.s, z23.s, z25.s\n" - ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" - ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" - ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" - ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" - ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" - ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" - ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" - ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" - "tbz %x[flags], #5, 
27f\n" - "and z24.d, z16.d, z0.d\n" - "and z30.d, z17.d, z0.d\n" - "and z29.d, z18.d, z0.d\n" - "and z28.d, z19.d, z0.d\n" - "and z27.d, z20.d, z0.d\n" - "and z26.d, z21.d, z0.d\n" - "asr z24.s, z24.s, #0x1f\n" - "and z25.d, z22.d, z0.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z27.s, z27.s, #0x1f\n" - "sqadd z16.s, z16.s, z24.s\n" - "and z24.d, z23.d, z0.d\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "sqadd z17.s, z17.s, z30.s\n" - "sqadd z18.s, z18.s, z29.s\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z19.s, z19.s, z28.s\n" - "sqadd z20.s, z20.s, z27.s\n" - "sqadd z21.s, z21.s, z26.s\n" - "sqadd z22.s, z22.s, z25.s\n" - "sqadd z23.s, z23.s, z24.s\n" - "27:" // Height 2: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "addvl x28, x28, #4\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, 
z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z26.s\n" + "add z16.s, z16.s, z4.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z26.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z26.s\n" - "add z20.s, z20.s, z26.s\n" - "add x20, %x[qp], %[minval]\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z26.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z26.s\n" - "smin z16.s, p2/M, z16.s, z25.s\n" - "smin z17.s, p2/M, z17.s, z25.s\n" - "smin z18.s, p2/M, z18.s, z25.s\n" - "smin z19.s, p2/M, z19.s, z25.s\n" - "smin z20.s, p2/M, z20.s, z25.s\n" - "smin z21.s, p2/M, z21.s, z25.s\n" - "smin z22.s, p2/M, z22.s, z25.s\n" - "smin z23.s, p2/M, z23.s, z25.s\n" - "smax z16.s, p2/M, z16.s, z24.s\n" - "smax z17.s, p2/M, z17.s, z24.s\n" - "smax z18.s, p2/M, z18.s, z24.s\n" - "smax z19.s, p2/M, z19.s, z24.s\n" - "smax z20.s, p2/M, z20.s, z24.s\n" - "smax z21.s, p2/M, z21.s, z24.s\n" - "smax z22.s, p2/M, z22.s, z24.s\n" - "smax z23.s, p2/M, z23.s, z24.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax 
z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z17.h, z22.h, z23.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z20.b, z20.b, z17.b\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z20.b, z20.b, z21.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "st1b { z20.b }, p1, [x24]\n" - "28:" // Height 2: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 16b\n" - "b 58f\n" - "29:" // Height 3 - "mov x10, %x[col_bias]\n" + "st1b { z20.b }, p1, [x26]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "30:" // Height 3: Column loop + "28:" // Height 3: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -612,399 +564,359 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "31:" // Height 3: setup done "mov x26, #0x0\n" - "32:" // Height 3: String loop + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, 
[x20, #0x10]\n" - "cbnz x26, 34f\n" + "cbnz x26, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 34f\n" - "33:" // Height 3: setup direct input + "b 32f\n" + "31:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "34:" // Height 3: input setup done + "32:" // Height 3: input setup done "cmp x25, #0x10\n" - "ble 37f\n" - "35:" // Height 3: Multiply loop: Main loop head + "ble 35f\n" + "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" "add x22, x22, #0x10\n" - ".inst 0x44a01cb0 // sudot z16.s, z5.b, z0.b[0]\n" - ".inst 0x44a11cb4 // sudot z20.s, z5.b, z1.b[0]\n" - ".inst 0x44a01fb1 // sudot z17.s, z29.b, z0.b[0]\n" - ".inst 0x44a11fb5 // sudot z21.s, z29.b, z1.b[0]\n" - ".inst 0x44a01c92 // sudot z18.s, z4.b, z0.b[0]\n" - ".inst 0x44a21cb8 // sudot z24.s, z5.b, z2.b[0]\n" - ".inst 0x44a21fb9 // sudot z25.s, z29.b, z2.b[0]\n" - "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x44a11c96 // sudot z22.s, z4.b, z1.b[0]\n" - ".inst 0x44a21c9a // sudot z26.s, z4.b, z2.b[0]\n" - ".inst 0x44a01f93 // sudot z19.s, z28.b, z0.b[0]\n" - ".inst 
0x44a11f97 // sudot z23.s, z28.b, z1.b[0]\n" - ".inst 0x44a21f9b // sudot z27.s, z28.b, z2.b[0]\n" - ".inst 0x44a81c70 // sudot z16.s, z3.b, z0.b[1]\n" - "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x44a91c74 // sudot z20.s, z3.b, z1.b[1]\n" - ".inst 0x44aa1c78 // sudot z24.s, z3.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x44a81ff1 // sudot z17.s, z31.b, z0.b[1]\n" - ".inst 0x44a91ff5 // sudot z21.s, z31.b, z1.b[1]\n" - ".inst 0x44aa1ff9 // sudot z25.s, z31.b, z2.b[1]\n" - ".inst 0x44a81fd2 // sudot z18.s, z30.b, z0.b[1]\n" - "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x44a91fd6 // sudot z22.s, z30.b, z1.b[1]\n" - ".inst 0x44aa1fda // sudot z26.s, z30.b, z2.b[1]\n" - "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x44a81fb3 // sudot z19.s, z29.b, z0.b[1]\n" - ".inst 0x44a91fb7 // sudot z23.s, z29.b, z1.b[1]\n" - ".inst 0x44aa1fbb // sudot z27.s, z29.b, z2.b[1]\n" - ".inst 0x44b01f90 // sudot z16.s, z28.b, z0.b[2]\n" - "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x44b11f94 // sudot z20.s, z28.b, z1.b[2]\n" - ".inst 0x44b21f98 // sudot z24.s, z28.b, z2.b[2]\n" - "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x44b01cb1 // sudot z17.s, z5.b, z0.b[2]\n" - ".inst 0x44b11cb5 // sudot z21.s, z5.b, z1.b[2]\n" - ".inst 0x44b21cb9 // sudot z25.s, z5.b, z2.b[2]\n" - ".inst 0x44b01c92 // sudot z18.s, z4.b, z0.b[2]\n" - ".inst 0x44b11c96 // sudot z22.s, z4.b, z1.b[2]\n" - ".inst 0x44b21c9a // sudot z26.s, z4.b, z2.b[2]\n" - ".inst 0x44b01c73 // sudot z19.s, z3.b, z0.b[2]\n" - ".inst 0x44b11c77 // sudot z23.s, z3.b, z1.b[2]\n" - ".inst 0x44b21c7b // sudot z27.s, z3.b, z2.b[2]\n" - ".inst 0x44b81ff0 // sudot z16.s, z31.b, z0.b[3]\n" - ".inst 0x44b91ff4 // sudot z20.s, z31.b, z1.b[3]\n" - ".inst 0x44ba1ff8 // sudot z24.s, z31.b, z2.b[3]\n" - ".inst 0x44b81fd1 // sudot z17.s, z30.b, z0.b[3]\n" - ".inst 0x44b91fd5 // 
sudot z21.s, z30.b, z1.b[3]\n" - ".inst 0x44ba1fd9 // sudot z25.s, z30.b, z2.b[3]\n" - ".inst 0x44b81fb2 // sudot z18.s, z29.b, z0.b[3]\n" - ".inst 0x44b91fb6 // sudot z22.s, z29.b, z1.b[3]\n" - ".inst 0x44ba1fba // sudot z26.s, z29.b, z2.b[3]\n" - ".inst 0x44b81f93 // sudot z19.s, z28.b, z0.b[3]\n" - ".inst 0x44b91f97 // sudot z23.s, z28.b, z1.b[3]\n" - ".inst 0x44ba1f9b // sudot z27.s, z28.b, z2.b[3]\n" - "tbnz %x[flags], #31, 36f\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a11c94 // sudot z20.s, z4.b, z1.b[0]\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a11cb5 // sudot z21.s, z5.b, z1.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a21c98 // sudot z24.s, z4.b, z2.b[0]\n" + ".inst 0x44a21cb9 // sudot z25.s, z5.b, z2.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x44a11cd6 // sudot z22.s, z6.b, z1.b[0]\n" + ".inst 0x44a21cda // sudot z26.s, z6.b, z2.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + ".inst 0x44a11cf7 // sudot z23.s, z7.b, z1.b[0]\n" + ".inst 0x44a21cfb // sudot z27.s, z7.b, z2.b[0]\n" + ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n" + ".inst 0x44aa1d18 // sudot z24.s, z8.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a91d35 // sudot z21.s, z9.b, z1.b[1]\n" + ".inst 0x44aa1d39 // sudot z25.s, z9.b, z2.b[1]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x44a91d56 // sudot z22.s, z10.b, z1.b[1]\n" + ".inst 0x44aa1d5a // sudot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" + ".inst 0x44a91c97 // sudot z23.s, z4.b, 
z1.b[1]\n" + ".inst 0x44aa1c9b // sudot z27.s, z4.b, z2.b[1]\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x44b11cb4 // sudot z20.s, z5.b, z1.b[2]\n" + ".inst 0x44b21cb8 // sudot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n" + ".inst 0x44b21cd9 // sudot z25.s, z6.b, z2.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b11cf6 // sudot z22.s, z7.b, z1.b[2]\n" + ".inst 0x44b21cfa // sudot z26.s, z7.b, z2.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" + ".inst 0x44b11d17 // sudot z23.s, z8.b, z1.b[2]\n" + ".inst 0x44b21d1b // sudot z27.s, z8.b, z2.b[2]\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b91d34 // sudot z20.s, z9.b, z1.b[3]\n" + ".inst 0x44ba1d38 // sudot z24.s, z9.b, z2.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b91d55 // sudot z21.s, z10.b, z1.b[3]\n" + ".inst 0x44ba1d59 // sudot z25.s, z10.b, z2.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b91c96 // sudot z22.s, z4.b, z1.b[3]\n" + ".inst 0x44ba1c9a // sudot z26.s, z4.b, z2.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" + ".inst 0x44b91cb7 // sudot z23.s, z5.b, z1.b[3]\n" + ".inst 0x44ba1cbb // sudot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 34f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" - "36:" // Height 3: Multiply loop: unique 5: skip row sum + "34:" // Height 3: Multiply loop: unique 5: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 35b\n" - "37:" // Height 3: Multiply loop: Single iteration only + "bgt 33b\n" + "35:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { 
z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - ".inst 0x44a01ff0 // sudot z16.s, z31.b, z0.b[0]\n" - ".inst 0x44a11ff4 // sudot z20.s, z31.b, z1.b[0]\n" - ".inst 0x44a01fd1 // sudot z17.s, z30.b, z0.b[0]\n" - ".inst 0x44a11fd5 // sudot z21.s, z30.b, z1.b[0]\n" - ".inst 0x44a01fb2 // sudot z18.s, z29.b, z0.b[0]\n" - ".inst 0x44a11fb6 // sudot z22.s, z29.b, z1.b[0]\n" - ".inst 0x44a21ff8 // sudot z24.s, z31.b, z2.b[0]\n" - ".inst 0x44a21fd9 // sudot z25.s, z30.b, z2.b[0]\n" - ".inst 0x44a21fba // sudot z26.s, z29.b, z2.b[0]\n" - ".inst 0x44a01f93 // sudot z19.s, z28.b, z0.b[0]\n" - ".inst 0x44a11f97 // sudot z23.s, z28.b, z1.b[0]\n" - ".inst 0x44a21f9b // sudot z27.s, z28.b, z2.b[0]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a11c94 // sudot z20.s, z4.b, z1.b[0]\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a11cb5 // sudot z21.s, z5.b, z1.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a11cd6 // sudot z22.s, z6.b, z1.b[0]\n" + ".inst 0x44a21c98 // sudot z24.s, z4.b, z2.b[0]\n" + ".inst 0x44a21cb9 // sudot z25.s, z5.b, z2.b[0]\n" + ".inst 0x44a21cda // sudot z26.s, z6.b, z2.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + ".inst 0x44a11cf7 // sudot z23.s, z7.b, z1.b[0]\n" + ".inst 0x44a21cfb // sudot z27.s, z7.b, z2.b[0]\n" + "ble 36f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - 
".inst 0x44a81ff0 // sudot z16.s, z31.b, z0.b[1]\n" - ".inst 0x44a91ff4 // sudot z20.s, z31.b, z1.b[1]\n" - ".inst 0x44aa1ff8 // sudot z24.s, z31.b, z2.b[1]\n" - ".inst 0x44a81fd1 // sudot z17.s, z30.b, z0.b[1]\n" - ".inst 0x44a91fd5 // sudot z21.s, z30.b, z1.b[1]\n" - ".inst 0x44aa1fd9 // sudot z25.s, z30.b, z2.b[1]\n" - ".inst 0x44a81fb2 // sudot z18.s, z29.b, z0.b[1]\n" - ".inst 0x44a91fb6 // sudot z22.s, z29.b, z1.b[1]\n" - ".inst 0x44aa1fba // sudot z26.s, z29.b, z2.b[1]\n" - ".inst 0x44a81f93 // sudot z19.s, z28.b, z0.b[1]\n" - ".inst 0x44a91f97 // sudot z23.s, z28.b, z1.b[1]\n" - ".inst 0x44aa1f9b // sudot z27.s, z28.b, z2.b[1]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" + ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n" + ".inst 0x44aa1d18 // sudot z24.s, z8.b, z2.b[1]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a91d35 // sudot z21.s, z9.b, z1.b[1]\n" + ".inst 0x44aa1d39 // sudot z25.s, z9.b, z2.b[1]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + ".inst 0x44a91d56 // sudot z22.s, z10.b, z1.b[1]\n" + ".inst 0x44aa1d5a // sudot z26.s, z10.b, z2.b[1]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" + ".inst 0x44a91c97 // sudot z23.s, z4.b, z1.b[1]\n" + ".inst 0x44aa1c9b // sudot z27.s, z4.b, z2.b[1]\n" + "ble 36f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44b01ff0 // sudot z16.s, z31.b, z0.b[2]\n" - ".inst 0x44b11ff4 // sudot z20.s, z31.b, z1.b[2]\n" - ".inst 0x44b21ff8 // sudot z24.s, z31.b, z2.b[2]\n" - ".inst 0x44b01fd1 // sudot z17.s, z30.b, z0.b[2]\n" - ".inst 0x44b11fd5 // sudot z21.s, z30.b, z1.b[2]\n" - 
".inst 0x44b21fd9 // sudot z25.s, z30.b, z2.b[2]\n" - ".inst 0x44b01fb2 // sudot z18.s, z29.b, z0.b[2]\n" - ".inst 0x44b11fb6 // sudot z22.s, z29.b, z1.b[2]\n" - ".inst 0x44b21fba // sudot z26.s, z29.b, z2.b[2]\n" - ".inst 0x44b01f93 // sudot z19.s, z28.b, z0.b[2]\n" - ".inst 0x44b11f97 // sudot z23.s, z28.b, z1.b[2]\n" - ".inst 0x44b21f9b // sudot z27.s, z28.b, z2.b[2]\n" - "ble 38f\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44b81ff0 // sudot z16.s, z31.b, z0.b[3]\n" - ".inst 0x44b91ff4 // sudot z20.s, z31.b, z1.b[3]\n" - ".inst 0x44ba1ff8 // sudot z24.s, z31.b, z2.b[3]\n" - ".inst 0x44b81fd1 // sudot z17.s, z30.b, z0.b[3]\n" - ".inst 0x44b91fd5 // sudot z21.s, z30.b, z1.b[3]\n" - ".inst 0x44ba1fd9 // sudot z25.s, z30.b, z2.b[3]\n" - ".inst 0x44b81fb2 // sudot z18.s, z29.b, z0.b[3]\n" - ".inst 0x44b91fb6 // sudot z22.s, z29.b, z1.b[3]\n" - ".inst 0x44ba1fba // sudot z26.s, z29.b, z2.b[3]\n" - ".inst 0x44b81f93 // sudot z19.s, z28.b, z0.b[3]\n" - ".inst 0x44b91f97 // sudot z23.s, z28.b, z1.b[3]\n" - ".inst 0x44ba1f9b // sudot z27.s, z28.b, z2.b[3]\n" - "38:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 39f\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + ".inst 0x44b11cb4 // sudot z20.s, z5.b, z1.b[2]\n" + ".inst 0x44b21cb8 // sudot z24.s, z5.b, z2.b[2]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n" + ".inst 0x44b21cd9 // sudot z25.s, z6.b, z2.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b11cf6 // sudot z22.s, z7.b, z1.b[2]\n" + ".inst 0x44b21cfa // sudot z26.s, z7.b, z2.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" + ".inst 0x44b11d17 // sudot z23.s, z8.b, 
z1.b[2]\n" + ".inst 0x44b21d1b // sudot z27.s, z8.b, z2.b[2]\n" + "ble 36f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b91d34 // sudot z20.s, z9.b, z1.b[3]\n" + ".inst 0x44ba1d38 // sudot z24.s, z9.b, z2.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b91d55 // sudot z21.s, z10.b, z1.b[3]\n" + ".inst 0x44ba1d59 // sudot z25.s, z10.b, z2.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b91c96 // sudot z22.s, z4.b, z1.b[3]\n" + ".inst 0x44ba1c9a // sudot z26.s, z4.b, z2.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" + ".inst 0x44b91cb7 // sudot z23.s, z5.b, z1.b[3]\n" + ".inst 0x44ba1cbb // sudot z27.s, z5.b, z2.b[3]\n" + "36:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 37f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 6: skip row sum + "37:" // Height 3: Multiply loop: unique 6: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 32b\n" + "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "tbnz %x[flags], #31, 40f\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "tbnz %x[flags], #31, 38f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z28.s, p2/M, z28.s\n" + "neg z3.s, p2/M, z3.s\n" "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" "saddv d13, p0, z13.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z28.s\n" - "mul z12.s, p2/M, z12.s, z28.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" "mov 
z13.s, z13.s[0]\n" - "mul z13.s, p2/M, z13.s, z28.s\n" - "40:" // Height 3: skip row sum fixup + "mul z13.s, p2/M, z13.s, z3.s\n" + "38:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z31.s\n" - "add z18.s, z18.s, z30.s\n" - "add z19.s, z19.s, z29.s\n" + "add z17.s, z17.s, z1.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z31.s\n" - "add z22.s, z22.s, z30.s\n" - "add z23.s, z23.s, z29.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z31.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z30.s\n" - "add z27.s, z27.s, z29.s\n" - ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n" - ".inst 0x04bc7631 // 
sqrdmulh z17.s, z17.s, z28.s\n" - ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n" - ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n" - ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n" - ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n" - ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n" - ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n" - ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n" - ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n" - ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n" - ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n" - "tbz %x[flags], #5, 41f\n" - "and z1.d, z16.d, z0.d\n" - "and z31.d, z17.d, z0.d\n" - "and z30.d, z18.d, z0.d\n" - "and z29.d, z19.d, z0.d\n" - "and z28.d, z20.d, z0.d\n" - "and z3.d, z21.d, z0.d\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "and z2.d, z22.d, z0.d\n" - "sqadd z16.s, z16.s, z1.s\n" - "sqadd z17.s, z17.s, z31.s\n" - "sqadd z18.s, z18.s, z30.s\n" - "sqadd z19.s, z19.s, z29.s\n" - "sqadd z20.s, z20.s, z28.s\n" - "and z1.d, z23.d, z0.d\n" - "and z31.d, z24.d, z0.d\n" - "and z30.d, z25.d, z0.d\n" - "and z29.d, z26.d, z0.d\n" - "and z28.d, z27.d, z0.d\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z31.s, z31.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "sqadd z21.s, z21.s, z3.s\n" - "sqadd z22.s, z22.s, z2.s\n" - "sqadd z23.s, z23.s, z1.s\n" - "sqadd z24.s, z24.s, z31.s\n" - "sqadd z25.s, z25.s, z30.s\n" - "sqadd z26.s, z26.s, z29.s\n" - "sqadd z27.s, z27.s, z28.s\n" - "41:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 
0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z30.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z30.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z30.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z30.s\n" - "add z20.s, z20.s, z30.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z21.s, z21.s, z30.s\n" - "add z22.s, z22.s, z30.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z30.s\n" - "add z24.s, z24.s, z30.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z30.s\n" - "add z26.s, z26.s, z30.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z30.s\n" - "smin z16.s, p2/M, z16.s, z29.s\n" - "smin z17.s, p2/M, z17.s, z29.s\n" - "smin z18.s, p2/M, z18.s, z29.s\n" - "smin z19.s, p2/M, z19.s, z29.s\n" - "smin z20.s, p2/M, z20.s, z29.s\n" - "smin 
z21.s, p2/M, z21.s, z29.s\n" - "smin z22.s, p2/M, z22.s, z29.s\n" - "smin z23.s, p2/M, z23.s, z29.s\n" - "smin z24.s, p2/M, z24.s, z29.s\n" - "smin z25.s, p2/M, z25.s, z29.s\n" - "smin z26.s, p2/M, z26.s, z29.s\n" - "smin z27.s, p2/M, z27.s, z29.s\n" - "smax z16.s, p2/M, z16.s, z28.s\n" - "smax z17.s, p2/M, z17.s, z28.s\n" - "smax z18.s, p2/M, z18.s, z28.s\n" - "smax z19.s, p2/M, z19.s, z28.s\n" - "smax z20.s, p2/M, z20.s, z28.s\n" - "smax z21.s, p2/M, z21.s, z28.s\n" - "smax z22.s, p2/M, z22.s, z28.s\n" - "smax z23.s, p2/M, z23.s, z28.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z28.s\n" - "smax z25.s, p2/M, z25.s, z28.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z28.s\n" - "smax z27.s, p2/M, z27.s, z28.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "uzp1 z18.h, z22.h, z23.h\n" + "uzp1 z21.h, z22.h, z23.h\n" 
"uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "uzp1 z20.b, z20.b, z18.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z20.b, z20.b, z21.b\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "42:" // Height 3: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 30b\n" - "b 58f\n" - "43:" // Height 4 + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z20.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z14.s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x27\n" "mov z15.b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "44:" // Height 4: Column loop + "41:" // Height 4: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -1012,7 +924,7 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -1023,42 +935,41 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "45:" // Height 4: setup done "mov x26, #0x0\n" - "46:" // Height 4: String loop + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], 
%[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 47f\n" + "tbz %x[flags], #3, 44f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 48f\n" + "cbnz x26, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 48f\n" - "47:" // Height 4: setup direct input + "b 45f\n" + "44:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "48:" // Height 4: input setup done + "45:" // Height 4: input setup done "cmp x25, #0x10\n" - "ble 51f\n" - "49:" // Height 4: Multiply loop: Main loop head + "ble 48f\n" + "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" @@ -1067,431 +978,380 @@ void sve_hybrid_u8s8qa_dot_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x44a01cb0 // sudot z16.s, z5.b, z0.b[0]\n" - ".inst 0x44a11cb4 // sudot z20.s, z5.b, z1.b[0]\n" - ".inst 0x44a01d51 // sudot z17.s, z10.b, z0.b[0]\n" - ".inst 0x44a11d55 // sudot z21.s, z10.b, z1.b[0]\n" - ".inst 0x44a21cb8 // sudot z24.s, z5.b, z2.b[0]\n" - ".inst 
0x44a31cbc // sudot z28.s, z5.b, z3.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x44a21d59 // sudot z25.s, z10.b, z2.b[0]\n" - ".inst 0x44a31d5d // sudot z29.s, z10.b, z3.b[0]\n" - ".inst 0x44a01c92 // sudot z18.s, z4.b, z0.b[0]\n" - ".inst 0x44a11c96 // sudot z22.s, z4.b, z1.b[0]\n" - ".inst 0x44a21c9a // sudot z26.s, z4.b, z2.b[0]\n" - ".inst 0x44a31c9e // sudot z30.s, z4.b, z3.b[0]\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x44a01d33 // sudot z19.s, z9.b, z0.b[0]\n" - ".inst 0x44a11d37 // sudot z23.s, z9.b, z1.b[0]\n" - ".inst 0x44a21d3b // sudot z27.s, z9.b, z2.b[0]\n" - ".inst 0x44a31d3f // sudot z31.s, z9.b, z3.b[0]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a11c94 // sudot z20.s, z4.b, z1.b[0]\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a11cb5 // sudot z21.s, z5.b, z1.b[0]\n" + ".inst 0x44a21c98 // sudot z24.s, z4.b, z2.b[0]\n" + ".inst 0x44a31c9c // sudot z28.s, z4.b, z3.b[0]\n" + "ld1b { z4.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x44a21cb9 // sudot z25.s, z5.b, z2.b[0]\n" + ".inst 0x44a31cbd // sudot z29.s, z5.b, z3.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a11cd6 // sudot z22.s, z6.b, z1.b[0]\n" + ".inst 0x44a21cda // sudot z26.s, z6.b, z2.b[0]\n" + ".inst 0x44a31cde // sudot z30.s, z6.b, z3.b[0]\n" + "ld1b { z5.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + ".inst 0x44a11cf7 // sudot z23.s, z7.b, z1.b[0]\n" + ".inst 0x44a21cfb // sudot z27.s, z7.b, z2.b[0]\n" + ".inst 0x44a31cff // sudot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n" ".inst 
0x44aa1d18 // sudot z24.s, z8.b, z2.b[1]\n" ".inst 0x44ab1d1c // sudot z28.s, z8.b, z3.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x44a81cf1 // sudot z17.s, z7.b, z0.b[1]\n" - ".inst 0x44a91cf5 // sudot z21.s, z7.b, z1.b[1]\n" - ".inst 0x44aa1cf9 // sudot z25.s, z7.b, z2.b[1]\n" - ".inst 0x44ab1cfd // sudot z29.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x44a81cd2 // sudot z18.s, z6.b, z0.b[1]\n" - ".inst 0x44a91cd6 // sudot z22.s, z6.b, z1.b[1]\n" - ".inst 0x44aa1cda // sudot z26.s, z6.b, z2.b[1]\n" - ".inst 0x44ab1cde // sudot z30.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x44a81cb3 // sudot z19.s, z5.b, z0.b[1]\n" - ".inst 0x44a91cb7 // sudot z23.s, z5.b, z1.b[1]\n" - ".inst 0x44aa1cbb // sudot z27.s, z5.b, z2.b[1]\n" - ".inst 0x44ab1cbf // sudot z31.s, z5.b, z3.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x44b01c90 // sudot z16.s, z4.b, z0.b[2]\n" - ".inst 0x44b11c94 // sudot z20.s, z4.b, z1.b[2]\n" - ".inst 0x44b21c98 // sudot z24.s, z4.b, z2.b[2]\n" - ".inst 0x44b31c9c // sudot z28.s, z4.b, z3.b[2]\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x44b01d51 // sudot z17.s, z10.b, z0.b[2]\n" - ".inst 0x44b11d55 // sudot z21.s, z10.b, z1.b[2]\n" - ".inst 0x44b21d59 // sudot z25.s, z10.b, z2.b[2]\n" - ".inst 0x44b31d5d // sudot z29.s, z10.b, z3.b[2]\n" - ".inst 0x44b01d32 // sudot z18.s, z9.b, z0.b[2]\n" - ".inst 0x44b11d36 // sudot z22.s, z9.b, z1.b[2]\n" - ".inst 0x44b21d3a // sudot z26.s, z9.b, z2.b[2]\n" - ".inst 0x44b31d3e // sudot z30.s, z9.b, z3.b[2]\n" + "ld1b { z8.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a91d35 // sudot z21.s, z9.b, z1.b[1]\n" + ".inst 0x44aa1d39 // sudot z25.s, z9.b, z2.b[1]\n" + ".inst 0x44ab1d3d // sudot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + ".inst 0x44a91d56 // sudot z22.s, z10.b, 
z1.b[1]\n" + ".inst 0x44aa1d5a // sudot z26.s, z10.b, z2.b[1]\n" + ".inst 0x44ab1d5e // sudot z30.s, z10.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" + ".inst 0x44a91c97 // sudot z23.s, z4.b, z1.b[1]\n" + ".inst 0x44aa1c9b // sudot z27.s, z4.b, z2.b[1]\n" + ".inst 0x44ab1c9f // sudot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + ".inst 0x44b11cb4 // sudot z20.s, z5.b, z1.b[2]\n" + ".inst 0x44b21cb8 // sudot z24.s, z5.b, z2.b[2]\n" + ".inst 0x44b31cbc // sudot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" + ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n" + ".inst 0x44b21cd9 // sudot z25.s, z6.b, z2.b[2]\n" + ".inst 0x44b31cdd // sudot z29.s, z6.b, z3.b[2]\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b11cf6 // sudot z22.s, z7.b, z1.b[2]\n" + ".inst 0x44b21cfa // sudot z26.s, z7.b, z2.b[2]\n" + ".inst 0x44b31cfe // sudot z30.s, z7.b, z3.b[2]\n" ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" ".inst 0x44b11d17 // sudot z23.s, z8.b, z1.b[2]\n" ".inst 0x44b21d1b // sudot z27.s, z8.b, z2.b[2]\n" ".inst 0x44b31d1f // sudot z31.s, z8.b, z3.b[2]\n" - ".inst 0x44b81cf0 // sudot z16.s, z7.b, z0.b[3]\n" - ".inst 0x44b91cf4 // sudot z20.s, z7.b, z1.b[3]\n" - ".inst 0x44ba1cf8 // sudot z24.s, z7.b, z2.b[3]\n" - ".inst 0x44bb1cfc // sudot z28.s, z7.b, z3.b[3]\n" - ".inst 0x44b81cd1 // sudot z17.s, z6.b, z0.b[3]\n" - ".inst 0x44b91cd5 // sudot z21.s, z6.b, z1.b[3]\n" - ".inst 0x44ba1cd9 // sudot z25.s, z6.b, z2.b[3]\n" - ".inst 0x44bb1cdd // sudot z29.s, z6.b, z3.b[3]\n" - ".inst 0x44b81cb2 // sudot z18.s, z5.b, z0.b[3]\n" - ".inst 0x44b91cb6 // sudot z22.s, z5.b, z1.b[3]\n" - ".inst 0x44ba1cba // sudot z26.s, z5.b, z2.b[3]\n" - ".inst 0x44bb1cbe // sudot z30.s, z5.b, z3.b[3]\n" - ".inst 0x44b81c93 // sudot z19.s, z4.b, z0.b[3]\n" - 
".inst 0x44b91c97 // sudot z23.s, z4.b, z1.b[3]\n" - ".inst 0x44ba1c9b // sudot z27.s, z4.b, z2.b[3]\n" - ".inst 0x44bb1c9f // sudot z31.s, z4.b, z3.b[3]\n" - "tbnz %x[flags], #31, 50f\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b91d34 // sudot z20.s, z9.b, z1.b[3]\n" + ".inst 0x44ba1d38 // sudot z24.s, z9.b, z2.b[3]\n" + ".inst 0x44bb1d3c // sudot z28.s, z9.b, z3.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b91d55 // sudot z21.s, z10.b, z1.b[3]\n" + ".inst 0x44ba1d59 // sudot z25.s, z10.b, z2.b[3]\n" + ".inst 0x44bb1d5d // sudot z29.s, z10.b, z3.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b91c96 // sudot z22.s, z4.b, z1.b[3]\n" + ".inst 0x44ba1c9a // sudot z26.s, z4.b, z2.b[3]\n" + ".inst 0x44bb1c9e // sudot z30.s, z4.b, z3.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" + ".inst 0x44b91cb7 // sudot z23.s, z5.b, z1.b[3]\n" + ".inst 0x44ba1cbb // sudot z27.s, z5.b, z2.b[3]\n" + ".inst 0x44bb1cbf // sudot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 47f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" - "50:" // Height 4: Multiply loop: unique 7: skip row sum + "47:" // Height 4: Multiply loop: unique 7: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 49b\n" - "51:" // Height 4: Multiply loop: Single iteration only + "bgt 46b\n" + "48:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9]\n" + "ld1b { z5.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" + "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "ld1rqb { z2.b 
}, p0/Z, [x22]\n" "ld1rqb { z3.b }, p0/Z, [x21]\n" - ".inst 0x44a01cf0 // sudot z16.s, z7.b, z0.b[0]\n" - ".inst 0x44a11cf4 // sudot z20.s, z7.b, z1.b[0]\n" - ".inst 0x44a01cd1 // sudot z17.s, z6.b, z0.b[0]\n" - ".inst 0x44a11cd5 // sudot z21.s, z6.b, z1.b[0]\n" - ".inst 0x44a01cb2 // sudot z18.s, z5.b, z0.b[0]\n" - ".inst 0x44a11cb6 // sudot z22.s, z5.b, z1.b[0]\n" - ".inst 0x44a21cf8 // sudot z24.s, z7.b, z2.b[0]\n" - ".inst 0x44a31cfc // sudot z28.s, z7.b, z3.b[0]\n" - ".inst 0x44a21cd9 // sudot z25.s, z6.b, z2.b[0]\n" - ".inst 0x44a31cdd // sudot z29.s, z6.b, z3.b[0]\n" - ".inst 0x44a21cba // sudot z26.s, z5.b, z2.b[0]\n" - ".inst 0x44a31cbe // sudot z30.s, z5.b, z3.b[0]\n" - ".inst 0x44a01c93 // sudot z19.s, z4.b, z0.b[0]\n" - ".inst 0x44a11c97 // sudot z23.s, z4.b, z1.b[0]\n" - ".inst 0x44a21c9b // sudot z27.s, z4.b, z2.b[0]\n" - ".inst 0x44a31c9f // sudot z31.s, z4.b, z3.b[0]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x44a01c90 // sudot z16.s, z4.b, z0.b[0]\n" + ".inst 0x44a11c94 // sudot z20.s, z4.b, z1.b[0]\n" + ".inst 0x44a01cb1 // sudot z17.s, z5.b, z0.b[0]\n" + ".inst 0x44a11cb5 // sudot z21.s, z5.b, z1.b[0]\n" + ".inst 0x44a01cd2 // sudot z18.s, z6.b, z0.b[0]\n" + ".inst 0x44a11cd6 // sudot z22.s, z6.b, z1.b[0]\n" + ".inst 0x44a21c98 // sudot z24.s, z4.b, z2.b[0]\n" + ".inst 0x44a31c9c // sudot z28.s, z4.b, z3.b[0]\n" + ".inst 0x44a21cb9 // sudot z25.s, z5.b, z2.b[0]\n" + ".inst 0x44a31cbd // sudot z29.s, z5.b, z3.b[0]\n" + ".inst 0x44a21cda // sudot z26.s, z6.b, z2.b[0]\n" + ".inst 0x44a31cde // sudot z30.s, z6.b, z3.b[0]\n" + ".inst 0x44a01cf3 // sudot z19.s, z7.b, z0.b[0]\n" + ".inst 0x44a11cf7 // sudot z23.s, z7.b, z1.b[0]\n" + ".inst 0x44a21cfb // sudot z27.s, z7.b, z2.b[0]\n" + ".inst 0x44a31cff // sudot z31.s, z7.b, z3.b[0]\n" + "ble 49f\n" + "ld1b { z8.b }, p2/Z, [x9]\n" + "ld1b { z9.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - 
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44a81cf0 // sudot z16.s, z7.b, z0.b[1]\n" - ".inst 0x44a91cf4 // sudot z20.s, z7.b, z1.b[1]\n" - ".inst 0x44aa1cf8 // sudot z24.s, z7.b, z2.b[1]\n" - ".inst 0x44ab1cfc // sudot z28.s, z7.b, z3.b[1]\n" - ".inst 0x44a81cd1 // sudot z17.s, z6.b, z0.b[1]\n" - ".inst 0x44a91cd5 // sudot z21.s, z6.b, z1.b[1]\n" - ".inst 0x44aa1cd9 // sudot z25.s, z6.b, z2.b[1]\n" - ".inst 0x44ab1cdd // sudot z29.s, z6.b, z3.b[1]\n" - ".inst 0x44a81cb2 // sudot z18.s, z5.b, z0.b[1]\n" - ".inst 0x44a91cb6 // sudot z22.s, z5.b, z1.b[1]\n" - ".inst 0x44aa1cba // sudot z26.s, z5.b, z2.b[1]\n" - ".inst 0x44ab1cbe // sudot z30.s, z5.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44a81d10 // sudot z16.s, z8.b, z0.b[1]\n" + ".inst 0x44a91d14 // sudot z20.s, z8.b, z1.b[1]\n" + ".inst 0x44aa1d18 // sudot z24.s, z8.b, z2.b[1]\n" + ".inst 0x44ab1d1c // sudot z28.s, z8.b, z3.b[1]\n" + ".inst 0x44a81d31 // sudot z17.s, z9.b, z0.b[1]\n" + ".inst 0x44a91d35 // sudot z21.s, z9.b, z1.b[1]\n" + ".inst 0x44aa1d39 // sudot z25.s, z9.b, z2.b[1]\n" + ".inst 0x44ab1d3d // sudot z29.s, z9.b, z3.b[1]\n" + ".inst 0x44a81d52 // sudot z18.s, z10.b, z0.b[1]\n" + ".inst 0x44a91d56 // sudot z22.s, z10.b, z1.b[1]\n" + ".inst 0x44aa1d5a // sudot z26.s, z10.b, z2.b[1]\n" + ".inst 0x44ab1d5e // sudot z30.s, z10.b, z3.b[1]\n" ".inst 0x44a81c93 // sudot z19.s, z4.b, z0.b[1]\n" ".inst 0x44a91c97 // sudot z23.s, z4.b, z1.b[1]\n" ".inst 0x44aa1c9b // sudot z27.s, z4.b, z2.b[1]\n" ".inst 0x44ab1c9f // sudot z31.s, z4.b, z3.b[1]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ble 49f\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 
0x44b01cf0 // sudot z16.s, z7.b, z0.b[2]\n" - ".inst 0x44b11cf4 // sudot z20.s, z7.b, z1.b[2]\n" - ".inst 0x44b21cf8 // sudot z24.s, z7.b, z2.b[2]\n" - ".inst 0x44b31cfc // sudot z28.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b01cb0 // sudot z16.s, z5.b, z0.b[2]\n" + ".inst 0x44b11cb4 // sudot z20.s, z5.b, z1.b[2]\n" + ".inst 0x44b21cb8 // sudot z24.s, z5.b, z2.b[2]\n" + ".inst 0x44b31cbc // sudot z28.s, z5.b, z3.b[2]\n" ".inst 0x44b01cd1 // sudot z17.s, z6.b, z0.b[2]\n" ".inst 0x44b11cd5 // sudot z21.s, z6.b, z1.b[2]\n" ".inst 0x44b21cd9 // sudot z25.s, z6.b, z2.b[2]\n" ".inst 0x44b31cdd // sudot z29.s, z6.b, z3.b[2]\n" - ".inst 0x44b01cb2 // sudot z18.s, z5.b, z0.b[2]\n" - ".inst 0x44b11cb6 // sudot z22.s, z5.b, z1.b[2]\n" - ".inst 0x44b21cba // sudot z26.s, z5.b, z2.b[2]\n" - ".inst 0x44b31cbe // sudot z30.s, z5.b, z3.b[2]\n" - ".inst 0x44b01c93 // sudot z19.s, z4.b, z0.b[2]\n" - ".inst 0x44b11c97 // sudot z23.s, z4.b, z1.b[2]\n" - ".inst 0x44b21c9b // sudot z27.s, z4.b, z2.b[2]\n" - ".inst 0x44b31c9f // sudot z31.s, z4.b, z3.b[2]\n" - "ble 52f\n" - "ld1b { z7.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - ".inst 0x44b81cf0 // sudot z16.s, z7.b, z0.b[3]\n" - ".inst 0x44b91cf4 // sudot z20.s, z7.b, z1.b[3]\n" - ".inst 0x44ba1cf8 // sudot z24.s, z7.b, z2.b[3]\n" - ".inst 0x44bb1cfc // sudot z28.s, z7.b, z3.b[3]\n" - ".inst 0x44b81cd1 // sudot z17.s, z6.b, z0.b[3]\n" - ".inst 0x44b91cd5 // sudot z21.s, z6.b, z1.b[3]\n" - ".inst 0x44ba1cd9 // sudot z25.s, z6.b, z2.b[3]\n" - ".inst 0x44bb1cdd // sudot z29.s, z6.b, z3.b[3]\n" - ".inst 0x44b81cb2 // sudot z18.s, z5.b, z0.b[3]\n" - ".inst 0x44b91cb6 // sudot z22.s, z5.b, z1.b[3]\n" - ".inst 0x44ba1cba // sudot z26.s, z5.b, z2.b[3]\n" - ".inst 0x44bb1cbe // sudot z30.s, z5.b, z3.b[3]\n" - 
".inst 0x44b81c93 // sudot z19.s, z4.b, z0.b[3]\n" - ".inst 0x44b91c97 // sudot z23.s, z4.b, z1.b[3]\n" - ".inst 0x44ba1c9b // sudot z27.s, z4.b, z2.b[3]\n" - ".inst 0x44bb1c9f // sudot z31.s, z4.b, z3.b[3]\n" - "52:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 53f\n" + ".inst 0x44b01cf2 // sudot z18.s, z7.b, z0.b[2]\n" + ".inst 0x44b11cf6 // sudot z22.s, z7.b, z1.b[2]\n" + ".inst 0x44b21cfa // sudot z26.s, z7.b, z2.b[2]\n" + ".inst 0x44b31cfe // sudot z30.s, z7.b, z3.b[2]\n" + ".inst 0x44b01d13 // sudot z19.s, z8.b, z0.b[2]\n" + ".inst 0x44b11d17 // sudot z23.s, z8.b, z1.b[2]\n" + ".inst 0x44b21d1b // sudot z27.s, z8.b, z2.b[2]\n" + ".inst 0x44b31d1f // sudot z31.s, z8.b, z3.b[2]\n" + "ble 49f\n" + "ld1b { z9.b }, p2/Z, [x9]\n" + "ld1b { z10.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + ".inst 0x44b81d30 // sudot z16.s, z9.b, z0.b[3]\n" + ".inst 0x44b91d34 // sudot z20.s, z9.b, z1.b[3]\n" + ".inst 0x44ba1d38 // sudot z24.s, z9.b, z2.b[3]\n" + ".inst 0x44bb1d3c // sudot z28.s, z9.b, z3.b[3]\n" + ".inst 0x44b81d51 // sudot z17.s, z10.b, z0.b[3]\n" + ".inst 0x44b91d55 // sudot z21.s, z10.b, z1.b[3]\n" + ".inst 0x44ba1d59 // sudot z25.s, z10.b, z2.b[3]\n" + ".inst 0x44bb1d5d // sudot z29.s, z10.b, z3.b[3]\n" + ".inst 0x44b81c92 // sudot z18.s, z4.b, z0.b[3]\n" + ".inst 0x44b91c96 // sudot z22.s, z4.b, z1.b[3]\n" + ".inst 0x44ba1c9a // sudot z26.s, z4.b, z2.b[3]\n" + ".inst 0x44bb1c9e // sudot z30.s, z4.b, z3.b[3]\n" + ".inst 0x44b81cb3 // sudot z19.s, z5.b, z0.b[3]\n" + ".inst 0x44b91cb7 // sudot z23.s, z5.b, z1.b[3]\n" + ".inst 0x44ba1cbb // sudot z27.s, z5.b, z2.b[3]\n" + ".inst 0x44bb1cbf // sudot z31.s, z5.b, z3.b[3]\n" + "49:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" - "53:" // Height 4: Multiply loop: 
unique 8: skip row sum + "50:" // Height 4: Multiply loop: unique 8: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 46b\n" + "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x27, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "tbnz %x[flags], #31, 54f\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" + "tbnz %x[flags], #31, 51f\n" "mov x21, #0x4\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "whilelt p0.s, XZR, x21\n" - "neg z0.s, p2/M, z0.s\n" + "neg z4.s, p2/M, z4.s\n" "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" "saddv d13, p0, z13.s\n" "saddv d14, p0, z14.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z0.s\n" - "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" "mov z13.s, z13.s[0]\n" "mov z14.s, z14.s[0]\n" - "mul z13.s, p2/M, z13.s, z0.s\n" - "mul z14.s, p2/M, z14.s, z0.s\n" - "54:" // Height 4: skip row sum fixup + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "51:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z4.s }, p2/Z, [x10]\n" - "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + 
"ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z28.s, z28.s, z14.s\n" "add z29.s, z29.s, z14.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" "add z30.s, z30.s, z14.s\n" "add z31.s, z31.s, z14.s\n" - "add z16.s, z16.s, z4.s\n" - "add z17.s, z17.s, z0.s\n" - "add z18.s, z18.s, z3.s\n" - "add z19.s, z19.s, z2.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z0.s\n" - "add z22.s, z22.s, z3.s\n" - "add z23.s, z23.s, z2.s\n" - "add z24.s, z24.s, z4.s\n" - "add z25.s, z25.s, z0.s\n" - "add z26.s, z26.s, z3.s\n" - "add z27.s, z27.s, z2.s\n" - "add z28.s, z28.s, z4.s\n" - "add z29.s, z29.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z3.s\n" - "add z31.s, z31.s, z2.s\n" - ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" - ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" - ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" - ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" - ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" - ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" - ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" - ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" - ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" - ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" - ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" - ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" - ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" - ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" - ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" - ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" - "tbz %x[flags], #5, 55f\n" - "and z2.d, z16.d, z0.d\n" - "and z1.d, z17.d, z0.d\n" - "and z7.d, z18.d, z0.d\n" - 
"and z6.d, z19.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "and z4.d, z21.d, z0.d\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "and z3.d, z22.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z16.s, z16.s, z2.s\n" - "sqadd z17.s, z17.s, z1.s\n" - "and z2.d, z23.d, z0.d\n" - "and z1.d, z24.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "sqadd z18.s, z18.s, z7.s\n" - "sqadd z19.s, z19.s, z6.s\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z4.s\n" - "sqadd z22.s, z22.s, z3.s\n" - "and z7.d, z25.d, z0.d\n" - "sqadd z23.s, z23.s, z2.s\n" - "sqadd z24.s, z24.s, z1.s\n" - "and z6.d, z26.d, z0.d\n" - "and z5.d, z27.d, z0.d\n" - "and z4.d, z28.d, z0.d\n" - "and z3.d, z29.d, z0.d\n" - "and z2.d, z30.d, z0.d\n" - "and z1.d, z31.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z25.s, z25.s, z7.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z5.s\n" - "sqadd z28.s, z28.s, z4.s\n" - "sqadd z29.s, z29.s, z3.s\n" - "sqadd z30.s, z30.s, z2.s\n" - "sqadd z31.s, z31.s, z1.s\n" - "55:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 
0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a473bd // sqdmulh z29.s, z29.s, z4.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z16.s, z16.s, z2.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x04a473de // sqdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z2.s\n" - "add z20.s, z20.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z2.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" - "add z23.s, z23.s, z2.s\n" - "add z24.s, z24.s, z2.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" 
- "add x20, %x[qp], %[maxval]\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z2.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z2.s\n" - "add z28.s, z28.s, z2.s\n" - "add x20, %x[qp], %[minval]\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z2.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z31.s, z31.s, z2.s\n" - "smin z16.s, p2/M, z16.s, z1.s\n" - "smin z17.s, p2/M, z17.s, z1.s\n" - "smin z18.s, p2/M, z18.s, z1.s\n" - "smin z19.s, p2/M, z19.s, z1.s\n" - "smin z20.s, p2/M, z20.s, z1.s\n" - "smin z21.s, p2/M, z21.s, z1.s\n" - "smin z22.s, p2/M, z22.s, z1.s\n" - "smin z23.s, p2/M, z23.s, z1.s\n" - "smin z24.s, p2/M, z24.s, z1.s\n" - "smin z25.s, p2/M, z25.s, z1.s\n" - "smin z26.s, p2/M, z26.s, z1.s\n" - "smin z27.s, p2/M, z27.s, z1.s\n" - "smin z28.s, p2/M, z28.s, z1.s\n" - "smin z29.s, p2/M, z29.s, z1.s\n" - "smin z30.s, p2/M, z30.s, z1.s\n" - "smin z31.s, p2/M, z31.s, z1.s\n" - "smax z16.s, p2/M, z16.s, z0.s\n" - "smax z17.s, p2/M, z17.s, z0.s\n" - "smax z18.s, p2/M, z18.s, z0.s\n" - "smax z19.s, p2/M, z19.s, z0.s\n" - "smax z20.s, p2/M, z20.s, z0.s\n" - "smax z21.s, p2/M, z21.s, z0.s\n" - "smax z22.s, p2/M, z22.s, z0.s\n" - "smax z23.s, p2/M, z23.s, z0.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z4.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "add z30.s, z30.s, z4.s\n" + "add z31.s, z31.s, z4.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, 
z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z0.s\n" - "smax z25.s, p2/M, z25.s, z0.s\n" - "uzp1 z18.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z0.s\n" - "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "smax z28.s, p2/M, z28.s, z0.s\n" - "smax z29.s, p2/M, z29.s, z0.s\n" - "uzp1 z17.h, z22.h, z23.h\n" - "smax z30.s, p2/M, z30.s, z0.s\n" - "smax z31.s, p2/M, z31.s, z0.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "uzp1 z16.b, z16.b, z18.b\n" - "uzp1 z18.h, z26.h, z27.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "uzp1 z25.h, z26.h, z27.h\n" "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z20.b, z20.b, z17.b\n" - "uzp1 z17.h, z30.h, z31.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "uzp1 z29.h, z30.h, z31.h\n" "st1b { z16.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z18.b\n" - "uzp1 z28.b, z28.b, z17.b\n" - "st1b { z20.b }, p1, [x24]\n" - "st1b { z24.b }, p1, [x23]\n" - "st1b { z28.b }, p1, [x22]\n" - "56:" // Height 4: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 44b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z20.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "st1b { z28.b }, p1, 
[x24]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" "subs %x[M], %x[M], #0x4\n" - "beq 58f\n" + "beq 54f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "57:" // Update direct input + "53:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "58:" // Exit + "54:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp index 01bdac2967..cc303cd4d9 100644 
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8s8qa_mmla_4x4VL/generic.cpp @@ -25,7 +25,6 @@ #include "arm_gemm.hpp" #include "../../utils.hpp" - #include #include @@ -74,23 +73,20 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( ka.string_lengths = string_lengths; ka.N = N; ka.B_ptr = B_ptr; - if (qp->c_offset > qp->minval) { - flags |= 0x20; - } __asm__ __volatile__( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 43f\n" + "bge 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "mov x10, %x[col_bias]\n" + "bgt 27f\n" + "beq 14f\n" "mov z11.s, #0x0\n" "mov z15.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "2:" // Height 1: Column loop "mov x20, #0x0\n" @@ -100,10 +96,9 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "3:" // Height 1: setup done "mov x26, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" @@ -124,43 +119,43 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "ble 9f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z30.b }, p2/Z, [x28]\n" - "ld1b { z29.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL 
VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" "add x24, x24, #0x10\n" - "trn1 z0.d, z1.d, z31.d\n" - ".inst 0x459e9810 // usmmla z16.s, z0.b, z30.b\n" - "ld1b { z8.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - "trn2 z1.d, z1.d, z31.d\n" - ".inst 0x459d9814 // usmmla z20.s, z0.b, z29.b\n" - ".inst 0x459c9811 // usmmla z17.s, z0.b, z28.b\n" - ".inst 0x459b9815 // usmmla z21.s, z0.b, z27.b\n" - ".inst 0x459a9812 // usmmla z18.s, z0.b, z26.b\n" - "ld1b { z31.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45999816 // usmmla z22.s, z0.b, z25.b\n" - ".inst 0x45989813 // usmmla z19.s, z0.b, z24.b\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x45889817 // usmmla z23.s, z0.b, z8.b\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x459f9830 // usmmla z16.s, z1.b, z31.b\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n" - ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n" - ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n" - ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n" - ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n" - ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n" - ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" + ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" + ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x458a9816 // usmmla z22.s, 
z0.b, z10.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" @@ -170,45 +165,45 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "bgt 7b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z31.d\n" - ".inst 0x45989810 // usmmla z16.s, z0.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #8\n" - "trn2 z1.d, z1.d, z31.d\n" - ".inst 0x459e9814 // usmmla z20.s, z0.b, z30.b\n" - ".inst 0x459d9811 // usmmla z17.s, z0.b, z29.b\n" - ".inst 
0x459c9815 // usmmla z21.s, z0.b, z28.b\n" - ".inst 0x459b9812 // usmmla z18.s, z0.b, z27.b\n" - ".inst 0x459a9816 // usmmla z22.s, z0.b, z26.b\n" - ".inst 0x45999813 // usmmla z19.s, z0.b, z25.b\n" - ".inst 0x45989817 // usmmla z23.s, z0.b, z24.b\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" + ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" + ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" + ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" "ble 10f\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45989830 // usmmla z16.s, z1.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n" - "addvl x28, x28, #8\n" - ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n" - ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n" - ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n" - ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n" - ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n" - ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b 
}, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" "udot z11.s, z0.b, z15.b\n" @@ -226,89 +221,74 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "tbnz %x[flags], #31, 12f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "ld1rw { z9.s }, p2/Z, [x20]\n" - "neg z9.s, p2/M, z9.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "neg z1.s, p2/M, z1.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z9.s\n" + "mul z11.s, p2/M, z11.s, z1.s\n" "12:" // Height 1: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z22.s }, p2/Z, [x10]\n" - "ld1w { z24.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" - "add z23.s, z23.s, z22.s\n" - "add z17.s, z17.s, z24.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" + "add x21, %x[qp], %[per_layer_mul]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "addvl x10, x10, #4\n" - "add z18.s, z18.s, z21.s\n" - "add z19.s, z19.s, z20.s\n" + "add z23.s, z23.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n" - ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n" - ".inst 0x04b07652 
// sqrdmulh z18.s, z18.s, z16.s\n" - ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n" - "tbz %x[flags], #5, 13f\n" - "and z22.d, z23.d, z0.d\n" - "and z21.d, z17.d, z0.d\n" - "and z20.d, z18.d, z0.d\n" - "and z16.d, z19.d, z0.d\n" - "asr z22.s, z22.s, #0x1f\n" - "asr z21.s, z21.s, #0x1f\n" - "asr z20.s, z20.s, #0x1f\n" - "asr z16.s, z16.s, #0x1f\n" - "sqadd z23.s, z23.s, z22.s\n" - "sqadd z17.s, z17.s, z21.s\n" - "sqadd z18.s, z18.s, z20.s\n" - "sqadd z19.s, z19.s, z16.s\n" - "13:" // Height 1: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add x21, %x[qp], %[c_offset]\n" + "add x20, %x[qp], %[maxval]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x21]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z21.s }, p2/Z, [x20]\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "ld1rw { z20.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z21.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z21.s\n" - "add z18.s, z18.s, z21.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z21.s\n" - "smin z23.s, p2/M, z23.s, z20.s\n" - "smin z17.s, p2/M, z17.s, z20.s\n" - "smin z18.s, p2/M, z18.s, z20.s\n" - "smin z19.s, p2/M, z19.s, z20.s\n" - "smax z23.s, p2/M, z23.s, z16.s\n" - "smax z17.s, p2/M, z17.s, z16.s\n" - "smax z18.s, p2/M, z18.s, z16.s\n" - "smax z19.s, p2/M, z19.s, z16.s\n" + "add z23.s, z23.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + 
"add z19.s, z19.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z23.h, z23.h, z17.h\n" - "uzp1 z16.h, z18.h, z19.h\n" - "uzp1 z23.b, z23.b, z16.b\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z17.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "14:" // Height 1: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" "bgt 2b\n" - "b 58f\n" - "15:" // Height 2 - "mov x10, %x[col_bias]\n" + "b 54f\n" + "14:" // Height 2 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "16:" // Height 2: Column loop + "15:" // Height 2: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -316,131 +296,130 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "17:" // Height 2: setup done "mov x26, #0x0\n" - "18:" // Height 2: String loop + "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" + "tbz %x[flags], #3, 18f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" - "cbnz x26, 20f\n" + "cbnz x26, 19f\n" "ldr x20, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" - "b 20f\n" - "19:" // Height 2: setup direct input + "b 19f\n" + "18:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" - "20:" // Height 2: input setup done + "19:" // Height 2: input setup done "cmp x25, #0x10\n" - "ble 23f\n" - "21:" // Height 2: Multiply loop: Main loop head + "ble 22f\n" + "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z31.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z25.b }, p0/Z, [x23]\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "ld1b { z24.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z25.d\n" - "trn2 z1.d, z1.d, z25.d\n" - ".inst 0x459f9810 // usmmla z16.s, z0.b, z31.b\n" - "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x459e9814 // usmmla z20.s, z0.b, z30.b\n" - ".inst 0x459d9811 // usmmla z17.s, z0.b, z29.b\n" - ".inst 0x459c9815 // usmmla z21.s, z0.b, z28.b\n" - ".inst 0x459b9812 // usmmla z18.s, z0.b, z27.b\n" - ".inst 0x459a9816 // usmmla z22.s, z0.b, z26.b\n" - ".inst 0x45989813 // usmmla z19.s, z0.b, z24.b\n" - "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45999817 // usmmla z23.s, z0.b, z25.b\n" - "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x45989830 
// usmmla z16.s, z1.b, z24.b\n" - "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x459e9834 // usmmla z20.s, z1.b, z30.b\n" - ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n" - ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n" - ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n" - ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n" - ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n" - ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n" - "tbnz %x[flags], #31, 22f\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" + ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" + ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" + ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" + "tbnz %x[flags], #31, 21f\n" "udot z11.s, z0.b, z15.b\n" "udot z11.s, z1.b, 
z15.b\n" - "22:" // Height 2: Multiply loop: unique 3: skip row sum + "21:" // Height 2: Multiply loop: unique 3: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 21b\n" - "23:" // Height 2: Multiply loop: Single iteration only + "bgt 20b\n" + "22:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z29.b }, p2/Z, [x28]\n" - "ld1b { z28.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z24.b }, p0/Z, [x23]\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "trn1 z0.d, z1.d, z24.d\n" - "trn2 z1.d, z1.d, z24.d\n" - ".inst 0x459d9810 // usmmla z16.s, z0.b, z29.b\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #8\n" - ".inst 0x459c9814 // usmmla z20.s, z0.b, z28.b\n" - ".inst 0x45849811 // usmmla z17.s, z0.b, z4.b\n" - ".inst 0x459b9815 // usmmla z21.s, z0.b, z27.b\n" - ".inst 0x459a9812 // usmmla z18.s, z0.b, z26.b\n" - ".inst 0x45869816 // usmmla z22.s, z0.b, z6.b\n" - ".inst 0x45999813 // usmmla z19.s, z0.b, z25.b\n" - ".inst 0x45989817 // usmmla z23.s, z0.b, z24.b\n" - "ble 24f\n" - "ld1b { z24.b }, p2/Z, [x28]\n" - "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45989830 // usmmla z16.s, z1.b, z24.b\n" - "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x459e9834 // usmmla z20.s, 
z1.b, z30.b\n" - "addvl x28, x28, #8\n" - ".inst 0x459d9831 // usmmla z17.s, z1.b, z29.b\n" - ".inst 0x459c9835 // usmmla z21.s, z1.b, z28.b\n" - ".inst 0x459b9832 // usmmla z18.s, z1.b, z27.b\n" - ".inst 0x459a9836 // usmmla z22.s, z1.b, z26.b\n" - ".inst 0x45999833 // usmmla z19.s, z1.b, z25.b\n" - ".inst 0x45989837 // usmmla z23.s, z1.b, z24.b\n" - "24:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 25f\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + "trn1 z0.d, z1.d, z2.d\n" + "trn2 z1.d, z1.d, z2.d\n" + ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #8\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" + ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" + ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" + ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" + "ble 23f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" + "23:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 24f\n" "udot z11.s, z0.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" - "25:" // Height 2: Multiply 
loop: unique 4: skip row sum + "24:" // Height 2: Multiply loop: unique 4: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 18b\n" - "uzp1 z24.d, z16.d, z20.d\n" + "bne 17b\n" + "uzp1 z7.d, z16.d, z20.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" @@ -449,116 +428,90 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x23, x27, x20\n" - "mov z23.d, z24.d\n" - "tbnz %x[flags], #31, 26f\n" + "add x26, x27, x20\n" + "mov z23.d, z7.d\n" + "tbnz %x[flags], #31, 25f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "neg z24.s, p2/M, z24.s\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" + "neg z2.s, p2/M, z2.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z24.s\n" - "mul z12.s, p2/M, z12.s, z24.s\n" - "26:" // Height 2: skip row sum fixup + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "25:" // Height 2: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z28.s }, p2/Z, [x10]\n" - "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z23.s, z23.s, z28.s\n" - "add z20.s, z20.s, 
z27.s\n" - "addvl x10, x10, #4\n" - "add z21.s, z21.s, z26.s\n" - "add z22.s, z22.s, z25.s\n" - "add z16.s, z16.s, z28.s\n" - "add z17.s, z17.s, z27.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z18.s, z18.s, z26.s\n" - "add z19.s, z19.s, z25.s\n" - ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" - ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" - ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" - ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" - ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" - ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" - ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" - ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" - "tbz %x[flags], #5, 27f\n" - "and z24.d, z23.d, z0.d\n" - "and z30.d, z20.d, z0.d\n" - "and z29.d, z21.d, z0.d\n" - "and z28.d, z22.d, z0.d\n" - "and z27.d, z16.d, z0.d\n" - "and z26.d, z17.d, z0.d\n" - "asr z24.s, z24.s, #0x1f\n" - "and z25.d, z18.d, z0.d\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z27.s, z27.s, #0x1f\n" - "sqadd z23.s, z23.s, z24.s\n" - "and z24.d, z19.d, z0.d\n" - "asr z26.s, z26.s, #0x1f\n" - "asr z25.s, z25.s, #0x1f\n" - "sqadd z20.s, z20.s, z30.s\n" - "sqadd z21.s, z21.s, z29.s\n" - "asr z24.s, z24.s, #0x1f\n" - "sqadd z22.s, z22.s, z28.s\n" - "sqadd z16.s, z16.s, z27.s\n" - "sqadd z17.s, z17.s, z26.s\n" - "sqadd z18.s, z18.s, z25.s\n" - "sqadd z19.s, z19.s, z24.s\n" - "27:" // Height 2: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" + "add z23.s, z23.s, z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z18.s, z18.s, 
z2.s\n" + "add z19.s, z19.s, z3.s\n" + "addvl x28, x28, #4\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "ld1rw { z26.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z23.s, z23.s, z26.s\n" + "add z23.s, z23.s, z4.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z20.s, z20.s, z26.s\n" - "add z21.s, z21.s, z26.s\n" - "ld1rw { z25.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z26.s\n" - "add z16.s, z16.s, z26.s\n" - "add x20, %x[qp], %[minval]\n" - "add z17.s, z17.s, z26.s\n" - "add z18.s, z18.s, z26.s\n" - "ld1rw { z24.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z26.s\n" - "smin z23.s, p2/M, z23.s, z25.s\n" - "smin z20.s, p2/M, z20.s, z25.s\n" - "smin z21.s, p2/M, z21.s, z25.s\n" - "smin z22.s, p2/M, z22.s, z25.s\n" - "smin z16.s, p2/M, z16.s, z25.s\n" - "smin z17.s, p2/M, z17.s, z25.s\n" - "smin z18.s, p2/M, z18.s, z25.s\n" - "smin z19.s, p2/M, z19.s, z25.s\n" - "smax z23.s, p2/M, z23.s, z24.s\n" - "smax z20.s, p2/M, z20.s, z24.s\n" - "smax z21.s, p2/M, z21.s, z24.s\n" - "smax z22.s, p2/M, z22.s, z24.s\n" - "smax z16.s, p2/M, z16.s, z24.s\n" - "smax z17.s, p2/M, z17.s, z24.s\n" - "smax z18.s, p2/M, z18.s, z24.s\n" - "smax z19.s, p2/M, z19.s, z24.s\n" + "add 
z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z23.h, z23.h, z20.h\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z16.h, z16.h, z17.h\n" @@ -567,23 +520,22 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "uzp1 z16.b, z16.b, z17.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "st1b { z16.b }, p1, [x23]\n" - "28:" // Height 2: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 16b\n" - "b 58f\n" - "29:" // Height 3 - "mov x10, %x[col_bias]\n" + "st1b { z16.b }, p1, [x26]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 54f\n" + "27:" // Height 3 "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" "mov z15.b, #0x1\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "30:" // Height 3: Column loop + "28:" // Height 3: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -591,7 +543,7 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" 
"mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -602,39 +554,38 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "31:" // Height 3: setup done "mov x26, #0x0\n" - "32:" // Height 3: String loop + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" + "tbz %x[flags], #3, 31f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" - "cbnz x26, 34f\n" + "cbnz x26, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" - "b 34f\n" - "33:" // Height 3: setup direct input + "b 32f\n" + "31:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" - "34:" // Height 3: input setup done + "32:" // Height 3: input setup done "cmp x25, #0x10\n" - "ble 37f\n" - "35:" // Height 3: Multiply loop: Main loop head + "ble 35f\n" + "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" @@ -643,333 +594,294 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "add x22, x22, #0x10\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, 
z6.d\n" - "trn2 z3.d, z3.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" - ".inst 0x458a9814 // usmmla z20.s, z0.b, z10.b\n" - ".inst 0x45899811 // usmmla z17.s, z0.b, z9.b\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" + ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" - ".inst 0x45849812 // usmmla z18.s, z0.b, z4.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" ".inst 0x45859858 // usmmla z24.s, z2.b, z5.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x458a985c // usmmla z28.s, z2.b, z10.b\n" - ".inst 0x45899859 // usmmla z25.s, z2.b, z9.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4586985c // usmmla z28.s, z2.b, z6.b\n" + ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n" ".inst 0x4588985d // usmmla z29.s, z2.b, z8.b\n" - ".inst 0x4584985a // usmmla z26.s, z2.b, z4.b\n" - ".inst 0x45879816 // usmmla z22.s, z0.b, z7.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x4587985e // usmmla z30.s, z2.b, z7.b\n" - ".inst 0x45869813 // usmmla z19.s, z0.b, z6.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x4586985b // usmmla z27.s, z2.b, z6.b\n" + ".inst 0x4589985a // usmmla z26.s, z2.b, z9.b\n" + ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x458a985e // usmmla z30.s, z2.b, z10.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4584985b // usmmla z27.s, z2.b, z4.b\n" ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" 
+ "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" ".inst 0x4585985f // usmmla z31.s, z2.b, z5.b\n" - ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n" - ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n" - ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n" - ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n" - ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n" - ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n" - ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n" - ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n" - ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n" - ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45869878 // usmmla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + ".inst 0x4587987c // usmmla z28.s, z3.b, z7.b\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45889879 // usmmla z25.s, z3.b, z8.b\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x4589987d // usmmla z29.s, z3.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x458a987a // usmmla z26.s, z3.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x4584987e // usmmla z30.s, z3.b, z4.b\n" ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n" - ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n" - ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n" - "tbnz %x[flags], #31, 36f\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" + ".inst 0x4586987f // usmmla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 34f\n" "udot z11.s, z0.b, z15.b\n" "udot 
z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "36:" // Height 3: Multiply loop: unique 5: skip row sum + "34:" // Height 3: Multiply loop: unique 5: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 35b\n" - "37:" // Height 3: Multiply loop: Single iteration only + "bgt 33b\n" + "35:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z5.d\n" - "trn2 z3.d, z3.d, z5.d\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45849810 // usmmla z16.s, z0.b, z4.b\n" - ".inst 0x458a9814 // usmmla z20.s, z0.b, z10.b\n" - ".inst 0x45899811 // usmmla z17.s, z0.b, z9.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" + ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" - ".inst 0x45879812 // usmmla z18.s, z0.b, z7.b\n" - ".inst 0x45849858 // usmmla z24.s, z2.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x458a985c // usmmla z28.s, z2.b, z10.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45899859 // usmmla z25.s, z2.b, z9.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" + ".inst 0x45859858 
// usmmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4586985c // usmmla z28.s, z2.b, z6.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n" ".inst 0x4588985d // usmmla z29.s, z2.b, z8.b\n" - ".inst 0x4587985a // usmmla z26.s, z2.b, z7.b\n" - ".inst 0x45869816 // usmmla z22.s, z0.b, z6.b\n" - ".inst 0x4586985e // usmmla z30.s, z2.b, z6.b\n" - ".inst 0x45859813 // usmmla z19.s, z0.b, z5.b\n" - ".inst 0x4585985b // usmmla z27.s, z2.b, z5.b\n" - ".inst 0x45849817 // usmmla z23.s, z0.b, z4.b\n" - ".inst 0x4584985f // usmmla z31.s, z2.b, z4.b\n" - "ble 38f\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n" - ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n" - ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n" - ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n" - ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n" - ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n" - ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n" - ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n" - ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n" - ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n" + ".inst 0x4589985a // usmmla z26.s, z2.b, z9.b\n" + ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" + ".inst 0x458a985e // usmmla z30.s, z2.b, z10.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + ".inst 0x4584985b // usmmla z27.s, z2.b, z4.b\n" + ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" + ".inst 0x4585985f // usmmla z31.s, z2.b, z5.b\n" + "ble 36f\n" + "ld1b { z6.b }, 
p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + ".inst 0x45869878 // usmmla z24.s, z3.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + ".inst 0x4587987c // usmmla z28.s, z3.b, z7.b\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45889879 // usmmla z25.s, z3.b, z8.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x4589987d // usmmla z29.s, z3.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x458a987a // usmmla z26.s, z3.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x4584987e // usmmla z30.s, z3.b, z4.b\n" ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n" - ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n" - ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n" - "38:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 39f\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" + ".inst 0x4586987f // usmmla z31.s, z3.b, z6.b\n" + "36:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 37f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 6: skip row sum + "37:" // Height 3: Multiply loop: unique 6: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 32b\n" + "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z0.d, z16.d, z20.d\n" + "uzp1 z7.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, 
z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x27, x20\n" + "add x26, x27, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x22, x23, x20\n" + "add x25, x26, x20\n" "uzp1 z24.d, z24.d, z28.d\n" "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z0.d\n" - "tbnz %x[flags], #31, 40f\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 38f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "neg z23.s, p2/M, z23.s\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" + "neg z3.s, p2/M, z3.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" "mov z13.s, z13.s[0]\n" - "mul z11.s, p2/M, z11.s, z23.s\n" - "mul z12.s, p2/M, z12.s, z23.s\n" - "mul z13.s, p2/M, z13.s, z23.s\n" - "40:" // Height 3: skip row sum fixup + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "38:" // Height 3: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], 
%[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z31.s, z31.s, z0.s\n" - "add z20.s, z20.s, z30.s\n" - "add z21.s, z21.s, z29.s\n" - "add z22.s, z22.s, z28.s\n" + "add z20.s, z20.s, z1.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z30.s\n" - "add z18.s, z18.s, z29.s\n" - "add z19.s, z19.s, z28.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z30.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z29.s\n" - "add z27.s, z27.s, z28.s\n" - ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" - ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" - ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n" - ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" - ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n" - ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n" - ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n" - ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" - ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n" - ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n" - ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n" - ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n" - "tbz %x[flags], #5, 41f\n" - "and z1.d, z31.d, z0.d\n" - "and z30.d, z20.d, z0.d\n" - "and z29.d, z21.d, z0.d\n" - "and z28.d, z22.d, z0.d\n" - "and z23.d, z16.d, z0.d\n" - "and z3.d, z17.d, z0.d\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "and z2.d, z18.d, z0.d\n" - "sqadd z31.s, z31.s, z1.s\n" - "sqadd z20.s, z20.s, z30.s\n" - "sqadd z21.s, z21.s, z29.s\n" - "sqadd z22.s, z22.s, z28.s\n" - "sqadd z16.s, z16.s, z23.s\n" - 
"and z1.d, z19.d, z0.d\n" - "and z30.d, z24.d, z0.d\n" - "and z29.d, z25.d, z0.d\n" - "and z28.d, z26.d, z0.d\n" - "and z23.d, z27.d, z0.d\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "asr z30.s, z30.s, #0x1f\n" - "asr z29.s, z29.s, #0x1f\n" - "asr z28.s, z28.s, #0x1f\n" - "asr z23.s, z23.s, #0x1f\n" - "sqadd z17.s, z17.s, z3.s\n" - "sqadd z18.s, z18.s, z2.s\n" - "sqadd z19.s, z19.s, z1.s\n" - "sqadd z24.s, z24.s, z30.s\n" - "sqadd z25.s, z25.s, z29.s\n" - "sqadd z26.s, z26.s, z28.s\n" - "sqadd z27.s, z27.s, z23.s\n" - "41:" // Height 3: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z29.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z31.s, z31.s, z29.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - 
"add z20.s, z20.s, z29.s\n" - "add z21.s, z21.s, z29.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z22.s, z22.s, z29.s\n" - "add z16.s, z16.s, z29.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z17.s, z17.s, z29.s\n" - "add z18.s, z18.s, z29.s\n" - "ld1rw { z28.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z29.s\n" - "add z24.s, z24.s, z29.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z29.s\n" - "add z26.s, z26.s, z29.s\n" - "ld1rw { z23.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z29.s\n" - "smin z31.s, p2/M, z31.s, z28.s\n" - "smin z20.s, p2/M, z20.s, z28.s\n" - "smin z21.s, p2/M, z21.s, z28.s\n" - "smin z22.s, p2/M, z22.s, z28.s\n" - "smin z16.s, p2/M, z16.s, z28.s\n" - "smin z17.s, p2/M, z17.s, z28.s\n" - "smin z18.s, p2/M, z18.s, z28.s\n" - "smin z19.s, p2/M, z19.s, z28.s\n" - "smin z24.s, p2/M, z24.s, z28.s\n" - "smin z25.s, p2/M, z25.s, z28.s\n" - "smin z26.s, p2/M, z26.s, z28.s\n" - "smin z27.s, p2/M, z27.s, z28.s\n" - "smax z31.s, p2/M, z31.s, z23.s\n" - "smax z20.s, p2/M, z20.s, z23.s\n" - "smax z21.s, p2/M, z21.s, z23.s\n" - "smax z22.s, p2/M, z22.s, z23.s\n" - "smax z16.s, p2/M, z16.s, z23.s\n" - "smax z17.s, p2/M, z17.s, z23.s\n" - "smax z18.s, p2/M, z18.s, z23.s\n" - "smax z19.s, p2/M, z19.s, z23.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, 
z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z24.s, p2/M, z24.s, z23.s\n" - "smax z25.s, p2/M, z25.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z26.s, p2/M, z26.s, z23.s\n" - "smax z27.s, p2/M, z27.s, z23.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "uzp1 z17.h, z26.h, z27.h\n" - "uzp1 z16.b, z16.b, z18.b\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z16.b, z16.b, z17.b\n" "st1b { z31.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" - "st1b { z24.b }, p1, [x22]\n" - "42:" // Height 3: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 30b\n" - "b 58f\n" - "43:" // Height 4 + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z24.b }, p1, [x25]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 54f\n" + "40:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "ldr x27, [%x[args_ptr], %[offsetof_output_ptr]]\n" "mov x20, #0x4\n" - "mov x10, %x[col_bias]\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z13.s, #0x0\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" "mov z14.s, #0x0\n" - "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, 
x21, x20, x27\n" "mov z15.b, #0x1\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[col_bias]\n" + "madd x20, x21, x20, x27\n" "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "44:" // Height 4: Column loop + "41:" // Height 4: Column loop "mov x20, #0x0\n" "mov z16.s, #0x0\n" "mov z17.s, #0x0\n" @@ -977,7 +889,7 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" - "whilelt p1.b, x20, x9\n" + "whilelt p1.b, x20, x10\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" "mov z24.s, #0x0\n" @@ -988,191 +900,190 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "45:" // Height 4: setup done "mov x26, #0x0\n" - "46:" // Height 4: String loop + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "tbz %x[flags], #3, 47f\n" + "tbz %x[flags], #3, 44f\n" "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" "add x20, x20, x21, LSL #3\n" "ldr x24, [x20, #0x0]\n" "ldr x23, [x20, #0x8]\n" "ldr x22, [x20, #0x10]\n" "ldr x21, [x20, #0x18]\n" - "cbnz x26, 48f\n" + "cbnz x26, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" "add x23, x23, x20\n" "add x22, x22, x20\n" "add x21, x21, x20\n" - "b 48f\n" - "47:" // Height 4: setup direct input + "b 45f\n" + "44:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" "add x23, x24, x21\n" "add x22, x23, x21\n" "add x21, x22, x21\n" - "48:" // Height 4: input setup done + "45:" // Height 4: input setup done "cmp x25, #0x10\n" - "ble 51f\n" - "49:" // Height 4: Multiply loop: Main loop head + "ble 48f\n" + "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, 
#4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" "add x21, x21, #0x10\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z6.d\n" - "trn2 z3.d, z3.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, #6, MUL VL]\n" ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" - ".inst 0x45849814 // usmmla z20.s, z0.b, z4.b\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" - ".inst 0x45899815 // usmmla z21.s, z0.b, z9.b\n" - ".inst 0x45889812 // usmmla z18.s, z0.b, z8.b\n" + ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" ".inst 0x45859858 // usmmla z24.s, z2.b, z5.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - "addvl x28, x28, #16\n" - ".inst 0x4584985c // usmmla z28.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" + ".inst 0x4586985c // usmmla z28.s, z2.b, z6.b\n" ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n" - ".inst 0x4589985d // usmmla z29.s, z2.b, z9.b\n" - ".inst 0x4588985a // usmmla z26.s, z2.b, z8.b\n" + ".inst 0x4588985d // usmmla z29.s, z2.b, z8.b\n" + ".inst 0x4589985a // usmmla z26.s, z2.b, z9.b\n" ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" ".inst 0x458a985e // usmmla z30.s, z2.b, z10.b\n" - ".inst 0x45869813 // usmmla 
z19.s, z0.b, z6.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x4586985b // usmmla z27.s, z2.b, z6.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x4584985b // usmmla z27.s, z2.b, z4.b\n" ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #-5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #-4, MUL VL]\n" ".inst 0x4585985f // usmmla z31.s, z2.b, z5.b\n" - ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n" - ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n" - ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n" - ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n" - ".inst 0x45889835 // usmmla z21.s, z1.b, z8.b\n" - ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n" - ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n" - ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n" - ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n" - ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + "ld1b { z4.b }, p2/Z, [x9, #-3, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45869878 // usmmla z24.s, z3.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + ".inst 0x4587987c // usmmla z28.s, z3.b, z7.b\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45889879 // usmmla z25.s, z3.b, z8.b\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x4589987d // usmmla z29.s, z3.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 
0x458a987a // usmmla z26.s, z3.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x4584987e // usmmla z30.s, z3.b, z4.b\n" ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n" - ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n" - ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n" - "tbnz %x[flags], #31, 50f\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" + ".inst 0x4586987f // usmmla z31.s, z3.b, z6.b\n" + "tbnz %x[flags], #31, 47f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "50:" // Height 4: Multiply loop: unique 7: skip row sum + "47:" // Height 4: Multiply loop: unique 7: skip row sum "sub x25, x25, #0x10\n" "cmp x25, #0x10\n" - "bgt 49b\n" - "51:" // Height 4: Multiply loop: Single iteration only + "bgt 46b\n" + "48:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #5, MUL VL]\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" "ld1rqb { z2.b }, p0/Z, [x23]\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z5.b }, p0/Z, [x21]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z5.d\n" - "trn2 z3.d, z3.d, z5.d\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45869810 // usmmla z16.s, z0.b, z6.b\n" - ".inst 0x45849814 // usmmla z20.s, z0.b, z4.b\n" + "trn1 z2.d, z3.d, z4.d\n" + "trn2 z3.d, z3.d, z4.d\n" + "ld1b { z4.b }, p2/Z, [x9, 
#6, MUL VL]\n" + ".inst 0x45859810 // usmmla z16.s, z0.b, z5.b\n" + ".inst 0x45869814 // usmmla z20.s, z0.b, z6.b\n" ".inst 0x45879811 // usmmla z17.s, z0.b, z7.b\n" - ".inst 0x45899815 // usmmla z21.s, z0.b, z9.b\n" - ".inst 0x45889812 // usmmla z18.s, z0.b, z8.b\n" - ".inst 0x45869858 // usmmla z24.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x4584985c // usmmla z28.s, z2.b, z4.b\n" - "addvl x28, x28, #8\n" + ".inst 0x45889815 // usmmla z21.s, z0.b, z8.b\n" + ".inst 0x45899812 // usmmla z18.s, z0.b, z9.b\n" + ".inst 0x45859858 // usmmla z24.s, z2.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4586985c // usmmla z28.s, z2.b, z6.b\n" + "addvl x9, x9, #8\n" ".inst 0x45879859 // usmmla z25.s, z2.b, z7.b\n" - ".inst 0x4589985d // usmmla z29.s, z2.b, z9.b\n" - ".inst 0x4588985a // usmmla z26.s, z2.b, z8.b\n" + ".inst 0x4588985d // usmmla z29.s, z2.b, z8.b\n" + ".inst 0x4589985a // usmmla z26.s, z2.b, z9.b\n" ".inst 0x458a9816 // usmmla z22.s, z0.b, z10.b\n" ".inst 0x458a985e // usmmla z30.s, z2.b, z10.b\n" - ".inst 0x45859813 // usmmla z19.s, z0.b, z5.b\n" - ".inst 0x4585985b // usmmla z27.s, z2.b, z5.b\n" - ".inst 0x45869817 // usmmla z23.s, z0.b, z6.b\n" - ".inst 0x4586985f // usmmla z31.s, z2.b, z6.b\n" - "ble 52f\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45849830 // usmmla z16.s, z1.b, z4.b\n" - ".inst 0x45849878 // usmmla z24.s, z3.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x458a9834 // usmmla z20.s, z1.b, z10.b\n" - ".inst 0x458a987c // usmmla z28.s, z3.b, z10.b\n" - ".inst 0x45899831 // usmmla z17.s, z1.b, z9.b\n" - ".inst 0x45899879 // usmmla z25.s, z3.b, z9.b\n" - "addvl x28, x28, #8\n" - ".inst 0x45889835 // usmmla 
z21.s, z1.b, z8.b\n" - ".inst 0x4588987d // usmmla z29.s, z3.b, z8.b\n" - ".inst 0x45879832 // usmmla z18.s, z1.b, z7.b\n" - ".inst 0x4587987a // usmmla z26.s, z3.b, z7.b\n" - ".inst 0x45869836 // usmmla z22.s, z1.b, z6.b\n" - ".inst 0x4586987e // usmmla z30.s, z3.b, z6.b\n" + ".inst 0x45849813 // usmmla z19.s, z0.b, z4.b\n" + ".inst 0x4584985b // usmmla z27.s, z2.b, z4.b\n" + ".inst 0x45859817 // usmmla z23.s, z0.b, z5.b\n" + ".inst 0x4585985f // usmmla z31.s, z2.b, z5.b\n" + "ble 49f\n" + "ld1b { z6.b }, p2/Z, [x9]\n" + "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45869830 // usmmla z16.s, z1.b, z6.b\n" + ".inst 0x45869878 // usmmla z24.s, z3.b, z6.b\n" + "ld1b { z5.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45879834 // usmmla z20.s, z1.b, z7.b\n" + ".inst 0x4587987c // usmmla z28.s, z3.b, z7.b\n" + ".inst 0x45889831 // usmmla z17.s, z1.b, z8.b\n" + ".inst 0x45889879 // usmmla z25.s, z3.b, z8.b\n" + "addvl x9, x9, #8\n" + ".inst 0x45899835 // usmmla z21.s, z1.b, z9.b\n" + ".inst 0x4589987d // usmmla z29.s, z3.b, z9.b\n" + ".inst 0x458a9832 // usmmla z18.s, z1.b, z10.b\n" + ".inst 0x458a987a // usmmla z26.s, z3.b, z10.b\n" + ".inst 0x45849836 // usmmla z22.s, z1.b, z4.b\n" + ".inst 0x4584987e // usmmla z30.s, z3.b, z4.b\n" ".inst 0x45859833 // usmmla z19.s, z1.b, z5.b\n" ".inst 0x4585987b // usmmla z27.s, z3.b, z5.b\n" - ".inst 0x45849837 // usmmla z23.s, z1.b, z4.b\n" - ".inst 0x4584987f // usmmla z31.s, z3.b, z4.b\n" - "52:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 53f\n" + ".inst 0x45869837 // usmmla z23.s, z1.b, z6.b\n" + ".inst 0x4586987f // usmmla z31.s, z3.b, z6.b\n" + "49:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot 
z11.s, z1.b, z15.b\n" "udot z13.s, z3.b, z15.b\n" - "53:" // Height 4: Multiply loop: unique 8: skip row sum + "50:" // Height 4: Multiply loop: unique 8: skip row sum "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" - "bne 46b\n" + "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z0.d, z16.d, z20.d\n" + "uzp1 z7.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" - "add x23, x27, x20\n" - "add x22, x23, x20\n" + "add x26, x27, x20\n" + "add x25, x26, x20\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "add x21, x22, x20\n" + "add x24, x25, x20\n" "uzp1 z23.d, z24.d, z28.d\n" "uzp2 z24.d, z24.d, z28.d\n" "uzp1 z28.d, z25.d, z29.d\n" @@ -1181,233 +1092,182 @@ void sve_hybrid_u8s8qa_mmla_4x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z0.d\n" - "tbnz %x[flags], #31, 54f\n" + "mov z31.d, z7.d\n" + "tbnz %x[flags], #31, 51f\n" "add x20, %x[qp], %[b_offset]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "neg z0.s, p2/M, z0.s\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "neg z4.s, p2/M, z4.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" "mov z14.s, z13.s[3]\n" "mov z13.s, z13.s[0]\n" - "mul z11.s, p2/M, z11.s, z0.s\n" - "mul z12.s, p2/M, z12.s, z0.s\n" - "mul z13.s, p2/M, z13.s, z0.s\n" - "mul z14.s, p2/M, z14.s, z0.s\n" - "54:" // Height 4: skip row sum fixup + "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "51:" // Height 4: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z4.s }, p2/Z, [x10]\n" - "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x28]\n" + "ld1w { z1.s }, 
p2/Z, [x28, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x28, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "orr %x[flags], %x[flags], #0x80000000\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" + "add x22, %x[qp], %[c_offset]\n" "add z23.s, z23.s, z13.s\n" "add z28.s, z28.s, z13.s\n" - "addvl x10, x10, #4\n" + "add x21, %x[qp], %[maxval]\n" + "add x20, %x[qp], %[minval]\n" "add z29.s, z29.s, z13.s\n" "add z30.s, z30.s, z13.s\n" + "ld1rw { z6.s }, p2/Z, [x21]\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add z24.s, z24.s, z14.s\n" "add z25.s, z25.s, z14.s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "addvl x28, x28, #4\n" "add z26.s, z26.s, z14.s\n" "add z27.s, z27.s, z14.s\n" - "add z31.s, z31.s, z4.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z3.s\n" - "add z22.s, z22.s, z2.s\n" - "add z16.s, z16.s, z4.s\n" - "add z17.s, z17.s, z0.s\n" - "add z18.s, z18.s, z3.s\n" - "add z19.s, z19.s, z2.s\n" - "add z23.s, z23.s, z4.s\n" - "add z28.s, z28.s, z0.s\n" - "add z29.s, z29.s, z3.s\n" - "add z30.s, z30.s, z2.s\n" - "add z24.s, z24.s, z4.s\n" - "add z25.s, z25.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z3.s\n" - "add z27.s, z27.s, z2.s\n" - ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" - ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" - ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" - ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" - ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" - ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" - ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" - ".inst 
0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" - ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" - ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" - ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" - ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" - ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" - ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" - ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" - ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" - "tbz %x[flags], #5, 55f\n" - "and z2.d, z31.d, z0.d\n" - "and z1.d, z20.d, z0.d\n" - "and z7.d, z21.d, z0.d\n" - "and z6.d, z22.d, z0.d\n" - "and z5.d, z16.d, z0.d\n" - "and z4.d, z17.d, z0.d\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "and z3.d, z18.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z31.s, z31.s, z2.s\n" - "sqadd z20.s, z20.s, z1.s\n" - "and z2.d, z19.d, z0.d\n" - "and z1.d, z23.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "sqadd z21.s, z21.s, z7.s\n" - "sqadd z22.s, z22.s, z6.s\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z16.s, z16.s, z5.s\n" - "sqadd z17.s, z17.s, z4.s\n" - "sqadd z18.s, z18.s, z3.s\n" - "and z7.d, z28.d, z0.d\n" - "sqadd z19.s, z19.s, z2.s\n" - "sqadd z23.s, z23.s, z1.s\n" - "and z6.d, z29.d, z0.d\n" - "and z5.d, z30.d, z0.d\n" - "and z4.d, z24.d, z0.d\n" - "and z3.d, z25.d, z0.d\n" - "and z2.d, z26.d, z0.d\n" - "and z1.d, z27.d, z0.d\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z3.s, z3.s, #0x1f\n" - "asr z2.s, z2.s, #0x1f\n" - "asr z1.s, z1.s, #0x1f\n" - "sqadd z28.s, z28.s, z7.s\n" - "sqadd z29.s, z29.s, z6.s\n" - "sqadd z30.s, z30.s, z5.s\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z3.s\n" - "sqadd z26.s, z26.s, z2.s\n" - "sqadd z27.s, z27.s, z1.s\n" - "55:" // Height 4: no shift correction - "add x20, %x[qp], %[c_offset]\n" + "add z31.s, z31.s, 
z0.s\n" + "add z20.s, z20.s, z1.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z3.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z23.s, z23.s, z0.s\n" + "add z28.s, z28.s, z1.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a473ff // sqdmulh z31.s, z31.s, z4.s\n" + ".inst 0x04a47294 // sqdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a472b5 // sqdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a472d6 // sqdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a47210 // sqdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47231 // sqdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47252 // sqdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47273 // sqdmulh z19.s, z19.s, z4.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + ".inst 0x04a472f7 // sqdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a4739c // sqdmulh z28.s, z28.s, z4.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x04a473bd // sqdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a473de // sqdmulh z30.s, z30.s, z4.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + ".inst 0x04a47318 // sqdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47339 // sqdmulh z25.s, z25.s, z4.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z31.s, z31.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + ".inst 0x04a4735a // sqdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4737b // sqdmulh z27.s, z27.s, z4.s\n" + "ld1rw { z4.s }, p2/Z, [x22]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z20.s, z20.s, z2.s\n" - "add z21.s, z21.s, z2.s\n" ".inst 0x4482881c // 
srshl z28.s, p2/M, z28.s, z0.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" - "add z22.s, z22.s, z2.s\n" - "add z16.s, z16.s, z2.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z17.s, z17.s, z2.s\n" - "add z18.s, z18.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z19.s, z19.s, z2.s\n" - "add z23.s, z23.s, z2.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x20, %x[qp], %[maxval]\n" - "add z28.s, z28.s, z2.s\n" - "add z29.s, z29.s, z2.s\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z2.s\n" - "add z24.s, z24.s, z2.s\n" - "add x20, %x[qp], %[minval]\n" - "add z25.s, z25.s, z2.s\n" - "add z26.s, z26.s, z2.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z2.s\n" - "smin z31.s, p2/M, z31.s, z1.s\n" - "smin z20.s, p2/M, z20.s, z1.s\n" - "smin z21.s, p2/M, z21.s, z1.s\n" - "smin z22.s, p2/M, z22.s, z1.s\n" - "smin z16.s, p2/M, z16.s, z1.s\n" - "smin z17.s, p2/M, z17.s, z1.s\n" - "smin z18.s, p2/M, z18.s, z1.s\n" - "smin z19.s, p2/M, z19.s, z1.s\n" - "smin z23.s, p2/M, z23.s, z1.s\n" - "smin z28.s, p2/M, z28.s, z1.s\n" - "smin z29.s, p2/M, z29.s, z1.s\n" - "smin z30.s, p2/M, z30.s, z1.s\n" - "smin z24.s, p2/M, z24.s, z1.s\n" - "smin z25.s, p2/M, z25.s, z1.s\n" - "smin z26.s, p2/M, z26.s, z1.s\n" - "smin z27.s, p2/M, z27.s, z1.s\n" - "smax z31.s, p2/M, z31.s, z0.s\n" - "smax z20.s, p2/M, z20.s, z0.s\n" - "smax z21.s, p2/M, z21.s, z0.s\n" - "smax z22.s, p2/M, z22.s, z0.s\n" - "smax z16.s, p2/M, z16.s, z0.s\n" - "smax z17.s, p2/M, z17.s, z0.s\n" - "smax z18.s, p2/M, z18.s, z0.s\n" - "smax z19.s, p2/M, z19.s, z0.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z31.s, p2/M, z31.s, 
z6.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z4.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "add z26.s, z26.s, z4.s\n" + "add z27.s, z27.s, z4.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z23.s, p2/M, z23.s, z0.s\n" - "smax z28.s, p2/M, z28.s, z0.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z5.s\n" "uzp1 z20.h, z21.h, z22.h\n" - "smax z29.s, p2/M, z29.s, z0.s\n" - "smax z30.s, p2/M, z30.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z24.s, p2/M, z24.s, z0.s\n" - "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" - "smax z26.s, p2/M, z26.s, z0.s\n" - "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z23.h, z23.h, z28.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "uzp1 z18.h, z29.h, z30.h\n" + "uzp1 z28.h, z29.h, z30.h\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "uzp1 z17.h, z26.h, z27.h\n" + "uzp1 z25.h, z26.h, z27.h\n" 
"st1b { z31.b }, p1, [x27]\n" "addvl x27, x27, #1\n" - "uzp1 z23.b, z23.b, z18.b\n" - "uzp1 z24.b, z24.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" - "st1b { z23.b }, p1, [x22]\n" - "st1b { z24.b }, p1, [x21]\n" - "56:" // Height 4: Writeback done - "decw x9, ALL, MUL #4\n" - "cmp x9, XZR\n" - "bgt 44b\n" + "uzp1 z23.b, z23.b, z28.b\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z16.b }, p1, [x26]\n" + "st1b { z23.b }, p1, [x25]\n" + "st1b { z24.b }, p1, [x24]\n" + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" "subs %x[M], %x[M], #0x4\n" - "beq 58f\n" + "beq 54f\n" "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 57f\n" + "tbz %x[flags], #3, 53f\n" "add x21, x21, #0x4\n" "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "57:" // Update direct input + "53:" // Update direct input "mov x20, #0x4\n" "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" - "58:" // Exit + "54:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", 
"x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp index bde81f36b3..90604e941a 100644 --- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp +++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp @@ -70,9 +70,10 @@ class QuantizeWrapper : public GemmCommon { if (working_space == nullptr || arrays_set == false) return; + auto& g_array = this->_gemm_array; /* Use the first part of our working space for the subgemm result, pass the operand details straight through. */ - _subgemm->set_arrays(this->_Aptr, this->_lda, this->_A_batch_stride, this->_A_multi_stride, - this->_Bptr, this->_ldb, this->_B_multi_stride, + _subgemm->set_arrays(g_array._Aptr, g_array._lda, g_array._A_batch_stride, g_array._A_multi_stride, + g_array._Bptr, g_array._ldb, g_array._B_multi_stride, reinterpret_cast(working_space), _args._Nsize, (_args._Nsize * _args._Msize), (_args._Nsize * _args._Msize * _args._nbatches), nullptr, 0); } @@ -86,20 +87,22 @@ class QuantizeWrapper : public GemmCommon { void requantize_runtime(unsigned int threadid) { unsigned int first_row = (threadid * _args._Msize) / _args._maxthreads; unsigned int last_row = ((threadid+1) * _args._Msize) / _args._maxthreads; + auto& g_array = this->_gemm_array; for (unsigned int multi=0; multi<_args._nmulti; multi++) { for (unsigned int batch=0; batch<_args._nbatches; batch++) { /* Compute row sums now */ - compute_row_sums(_params, _args._Ksize, (last_row - first_row), this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (first_row * this->_lda), - this->_lda, _row_sums + (multi * _args._nbatches * _args._Msize) + (batch * _args._Msize) + first_row); + compute_row_sums(_params, 
_args._Ksize, (last_row - first_row), g_array._Aptr + (multi * g_array._A_multi_stride) + + (batch * g_array._A_batch_stride) + (first_row * g_array._lda), g_array._lda, _row_sums + + (multi * _args._nbatches * _args._Msize) + (batch * _args._Msize) + first_row); // If we don't care about negative values, call the version of this function that doesn't correct before shifting. // 'c_offset' represents zero, so if the lowest possible quantized output value is the same or more than that we will not output negative numbers. - requantize_block_32(_params, _args._Nsize, (last_row - first_row), - reinterpret_cast(working_space) + (multi * (_args._Msize * _args._Nsize * _args._nbatches)) + (batch * (_args._Msize * _args._Nsize)) + (first_row * _args._Nsize), - _args._Nsize, - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (first_row * this->_ldc), this->_ldc, - _row_sums + (multi * _args._nbatches * _args._Msize) + (batch * _args._Msize) + first_row, - _col_sums + (multi * _args._Nsize), 0); + requantize_block_32(_params, _args._Nsize, (last_row - first_row), reinterpret_cast(working_space) + + (multi * (_args._Msize * _args._Nsize * _args._nbatches)) + (batch * (_args._Msize * _args._Nsize)) + + (first_row * _args._Nsize), _args._Nsize, g_array._Cptr + (multi * g_array._C_multi_stride) + + (batch * g_array._C_batch_stride) + (first_row * g_array._ldc), g_array._ldc, _row_sums + + (multi * _args._nbatches * _args._Msize) + (batch * _args._Msize) + first_row, _col_sums + + (multi * _args._Nsize), 0); } } } @@ -138,7 +141,10 @@ class QuantizeWrapper : public GemmCommon { _args._maxthreads = nthreads; } - void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { + // TODO: Make this actually stateless. This still uses the stateful + // execution data because it requires a workspace which would also need to + // be handled statelessly. 
+ void execute_stateless(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid, GemmArrays &) override { _subgemm->execute(work_range, thread_locator, threadid); _barrier.arrive_and_wait(); @@ -146,6 +152,10 @@ class QuantizeWrapper : public GemmCommon { requantize_runtime(threadid); } + void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { + execute_stateless(work_range, thread_locator, threadid, this->_gemm_array); + } + size_t get_working_size() const override { return _subgemm->get_working_size() + local_working_size(); } diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp index dbc64fe038..d18477d628 100644 --- a/src/core/NEON/kernels/arm_gemm/quantized.cpp +++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp @@ -55,7 +55,7 @@ namespace { * column is set up in any case (and it is hoped that the compiler can elide * the needless movs in the per-layer case). */ -template +template void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigned int height, const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, const unsigned int start_col) { @@ -203,48 +203,15 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne } // Multiply - v_in00 = vqrdmulhq_s32(v_in00, v_mul0); - v_in01 = vqrdmulhq_s32(v_in01, v_mul1); - v_in02 = vqrdmulhq_s32(v_in02, v_mul2); - v_in03 = vqrdmulhq_s32(v_in03, v_mul3); - - v_in10 = vqrdmulhq_s32(v_in10, v_mul0); - v_in11 = vqrdmulhq_s32(v_in11, v_mul1); - v_in12 = vqrdmulhq_s32(v_in12, v_mul2); - v_in13 = vqrdmulhq_s32(v_in13, v_mul3); - - // Compute and add on corrective offset - if (do_shift_correction) { - int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0); - int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1); - int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2); - int32x4_t v_temp03 = vandq_s32(v_in03, 
v_shf3); - - int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0); - int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1); - int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2); - int32x4_t v_temp13 = vandq_s32(v_in13, v_shf3); - - v_temp00 = vshrq_n_s32(v_temp00, 31); - v_temp01 = vshrq_n_s32(v_temp01, 31); - v_temp02 = vshrq_n_s32(v_temp02, 31); - v_temp03 = vshrq_n_s32(v_temp03, 31); - - v_temp10 = vshrq_n_s32(v_temp10, 31); - v_temp11 = vshrq_n_s32(v_temp11, 31); - v_temp12 = vshrq_n_s32(v_temp12, 31); - v_temp13 = vshrq_n_s32(v_temp13, 31); - - v_in00 = vqaddq_s32(v_in00, v_temp00); - v_in01 = vqaddq_s32(v_in01, v_temp01); - v_in02 = vqaddq_s32(v_in02, v_temp02); - v_in03 = vqaddq_s32(v_in03, v_temp03); - - v_in10 = vqaddq_s32(v_in10, v_temp10); - v_in11 = vqaddq_s32(v_in11, v_temp11); - v_in12 = vqaddq_s32(v_in12, v_temp12); - v_in13 = vqaddq_s32(v_in13, v_temp13); - } + v_in00 = vqdmulhq_s32(v_in00, v_mul0); + v_in01 = vqdmulhq_s32(v_in01, v_mul1); + v_in02 = vqdmulhq_s32(v_in02, v_mul2); + v_in03 = vqdmulhq_s32(v_in03, v_mul3); + + v_in10 = vqdmulhq_s32(v_in10, v_mul0); + v_in11 = vqdmulhq_s32(v_in11, v_mul1); + v_in12 = vqdmulhq_s32(v_in12, v_mul2); + v_in13 = vqdmulhq_s32(v_in13, v_mul3); v_in00 = vrshlq_s32(v_in00, v_shf0); v_in01 = vrshlq_s32(v_in01, v_shf1); @@ -390,40 +357,13 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne } // Multiply - v_in00 = vqrdmulhq_s32(v_in00, v_mul0); - v_in01 = vqrdmulhq_s32(v_in01, v_mul1); - v_in02 = vqrdmulhq_s32(v_in02, v_mul2); - - v_in10 = vqrdmulhq_s32(v_in10, v_mul0); - v_in11 = vqrdmulhq_s32(v_in11, v_mul1); - v_in12 = vqrdmulhq_s32(v_in12, v_mul2); - - // Compute and add on corrective offset - if (do_shift_correction) { - int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0); - int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1); - int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2); - - int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0); - int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1); - int32x4_t v_temp12 = 
vandq_s32(v_in12, v_shf2); - - v_temp00 = vshrq_n_s32(v_temp00, 31); - v_temp01 = vshrq_n_s32(v_temp01, 31); - v_temp02 = vshrq_n_s32(v_temp02, 31); - - v_temp10 = vshrq_n_s32(v_temp10, 31); - v_temp11 = vshrq_n_s32(v_temp11, 31); - v_temp12 = vshrq_n_s32(v_temp12, 31); - - v_in00 = vqaddq_s32(v_in00, v_temp00); - v_in01 = vqaddq_s32(v_in01, v_temp01); - v_in02 = vqaddq_s32(v_in02, v_temp02); - - v_in10 = vqaddq_s32(v_in10, v_temp10); - v_in11 = vqaddq_s32(v_in11, v_temp11); - v_in12 = vqaddq_s32(v_in12, v_temp12); - } + v_in00 = vqdmulhq_s32(v_in00, v_mul0); + v_in01 = vqdmulhq_s32(v_in01, v_mul1); + v_in02 = vqdmulhq_s32(v_in02, v_mul2); + + v_in10 = vqdmulhq_s32(v_in10, v_mul0); + v_in11 = vqdmulhq_s32(v_in11, v_mul1); + v_in12 = vqdmulhq_s32(v_in12, v_mul2); v_in00 = vrshlq_s32(v_in00, v_shf0); v_in01 = vrshlq_s32(v_in01, v_shf1); @@ -525,24 +465,9 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne } // Then multiply - v_in00 = vqrdmulhq_s32(v_in00, v_mul0); - - v_in10 = vqrdmulhq_s32(v_in10, v_mul0); - - // Compute and add on corrective offset - if (do_shift_correction) { - int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0); - - int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0); + v_in00 = vqdmulhq_s32(v_in00, v_mul0); - v_temp00 = vshrq_n_s32(v_temp00, 31); - - v_temp10 = vshrq_n_s32(v_temp10, 31); - - v_in00 = vqaddq_s32(v_in00, v_temp00); - - v_in10 = vqaddq_s32(v_in10, v_temp10); - } + v_in10 = vqdmulhq_s32(v_in10, v_mul0); v_in00 = vrshlq_s32(v_in00, v_shf0); @@ -639,24 +564,9 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne } // Then multiply - v_in00 = vqrdmulhq_s32(v_in00, v_mul0); - - v_in10 = vqrdmulhq_s32(v_in10, v_mul0); - - // Compute and add on corrective offset - if (do_shift_correction) { - int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0); + v_in00 = vqdmulhq_s32(v_in00, v_mul0); - int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0); - - v_temp00 = vshrq_n_s32(v_temp00, 31); - - v_temp10 = 
vshrq_n_s32(v_temp10, 31); - - v_in00 = vqaddq_s32(v_in00, v_temp00); - - v_in10 = vqaddq_s32(v_in10, v_temp10); - } + v_in10 = vqdmulhq_s32(v_in10, v_mul0); v_in00 = vrshlq_s32(v_in00, v_shf0); @@ -699,40 +609,20 @@ void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned in const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col) { if (qp.per_channel_requant) { - if (qp.minval >= qp.c_offset) { - if (qp.per_channel_left_shifts) { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } else { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } + if (qp.per_channel_left_shifts) { + requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, + reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); } else { - if (qp.per_channel_left_shifts) { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } else { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } + requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, + reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); } } else { - if (qp.minval >= qp.c_offset) { - if (qp.per_layer_left_shift > 0) { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } else { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } + if 
(qp.per_layer_left_shift > 0) { + requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, + reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); } else { - if (qp.per_layer_left_shift > 0) { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } else { - requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, - reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); - } + requantize_block_32_int(qp, width, height, reinterpret_cast(input), in_stride, + reinterpret_cast(output), out_stride, row_bias, col_bias, start_col); } } } diff --git a/src/core/NEON/wrapper/intrinsics/add.h b/src/core/NEON/wrapper/intrinsics/add.h index 6134d75b29..e4e37d6dc9 100644 --- a/src/core/NEON/wrapper/intrinsics/add.h +++ b/src/core/NEON/wrapper/intrinsics/add.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_WRAPPER_ADD_H -#define ARM_COMPUTE_WRAPPER_ADD_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_ADD_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_ADD_H #include @@ -112,6 +112,28 @@ VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32) VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32) #undef VADDW_IMPL +#ifdef __aarch64__ +// VADDW_HIGH: Vector widening add with upper half extraction +#define VADDW_HIGH_IMPL(wtype, vtype, postfix) \ + inline wtype vaddw_high(const wtype &a, const vtype &b) \ + { \ + return vaddw_high_##postfix(a, b); \ + } +#else // __aarch64__ +#define VADDW_HIGH_IMPL(wtype, vtype, postfix) \ + inline wtype vaddw_high(const wtype &a, const vtype &b) \ + { \ + return vaddw(a, vget_high_##postfix(b)); \ + } +#endif // __aarch64__ +VADDW_HIGH_IMPL(uint16x8_t, uint8x16_t, u8) +VADDW_HIGH_IMPL(int16x8_t, int8x16_t, s8) +VADDW_HIGH_IMPL(uint32x4_t, uint16x8_t, u16) +VADDW_HIGH_IMPL(int32x4_t, int16x8_t, s16) +VADDW_HIGH_IMPL(uint64x2_t, uint32x4_t, u32) +VADDW_HIGH_IMPL(int64x2_t, int32x4_t, s32) +#undef VADDW_HIGH_IMPL + // VADDL: Vector long add #define VADDL_IMPL(wtype, vtype, prefix, postfix) \ inline wtype vaddl(const vtype &a, const vtype &b) \ @@ -198,4 +220,4 @@ VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16) #undef VPADD_IMPL } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_ADD_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_ADD_H diff --git a/src/core/NEON/wrapper/intrinsics/movl.h b/src/core/NEON/wrapper/intrinsics/movl.h index 99f2150eab..03e3a52969 100644 --- a/src/core/NEON/wrapper/intrinsics/movl.h +++ b/src/core/NEON/wrapper/intrinsics/movl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_WRAPPER_MOVL_H -#define ARM_COMPUTE_WRAPPER_MOVL_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MOVL_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MOVL_H #include @@ -44,6 +44,28 @@ VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32) VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32) #undef VMOVL_IMPL + +#ifdef __aarch64__ +#define VMOVL_HIGH_IMPL(ptype, vtype, postfix) \ + inline ptype vmovl_high(const vtype &a) \ + { \ + return vmovl_high_##postfix(a); \ + } +#else // __aarch64__ +#define VMOVL_HIGH_IMPL(ptype, vtype, postfix) \ + inline ptype vmovl_high(const vtype &a) \ + { \ + return vmovl(vget_high_##postfix(a)); \ + } +#endif // __aarch64__ +VMOVL_HIGH_IMPL(uint16x8_t, uint8x16_t, u8) +VMOVL_HIGH_IMPL(int16x8_t, int8x16_t, s8) +VMOVL_HIGH_IMPL(uint32x4_t, uint16x8_t, u16) +VMOVL_HIGH_IMPL(int32x4_t, int16x8_t, s16) +VMOVL_HIGH_IMPL(uint64x2_t, uint32x4_t, u32) +VMOVL_HIGH_IMPL(int64x2_t, int32x4_t, s32) + +#undef VMOVL_HIGH_IMPL } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MOVL_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MOVL_H diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index f8b74a985d..a09fa2cba6 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -246,22 +246,16 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b) { - bool overflow = a == b && a == std::numeric_limits::min(); - int64_t a_64(a); - int64_t b_64(b); - int64_t ab_64 = a_64 * b_64; - const bool is_positive_or_zero = - a == 0 || b == 0 || (std::signbit(static_cast(a)) == std::signbit(static_cast(b))); - int32_t nudge = is_positive_or_zero ? (1 << 30) : (1 - (1 << 30)); - int32_t ab_x2_high32 = static_cast((ab_64 + nudge) / (1ll << 31)); - return overflow ? 
std::numeric_limits::max() : ab_x2_high32; + bool overflow = a == b && a == std::numeric_limits::min(); + int64_t a_64(a); + int64_t b_64(b); + int64_t ab_x2_64 = a_64 * b_64 * 2; + return overflow ? std::numeric_limits::max() : (ab_x2_64 >> 32); } inline int32_t rounding_divide_by_pow2(int32_t x, int exponent) { - const int32_t mask = (1 << exponent) - 1; - const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); - return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); + return (exponent == 0) ? x : ((x + (1 << (exponent - 1))) >> exponent); } int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t shift) diff --git a/src/cpu/kernels/CpuDynamicGemmKernel.cpp b/src/cpu/kernels/CpuDynamicGemmKernel.cpp new file mode 100644 index 0000000000..2efb91c2bf --- /dev/null +++ b/src/cpu/kernels/CpuDynamicGemmKernel.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDynamicGemmKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +void CpuDynamicGemmKernel::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(a); + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(d); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(gemm_info); + ARM_COMPUTE_UNUSED(_func); +} + +Status CpuDynamicGemmKernel::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(a); + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(d); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(gemm_info); + + return Status{ErrorCode::RUNTIME_ERROR, "Kernel not implemented yet."}; +} + +void CpuDynamicGemmKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(tensors); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); +} + +const char *CpuDynamicGemmKernel::name() const +{ + return ""; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDynamicGemmKernel.h b/src/cpu/kernels/CpuDynamicGemmKernel.h new file mode 100644 index 0000000000..be78022b74 --- /dev/null +++ b/src/cpu/kernels/CpuDynamicGemmKernel.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUDYNAMICGEMMKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUDYNAMICGEMMKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Arm(R) Neon (TM) kernel to perform dynamic GEMM */ +class CpuDynamicGemmKernel : public ICpuKernel +{ +private: + using DynamicGemmKernelPtr = std::add_pointer::type; + +public: + struct DynamicGemmKernel + { + const char *name; + DynamicGemmKernelPtr ukernel; + }; + CpuDynamicGemmKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDynamicGemmKernel); + /** Initialise the kernel's input and output. + * + * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: F32 + * @param[in] b Second input tensor info (Matrix B). 
Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a + * @param[out] d Output tensor info. Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should happen only for the first run + */ + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + /** Static function to check if given info will lead to a valid configuration of @ref CpuDynamicGemmMatKernel. + * + * @note The input and output tensor must have the same dimensions + * + * Similar to @ref CpuDynamicGemmKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + DynamicGemmKernelPtr _func{nullptr}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUDYNAMICGEMMKERNEL_H diff --git a/src/cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp b/src/cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp new file mode 100644 index 0000000000..be4dbe9b60 --- /dev/null +++ b/src/cpu/kernels/CpuDynamicGemmKernelHeuristics.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/CpuDynamicGemmKernelHeuristics.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace heuristics +{ +namespace +{ + +using KernelList = std::vector; +using KernelMap = std::map; + +static const KernelMap kernels = {}; + +} // namespace + +CpuDynamicGemmKernelHeuristics::CpuDynamicGemmKernelHeuristics(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(a); + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(d); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(gemm_info); +} + +/** Return minimum workload size + * + * @return Minimum workload size for requested configuration. 
+ */ +size_t CpuDynamicGemmKernelHeuristics::mws() const +{ + return _mws; +} + +/** Return kernel's execution window + * + * @return The execution window + */ +const Window &CpuDynamicGemmKernelHeuristics::window() const +{ + return _window; +} + +/** Return the kernel to run + * + * @return The function pointer to the chosen kernel + */ +const CpuDynamicGemmKernelHeuristics::DynamicGemmKernel *CpuDynamicGemmKernelHeuristics::kernel() +{ + return _kernel; +} + +/** Return the scheduling hint e.g. dimension(s) to split + * + * @return an instance of @ref IScheduler::Hints to describe the scheduling hints + */ +const IScheduler::Hints &CpuDynamicGemmKernelHeuristics::scheduler_hint() const +{ + return _hint; +} +} // namespace heuristics +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDynamicGemmKernelHeuristics.h b/src/cpu/kernels/CpuDynamicGemmKernelHeuristics.h new file mode 100644 index 0000000000..7d84129743 --- /dev/null +++ b/src/cpu/kernels/CpuDynamicGemmKernelHeuristics.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_CPUDYNAMICGEMMKERNELHEURISTICS_H +#define ACL_SRC_CPU_KERNELS_CPUDYNAMICGEMMKERNELHEURISTICS_H + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/IScheduler.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace heuristics +{ + +class CpuDynamicGemmKernelHeuristics +{ +public: + using KernelPtr = std::add_pointer::type; + + struct DynamicGemmKernel + { + const char *name{nullptr}; + KernelPtr ukernel{nullptr}; + }; + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDynamicGemmKernelHeuristics); + + // Default constructor and destructor + CpuDynamicGemmKernelHeuristics() noexcept {}; + ~CpuDynamicGemmKernelHeuristics() = default; + + /** Similar to @ref CpuDynamicGemmKernel::configure() */ + CpuDynamicGemmKernelHeuristics(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + /** Return minimum workload size + * + * @return Minimum workload size for requested configuration in size_t + */ + size_t mws() const; + + /** Return kernel's execution window + * + * @return a reference to the kernel execution window of type @ref Window + */ + const Window &window() const; + + /** Return the kernel to run + * + * @return The function pointer to the chosen kernel + */ + const DynamicGemmKernel *kernel(); + + /** Return the scheduling hint e.g. 
dimension(s) to split + * + * @return an instance of @ref IScheduler::Hints to describe the scheduling hints + */ + const IScheduler::Hints &scheduler_hint() const; + +private: + size_t _mws{ICPPKernel::default_mws}; + Window _window{}; + const DynamicGemmKernel *_kernel{nullptr}; + IScheduler::Hints _hint{Window::DimY}; +}; + +} // namespace heuristics +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUDYNAMICGEMMKERNELHEURISTICS_H diff --git a/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp index 76aa759dd1..b618eab139 100644 --- a/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp +++ b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp @@ -24,6 +24,10 @@ #include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/activation/list.h" @@ -157,6 +161,139 @@ static const KernelMap kernels = {{DataType::F32, fp32_kernels}, {DataType::QASYMM8_SIGNED, qasymm8_signed_kernels}, {DataType::QSYMM16, qsymm16_kernels}}; +/** Find the index of the first element greater than or equal to the input + * @note binary search does not provide much value over the small array, + * therefore we keep the implementation simple. 
+ * + * @param arr input array + * @param len length of the input array + * @param x element to compare + * @return the index found, or len - 1 if no element satisfies the condition + */ +size_t find_ind_lte_elm(const size_t *arr, size_t len, size_t x) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(arr); + for (size_t i = 0; i < len; ++i) + { + if (x <= arr[i]) + { + return i; + } + } + + return len - 1; +} + +size_t calculate_mws(const CPUModel cpu_model, DataType dtype, const ActivationLayerInfo &act_info, size_t problem_size) +{ + // This number is loosely chosen as threading overhead in each platform varies wildly. + size_t mws = 1529; + + if (cpu_model == CPUModel::V1) + { + // If max_threads is smaller than the number of threads suggested in the heuristics, + // the suggested thread count is clamped to max_threads (see the std::min below). + const size_t max_threads = NEScheduler::get().num_threads(); + + constexpr int32_t compute_heavy_arr_fp32_len = 26; + static const size_t compute_heavy_arr_fp32[2][compute_heavy_arr_fp32_len] = { + {2000, 4000, 5000, 6000, 8000, 9000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, + 80000, 90000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 2000000}, + {1, 2, 3, 4, 5, 6, 7, 9, 12, 14, 15, 18, 20, 22, 25, 29, 36, 43, 48, 53, 57, 58, 59, 60, 62, max_threads}}; + + constexpr int32_t compute_light_arr_fp32_len = 20; + static const size_t compute_light_arr_fp32[2][compute_light_arr_fp32_len] = { + {30000, 40000, 50000, 70000, 80000, 90000, 100000, 200000, 300000, 400000, + 500000, 600000, 700000, 900000, 1000000, 2000000, 3000000, 4000000, 5000000, 6000000}, + {1, 2, 3, 4, 6, 8, 10, 13, 15, 18, 21, 23, 24, 25, 30, 38, 45, 53, 60, max_threads}}; + + constexpr int32_t compute_heavy_arr_fp16_len = 24; + static const size_t compute_heavy_arr_fp16[2][compute_heavy_arr_fp16_len] = { + {10000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 200000, 300000, 400000, + 500000, 800000, 900000, 1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 8000000, 10000000, 20000000}, + {1, 2, 3, 5, 6, 7, 8, 10, 13, 17, 20, 23, 25, 28, 32, 
37, 43, 49, 55, 58, 60, 61, 62, max_threads}}; + + constexpr int32_t compute_light_arr_fp16_len = 20; + static const size_t compute_light_arr_fp16[2][compute_light_arr_fp16_len] = { + {30000, 40000, 50000, 70000, 80000, 90000, 100000, 200000, 300000, 400000, + 500000, 600000, 700000, 900000, 1000000, 2000000, 3000000, 4000000, 5000000, 6000000}, + {1, 2, 3, 4, 6, 8, 10, 13, 15, 18, 21, 23, 24, 25, 30, 38, 45, 53, 60, max_threads}}; + + constexpr int32_t s8_arr_len = 24; + static const size_t s8_arr[2][s8_arr_len] = { + {7000, 8000, 9000, 10000, 20000, 30000, 40000, 60000, 70000, 90000, 100000, 200000, + 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 2000000, 3000000, 8000000, 9000000}, + {1, 2, 3, 4, 6, 7, 10, 11, 13, 15, 19, 23, 26, 31, 37, 40, 44, 48, 52, 54, 58, 61, 62, max_threads}}; + + const size_t dtype_len = data_size_from_type(dtype); + + const size_t *size_arr = nullptr; + const size_t *nthread_arr = nullptr; + size_t arr_len = 0; + + switch (act_info.activation()) + { + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + case ActivationLayerInfo::ActivationFunction::SWISH: + case ActivationLayerInfo::ActivationFunction::ELU: + case ActivationLayerInfo::ActivationFunction::GELU: + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + case ActivationLayerInfo::ActivationFunction::TANH: + { + switch (dtype_len) + { + case 4: + size_arr = &compute_heavy_arr_fp32[0][0]; + nthread_arr = &compute_heavy_arr_fp32[1][0]; + arr_len = compute_heavy_arr_fp32_len; + break; + case 2: + size_arr = &compute_heavy_arr_fp16[0][0]; + nthread_arr = &compute_heavy_arr_fp16[1][0]; + arr_len = compute_heavy_arr_fp16_len; + break; + case 1: + default: + size_arr = &s8_arr[0][0]; + nthread_arr = &s8_arr[1][0]; + arr_len = s8_arr_len; + break; + } + break; + } + default: + { + switch (dtype_len) + { + case 4: + size_arr = &compute_light_arr_fp32[0][0]; + nthread_arr = &compute_light_arr_fp32[1][0]; + arr_len = compute_light_arr_fp32_len; + break; + 
case 2: + size_arr = &compute_light_arr_fp16[0][0]; + nthread_arr = &compute_light_arr_fp16[1][0]; + arr_len = compute_light_arr_fp16_len; + break; + case 1: + default: + size_arr = &s8_arr[0][0]; + nthread_arr = &s8_arr[1][0]; + arr_len = s8_arr_len; + break; + } + break; + } + } + + const size_t ind = find_ind_lte_elm(size_arr, arr_len, problem_size); + const size_t nthreads = std::min(nthread_arr[ind], max_threads); + mws = (problem_size + nthreads - 1) / nthreads; + } + + return mws; +} + } // namespace void CpuActivationKernelHeuristics::choose_kernel(ActivationDataTypeISASelectorData &selector) @@ -187,6 +324,7 @@ CpuActivationKernelHeuristics::CpuActivationKernelHeuristics(const ITensorInfo const DataType dtype = src->data_type(); ActivationDataTypeISASelectorData selector{dtype, CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}; + const CPUModel cpu_model = CPUInfo::get().get_cpu_model(); choose_kernel(selector); // Set window and scheduling hint @@ -204,9 +342,7 @@ CpuActivationKernelHeuristics::CpuActivationKernelHeuristics(const ITensorInfo // Set minimum workload size if (split_dim == Window::DimX) { - // Don't split the work load too small if the tensor has been reinterpreted as 1D. - // This number is loosely chosen as threading overhead in each platform varies wildly. 
- _mws = 1536; + _mws = calculate_mws(cpu_model, src->data_type(), activation_info.activation(), src->tensor_shape().x()); } } diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h index bdbfb54c22..c3a1799e12 100644 --- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -24,12 +24,15 @@ #ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_CPUGEMMASSEMBLYWRAPPERKERNEL_H #define ACL_SRC_CPU_KERNELS_ASSEMBLY_CPUGEMMASSEMBLYWRAPPERKERNEL_H +#include "arm_compute/core/Error.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "src/core/NEON/INEKernel.h" #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" +#include "gemm_arrays.hpp" #include "gemm_common.hpp" namespace arm_compute @@ -96,6 +99,34 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel _kernel->execute(ndc_win, ndc_tlc, info.thread_id); } + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + + const auto *Aptr = reinterpret_cast(tensors.get_tensor(ACL_SRC_0)->buffer()); + const auto *Bptr = reinterpret_cast(tensors.get_tensor(ACL_SRC_1)->buffer()); + const auto *bias = reinterpret_cast(tensors.get_tensor(ACL_SRC_2)->buffer()); + auto *Cptr = reinterpret_cast(tensors.get_tensor(ACL_DST)->buffer()); + + ARM_COMPUTE_ERROR_ON_NULLPTR(Aptr, Cptr); + + // We make a copy of the original gemm arrays and then update the + // source, bias, and destination pointers with the packed values. 
+ arm_gemm::GemmArrays ga = _kernel->get_gemm_arrays(); + + ga._Aptr = Aptr; + ga._Bptr = Bptr; + ga._bias = bias; + ga._Cptr = Cptr; + + auto win = arm_gemm::to_ndcoord(window); + + arm_gemm::ndcoord_t thread_locator{}; + + _kernel->execute_stateless(win, thread_locator, info.thread_id, ga); + } + /** Configure window of the kernel * * @param[in] window Region on which to execute the kernel diff --git a/src/cpu/kernels/assembly/gemm_arrays.hpp b/src/cpu/kernels/assembly/gemm_arrays.hpp new file mode 100644 index 0000000000..2d4f7e1a03 --- /dev/null +++ b/src/cpu/kernels/assembly/gemm_arrays.hpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_ARRAYS_HPP +#define ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_ARRAYS_HPP + +#pragma once + +namespace arm_gemm +{ + +struct IGemmArrays +{ + /* Pass in the pointers to the arrays to be operated on and their + * strides. This "generic" version uses void *s, the preferred version + * is the one provided by templated GemmCommon (below) which takes + * appropriately typed pointers. If B is pretransposed (see below) then + * the settings for B here are ignored. + */ + virtual void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + const int B_multi_stride, /* batches share B */ + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + const int bias_multi_stride) = 0; /* no row or batch stride needed */ + + virtual ~IGemmArrays() = default; +}; + +template +struct GemmArrays : public IGemmArrays +{ + const To *_Aptr = nullptr; + int _lda = 0; + int _A_batch_stride = 0; + int _A_multi_stride = 0; + const Tw *_Bptr = nullptr; + int _ldb = 0; + int _B_multi_stride = 0; + Tr *_Cptr = nullptr; + int _ldc = 0; + int _C_batch_stride = 0; + int _C_multi_stride = 0; + const Tr *_bias = nullptr; + int _bias_multi_stride = 0; + + GemmArrays() = default; + + GemmArrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const Tw *B, + const int ldb, + const int B_multi_stride, /* batches share B */ + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + const int bias_multi_stride) /* no row or batch stride needed */ + : _Aptr(A), + _lda(lda), + _A_batch_stride(A_batch_stride), + _A_multi_stride(A_multi_stride), + _Bptr(B), + _ldb(ldb), + _B_multi_stride(B_multi_stride), + _Cptr(C), + _ldc(ldc), + _C_batch_stride(C_batch_stride), + _C_multi_stride(C_multi_stride), + _bias(bias), + _bias_multi_stride(bias_multi_stride) + { 
+ } + + GemmArrays(const GemmArrays &) = default; + GemmArrays &operator=(const GemmArrays &) = default; + GemmArrays(GemmArrays &&) = delete; + GemmArrays &operator=(GemmArrays &&) = delete; + ~GemmArrays() override = default; + + /* Pass in the pointers to the arrays to be operated on and their + * strides (templated version with appropriate types). */ + void set_arrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const Tw *B, + const int ldb, + const int B_multi_stride, /* batches share B */ + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + const int bias_multi_stride) /* no row or batch stride needed */ + { + _Aptr = A; + _lda = lda; + _A_batch_stride = A_batch_stride; + _A_multi_stride = A_multi_stride; + _Bptr = B; + _ldb = ldb; + _B_multi_stride = B_multi_stride; + _Cptr = C; + _ldc = ldc; + _C_batch_stride = C_batch_stride; + _C_multi_stride = C_multi_stride; + _bias = bias; + _bias_multi_stride = bias_multi_stride; + } + + /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ + void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + const int B_multi_stride, /* batches share B */ + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + const int bias_multi_stride) override /* no row or batch stride needed */ + { + set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, static_cast(B), ldb, + B_multi_stride, static_cast(C), ldc, C_batch_stride, C_multi_stride, + static_cast(bias), bias_multi_stride); + } +}; +} // namespace arm_gemm + +#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_ARRAYS_HPP diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp index d5676c134e..ce1873a496 100644 --- a/src/cpu/kernels/assembly/gemm_common.hpp +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -28,6 +28,7 @@ #pragma once #include "convolution_parameters.hpp" +#include "gemm_arrays.hpp" #include "ndrange.hpp" #include @@ -94,7 +95,7 @@ class IGemmCommon return false; } - /** Main execute member fucntion + /** Main execute member function * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() * @param [in] thread_locator where are we inside of the thread space * @param [in] threadid a unique threadid @@ -200,23 +201,21 @@ template class GemmCommon : public IGemmCommon { protected: - const To *_Aptr = nullptr; - int _lda = 0; - int _A_batch_stride = 0; - int _A_multi_stride = 0; - const Tw *_Bptr = nullptr; - int _ldb = 0; - int _B_multi_stride = 0; - Tr *_Cptr = nullptr; - int _ldc = 0; - int _C_batch_stride = 0; - int _C_multi_stride = 0; - const Tr *_bias = nullptr; - int _bias_multi_stride = 0; + GemmArrays _gemm_array{}; public: /* Pass in the pointers to the arrays to be operated on and their * strides (templated version with appropriate types). 
*/ + void set_gemm_arrays(GemmArrays &ga) + { + _gemm_array = ga; + } + + const GemmArrays &get_gemm_arrays() const + { + return _gemm_array; + } + virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, @@ -231,19 +230,8 @@ class GemmCommon : public IGemmCommon const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) { - _Aptr = A; - _lda = lda; - _A_batch_stride = A_batch_stride; - _A_multi_stride = A_multi_stride; - _Bptr = B; - _ldb = ldb; - _B_multi_stride = B_multi_stride; - _Cptr = C; - _ldc = ldc; - _C_batch_stride = C_batch_stride; - _C_multi_stride = C_multi_stride; - _bias = bias; - _bias_multi_stride = bias_multi_stride; + _gemm_array.set_arrays(A, lda, A_batch_stride, A_multi_stride, B, ldb, B_multi_stride, C, ldc, C_batch_stride, + C_multi_stride, bias, bias_multi_stride); } /* Implementation of the void * overload which casts its arguments to the appropriate type. */ @@ -312,8 +300,18 @@ class GemmCommon : public IGemmCommon { set_indirect_parameters(sz, reinterpret_cast(ptr)); } -}; + /** Stateless version of the main execute member function + * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() + * @param [in] thread_locator where are we inside of the thread space + * @param [in] threadid a unique threadid + * @param [out] gemm_array GemmArrays structure containing the input/output addresses, and stride info + */ + virtual void execute_stateless(const ndcoord_t &work_range, + const ndcoord_t &thread_locator, + int threadid, + GemmArrays &gemm_array) = 0; +}; } // namespace arm_gemm #endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h index 1560b38ceb..b6f0b4c18a 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -25,6 +25,9 @@ #define 
ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" #include "src/core/NEON/NEAsymm.h" @@ -596,21 +599,33 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) } inline void -store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4x4_t &invscale) { // Adjust offset with 0.5 to round to nearest. const float32x4_t adj_offset = vaddq_f32(offset, vdupq_n_f32(0.5f)); const int32x4x4_t out = {{ - vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[3], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[0], invscale.val[0])), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[1], invscale.val[1])), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[2], invscale.val[2])), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[3], invscale.val[3])), }}; store_quantized(output_ptr, out); } -inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) +inline void +store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + return store_quantized(output_ptr, rf, offset, + float32x4x4_t{{ + invscale, + invscale, + invscale, + invscale, + }}); +} + +inline void store_quantized(int8_t *output_ptr, const int32x4x4_t &out) { const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); @@ -618,7 +633,7 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) 
} inline void -store_quantized(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +store_quantized(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4x4_t &invscale) { // Adjust offset to round to nearest. const uint32x4x4_t cmp = {{ @@ -644,21 +659,105 @@ store_quantized(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t & }}; const int32x4x4_t out = {{ - vcvtq_s32_f32(vmlaq_f32(adj_offset.val[0], rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(adj_offset.val[1], rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(adj_offset.val[2], rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(adj_offset.val[3], rf.val[3], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[0], rf.val[0], invscale.val[0])), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[1], rf.val[1], invscale.val[1])), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[2], rf.val[2], invscale.val[2])), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[3], rf.val[3], invscale.val[3])), }}; - store_quantized_signed(output_ptr, out); + store_quantized(output_ptr, out); +} + +inline void +store_quantized(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + return store_quantized(output_ptr, rf, offset, + float32x4x4_t{{ + invscale, + invscale, + invscale, + invscale, + }}); } +template ::value || std::is_same::value>> +union ElementwiseQuantizedScalarParams +{ + // For ops that expect dequantized inputs. + struct Generic + { + float a; + float b; + UniformQuantizationInfo qinfo; // Unused in comparison operations + } generic; + + // The prelu implementation expects quantized inputs. 
+ struct Prelu + { + Input a; + Input b; + float s1; // scale_out / scale_a + float s2; // s1 / scale_b + int32_t a_offset; // Input quantization offset + int32_t b_offset; // Input quantization offset + int32_t o_offset; // Output quantization offset + } prelu; +}; + template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline Output elementwise_arithm_op_quantized_scalar(const ElementwiseQuantizedScalarParams &params) +{ + const auto &_ = params.generic; + const float res = elementwise_arithm_op_scalar(_.a, _.b); + return Qasymm8QuantizationHelper::quantize(res, _.qinfo); +} + +// Specialization that optimizes PReLU by fusing quantization logic into the operator logic. +// Turns +// dequant(a) > 0 ? quant(dequant(a)) : quant(dequant(a) * dequant(b)) +// into +// a > offset_a ? quant_s1(a - offset_a) : quant_s2((a - offset_a) * (b - offset_b)) +// where quant_s1 and quant_s2 use the normal output offset, but scales s1 = scale_out / scale_a and s2 = s1 / scale_b respectively. 
+template ::value || std::is_same::value>> -inline Output elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) + typename = std::enable_if_t<(std::is_same::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline Output elementwise_prelu_quantized_scalar( + Input a, Input b, float s1, float s2, int32_t a_offset, int32_t b_offset, int32_t o_offset) { - const float res = elementwise_arithm_op_scalar(a, b); - return Qasymm8QuantizationHelper::quantize(res, qinfo); + int a_minus_offset = static_cast(a) - a_offset; + if (a_minus_offset > 0) + { + return Qasymm8QuantizationHelper::quantize(static_cast(a_minus_offset), + UniformQuantizationInfo{s1, o_offset}); + } + else + { + int b_minus_offset = static_cast(b) - b_offset; + return Qasymm8QuantizationHelper::quantize(static_cast(a_minus_offset) * b_minus_offset, + UniformQuantizationInfo{s2, o_offset}); + } +} + +template <> +inline uint8_t elementwise_arithm_op_quantized_scalar( + const ElementwiseQuantizedScalarParams &params) +{ + const auto &_ = params.prelu; + return elementwise_prelu_quantized_scalar(_.a, _.b, _.s1, _.s2, _.a_offset, _.b_offset, + _.o_offset); +} + +template <> +inline int8_t elementwise_arithm_op_quantized_scalar( + const ElementwiseQuantizedScalarParams &params) +{ + const auto &_ = params.prelu; + return elementwise_prelu_quantized_scalar(_.a, _.b, _.s1, _.s2, _.a_offset, _.b_offset, _.o_offset); } template @@ -674,11 +773,13 @@ float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t return out; } -template -inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +template ::value || std::is_same::value>> +inline uint8_t elementwise_comp_op_quantized_scalar(const ElementwiseQuantizedScalarParams &params) { - ARM_COMPUTE_UNUSED(qinfo); - return elementwise_comp_op_scalar(a, b); + const auto &_ = params.generic; + return 
elementwise_comp_op_scalar(_.a, _.b); } template @@ -721,23 +822,189 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, return x; } +inline int32x4x4_t widen_to_i32_and_offset(uint8x16_t q, int32x4_t offset) +{ + const int16x8_t low16x8 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(q))); + const int16x8_t high16x8 = vreinterpretq_s16_u16(wrapper::vmovl_high(q)); + return {{ + vaddw_s16(offset, vget_low_s16(low16x8)), + wrapper::vaddw_high(offset, low16x8), + vaddw_s16(offset, vget_low_s16(high16x8)), + wrapper::vaddw_high(offset, high16x8), + }}; +} + +inline int32x4x4_t widen_to_i32_and_offset(int8x16_t q, int32x4_t offset) +{ + const int16x8_t low16x8 = vmovl_s8(vget_low_s8(q)); + const int16x8_t high16x8 = wrapper::vmovl_high(q); + return {{ + vaddw_s16(offset, vget_low_s16(low16x8)), + wrapper::vaddw_high(offset, low16x8), + vaddw_s16(offset, vget_low_s16(high16x8)), + wrapper::vaddw_high(offset, high16x8), + }}; +} + +// Specialization that optimizes PReLU by fusing quantization logic into the operator logic. +// Turns +// dequant(a) > 0 ? quant(dequant(a)) : quant(dequant(a) * dequant(b)) +// into +// a > offset_a ? quant_s1(a - offset_a) : quant_s2((a - offset_a) * (b - offset_b)) +// where quant_s1 and quant_s2 use the normal output offset, but scales s1 = scale_out / scale_a and s2 = s1 / scale_b respectively. 
+template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline int elementwise_prelu_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *input1_ptr, + const Input *input2_ptr, + Output *output_ptr, + int32x4_t v_neg_offset1, + int32x4_t v_neg_offset2, + float32x4_t vinv_s1, + float32x4_t vinv_s2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(invvscaleo); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t a_minus_offset = widen_to_i32_and_offset(wrapper::vloadq(input1_ptr + x), v_neg_offset1); + const int32x4x4_t b_minus_offset = widen_to_i32_and_offset(wrapper::vloadq(input2_ptr + x), v_neg_offset2); + + // prelu: a > offset_a ? (a - offset_a) : ((a - offset_a) * (b - offset_b)) + const uint32x4x4_t cmp = {{ +#ifdef __aarch64__ + vcgtzq_s32(a_minus_offset.val[0]), + vcgtzq_s32(a_minus_offset.val[1]), + vcgtzq_s32(a_minus_offset.val[2]), + vcgtzq_s32(a_minus_offset.val[3]), +#else // __aarch64__ + vcgtq_s32(a_minus_offset.val[0], vdupq_n_s32(0)), + vcgtq_s32(a_minus_offset.val[1], vdupq_n_s32(0)), + vcgtq_s32(a_minus_offset.val[2], vdupq_n_s32(0)), + vcgtq_s32(a_minus_offset.val[3], vdupq_n_s32(0)), +#endif // __aarch64__ + }}; + const int32x4x4_t prelu_false = {{ + vmulq_s32(a_minus_offset.val[0], b_minus_offset.val[0]), + vmulq_s32(a_minus_offset.val[1], b_minus_offset.val[1]), + vmulq_s32(a_minus_offset.val[2], b_minus_offset.val[2]), + vmulq_s32(a_minus_offset.val[3], b_minus_offset.val[3]), + }}; + const int32x4x4_t prelui = {{ + vbslq_s32(cmp.val[0], a_minus_offset.val[0], prelu_false.val[0]), + vbslq_s32(cmp.val[1], a_minus_offset.val[1], prelu_false.val[1]), + vbslq_s32(cmp.val[2], a_minus_offset.val[2], prelu_false.val[2]), + vbslq_s32(cmp.val[3], a_minus_offset.val[3], prelu_false.val[3]), + }}; + const float32x4x4_t preluf = {{ + vcvtq_f32_s32(prelui.val[0]), + 
vcvtq_f32_s32(prelui.val[1]), + vcvtq_f32_s32(prelui.val[2]), + vcvtq_f32_s32(prelui.val[3]), + }}; + + // quant(prelu) + const float32x4x4_t vinv_s = {{ + vbslq_f32(cmp.val[0], vinv_s1, vinv_s2), + vbslq_f32(cmp.val[1], vinv_s1, vinv_s2), + vbslq_f32(cmp.val[2], vinv_s1, vinv_s2), + vbslq_f32(cmp.val[3], vinv_s1, vinv_s2), + }}; + store_quantized(output_ptr + x, preluf, voffseto, vinv_s); + } + return x; +} + +// Note: v_neg_offset1 and v_neg_offset2 are negated compared to the generic template. +template <> +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t v_neg_offset1, + int32x4_t v_neg_offset2, + float32x4_t vinv_s1, + float32x4_t vinv_s2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + return elementwise_prelu_quantized_loop(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr, v_neg_offset1, v_neg_offset2, vinv_s1, vinv_s2, voffseto, + invvscaleo); +} + +// Note: v_neg_offset1 and v_neg_offset2 are negated compared to the generic template. +template <> +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + int8_t *output_ptr, + int32x4_t v_neg_offset1, + int32x4_t v_neg_offset2, + float32x4_t vinv_s1, + float32x4_t vinv_s2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + return elementwise_prelu_quantized_loop(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr, v_neg_offset1, v_neg_offset2, vinv_s1, vinv_s2, voffseto, + invvscaleo); +} + +template ::value || std::is_same::value>> +union ElementwiseQuantizedBroadcastParams +{ + // For ops that expect dequantized inputs. 
+ struct Generic + { + float32x4x4_t broadcast_vector; + float32x4_t vscale_non_broadcast; + float32x4_t invvscaleo; // Not used by comparisons + } generic; + + // The prelu implementation expects quantized inputs. + struct Prelu + { + Input broadcast_value; + int32_t offset_broadcast; + float s1; // scale_out / scale_a + float32x4_t vinv_s1; // scale_a / scale_out + float32x4_t vinv_s2; // vinv_s1 * scale_b + int32_t o_offset; + } prelu; +}; + template ::value || std::is_same::value) && (std::is_same::value || std::is_same::value)>> -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, - int window_end_x, - int window_step_x, - const Input *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, - Output *output_ptr, - int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, - float32x4_t voffseto, - float32x4_t invvscaleo, - bool reorder) +inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *non_broadcast_input_ptr, + Output *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t voffseto, + bool reorder, + const ElementwiseQuantizedBroadcastParams ¶ms) { + const float32x4x4_t &broadcast_vector = params.generic.broadcast_vector; + const float32x4_t &vscale_non_broadcast = params.generic.vscale_non_broadcast; + const float32x4_t &invvscaleo = params.generic.invvscaleo; + int x = window_start_x; for (; x <= (window_end_x - window_step_x); x += window_step_x) { @@ -750,6 +1017,187 @@ inline int elementwise_arithm_op_quantized_broadcast_loop(int window_s return x; } +// Implements one vector worth of PReLU on quantized data where the first operand is broadcast. 
+template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline void elementwise_prelu_quantized_broadcast_vector(int32_t a_minus_offset, + const Input *b_input_ptr, + int32x4_t b_neg_voffset, + Output *output_ptr, + float s1, + float32x4_t vinv_s2, + float32x4_t o_voffset, + int32_t o_offset) +{ + // a > offset_a ? (a - offset_a) : ((a - offset_a) * (b - offset_b)) + if (a_minus_offset > 0) + { + const Output res = Qasymm8QuantizationHelper::quantize(static_cast(a_minus_offset), + UniformQuantizationInfo{s1, o_offset}); + const auto res_v = wrapper::vdup_n(res, wrapper::traits::vector_128_tag{}); + wrapper::vstore(output_ptr, res_v); + } + else + { + const int32x4x4_t b_minus_offset = widen_to_i32_and_offset(wrapper::vloadq(b_input_ptr), b_neg_voffset); + const int32x4x4_t prelu_false = {{ + vmulq_n_s32(b_minus_offset.val[0], a_minus_offset), + vmulq_n_s32(b_minus_offset.val[1], a_minus_offset), + vmulq_n_s32(b_minus_offset.val[2], a_minus_offset), + vmulq_n_s32(b_minus_offset.val[3], a_minus_offset), + }}; + const float32x4x4_t preluf = {{ + vcvtq_f32_s32(prelu_false.val[0]), + vcvtq_f32_s32(prelu_false.val[1]), + vcvtq_f32_s32(prelu_false.val[2]), + vcvtq_f32_s32(prelu_false.val[3]), + }}; + + store_quantized(output_ptr, preluf, o_voffset, vinv_s2); + } +} + +// Implements one vector worth of PReLU on quantized data where the second operand is broadcast. +template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline void elementwise_prelu_quantized_broadcast_vector(const Input *a_input_ptr, + int32x4_t a_neg_voffset, + int32_t b_minus_offset, + Output *output_ptr, + float32x4_t vinv_s1, + float32x4_t vinv_s2, + float32x4_t voffseto) +{ + const int32x4x4_t a_minus_offset = widen_to_i32_and_offset(wrapper::vloadq(a_input_ptr), a_neg_voffset); + + // prelu: a > offset_a ? 
(a - offset_a) : ((a - offset_a) * (b - offset_b)) + const uint32x4x4_t cmp = {{ +#ifdef __aarch64__ + vcgtzq_s32(a_minus_offset.val[0]), + vcgtzq_s32(a_minus_offset.val[1]), + vcgtzq_s32(a_minus_offset.val[2]), + vcgtzq_s32(a_minus_offset.val[3]), +#else // __aarch64__ + vcgtq_s32(a_minus_offset.val[0], vdupq_n_s32(0)), + vcgtq_s32(a_minus_offset.val[1], vdupq_n_s32(0)), + vcgtq_s32(a_minus_offset.val[2], vdupq_n_s32(0)), + vcgtq_s32(a_minus_offset.val[3], vdupq_n_s32(0)), +#endif // __aarch64__ + }}; + const int32x4x4_t prelu_false = {{ + vmulq_n_s32(a_minus_offset.val[0], b_minus_offset), + vmulq_n_s32(a_minus_offset.val[1], b_minus_offset), + vmulq_n_s32(a_minus_offset.val[2], b_minus_offset), + vmulq_n_s32(a_minus_offset.val[3], b_minus_offset), + }}; + const int32x4x4_t prelui = {{ + vbslq_s32(cmp.val[0], a_minus_offset.val[0], prelu_false.val[0]), + vbslq_s32(cmp.val[1], a_minus_offset.val[1], prelu_false.val[1]), + vbslq_s32(cmp.val[2], a_minus_offset.val[2], prelu_false.val[2]), + vbslq_s32(cmp.val[3], a_minus_offset.val[3], prelu_false.val[3]), + }}; + const float32x4x4_t preluf = {{ + vcvtq_f32_s32(prelui.val[0]), + vcvtq_f32_s32(prelui.val[1]), + vcvtq_f32_s32(prelui.val[2]), + vcvtq_f32_s32(prelui.val[3]), + }}; + + // quant(prelu) + const float32x4x4_t vinv_s = {{ + vbslq_f32(cmp.val[0], vinv_s1, vinv_s2), + vbslq_f32(cmp.val[1], vinv_s1, vinv_s2), + vbslq_f32(cmp.val[2], vinv_s1, vinv_s2), + vbslq_f32(cmp.val[3], vinv_s1, vinv_s2), + }}; + store_quantized(output_ptr, preluf, voffseto, vinv_s); +} + +// Specialization that optimizes PReLU by fusing quantization logic into the operator logic. +// Turns +// dequant(a) > 0 ? quant(dequant(a)) : quant(dequant(a) * dequant(b)) +// into +// a > offset_a ? quant_s1(a - offset_a) : quant_s2((a - offset_a) * (b - offset_b)) +// where quant_s1 and quant_s2 use the normal output offset, but scales s1 = scale_out / scale_a and s2 = s1 / scale_b respectively. 
+template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline int elementwise_prelu_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *non_broadcast_input_ptr, + Output *output_ptr, + int32x4_t vnegoffset_non_broadcast, + float32x4_t voffseto, + bool reorder, + const ElementwiseQuantizedBroadcastParams ¶ms) +{ + const auto &_ = params.prelu; + const int32_t broadcast_q = static_cast(_.broadcast_value); + const int32_t broadcast_minus_offset = broadcast_q - _.offset_broadcast; + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + if (reorder) + { + // With reorder, (a) is broadcast and (b) is non-broadcast. + elementwise_prelu_quantized_broadcast_vector(broadcast_minus_offset, non_broadcast_input_ptr + x, + vnegoffset_non_broadcast, output_ptr + x, _.s1, _.vinv_s2, + voffseto, _.o_offset); + } + else + { + // Without reorder, (a) is non-broadcast and (b) is broadcast. + elementwise_prelu_quantized_broadcast_vector(non_broadcast_input_ptr + x, vnegoffset_non_broadcast, + broadcast_minus_offset, output_ptr + x, _.vinv_s1, _.vinv_s2, + voffseto); + } + } + return x; +} + +// Note: vnegoffset_non_broadcast is negated compared to the generic template. +template <> +inline int elementwise_arithm_op_quantized_broadcast_loop( + int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + uint8_t *output_ptr, + int32x4_t vnegoffset_non_broadcast, + float32x4_t voffseto, + bool reorder, + const ElementwiseQuantizedBroadcastParams ¶ms) +{ + return elementwise_prelu_quantized_broadcast_loop(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, output_ptr, vnegoffset_non_broadcast, + voffseto, reorder, params); +} + +// Note: vnegoffset_non_broadcast is negated compared to the generic template. 
+template <> +inline int elementwise_arithm_op_quantized_broadcast_loop( + int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + int8_t *output_ptr, + int32x4_t vnegoffset_non_broadcast, + float32x4_t voffseto, + bool reorder, + const ElementwiseQuantizedBroadcastParams ¶ms) +{ + return elementwise_prelu_quantized_broadcast_loop(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, output_ptr, vnegoffset_non_broadcast, + voffseto, reorder, params); +} + template ::value || std::is_same::value>> @@ -781,19 +1229,21 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, template ::value || std::is_same::value>> -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, - int window_end_x, - int window_step_x, - const Input *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, - uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, - float32x4_t voffseto, - float32x4_t invvscaleo, - bool reorder) +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *non_broadcast_input_ptr, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t voffseto, + bool reorder, + const ElementwiseQuantizedBroadcastParams ¶ms) { - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + ARM_COMPUTE_UNUSED(voffseto); + + const float32x4_t &vscale_non_broadcast = params.generic.vscale_non_broadcast; + const float32x4x4_t &broadcast_vector = params.generic.broadcast_vector; + int x = window_start_x; for (; x <= (window_end_x - window_step_x); x += window_step_x) { @@ -814,18 +1264,16 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - Output (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + Output (*scalar_func)(const ElementwiseQuantizedScalarParams &), int (*broadcast_func)(int, int, 
int, const Input *, - float32x4x4_t, Output *, int32x4_t, float32x4_t, - float32x4_t, - float32x4_t, - const bool), + const bool, + const ElementwiseQuantizedBroadcastParams &), int (*neon_func)(int, int, int, @@ -839,7 +1287,15 @@ inline void elementwise_op_quantized(const ITensor *in1, float32x4_t, float32x4_t)) { - using InputVector = wrapper::traits::neon_vector_t; + bool is_prelu = scalar_func == &elementwise_arithm_op_quantized_scalar; + if (is_prelu) + { + ARM_COMPUTE_ERROR_ON( + broadcast_func != + (&elementwise_arithm_op_quantized_broadcast_loop)); + ARM_COMPUTE_ERROR_ON(neon_func != + (&elementwise_arithm_op_quantized_loop)); + } // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -855,6 +1311,14 @@ inline void elementwise_op_quantized(const ITensor *in1, const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + const float prelu_s1 = is_prelu ? output_qinfo.scale / input1_qinfo.scale : 1.0f; + const float prelu_s2 = is_prelu ? prelu_s1 / input2_qinfo.scale : 1.0f; + const float32x4_t prelu_vinv_s1 = is_prelu ? vdupq_n_f32(input1_qinfo.scale / output_qinfo.scale) : float32x4_t{}; + const float32x4_t prelu_vinv_s2 = + is_prelu ? 
vdupq_n_f32(input1_qinfo.scale * input2_qinfo.scale / output_qinfo.scale) : float32x4_t{}; const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); @@ -871,8 +1335,9 @@ inline void elementwise_op_quantized(const ITensor *in1, const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + const int32x4_t voffset_non_broadcast = + vdupq_n_s32(is_prelu ? -non_broadcast_qinfo.offset : non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = !is_prelu ? vdupq_n_f32(non_broadcast_qinfo.scale) : float32x4_t{}; // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -888,37 +1353,77 @@ inline void elementwise_op_quantized(const ITensor *in1, const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); const auto output_ptr = reinterpret_cast(output.ptr()); - const Input broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const InputVector broadcast_value_v = - wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag{}); - const float32x4x4_t broadcast_vector = vdequantize(broadcast_value_v, broadcast_qinfo); + const Input broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_deq = + is_prelu ? 
0.0f : Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + + ElementwiseQuantizedBroadcastParams params{}; + if (is_prelu) + { + params.prelu = { + broadcast_value, broadcast_qinfo.offset, prelu_s1, + prelu_vinv_s1, prelu_vinv_s2, output_qinfo.offset, + }; + } + else + { + const float32x4x4_t broadcast_vector = {{ + vdupq_n_f32(broadcast_value_deq), + vdupq_n_f32(broadcast_value_deq), + vdupq_n_f32(broadcast_value_deq), + vdupq_n_f32(broadcast_value_deq), + }}; + params.generic = { + broadcast_vector, + vscale_non_broadcast, + invvscaleo, + }; + } int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, - broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, - voffseto, invvscaleo, !is_broadcast_input_2); + output_ptr, voffset_non_broadcast, voffseto, !is_broadcast_input_2, params); for (; x < window_end_x; ++x) { - const float afs = Qasymm8QuantizationHelper::dequantize(*(non_broadcast_input_ptr + x), - non_broadcast_qinfo); - const float bfs = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, - !is_broadcast_input_2 ? afs : bfs, output_qinfo); + const Input non_broadcast_value = *(non_broadcast_input_ptr + x); + + ElementwiseQuantizedScalarParams params{}; + if (is_prelu) + { + params.prelu = { + !is_broadcast_input_2 ? broadcast_value : non_broadcast_value, + !is_broadcast_input_2 ? non_broadcast_value : broadcast_value, + prelu_s1, + prelu_s2, + input1_qinfo.offset, + input2_qinfo.offset, + output_qinfo.offset, + }; + } + else + { + const float non_broadcast_value_deq = + Qasymm8QuantizationHelper::dequantize(non_broadcast_value, non_broadcast_qinfo); + params.generic = { + !is_broadcast_input_2 ? broadcast_value_deq : non_broadcast_value_deq, + !is_broadcast_input_2 ? 
non_broadcast_value_deq : broadcast_value_deq, + output_qinfo, + }; + } + + *(output_ptr + x) = (*scalar_func)(params); } }, broadcast_input, non_broadcast_input, output); } else { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + const int32x4_t voffset1 = vdupq_n_s32(is_prelu ? -input1_qinfo.offset : input1_qinfo.offset); + const float32x4_t vscale1 = is_prelu ? prelu_vinv_s1 : vdupq_n_f32(input1_qinfo.scale); // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + const int32x4_t voffset2 = vdupq_n_s32(is_prelu ? -input2_qinfo.offset : input2_qinfo.offset); + const float32x4_t vscale2 = is_prelu ? prelu_vinv_s2 : vdupq_n_f32(input2_qinfo.scale); // Clear X Dimension on execution window as we handle manually input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -940,9 +1445,27 @@ inline void elementwise_op_quantized(const ITensor *in1, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); for (; x < window_end_x; ++x) { - const float afs = Qasymm8QuantizationHelper::dequantize(*(input1_ptr + x), input1_qinfo); - const float bfs = Qasymm8QuantizationHelper::dequantize(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + const Input input1_value = *(input1_ptr + x); + const Input input2_value = *(input2_ptr + x); + + ElementwiseQuantizedScalarParams params{}; + if (is_prelu) + { + params.prelu = { + input1_value, input2_value, prelu_s1, prelu_s2, input1_qinfo.offset, + input2_qinfo.offset, output_qinfo.offset, + }; + } + else + { + params.generic = { + Qasymm8QuantizationHelper::dequantize(input1_value, input1_qinfo), + 
Qasymm8QuantizationHelper::dequantize(input2_value, input2_qinfo), + output_qinfo, + }; + } + + *(output_ptr + x) = (*scalar_func)(params); } }, input1, input2, output); @@ -953,7 +1476,7 @@ template void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { elementwise_op_quantized(in1, in2, out, window, - &elementwise_arithm_op_quantized_scalar, + &elementwise_arithm_op_quantized_scalar, &elementwise_arithm_op_quantized_broadcast_loop, &elementwise_arithm_op_quantized_loop); } @@ -961,7 +1484,8 @@ void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITe template void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, + elementwise_op_quantized(in1, in2, out, window, + &elementwise_arithm_op_quantized_scalar, &elementwise_arithm_op_quantized_broadcast_loop, &elementwise_arithm_op_quantized_loop); } @@ -969,15 +1493,15 @@ void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *i template void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_broadcast_loop, - &elementwise_comp_op_quantized_loop); + elementwise_op_quantized( + in1, in2, out, window, &elementwise_comp_op_quantized_scalar, + &elementwise_comp_op_quantized_broadcast_loop, &elementwise_comp_op_quantized_loop); } template void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, + elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, &elementwise_comp_op_quantized_broadcast_loop, 
&elementwise_comp_op_quantized_loop); } diff --git a/src/cpu/operators/CpuDynamicGemm.cpp b/src/cpu/operators/CpuDynamicGemm.cpp new file mode 100644 index 0000000000..1ab3b82d38 --- /dev/null +++ b/src/cpu/operators/CpuDynamicGemm.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuDynamicGemm.h" + +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cpu +{ + +void CpuDynamicGemm::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_UNUSED(a); + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(d); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(gemm_info); +} + +Status CpuDynamicGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(a); + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(d); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(gemm_info); + + return Status{ErrorCode::RUNTIME_ERROR, "Operator not implemented yet."}; +} + +void CpuDynamicGemm::run(ITensorPack &tensors) +{ + ARM_COMPUTE_UNUSED(tensors); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDynamicGemm.h b/src/cpu/operators/CpuDynamicGemm.h new file mode 100644 index 0000000000..5d44e16505 --- /dev/null +++ b/src/cpu/operators/CpuDynamicGemm.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUDYNAMICGEMM_H +#define ACL_SRC_CPU_OPERATORS_CPUDYNAMICGEMM_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuDynamicGemmKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to execute dynamic GEMM. This function calls the following kernels: + * + * -# @ref cpu::kernels::CpuDynamicGemmKernel + */ +class CpuDynamicGemm : public ICpuOperator +{ +public: + /** Default constructor */ + CpuDynamicGemm() = default; + /** Default destructor */ + ~CpuDynamicGemm() = default; + /** Configure operator for a given list of arguments + * + * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. + * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. 
+ * + * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: F32 + * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a + * @param[out] d Output tensor info. Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should happen only for the first run + */ + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + /** Static function to check if given info will lead to a valid configuration of @ref CpuDynamicGemm. + * + * Similar to @ref CpuDynamicGemm::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr _kernel{nullptr}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUDYNAMICGEMM_H diff --git a/src/cpu/operators/CpuPRelu.h b/src/cpu/operators/CpuPRelu.h index 084474e2ba..916b9a3c88 100644 --- a/src/cpu/operators/CpuPRelu.h +++ b/src/cpu/operators/CpuPRelu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_PRELU_H -#define ARM_COMPUTE_CPU_PRELU_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUPRELU_H +#define ACL_SRC_CPU_OPERATORS_CPUPRELU_H #include "src/cpu/operators/CpuElementwise.h" @@ -30,9 +30,9 @@ namespace arm_compute { namespace cpu { -/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for PRelu operation */ +/** Class to run @ref cpu::kernels::CpuArithmeticKernel for PRelu operation */ using CpuPRelu = CpuElementwiseArithmetic; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_PRELU_H */ \ No newline at end of file +#endif // ACL_SRC_CPU_OPERATORS_CPUPRELU_H diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index aef59ffb30..fc106140fb 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -23,6 +23,8 @@ */ #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/core/CPP/Validate.h" @@ -257,6 +259,11 @@ class Fallback : public CpuGemmAssemblyDispatch::IFallback _is_prepared = is_prepared; } + bool has_stateless_impl() const override + { + return _gemm_kernel_asm->get_working_size() == 0; + } + private: enum AuxTensorIdx { @@ -794,11 +801,37 @@ void Fallback::run(ITensorPack & multi_stride_a = 0; } + Tensor in0_tensor; + in0_tensor.allocator()->init(*(a->info())); + in0_tensor.allocator()->import_memory(const_cast(in0_ptr)); + + Tensor in1_tensor; + if (b) + { + in1_tensor.allocator()->init(*(b->info())); + in1_tensor.allocator()->import_memory(const_cast(in1_ptr)); + } + + Tensor bias_tensor; + if (c) + { + bias_tensor.allocator()->init(*(c->info())); + bias_tensor.allocator()->import_memory(bias); + } + + Tensor out_tensor; + out_tensor.allocator()->init(*(d->info())); + 
out_tensor.allocator()->import_memory(out_ptr); + + ITensorPack gemm_pack{ + {ACL_SRC_0, &in0_tensor}, {ACL_SRC_1, &in1_tensor}, {ACL_SRC_2, &bias_tensor}, {ACL_DST, &out_tensor}}; + // Set gemm parameters _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d, bias, 0); + // Schedule - NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); + NEScheduler::get().schedule_op(_optimised_kernel.get(), scheduling_hint, _optimised_kernel->window(), gemm_pack); } template @@ -1013,7 +1046,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected break; #endif /* ENABLE_FP16_KERNELS */ default: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Usupported type. Could not find a kernel"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Unsupported type. Could not find a kernel"); break; } expected_weight_format = assembly_utils::map_to_arm_compute_weight_format(arm_gemm_expected_wf); @@ -1021,6 +1054,11 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected return Status{}; } +bool CpuGemmAssemblyDispatch::has_stateless_impl() const +{ + return _arm_gemm->has_stateless_impl(); +} + Status CpuGemmAssemblyDispatch::validate( const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) { diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h index 0b6f22d45a..84420f7763 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -93,6 +93,7 @@ class CpuGemmAssemblyDispatch : public ICpuOperator const bool, const bool) = 0; virtual ~IFallback() = default; + virtual bool has_stateless_impl() const = 0; }; public: @@ -170,6 +171,18 @@ class CpuGemmAssemblyDispatch : public ICpuOperator const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo 
&info); + + /** Checks if a stateless implementation is supported + * + * The arm_gemm kernels that have been made stateless so far are those that + * do not require any working space. Once all kernels have been made + * stateless we can deprecate it by always returning true, and eventually + * removing it completely + * + * @return True if stateless execution is supported else false + */ + bool has_stateless_impl() const; + /** Checks if activation is supported by the gemm assembly dispatcher * * @param[in] activation Activation to check diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index ecf84abd2c..2dd87310a8 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index 55b25f9098..efaae0ddce 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -75,6 +75,13 @@ void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { + // The rest of the logic in this function does not handle the + // split_dimensions_all case so we defer to IScheduler::schedule_common() + if (hints.split_dimension() == IScheduler::split_dimensions_all) + { + return schedule_common(kernel, hints, window, tensors); + } + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC, "Dynamic scheduling is not supported in OMPScheduler"); diff --git a/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp b/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp index adda460c96..6021d13307 100644 --- a/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp +++ 
b/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp @@ -24,6 +24,8 @@ #include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h" +#include "arm_compute/core/Error.h" + #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" /* @@ -116,6 +118,13 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &weight_f return cpu::CpuGemmAssemblyDispatch::has_opt_impl(weight_format, a, b, c, d, init_assembly_metadata(gemm_info)); } +bool CpuGemmAssemblyDispatch::has_stateless_impl() const +{ + ARM_COMPUTE_ERROR_ON_MSG(!is_configured(), "calling has_stateless_impl() on unconfigured CpuGemmAssemblyDispatch"); + + return _impl->cpu_gemm_assembly_dispatch->has_stateless_impl(); +} + bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) { return cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation); diff --git a/tests/validation/CL/QLSTMLayerNormalization.cpp b/tests/validation/CL/QLSTMLayerNormalization.cpp index 1c7dee4612..8f3ee8476e 100644 --- a/tests/validation/CL/QLSTMLayerNormalization.cpp +++ b/tests/validation/CL/QLSTMLayerNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -165,7 +165,7 @@ constexpr uint32_t qsymm16_per_vector = vector_size_byte / sizeof(int16_t); QLSTMLayerNormShapeDataSet("WeightShape")), \ QLSTMLayerNormShapeDataSet("BiasShape")), \ framework::dataset::make("DataType", DataType::QSYMM16)), \ - framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1. / 8192), QuantizationInfo(8192) })) + framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1. 
/ 8192), QuantizationInfo(2) })) #define QSYMM16_DATASET_1D \ concat(concat(QSYMM16_DATASET_ITER(1, 0), QSYMM16_DATASET_ITER(1, 1)), QSYMM16_DATASET_ITER(1, 2)) diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 5f0ebd2542..07c4bf1746 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -85,13 +85,7 @@ const AbsoluteTolerance abs_tolerance_f16(0.2f); constexpr float tolerance_num = 0.07f; /**< Tolerance number for the FP16 implementation */ #endif /* ARM_COMPUTE_ENABLE_FP16 */ -#ifdef ARM_COMPUTE_ENABLE_SME -// TODO(COMPMID-6011): SME kernels and the reference model use different rounding mode. -// Temporarily increase the tolerance for quantized data. -constexpr AbsoluteTolerance tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#else // ARM_COMPUTE_ENABLE_SME constexpr AbsoluteTolerance tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#endif // ARM_COMPUTE_ENABLE_SME /** CNN data types */ const auto CNNDataTypes = make("DataType", diff --git a/tests/validation/NEON/PReluLayer.cpp b/tests/validation/NEON/PReluLayer.cpp index de07524a15..976157100b 100644 --- a/tests/validation/NEON/PReluLayer.cpp +++ b/tests/validation/NEON/PReluLayer.cpp @@ -102,6 +102,9 @@ using NEPReluLayerFixture = PReluLayerValidationFixture using NEPReluLayerQuantizedFixture = PReluLayerValidationQuantizedFixture; +template +using NEPReluLayerQuantizedBroadcastFixture = PReluLayerQuantizedBroadcastValidationFixture; + TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEPReluLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), @@ -127,6 +130,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPReluLayerQuantizedFixture, framewor // 
Validate output validate(Accessor(_target), _reference, tolerance_fp32, 0.01); } + +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEPReluLayerQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapesBroadcast(), + PReluLayerQASYMM8Dataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })) + ) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_fp32, 0.01); +} + +FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEPReluLayerQuantizedBroadcastFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapesBroadcast(), + PReluLayerQASYMM8Dataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })) + ) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_fp32, 0.01); +} TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) @@ -153,6 +178,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPReluLayerQuantizedFixture, framework // Validate output validate(Accessor(_target), _reference, tolerance_s8, 0.01); } + +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEPReluLayerQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapesBroadcast(), + PReluLayerQASYMM8SignedDataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.2f, 127) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 64) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -128) })) + ) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_s8, 0.01); 
+} + +FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEPReluLayerQuantizedBroadcastFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapesBroadcast(), + PReluLayerQASYMM8SignedDataset), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.2f, 127) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 64) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -128) })) + ) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_s8, 0.01); +} TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() // Quantized diff --git a/tests/validation/UNIT/TensorShape.cpp b/tests/validation/UNIT/TensorShape.cpp index ebe9b32e9d..2aa54d6ad4 100644 --- a/tests/validation/UNIT/TensorShape.cpp +++ b/tests/validation/UNIT/TensorShape.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,6 +66,49 @@ DATA_TEST_CASE(SetEmpty, framework::DatasetMode::ALL, framework::dataset::make(" ARM_COMPUTE_EXPECT(shape.total_size() == 10, framework::LogLevel::ERRORS); } +DATA_TEST_CASE(DynamicDimentions, framework::DatasetMode::ALL, zip( + framework::dataset::make("TensorShape", { + TensorShape{ 0U }, + TensorShape{ 1U, 0U, 3U }, + TensorShape{ 1U, 2U, 0U }, + TensorShape{ 1U, 2U, 0U, 1U }, + TensorShape{ 0U, 2U, 0U, 1U }, + TensorShape{ 0U, 2U, 0U, 1U }, + TensorShape{ 0U, 0U }, + TensorShape{ 0U, 0U }}), + framework::dataset::make("dim", {0U, 1U, 2U, 2U, 0U, 2U, 0U, 1U} )), + shape, dim) +{ + ARM_COMPUTE_ERROR_ON(dim >= TensorShape::num_max_dimensions); + ARM_COMPUTE_EXPECT(shape.is_dynamic(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(shape.is_dynamic(dim), framework::LogLevel::ERRORS); +} + +DATA_TEST_CASE(StaticShapes, framework::DatasetMode::ALL, + framework::dataset::make("TensorShape", { + TensorShape{ 1U }, + TensorShape{ 1U, 5U }, + TensorShape{ 1U, 2U, 3U }}), + shape) +{ + 
ARM_COMPUTE_EXPECT(!shape.is_dynamic(), framework::LogLevel::ERRORS); +} + +DATA_TEST_CASE(StaticDimentions, framework::DatasetMode::ALL, zip( + framework::dataset::make("TensorShape", { + TensorShape{ 1U }, + TensorShape{ 1U, 0U }, + TensorShape{ 0U, 2U }, + TensorShape{ 1U, 0U, 3U }, + TensorShape{ 1U, 2U, 0U }, + TensorShape{ 1U, 2U, 3U }}), + framework::dataset::make("dim", {0U, 0U, 1U, 0U, 1U, 2U})), + shape, dim) +{ + ARM_COMPUTE_ERROR_ON(dim >= TensorShape::num_max_dimensions); + ARM_COMPUTE_EXPECT(!shape.is_dynamic(dim), framework::LogLevel::ERRORS); +} + TEST_SUITE_END() // TensorShapeValidation TEST_SUITE_END() } // namespace validation diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h index e5ecc66545..1a7ebbab57 100644 --- a/tests/validation/reference/UtilsQuantizedAsymm.h +++ b/tests/validation/reference/UtilsQuantizedAsymm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H -#define ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H +#ifndef ACL_TESTS_VALIDATION_REFERENCE_UTILSQUANTIZEDASYMM_H +#define ACL_TESTS_VALIDATION_REFERENCE_UTILSQUANTIZEDASYMM_H #include @@ -48,24 +48,22 @@ inline int64_t to_int64(int32_t val) #endif // __clang__ } // namespace -/** Rounded to nearest division by a power-of-two. */ +/** Rounded to nearest division by a power-of-two. + * This implements the documented behaviour of SRSHL with a negative shift. */ inline int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent) { - const int32_t mask = (1 << exponent) - 1; - const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); - return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); + return (exponent == 0) ? 
x : ((x + (1 << (exponent-1))) >> exponent); } -/** Multiplication of two integers. The same as ARMv7 Arm® Neon™ VQRDMULH instruction. */ +/** Doubling multiplication of two integers, returning high half. + * This implements the documented behaviour of SQDMULH */ inline int32_t asymm_int_mult(int32_t a, int32_t b) { const bool overflow = a == b && a == std::numeric_limits::min(); const int64_t a_64 = to_int64(a); const int64_t b_64 = to_int64(b); - const int64_t ab_64 = a_64 * b_64; - const int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); - const int32_t ab_x2_high32 = static_cast((ab_64 + nudge) / (1ll << 31)); - return overflow ? std::numeric_limits::max() : ab_x2_high32; + const int64_t ab_x2_64 = a_64 * b_64 * 2; + return overflow ? std::numeric_limits::max() : (ab_x2_64 >> 32); } /** Quantize down the input value in range [min, max]. */ @@ -88,4 +86,4 @@ inline int32_t quantize_down_scale_by_fixedpoint(int32_t val, int32_t result_mul } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H */ +#endif // ACL_TESTS_VALIDATION_REFERENCE_UTILSQUANTIZEDASYMM_H diff --git a/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp b/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp index 14c8affe27..c688aefc87 100644 --- a/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp +++ b/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp @@ -53,13 +53,7 @@ using framework::dataset::make; namespace { const RelativeTolerance rel_tolerance_f32(0.01f); -#ifdef ARM_COMPUTE_ENABLE_SME -// TODO(COMPMID-6011): SME kernels and the reference model use different rounding mode. -// Temporarily increase the tolerance for quantized data. 
-constexpr AbsoluteTolerance tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#else // ARM_COMPUTE_ENABLE_SME constexpr AbsoluteTolerance tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#endif // ARM_COMPUTE_ENABLE_SME } // namespace TEST_SUITE(NEON)