1:45 PM 11/12/2025 ���� JFIF    �� �        "" $(4,$&1'-=-157:::#+?D?8C49:7 7%%77777777777777777777777777777777777777777777777777��  { �" ��     �� 5    !1AQa"q�2��BR��#b�������  ��  ��   ? ��D@DDD@DDD@DDkK��6 �UG�4V�1�� �����릟�@�#���RY�dqp� ����� �o�7�m�s�<��VPS�e~V�چ8���X�T��$��c�� 9��ᘆ�m6@ WU�f�Don��r��5}9��}��hc�fF��/r=hi�� �͇�*�� b�.��$0�&te��y�@�A�F�=� Pf�A��a���˪�Œ�É��U|� � 3\�״ H SZ�g46�C��צ�ے �b<���;m����Rpع^��l7��*�����TF�}�\�M���M%�'�����٠ݽ�v� ��!-�����?�N!La��A+[`#���M����'�~oR�?��v^)��=��h����A��X�.���˃����^Ə��ܯsO"B�c>; �e�4��5�k��/CB��.  �J?��;�҈�������������������~�<�VZ�ꭼ2/)Í”jC���ע�V�G�!���!�F������\�� Kj�R�oc�h���:Þ I��1"2�q×°8��Р@ז���_C0�ր��A��lQ��@纼�!7��F�� �]�sZ B�62r�v�z~�K�7�c��5�.���ӄq&�Z�d�<�kk���T&8�|���I���� Ws}���ǽ�cqnΑ�_���3��|N�-y,��i���ȗ_�\60���@��6����D@DDD@DDD@DDD@DDD@DDc�KN66<�c��64=r����� ÄŽ0��h���t&(�hnb[� ?��^��\��â|�,�/h�\��R��5�? �0�!צ܉-����G����٬��Q�zA���1�����V��� �:R���`�$��ik��H����D4�����#dk����� h�}����7���w%�������*o8wG�LycuT�.���ܯ7��I��u^���)��/c�,s�Nq�ۺ�;�ך�YH2���.5B���DDD@DDD@DDD@DDD@DDD@V|�a�j{7c��X�F\�3MuA×¾hb� ��n��F������ ��8�(��e����Pp�\"G�`s��m��ާaW�K��O����|;ei����֋�[�q��";a��1����Y�G�W/�߇�&�<���Ќ�H'q�m���)�X+!���=�m�ۚ丷~6a^X�)���,�>#&6G���Y��{����"" """ """ """ """ ""��at\/�a�8 �yp%�lhl�n����)���i�t��B�������������?��modskinlienminh.com - WSOX ENC ‰PNG  IHDR Ÿ f Õ†C1 sRGB ®Îé gAMA ± üa pHYs à ÃÇo¨d GIDATx^íÜL”÷ð÷Yçªö("Bh_ò«®¸¢§q5kÖ*:þ0A­ºšÖ¥]VkJ¢M»¶f¸±8\k2íll£1]q®ÙÔ‚ÆT h25jguaT5*!‰PNG  IHDR Ÿ f Õ†C1 sRGB ®Îé gAMA ± üa pHYs à ÃÇo¨d GIDATx^íÜL”÷ð÷Yçªö("Bh_ò«®¸¢§q5kÖ*:þ0A­ºšÖ¥]VkJ¢M»¶f¸±8\k2íll£1]q®ÙÔ‚ÆT h25jguaT5*!
Warning: Undefined variable $authorization in C:\xampp\htdocs\demo\fi.php on line 57

Warning: Undefined variable $translation in C:\xampp\htdocs\demo\fi.php on line 118

Warning: Trying to access array offset on value of type null in C:\xampp\htdocs\demo\fi.php on line 119

Warning: file_get_contents(https://raw.githubusercontent.com/Den1xxx/Filemanager/master/languages/ru.json): Failed to open stream: HTTP request failed! HTTP/1.1 404 Not Found in C:\xampp\htdocs\demo\fi.php on line 120

Warning: Cannot modify header information - headers already sent by (output started at C:\xampp\htdocs\demo\fi.php:1) in C:\xampp\htdocs\demo\fi.php on line 247

Warning: Cannot modify header information - headers already sent by (output started at C:\xampp\htdocs\demo\fi.php:1) in C:\xampp\htdocs\demo\fi.php on line 248

Warning: Cannot modify header information - headers already sent by (output started at C:\xampp\htdocs\demo\fi.php:1) in C:\xampp\htdocs\demo\fi.php on line 249

Warning: Cannot modify header information - headers already sent by (output started at C:\xampp\htdocs\demo\fi.php:1) in C:\xampp\htdocs\demo\fi.php on line 250

Warning: Cannot modify header information - headers already sent by (output started at C:\xampp\htdocs\demo\fi.php:1) in C:\xampp\htdocs\demo\fi.php on line 251

Warning: Cannot modify header information - headers already sent by (output started at C:\xampp\htdocs\demo\fi.php:1) in C:\xampp\htdocs\demo\fi.php on line 252
// Copyright 2024 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include "hwy/base.h" #include "hwy/ops/shared-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { namespace detail { // Enable generic functions for whichever of (f16, bf16) are not supported. #define HWY_LSX_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D) template struct Raw128 { using type = __m128i; }; template <> struct Raw128 { using type = __m128; }; template <> struct Raw128 { using type = __m128d; }; } // namespace detail template class Vec128 { using Raw = typename detail::Raw128::type; public: using PrivateT = T; // only for DFromV static constexpr size_t kPrivateN = N; // only for DFromV // Compound assignment. Only usable if there is a corresponding non-member // binary operator overload. For example, only f32 and f64 support division. HWY_INLINE Vec128& operator*=(const Vec128 other) { return *this = (*this * other); } HWY_INLINE Vec128& operator/=(const Vec128 other) { return *this = (*this / other); } HWY_INLINE Vec128& operator+=(const Vec128 other) { return *this = (*this + other); } HWY_INLINE Vec128& operator-=(const Vec128 other) { return *this = (*this - other); } HWY_INLINE Vec128& operator%=(const Vec128 other) { return *this = (*this % other); } HWY_INLINE Vec128& operator&=(const Vec128 other) { return *this = (*this & other); } HWY_INLINE Vec128& operator|=(const Vec128 other) { return *this = (*this | other); } HWY_INLINE Vec128& operator^=(const Vec128 other) { return *this = (*this ^ other); } Raw raw; }; template using Vec64 = Vec128; template using Vec32 = Vec128; template using Vec16 = Vec128; namespace detail { template using RawMask128 = typename Raw128::type; } // namespace detail template struct Mask128 { using Raw = typename detail::RawMask128; using PrivateT = T; // only for DFromM static constexpr size_t kPrivateN = N; // only for DFromM Raw raw; }; template using DFromV = Simd; template using DFromM = Simd; template using TFromV = typename V::PrivateT; // ------------------------------ BitCast namespace detail { HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; } HWY_INLINE __m128i BitCastToInteger(__m128 v) { return reinterpret_cast<__m128i>(v); } HWY_INLINE __m128i BitCastToInteger(__m128d v) { return reinterpret_cast<__m128i>(v); } template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return Vec128{BitCastToInteger(v.raw)}; } // Cannot rely on function overloading because return types differ. template struct BitCastFromInteger128 { HWY_INLINE __m128i operator()(__m128i v) { return v; } }; template <> struct BitCastFromInteger128 { HWY_INLINE __m128 operator()(__m128i v) { return reinterpret_cast<__m128>(v); } }; template <> struct BitCastFromInteger128 { HWY_INLINE __m128d operator()(__m128i v) { return reinterpret_cast<__m128d>(v); } }; } // namespace detail // ------------------------------ Zero // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{(__lsx_vreplgr2vr_w(0))}; } template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{ detail::BitCastFromInteger128>()(__lsx_vreplgr2vr_w(0))}; } template using VFromD = decltype(Zero(D())); namespace detail { template HWY_INLINE VFromD BitCastFromByte(D /* tag */, Vec128 v) { return VFromD{BitCastFromInteger128>()(v.raw)}; } } // namespace detail template HWY_API VFromD BitCast(D d, Vec128().MaxLanes()> v) { return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } // ------------------------------ Set template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{__lsx_vreplgr2vr_b(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{__lsx_vreplgr2vr_h(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{__lsx_vreplgr2vr_w(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{__lsx_vreplgr2vr_d(static_cast(t))}; } template HWY_API VFromD Set(D d, float t) { const RebindToSigned di; return BitCast(d, VFromD{__lsx_vldrepl_w(&t, 0)}); } template HWY_API VFromD Set(D d, double t) { const RebindToSigned di; return BitCast(d, VFromD{__lsx_vldrepl_d(&t, 0)}); } // Generic for all vector lengths. template HWY_API VFromD Set(D df, TFromD t) { const RebindToUnsigned du; static_assert(sizeof(TFromD) == 2, "Expecting [b]f16"); uint16_t bits; CopyBytes<2>(&t, &bits); return BitCast(df, Set(du, bits)); } // ------------------------------ Undefined HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") // Returns a vector with uninitialized elements. template HWY_API VFromD Undefined(D /* tag */) { VFromD v; return v; } HWY_DIAGNOSTICS(pop) // ------------------------------ GetLane template HWY_API T GetLane(const Vec128 v) { return static_cast(__lsx_vpickve2gr_b(v.raw, 0)); } template HWY_API T GetLane(const Vec128 v) { return static_cast(__lsx_vpickve2gr_h(v.raw, 0)); } template HWY_API T GetLane(const Vec128 v) { return static_cast(__lsx_vpickve2gr_w(v.raw, 0)); } template HWY_API T GetLane(const Vec128 v) { return static_cast(__lsx_vpickve2gr_d(v.raw, 0)); } template HWY_API float GetLane(const Vec128 v) { float f32; int32_t i32 = __lsx_vpickve2gr_w(reinterpret_cast<__m128i>(v.raw), 0); CopyBytes<4>(&i32, &f32); return f32; } template HWY_API double GetLane(const Vec128 v) { double f64; int64_t i64 = __lsx_vpickve2gr_d(reinterpret_cast<__m128i>(v.raw), 0); CopyBytes<8>(&i64, &f64); return f64; } // ------------------------------ ResizeBitCast template HWY_API VFromD ResizeBitCast(D d, FromV v) { const Repartition du8; return BitCast(d, VFromD{detail::BitCastToInteger(v.raw)}); } // ------------------------------ Dup128VecFromValues template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16))); const GccI8RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7), static_cast(t8), static_cast(t9), static_cast(t10), static_cast(t11), static_cast(t12), static_cast(t13), static_cast(t14), static_cast(t15)}; return VFromD{reinterpret_cast<__m128i>(raw)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16))); const GccI16RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7)}; return VFromD{reinterpret_cast<__m128i>(raw)}; } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToSigned di; return BitCast(d, Dup128VecFromValues( di, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16))); const GccI32RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3)}; return VFromD{reinterpret_cast<__m128i>(raw)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16))); const GccI64RawVectType raw = {static_cast(t0), static_cast(t1)}; return VFromD{reinterpret_cast<__m128i>(raw)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { typedef float GccF32RawVectType __attribute__((__vector_size__(16))); const GccF32RawVectType raw = {t0, t1, t2, t3}; return VFromD{reinterpret_cast<__m128>(raw)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { typedef double GccF64RawVectType __attribute__((__vector_size__(16))); const GccF64RawVectType raw = {t0, t1}; return VFromD{reinterpret_cast<__m128d>(raw)}; } // ================================================== LOGICAL // ------------------------------ And template HWY_API Vec128 And(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{ __lsx_vand_v(BitCast(du, a).raw, BitCast(du, b).raw)}); } // ------------------------------ AndNot // Returns ~not_mask & mask. template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vandn_v( BitCast(du, not_mask).raw, BitCast(du, mask).raw)}); } // ------------------------------ Or template HWY_API Vec128 Or(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{ __lsx_vor_v(BitCast(du, a).raw, BitCast(du, b).raw)}); } // ------------------------------ Xor template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{ __lsx_vxor_v(BitCast(du, a).raw, BitCast(du, b).raw)}); } // ------------------------------ Not template HWY_API Vec128 Not(const Vec128 v) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{ __lsx_vnor_v(BitCast(du, v).raw, BitCast(du, v).raw)}); } // ------------------------------ Xor3 template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { return Xor(x1, Xor(x2, x3)); } // ------------------------------ Or3 template HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { return Or(o1, Or(o2, o3)); } // ------------------------------ OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { return Or(o, And(a1, a2)); } // ------------------------------ Mask // Mask and Vec are the same (true = FF..FF). template HWY_API Mask128 MaskFromVec(const Vec128 v) { return Mask128{v.raw}; } template using MFromD = decltype(MaskFromVec(VFromD())); template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{v.raw}; } // Generic for all vector lengths. template HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { return VecFromMask(v); } template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { const DFromV d; RebindToSigned di; return BitCast(d, VFromD{__lsx_vbitsel_v( BitCast(di, no).raw, BitCast(di, yes).raw, RebindMask(di, mask).raw)}); } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ BitwiseIfThenElse #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return IfVecThenElse(mask, yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { return Xor(a, b); } // ------------------------------ PopulationCount #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif namespace detail { template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, Vec128 v) { return Vec128{__lsx_vpcnt_b(v.raw)}; } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, Vec128 v) { return Vec128{__lsx_vpcnt_h(v.raw)}; } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, Vec128 v) { return Vec128{__lsx_vpcnt_w(v.raw)}; } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, Vec128 v) { return Vec128{__lsx_vpcnt_d(v.raw)}; } } // namespace detail template HWY_API Vec128 PopulationCount(Vec128 v) { return detail::PopulationCount(hwy::SizeTag(), v); } // ================================================== SIGN // ------------------------------ Neg template HWY_API Vec128 Neg(const Vec128 v) { return Xor(v, SignBit(DFromV())); } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{__lsx_vneg_b(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{__lsx_vneg_h(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{__lsx_vneg_w(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{__lsx_vneg_d(v.raw)}; } // ------------------------------ Floating-point Abs // Generic for all vector lengths template )> HWY_API V Abs(V v) { const DFromV d; const RebindToSigned di; using TI = TFromD; return v & BitCast(d, Set(di, static_cast(~SignMask()))); } // ------------------------------ CopySign // Generic for all vector lengths. template HWY_API V CopySign(const V magn, const V sign) { static_assert(IsFloat>(), "Only makes sense for floating-point"); const DFromV d; const auto msb = SignBit(d); return BitwiseIfThenElse(msb, sign, magn); } // ------------------------------ CopySignToAbs // Generic for all vector lengths. template HWY_API V CopySignToAbs(const V abs, const V sign) { const DFromV d; return OrAnd(abs, SignBit(d), sign); } // ------------------------------ IfThenElseZero template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { const Simd d; return MaskFromVec(Not(VecFromMask(d, m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } // ------------------------------ ExclusiveNeither template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } // ------------------------------ ShiftLeft template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_b(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_h(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_w(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_d(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_b(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_h(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_w(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{__lsx_vslli_d(v.raw, kBits)}; } // ------------------------------ ShiftRight template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrli_b(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrli_h(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrli_w(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrli_d(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrai_b(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrai_h(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrai_w(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(Vec128 v) { return Vec128{__lsx_vsrai_d(v.raw, kBits)}; } // ------------------------------ RoundingShiftRight #ifdef HWY_NATIVE_ROUNDING_SHR #undef HWY_NATIVE_ROUNDING_SHR #else #define HWY_NATIVE_ROUNDING_SHR #endif template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrari_b(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrari_h(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrari_w(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrari_d(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrlri_b(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrlri_h(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrlri_w(v.raw, kBits)}; } template HWY_API Vec128 RoundingShiftRight(Vec128 v) { return Vec128{__lsx_vsrlri_d(v.raw, kBits)}; } // ------------------------------ RoundingShr template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrar_b(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrar_h(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrar_w(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrar_d(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrlr_b(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrlr_h(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrlr_w(v.raw, bits.raw)}; } template HWY_API Vec128 RoundingShr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrlr_d(v.raw, bits.raw)}; } // ------------------------------ RoundingShiftRightSame (RoundingShr) template HWY_API Vec128 RoundingShiftRightSame(const Vec128 v, int bits) { return RoundingShr(v, Set(DFromV(), static_cast(bits))); } // ================================================== MEMORY (1) // ------------------------------ Load 128 template > HWY_API Vec128 Load(D d, const T* HWY_RESTRICT aligned) { const RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vld(aligned, 0)}); } // Partial template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { VFromD v; CopyBytes(p, &v); return v; } // LoadU == Load template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { return Load(d, p); } // ------------------------------ MaskedLoad template HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT p) { return IfThenElseZero(m, LoadU(d, p)); } // ------------------------------ MaskedLoadOr template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, const TFromD* HWY_RESTRICT p) { return IfThenElse(m, LoadU(d, p), v); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. template HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { return Load(d, p); } // ------------------------------ Store 128 template HWY_API void Store(VFromD v, D /* tag */, void* HWY_RESTRICT aligned) { __lsx_vst(v.raw, aligned, 0); } // ------------------------------ Store 64 template HWY_API void Store(VFromD v, D /* tag */, void* HWY_RESTRICT aligned) { __lsx_vstelm_d(v.raw, aligned, 0, 0); } // ------------------------------ Store 32 template HWY_API void Store(VFromD v, D /* tag */, void* HWY_RESTRICT aligned) { __lsx_vstelm_w(v.raw, aligned, 0, 0); } // ------------------------------ Store 16 template HWY_API void Store(VFromD v, D /* tag */, void* HWY_RESTRICT aligned) { __lsx_vstelm_h(v.raw, aligned, 0, 0); } // ------------------------------ Store 8 template HWY_API void Store(VFromD v, D /* tag */, void* HWY_RESTRICT aligned) { __lsx_vstelm_b(v.raw, aligned, 0, 0); } template HWY_API void StoreU(VFromD v, D d, void* HWY_RESTRICT p) { Store(v, d, p); } // ================================================== SWIZZLE (1) // ------------------------------ TableLookupBytes template HWY_API Vec128 TableLookupBytes(const Vec128 bytes, const Vec128 from) { const DFromV d; const Repartition du8; const DFromV d_bytes; const Repartition du8_bytes; return BitCast( d, VFromD{__lsx_vshuf_b(BitCast(du8_bytes, bytes).raw, BitCast(du8_bytes, bytes).raw, (BitCast(du8, from).raw))}); } // ------------------------------ TableLookupBytesOr0 template HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { const DFromV d; const Repartition di8; return BitCast(d, IfThenZeroElse(Lt(BitCast(di8, from), Zero(di8)), BitCast(di8, TableLookupBytes(bytes, from)))); } // ------------------------------ Shuffles (ShiftRight, TableLookupBytes) // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). // Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vshuf4i_w( detail::BitCastToInteger(v.raw), 0xB1)}); } namespace detail { template HWY_API Vec32 ShuffleTwo2301(const Vec32 a, const Vec32 b) { const int8_t _data_idx[] = {1, 0, 19, 18}; __m128i shuffle_idx = __lsx_vld(_data_idx, 0); return Vec32{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)}; } template HWY_API Vec64 ShuffleTwo2301(const Vec64 a, const Vec64 b) { const int16_t _data_idx[] = {9, 8, 3, 2}; __m128i shuffle_idx = __lsx_vld(_data_idx, 0); return Vec64{__lsx_vshuf_h(shuffle_idx, a.raw, b.raw)}; } template HWY_API Vec128 ShuffleTwo2301(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToSigned di; return BitCast(d, Vec128{__lsx_vpermi_w(BitCast(di, b).raw, BitCast(di, a).raw, 0xB1)}); } template HWY_API Vec32 ShuffleTwo1230(const Vec32 a, const Vec32 b) { const int8_t _data_idx[] = {0, 3, 18, 17}; __m128i shuffle_idx = __lsx_vld(_data_idx, 0); return Vec32{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)}; } template HWY_API Vec64 ShuffleTwo1230(const Vec64 a, const Vec64 b) { const int16_t _data_idx[] = {10, 11, 2, 1}; __m128i shuffle_idx = __lsx_vld(_data_idx, 0); auto t0 = __lsx_vshuf_h(shuffle_idx, a.raw, b.raw); return Vec64{t0}; } template HWY_API Vec128 ShuffleTwo1230(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToSigned di; return BitCast(d, Vec128{__lsx_vpermi_w(BitCast(di, b).raw, BitCast(di, a).raw, 0x6C)}); } template HWY_API Vec32 ShuffleTwo3012(const Vec32 a, const Vec32 b) { const int8_t _data_idx[] = {2, 1, 16, 19}; __m128i shuffle_idx = __lsx_vld(_data_idx, 0); return Vec32{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)}; } template HWY_API Vec64 ShuffleTwo3012(const Vec64 a, const Vec64 b) { const int16_t _data_idx[] = {8, 9, 0, 3}; __m128i shuffle_idx = __lsx_vld(_data_idx, 0); return Vec64{__lsx_vshuf_h(shuffle_idx, a.raw, b.raw)}; } template HWY_API Vec128 ShuffleTwo3012(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToSigned di; return BitCast(d, Vec128{__lsx_vpermi_w(BitCast(di, b).raw, BitCast(di, a).raw, 0xC6)}); } } // namespace detail // Swap 64-bit halves template HWY_API Vec128 Shuffle1032(const Vec128 v) { const DFromV d; return BitCast(d, Vec128{__lsx_vshuf4i_w( reinterpret_cast<__m128i>(v.raw), 0x4E)}); } HWY_API Vec128 Shuffle01(const Vec128 v) { return Vec128{__lsx_vshuf4i_w(v.raw, 0x4E)}; } HWY_API Vec128 Shuffle01(const Vec128 v) { return Vec128{__lsx_vshuf4i_w(v.raw, 0x4E)}; } HWY_API Vec128 Shuffle01(const Vec128 v) { const DFromV d; return BitCast(d, Vec128{__lsx_vshuf4i_d( reinterpret_cast<__m128i>(v.raw), reinterpret_cast<__m128i>(v.raw), 0x1)}); } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(const Vec128 v) { const DFromV d; return BitCast(d, Vec128{__lsx_vshuf4i_w( reinterpret_cast<__m128i>(v.raw), 0x39)}); } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(const Vec128 v) { const DFromV d; return BitCast(d, Vec128{__lsx_vshuf4i_w( reinterpret_cast<__m128i>(v.raw), 0x93)}); } // Reverse template HWY_API Vec128 Shuffle0123(const Vec128 v) { const DFromV d; return BitCast(d, Vec128{__lsx_vshuf4i_w( reinterpret_cast<__m128i>(v.raw), 0x1B)}); } // Comparisons fill a lane with 1-bits if the condition is true, else 0. template HWY_API MFromD RebindMask(DTo dto, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); const Simd d; return MaskFromVec(BitCast(dto, VecFromMask(d, m))); } // ================================================== COMPARE template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality // Unsigned template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{__lsx_vseq_b(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{__lsx_vseq_h(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{__lsx_vseq_w(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{__lsx_vseq_d(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{__lsx_vseq_b(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{__lsx_vseq_h(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{__lsx_vseq_w(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{__lsx_vseq_d(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128>(__lsx_vfcmp_ceq_s(a.raw, b.raw))}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128d>(__lsx_vfcmp_ceq_d(a.raw, b.raw))}; } // ------------------------------ Inequality // This cannot have T as a template argument, otherwise it is not more // specialized than rewritten operator== in C++20, leading to compile // errors: https://gcc.godbolt.org/z/xsrPhPvPT. template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128>(__lsx_vfcmp_cune_s(a.raw, b.raw))}; } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128d>(__lsx_vfcmp_cune_d(a.raw, b.raw))}; } // ------------------------------ Strict inequality namespace detail { template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{__lsx_vslt_b(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{__lsx_vslt_h(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{__lsx_vslt_w(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, const Vec128 a, const Vec128 b) { return Mask128{__lsx_vslt_d(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{__lsx_vslt_b(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{__lsx_vslt_h(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{__lsx_vslt_w(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, const Vec128 a, const Vec128 b) { return Mask128{__lsx_vslt_d(b.raw, a.raw)}; } template HWY_INLINE Mask128 Gt(hwy::UnsignedTag /*tag*/, Vec128 a, Vec128 b) { const DFromV du; const RebindToSigned di; const Vec128 msb = Set(du, (LimitsMax() >> 1) + 1); const auto sa = BitCast(di, Xor(a, msb)); const auto sb = BitCast(di, Xor(b, msb)); return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); } template HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128>(__lsx_vfcmp_clt_s(b.raw, a.raw))}; } template HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128d>(__lsx_vfcmp_clt_d(b.raw, a.raw))}; } } // namespace detail template HWY_INLINE Mask128 operator>(Vec128 a, Vec128 b) { return detail::Gt(hwy::TypeTag(), a, b); } // ------------------------------ Weak inequality namespace detail { template HWY_INLINE Mask128 Ge(hwy::SignedTag tag, Vec128 a, Vec128 b) { return Not(Gt(tag, b, a)); } template HWY_INLINE Mask128 Ge(hwy::UnsignedTag tag, Vec128 a, Vec128 b) { return Not(Gt(tag, b, a)); } template HWY_INLINE Mask128 Ge(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128>(__lsx_vfcmp_cle_s(b.raw, a.raw))}; } template HWY_INLINE Mask128 Ge(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128d>(__lsx_vfcmp_cle_d(b.raw, a.raw))}; } } // namespace detail template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return detail::Ge(hwy::TypeTag(), a, b); } // ------------------------------ Reversed comparisons template HWY_API Mask128 operator<(Vec128 a, Vec128 b) { return b > a; } template HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { return b >= a; } // ------------------------------ Iota (Load) namespace detail { template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues( d, TFromD{0}, TFromD{1}, TFromD{2}, TFromD{3}, TFromD{4}, TFromD{5}, TFromD{6}, TFromD{7}, TFromD{8}, TFromD{9}, TFromD{10}, TFromD{11}, TFromD{12}, TFromD{13}, TFromD{14}, TFromD{15}); } template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues(d, TFromD{0}, TFromD{1}, TFromD{2}, TFromD{3}, TFromD{4}, TFromD{5}, TFromD{6}, TFromD{7}); } template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues( d, static_cast>(0), static_cast>(1), static_cast>(2), static_cast>(3)); } template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues(d, static_cast>(0), static_cast>(1)); } } // namespace detail template HWY_API VFromD Iota(D d, const T2 first) { const auto result_iota = detail::Iota0(d) + Set(d, static_cast>(first)); return result_iota; } // ------------------------------ FirstN (Iota, Lt) template HWY_API MFromD FirstN(D d, size_t num) { const RebindToSigned di; // Signed comparisons are cheaper. using TI = TFromD; return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(num))); } // ------------------------------ InterleaveLower // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides // the least-significant lane) and "b". To concatenate two half-width integers // into one, use ZipLower/Upper instead (also works with scalar). template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{__lsx_vilvl_b(b.raw, a.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{__lsx_vilvl_h(b.raw, a.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{__lsx_vilvl_w(b.raw, a.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{__lsx_vilvl_d(b.raw, a.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{reinterpret_cast<__m128>(__lsx_vilvl_w( reinterpret_cast<__m128i>(b.raw), reinterpret_cast<__m128i>(a.raw)))}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{reinterpret_cast<__m128d>(__lsx_vilvl_d( reinterpret_cast<__m128i>(b.raw), reinterpret_cast<__m128i>(a.raw)))}; } // Generic for all vector lengths. template HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { return InterleaveLower(a, b); } // ------------------------------ BlendedStore template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); } // ================================================== ARITHMETIC // ------------------------------ Addition // Unsigned template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_b(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_h(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_w(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_d(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_b(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_h(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_w(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vadd_d(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfadd_s(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfadd_d(a.raw, b.raw)}; } // ------------------------------ Subtraction // Unsigned template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_b(a.raw, b.raw)}; } template HWY_API Vec128 operator-(Vec128 a, Vec128 b) { return Vec128{__lsx_vsub_h(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_w(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_d(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_b(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_h(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_w(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsub_d(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfsub_s(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfsub_d(a.raw, b.raw)}; } // ------------------------------ SumsOf2 namespace detail { template HWY_INLINE VFromD>> SumsOf2( hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) { return VFromD>>{__lsx_vhaddw_h_b(v.raw, v.raw)}; } template HWY_INLINE VFromD>> SumsOf2( hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) { return VFromD>>{__lsx_vhaddw_hu_bu(v.raw, v.raw)}; } template HWY_INLINE VFromD>> SumsOf2( hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) { return VFromD>>{__lsx_vhaddw_w_h(v.raw, v.raw)}; } template HWY_INLINE VFromD>> SumsOf2( hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) { return VFromD>>{__lsx_vhaddw_wu_hu(v.raw, v.raw)}; } template HWY_INLINE VFromD>> SumsOf2( hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) { return VFromD>>{__lsx_vhaddw_d_w(v.raw, v.raw)}; } template HWY_INLINE VFromD>> SumsOf2( hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) { return VFromD>>{__lsx_vhaddw_du_wu(v.raw, v.raw)}; } } // namespace detail // ------------------------------ SumsOf8 template HWY_API Vec128 SumsOf8(const Vec128 v) { __m128i temp = __lsx_vhaddw_hu_bu(v.raw, v.raw); temp = __lsx_vhaddw_wu_hu(temp, temp); return Vec128{__lsx_vhaddw_du_wu(temp, temp)}; } template HWY_API Vec128 SumsOf8(const Vec128 v) { __m128i temp = __lsx_vhaddw_h_b(v.raw, v.raw); temp = __lsx_vhaddw_w_h(temp, temp); return Vec128{__lsx_vhaddw_d_w(temp, temp)}; } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB #undef HWY_NATIVE_I32_SATURATED_ADDSUB #else #define HWY_NATIVE_I32_SATURATED_ADDSUB #endif #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB #undef HWY_NATIVE_I64_SATURATED_ADDSUB #else #define HWY_NATIVE_I64_SATURATED_ADDSUB #endif #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB #undef HWY_NATIVE_U32_SATURATED_ADDSUB #else #define HWY_NATIVE_U32_SATURATED_ADDSUB #endif #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB #undef HWY_NATIVE_U64_SATURATED_ADDSUB #else #define HWY_NATIVE_U64_SATURATED_ADDSUB #endif // Unsigned template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_bu(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_hu(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_wu(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_du(a.raw, b.raw)}; } // signed template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_b(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_h(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_w(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vsadd_d(a.raw, b.raw)}; } // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. // Unsigned template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_bu(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_hu(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_wu(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_du(a.raw, b.raw)}; } // signed template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_b(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_h(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_w(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vssub_d(a.raw, b.raw)}; } // ------------------------------ AverageRound // Returns (a + b + 1) / 2 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32 #undef HWY_NATIVE_AVERAGE_ROUND_UI32 #else #define HWY_NATIVE_AVERAGE_ROUND_UI32 #endif #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64 #undef HWY_NATIVE_AVERAGE_ROUND_UI64 #else #define HWY_NATIVE_AVERAGE_ROUND_UI64 #endif // Unsigned template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_bu(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_hu(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_wu(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_du(a.raw, b.raw)}; } // signed template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_b(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_h(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_w(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vavgr_d(a.raw, b.raw)}; } // ------------------------------ Integer/Float multiplication // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. #ifdef HWY_NATIVE_MUL_8 #undef HWY_NATIVE_MUL_8 #else #define HWY_NATIVE_MUL_8 #endif #ifdef HWY_NATIVE_MUL_64 #undef HWY_NATIVE_MUL_64 #else #define HWY_NATIVE_MUL_64 #endif template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmul_b(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmul_h(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmul_w(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmul_d(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfmul_s(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfmul_d(a.raw, b.raw)}; } // ------------------------------ MulHigh // Usigned template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_bu(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_hu(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_wu(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_du(a.raw, b.raw)}; } // signed template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_b(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_h(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_w(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vmuh_d(a.raw, b.raw)}; } // ------------------------------ MulEven template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_h_b(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_h_bu(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_w_h(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_w_hu(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_d_w(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_d_wu(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_q_d(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwev_q_du(a.raw, b.raw)}; } // ------------------------------ MulOdd template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_h_b(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_h_bu(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_w_h(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_w_hu(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_d_w(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_d_wu(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_q_d(a.raw, b.raw)}; } template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { return Vec128{__lsx_vmulwod_q_du(a.raw, b.raw)}; } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec128 RotateRight(const Vec128 v) { return Vec128{__lsx_vrotri_b(v.raw, kBits)}; } template HWY_API Vec128 RotateRight(const Vec128 v) { return Vec128{__lsx_vrotri_h(v.raw, kBits)}; } template HWY_API Vec128 RotateRight(const Vec128 v) { return Vec128{__lsx_vrotri_w(v.raw, kBits)}; } template HWY_API Vec128 RotateRight(const Vec128 v) { return Vec128{__lsx_vrotri_d(v.raw, kBits)}; } // ------------------------------ Ror #ifdef HWY_NATIVE_ROL_ROR_8 #undef HWY_NATIVE_ROL_ROR_8 #else #define HWY_NATIVE_ROL_ROR_8 #endif #ifdef HWY_NATIVE_ROL_ROR_16 #undef HWY_NATIVE_ROL_ROR_16 #else #define HWY_NATIVE_ROL_ROR_16 #endif #ifdef HWY_NATIVE_ROL_ROR_32_64 #undef HWY_NATIVE_ROL_ROR_32_64 #else #define HWY_NATIVE_ROL_ROR_32_64 #endif template HWY_API Vec128 Ror(Vec128 a, Vec128 b) { return Vec128{__lsx_vrotr_b(a.raw, b.raw)}; } template HWY_API Vec128 Ror(Vec128 a, Vec128 b) { return Vec128{__lsx_vrotr_h(a.raw, b.raw)}; } template HWY_API Vec128 Ror(Vec128 a, Vec128 b) { return Vec128{__lsx_vrotr_w(a.raw, b.raw)}; } template HWY_API Vec128 Ror(Vec128 a, Vec128 b) { return Vec128{__lsx_vrotr_d(a.raw, b.raw)}; } // Rol is generic for all vector lengths template HWY_API V Rol(V a, V b) { const DFromV d; const RebindToSigned di; return Ror(a, BitCast(d, Neg(BitCast(di, b)))); } // ------------------------------ RotateLeftSame/RotateRightSame #ifdef HWY_NATIVE_ROL_ROR_SAME_8 #undef HWY_NATIVE_ROL_ROR_SAME_8 #else #define HWY_NATIVE_ROL_ROR_SAME_8 #endif #ifdef HWY_NATIVE_ROL_ROR_SAME_16 #undef HWY_NATIVE_ROL_ROR_SAME_16 #else #define HWY_NATIVE_ROL_ROR_SAME_16 #endif #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64 #undef HWY_NATIVE_ROL_ROR_SAME_32_64 #else #define HWY_NATIVE_ROL_ROR_SAME_32_64 #endif // RotateLeftSame/RotateRightSame are generic for all vector lengths template HWY_API V RotateLeftSame(V v, int bits) { using T = TFromV; const DFromV d; return Rol(v, Set(d, static_cast(bits))); } template HWY_API V RotateRightSame(V v, int bits) { using T = TFromV; const DFromV d; return Ror(v, Set(d, static_cast(bits))); } // ------------------------------ BroadcastSignBit template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight(v); } // ------------------------------ Integer Abs // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{__lsx_vabsd_b(v.raw, __lsx_vreplgr2vr_b(0))}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{__lsx_vabsd_h(v.raw, __lsx_vreplgr2vr_b(0))}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{__lsx_vabsd_w(v.raw, __lsx_vreplgr2vr_b(0))}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{__lsx_vabsd_d(v.raw, __lsx_vreplgr2vr_b(0))}; } // ------------------------------ SaturatedAbs #ifdef HWY_NATIVE_SATURATED_ABS #undef HWY_NATIVE_SATURATED_ABS #else #define HWY_NATIVE_SATURATED_ABS #endif template )> HWY_API V SaturatedAbs(V v) { const DFromV d; const RebindToUnsigned du; return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v)))); } template )> HWY_API V SaturatedAbs(V v) { return Max(v, SaturatedSub(Zero(DFromV()), v)); } template )> HWY_API V SaturatedAbs(V v) { const auto abs_v = Abs(v); const DFromV d; const RebindToUnsigned du; return BitCast(d, Min(BitCast(du, abs_v), Set(du, static_cast(LimitsMax())))); } template )> HWY_API V SaturatedAbs(V v) { const auto abs_v = Abs(v); return Add(abs_v, BroadcastSignBit(abs_v)); } // ------------------------------ IfNegativeThenElse template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; const RebindToSigned di; Mask128 m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); return IfThenElse(m, yes, no); } // ------------------------------ IfNegativeThenNegOrUndefIfZero #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #else #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #endif template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero(Vec128 mask, Vec128 v) { return Vec128{__lsx_vsigncov_b(mask.raw, v.raw)}; } template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero( Vec128 mask, Vec128 v) { return Vec128{__lsx_vsigncov_h(mask.raw, v.raw)}; } template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero( Vec128 mask, Vec128 v) { return Vec128{__lsx_vsigncov_w(mask.raw, v.raw)}; } template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero( Vec128 mask, Vec128 v) { return Vec128{__lsx_vsigncov_d(mask.raw, v.raw)}; } // ------------------------------ ShiftLeftSame/ShiftRightSame template HWY_API Vec128 ShiftLeftSame(const Vec128 v, int bits) { return v << Set(DFromV(), static_cast(bits)); } template HWY_API Vec128 ShiftRightSame(const Vec128 v, int bits) { return v >> Set(DFromV(), static_cast(bits)); } // ------------------------------ Integer/Float Div #ifdef HWY_NATIVE_INT_DIV #undef HWY_NATIVE_INT_DIV #else #define HWY_NATIVE_INT_DIV #endif template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vdiv.b %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vdiv.bu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vdiv.h %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vdiv.hu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vdiv.w %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vdiv.wu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vdiv.d %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vdiv.du %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfdiv_s(a.raw, b.raw)}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{__lsx_vfdiv_d(a.raw, b.raw)}; } // ------------------------------ Integer Mod template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vmod.b %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vmod.bu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vmod.h %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vmod.hu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vmod.w %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vmod.wu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero // or a[i] == LimitsMin() && b[i] == -1 __m128i raw_result; __asm__("vmod.d %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } template HWY_API Vec128 operator%(const Vec128 a, const Vec128 b) { // Use inline assembly to avoid undefined behavior if any lanes of b are zero __m128i raw_result; __asm__("vmod.du %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); return Vec128{raw_result}; } // ------------------------------ ApproximateReciprocal #ifdef HWY_NATIVE_F64_APPROX_RECIP #undef HWY_NATIVE_F64_APPROX_RECIP #else #define HWY_NATIVE_F64_APPROX_RECIP #endif template HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { return Vec128{__lsx_vfrecip_s(v.raw)}; } template HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { return Vec128{__lsx_vfrecip_d(v.raw)}; } // ------------------------------ Absolute value of difference #ifdef HWY_NATIVE_INTEGER_ABS_DIFF #undef HWY_NATIVE_INTEGER_ABS_DIFF #else #define HWY_NATIVE_INTEGER_ABS_DIFF #endif template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_b(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_h(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_w(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_d(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_bu(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_hu(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_wu(a.raw, b.raw)}; } template HWY_API Vec128 AbsDiff(const Vec128 a, Vec128 b) { return Vec128{__lsx_vabsd_du(a.raw, b.raw)}; } // Generic for all vector lengths. template HWY_API V AbsDiff(V a, V b) { return Abs(a - b); } // ------------------------------ Integer/Float multiply-add #ifdef HWY_NATIVE_INT_FMA #undef HWY_NATIVE_INT_FMA #else #define HWY_NATIVE_INT_FMA #endif template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vmadd_b(add.raw, mul.raw, x.raw)}; } template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vmadd_h(add.raw, mul.raw, x.raw)}; } template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vmadd_w(add.raw, mul.raw, x.raw)}; } template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vmadd_d(add.raw, mul.raw, x.raw)}; } template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vfmadd_s(mul.raw, x.raw, add.raw)}; } template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vfmadd_d(mul.raw, x.raw, add.raw)}; } // Unsinged template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return mul * x + add; } // ------------------------------ Integer/Float NegMulAdd template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vmsub_b(add.raw, mul.raw, x.raw)}; } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{__lsx_vmsub_h(add.raw, mul.raw, x.raw)}; } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{__lsx_vmsub_w(sub.raw, mul.raw, x.raw)}; } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{__lsx_vmsub_d(sub.raw, mul.raw, x.raw)}; } // Float/unsigned template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return add - mul * x; } // ------------------------------ Float MulSub // float template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{__lsx_vfmsub_s(x.raw, mul.raw, sub.raw)}; } template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{__lsx_vfmsub_d(x.raw, mul.raw, sub.raw)}; } // unsigned template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return mul * x - sub; } // ------------------------------ Float NegMulSub // float/unsigned template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root template HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{__lsx_vfsqrt_s(v.raw)}; } template HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{__lsx_vfsqrt_d(v.raw)}; } // ------------------------------ ApproximateReciprocalSqrt #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef HWY_NATIVE_F64_APPROX_RSQRT #else #define HWY_NATIVE_F64_APPROX_RSQRT #endif template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { return Vec128{__lsx_vfrsqrt_s(v.raw)}; } template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { return Vec128{__lsx_vfrsqrt_d(v.raw)}; } // ------------------------------ Min template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_bu(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_hu(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_wu(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_du(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_b(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_h(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_w(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vmin_d(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmin_s(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmin_d(a.raw, b.raw)}; } // ------------------------------ Max template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_bu(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_hu(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_wu(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_du(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_b(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_h(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_w(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vmax_d(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmax_s(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmax_d(a.raw, b.raw)}; } // ------------------------------ MinMagnitude and MaxMagnitude #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE #else #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE #endif template HWY_API Vec128 MinMagnitude(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmina_s(a.raw, b.raw)}; } template HWY_API Vec128 MinMagnitude(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmina_d(a.raw, b.raw)}; } template HWY_API Vec128 MaxMagnitude(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmaxa_s(a.raw, b.raw)}; } template HWY_API Vec128 MaxMagnitude(Vec128 a, Vec128 b) { return Vec128{__lsx_vfmaxa_d(a.raw, b.raw)}; } // ------------------------------ Non-temporal stores // Same as aligned stores on non-x86. template HWY_API void Stream(const VFromD v, D d, TFromD* HWY_RESTRICT aligned) { __builtin_prefetch(aligned, 1, 0); Store(v, d, aligned); } // ------------------------------ Scatter in generic_ops-inl.h // ------------------------------ Gather in generic_ops-inl.h // ================================================== SWIZZLE (2) // ------------------------------ LowerHalf template HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { return VFromD{v.raw}; } template HWY_API Vec128 LowerHalf(Vec128 v) { return Vec128{v.raw}; } // ------------------------------ ShiftLeftBytes template HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); if (kBytes == 0) return v; const RebindToUnsigned du; return BitCast( d, VFromD{__lsx_vbsll_v(BitCast(du, v).raw, kBytes)}); } // Generic for all vector lengths. template HWY_API V ShiftLeftBytes(const V v) { return ShiftLeftBytes(DFromV(), v); } // ------------------------------ ShiftLeftLanes // Generic for all vector lengths. template HWY_API VFromD ShiftLeftLanes(D d, const VFromD v) { const Repartition d8; return BitCast(d, ShiftLeftBytes)>(BitCast(d8, v))); } // Generic for all vector lengths. template HWY_API V ShiftLeftLanes(const V v) { return ShiftLeftLanes(DFromV(), v); } // ------------------------------ ShiftRightBytes template HWY_API VFromD ShiftRightBytes(D d, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); if (kBytes == 0) return v; const RebindToUnsigned du; // For partial vectors, clear upper lanes so we shift in zeros. if (d.MaxBytes() != 16) { const Full128> dfull; const VFromD vfull{v.raw}; v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; } return BitCast( d, VFromD{__lsx_vbsrl_v(BitCast(du, v).raw, kBytes)}); } // ------------------------------ ShiftRightLanes // Generic for all vector lengths. template HWY_API VFromD ShiftRightLanes(D d, const VFromD v) { const Repartition d8; constexpr size_t kBytes = kLanes * sizeof(TFromD); return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) template HWY_API VFromD UpperHalf(D d, VFromD> v) { const Twice> dut; using VUT = VFromD; // for float16_t const VUT vut = BitCast(dut, v); return BitCast(d, LowerHalf(VUT{__lsx_vilvh_d(vut.raw, vut.raw)})); } // Partial template HWY_API VFromD UpperHalf(D d, VFromD> v) { return LowerHalf(d, ShiftRightBytes(Twice(), v)); } // ------------------------------ ExtractLane (UpperHalf) namespace detail { template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); return static_cast(__lsx_vpickve2gr_b(v.raw, kLane) & 0xFF); } template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); const DFromV d; const RebindToUnsigned du; const uint16_t lane = static_cast( __lsx_vpickve2gr_hu(BitCast(du, v).raw, kLane) & 0xFFFF); return BitCastScalar(lane); } template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); return static_cast(__lsx_vpickve2gr_w(v.raw, kLane)); } template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); return static_cast(__lsx_vpickve2gr_d(v.raw, kLane)); } template HWY_INLINE float ExtractLane(const Vec128 v) { float f32; int32_t i32 = __lsx_vpickve2gr_w(reinterpret_cast<__m128i>(v.raw), kLane); CopyBytes<4>(&i32, &f32); return f32; } template HWY_INLINE double ExtractLane(const Vec128 v) { double f64; int64_t i64 = __lsx_vpickve2gr_d(reinterpret_cast<__m128i>(v.raw), kLane); CopyBytes<8>(&i64, &f64); return f64; } } // namespace detail template HWY_API T ExtractLane(const Vec128 v, size_t i) { HWY_DASSERT(i == 0); (void)i; return GetLane(v); } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); } } #endif alignas(16) T lanes[2]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); } } #endif alignas(16) T lanes[4]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); } } #endif alignas(16) T lanes[8]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); case 8: return detail::ExtractLane<8>(v); case 9: return detail::ExtractLane<9>(v); case 10: return detail::ExtractLane<10>(v); case 11: return detail::ExtractLane<11>(v); case 12: return detail::ExtractLane<12>(v); case 13: return detail::ExtractLane<13>(v); case 14: return detail::ExtractLane<14>(v); case 15: return detail::ExtractLane<15>(v); } } #endif alignas(16) T lanes[16]; Store(v, DFromV(), lanes); return lanes[i]; } // ------------------------------ InsertLane (UpperHalf) namespace detail { template HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV t) { const DFromV d; #if HWY_TARGET <= HWY_AVX3 using RawMask = decltype(MaskFromVec(VFromD()).raw); const auto mask = MFromD{static_cast(uint64_t{1} << i)}; #else const RebindToUnsigned du; using TU = TFromD; const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast(i))); #endif return IfThenElse(mask, Set(d, t), v); } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{__lsx_vinsgr2vr_b(v.raw, t, kLane)}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); const DFromV d; const RebindToUnsigned du; const uint16_t bits = BitCastScalar(t); return BitCast(d, VFromD{ __lsx_vinsgr2vr_h(BitCast(du, v).raw, bits, kLane)}); } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{__lsx_vinsgr2vr_w(v.raw, t, kLane)}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{__lsx_vinsgr2vr_d(v.raw, t, kLane)}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { static_assert(kLane < N, "Lane index out of bounds"); const DFromV d; int ti = BitCastScalar(t); RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vinsgr2vr_w( reinterpret_cast<__m128i>(v.raw), ti, kLane)}); } template HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { static_assert(kLane < 2, "Lane index out of bounds"); const DFromV d; long int ti = BitCastScalar(t); RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vinsgr2vr_d( reinterpret_cast<__m128i>(v.raw), ti, kLane)}); } } // namespace detail template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { HWY_DASSERT(i == 0); (void)i; return Set(DFromV(), t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); case 8: return detail::InsertLane<8>(v, t); case 9: return detail::InsertLane<9>(v, t); case 10: return detail::InsertLane<10>(v, t); case 11: return detail::InsertLane<11>(v, t); case 12: return detail::InsertLane<12>(v, t); case 13: return detail::InsertLane<13>(v, t); case 14: return detail::InsertLane<14>(v, t); case 15: return detail::InsertLane<15>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } // ------------------------------ CombineShiftRightBytes template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { static_assert(0 < kBytes && kBytes < 16, "kBytes invalid"); return Or(ShiftRightBytes(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi)); } template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { constexpr size_t kSize = d.MaxBytes(); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Twice dt; return VFromD{ShiftRightBytes(dt, Combine(dt, hi, lo)).raw}; } // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{__lsx_vreplvei_b(v.raw, kLane)}; } template HWY_API Vec128 Broadcast(Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{__lsx_vreplvei_h(v.raw, kLane)}; } template HWY_API Vec128 Broadcast(Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); const DFromV d; return BitCast(d, Vec128{__lsx_vreplvei_w( reinterpret_cast<__m128i>(v.raw), kLane)}); } template HWY_API Vec128 Broadcast(Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); const DFromV d; return BitCast(d, Vec128{__lsx_vreplvei_d( reinterpret_cast<__m128i>(v.raw), kLane)}); } // ------------------------------ TableLookupLanes (Shuffle01) // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. template struct Indices128 { __m128i raw; }; namespace detail { template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; return Iota(d8, 0); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; return Load(d8, kBroadcastLaneBytes); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; return Load(d8, kBroadcastLaneBytes); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; return Load(d8, kBroadcastLaneBytes); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; return Zero(d8); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; return Load(d8, kByteOffsets); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; return Load(d8, kByteOffsets); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; return Load(d8, kByteOffsets); } } // namespace detail template HWY_API Indices128, MaxLanes(D())> IndicesFromVec( D d, Vec128 vec) { using T = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const RebindToUnsigned du; using TU = TFromD; HWY_DASSERT(AllTrue( du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); #endif (void)d; return Indices128, MaxLanes(D())>{BitCast(d, vec).raw}; } template HWY_API Indices128, MaxLanes(D())> IndicesFromVec( D d, Vec128 vec) { using T = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const RebindToUnsigned du; using TU = TFromD; HWY_DASSERT(AllTrue( du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); #endif const Repartition d8; using V8 = VFromD; // Broadcast each lane index to all bytes of T and shift to bytes const V8 lane_indices = TableLookupBytes( BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T))); const V8 byte_indices = ShiftLeft(lane_indices); const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); return Indices128, MaxLanes(D())>{sum.raw}; } template HWY_API Indices128, MaxLanes(D())> SetTableIndices(D d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { using TI = MakeSigned; const DFromV d; const Rebind di; auto t1 = TableLookupBytes(BitCast(di, v), Vec128{idx.raw}); return BitCast(d, t1); } // Single lane: no change template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 /* idx */) { return v; } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { return v; } // ------------------------------ Reverse (Shuffle0123, Shuffle2301) // Single lane: no change template HWY_API VFromD Reverse(D /* tag */, VFromD v) { return v; } // 32-bit x2: shuffle template HWY_API VFromD Reverse(D /* tag */, const VFromD v) { return VFromD{Shuffle2301(Vec128>{v.raw}).raw}; } // 64-bit x2: shuffle template HWY_API VFromD Reverse(D /* tag */, const VFromD v) { return Shuffle01(v); } // 32-bit x4: shuffle template HWY_API VFromD Reverse(D /* tag */, const VFromD v) { return Shuffle0123(v); } // 16-bit template HWY_API VFromD Reverse(D d, const VFromD v) { const RebindToUnsigned du; using VU = VFromD; const VU vu = BitCast(du, v); constexpr size_t kN = MaxLanes(d); if (kN == 1) return v; if (kN == 2) { return BitCast(d, VU{__lsx_vshuf4i_h(vu.raw, 0x11)}); } if (kN == 4) { return BitCast(d, VU{__lsx_vshuf4i_h(vu.raw, 0x1B)}); } const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); return BitCast(d, TableLookupBytes(v, shuffle)); } template HWY_API VFromD Reverse(D d, const VFromD v) { static constexpr int kN = static_cast(MaxLanes(d)); if (kN == 1) return v; alignas(16) static constexpr int8_t _tmp_data[] = { kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16}; return VFromD{__lsx_vshuf_b(v.raw, v.raw, __lsx_vld(_tmp_data, 0))}; } // ------------------------------ Reverse2 // Single lane: no change template HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return v; } template HWY_API VFromD Reverse2(D d, const VFromD v) { const RepartitionToWide> dw; return BitCast(d, RotateRight<16>(BitCast(dw, v))); } // Generic for all vector lengths. template HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return Shuffle2301(v); } // Generic for all vector lengths. template HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return Shuffle01(v); } // ------------------------------ Reverse4 template HWY_API VFromD Reverse4(D /* tag */, VFromD v) { return VFromD{__lsx_vshuf4i_h(v.raw, 0x1B)}; } // Generic for all vector lengths. template HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { return Shuffle0123(v); } template HWY_API VFromD Reverse4(D /* tag */, VFromD /* v */) { HWY_ASSERT(0); // don't have 4 u64 lanes } // ------------------------------ Reverse8 template HWY_API VFromD Reverse8(D d, const VFromD v) { const RepartitionToWide dw; return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); } template HWY_API VFromD Reverse8(D /* tag */, VFromD /* v */) { HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit } // ------------------------------ InterleaveUpper (UpperHalf) // Full template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{__lsx_vilvh_b(b.raw, a.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{__lsx_vilvh_h(b.raw, a.raw)}; } template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const RebindToSigned df; return BitCast(d, VFromD{ __lsx_vilvh_w(BitCast(df, b).raw, BitCast(df, a).raw)}); } template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const RebindToSigned dd; return BitCast(d, VFromD{ __lsx_vilvh_d(BitCast(dd, b).raw, BitCast(dd, a).raw)}); } // Partial template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const Half d2; return InterleaveLower(d, VFromD{UpperHalf(d2, a).raw}, VFromD{UpperHalf(d2, b).raw}); } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ================================================== CONVERT (1) // ------------------------------ PromoteTo unsigned template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vsllwil_hu_bu(v.raw, 0)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vsllwil_wu_hu(v.raw, 0)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vsllwil_du_wu(v.raw, 0)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { const __m128i u16 = __lsx_vsllwil_hu_bu(v.raw, 0); return VFromD{__lsx_vsllwil_wu_hu(u16, 0)}; } template HWY_API VFromD PromoteTo(D d, VFromD> v) { const Rebind du32; return PromoteTo(d, PromoteTo(du32, v)); } template HWY_API VFromD PromoteTo(D /*tag*/, VFromD> v) { const __m128i u32 = __lsx_vsllwil_wu_hu(v.raw, 0); return VFromD{__lsx_vsllwil_du_wu(u32, 0)}; } // Unsigned to signed: same plus cast. template ), sizeof(TFromV)), HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))> HWY_API VFromD PromoteTo(D di, V v) { const RebindToUnsigned du; return BitCast(di, PromoteTo(du, v)); } // signed template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vsllwil_h_b(v.raw, 0)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vsllwil_w_h(v.raw, 0)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vsllwil_d_w(v.raw, 0)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { const __m128i i16 = __lsx_vsllwil_h_b(v.raw, 0); return VFromD{__lsx_vsllwil_w_h(i16, 0)}; } template HWY_API VFromD PromoteTo(D d, VFromD> v) { const Rebind di32; return PromoteTo(d, PromoteTo(di32, v)); } template HWY_API VFromD PromoteTo(D /*tag*/, VFromD> v) { const __m128i i32 = __lsx_vsllwil_w_h(v.raw, 0); return VFromD{__lsx_vsllwil_d_w(i32, 0)}; } // -------------------- PromoteTo float #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vfcvtl_s_h(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vfcvtl_d_s(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vffintl_d_w(v.raw)}; } template HWY_API VFromD PromoteTo(D df64, VFromD> v) { const Rebind di32; const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v)); return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result, Set(df64, 4294967296.0), Zero(df64)); } template HWY_API VFromD PromoteTo(D d, VFromD> v) { const RebindToSigned di32; const Rebind du16; return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } // ------------------------------ Per4LaneBlockShuffle namespace detail { #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #else #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #endif template HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0) { typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); const GccU32RawVectType raw = {x0, x1, x2, x3}; return ResizeBitCast(d, Vec128{reinterpret_cast<__m128i>(raw)}); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { constexpr int kShuffle = static_cast(kIdx3210 & 0xFF); return V{__lsx_vshuf4i_b(v.raw, kShuffle)}; } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; // for float16_t constexpr int kShuffle = static_cast(kIdx3210 & 0xFF); return BitCast( d, VFromD{__lsx_vshuf4i_h(BitCast(du, v).raw, kShuffle)}); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, hwy::SizeTag<16> /*vect_size_tag*/, V v) { const DFromV d; constexpr int kShuffle = static_cast(kIdx3210 & 0xFF); const RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vshuf4i_w( reinterpret_cast<__m128i>(v.raw), kShuffle)}); } } // namespace detail // ------------------------------ SlideUpLanes namespace detail { template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Full64 du64; const auto vu64 = ResizeBitCast(du64, v); return ResizeBitCast( d, ShiftLeftSame(vu64, static_cast(amt * sizeof(TFromV) * 8))); } template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Repartition du8; const auto idx = Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); } } // namespace detail template HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); case 8: return ShiftLeftLanes<8>(d, v); case 9: return ShiftLeftLanes<9>(d, v); case 10: return ShiftLeftLanes<10>(d, v); case 11: return ShiftLeftLanes<11>(d, v); case 12: return ShiftLeftLanes<12>(d, v); case 13: return ShiftLeftLanes<13>(d, v); case 14: return ShiftLeftLanes<14>(d, v); case 15: return ShiftLeftLanes<15>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } // ------------------------------ SlideDownLanes namespace detail { template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition, decltype(d)> dv; return BitCast(d, ShiftRightSame(BitCast(dv, v), static_cast(amt * sizeof(TFromV) * 8))); } template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition di8; auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); } } // namespace detail template HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); case 4: return ShiftRightLanes<4>(d, v); case 5: return ShiftRightLanes<5>(d, v); case 6: return ShiftRightLanes<6>(d, v); case 7: return ShiftRightLanes<7>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); case 4: return ShiftRightLanes<4>(d, v); case 5: return ShiftRightLanes<5>(d, v); case 6: return ShiftRightLanes<6>(d, v); case 7: return ShiftRightLanes<7>(d, v); case 8: return ShiftRightLanes<8>(d, v); case 9: return ShiftRightLanes<9>(d, v); case 10: return ShiftRightLanes<10>(d, v); case 11: return ShiftRightLanes<11>(d, v); case 12: return ShiftRightLanes<12>(d, v); case 13: return ShiftRightLanes<13>(d, v); case 14: return ShiftRightLanes<14>(d, v); case 15: return ShiftRightLanes<15>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template >> HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { const Half dh; const RebindToUnsigned duh; // Treat half-width input as one lane, and expand to two lanes. using VU = Vec128, 2>; const VU lo{BitCast(duh, lo_half).raw}; const VU hi{BitCast(duh, hi_half).raw}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (Combine) template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { return Combine(d, Zero(Half()), lo); } // ------------------------------ Concat full (InterleaveLower) // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const Repartition d64; return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); } // hiH,hiL loH,loL |-> hiH,loH (= upper halves) template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const Repartition d64; return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); } // hiH,hiL loH,loL |-> hiL,loH (= inner halves) template HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { return CombineShiftRightBytes<8>(d, hi, lo); } // hiH,hiL loH,loL |-> hiH,loL (= outer halves) template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { return BitCast(d, Vec128{__lsx_vshuf4i_d( reinterpret_cast<__m128i>(lo.raw), reinterpret_cast<__m128i>(hi.raw), 0xC)}); } // ------------------------------ Concat partial (Combine, LowerHalf) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); } template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatLowerUpper(D d, const VFromD hi, const VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); } // ------------------------------ ConcatOdd // 8-bit full template HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) { return VFromD{__lsx_vpickod_b(hi.raw, lo.raw)}; } // 8-bit x8 template HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) { __m128i _tmp = __lsx_vpickod_b(hi.raw, lo.raw); return VFromD{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; } // 8-bit x4 template HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) { __m128i _tmp = __lsx_vpickod_b(hi.raw, lo.raw); return VFromD{__lsx_vextrins_h(_tmp, _tmp, 0x14)}; } // 16-bit full template HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) { return VFromD{__lsx_vpickod_h(hi.raw, lo.raw)}; } // 16-bit x4 template HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) { __m128i _tmp = __lsx_vpickod_h(hi.raw, lo.raw); return VFromD{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; } // 32-bit full template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { return BitCast( d, Vec128{__lsx_vpickod_w(reinterpret_cast<__m128i>(hi.raw), reinterpret_cast<__m128i>(lo.raw))}); } // Any T x2 template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { return InterleaveUpper(d, lo, hi); } // ------------------------------ ConcatEven // 8-bit full template HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) { return VFromD{__lsx_vpickev_b(hi.raw, lo.raw)}; } // 8-bit x8 template HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) { __m128i _tmp = __lsx_vpickev_b(hi.raw, lo.raw); return VFromD{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; } // 8-bit x4 template HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) { __m128i _tmp = __lsx_vpickev_b(hi.raw, lo.raw); return VFromD{__lsx_vextrins_h(_tmp, _tmp, 0x14)}; } // 16-bit full template HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) { return VFromD{__lsx_vpickev_h(hi.raw, lo.raw)}; } // 16-bit x4 template HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) { __m128i _tmp = __lsx_vpickev_h(hi.raw, lo.raw); return VFromD{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; } // 32-bit full template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { return BitCast( d, Vec128{__lsx_vpickev_w(reinterpret_cast<__m128i>(hi.raw), reinterpret_cast<__m128i>(lo.raw))}); } // Any T x2 template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { return InterleaveLower(d, lo, hi); } template HWY_INLINE Vec128 ConcatEven(Vec128 hi, Vec128 lo) { const DFromV d; const RebindToUnsigned du; return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo))); } // ------------------------------ DupEven (InterleaveLower) template HWY_API Vec128 DupEven(const Vec128 v) { return v; } template HWY_API Vec128 DupEven(const Vec128 v) { __m128i _tmp = __lsx_vpickev_b(v.raw, v.raw); return Vec128{__lsx_vilvl_b(_tmp, _tmp)}; } template HWY_API Vec128 DupEven(const Vec128 v) { const DFromV d; const RebindToUnsigned du; // for float16_t __m128i _tmp = __lsx_vpickev_h(BitCast(du, v).raw, BitCast(du, v).raw); return BitCast(d, VFromD{__lsx_vilvl_h(_tmp, _tmp)}); } template HWY_API Vec128 DupEven(const Vec128 v) { const DFromV d; __m128i _tmp = detail::BitCastToInteger(v.raw); __m128i _tmp1 = __lsx_vpickev_w(_tmp, _tmp); return BitCast(d, Vec128{__lsx_vilvl_w(_tmp1, _tmp1)}); } template HWY_API Vec128 DupEven(Vec128 v) { return InterleaveLower(DFromV(), v, v); } // ------------------------------ DupOdd (InterleaveUpper) template HWY_API Vec128 DupOdd(Vec128 v) { return v; } template HWY_API Vec128 DupOdd(const Vec128 v) { __m128i _tmp = __lsx_vpickod_b(v.raw, v.raw); return Vec128{__lsx_vilvl_b(_tmp, _tmp)}; } template HWY_API Vec128 DupOdd(const Vec128 v) { __m128i _tmp = __lsx_vpickod_h(v.raw, v.raw); return Vec128{__lsx_vilvl_h(_tmp, _tmp)}; } template HWY_API Vec128 DupOdd(const Vec128 v) { const DFromV d; __m128i _tmp = detail::BitCastToInteger(v.raw); __m128i _tmp1 = __lsx_vpickod_w(_tmp, _tmp); return BitCast(d, Vec128{__lsx_vilvl_w(_tmp1, _tmp1)}); } template HWY_API Vec128 DupOdd(Vec128 v) { return InterleaveUpper(DFromV(), v, v); } // ------------------------------ TwoTablesLookupLanes (DupEven) template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { const DFromV d; const Twice dt; const Repartition dt_u8; // TableLookupLanes currently requires table and index vectors to be the same // size, though a half-length index vector would be sufficient here. #if HWY_IS_MSAN const Vec128 idx_vec{idx.raw}; const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; #else // We only keep LowerHalf of the result, which is valid in idx. const Indices128 idx2{idx.raw}; #endif return LowerHalf( d, TableLookupBytes(Combine(dt, b, a), BitCast(dt, VFromD{idx2.raw}))); } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { return Vec128{__lsx_vshuf_b(b.raw, a.raw, idx.raw)}; } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { const DFromV d; const Repartition du8; return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), Indices128{idx.raw})); } // ------------------------------ OddEven template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { __m128i t0 = __lsx_vpackod_b(a.raw, a.raw); return Vec128{__lsx_vpackev_b(t0, b.raw)}; } template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { __m128i t0 = __lsx_vpackod_h(a.raw, a.raw); return Vec128{__lsx_vpackev_h(t0, b.raw)}; } template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; __m128i t0 = __lsx_vpackod_w(BitCast(du, a).raw, BitCast(du, a).raw); return BitCast(d, VFromD{__lsx_vpackev_w(t0, BitCast(du, b).raw)}); } template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{__lsx_vextrins_d( BitCast(du, b).raw, BitCast(du, a).raw, 0x11)}); } // -------------------------- InterleaveEven template HWY_API VFromD InterleaveEven(D /*d*/, VFromD a, VFromD b) { return VFromD{__lsx_vpackev_b(b.raw, a.raw)}; } template HWY_API VFromD InterleaveEven(D /*d*/, VFromD a, VFromD b) { return VFromD{__lsx_vpackev_h(b.raw, a.raw)}; } template HWY_API VFromD InterleaveEven(D d, VFromD a, VFromD b) { const RebindToSigned di; return BitCast(d, VFromD{__lsx_vpackev_w(BitCast(di, b).raw, BitCast(di, a).raw)}); } template HWY_API VFromD InterleaveEven(D d, VFromD a, VFromD b) { const RebindToSigned di; return BitCast(d, VFromD{__lsx_vpackev_d(BitCast(di, b).raw, BitCast(di, a).raw)}); } // -------------------------- InterleaveOdd template HWY_API VFromD InterleaveOdd(D /*d*/, VFromD a, VFromD b) { return VFromD{__lsx_vpackod_b(b.raw, a.raw)}; } template HWY_API VFromD InterleaveOdd(D /*d*/, VFromD a, VFromD b) { return VFromD{__lsx_vpackod_h(b.raw, a.raw)}; } template HWY_API VFromD InterleaveOdd(D d, VFromD a, VFromD b) { const RebindToSigned di; return BitCast(d, VFromD{__lsx_vpackod_w(BitCast(di, b).raw, BitCast(di, a).raw)}); } template HWY_API VFromD InterleaveOdd(D d, VFromD a, VFromD b) { const RebindToSigned di; return BitCast(d, VFromD{__lsx_vpackod_d(BitCast(di, b).raw, BitCast(di, a).raw)}); } // ------------------------------ OddEvenBlocks template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ InterleaveEvenBlocks template , HWY_IF_V_SIZE_LE_D(D, 16)> HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { return a; } // ------------------------------ InterleaveOddBlocks template , HWY_IF_V_SIZE_LE_D(D, 16)> HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { return a; } // ------------------------------ Shl template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsll_b(v.raw, bits.raw)}; } template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsll_h(v.raw, bits.raw)}; } template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsll_w(v.raw, bits.raw)}; } template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsll_d(v.raw, bits.raw)}; } // ------------------------------ Shr namespace detail { template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrl_b(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrl_h(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrl_w(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsrl_d(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsra_b(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsra_h(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsra_w(v.raw, bits.raw)}; } template HWY_API Vec128 Shr(Vec128 v, Vec128 bits) { return Vec128{__lsx_vsra_d(v.raw, bits.raw)}; } } // namespace detail template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { return detail::Shr(v, bits); } // ================================================== CONVERT (2) // ------------------------------ PromoteEvenTo/PromoteOddTo #include "hwy/ops/inside-inl.h" // Generic for all vector lengths. template >> HWY_API VFromD WidenMulPairwiseAdd(DF df, VBF a, VBF b) { return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), Mul(PromoteOddTo(df, a), PromoteOddTo(df, b))); } template >> HWY_API VFromD WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { __m128i _tmp = __lsx_vmulwev_w_h(a.raw, b.raw); return VFromD{__lsx_vmaddwod_w_h(_tmp, a.raw, b.raw)}; } template >> HWY_API VFromD WidenMulPairwiseAdd(DU32 /* tag */, VU16 a, VU16 b) { __m128i _tmp = __lsx_vmulwev_w_hu(a.raw, b.raw); return VFromD{__lsx_vmaddwod_w_hu(_tmp, a.raw, b.raw)}; } // ------------------------------ ReorderWidenMulAccumulate template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b, const VFromD sum0, VFromD& /* sum1 */) { return VFromD{__lsx_vmaddwev_w_h( __lsx_vmaddwod_w_h(sum0.raw, a.raw, b.raw), a.raw, b.raw)}; } template >> HWY_API VFromD ReorderWidenMulAccumulate(DU32 /* tag */, VU16 a, VU16 b, const VFromD sum0, VFromD& /* sum1 */) { return VFromD{__lsx_vmaddwev_w_hu( __lsx_vmaddwod_w_hu(sum0.raw, a.raw, b.raw), a.raw, b.raw)}; } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec128 RearrangeToOddPlusEven(const Vec128 sum0, Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API Vec128 RearrangeToOddPlusEven( const Vec128 sum0, Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { return Add(sum0, sum1); } // ------------------------------ Demotions // NOTE: hwy::EnableIf()>* = nullptr is used instead of // hwy::EnableIf* = nullptr to avoid compiler errors since // !hwy::IsSame() is always false and as !hwy::IsSame() will cause // SFINAE to occur instead of a hard error due to a dependency on the V template // argument #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \ hwy::EnableIf()>* = nullptr template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrani_b_h(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrani_bu_h(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrlni_b_h(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrlni_bu_h(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrani_h_w(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrani_hu_w(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrlni_h_w(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrlni_hu_w(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrani_w_d(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrani_wu_d(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrlni_w_d(v.raw, v.raw, 0)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vssrlni_wu_d(v.raw, v.raw, 0)}; } // UI->UI DemoteTo for the case where // sizeof(TFromD) <= sizeof(TFromV) / 4 is generic for all vector lengths template ) / 4)> HWY_API VFromD DemoteTo(DN dn, V v) { using T = TFromV; using TN = TFromD; using TDemoteTo = MakeNarrow() && IsSigned(), T, MakeUnsigned>>; return DemoteTo(dn, DemoteTo(Rebind(), v)); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vfcvt_h_s(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vfcvt_s_d(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vftintrz_w_d( reinterpret_cast<__m128d>(__lsx_vreplgr2vr_w(0)), v.raw)}; } template HWY_API VFromD DemoteTo(D du32, VFromD> v) { const Rebind du64; return DemoteTo(du32, ConvertTo(du64, v)); } template HWY_API VFromD DemoteTo(D df32, VFromD> v) { const Rebind df64; const RebindToUnsigned du64; const RebindToSigned di32; const RebindToUnsigned du32; const auto k2p64_63 = Set(df64, 27670116110564327424.0); const auto f64_hi52 = Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; const auto f64_lo12 = PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x00000FFF})))); const auto f64_sum = f64_hi52 + f64_lo12; const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; const auto f64_sum_is_inexact = ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); const auto f64_bits_decrement = And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), f64_sum_is_inexact); const auto adj_f64_val = BitCast( df64, Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); return DemoteTo(df32, adj_f64_val); } template HWY_API VFromD DemoteTo(D df32, VFromD> v) { const Rebind df64; const RebindToUnsigned du64; const RebindToSigned di32; const RebindToUnsigned du32; const auto k2p64 = Set(df64, 18446744073709551616.0); const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; const auto f64_lo12 = PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x00000FFF})))); const auto f64_sum = f64_hi52 + f64_lo12; const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; const auto f64_sum_is_inexact = ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); const auto adj_f64_val = BitCast( df64, Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), f64_sum_is_inexact)); return DemoteTo(df32, adj_f64_val); } // ------------------------------ ReorderDemote2To // ReorderDemote2To for 8-byte UI64->UI32, <= 4-byte UI32->UI16, // and <= 4-byte UI16->UI8 template ) <= 2 ? 4 : 8))), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrani_b_h(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrani_bu_h(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrlni_b_h(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrlni_bu_h(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrani_h_w(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrani_hu_w(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrlni_h_w(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrlni_hu_w(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrani_w_d(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrani_wu_d(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrlni_w_d(b.raw, a.raw, 0)}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{__lsx_vssrlni_wu_d(b.raw, a.raw, 0)}; } // 8-byte UI32->UI16 and UI16->UI8 ReorderDemote2To template ) * 2), HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const Twice> dt; const Twice dt_n; const auto demote2_result = ReorderDemote2To(dt_n, ResizeBitCast(dt, a), ResizeBitCast(dt, b)); return VFromD{__lsx_vshuf4i_w(demote2_result.raw, 0x88)}; } template ), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedDemote2To(D d, V a, V b) { return ReorderDemote2To(d, a, b); } template HWY_API Vec128 U8FromU32(const Vec128 v) { const DFromV du32; const Rebind du8; return DemoteTo(du8, BitCast(du32, v)); } // ------------------------------ F32->UI64 PromoteTo // f32 ->i64 template HWY_API VFromD PromoteTo(D /*di64*/, VFromD> v) { return VFromD{__lsx_vftintrzl_l_s(v.raw)}; } // F32->U64 PromoteTo generic for all vector lengths template HWY_API VFromD PromoteTo(D du64, VFromD> v) { const RebindToFloat df64; return ConvertTo(du64, PromoteTo(df64, v)); } // ------------------------------ MulFixedPoint15 template HWY_API Vec128 MulFixedPoint15(const Vec128 a, const Vec128 b) { __m128i temp_ev = __lsx_vmulwev_w_h(a.raw, b.raw); __m128i temp_od = __lsx_vmulwod_w_h(a.raw, b.raw); __m128i temp1 = __lsx_vilvl_w(temp_od, temp_ev); __m128i temp2 = __lsx_vilvh_w(temp_od, temp_ev); return Vec128{__lsx_vssrarni_h_w(temp2, temp1, 15)}; } // ------------------------------ Truncations template HWY_API VFromD TruncateTo(DTo /* tag */, Vec128 v) { const Repartition, DFromV> dto; return VFromD{BitCast(dto, v).raw}; } template HWY_API Vec16 TruncateTo(D /* tag */, Vec128 v) { return Vec16{__lsx_vextrins_b(v.raw, v.raw, 0x18)}; } template HWY_API Vec32 TruncateTo(D /* tag */, Vec128 v) { return Vec32{__lsx_vextrins_h(v.raw, v.raw, 0x14)}; } template HWY_API Vec64 TruncateTo(D /* tag */, Vec128 v) { return Vec64{__lsx_vpickev_w(v.raw, v.raw)}; } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { __m128i v_ev = __lsx_vpickev_b(v.raw, v.raw); return VFromD{__lsx_vpickev_b(v_ev, v_ev)}; } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vpickev_h(v.raw, v.raw)}; } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vpickev_b(v.raw, v.raw)}; } // ------------------------------ int -> float ConvertTo template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vffint_s_w(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vffint_s_wu(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vffint_d_l(v.raw)}; } // ------------------------------ float -> int ConvertTo template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vffint_d_lu(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vftintrz_w_s(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vftintrz_wu_s(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vftintrz_l_d(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{__lsx_vftintrz_lu_d(v.raw)}; } // ------------------------------ NearestInt (Round) template HWY_API Vec128 NearestInt(const Vec128 v) { return Vec128{__lsx_vftintrne_w_s(v.raw)}; } template HWY_API Vec128 NearestInt(const Vec128 v) { return Vec128{__lsx_vftintrne_l_d(v.raw)}; } template HWY_API VFromD DemoteToNearestInt(DI32 di32, VFromD> v) { return DemoteTo(di32, NearestInt(v)); } // ------------------------------ Floating-point rounding template HWY_API Vec128 Round(const Vec128 v) { return Vec128{__lsx_vfrintrne_s(v.raw)}; } template HWY_API Vec128 Round(const Vec128 v) { return Vec128{__lsx_vfrintrne_d(v.raw)}; } template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{__lsx_vfrintrz_s(v.raw)}; } template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{__lsx_vfrintrz_d(v.raw)}; } template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{__lsx_vfrintrp_s(v.raw)}; } template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{__lsx_vfrintrp_d(v.raw)}; } // Toward -infinity, aka floor template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{__lsx_vfrintrm_s(v.raw)}; } template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{__lsx_vfrintrm_d(v.raw)}; } // ------------------------------ Floating-point classification // FIXME: disable gcc-14 tree-based loop optimizations to prevent // 'HighwayTestGroup/HighwayTest.TestAllIsNaN/LSX' failures #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG #pragma GCC push_options #pragma GCC optimize("-fno-tree-loop-optimize") #endif template HWY_API Mask128 IsNaN(const Vec128 v) { return Mask128{ reinterpret_cast<__m128>(__lsx_vfcmp_cune_s(v.raw, v.raw))}; } template HWY_API Mask128 IsNaN(const Vec128 v) { return Mask128{ reinterpret_cast<__m128d>(__lsx_vfcmp_cune_d(v.raw, v.raw))}; } #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG #pragma GCC pop_options #endif #ifdef HWY_NATIVE_IS_EITHER_NAN #undef HWY_NATIVE_IS_EITHER_NAN #else #define HWY_NATIVE_IS_EITHER_NAN #endif template HWY_API Mask128 IsEitherNaN(Vec128 a, Vec128 b) { return Mask128{ reinterpret_cast<__m128>(__lsx_vfcmp_cun_s(a.raw, b.raw))}; } template HWY_API Mask128 IsEitherNaN(Vec128 a, Vec128 b) { __m128i _tmp = __lsx_vor_v(__lsx_vfcmp_cune_d(a.raw, a.raw), __lsx_vfcmp_cune_d(b.raw, b.raw)); return Mask128{reinterpret_cast<__m128d>(_tmp)}; } #ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif template HWY_API MFromD> IsInf(V v) { using T = TFromV; static_assert(IsFloat(), "Only for float"); using TU = MakeUnsigned; const DFromV d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and // mantissa=0. return RebindMask( d, Eq(Add(vu, vu), Set(du, static_cast(hwy::MaxExponentTimes2())))); } // Returns whether normal/subnormal/zero. template HWY_API MFromD> IsFinite(V v) { using T = TFromV; static_assert(IsFloat(), "Only for float"); using TU = MakeUnsigned; const DFromV d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent(hwy::MaxExponentTimes2())))); } // ================================================== MISC // ------------------------------ LoadMaskBits (TestBit) namespace detail { template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; // Easier than Set(), which would require an >8-bit type, which would not // compile for T=uint8_t, N=1. const VFromD vbits{__lsx_vreplgr2vr_w(static_cast(bits))}; // Replicate bytes 8x such that each byte contains the bit that governs it. alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } } // namespace detail template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits); return detail::LoadMaskBits(d, mask_bits); } // ------------------------------ Dup128MaskFromMaskBits template HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { constexpr size_t kN = MaxLanes(d); if (kN < 8) mask_bits &= (1u << kN) - 1; return detail::LoadMaskBits(d, mask_bits); } template struct CompressIsPartition { enum { value = (sizeof(T) != 1) }; }; // ------------------------------ BitsFromMask namespace detail { template constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) { return (d.MaxBytes() >= 16) ? mask_bits : mask_bits & ((1ull << d.MaxLanes()) - 1); } constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { return static_cast(static_cast(mask_bits)); } } // namespace detail template HWY_API uint64_t BitsFromMask(D d, MFromD mask) { return detail::OnlyActive( d, detail::U64FromInt(__lsx_vpickve2gr_w(__lsx_vmskltz_b(mask.raw), 0))); } template HWY_API uint64_t BitsFromMask(D d, MFromD mask) { return detail::OnlyActive( d, detail::U64FromInt(__lsx_vpickve2gr_w(__lsx_vmskltz_h(mask.raw), 0))); } template HWY_API uint64_t BitsFromMask(D d, MFromD mask) { return detail::OnlyActive( d, detail::U64FromInt(__lsx_vpickve2gr_w( __lsx_vmskltz_w(reinterpret_cast<__m128i>(mask.raw)), 0))); } template HWY_API uint64_t BitsFromMask(D d, MFromD mask) { return detail::OnlyActive( d, detail::U64FromInt(__lsx_vpickve2gr_w( __lsx_vmskltz_d(reinterpret_cast<__m128i>(mask.raw)), 0))); } // ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; const uint64_t mask_bits = BitsFromMask(d, mask); CopyBytes(&mask_bits, bits); return kNumBytes; } template HWY_API bool AllFalse(D d, MFromD mask) { return BitsFromMask(d, mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); constexpr uint64_t kAllBits = (1ull << kN) - 1; return BitsFromMask(d, mask) == kAllBits; } template HWY_API size_t CountTrue(D d, MFromD mask) { return PopCount(BitsFromMask(d, mask)); } template HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask)); } template HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { const uint64_t mask_bits = BitsFromMask(d, mask); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { return 31 - Num0BitsAboveMS1Bit_Nonzero32( static_cast(BitsFromMask(d, mask))); } template HWY_API intptr_t FindLastTrue(D d, MFromD mask) { const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } // ------------------------------ Compress, CompressBits namespace detail { // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Rebind d8; const Twice d8t; const RebindToUnsigned du; alignas(16) static constexpr uint8_t table[2048] = { // PrintCompress16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; const VFromD pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Rebind d8; const Twice d8t; const RebindToUnsigned du; alignas(16) static constexpr uint8_t table[2048] = { // PrintCompressNot16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; const VFromD pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[256] = { // PrintCompress32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[256] = { // PrintCompressNot32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[64] = { // PrintCompress64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[64] = { // PrintCompressNot64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_API Vec128 CompressBits(Vec128 v, uint64_t mask_bits) { const DFromV d; const RebindToUnsigned du; HWY_DASSERT(mask_bits < (1ull << N)); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); } template HWY_API Vec128 CompressNotBits(Vec128 v, uint64_t mask_bits) { const DFromV d; const RebindToUnsigned du; HWY_DASSERT(mask_bits < (1ull << N)); const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); } } // namespace detail // Single lane: no-op template HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. const DFromV d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskL, maskH); return IfVecThenElse(swap, Shuffle01(v), v); } // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { const DFromV d; return detail::CompressBits(v, BitsFromMask(d, mask)); } // ------------------------------ CompressNot // Single lane: no-op template HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. const DFromV d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskH, maskL); return IfVecThenElse(swap, Shuffle01(v), v); } template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { const DFromV d; // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { return detail::CompressBits(v, BitsFromMask(d, Not(mask))); } return detail::CompressNotBits(v, BitsFromMask(d, mask)); } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::CompressBits(v, mask_bits); } // ------------------------------ CompressStore, CompressBitsStore template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; const uint64_t mask_bits = BitsFromMask(d, m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); StoreU(compressed, d, unaligned); detail::MaybeUnpoison(unaligned, count); return count; } template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; const uint64_t mask_bits = BitsFromMask(d, m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); BlendedStore(compressed, FirstN(d, count), d, unaligned); detail::MaybeUnpoison(unaligned, count); return count; } template HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; uint64_t mask_bits = 0; constexpr size_t kN = MaxLanes(d); constexpr size_t kNumBytes = (kN + 7) / 8; CopyBytes(bits, &mask_bits); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } const size_t count = PopCount(mask_bits); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); StoreU(compressed, d, unaligned); detail::MaybeUnpoison(unaligned, count); return count; } // ------------------------------ StoreInterleaved2/3/4 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in // generic_ops-inl.h. // ------------------------------ Additional mask logical operations template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const FixedTag d; const auto vmask = VecFromMask(d, mask); return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Simd d; const auto vmask = VecFromMask(d, mask); const auto neg_vmask = ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); return MaskFromVec(Or(vmask, neg_vmask)); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Full128 d; const Repartition di64; auto vmask = BitCast(di64, VecFromMask(d, mask)); VFromD neg_vmask{__lsx_vsub_q(Zero(di64).raw, vmask.raw)}; return MaskFromVec(BitCast(d, Or(vmask, neg_vmask))); } template HWY_API Mask128 SetBeforeFirst(Mask128 mask) { return Not(SetAtOrAfterFirst(mask)); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const FixedTag d; const RebindToSigned di; const auto vmask = BitCast(di, VecFromMask(d, mask)); const auto zero = Zero(di); const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); return MaskFromVec(BitCast(d, And(vmask, vmask2))); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Simd d; const RebindToSigned di; const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); const auto only_first_vmask = BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); return MaskFromVec(only_first_vmask); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Full128 d; const RebindToSigned di; auto vmask = BitCast(di, VecFromMask(d, mask)); VFromD neg_vmask{__lsx_vsub_q(Zero(di).raw, vmask.raw)}; return MaskFromVec(BitCast(d, Neg(And(vmask, neg_vmask)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { const FixedTag d; const RebindToSigned di; using TI = MakeSigned; return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { const Simd d; return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); } // ------------------------------ Reductions #undef HWY_IF_SUM_OF_LANES_D #define HWY_IF_SUM_OF_LANES_D(D) \ HWY_IF_LANES_GT_D(D, 1), \ hwy::EnableIf, uint8_t>() || \ (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \ nullptr // ------------------------------ SumOfLanes template HWY_API VFromD SumOfLanes(D d, VFromD v) { return Set(d, static_cast(GetLane(SumsOf8(v)) & 0xFF)); } template HWY_API VFromD SumOfLanes(D d, VFromD v) { const Repartition d64; VFromD sums = SumsOf8(v); sums = SumOfLanes(d64, sums); return Broadcast<0>(BitCast(d, sums)); } // ------------------------------ Lt128 namespace detail { // Returns vector-mask for Lt128. Generic for all vector lengths. template HWY_INLINE VFromD Lt128Vec(const D d, VFromD a, VFromD b) { // Truth table of Eq and Lt for Hi and Lo u64. // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) // =H =L cH cL | out = cH | (=H & cL) // 0 0 0 0 | 0 // 0 0 0 1 | 0 // 0 0 1 0 | 1 // 0 0 1 1 | 1 // 0 1 0 0 | 0 // 0 1 0 1 | 0 // 0 1 1 0 | 1 // 1 0 0 0 | 0 // 1 0 0 1 | 1 // 1 1 0 0 | 0 const auto eqHL = Eq(a, b); const VFromD ltHL = VecFromMask(d, Lt(a, b)); const VFromD ltLX = ShiftLeftLanes<1>(ltHL); const VFromD vecHx = IfThenElse(eqHL, ltLX, ltHL); return InterleaveUpper(d, vecHx, vecHx); } // Returns vector-mask for Eq128. Generic for all vector lengths. template HWY_INLINE VFromD Eq128Vec(D d, VFromD a, VFromD b) { const auto eqHL = VecFromMask(d, Eq(a, b)); const auto eqLH = Reverse2(d, eqHL); return And(eqHL, eqLH); } template HWY_INLINE VFromD Ne128Vec(D d, VFromD a, VFromD b) { const auto neHL = VecFromMask(d, Ne(a, b)); const auto neLH = Reverse2(d, neHL); return Or(neHL, neLH); } template HWY_INLINE VFromD Lt128UpperVec(D d, VFromD a, VFromD b) { const VFromD ltHL = VecFromMask(d, Lt(a, b)); return InterleaveUpper(d, ltHL, ltHL); } template HWY_INLINE VFromD Eq128UpperVec(D d, VFromD a, VFromD b) { const VFromD eqHL = VecFromMask(d, Eq(a, b)); return InterleaveUpper(d, eqHL, eqHL); } template HWY_INLINE VFromD Ne128UpperVec(D d, VFromD a, VFromD b) { const VFromD neHL = VecFromMask(d, Ne(a, b)); return InterleaveUpper(d, neHL, neHL); } } // namespace detail template HWY_API MFromD Lt128(D d, VFromD a, VFromD b) { return MaskFromVec(detail::Lt128Vec(d, a, b)); } template HWY_API MFromD Eq128(D d, VFromD a, VFromD b) { return MaskFromVec(detail::Eq128Vec(d, a, b)); } template HWY_API MFromD Ne128(D d, VFromD a, VFromD b) { return MaskFromVec(detail::Ne128Vec(d, a, b)); } template HWY_API MFromD Lt128Upper(D d, VFromD a, VFromD b) { return MaskFromVec(detail::Lt128UpperVec(d, a, b)); } template HWY_API MFromD Eq128Upper(D d, VFromD a, VFromD b) { return MaskFromVec(detail::Eq128UpperVec(d, a, b)); } template HWY_API MFromD Ne128Upper(D d, VFromD a, VFromD b) { return MaskFromVec(detail::Ne128UpperVec(d, a, b)); } // ------------------------------ Min128, Max128 (Lt128) // Avoids the extra MaskFromVec in Lt128. template HWY_API VFromD Min128(D d, VFromD a, VFromD b) { return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); } template HWY_API VFromD Max128(D d, VFromD a, VFromD b) { return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); } template HWY_API VFromD Min128Upper(D d, VFromD a, VFromD b) { return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); } template HWY_API VFromD Max128Upper(D d, VFromD a, VFromD b) { return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); } // -------------------- LeadingZeroCount, TrailingZeroCount, // HighestSetBitIndex #ifdef HWY_NATIVE_LEADING_ZERO_COUNT #undef HWY_NATIVE_LEADING_ZERO_COUNT #else #define HWY_NATIVE_LEADING_ZERO_COUNT #endif template ), HWY_IF_V_SIZE_LE_D(DFromV, 16)> HWY_API V LeadingZeroCount(V v) { return V{__lsx_vclz_b(v.raw)}; } template ), HWY_IF_V_SIZE_LE_D(DFromV, 16)> HWY_API V LeadingZeroCount(V v) { return V{__lsx_vclz_h(v.raw)}; } template ), HWY_IF_V_SIZE_LE_D(DFromV, 16)> HWY_API V LeadingZeroCount(V v) { return V{__lsx_vclz_w(v.raw)}; } template ), HWY_IF_V_SIZE_LE_D(DFromV, 16)> HWY_API V LeadingZeroCount(V v) { return V{__lsx_vclz_d(v.raw)}; } template HWY_API V HighestSetBitIndex(V v) { const DFromV d; using T = TFromD; return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); } template HWY_API V TrailingZeroCount(V v) { const DFromV d; const RebindToSigned di; using T = TFromD; const auto lsb = And(v, BitCast(d, Neg(BitCast(di, v)))); return IfThenElse(Eq(v, Zero(d)), Set(d, T{sizeof(T) * 8}), HighestSetBitIndex(lsb)); } } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #undef HWY_LSX_IF_EMULATED_D