From f7871915e93579912f56f0b30d78164e87a98d82 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 5 Jun 2025 15:34:10 -0400 Subject: [PATCH] Update simd.h --- src/simd.h | 2748 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 2152 insertions(+), 596 deletions(-) diff --git a/src/simd.h b/src/simd.h index 2461bab..1ebdc59 100644 --- a/src/simd.h +++ b/src/simd.h @@ -14,6 +14,7 @@ template struct simd { using SignedT = std::make_signed_t; using UnsignedT = std::make_unsigned_t; + static constexpr int lanes = kLanes; simd() = default; @@ -144,82 +145,481 @@ enum InstructionSet { Simd_x86_AVX512, }; -template -struct simd { + ; + +template +struct simd {}; + +template struct simd { using SignedT = std::make_signed_t; using UnsignedT = std::make_unsigned_t; + static constexpr int lanes = kLanes; + static constexpr auto kInstructionSet = Simd_x86_SSE; simd() = default; - explicit simd(const UnsignedT *t) { memcpy(x, t, sizeof(x)); } - explicit simd(const SignedT *t) { memcpy(x, t, sizeof(x)); } - - simd const &as_signed() const { - return (simd const &)*this; + __attribute__((always_inline)) explicit simd(const UnsignedT *t) { + memcpy(x, t, sizeof(x)); + } + __attribute__((always_inline)) explicit simd(const SignedT *t) { + memcpy(x, t, sizeof(x)); } - simd const &as_unsigned() const { - return (simd const &)*this; + __attribute__((always_inline)) simd const &as_signed() const { + return (simd const &)*this; } - static simd splat(T t) { + __attribute__((always_inline)) simd const &as_unsigned() const { + return (simd const &)*this; + } + + __attribute__((always_inline)) static simd splat(T t) { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - auto v = _mm512_set1_epi8(t); - memcpy(&result.x[i], &v, 64); + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm_set1_epi8(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm_set1_epi16(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm_set1_epi32(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + auto v = _mm_set1_epi64x(t); + memcpy(&result.x[i], &v, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = t; + } + return result; + } + + __attribute__((always_inline)) int count_leading_zero_lanes() const { + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one( + uint32_t(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(0), v)))); + if (lz < 16) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - auto v = _mm512_set1_epi16(t); - memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi16(_mm_set1_epi16(0), v)))) / + 2; + if (lz < 8) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - auto v = _mm512_set1_epi32(t); - memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi32(_mm_set1_epi32(0), v)))) / + 4; + if (lz < 4) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - auto v = _mm512_set1_epi64(t); - memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi64(_mm_set1_epi64x(0), v)))) / + 8; + if (lz < 2) { + return i + lz; } } } - if constexpr (kInstructionSet >= Simd_x86_AVX) { - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - auto v = _mm256_set1_epi8(t); - memcpy(&result.x[i], &v, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - auto v = _mm256_set1_epi16(t); - memcpy(&result.x[i], &v, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - auto v = _mm256_set1_epi32(t); - memcpy(&result.x[i], &v, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - auto v = _mm256_set1_epi64x(t); - memcpy(&result.x[i], &v, 32); - } + for (; i < kLanes; ++i) { + if (x[i]) { + break; + } + } + return i; + } + + __attribute__((always_inline)) int count_leading_nonzero_lanes() const { + return (~(*this)).count_leading_zero_lanes(); + } + + __attribute__((always_inline)) simd operator==(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] == other.x[i] ? T(-1) : T(0); + } + return result; + } + + __attribute__((always_inline)) simd operator!=(simd const &other) const { + return ~(*this == other); + } + + __attribute__((always_inline)) simd operator<(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi8(_mm_max_epu8(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi16(_mm_max_epu16(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi32(_mm_max_epu32(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpgt_epi64(v1, v0); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] < other.x[i] ? T(-1) : T(0); + } + return result; + } + + __attribute__((always_inline)) simd operator>(simd const &other) const { + return other < *this; + } + + __attribute__((always_inline)) simd operator<=(simd const &other) const { + return ~(*this > other); + } + + __attribute__((always_inline)) simd operator>=(simd const &other) const { + return ~(*this < other); + } + + __attribute__((always_inline)) simd operator-(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = T(UnsignedT(x[i]) - UnsignedT(other.x[i])); + } + return result; + } + + __attribute__((always_inline)) simd operator+(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = T(UnsignedT(x[i]) + UnsignedT(other.x[i])); + } + return result; + } + + __attribute__((always_inline)) simd operator|(simd const &other) const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_or_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = other.x[i] | x[i]; + } + return result; + } + + __attribute__((always_inline)) simd operator&(simd const &other) const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_and_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] & other.x[i]; + } + return result; + } + + __attribute__((always_inline)) simd operator^(simd const &other) const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] ^ other.x[i]; + } + return result; + } + + __attribute__((always_inline)) simd operator~() const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + v0 = _mm_xor_si128(v0, _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = ~x[i]; + } + return result; + } + +private: + T x[kLanes]; +}; + +template struct simd { + + using SignedT = std::make_signed_t; + using UnsignedT = std::make_unsigned_t; + static constexpr int lanes = kLanes; + static constexpr auto kInstructionSet = Simd_x86_AVX; + + simd() = default; + + __attribute__((always_inline, + target("avx"))) explicit simd(const UnsignedT *t) { + memcpy(x, t, sizeof(x)); + } + __attribute__((always_inline, + target("avx"))) explicit simd(const SignedT *t) { + memcpy(x, t, sizeof(x)); + } + + __attribute__((always_inline, target("avx"))) simd const &as_signed() const { + return (simd const &)*this; + } + + __attribute__((always_inline, target("avx"))) simd const & + as_unsigned() const { + return (simd const &)*this; + } + + __attribute__((always_inline, target("avx"))) static simd splat(T t) { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + auto v = _mm256_set1_epi8(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm256_set1_epi16(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm256_set1_epi32(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm256_set1_epi64x(t); + memcpy(&result.x[i], &v, 32); } } if constexpr (std::is_same_v) { @@ -249,94 +649,499 @@ struct simd { return result; } - int count_leading_zero_lanes() const { + __attribute__((always_inline, target("avx"))) int + count_leading_zero_lanes() const { int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - __m512i v; - memcpy(&v, &x[i], 64); - auto lz = - std::countr_one(_mm512_cmpeq_epi8_mask(_mm512_set1_epi8(0), v)); - if (lz < 64) { - return i + lz; - } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one( + uint32_t(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(0), v)))); + if (lz < 16) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m512i v; - memcpy(&v, &x[i], 64); - auto lz = - std::countr_one(_mm512_cmpeq_epi16_mask(_mm512_set1_epi16(0), v)); - if (lz < 32) { - return i + lz; - } + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi16(_mm_set1_epi16(0), v)))) / + 2; + if (lz < 8) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m512i v; - memcpy(&v, &x[i], 64); - auto lz = - std::countr_one(_mm512_cmpeq_epi32_mask(_mm512_set1_epi32(0), v)); - if (lz < 16) { - return i + lz; - } + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi32(_mm_set1_epi32(0), v)))) / + 4; + if (lz < 4) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m512i v; - memcpy(&v, &x[i], 64); - auto lz = - std::countr_one(_mm512_cmpeq_epi64_mask(_mm512_set1_epi64(0), v)); - if (lz < 8) { - return i + lz; - } + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi64(_mm_set1_epi64x(0), v)))) / + 8; + if (lz < 2) { + return i + lz; } } } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t( - _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_set1_epi8(0), v)))); - if (lz < 32) { - return i + lz; - } + for (; i < kLanes; ++i) { + if (x[i]) { + break; + } + } + return i; + } + + __attribute__((always_inline, target("avx"))) int + count_leading_nonzero_lanes() const { + return (~(*this)).count_leading_zero_lanes(); + } + + __attribute__((always_inline, target("avx"))) simd + operator==(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] == other.x[i] ? T(-1) : T(0); + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd + operator!=(simd const &other) const { + return ~(*this == other); + } + + __attribute__((always_inline, target("avx"))) simd + operator<(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi8(_mm_max_epu8(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi16(_mm_max_epu16(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi32(_mm_max_epu32(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpgt_epi64(v1, v0); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] < other.x[i] ? T(-1) : T(0); + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd + operator>(simd const &other) const { + return other < *this; + } + + __attribute__((always_inline, target("avx"))) simd + operator<=(simd const &other) const { + return ~(*this > other); + } + + __attribute__((always_inline, target("avx"))) simd + operator>=(simd const &other) const { + return ~(*this < other); + } + + __attribute__((always_inline, target("avx"))) simd + operator-(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = T(UnsignedT(x[i]) - UnsignedT(other.x[i])); + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd + operator+(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = T(UnsignedT(x[i]) + UnsignedT(other.x[i])); + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd + operator|(simd const &other) const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_or_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = other.x[i] | x[i]; + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd + operator&(simd const &other) const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_and_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] & other.x[i]; + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd + operator^(simd const &other) const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] ^ other.x[i]; + } + return result; + } + + __attribute__((always_inline, target("avx"))) simd operator~() const { + simd result; + int i = 0; + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + v0 = _mm_xor_si128(v0, _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = ~x[i]; + } + return result; + } + +private: + T x[kLanes]; +}; + +template struct simd { + + using SignedT = std::make_signed_t; + using UnsignedT = std::make_unsigned_t; + static constexpr int lanes = kLanes; + static constexpr auto kInstructionSet = Simd_x86_AVX2; + + simd() = default; + + __attribute__((always_inline, + target("avx2"))) explicit simd(const UnsignedT *t) { + memcpy(x, t, sizeof(x)); + } + __attribute__((always_inline, + target("avx2"))) explicit simd(const SignedT *t) { + memcpy(x, t, sizeof(x)); + } + + __attribute__((always_inline, target("avx2"))) simd const &as_signed() const { + return (simd const &)*this; + } + + __attribute__((always_inline, target("avx2"))) simd const & + as_unsigned() const { + return (simd const &)*this; + } + + __attribute__((always_inline, target("avx2"))) static simd splat(T t) { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + auto v = _mm256_set1_epi8(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm256_set1_epi16(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm256_set1_epi32(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm256_set1_epi64x(t); + memcpy(&result.x[i], &v, 32); + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm_set1_epi8(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm_set1_epi16(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm_set1_epi32(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + auto v = _mm_set1_epi64x(t); + memcpy(&result.x[i], &v, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = t; + } + return result; + } + + __attribute__((always_inline, target("avx2"))) int + count_leading_zero_lanes() const { + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t( + _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_set1_epi8(0), v)))); + if (lz < 32) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( - _mm256_cmpeq_epi16(_mm256_set1_epi16(0), v)))) / - 2; - if (lz < 16) { - return i + lz; - } + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi16(_mm256_set1_epi16(0), v)))) / + 2; + if (lz < 16) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( - _mm256_cmpeq_epi32(_mm256_set1_epi32(0), v)))) / - 4; - if (lz < 8) { - return i + lz; - } + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(_mm256_set1_epi32(0), v)))) / + 4; + if (lz < 8) { + return i + lz; } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( - _mm256_cmpeq_epi64(_mm256_set1_epi64x(0), v)))) / - 8; - if (lz < 4) { - return i + lz; - } + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi64(_mm256_set1_epi64x(0), v)))) / + 8; + if (lz < 4) { + return i + lz; } } } @@ -392,93 +1197,50 @@ struct simd { return i; } - int count_leading_nonzero_lanes() const { + __attribute__((always_inline, target("avx2"))) int + count_leading_nonzero_lanes() const { return (~(*this)).count_leading_zero_lanes(); } - simd operator==(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator==(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpeq_epi8_mask(v0, v1); - v1 = _mm512_maskz_set1_epi8(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpeq_epi16_mask(v0, v1); - v1 = _mm512_maskz_set1_epi16(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpeq_epi32_mask(v0, v1); - v1 = _mm512_maskz_set1_epi32(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpeq_epi64_mask(v0, v1); - v1 = _mm512_maskz_set1_epi64(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi8(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi16(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi32(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi64(v0, v1); - memcpy(&result.x[i], &v1, 32); - } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); } } if constexpr (std::is_same_v) { @@ -524,163 +1286,80 @@ struct simd { return result; } - simd operator!=(simd const &other) const { return ~(*this == other); } + __attribute__((always_inline, target("avx2"))) simd + operator!=(simd const &other) const { + return ~(*this == other); + } - simd operator<(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator<(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epi8_mask(v1, v0); - v1 = _mm512_maskz_set1_epi8(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epu8_mask(v1, v0); - v1 = _mm512_maskz_set1_epi8(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epi16_mask(v1, v0); - v1 = _mm512_maskz_set1_epi16(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epu16_mask(v1, v0); - v1 = _mm512_maskz_set1_epi16(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epi32_mask(v1, v0); - v1 = _mm512_maskz_set1_epi32(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epu32_mask(v1, v0); - v1 = _mm512_maskz_set1_epi32(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epi64_mask(v1, v0); - v1 = _mm512_maskz_set1_epi64(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - auto k = _mm512_cmpgt_epu64_mask(v1, v0); - v1 = _mm512_maskz_set1_epi64(k, T(-1)); - memcpy(&result.x[i], &v1, 64); - } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi8(v1, v0); + memcpy(&result.x[i], &v1, 32); } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi8(v1, v0); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_xor_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(v0, v1), v0), - _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi16(v1, v0); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = - _mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0), - _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi32(v1, v0); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = - _mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0), - _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi64(v1, v0); - memcpy(&result.x[i], &v1, 32); - } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi16(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi32(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi64(v1, v0); + memcpy(&result.x[i], &v1, 32); } } if constexpr (std::is_same_v) { @@ -756,91 +1435,60 @@ struct simd { return result; } - simd operator>(simd const &other) const { return other < *this; } + __attribute__((always_inline, target("avx2"))) simd + operator>(simd const &other) const { + return other < *this; + } - simd operator<=(simd const &other) const { return ~(*this > other); } + __attribute__((always_inline, target("avx2"))) simd + operator<=(simd const &other) const { + return ~(*this > other); + } - simd operator>=(simd const &other) const { return ~(*this < other); } + __attribute__((always_inline, target("avx2"))) simd + operator>=(simd const &other) const { + return ~(*this < other); + } - simd operator-(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator-(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_sub_epi8(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_sub_epi16(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_sub_epi32(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_sub_epi64(v0, v1); - memcpy(&result.x[i], &v1, 64); - } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi8(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi16(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi32(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi64(v0, v1); - memcpy(&result.x[i], &v1, 32); - } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); } } if constexpr (std::is_same_v) { @@ -886,85 +1534,45 @@ struct simd { return result; } - simd operator+(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator+(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - if constexpr (std::is_same_v) { - for (; i + 64 <= kLanes; i += 64) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_add_epi8(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_add_epi16(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_add_epi32(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_add_epi64(v0, v1); - memcpy(&result.x[i], &v1, 64); - } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi8(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi16(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi32(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi64(v0, v1); - memcpy(&result.x[i], &v1, 32); - } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); } } if constexpr (std::is_same_v) { @@ -1010,28 +1618,17 @@ struct simd { return result; } - simd operator|(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator|(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_or_si512(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_or_si256(v0, v1); - memcpy(&result.x[i], &v1, 32); - } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_or_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); } for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; @@ -1047,28 +1644,17 @@ struct simd { return result; } - simd operator&(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator&(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_and_si512(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_and_si256(v0, v1); - memcpy(&result.x[i], &v1, 32); - } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_and_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); } for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; @@ -1084,28 +1670,17 @@ struct simd { return result; } - simd operator^(simd const &other) const { + __attribute__((always_inline, target("avx2"))) simd + operator^(simd const &other) const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { - __m512i v0; - memcpy(&v0, &x[i], 64); - __m512i v1; - memcpy(&v1, &other.x[i], 64); - v1 = _mm512_xor_si512(v0, v1); - memcpy(&result.x[i], &v1, 64); - } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_xor_si256(v0, v1); - memcpy(&result.x[i], &v1, 32); - } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); } for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; @@ -1121,24 +1696,14 @@ struct simd { return result; } - simd operator~() const { + __attribute__((always_inline, target("avx2"))) simd operator~() const { simd result; int i = 0; - if constexpr (kInstructionSet >= Simd_x86_AVX512) { - for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { - __m512i v0; - memcpy(&v0, &x[i], 64); - v0 = _mm512_xor_si512(v0, _mm512_set1_epi8(0xff)); - memcpy(&result.x[i], &v0, 64); - } - } - if constexpr (kInstructionSet >= Simd_x86_AVX2) { - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v0, 32); - } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 32); } for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; @@ -1155,6 +1720,996 @@ struct simd { private: T x[kLanes]; }; + +template struct simd { + + using SignedT = std::make_signed_t; + using UnsignedT = std::make_unsigned_t; + static constexpr int lanes = kLanes; + static constexpr auto kInstructionSet = Simd_x86_AVX512; + + simd() = default; + + __attribute((always_inline, + target("avx512bw"))) explicit simd(const UnsignedT *t) { + memcpy(x, t, sizeof(x)); + } + __attribute((always_inline, + target("avx512bw"))) explicit simd(const SignedT *t) { + memcpy(x, t, sizeof(x)); + } + + __attribute((always_inline, target("avx512bw"))) simd const & + as_signed() const { + return (simd const &)*this; + } + + simd const &as_unsigned() const { + return (simd const &)*this; + } + + __attribute((always_inline, target("avx512bw"))) static simd splat(T t) { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + auto v = _mm512_set1_epi8(t); + memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + auto v = _mm512_set1_epi16(t); + memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm512_set1_epi32(t); + memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm512_set1_epi64(t); + memcpy(&result.x[i], &v, 64); + } + } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + auto v = _mm256_set1_epi8(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm256_set1_epi16(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm256_set1_epi32(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm256_set1_epi64x(t); + memcpy(&result.x[i], &v, 32); + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm_set1_epi8(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm_set1_epi16(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm_set1_epi32(t); + memcpy(&result.x[i], &v, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + auto v = _mm_set1_epi64x(t); + memcpy(&result.x[i], &v, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = t; + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) int + count_leading_zero_lanes() const { + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi8_mask(_mm512_set1_epi8(0), v)); + if (lz < 64) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi16_mask(_mm512_set1_epi16(0), v)); + if (lz < 32) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi32_mask(_mm512_set1_epi32(0), v)); + if (lz < 16) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi64_mask(_mm512_set1_epi64(0), v)); + if (lz < 8) { + return i + lz; + } + } + } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t( + _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_set1_epi8(0), v)))); + if (lz < 32) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi16(_mm256_set1_epi16(0), v)))) / + 2; + if (lz < 16) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(_mm256_set1_epi32(0), v)))) / + 4; + if (lz < 8) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi64(_mm256_set1_epi64x(0), v)))) / + 8; + if (lz < 4) { + return i + lz; + } + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one( + uint32_t(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(0), v)))); + if (lz < 16) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi16(_mm_set1_epi16(0), v)))) / + 2; + if (lz < 8) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi32(_mm_set1_epi32(0), v)))) / + 4; + if (lz < 4) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v; + memcpy(&v, &x[i], 16); + auto lz = std::countr_one(uint32_t(_mm_movemask_epi8( + _mm_cmpeq_epi64(_mm_set1_epi64x(0), v)))) / + 8; + if (lz < 2) { + return i + lz; + } + } + } + for (; i < kLanes; ++i) { + if (x[i]) { + break; + } + } + return i; + } + + __attribute((always_inline, target("avx512bw"))) int + count_leading_nonzero_lanes() const { + return (~(*this)).count_leading_zero_lanes(); + } + + __attribute((always_inline, target("avx512bw"))) simd + operator==(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi8_mask(v0, v1); + v1 = _mm512_maskz_set1_epi8(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi16_mask(v0, v1); + v1 = _mm512_maskz_set1_epi16(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi32_mask(v0, v1); + v1 = _mm512_maskz_set1_epi32(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi64_mask(v0, v1); + v1 = _mm512_maskz_set1_epi64(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpeq_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] == other.x[i] ? T(-1) : T(0); + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator!=(simd const &other) const { + return ~(*this == other); + } + + __attribute((always_inline, target("avx512bw"))) simd + operator<(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi8_mask(v1, v0); + v1 = _mm512_maskz_set1_epi8(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu8_mask(v1, v0); + v1 = _mm512_maskz_set1_epi8(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi16_mask(v1, v0); + v1 = _mm512_maskz_set1_epi16(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu16_mask(v1, v0); + v1 = _mm512_maskz_set1_epi16(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi32_mask(v1, v0); + v1 = _mm512_maskz_set1_epi32(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu32_mask(v1, v0); + v1 = _mm512_maskz_set1_epi32(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi64_mask(v1, v0); + v1 = _mm512_maskz_set1_epi64(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu64_mask(v1, v0); + v1 = _mm512_maskz_set1_epi64(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi8(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi16(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi32(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi64(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi8(_mm_max_epu8(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi16(_mm_max_epu16(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmplt_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(_mm_cmpeq_epi32(_mm_max_epu32(v0, v1), v0), + _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_cmpgt_epi64(v1, v0); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] < other.x[i] ? T(-1) : T(0); + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator>(simd const &other) const { + return other < *this; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator<=(simd const &other) const { + return ~(*this > other); + } + + __attribute((always_inline, target("avx512bw"))) simd + operator>=(simd const &other) const { + return ~(*this < other); + } + + __attribute((always_inline, target("avx512bw"))) simd + operator-(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = T(UnsignedT(x[i]) - UnsignedT(other.x[i])); + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator+(simd const &other) const { + simd result; + int i = 0; + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } + if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } else if constexpr (std::is_same_v) { + for (; i + 2 <= kLanes; i += 2) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + } + for (; i < kLanes; ++i) { + result.x[i] = T(UnsignedT(x[i]) + UnsignedT(other.x[i])); + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator|(simd const &other) const { + simd result; + int i = 0; + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_or_si512(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_or_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_or_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = other.x[i] | x[i]; + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator&(simd const &other) const { + simd result; + int i = 0; + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_and_si512(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_and_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_and_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] & other.x[i]; + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd + operator^(simd const &other) const { + simd result; + int i = 0; + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_xor_si512(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + __m128i v1; + memcpy(&v1, &other.x[i], 16); + v1 = _mm_xor_si128(v0, v1); + memcpy(&result.x[i], &v1, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = x[i] ^ other.x[i]; + } + return result; + } + + __attribute((always_inline, target("avx512bw"))) simd operator~() const { + simd result; + int i = 0; + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + v0 = _mm512_xor_si512(v0, _mm512_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 64); + } + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 32); + } + for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { + __m128i v0; + memcpy(&v0, &x[i], 16); + v0 = _mm_xor_si128(v0, _mm_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 16); + } + for (; i < kLanes; ++i) { + result.x[i] = ~x[i]; + } + return result; + } + +private: + T x[kLanes]; +}; + } // namespace sse using namespace sse; @@ -1169,6 +2724,7 @@ template struct simd { using SignedT = std::make_signed_t; using UnsignedT = std::make_unsigned_t; + static constexpr int lanes = kLanes; simd() = default;