diff --git a/src/simd.h b/src/simd.h index 47cf8cb..2461bab 100644 --- a/src/simd.h +++ b/src/simd.h @@ -28,8 +28,8 @@ template struct simd { return (simd const &)*this; } - static simd splat(T t) { - simd result; + static simd splat(T t) { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = t; } @@ -50,84 +50,78 @@ template struct simd { return (~(*this)).count_leading_zero_lanes(); } - simd operator==(simd const &other) const { - simd result; + simd operator==(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = x[i] == other.x[i] ? T(-1) : T(0); } return result; } - simd operator!=(simd const &other) const { - simd result; + simd operator!=(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = x[i] != other.x[i] ? T(-1) : T(0); } return result; } - simd operator<(simd const &other) const { - simd result; + simd operator<(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = x[i] < other.x[i] ? T(-1) : T(0); } return result; } - simd operator>(simd const &other) const { - return other < *this; - } + simd operator>(simd const &other) const { return other < *this; } - simd operator<=(simd const &other) const { - return ~(*this > other); - } + simd operator<=(simd const &other) const { return ~(*this > other); } - simd operator>=(simd const &other) const { - return ~(*this < other); - } + simd operator>=(simd const &other) const { return ~(*this < other); } - simd operator-(simd const &other) const { - simd result; + simd operator-(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = T(UnsignedT(x[i]) - UnsignedT(other.x[i])); } return result; } - simd operator+(simd const &other) const { - simd result; + simd operator+(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = T(UnsignedT(x[i]) + UnsignedT(other.x[i])); } return result; } - simd operator|(simd const &other) const { - simd result; + simd operator|(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = other.x[i] | x[i]; } return result; } - simd operator&(simd const &other) const { - simd result; + simd operator&(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = x[i] & other.x[i]; } return result; } - simd operator^(simd const &other) const { - simd result; + simd operator^(simd const &other) const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = x[i] ^ other.x[i]; } return result; } - simd operator~() const { - simd result; + simd operator~() const { + simd result; for (int i = 0; i < kLanes; ++i) { result.x[i] = ~x[i]; } @@ -143,7 +137,25 @@ private: #include namespace sse { -template struct simd { +enum InstructionSet { + Simd_x86_SSE, + Simd_x86_AVX, + Simd_x86_AVX2, + Simd_x86_AVX512, +}; + +template +struct simd { using SignedT = std::make_signed_t; using UnsignedT = std::make_unsigned_t; @@ -153,40 +165,63 @@ template struct simd { explicit simd(const UnsignedT *t) { memcpy(x, t, sizeof(x)); } explicit simd(const SignedT *t) { memcpy(x, t, sizeof(x)); } - simd const &as_signed() const { - return (simd const &)*this; + simd const &as_signed() const { + return (simd const &)*this; } - simd const &as_unsigned() const { - return (simd const &)*this; + simd const &as_unsigned() const { + return (simd const &)*this; } - static simd splat(T t) { - simd result; + static simd splat(T t) { + simd result; int i = 0; -#ifdef __AVX__ - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - auto v = _mm256_set1_epi8(t); - memcpy(&result.x[i], &v, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - auto v = _mm256_set1_epi16(t); - memcpy(&result.x[i], &v, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - auto v = _mm256_set1_epi32(t); - memcpy(&result.x[i], &v, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - auto v = _mm256_set1_epi64x(t); - memcpy(&result.x[i], &v, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + auto v = _mm512_set1_epi8(t); + memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + auto v = _mm512_set1_epi16(t); + memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm512_set1_epi32(t); + memcpy(&result.x[i], &v, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm512_set1_epi64(t); + memcpy(&result.x[i], &v, 64); + } + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX) { + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + auto v = _mm256_set1_epi8(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + auto v = _mm256_set1_epi16(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + auto v = _mm256_set1_epi32(t); + memcpy(&result.x[i], &v, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + auto v = _mm256_set1_epi64x(t); + memcpy(&result.x[i], &v, 32); + } } } -#endif if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { auto v = _mm_set1_epi8(t); @@ -216,52 +251,95 @@ template struct simd { int count_leading_zero_lanes() const { int i = 0; -#ifdef __AVX2__ - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t( - _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_set1_epi8(0), v)))); - if (lz < 32) { - return i + lz; + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi8_mask(_mm512_set1_epi8(0), v)); + if (lz < 64) { + return i + lz; + } } - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( - _mm256_cmpeq_epi16(_mm256_set1_epi16(0), v)))) / - 2; - if (lz < 16) { - return i + lz; + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi16_mask(_mm512_set1_epi16(0), v)); + if (lz < 32) { + return i + lz; + } } - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( - _mm256_cmpeq_epi32(_mm256_set1_epi32(0), v)))) / - 4; - if (lz < 8) { - return i + lz; + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi32_mask(_mm512_set1_epi32(0), v)); + if (lz < 16) { + return i + lz; + } } - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v; - memcpy(&v, &x[i], 32); - auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( - _mm256_cmpeq_epi64(_mm256_set1_epi64x(0), v)))) / - 8; - if (lz < 4) { - return i + lz; + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v; + memcpy(&v, &x[i], 64); + auto lz = + std::countr_one(_mm512_cmpeq_epi64_mask(_mm512_set1_epi64(0), v)); + if (lz < 8) { + return i + lz; + } + } + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t( + _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_set1_epi8(0), v)))); + if (lz < 32) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi16(_mm256_set1_epi16(0), v)))) / + 2; + if (lz < 16) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(_mm256_set1_epi32(0), v)))) / + 4; + if (lz < 8) { + return i + lz; + } + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v; + memcpy(&v, &x[i], 32); + auto lz = std::countr_one(uint32_t(_mm256_movemask_epi8( + _mm256_cmpeq_epi64(_mm256_set1_epi64x(0), v)))) / + 8; + if (lz < 4) { + return i + lz; + } } } } -#endif if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { __m128i v; @@ -318,48 +396,91 @@ template struct simd { return (~(*this)).count_leading_zero_lanes(); } - simd operator==(simd const &other) const { - simd result; + simd operator==(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi8(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi16(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi32(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpeq_epi64(v0, v1); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi8_mask(v0, v1); + v1 = _mm512_maskz_set1_epi8(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi16_mask(v0, v1); + v1 = _mm512_maskz_set1_epi16(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi32_mask(v0, v1); + v1 = _mm512_maskz_set1_epi32(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpeq_epi64_mask(v0, v1); + v1 = _mm512_maskz_set1_epi64(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpeq_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); + } } } -#endif if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { __m128i v0; @@ -403,82 +524,165 @@ template struct simd { return result; } - simd operator!=(simd const &other) const { - return ~(*this == other); - } + simd operator!=(simd const &other) const { return ~(*this == other); } - simd operator<(simd const &other) const { - simd result; + simd operator<(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi8(v1, v0); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_xor_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(v0, v1), v0), - _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi16(v1, v0); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0), - _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi32(v1, v0); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0), - _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_cmpgt_epi64(v1, v0); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi8_mask(v1, v0); + v1 = _mm512_maskz_set1_epi8(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu8_mask(v1, v0); + v1 = _mm512_maskz_set1_epi8(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi16_mask(v1, v0); + v1 = _mm512_maskz_set1_epi16(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu16_mask(v1, v0); + v1 = _mm512_maskz_set1_epi16(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi32_mask(v1, v0); + v1 = _mm512_maskz_set1_epi32(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu32_mask(v1, v0); + v1 = _mm512_maskz_set1_epi32(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epi64_mask(v1, v0); + v1 = _mm512_maskz_set1_epi64(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + auto k = _mm512_cmpgt_epu64_mask(v1, v0); + v1 = _mm512_maskz_set1_epi64(k, T(-1)); + memcpy(&result.x[i], &v1, 64); + } + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi8(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(_mm256_cmpeq_epi8(_mm256_max_epu8(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi16(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = + _mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi32(v1, v0); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = + _mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0), + _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_cmpgt_epi64(v1, v0); + memcpy(&result.x[i], &v1, 32); + } } } -#endif if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { __m128i v0; @@ -552,60 +756,93 @@ template struct simd { return result; } - simd operator>(simd const &other) const { - return other < *this; - } + simd operator>(simd const &other) const { return other < *this; } - simd operator<=(simd const &other) const { - return ~(*this > other); - } + simd operator<=(simd const &other) const { return ~(*this > other); } - simd operator>=(simd const &other) const { - return ~(*this < other); - } + simd operator>=(simd const &other) const { return ~(*this < other); } - simd operator-(simd const &other) const { - simd result; + simd operator-(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi8(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi16(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi32(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_sub_epi64(v0, v1); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_sub_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); + } } } -#endif if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { __m128i v0; @@ -649,48 +886,87 @@ template struct simd { return result; } - simd operator+(simd const &other) const { - simd result; + simd operator+(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - if constexpr (std::is_same_v) { - for (; i + 32 <= kLanes; i += 32) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi8(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 16 <= kLanes; i += 16) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi16(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 8 <= kLanes; i += 8) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi32(v0, v1); - memcpy(&result.x[i], &v1, 32); - } - } else if constexpr (std::is_same_v) { - for (; i + 4 <= kLanes; i += 4) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_add_epi64(v0, v1); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + if constexpr (std::is_same_v) { + for (; i + 64 <= kLanes; i += 64) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + if constexpr (std::is_same_v) { + for (; i + 32 <= kLanes; i += 32) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi8(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 16 <= kLanes; i += 16) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi16(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 8 <= kLanes; i += 8) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi32(v0, v1); + memcpy(&result.x[i], &v1, 32); + } + } else if constexpr (std::is_same_v) { + for (; i + 4 <= kLanes; i += 4) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_add_epi64(v0, v1); + memcpy(&result.x[i], &v1, 32); + } } } -#endif if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { __m128i v0; @@ -734,19 +1010,29 @@ template struct simd { return result; } - simd operator|(simd const &other) const { - simd result; + simd operator|(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_or_si256(v0, v1); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_or_si512(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_or_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); + } } -#endif for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; memcpy(&v0, &x[i], 16); @@ -761,19 +1047,29 @@ template struct simd { return result; } - simd operator&(simd const &other) const { - simd result; + simd operator&(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_and_si256(v0, v1); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_and_si512(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_and_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); + } } -#endif for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; memcpy(&v0, &x[i], 16); @@ -788,19 +1084,29 @@ template struct simd { return result; } - simd operator^(simd const &other) const { - simd result; + simd operator^(simd const &other) const { + simd result; int i = 0; -#ifdef __AVX2__ - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - __m256i v1; - memcpy(&v1, &other.x[i], 32); - v1 = _mm256_xor_si256(v0, v1); - memcpy(&result.x[i], &v1, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + __m512i v1; + memcpy(&v1, &other.x[i], 64); + v1 = _mm512_xor_si512(v0, v1); + memcpy(&result.x[i], &v1, 64); + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + __m256i v1; + memcpy(&v1, &other.x[i], 32); + v1 = _mm256_xor_si256(v0, v1); + memcpy(&result.x[i], &v1, 32); + } } -#endif for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; memcpy(&v0, &x[i], 16); @@ -815,17 +1121,25 @@ template struct simd { return result; } - simd operator~() const { - simd result; + simd operator~() const { + simd result; int i = 0; -#ifdef __AVX2__ - for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { - __m256i v0; - memcpy(&v0, &x[i], 32); - v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff)); - memcpy(&result.x[i], &v0, 32); + if constexpr (kInstructionSet >= Simd_x86_AVX512) { + for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) { + __m512i v0; + memcpy(&v0, &x[i], 64); + v0 = _mm512_xor_si512(v0, _mm512_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 64); + } + } + if constexpr (kInstructionSet >= Simd_x86_AVX2) { + for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) { + __m256i v0; + memcpy(&v0, &x[i], 32); + v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff)); + memcpy(&result.x[i], &v0, 32); + } } -#endif for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { __m128i v0; memcpy(&v0, &x[i], 16); @@ -869,8 +1183,8 @@ template struct simd { return (simd const &)*this; } - static simd splat(T t) { - simd result; + static simd splat(T t) { + simd result; int i = 0; if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { @@ -955,8 +1269,8 @@ template struct simd { return (~(*this)).count_leading_zero_lanes(); } - simd operator==(simd const &other) const { - simd result; + simd operator==(simd const &other) const { + simd result; int i = 0; if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { @@ -1001,12 +1315,10 @@ template struct simd { return result; } - simd operator!=(simd const &other) const { - return ~(*this == other); - } + simd operator!=(simd const &other) const { return ~(*this == other); } - simd operator<(simd const &other) const { - simd result; + simd operator<(simd const &other) const { + simd result; int i = 0; if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { @@ -1087,20 +1399,14 @@ template struct simd { return result; } - simd operator>(simd const &other) const { - return other < *this; - } + simd operator>(simd const &other) const { return other < *this; } - simd operator<=(simd const &other) const { - return ~(*this > other); - } + simd operator<=(simd const &other) const { return ~(*this > other); } - simd operator>=(simd const &other) const { - return ~(*this < other); - } + simd operator>=(simd const &other) const { return ~(*this < other); } - simd operator-(simd const &other) const { - simd result; + simd operator-(simd const &other) const { + simd result; int i = 0; if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { @@ -1145,8 +1451,8 @@ template struct simd { return result; } - simd operator+(simd const &other) const { - simd result; + simd operator+(simd const &other) const { + simd result; int i = 0; if constexpr (std::is_same_v) { for (; i + 16 <= kLanes; i += 16) { @@ -1191,8 +1497,8 @@ template struct simd { return result; } - simd operator|(simd const &other) const { - simd result; + simd operator|(simd const &other) const { + simd result; int i = 0; for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { int8x16_t v0; @@ -1208,8 +1514,8 @@ template struct simd { return result; } - simd operator&(simd const &other) const { - simd result; + simd operator&(simd const &other) const { + simd result; int i = 0; for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { int8x16_t v0; @@ -1225,8 +1531,8 @@ template struct simd { return result; } - simd operator^(simd const &other) const { - simd result; + simd operator^(simd const &other) const { + simd result; int i = 0; for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { int8x16_t v0; @@ -1242,8 +1548,8 @@ template struct simd { return result; } - simd operator~() const { - simd result; + simd operator~() const { + simd result; int i = 0; for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) { int8x16_t v0;