Update simd.h

This commit is contained in:
2025-06-04 16:49:26 -04:00
parent b28251c8b1
commit d6d5e210e4

View File

@@ -28,8 +28,8 @@ template <std::integral T, int kLanes> struct simd {
return (simd<UnsignedT, kLanes> const &)*this;
}
static simd<T, kLanes> splat(T t) {
simd<T, kLanes> result;
static simd splat(T t) {
simd result;
for (int i = 0; i < kLanes; ++i) {
result.x[i] = t;
}
@@ -50,84 +50,78 @@ template <std::integral T, int kLanes> struct simd {
return (~(*this)).count_leading_zero_lanes();
}
simd<T, kLanes> operator==(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise equality: each result lane is all-ones (T(-1)) where the two
// inputs match and all-zeros where they differ.
simd operator==(simd const &other) const {
  simd result;
  for (int lane = 0; lane != kLanes; ++lane) {
    bool const equal = (x[lane] == other.x[lane]);
    result.x[lane] = equal ? T(-1) : T(0);
  }
  return result;
}
simd<T, kLanes> operator!=(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise inequality: all-ones (T(-1)) where lanes differ, zero where
// they match.
simd operator!=(simd const &other) const {
  simd result;
  for (int lane = 0; lane != kLanes; ++lane) {
    result.x[lane] = (x[lane] != other.x[lane]) ? T(-1) : T(0);
  }
  return result;
}
simd<T, kLanes> operator<(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise less-than: all-ones (T(-1)) mask where x[lane] < other.x[lane],
// zero otherwise. Comparison semantics follow T's signedness.
simd operator<(simd const &other) const {
  simd result;
  for (int lane = 0; lane != kLanes; ++lane) {
    bool const below = (x[lane] < other.x[lane]);
    result.x[lane] = below ? T(-1) : T(0);
  }
  return result;
}
simd<T, kLanes> operator>(simd<T, kLanes> const &other) const {
return other < *this;
}
// Lanewise greater-than, defined via operator< with the operands swapped.
simd operator>(simd const &other) const { return other < *this; }
simd<T, kLanes> operator<=(simd<T, kLanes> const &other) const {
return ~(*this > other);
}
// Lanewise <=: the bitwise complement of the > mask (valid because compare
// results are all-ones/all-zero lane masks).
simd operator<=(simd const &other) const { return ~(*this > other); }
simd<T, kLanes> operator>=(simd<T, kLanes> const &other) const {
return ~(*this < other);
}
// Lanewise >=: the bitwise complement of the < mask.
simd operator>=(simd const &other) const { return ~(*this < other); }
simd<T, kLanes> operator-(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise wrapping subtraction. The arithmetic is performed in UnsignedT
// so overflow wraps modulo 2^N instead of being signed-overflow UB, then
// the bits are converted back to T.
simd operator-(simd const &other) const {
simd result;
for (int i = 0; i < kLanes; ++i) {
result.x[i] = T(UnsignedT(x[i]) - UnsignedT(other.x[i]));
}
return result;
}
simd<T, kLanes> operator+(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise wrapping addition. Done in UnsignedT to get well-defined
// modulo-2^N wraparound (signed overflow would be UB).
simd operator+(simd const &other) const {
simd result;
for (int i = 0; i < kLanes; ++i) {
result.x[i] = T(UnsignedT(x[i]) + UnsignedT(other.x[i]));
}
return result;
}
simd<T, kLanes> operator|(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise bitwise OR of the two vectors.
simd operator|(simd const &other) const {
  simd result;
  for (int lane = 0; lane != kLanes; ++lane) {
    result.x[lane] = x[lane] | other.x[lane];
  }
  return result;
}
simd<T, kLanes> operator&(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise bitwise AND of the two vectors.
simd operator&(simd const &other) const {
simd result;
for (int i = 0; i < kLanes; ++i) {
result.x[i] = x[i] & other.x[i];
}
return result;
}
simd<T, kLanes> operator^(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
// Lanewise bitwise XOR of the two vectors.
simd operator^(simd const &other) const {
simd result;
for (int i = 0; i < kLanes; ++i) {
result.x[i] = x[i] ^ other.x[i];
}
return result;
}
simd<T, kLanes> operator~() const {
simd<T, kLanes> result;
simd operator~() const {
simd result;
for (int i = 0; i < kLanes; ++i) {
result.x[i] = ~x[i];
}
@@ -143,7 +137,25 @@ private:
#include <immintrin.h>
namespace sse {
template <std::integral T, int kLanes> struct simd {
// x86 SIMD instruction-set tiers, ordered so that `kInstructionSet >= tier`
// comparisons enable every code path available at or below the selected
// tier (SSE < AVX < AVX2 < AVX512).
enum InstructionSet {
Simd_x86_SSE,
Simd_x86_AVX,
Simd_x86_AVX2,
Simd_x86_AVX512,
};
template <std::integral T, int kLanes,
InstructionSet kInstructionSet =
// AVX512 not enabled by default
#if defined(__AVX2__)
Simd_x86_AVX2
#elif defined(__AVX__)
Simd_x86_AVX
#else
Simd_x86_SSE
#endif
>
struct simd {
using SignedT = std::make_signed_t<T>;
using UnsignedT = std::make_unsigned_t<T>;
@@ -153,18 +165,41 @@ template <std::integral T, int kLanes> struct simd {
explicit simd(const UnsignedT *t) { memcpy(x, t, sizeof(x)); }
explicit simd(const SignedT *t) { memcpy(x, t, sizeof(x)); }
simd<SignedT, kLanes> const &as_signed() const {
return (simd<SignedT, kLanes> const &)*this;
// Reinterpret this vector as its signed-element counterpart (no copy).
// NOTE(review): the declared return type must be the cast target
// simd<SignedT, ...>; declaring it `simd const &` (i.e. simd<T, ...>)
// fails to compile whenever T is unsigned, since SignedT != T there.
simd<SignedT, kLanes, kInstructionSet> const &as_signed() const {
  return (simd<SignedT, kLanes, kInstructionSet> const &)*this;
}
simd<UnsignedT, kLanes> const &as_unsigned() const {
return (simd<UnsignedT, kLanes> const &)*this;
// Reinterpret this vector as its unsigned-element counterpart (no copy).
// NOTE(review): return type fixed to match the cast target
// simd<UnsignedT, ...>; `simd const &` (simd<T, ...>) does not compile
// when T is signed, since UnsignedT != T there.
simd<UnsignedT, kLanes, kInstructionSet> const &as_unsigned() const {
  return (simd<UnsignedT, kLanes, kInstructionSet> const &)*this;
}
static simd<T, kLanes> splat(T t) {
simd<T, kLanes> result;
static simd splat(T t) {
simd result;
int i = 0;
#ifdef __AVX__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 64 <= kLanes; i += 64) {
auto v = _mm512_set1_epi8(t);
memcpy(&result.x[i], &v, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint16_t>) {
for (; i + 32 <= kLanes; i += 32) {
auto v = _mm512_set1_epi16(t);
memcpy(&result.x[i], &v, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint32_t>) {
for (; i + 16 <= kLanes; i += 16) {
auto v = _mm512_set1_epi32(t);
memcpy(&result.x[i], &v, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint64_t>) {
for (; i + 8 <= kLanes; i += 8) {
auto v = _mm512_set1_epi64(t);
memcpy(&result.x[i], &v, 64);
}
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 32 <= kLanes; i += 32) {
auto v = _mm256_set1_epi8(t);
@@ -186,7 +221,7 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&result.x[i], &v, 32);
}
}
#endif
}
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
auto v = _mm_set1_epi8(t);
@@ -216,7 +251,50 @@ template <std::integral T, int kLanes> struct simd {
int count_leading_zero_lanes() const {
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 64 <= kLanes; i += 64) {
__m512i v;
memcpy(&v, &x[i], 64);
auto lz =
std::countr_one(_mm512_cmpeq_epi8_mask(_mm512_set1_epi8(0), v));
if (lz < 64) {
return i + lz;
}
}
} else if constexpr (std::is_same_v<UnsignedT, uint16_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m512i v;
memcpy(&v, &x[i], 64);
auto lz =
std::countr_one(_mm512_cmpeq_epi16_mask(_mm512_set1_epi16(0), v));
if (lz < 32) {
return i + lz;
}
}
} else if constexpr (std::is_same_v<UnsignedT, uint32_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m512i v;
memcpy(&v, &x[i], 64);
auto lz =
std::countr_one(_mm512_cmpeq_epi32_mask(_mm512_set1_epi32(0), v));
if (lz < 16) {
return i + lz;
}
}
} else if constexpr (std::is_same_v<UnsignedT, uint64_t>) {
for (; i + 8 <= kLanes; i += 8) {
__m512i v;
memcpy(&v, &x[i], 64);
auto lz =
std::countr_one(_mm512_cmpeq_epi64_mask(_mm512_set1_epi64(0), v));
if (lz < 8) {
return i + lz;
}
}
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m256i v;
@@ -261,7 +339,7 @@ template <std::integral T, int kLanes> struct simd {
}
}
}
#endif
}
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m128i v;
@@ -318,10 +396,53 @@ template <std::integral T, int kLanes> struct simd {
return (~(*this)).count_leading_zero_lanes();
}
simd<T, kLanes> operator==(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator==(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 64 <= kLanes; i += 64) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpeq_epi8_mask(v0, v1);
v1 = _mm512_maskz_set1_epi8(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint16_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpeq_epi16_mask(v0, v1);
v1 = _mm512_maskz_set1_epi16(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint32_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpeq_epi32_mask(v0, v1);
v1 = _mm512_maskz_set1_epi32(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint64_t>) {
for (; i + 8 <= kLanes; i += 8) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpeq_epi64_mask(v0, v1);
v1 = _mm512_maskz_set1_epi64(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m256i v0;
@@ -359,7 +480,7 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&result.x[i], &v1, 32);
}
}
#endif
}
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m128i v0;
@@ -403,14 +524,95 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator!=(simd<T, kLanes> const &other) const {
return ~(*this == other);
}
// Lanewise inequality: bitwise complement of the == mask (valid because
// compare results are all-ones/all-zero lane masks).
simd operator!=(simd const &other) const { return ~(*this == other); }
simd<T, kLanes> operator<(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator<(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
if constexpr (std::is_same_v<T, int8_t>) {
for (; i + 64 <= kLanes; i += 64) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epi8_mask(v1, v0);
v1 = _mm512_maskz_set1_epi8(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, uint8_t>) {
for (; i + 64 <= kLanes; i += 64) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epu8_mask(v1, v0);
v1 = _mm512_maskz_set1_epi8(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, int16_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epi16_mask(v1, v0);
v1 = _mm512_maskz_set1_epi16(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, uint16_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epu16_mask(v1, v0);
v1 = _mm512_maskz_set1_epi16(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, int32_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epi32_mask(v1, v0);
v1 = _mm512_maskz_set1_epi32(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, uint32_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epu32_mask(v1, v0);
v1 = _mm512_maskz_set1_epi32(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, int64_t>) {
for (; i + 8 <= kLanes; i += 8) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epi64_mask(v1, v0);
v1 = _mm512_maskz_set1_epi64(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<T, uint64_t>) {
for (; i + 8 <= kLanes; i += 8) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
auto k = _mm512_cmpgt_epu64_mask(v1, v0);
v1 = _mm512_maskz_set1_epi64(k, T(-1));
memcpy(&result.x[i], &v1, 64);
}
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
if constexpr (std::is_same_v<T, int8_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m256i v0;
@@ -445,7 +647,8 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&v0, &x[i], 32);
__m256i v1;
memcpy(&v1, &other.x[i], 32);
v1 = _mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0),
v1 =
_mm256_xor_si256(_mm256_cmpeq_epi16(_mm256_max_epu16(v0, v1), v0),
_mm256_set1_epi8(0xff));
memcpy(&result.x[i], &v1, 32);
}
@@ -464,7 +667,8 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&v0, &x[i], 32);
__m256i v1;
memcpy(&v1, &other.x[i], 32);
v1 = _mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0),
v1 =
_mm256_xor_si256(_mm256_cmpeq_epi32(_mm256_max_epu32(v0, v1), v0),
_mm256_set1_epi8(0xff));
memcpy(&result.x[i], &v1, 32);
}
@@ -478,7 +682,7 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&result.x[i], &v1, 32);
}
}
#endif
}
if constexpr (std::is_same_v<T, int8_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m128i v0;
@@ -552,22 +756,55 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator>(simd<T, kLanes> const &other) const {
return other < *this;
}
// Lanewise greater-than via operator< with the operands swapped.
simd operator>(simd const &other) const { return other < *this; }
simd<T, kLanes> operator<=(simd<T, kLanes> const &other) const {
return ~(*this > other);
}
// Lanewise <=: the bitwise complement of the > mask.
simd operator<=(simd const &other) const { return ~(*this > other); }
simd<T, kLanes> operator>=(simd<T, kLanes> const &other) const {
return ~(*this < other);
}
// Lanewise >=: the bitwise complement of the < mask.
simd operator>=(simd const &other) const { return ~(*this < other); }
simd<T, kLanes> operator-(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator-(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 64 <= kLanes; i += 64) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_sub_epi8(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint16_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_sub_epi16(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint32_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_sub_epi32(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint64_t>) {
for (; i + 8 <= kLanes; i += 8) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_sub_epi64(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m256i v0;
@@ -605,7 +842,7 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&result.x[i], &v1, 32);
}
}
#endif
}
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m128i v0;
@@ -649,10 +886,49 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator+(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator+(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 64 <= kLanes; i += 64) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_add_epi8(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint16_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_add_epi16(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint32_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_add_epi32(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
} else if constexpr (std::is_same_v<UnsignedT, uint64_t>) {
for (; i + 8 <= kLanes; i += 8) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_add_epi64(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 32 <= kLanes; i += 32) {
__m256i v0;
@@ -690,7 +966,7 @@ template <std::integral T, int kLanes> struct simd {
memcpy(&result.x[i], &v1, 32);
}
}
#endif
}
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
__m128i v0;
@@ -734,10 +1010,20 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator|(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator|(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_or_si512(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) {
__m256i v0;
memcpy(&v0, &x[i], 32);
@@ -746,7 +1032,7 @@ template <std::integral T, int kLanes> struct simd {
v1 = _mm256_or_si256(v0, v1);
memcpy(&result.x[i], &v1, 32);
}
#endif
}
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
__m128i v0;
memcpy(&v0, &x[i], 16);
@@ -761,10 +1047,20 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator&(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator&(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_and_si512(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) {
__m256i v0;
memcpy(&v0, &x[i], 32);
@@ -773,7 +1069,7 @@ template <std::integral T, int kLanes> struct simd {
v1 = _mm256_and_si256(v0, v1);
memcpy(&result.x[i], &v1, 32);
}
#endif
}
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
__m128i v0;
memcpy(&v0, &x[i], 16);
@@ -788,10 +1084,20 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator^(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator^(simd const &other) const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) {
__m512i v0;
memcpy(&v0, &x[i], 64);
__m512i v1;
memcpy(&v1, &other.x[i], 64);
v1 = _mm512_xor_si512(v0, v1);
memcpy(&result.x[i], &v1, 64);
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) {
__m256i v0;
memcpy(&v0, &x[i], 32);
@@ -800,7 +1106,7 @@ template <std::integral T, int kLanes> struct simd {
v1 = _mm256_xor_si256(v0, v1);
memcpy(&result.x[i], &v1, 32);
}
#endif
}
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
__m128i v0;
memcpy(&v0, &x[i], 16);
@@ -815,17 +1121,25 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator~() const {
simd<T, kLanes> result;
simd operator~() const {
simd result;
int i = 0;
#ifdef __AVX2__
if constexpr (kInstructionSet >= Simd_x86_AVX512) {
for (; i + 64 / sizeof(T) <= kLanes; i += 64 / sizeof(T)) {
__m512i v0;
memcpy(&v0, &x[i], 64);
v0 = _mm512_xor_si512(v0, _mm512_set1_epi8(0xff));
memcpy(&result.x[i], &v0, 64);
}
}
if constexpr (kInstructionSet >= Simd_x86_AVX2) {
for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) {
__m256i v0;
memcpy(&v0, &x[i], 32);
v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff));
memcpy(&result.x[i], &v0, 32);
}
#endif
}
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
__m128i v0;
memcpy(&v0, &x[i], 16);
@@ -869,8 +1183,8 @@ template <std::integral T, int kLanes> struct simd {
return (simd<UnsignedT, kLanes> const &)*this;
}
static simd<T, kLanes> splat(T t) {
simd<T, kLanes> result;
static simd splat(T t) {
simd result;
int i = 0;
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
@@ -955,8 +1269,8 @@ template <std::integral T, int kLanes> struct simd {
return (~(*this)).count_leading_zero_lanes();
}
simd<T, kLanes> operator==(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator==(simd const &other) const {
simd result;
int i = 0;
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
@@ -1001,12 +1315,10 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator!=(simd<T, kLanes> const &other) const {
return ~(*this == other);
}
// Lanewise inequality: bitwise complement of the == mask.
simd operator!=(simd const &other) const { return ~(*this == other); }
simd<T, kLanes> operator<(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator<(simd const &other) const {
simd result;
int i = 0;
if constexpr (std::is_same_v<T, int8_t>) {
for (; i + 16 <= kLanes; i += 16) {
@@ -1087,20 +1399,14 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator>(simd<T, kLanes> const &other) const {
return other < *this;
}
// Lanewise greater-than via operator< with the operands swapped.
simd operator>(simd const &other) const { return other < *this; }
simd<T, kLanes> operator<=(simd<T, kLanes> const &other) const {
return ~(*this > other);
}
// Lanewise <=: the bitwise complement of the > mask.
simd operator<=(simd const &other) const { return ~(*this > other); }
simd<T, kLanes> operator>=(simd<T, kLanes> const &other) const {
return ~(*this < other);
}
// Lanewise >=: the bitwise complement of the < mask.
simd operator>=(simd const &other) const { return ~(*this < other); }
simd<T, kLanes> operator-(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator-(simd const &other) const {
simd result;
int i = 0;
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
@@ -1145,8 +1451,8 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator+(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator+(simd const &other) const {
simd result;
int i = 0;
if constexpr (std::is_same_v<UnsignedT, uint8_t>) {
for (; i + 16 <= kLanes; i += 16) {
@@ -1191,8 +1497,8 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator|(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator|(simd const &other) const {
simd result;
int i = 0;
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
int8x16_t v0;
@@ -1208,8 +1514,8 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator&(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator&(simd const &other) const {
simd result;
int i = 0;
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
int8x16_t v0;
@@ -1225,8 +1531,8 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator^(simd<T, kLanes> const &other) const {
simd<T, kLanes> result;
simd operator^(simd const &other) const {
simd result;
int i = 0;
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
int8x16_t v0;
@@ -1242,8 +1548,8 @@ template <std::integral T, int kLanes> struct simd {
return result;
}
simd<T, kLanes> operator~() const {
simd<T, kLanes> result;
simd operator~() const {
simd result;
int i = 0;
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
int8x16_t v0;