Use simple loop for longestCommonPrefixPartialKey
The benchmark shows this to be faster
This commit is contained in:
@@ -720,42 +720,13 @@ bytes:
|
||||
|
||||
int longestCommonPrefixPartialKey(const uint8_t *ap, const uint8_t *bp,
|
||||
int cl) {
|
||||
if (cl > Node::kPartialKeyMaxLen) {
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
}
|
||||
return longestCommonPrefix(ap, bp, cl);
|
||||
#if 0
|
||||
static_assert(Node::kPartialKeyMaxLen == 16);
|
||||
// SOMEDAY: use masked loads (requires avx-512/sve2)
|
||||
#if defined(HAS_AVX)
|
||||
__m128i a;
|
||||
memcpy(&a, ap, cl);
|
||||
__m128i b;
|
||||
memcpy(&b, bp, cl);
|
||||
const auto compared = _mm_cmpeq_epi8(a, b);
|
||||
int mask = (1 << cl) - 1;
|
||||
auto c = = _mm_movemask_epi8(compared) & mask;
|
||||
return std::countr_zero(~c);
|
||||
#elif defined(HAS_ARM_NEON)
|
||||
uint8x16_t a;
|
||||
memcpy(&a, ap, cl);
|
||||
uint8x16_t b;
|
||||
memcpy(&b, bp, cl);
|
||||
uint16x8_t results = vreinterpretq_u16_u8(vceqq_u8(a, b));
|
||||
uint64_t mask = cl == 16 ? uint64_t(-1) : (uint64_t(1) << (cl * 4)) - 1;
|
||||
uint64_t bitfield =
|
||||
vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) & mask;
|
||||
return std::countr_zero(~bitfield) >> 2;
|
||||
#else
|
||||
int i = 0;
|
||||
for (; i < 16; ++i) {
|
||||
for (; i < cl; ++i) {
|
||||
if (*ap++ != *bp++) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
// Performs a physical search for remaining
|
||||
|
Reference in New Issue
Block a user