Use simple loop for longestCommonPrefixPartialKey
The benchmark shows this to be faster.
This commit is contained in:
@@ -720,42 +720,13 @@ bytes:
|
|||||||
|
|
||||||
int longestCommonPrefixPartialKey(const uint8_t *ap, const uint8_t *bp,
|
int longestCommonPrefixPartialKey(const uint8_t *ap, const uint8_t *bp,
|
||||||
int cl) {
|
int cl) {
|
||||||
if (cl > Node::kPartialKeyMaxLen) {
|
|
||||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
|
||||||
}
|
|
||||||
return longestCommonPrefix(ap, bp, cl);
|
|
||||||
#if 0
|
|
||||||
static_assert(Node::kPartialKeyMaxLen == 16);
|
|
||||||
// SOMEDAY: use masked loads (requires avx-512/sve2)
|
|
||||||
#if defined(HAS_AVX)
|
|
||||||
__m128i a;
|
|
||||||
memcpy(&a, ap, cl);
|
|
||||||
__m128i b;
|
|
||||||
memcpy(&b, bp, cl);
|
|
||||||
const auto compared = _mm_cmpeq_epi8(a, b);
|
|
||||||
int mask = (1 << cl) - 1;
|
|
||||||
auto c = = _mm_movemask_epi8(compared) & mask;
|
|
||||||
return std::countr_zero(~c);
|
|
||||||
#elif defined(HAS_ARM_NEON)
|
|
||||||
uint8x16_t a;
|
|
||||||
memcpy(&a, ap, cl);
|
|
||||||
uint8x16_t b;
|
|
||||||
memcpy(&b, bp, cl);
|
|
||||||
uint16x8_t results = vreinterpretq_u16_u8(vceqq_u8(a, b));
|
|
||||||
uint64_t mask = cl == 16 ? uint64_t(-1) : (uint64_t(1) << (cl * 4)) - 1;
|
|
||||||
uint64_t bitfield =
|
|
||||||
vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) & mask;
|
|
||||||
return std::countr_zero(~bitfield) >> 2;
|
|
||||||
#else
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < 16; ++i) {
|
for (; i < cl; ++i) {
|
||||||
if (*ap++ != *bp++) {
|
if (*ap++ != *bp++) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return i;
|
return i;
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Performs a physical search for remaining
|
// Performs a physical search for remaining
|
||||||
|
Reference in New Issue
Block a user