From 46e01af027ede8a4f469ae2f48b493f9aec5f3d4 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Mon, 8 Jul 2024 11:01:17 -0700
Subject: [PATCH] Specialize scan16 for Node16 in checkMaxBetweenExclusive

---
 ConflictSet.cpp | 115 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 99 insertions(+), 16 deletions(-)

diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index 4cb75b1..ccc5c25 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -2036,23 +2036,106 @@ bool checkMaxBetweenExclusive(Node *n, int begin, int end,
   case Type_Node16: {
     auto *self = static_cast<Node16 *>(n);
-    {
-      int c = begin == 255 ? -1 : getChildGeqSimd(self, begin + 1);
-      if (c >= 0 && c < end) {
-        auto *child = self->children[getNodeIndex(self, c)];
-        if (child->entryPresent) {
-          if (!(child->entry.rangeVersion <= readVersion)) {
-            return false;
-          };
-        }
-        begin = c;
-      } else {
-        return true;
-      }
-      // [begin, end) is now the half-open interval of children we're interested
-      // in.
-      assert(begin < end);
+    ++begin;
+
+    assert(begin <= end);
+    assert(end - begin < 256);
+
+#ifdef HAS_ARM_NEON
+
+    uint8x16_t indices;
+    memcpy(&indices, self->index, 16);
+    // 0xff for each in bounds
+    auto results =
+        vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin));
+    // 0xf for each 0xff
+    uint64_t mask = vget_lane_u64(
+        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0);
+
+    mask &= self->numChildren == 16
+                ? uint64_t(-1)
+                : (uint64_t(1) << (self->numChildren << 2)) - 1;
+    if (!mask) {
+      return true;
     }
+    auto *child = self->children[__builtin_ctzll(mask) >> 2];
+    const bool firstRangeOk =
+        !child->entryPresent || child->entry.rangeVersion <= readVersion;
+
+    uint32x4_t w4[4];
+    memcpy(w4, self->childMaxVersion, sizeof(w4));
+    uint32_t rv;
+    memcpy(&rv, &readVersion, sizeof(rv));
+    const auto rvVec = vdupq_n_u32(rv);
+
+    int32x4_t z;
+    memset(&z, 0, sizeof(z));
+
+    uint16x4_t conflicting[4];
+    for (int i = 0; i < 4; ++i) {
+      conflicting[i] = vmovn_u32(
+          vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z));
+    }
+    auto combined =
+        vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])),
+                    vmovn_u16(vcombine_u16(conflicting[2], conflicting[3])));
+
+    uint64_t compared = vget_lane_u64(
+        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0);
+
+    return !(compared & mask) && firstRangeOk;
+
+#elif defined(HAS_AVX)
+
+    __m128i indices;
+    memcpy(&indices, self->index, 16);
+    indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin));
+    uint32_t mask =
+        0xffff &
+        ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+            indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin))));
+    mask &= (1 << self->numChildren) - 1;
+    if (!mask) {
+      return true;
+    }
+    auto *child = self->children[__builtin_ctz(mask)];
+    const bool firstRangeOk =
+        !child->entryPresent || child->entry.rangeVersion <= readVersion;
+
+    uint32_t compared = 0;
+    if constexpr (kAVX512) {
+      compared = compare16_32bit_avx512(self->childMaxVersion, readVersion);
+    } else {
+      compared = compare16_32bit(self->childMaxVersion, readVersion);
+    }
+    return !(compared & mask) && firstRangeOk;
+
+#else
+
+    const unsigned shiftUpperBound = end - begin;
+    const unsigned shiftAmount = begin;
+    auto inBounds = [&](unsigned c) {
+      return c - shiftAmount < shiftUpperBound;
+    };
+
+    uint32_t mask = 0;
+    for (int i = 0; i < 16; ++i) {
+      mask |= inBounds(self->index[i]) << i;
+    }
+    mask &= (1 << self->numChildren) - 1;
+    if (!mask) {
+      return true;
+    }
+    auto *child = self->children[__builtin_ctz(mask)];
+    const bool firstRangeOk =
+        !child->entryPresent || child->entry.rangeVersion <= readVersion;
+    uint32_t compared = 0;
+    for (int i = 0; i < 16; ++i) {
+      compared |= (self->childMaxVersion[i] > readVersion) << i;
+    }
+    return !(compared & mask) && firstRangeOk;
+
+#endif
     return scan16(self->childMaxVersion, self->index, begin, end, readVersion);
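
Reviewer note (not part of the patch): the NEON path relies on the shift-right-narrow
"movemask" idiom, in which a 16-byte 0x00/0xff compare result is packed into a 64-bit
word holding one nibble per original byte, so the first in-bounds child index is
recovered as __builtin_ctzll(mask) >> 2. The snippet below is a minimal, self-contained
sketch of that idiom assuming an AArch64 target with arm_neon.h; the key values and
bounds are invented for illustration and do not come from ConflictSet.cpp.

#include <arm_neon.h>
#include <cassert>
#include <cstdint>

// Packs a 16-byte 0x00/0xff mask into one nibble per byte of a uint64_t.
static uint64_t neonMovemaskNibbles(uint8x16_t bytes0orFF) {
  return vget_lane_u64(
      vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(bytes0orFF), 4)), 0);
}

int main() {
  // Hypothetical Node16-style key array (illustrative values only).
  uint8_t keys[16] = {3, 9, 40, 200, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  uint8x16_t indices = vld1q_u8(keys);
  // 0xff for each key strictly inside (5, 250), mirroring the patch's
  // unsigned subtract-and-compare range test with begin already incremented.
  uint8x16_t inBounds =
      vcltq_u8(vsubq_u8(indices, vdupq_n_u8(6)), vdupq_n_u8(250 - 6));
  uint64_t mask = neonMovemaskNibbles(inBounds);
  // Nibble 0 is clear (key 3 is out of range); nibble 1 is the first hit (key 9).
  assert((__builtin_ctzll(mask) >> 2) == 1);
  return 0;
}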