diff --git a/ConflictSet.cpp b/ConflictSet.cpp index 444c75b..11355e7 100644 --- a/ConflictSet.cpp +++ b/ConflictSet.cpp @@ -1885,6 +1885,547 @@ void downsize(Node *self, WriteContext *writeContext) { } } +#ifdef HAS_AVX +uint32_t compare16(const InternalVersionT *vs, InternalVersionT rv) { +#if USE_64_BIT + uint32_t compared = 0; + for (int i = 0; i < 16; ++i) { + compared |= (vs[i] > rv) << i; + } + return compared; +#else + uint32_t compared = 0; + __m128i w[4]; // GCOVR_EXCL_LINE + memcpy(w, vs, sizeof(w)); + uint32_t r; // GCOVR_EXCL_LINE + memcpy(&r, &rv, sizeof(r)); + const auto rvVec = _mm_set1_epi32(r); + const auto zero = _mm_setzero_si128(); + for (int i = 0; i < 4; ++i) { + compared |= _mm_movemask_ps( + __m128(_mm_cmpgt_epi32(_mm_sub_epi32(w[i], rvVec), zero))) + << (i * 4); + } + return compared; +#endif +} + +__attribute__((target("avx512f"))) uint32_t +compare16_avx512(const InternalVersionT *vs, InternalVersionT rv) { +#if USE_64_BIT + int64_t r; + memcpy(&r, &rv, sizeof(r)); + uint32_t low = + _mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs), _mm512_set1_epi64(r)); + uint32_t high = + _mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs + 8), _mm512_set1_epi64(r)); + return low | (high << 8); +#else + uint32_t r; + memcpy(&r, &rv, sizeof(r)); + return _mm512_cmpgt_epi32_mask( + _mm512_sub_epi32(_mm512_loadu_epi32(vs), _mm512_set1_epi32(r)), + _mm512_setzero_epi32()); +#endif +} +#endif + +// Returns true if v[i] <= readVersion for all i such that begin <= is[i] < end +// Preconditions: begin <= end, end - begin < 256 +template +bool scan16(const InternalVersionT *vs, const uint8_t *is, int begin, int end, + InternalVersionT readVersion) { + + assert(begin <= end); + assert(end - begin < 256); + +#ifdef HAS_ARM_NEON + + uint8x16_t indices; + memcpy(&indices, is, 16); + // 0xff for each in bounds + auto results = + vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin)); + // 0xf for each 0xff + uint64_t mask = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0); + + uint32x4_t w4[4]; + memcpy(w4, vs, sizeof(w4)); + uint32_t rv; + memcpy(&rv, &readVersion, sizeof(rv)); + const auto rvVec = vdupq_n_u32(rv); + + int32x4_t z; + memset(&z, 0, sizeof(z)); + + uint16x4_t conflicting[4]; + for (int i = 0; i < 4; ++i) { + conflicting[i] = + vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); + } + auto combined = + vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), + vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); + + uint64_t compared = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); + + return !(compared & mask); + +#elif defined(HAS_AVX) + + __m128i indices; + memcpy(&indices, is, 16); + indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin)); + uint32_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8( + indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin)))); + + uint32_t compared = 0; + if constexpr (kAVX512) { + compared = compare16_avx512(vs, readVersion); + } else { + compared = compare16(vs, readVersion); + } + return !(compared & mask); + +#else + + const unsigned shiftUpperBound = end - begin; + const unsigned shiftAmount = begin; + auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; + + uint32_t compared = 0; + for (int i = 0; i < 16; ++i) { + compared |= (vs[i] > readVersion) << i; + } + uint32_t mask = 0; + for (int i = 0; i < 16; ++i) { + mask |= inBounds(is[i]) << i; + } + return !(compared & mask); + +#endif +} + +// Returns true if v[i] <= readVersion for all i such that begin <= i < end +template +bool scan16(const InternalVersionT *vs, int begin, int end, + InternalVersionT readVersion) { + assert(0 <= begin && begin < 16); + assert(0 <= end && end <= 16); + assert(begin <= end); + +#if defined(HAS_ARM_NEON) + uint32x4_t w4[4]; + memcpy(w4, vs, sizeof(w4)); + uint32_t rv; + memcpy(&rv, &readVersion, sizeof(rv)); + const auto rvVec = vdupq_n_u32(rv); + + int32x4_t z; + memset(&z, 0, sizeof(z)); + + uint16x4_t conflicting[4]; + for (int i = 0; i < 4; ++i) { + conflicting[i] = + vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); + } + auto combined = + vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), + vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); + + uint64_t conflict = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); + + conflict &= end == 16 ? -1 : (uint64_t(1) << (end << 2)) - 1; + conflict >>= begin << 2; + return !conflict; +#elif defined(HAS_AVX) + uint32_t conflict; + if constexpr (kAVX512) { + conflict = compare16_avx512(vs, readVersion); + } else { + conflict = compare16(vs, readVersion); + } + conflict &= (1 << end) - 1; + conflict >>= begin; + return !conflict; +#else + uint64_t conflict = 0; + for (int i = 0; i < 16; ++i) { + conflict |= (vs[i] > readVersion) << i; + } + conflict &= (1 << end) - 1; + conflict >>= begin; + return !conflict; +#endif +} + +// Return whether or not the max version among all keys starting with the search +// path of n + [child], where child in (begin, end) is <= readVersion. Does not +// account for the range version of firstGt(searchpath(n) + [end - 1]) +template +bool checkMaxBetweenExclusiveImpl(Node0 *, int, int, InternalVersionT) { + return true; +} + +template +bool checkMaxBetweenExclusiveImpl(Node3 *n, int begin, int end, + InternalVersionT readVersion) { + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + auto *self = static_cast(n); + + ++begin; + + const unsigned shiftUpperBound = end - begin; + const unsigned shiftAmount = begin; + auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; + + uint32_t mask = 0; + for (int i = 0; i < Node3::kMaxNodes; ++i) { + mask |= inBounds(self->index[i]) << i; + } + mask &= (1 << self->numChildren) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask)]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; + uint32_t compared = 0; + for (int i = 0; i < Node3::kMaxNodes; ++i) { + compared |= (self->childMaxVersion[i] > readVersion) << i; + } + + return !(compared & mask) && firstRangeOk; +} + +template +bool checkMaxBetweenExclusiveImpl(Node16 *n, int begin, int end, + InternalVersionT readVersion) { + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + auto *self = static_cast(n); + + ++begin; + + assert(begin <= end); + assert(end - begin < 256); + +#ifdef HAS_ARM_NEON + + uint8x16_t indices; + memcpy(&indices, self->index, 16); + // 0xff for each in bounds + auto results = + vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin)); + // 0xf for each 0xff + uint64_t mask = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0); + + mask &= self->numChildren == 16 + ? uint64_t(-1) + : (uint64_t(1) << (self->numChildren << 2)) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask) >> 2]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; + + uint32x4_t w4[4]; + memcpy(w4, self->childMaxVersion, sizeof(w4)); + uint32_t rv; + memcpy(&rv, &readVersion, sizeof(rv)); + const auto rvVec = vdupq_n_u32(rv); + + int32x4_t z; + memset(&z, 0, sizeof(z)); + + uint16x4_t conflicting[4]; + for (int i = 0; i < 4; ++i) { + conflicting[i] = + vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); + } + auto combined = + vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), + vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); + + uint64_t compared = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); + + return !(compared & mask) && firstRangeOk; + +#elif defined(HAS_AVX) + + __m128i indices; + memcpy(&indices, self->index, 16); + indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin)); + uint32_t mask = + 0xffff & ~_mm_movemask_epi8(_mm_cmpeq_epi8( + indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin)))); + mask &= (1 << self->numChildren) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask)]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; + + uint32_t compared = 0; + if constexpr (kAVX512) { + compared = compare16_avx512(self->childMaxVersion, readVersion); + } else { + compared = compare16(self->childMaxVersion, readVersion); + } + return !(compared & mask) && firstRangeOk; + +#else + + const unsigned shiftUpperBound = end - begin; + const unsigned shiftAmount = begin; + auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; + + uint32_t mask = 0; + for (int i = 0; i < 16; ++i) { + mask |= inBounds(self->index[i]) << i; + } + mask &= (1 << self->numChildren) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask)]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; + uint32_t compared = 0; + for (int i = 0; i < 16; ++i) { + compared |= (self->childMaxVersion[i] > readVersion) << i; + } + return !(compared & mask) && firstRangeOk; + +#endif +} + +template +bool checkMaxBetweenExclusiveImpl(Node48 *n, int begin, int end, + InternalVersionT readVersion) { + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + auto *self = static_cast(n); + + { + int c = self->bitSet.firstSetGeq(begin + 1); + if (c >= 0 && c < end) { + Node *child = self->children[self->index[c]]; + if (child->entryPresent && child->entry.rangeVersion > readVersion) { + return false; + } + begin = c; + } else { + return true; + } + // [begin, end) is now the half-open interval of children we're interested + // in. + assert(begin < end); + } + + // Check all pages + static_assert(Node48::kMaxOfMaxPageSize == 16); + for (int i = 0; i < Node48::kMaxOfMaxTotalPages; ++i) { + if (self->maxOfMax[i] > readVersion) { + if (!scan16(self->childMaxVersion + + (i << Node48::kMaxOfMaxShift), + self->reverseIndex + (i << Node48::kMaxOfMaxShift), + begin, end, readVersion)) { + return false; + } + } + } + return true; +} + +template +bool checkMaxBetweenExclusiveImpl(Node256 *n, int begin, int end, + InternalVersionT readVersion) { + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + static_assert(Node256::kMaxOfMaxTotalPages == 16); + auto *self = static_cast(n); + + { + int c = self->bitSet.firstSetGeq(begin + 1); + if (c >= 0 && c < end) { + Node *child = self->children[c]; + if (child->entryPresent && child->entry.rangeVersion > readVersion) { + return false; + } + begin = c; + } else { + return true; + } + // [begin, end) is now the half-open interval of children we're interested + // in. + assert(begin < end); + } + + const int firstPage = begin >> Node256::kMaxOfMaxShift; + const int lastPage = (end - 1) >> Node256::kMaxOfMaxShift; + // Check the only page if there's only one + if (firstPage == lastPage) { + if (self->maxOfMax[firstPage] <= readVersion) { + return true; + } + const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); + const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); + return scan16(self->childMaxVersion + + (firstPage << Node256::kMaxOfMaxShift), + intraPageBegin, intraPageEnd, readVersion); + } + // Check the first page + if (self->maxOfMax[firstPage] > readVersion) { + const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); + if (!scan16(self->childMaxVersion + + (firstPage << Node256::kMaxOfMaxShift), + intraPageBegin, 16, readVersion)) { + return false; + } + } + // Check the last page + if (self->maxOfMax[lastPage] > readVersion) { + const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); + if (!scan16(self->childMaxVersion + + (lastPage << Node256::kMaxOfMaxShift), + 0, intraPageEnd, readVersion)) { + return false; + } + } + // Check inner pages + return scan16(self->maxOfMax, firstPage + 1, lastPage, readVersion); +} + +bool checkMaxBetweenExclusive(Node0 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} + +bool checkMaxBetweenExclusive(Node3 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node16 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node16 *n, int begin, int end, InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node48 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node48 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node256 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node256 *n, int begin, int end, + InternalVersionT readVersion) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node *n, int begin, int end, + InternalVersionT readVersion) { + switch (n->getType()) { + case Type_Node0: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node3: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node16: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node48: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node256: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + default: // GCOVR_EXCL_LINE + __builtin_unreachable(); // GCOVR_EXCL_LINE + } +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node *n, int begin, int end, + InternalVersionT readVersion) { + switch (n->getType()) { + case Type_Node0: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node3: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node16: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node48: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + case Type_Node256: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion); + default: // GCOVR_EXCL_LINE + __builtin_unreachable(); // GCOVR_EXCL_LINE + } +} + template struct Iterator; // Higher-level handle to a position in the tree. Can represent the position of @@ -1943,8 +2484,10 @@ struct IteratorBase { IteratorBase getChild(int index); IteratorBase nextSibling(); TrivialSpan getSearchPath(Arena &arena); - - Node *escapeHatch() { return node; } + bool checkMaxBetweenExclusive(int begin, int end, + InternalVersionT readVersion) { + return ::checkMaxBetweenExclusive(node, begin, end, readVersion); + } protected: Node *node; @@ -1977,8 +2520,11 @@ template struct Iterator : IteratorBase { TrivialSpan partialKey() { return {static_cast(node)->partialKey(), node->partialKeyLen}; } - - T *escapeHatch() { return static_cast(node); } + bool checkMaxBetweenExclusive(int begin, int end, + InternalVersionT readVersion) { + return ::checkMaxBetweenExclusive(static_cast(node), begin, end, + readVersion); + } }; TrivialSpan IteratorBase::partialKey() { @@ -2251,572 +2797,6 @@ TaggedNodePointer nextSibling(Node *node) { } } -#ifdef HAS_AVX -uint32_t compare16(const InternalVersionT *vs, InternalVersionT rv) { -#if USE_64_BIT - uint32_t compared = 0; - for (int i = 0; i < 16; ++i) { - compared |= (vs[i] > rv) << i; - } - return compared; -#else - uint32_t compared = 0; - __m128i w[4]; // GCOVR_EXCL_LINE - memcpy(w, vs, sizeof(w)); - uint32_t r; // GCOVR_EXCL_LINE - memcpy(&r, &rv, sizeof(r)); - const auto rvVec = _mm_set1_epi32(r); - const auto zero = _mm_setzero_si128(); - for (int i = 0; i < 4; ++i) { - compared |= _mm_movemask_ps( - __m128(_mm_cmpgt_epi32(_mm_sub_epi32(w[i], rvVec), zero))) - << (i * 4); - } - return compared; -#endif -} - -__attribute__((target("avx512f"))) uint32_t -compare16_avx512(const InternalVersionT *vs, InternalVersionT rv) { -#if USE_64_BIT - int64_t r; - memcpy(&r, &rv, sizeof(r)); - uint32_t low = - _mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs), _mm512_set1_epi64(r)); - uint32_t high = - _mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs + 8), _mm512_set1_epi64(r)); - return low | (high << 8); -#else - uint32_t r; - memcpy(&r, &rv, sizeof(r)); - return _mm512_cmpgt_epi32_mask( - _mm512_sub_epi32(_mm512_loadu_epi32(vs), _mm512_set1_epi32(r)), - _mm512_setzero_epi32()); -#endif -} -#endif - -// Returns true if v[i] <= readVersion for all i such that begin <= is[i] < end -// Preconditions: begin <= end, end - begin < 256 -template -bool scan16(const InternalVersionT *vs, const uint8_t *is, int begin, int end, - InternalVersionT readVersion) { - - assert(begin <= end); - assert(end - begin < 256); - -#ifdef HAS_ARM_NEON - - uint8x16_t indices; - memcpy(&indices, is, 16); - // 0xff for each in bounds - auto results = - vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin)); - // 0xf for each 0xff - uint64_t mask = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0); - - uint32x4_t w4[4]; - memcpy(w4, vs, sizeof(w4)); - uint32_t rv; - memcpy(&rv, &readVersion, sizeof(rv)); - const auto rvVec = vdupq_n_u32(rv); - - int32x4_t z; - memset(&z, 0, sizeof(z)); - - uint16x4_t conflicting[4]; - for (int i = 0; i < 4; ++i) { - conflicting[i] = - vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); - } - auto combined = - vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), - vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); - - uint64_t compared = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); - - return !(compared & mask); - -#elif defined(HAS_AVX) - - __m128i indices; - memcpy(&indices, is, 16); - indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin)); - uint32_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8( - indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin)))); - - uint32_t compared = 0; - if constexpr (kAVX512) { - compared = compare16_avx512(vs, readVersion); - } else { - compared = compare16(vs, readVersion); - } - return !(compared & mask); - -#else - - const unsigned shiftUpperBound = end - begin; - const unsigned shiftAmount = begin; - auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; - - uint32_t compared = 0; - for (int i = 0; i < 16; ++i) { - compared |= (vs[i] > readVersion) << i; - } - uint32_t mask = 0; - for (int i = 0; i < 16; ++i) { - mask |= inBounds(is[i]) << i; - } - return !(compared & mask); - -#endif -} - -// Returns true if v[i] <= readVersion for all i such that begin <= i < end -template -bool scan16(const InternalVersionT *vs, int begin, int end, - InternalVersionT readVersion) { - assert(0 <= begin && begin < 16); - assert(0 <= end && end <= 16); - assert(begin <= end); - -#if defined(HAS_ARM_NEON) - uint32x4_t w4[4]; - memcpy(w4, vs, sizeof(w4)); - uint32_t rv; - memcpy(&rv, &readVersion, sizeof(rv)); - const auto rvVec = vdupq_n_u32(rv); - - int32x4_t z; - memset(&z, 0, sizeof(z)); - - uint16x4_t conflicting[4]; - for (int i = 0; i < 4; ++i) { - conflicting[i] = - vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); - } - auto combined = - vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), - vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); - - uint64_t conflict = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); - - conflict &= end == 16 ? -1 : (uint64_t(1) << (end << 2)) - 1; - conflict >>= begin << 2; - return !conflict; -#elif defined(HAS_AVX) - uint32_t conflict; - if constexpr (kAVX512) { - conflict = compare16_avx512(vs, readVersion); - } else { - conflict = compare16(vs, readVersion); - } - conflict &= (1 << end) - 1; - conflict >>= begin; - return !conflict; -#else - uint64_t conflict = 0; - for (int i = 0; i < 16; ++i) { - conflict |= (vs[i] > readVersion) << i; - } - conflict &= (1 << end) - 1; - conflict >>= begin; - return !conflict; -#endif -} - -// Return whether or not the max version among all keys starting with the search -// path of n + [child], where child in (begin, end) is <= readVersion. Does not -// account for the range version of firstGt(searchpath(n) + [end - 1]) -template -bool checkMaxBetweenExclusiveImpl(Node0 *, int, int, InternalVersionT, - ReadContext *readContext) { - ++readContext->range_read_node_scan_accum; - return true; -} - -template -bool checkMaxBetweenExclusiveImpl(Node3 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - ++readContext->range_read_node_scan_accum; - assume(-1 <= begin); - assume(begin <= 256); - assume(-1 <= end); - assume(end <= 256); - assume(begin < end); - assert(!(begin == -1 && end == 256)); - - auto *self = static_cast(n); - - ++begin; - - const unsigned shiftUpperBound = end - begin; - const unsigned shiftAmount = begin; - auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; - - uint32_t mask = 0; - for (int i = 0; i < Node3::kMaxNodes; ++i) { - mask |= inBounds(self->index[i]) << i; - } - mask &= (1 << self->numChildren) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask)]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; - uint32_t compared = 0; - for (int i = 0; i < Node3::kMaxNodes; ++i) { - compared |= (self->childMaxVersion[i] > readVersion) << i; - } - - return !(compared & mask) && firstRangeOk; -} - -template -bool checkMaxBetweenExclusiveImpl(Node16 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - ++readContext->range_read_node_scan_accum; - assume(-1 <= begin); - assume(begin <= 256); - assume(-1 <= end); - assume(end <= 256); - assume(begin < end); - assert(!(begin == -1 && end == 256)); - - auto *self = static_cast(n); - - ++begin; - - assert(begin <= end); - assert(end - begin < 256); - -#ifdef HAS_ARM_NEON - - uint8x16_t indices; - memcpy(&indices, self->index, 16); - // 0xff for each in bounds - auto results = - vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin)); - // 0xf for each 0xff - uint64_t mask = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0); - - mask &= self->numChildren == 16 - ? uint64_t(-1) - : (uint64_t(1) << (self->numChildren << 2)) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask) >> 2]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; - - uint32x4_t w4[4]; - memcpy(w4, self->childMaxVersion, sizeof(w4)); - uint32_t rv; - memcpy(&rv, &readVersion, sizeof(rv)); - const auto rvVec = vdupq_n_u32(rv); - - int32x4_t z; - memset(&z, 0, sizeof(z)); - - uint16x4_t conflicting[4]; - for (int i = 0; i < 4; ++i) { - conflicting[i] = - vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); - } - auto combined = - vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), - vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); - - uint64_t compared = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); - - return !(compared & mask) && firstRangeOk; - -#elif defined(HAS_AVX) - - __m128i indices; - memcpy(&indices, self->index, 16); - indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin)); - uint32_t mask = - 0xffff & ~_mm_movemask_epi8(_mm_cmpeq_epi8( - indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin)))); - mask &= (1 << self->numChildren) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask)]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; - - uint32_t compared = 0; - if constexpr (kAVX512) { - compared = compare16_avx512(self->childMaxVersion, readVersion); - } else { - compared = compare16(self->childMaxVersion, readVersion); - } - return !(compared & mask) && firstRangeOk; - -#else - - const unsigned shiftUpperBound = end - begin; - const unsigned shiftAmount = begin; - auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; - - uint32_t mask = 0; - for (int i = 0; i < 16; ++i) { - mask |= inBounds(self->index[i]) << i; - } - mask &= (1 << self->numChildren) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask)]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; - uint32_t compared = 0; - for (int i = 0; i < 16; ++i) { - compared |= (self->childMaxVersion[i] > readVersion) << i; - } - return !(compared & mask) && firstRangeOk; - -#endif -} - -template -bool checkMaxBetweenExclusiveImpl(Node48 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - ++readContext->range_read_node_scan_accum; - assume(-1 <= begin); - assume(begin <= 256); - assume(-1 <= end); - assume(end <= 256); - assume(begin < end); - assert(!(begin == -1 && end == 256)); - - auto *self = static_cast(n); - - { - int c = self->bitSet.firstSetGeq(begin + 1); - if (c >= 0 && c < end) { - Node *child = self->children[self->index[c]]; - if (child->entryPresent && child->entry.rangeVersion > readVersion) { - return false; - } - begin = c; - } else { - return true; - } - // [begin, end) is now the half-open interval of children we're interested - // in. - assert(begin < end); - } - - // Check all pages - static_assert(Node48::kMaxOfMaxPageSize == 16); - for (int i = 0; i < Node48::kMaxOfMaxTotalPages; ++i) { - if (self->maxOfMax[i] > readVersion) { - if (!scan16(self->childMaxVersion + - (i << Node48::kMaxOfMaxShift), - self->reverseIndex + (i << Node48::kMaxOfMaxShift), - begin, end, readVersion)) { - return false; - } - } - } - return true; -} - -template -bool checkMaxBetweenExclusiveImpl(Node256 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - ++readContext->range_read_node_scan_accum; - assume(-1 <= begin); - assume(begin <= 256); - assume(-1 <= end); - assume(end <= 256); - assume(begin < end); - assert(!(begin == -1 && end == 256)); - - static_assert(Node256::kMaxOfMaxTotalPages == 16); - auto *self = static_cast(n); - - { - int c = self->bitSet.firstSetGeq(begin + 1); - if (c >= 0 && c < end) { - Node *child = self->children[c]; - if (child->entryPresent && child->entry.rangeVersion > readVersion) { - return false; - } - begin = c; - } else { - return true; - } - // [begin, end) is now the half-open interval of children we're interested - // in. - assert(begin < end); - } - - const int firstPage = begin >> Node256::kMaxOfMaxShift; - const int lastPage = (end - 1) >> Node256::kMaxOfMaxShift; - // Check the only page if there's only one - if (firstPage == lastPage) { - if (self->maxOfMax[firstPage] <= readVersion) { - return true; - } - const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); - const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); - return scan16(self->childMaxVersion + - (firstPage << Node256::kMaxOfMaxShift), - intraPageBegin, intraPageEnd, readVersion); - } - // Check the first page - if (self->maxOfMax[firstPage] > readVersion) { - const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); - if (!scan16(self->childMaxVersion + - (firstPage << Node256::kMaxOfMaxShift), - intraPageBegin, 16, readVersion)) { - return false; - } - } - // Check the last page - if (self->maxOfMax[lastPage] > readVersion) { - const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); - if (!scan16(self->childMaxVersion + - (lastPage << Node256::kMaxOfMaxShift), - 0, intraPageEnd, readVersion)) { - return false; - } - } - // Check inner pages - return scan16(self->maxOfMax, firstPage + 1, lastPage, readVersion); -} - -bool checkMaxBetweenExclusive(Node0 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} - -bool checkMaxBetweenExclusive(Node3 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} - -#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) -__attribute__((target("avx512f"))) bool -checkMaxBetweenExclusive(Node16 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} -__attribute__((target("default"))) -#endif - -bool checkMaxBetweenExclusive(Node16 *n, int begin, int end, - InternalVersionT readVersion, ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} - -#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) -__attribute__((target("avx512f"))) bool -checkMaxBetweenExclusive(Node48 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} -__attribute__((target("default"))) -#endif - -bool checkMaxBetweenExclusive(Node48 *n, int begin, int end, - InternalVersionT readVersion, ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} - -#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) -__attribute__((target("avx512f"))) bool -checkMaxBetweenExclusive(Node256 *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} -__attribute__((target("default"))) -#endif - -bool checkMaxBetweenExclusive(Node256 *n, int begin, int end, - InternalVersionT readVersion, ReadContext *readContext) { - return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, - readContext); -} - -#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) -__attribute__((target("avx512f"))) bool -checkMaxBetweenExclusive(Node *n, int begin, int end, - InternalVersionT readVersion, - ReadContext *readContext) { - switch (n->getType()) { - case Type_Node0: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node3: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node16: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node48: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node256: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - default: // GCOVR_EXCL_LINE - __builtin_unreachable(); // GCOVR_EXCL_LINE - } -} -__attribute__((target("default"))) -#endif - -bool checkMaxBetweenExclusive(Node *n, int begin, int end, - InternalVersionT readVersion, ReadContext *readContext) { - switch (n->getType()) { - case Type_Node0: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node3: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node16: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node48: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - case Type_Node256: - return checkMaxBetweenExclusiveImpl(static_cast(n), begin, - end, readVersion, readContext); - default: // GCOVR_EXCL_LINE - __builtin_unreachable(); // GCOVR_EXCL_LINE - } -} - TrivialSpan getSearchPath(Arena &arena, Node *n) { assert(n != nullptr); auto result = vector(arena); @@ -2851,8 +2831,8 @@ bool checkRangeStartsWith(Iterator nTyped, TrivialSpan key, int begin, #endif auto remaining = key; if (remaining.size() == 0) { - return checkMaxBetweenExclusive(nTyped.escapeHatch(), begin, end, - readVersion, readContext); + ++readContext->range_read_node_scan_accum; + return nTyped.checkMaxBetweenExclusive(begin, end, readVersion); } auto cAndV = nTyped.getChildAndMaxVersion(remaining[0]); @@ -2940,13 +2920,13 @@ scan16(const InternalVersionT *vs, int begin, int end, InternalVersionT readVersion); template __attribute__((target("avx512f"))) bool checkMaxBetweenExclusiveImpl(Node16 *n, int begin, int end, - InternalVersionT readVersion, ReadContext *); + InternalVersionT readVersion); template __attribute__((target("avx512f"))) bool checkMaxBetweenExclusiveImpl(Node48 *n, int begin, int end, - InternalVersionT readVersion, ReadContext *); + InternalVersionT readVersion); template __attribute__((target("avx512f"))) bool checkMaxBetweenExclusiveImpl(Node256 *n, int begin, int end, - InternalVersionT readVersion, ReadContext *); + InternalVersionT readVersion); #endif // Returns a pointer the pointer to the newly inserted node in the tree. Caller @@ -3757,8 +3737,9 @@ PRESERVE_NONE void done_common_prefix_iter(Job *job, Context *context) { MUSTTAIL return complete(job, context); } - if (!checkMaxBetweenExclusive(n.escapeHatch(), -1, job->remaining[0], - job->readVersion, &context->readContext)) { + ++context->readContext.range_read_node_scan_accum; + if (!n.checkMaxBetweenExclusive(-1, job->remaining[0], + job->readVersion)) { job->setResult(false); MUSTTAIL return complete(job, context); } @@ -3899,8 +3880,8 @@ PRESERVE_NONE void left_side_iter(Job *job, Context *context) { MUSTTAIL return complete(job, context); } - if (!checkMaxBetweenExclusive(n.escapeHatch(), job->remaining[0], 256, - job->readVersion, &context->readContext)) { + ++context->readContext.range_read_node_scan_accum; + if (!n.checkMaxBetweenExclusive(job->remaining[0], 256, job->readVersion)) { job->setResult(false); MUSTTAIL return complete(job, context); } @@ -4035,8 +4016,8 @@ PRESERVE_NONE void right_side_iter(Job *job, Context *context) { MUSTTAIL return complete(job, context); } - if (!checkMaxBetweenExclusive(n.escapeHatch(), -1, job->remaining[0], - job->readVersion, &context->readContext)) { + ++context->readContext.range_read_node_scan_accum; + if (!n.checkMaxBetweenExclusive(-1, job->remaining[0], job->readVersion)) { job->setResult(false); MUSTTAIL return complete(job, context); } @@ -4773,8 +4754,8 @@ bool checkRangeLeftSide(IteratorBase n, TrivialSpan key, int prefixLen, } if (searchPathLen >= prefixLen) { - if (!checkMaxBetweenExclusive(n.escapeHatch(), remaining[0], 256, - readVersion, readContext)) { + ++readContext->range_read_node_scan_accum; + if (!n.checkMaxBetweenExclusive(remaining[0], 256, readVersion)) { return false; } } @@ -4862,8 +4843,8 @@ bool checkRangeRightSide(IteratorBase n, TrivialSpan key, int prefixLen, return false; } - if (!checkMaxBetweenExclusive(n.escapeHatch(), -1, remaining[0], - readVersion, readContext)) { + ++readContext->range_read_node_scan_accum; + if (!n.checkMaxBetweenExclusive(-1, remaining[0], readVersion)) { return false; } }