diff --git a/ConflictSet.cpp b/ConflictSet.cpp index 5e4b73c..44c9cb6 100644 --- a/ConflictSet.cpp +++ b/ConflictSet.cpp @@ -2147,7 +2147,14 @@ bool scan16(const InternalVersionT *vs, int begin, int end, // path of n + [child], where child in (begin, end) is <= readVersion. Does not // account for the range version of firstGt(searchpath(n) + [end - 1]) template -bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end, +bool checkMaxBetweenExclusiveImpl(Node0 *, int, int, InternalVersionT, + ReadContext *tls) { + ++tls->range_read_node_scan_accum; + return true; +} + +template +bool checkMaxBetweenExclusiveImpl(Node3 *n, int begin, int end, InternalVersionT readVersion, ReadContext *tls) { ++tls->range_read_node_scan_accum; @@ -2156,232 +2163,282 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end, assume(-1 <= end); assume(end <= 256); assume(begin < end); - assert(!(begin == -1 && end == 256)); - switch (n->getType()) { - case Type_Node0: - return true; - case Type_Node3: { - auto *self = static_cast(n); + auto *self = static_cast(n); - ++begin; + ++begin; - const unsigned shiftUpperBound = end - begin; - const unsigned shiftAmount = begin; - auto inBounds = [&](unsigned c) { - return c - shiftAmount < shiftUpperBound; - }; + const unsigned shiftUpperBound = end - begin; + const unsigned shiftAmount = begin; + auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; - uint32_t mask = 0; - for (int i = 0; i < Node3::kMaxNodes; ++i) { - mask |= inBounds(self->index[i]) << i; - } - mask &= (1 << self->numChildren) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask)]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; - uint32_t compared = 0; - for (int i = 0; i < Node3::kMaxNodes; ++i) { - compared |= (self->childMaxVersion[i] > readVersion) << i; - } - - return !(compared & mask) && firstRangeOk; + uint32_t mask = 0; + for (int i = 0; i < Node3::kMaxNodes; ++i) { + mask |= inBounds(self->index[i]) << i; + } + mask &= (1 << self->numChildren) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask)]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; + uint32_t compared = 0; + for (int i = 0; i < Node3::kMaxNodes; ++i) { + compared |= (self->childMaxVersion[i] > readVersion) << i; } - case Type_Node16: { - auto *self = static_cast(n); - ++begin; + return !(compared & mask) && firstRangeOk; +} - assert(begin <= end); - assert(end - begin < 256); +template +bool checkMaxBetweenExclusiveImpl(Node16 *n, int begin, int end, + InternalVersionT readVersion, + ReadContext *tls) { + ++tls->range_read_node_scan_accum; + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + auto *self = static_cast(n); + + ++begin; + + assert(begin <= end); + assert(end - begin < 256); #ifdef HAS_ARM_NEON - uint8x16_t indices; - memcpy(&indices, self->index, 16); - // 0xff for each in bounds - auto results = - vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin)); - // 0xf for each 0xff - uint64_t mask = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0); + uint8x16_t indices; + memcpy(&indices, self->index, 16); + // 0xff for each in bounds + auto results = + vcltq_u8(vsubq_u8(indices, vdupq_n_u8(begin)), vdupq_n_u8(end - begin)); + // 0xf for each 0xff + uint64_t mask = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), 0); - mask &= self->numChildren == 16 - ? uint64_t(-1) - : (uint64_t(1) << (self->numChildren << 2)) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask) >> 2]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; + mask &= self->numChildren == 16 + ? uint64_t(-1) + : (uint64_t(1) << (self->numChildren << 2)) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask) >> 2]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; - uint32x4_t w4[4]; - memcpy(w4, self->childMaxVersion, sizeof(w4)); - uint32_t rv; - memcpy(&rv, &readVersion, sizeof(rv)); - const auto rvVec = vdupq_n_u32(rv); + uint32x4_t w4[4]; + memcpy(w4, self->childMaxVersion, sizeof(w4)); + uint32_t rv; + memcpy(&rv, &readVersion, sizeof(rv)); + const auto rvVec = vdupq_n_u32(rv); - int32x4_t z; - memset(&z, 0, sizeof(z)); + int32x4_t z; + memset(&z, 0, sizeof(z)); - uint16x4_t conflicting[4]; - for (int i = 0; i < 4; ++i) { - conflicting[i] = vmovn_u32( - vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); - } - auto combined = - vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), - vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); + uint16x4_t conflicting[4]; + for (int i = 0; i < 4; ++i) { + conflicting[i] = + vmovn_u32(vcgtq_s32(vreinterpretq_s32_u32(vsubq_u32(w4[i], rvVec)), z)); + } + auto combined = + vcombine_u8(vmovn_u16(vcombine_u16(conflicting[0], conflicting[1])), + vmovn_u16(vcombine_u16(conflicting[2], conflicting[3]))); - uint64_t compared = vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); + uint64_t compared = vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(combined), 4)), 0); - return !(compared & mask) && firstRangeOk; + return !(compared & mask) && firstRangeOk; #elif defined(HAS_AVX) - __m128i indices; - memcpy(&indices, self->index, 16); - indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin)); - uint32_t mask = - 0xffff & - ~_mm_movemask_epi8(_mm_cmpeq_epi8( - indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin)))); - mask &= (1 << self->numChildren) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask)]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; + __m128i indices; + memcpy(&indices, self->index, 16); + indices = _mm_sub_epi8(indices, _mm_set1_epi8(begin)); + uint32_t mask = + 0xffff & ~_mm_movemask_epi8(_mm_cmpeq_epi8( + indices, _mm_max_epu8(indices, _mm_set1_epi8(end - begin)))); + mask &= (1 << self->numChildren) - 1; + if (!mask) { + return true; + } + Node *child = self->children[std::countr_zero(mask)]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; - uint32_t compared = 0; - if constexpr (kAVX512) { - compared = compare16_avx512(self->childMaxVersion, readVersion); - } else { - compared = compare16(self->childMaxVersion, readVersion); - } - return !(compared & mask) && firstRangeOk; + uint32_t compared = 0; + if constexpr (kAVX512) { + compared = compare16_avx512(self->childMaxVersion, readVersion); + } else { + compared = compare16(self->childMaxVersion, readVersion); + } + return !(compared & mask) && firstRangeOk; #else - const unsigned shiftUpperBound = end - begin; - const unsigned shiftAmount = begin; - auto inBounds = [&](unsigned c) { - return c - shiftAmount < shiftUpperBound; - }; + const unsigned shiftUpperBound = end - begin; + const unsigned shiftAmount = begin; + auto inBounds = [&](unsigned c) { return c - shiftAmount < shiftUpperBound; }; - uint32_t mask = 0; - for (int i = 0; i < 16; ++i) { - mask |= inBounds(self->index[i]) << i; - } - mask &= (1 << self->numChildren) - 1; - if (!mask) { - return true; - } - Node *child = self->children[std::countr_zero(mask)]; - const bool firstRangeOk = - !child->entryPresent || child->entry.rangeVersion <= readVersion; - uint32_t compared = 0; - for (int i = 0; i < 16; ++i) { - compared |= (self->childMaxVersion[i] > readVersion) << i; - } - return !(compared & mask) && firstRangeOk; - -#endif + uint32_t mask = 0; + for (int i = 0; i < 16; ++i) { + mask |= inBounds(self->index[i]) << i; } - case Type_Node48: { - auto *self = static_cast(n); - - { - int c = self->bitSet.firstSetGeq(begin + 1); - if (c >= 0 && c < end) { - Node *child = self->children[self->index[c]]; - if (child->entryPresent && child->entry.rangeVersion > readVersion) { - return false; - } - begin = c; - } else { - return true; - } - // [begin, end) is now the half-open interval of children we're interested - // in. - assert(begin < end); - } - - // Check all pages - static_assert(Node48::kMaxOfMaxPageSize == 16); - for (int i = 0; i < Node48::kMaxOfMaxTotalPages; ++i) { - if (self->maxOfMax[i] > readVersion) { - if (!scan16(self->childMaxVersion + - (i << Node48::kMaxOfMaxShift), - self->reverseIndex + (i << Node48::kMaxOfMaxShift), - begin, end, readVersion)) { - return false; - } - } - } + mask &= (1 << self->numChildren) - 1; + if (!mask) { return true; } - case Type_Node256: { - static_assert(Node256::kMaxOfMaxTotalPages == 16); - auto *self = static_cast(n); + Node *child = self->children[std::countr_zero(mask)]; + const bool firstRangeOk = + !child->entryPresent || child->entry.rangeVersion <= readVersion; + uint32_t compared = 0; + for (int i = 0; i < 16; ++i) { + compared |= (self->childMaxVersion[i] > readVersion) << i; + } + return !(compared & mask) && firstRangeOk; - { - int c = self->bitSet.firstSetGeq(begin + 1); - if (c >= 0 && c < end) { - Node *child = self->children[c]; - if (child->entryPresent && child->entry.rangeVersion > readVersion) { - return false; - } - begin = c; - } else { - return true; - } - // [begin, end) is now the half-open interval of children we're interested - // in. - assert(begin < end); - } +#endif +} - const int firstPage = begin >> Node256::kMaxOfMaxShift; - const int lastPage = (end - 1) >> Node256::kMaxOfMaxShift; - // Check the only page if there's only one - if (firstPage == lastPage) { - if (self->maxOfMax[firstPage] <= readVersion) { - return true; +template +bool checkMaxBetweenExclusiveImpl(Node48 *n, int begin, int end, + InternalVersionT readVersion, + ReadContext *tls) { + ++tls->range_read_node_scan_accum; + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + auto *self = static_cast(n); + + { + int c = self->bitSet.firstSetGeq(begin + 1); + if (c >= 0 && c < end) { + Node *child = self->children[self->index[c]]; + if (child->entryPresent && child->entry.rangeVersion > readVersion) { + return false; } - const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); - const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); - return scan16(self->childMaxVersion + - (firstPage << Node256::kMaxOfMaxShift), - intraPageBegin, intraPageEnd, readVersion); + begin = c; + } else { + return true; } - // Check the first page - if (self->maxOfMax[firstPage] > readVersion) { - const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); + // [begin, end) is now the half-open interval of children we're interested + // in. + assert(begin < end); + } + + // Check all pages + static_assert(Node48::kMaxOfMaxPageSize == 16); + for (int i = 0; i < Node48::kMaxOfMaxTotalPages; ++i) { + if (self->maxOfMax[i] > readVersion) { if (!scan16(self->childMaxVersion + + (i << Node48::kMaxOfMaxShift), + self->reverseIndex + (i << Node48::kMaxOfMaxShift), + begin, end, readVersion)) { + return false; + } + } + } + return true; +} + +template +bool checkMaxBetweenExclusiveImpl(Node256 *n, int begin, int end, + InternalVersionT readVersion, + ReadContext *tls) { + ++tls->range_read_node_scan_accum; + assume(-1 <= begin); + assume(begin <= 256); + assume(-1 <= end); + assume(end <= 256); + assume(begin < end); + assert(!(begin == -1 && end == 256)); + + static_assert(Node256::kMaxOfMaxTotalPages == 16); + auto *self = static_cast(n); + + { + int c = self->bitSet.firstSetGeq(begin + 1); + if (c >= 0 && c < end) { + Node *child = self->children[c]; + if (child->entryPresent && child->entry.rangeVersion > readVersion) { + return false; + } + begin = c; + } else { + return true; + } + // [begin, end) is now the half-open interval of children we're interested + // in. + assert(begin < end); + } + + const int firstPage = begin >> Node256::kMaxOfMaxShift; + const int lastPage = (end - 1) >> Node256::kMaxOfMaxShift; + // Check the only page if there's only one + if (firstPage == lastPage) { + if (self->maxOfMax[firstPage] <= readVersion) { + return true; + } + const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); + const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); + return scan16(self->childMaxVersion + (firstPage << Node256::kMaxOfMaxShift), - intraPageBegin, 16, readVersion)) { - return false; - } + intraPageBegin, intraPageEnd, readVersion); + } + // Check the first page + if (self->maxOfMax[firstPage] > readVersion) { + const int intraPageBegin = begin & (Node256::kMaxOfMaxPageSize - 1); + if (!scan16(self->childMaxVersion + + (firstPage << Node256::kMaxOfMaxShift), + intraPageBegin, 16, readVersion)) { + return false; } - // Check the last page - if (self->maxOfMax[lastPage] > readVersion) { - const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); - if (!scan16(self->childMaxVersion + - (lastPage << Node256::kMaxOfMaxShift), - 0, intraPageEnd, readVersion)) { - return false; - } + } + // Check the last page + if (self->maxOfMax[lastPage] > readVersion) { + const int intraPageEnd = end - (lastPage << Node256::kMaxOfMaxShift); + if (!scan16(self->childMaxVersion + + (lastPage << Node256::kMaxOfMaxShift), + 0, intraPageEnd, readVersion)) { + return false; } - // Check inner pages - return scan16(self->maxOfMax, firstPage + 1, lastPage, - readVersion); + } + // Check inner pages + return scan16(self->maxOfMax, firstPage + 1, lastPage, readVersion); +} + +template +bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end, + InternalVersionT readVersion, + ReadContext *tls) { + switch (n->getType()) { + case Type_Node0: + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion, tls); + case Type_Node3: { + return checkMaxBetweenExclusiveImpl(static_cast(n), begin, + end, readVersion, tls); + } + case Type_Node16: { + return checkMaxBetweenExclusiveImpl(static_cast(n), + begin, end, readVersion, tls); + } + case Type_Node48: { + return checkMaxBetweenExclusiveImpl(static_cast(n), + begin, end, readVersion, tls); + } + case Type_Node256: { + return checkMaxBetweenExclusiveImpl(static_cast(n), + begin, end, readVersion, tls); } default: // GCOVR_EXCL_LINE __builtin_unreachable(); // GCOVR_EXCL_LINE @@ -2402,6 +2459,58 @@ bool checkMaxBetweenExclusive(Node *n, int begin, int end, return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); } +bool checkMaxBetweenExclusive(Node0 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} + +bool checkMaxBetweenExclusive(Node3 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node16 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node16 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node48 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node48 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} + +#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__) +__attribute__((target("avx512f"))) bool +checkMaxBetweenExclusive(Node256 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} +__attribute__((target("default"))) +#endif + +bool checkMaxBetweenExclusive(Node256 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *tls) { + return checkMaxBetweenExclusiveImpl(n, begin, end, readVersion, tls); +} + Vector getSearchPath(Arena &arena, Node *n) { assert(n != nullptr); auto result = vector(arena); @@ -2424,7 +2533,8 @@ Vector getSearchPath(Arena &arena, Node *n) { // // Precondition: transitively, no child of n has a search path that's a longer // prefix of key than n -bool checkRangeStartsWith(Node *n, std::span key, int begin, +template +bool checkRangeStartsWith(NodeT *n, std::span key, int begin, int end, InternalVersionT readVersion, ReadContext *tls) { #if DEBUG_VERBOSE && !defined(NDEBUG) @@ -2501,6 +2611,15 @@ scan16(const InternalVersionT *vs, int begin, int end, template __attribute__((target("avx512f"))) bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end, InternalVersionT readVersion, ReadContext *); +template __attribute__((target("avx512f"))) bool +checkMaxBetweenExclusiveImpl(Node16 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *); +template __attribute__((target("avx512f"))) bool +checkMaxBetweenExclusiveImpl(Node48 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *); +template __attribute__((target("avx512f"))) bool +checkMaxBetweenExclusiveImpl(Node256 *n, int begin, int end, + InternalVersionT readVersion, ReadContext *); #endif // Returns a pointer the pointer to the newly inserted node in the tree. Caller @@ -3207,14 +3326,16 @@ PRESERVE_NONE void begin(CheckJob *job, CheckContext *context) { job->remaining = job->begin.subspan(0, job->lcp); if (job->remaining.size() == 0) { - MUSTTAIL return done_common_prefix_iter(job, context); + job->continuation = done_common_prefix_iter; + MUSTTAIL return job->continuation(job, context); } auto [c, maxV] = getChildAndMaxVersion(job->n, job->remaining[0]); job->maxV = maxV; job->child = c; if (job->child == nullptr) { - MUSTTAIL return done_common_prefix_iter(job, context); + job->continuation = done_common_prefix_iter; + MUSTTAIL return job->continuation(job, context); } job->continuation = commonPrefixIterTable[c.getType()];