2 Commits

Author SHA1 Message Date
9f5a68e2c0 Use plain loop for Node3
All checks were successful
Tests / Clang total: 932, passed: 932
Clang
|Total|New|Outstanding|Fixed|Trend|
|:-:|:-:|:-:|:-:|:-:|
|3|0|3|0|:zzz:|
Tests / Release [gcc] total: 932, passed: 932
Tests / Release [gcc,aarch64] total: 931, passed: 931
Tests / Coverage total: 930, passed: 930
weaselab/conflict-set/pipeline/head This commit looks good
2024-03-15 17:35:14 -07:00
dfbb3ce5f1 Use assume
It works now that we fell back to the __builtin_unreachable based
implementation for gcc.
2024-03-15 17:23:02 -07:00

View File

@@ -374,10 +374,8 @@ inline void Node16::copyChildrenAndKeyFrom(const Node48 &other) {
other.bitSet.forEachInRange(
[&](int c) {
// Suppress a false positive -Waggressive-loop-optimizations warning
// in gcc. `assume` doesn't work for some reason.
if (!(i < Node16::kMaxNodes)) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
}
// in gcc
assume(i < Node16::kMaxNodes);
index[i] = c;
children[i] = other.children[other.index[c]];
assert(children[i].child->parent == &other);
@@ -430,10 +428,8 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
bitSet.forEachInRange(
[&](int c) {
// Suppress a false positive -Waggressive-loop-optimizations warning
// in gcc. `assume` doesn't work for some reason.
if (!(i < Node48::kMaxNodes)) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
}
// in gcc.
assume(i < Node48::kMaxNodes);
index[c] = i;
children[i] = other.children[c];
assert(children[i].child->parent == &other);
@@ -603,6 +599,21 @@ struct NodeAllocators {
template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
// cachegrind says the plain loop is fewer instructions and more mis-predicted
// branches. Microbenchmark says plain loop is faster. It's written in this
// weird "generic" way though in case someday we can use the simd
// implementation easily if we want.
if constexpr (std::is_same_v<NodeT, Node3>) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
}
}
return -1;
}
#ifdef HAS_AVX
// Based on https://www.the-paper-trail.org/post/art-paper-notes/
@@ -725,6 +736,21 @@ Node *getChild(Node *self, uint8_t index) {
template <class NodeT> int getChildGeqSimd(NodeT *self, int child) {
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
// cachegrind says the plain loop is fewer instructions and more mis-predicted
// branches. Microbenchmark says plain loop is faster. It's written in this
// weird "generic" way though in case someday we can use the simd
// implementation easily if we want.
if constexpr (std::is_same_v<NodeT, Node3>) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] >= child) {
return n->index[i];
}
}
return -1;
}
#ifdef HAS_AVX
__m128i key_vec = _mm_set1_epi8(child);
__m128i indices;