Use plain loop for Node3

Use assume
It works now that we fell back to the __builtin_unreachable based implementation for gcc.
2024-03-15 17:35:14 -07:00 · 2024-03-15 17:23:02 -07:00
1 changed file with 34 additions and 8 deletions
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -374,10 +374,8 @@ inline void Node16::copyChildrenAndKeyFrom(const Node48 &other) {
  other.bitSet.forEachInRange(
      [&](int c) {
        // Suppress a false positive -Waggressive-loop-optimizations warning
-        // in gcc. `assume` doesn't work for some reason.
-        if (!(i < Node16::kMaxNodes)) {
-          __builtin_unreachable(); // GCOVR_EXCL_LINE
-        }
+        // in gcc.
+        assume(i < Node16::kMaxNodes);
        index[i] = c;
        children[i] = other.children[other.index[c]];
        assert(children[i].child->parent == &other);
@@ -430,10 +428,8 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
  bitSet.forEachInRange(
      [&](int c) {
        // Suppress a false positive -Waggressive-loop-optimizations warning
-        // in gcc. `assume` doesn't work for some reason.
-        if (!(i < Node48::kMaxNodes)) {
-          __builtin_unreachable(); // GCOVR_EXCL_LINE
-        }
+        // in gcc.
+        assume(i < Node48::kMaxNodes);
        index[c] = i;
        children[i] = other.children[c];
        assert(children[i].child->parent == &other);
@@ -603,6 +599,21 @@ struct NodeAllocators {

 template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
  static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+
+  // cachegrind says the plain loop executes fewer instructions but has more
+  // mis-predicted branches. The microbenchmark says the plain loop is faster.
+  // It's written in this weird "generic" way so that we can easily use the
+  // simd implementation for Node3 someday if we want.
+  if constexpr (std::is_same_v<NodeT, Node3>) {
+    Node3 *n = (Node3 *)self;
+    for (int i = 0; i < n->numChildren; ++i) {
+      if (n->index[i] == index) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
 #ifdef HAS_AVX
  // Based on https://www.the-paper-trail.org/post/art-paper-notes/

@@ -725,6 +736,21 @@ Node *getChild(Node *self, uint8_t index) {

 template <class NodeT> int getChildGeqSimd(NodeT *self, int child) {
  static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+
+  // cachegrind says the plain loop executes fewer instructions but has more
+  // mis-predicted branches. The microbenchmark says the plain loop is faster.
+  // It's written in this weird "generic" way so that we can easily use the
+  // simd implementation for Node3 someday if we want.
+  if constexpr (std::is_same_v<NodeT, Node3>) {
+    Node3 *n = (Node3 *)self;
+    for (int i = 0; i < n->numChildren; ++i) {
+      if (n->index[i] >= child) {
+        return n->index[i];
+      }
+    }
+    return -1;
+  }
+
 #ifdef HAS_AVX
  __m128i key_vec = _mm_set1_epi8(child);
  __m128i indices;
Author	SHA1	Message	Date
Andrew Noyes	9f5a68e2c0	Use plain loop for Node3 All checks were successful Tests / Clang total: 932, passed: 932 Details Clang \|Total\|New\|Outstanding\|Fixed\|Trend \|:-:\|:-:\|:-:\|:-:\|:-: \|3\|0\|3\|0\|:zzz: Details Tests / Release [gcc] total: 932, passed: 932 Details Tests / Release [gcc,aarch64] total: 931, passed: 931 Details Tests / Coverage total: 930, passed: 930 Details weaselab/conflict-set/pipeline/head This commit looks good Details	2024-03-15 17:35:14 -07:00
Andrew Noyes	dfbb3ce5f1	Use assume It works now that we fell back to the __builtin_unreachable based implementation for gcc.	2024-03-15 17:23:02 -07:00