From 9f5a68e2c066400b19e36e3a2237221b74b10a4c Mon Sep 17 00:00:00 2001
From: Andrew Noyes <andrew@weaselab.dev>
Date: Fri, 15 Mar 2024 17:35:14 -0700
Subject: [PATCH] Use plain loop for Node3

---
 ConflictSet.cpp | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index 742abae..f4bfef0 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -599,6 +599,21 @@ struct NodeAllocators {
 
 template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
   static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+
+  // cachegrind reports that the plain loop executes fewer instructions but
+  // incurs more mispredicted branches; the microbenchmark shows it is faster.
+  // It stays an `if constexpr` branch of this generic template so that
+  // Node3 can easily go back to the SIMD implementation if we ever want.
+  if constexpr (std::is_same_v<NodeT, Node3>) {
+    Node3 *n = (Node3 *)self;
+    for (int i = 0; i < n->numChildren; ++i) {
+      if (n->index[i] == index) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
 #ifdef HAS_AVX
   // Based on https://www.the-paper-trail.org/post/art-paper-notes/
 
@@ -721,6 +736,21 @@ Node *getChild(Node *self, uint8_t index) {
 
 template <class NodeT> int getChildGeqSimd(NodeT *self, int child) {
   static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+
+  // cachegrind reports that the plain loop executes fewer instructions but
+  // incurs more mispredicted branches; the microbenchmark shows it is faster.
+  // It stays an `if constexpr` branch of this generic template so that
+  // Node3 can easily go back to the SIMD implementation if we ever want.
+  if constexpr (std::is_same_v<NodeT, Node3>) {
+    Node3 *n = (Node3 *)self;
+    for (int i = 0; i < n->numChildren; ++i) {
+      if (n->index[i] >= child) {
+        return n->index[i];
+      }
+    }
+    return -1;
+  }
+
 #ifdef HAS_AVX
   __m128i key_vec = _mm_set1_epi8(child);
   __m128i indices;