From 0a9ac5967680f25720ad629fb99c8098961aa080 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Tue, 20 Aug 2024 10:36:04 -0700
Subject: [PATCH] Commit to non-simd Node3 implementations

---
 ConflictSet.cpp | 64 ++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 41 deletions(-)

diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index cef5990..8fb1bcb 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -768,22 +768,17 @@ private:
   BoundedFreeListAllocator<Node256> node256;
 };
 
-template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
-  static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
-
-  // cachegrind says the plain loop is fewer instructions and more mis-predicted
-  // branches. Microbenchmark says plain loop is faster. It's written in this
-  // weird "generic" way though in case someday we can use the simd
-  // implementation easily if we want.
-  if constexpr (std::is_same_v<NodeT, Node3>) {
-    Node3 *n = (Node3 *)self;
-    for (int i = 0; i < n->numChildren; ++i) {
-      if (n->index[i] == index) {
-        return i;
-      }
+int getNodeIndex(Node3 *self, uint8_t index) {
+  Node3 *n = (Node3 *)self;
+  for (int i = 0; i < n->numChildren; ++i) {
+    if (n->index[i] == index) {
+      return i;
     }
-    return -1;
   }
+  return -1;
+}
+
+int getNodeIndex(Node16 *self, uint8_t index) {
 #ifdef HAS_AVX
 
   // Based on https://www.the-paper-trail.org/post/art-paper-notes/
@@ -796,7 +791,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
   // keys aren't valid, we'll mask the results to only consider the valid ones
   // below.
   __m128i indices;
-  memcpy(&indices, self->index, NodeT::kMaxNodes);
+  memcpy(&indices, self->index, Node16::kMaxNodes);
   __m128i results = _mm_cmpeq_epi8(key_vec, indices);
 
   // Build a mask to select only the first node->num_children values from the
@@ -819,12 +814,11 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
   // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
   uint8x16_t indices;
-  memcpy(&indices, self->index, NodeT::kMaxNodes);
+  memcpy(&indices, self->index, Node16::kMaxNodes);
   // 0xff for each match
   uint16x8_t results =
       vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
-  static_assert(NodeT::kMaxNodes <= 16);
-  assume(self->numChildren <= NodeT::kMaxNodes);
+  assume(self->numChildren <= Node16::kMaxNodes);
   uint64_t mask = self->numChildren == 16
                       ? uint64_t(-1)
                       : (uint64_t(1) << (self->numChildren * 4)) - 1;
 
@@ -1077,22 +1071,18 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
   }
 }
 
-template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
-  static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+Node *getChildGeq(Node0 *, int) { return nullptr; }
 
-  // cachegrind says the plain loop is fewer instructions and more mis-predicted
-  // branches. Microbenchmark says plain loop is faster. It's written in this
-  // weird "generic" way though so that someday we can use the simd
-  // implementation easily if we want.
-  if constexpr (std::is_same_v<NodeT, Node3>) {
-    Node3 *n = (Node3 *)self;
-    for (int i = 0; i < n->numChildren; ++i) {
-      if (n->index[i] >= child) {
-        return n->children[i];
-      }
+Node *getChildGeq(Node3 *n, int child) {
+  for (int i = 0; i < n->numChildren; ++i) {
+    if (n->index[i] >= child) {
+      return n->children[i];
     }
-    return nullptr;
   }
+  return nullptr;
+}
+
+Node *getChildGeq(Node16 *self, int child) {
   if (child > 255) {
     return nullptr;
   }
@@ -1100,7 +1090,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
 #ifdef HAS_AVX
   __m128i key_vec = _mm_set1_epi8(child);
   __m128i indices;
-  memcpy(&indices, self->index, NodeT::kMaxNodes);
+  memcpy(&indices, self->index, Node16::kMaxNodes);
   __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
   int mask = (1 << self->numChildren) - 1;
   uint32_t bitfield = _mm_movemask_epi8(results) & mask;
@@ -1110,8 +1100,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
   memcpy(&indices, self->index, sizeof(self->index));
   // 0xff for each leq
   auto results = vcleq_u8(vdupq_n_u8(child), indices);
-  static_assert(NodeT::kMaxNodes <= 16);
-  assume(self->numChildren <= NodeT::kMaxNodes);
+  assume(self->numChildren <= Node16::kMaxNodes);
   uint64_t mask = self->numChildren == 16
                       ? uint64_t(-1)
                       : (uint64_t(1) << (self->numChildren * 4)) - 1;
@@ -1136,13 +1125,6 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
 #endif
 }
 
-Node *getChildGeq(Node0 *, int) { return nullptr; }
-Node *getChildGeq(Node3 *self, int child) {
-  return getChildGeqSimd(self, child);
-}
-Node *getChildGeq(Node16 *self, int child) {
-  return getChildGeqSimd(self, child);
-}
 Node *getChildGeq(Node48 *self, int child) {
   int c = self->bitSet.firstSetGeq(child);
   if (c < 0) {
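
Note on the retained Node16 paths: the overloads above keep the SIMD lookups
while Node3 goes back to a plain loop. For reference, here is a minimal
standalone sketch of the two SSE tricks those Node16 overloads rely on. It is
illustrative only, not code from ConflictSet.cpp: Toy16, findKeySse, and
findGeqSse are hypothetical stand-ins, and it assumes SSE2 plus GCC/Clang's
__builtin_ctz.

#include <cstdint>
#include <cstring>
#include <emmintrin.h> // SSE2: _mm_set1_epi8, _mm_cmpeq_epi8, _mm_min_epu8,
                       //       _mm_movemask_epi8

struct Toy16 {
  static constexpr int kMaxNodes = 16;
  uint8_t index[kMaxNodes]; // child keys; only the first numChildren are valid
  int numChildren;
};

// Position of `key` among the valid entries, or -1. Mirrors the shape of
// getNodeIndex(Node16 *): broadcast the key, compare all 16 lanes at once,
// collapse to one bit per lane, mask off invalid lanes, take the first hit.
int findKeySse(const Toy16 *self, uint8_t key) {
  __m128i key_vec = _mm_set1_epi8(key); // 16 copies of the key
  __m128i indices;
  memcpy(&indices, self->index, Toy16::kMaxNodes);
  int bitfield = _mm_movemask_epi8(_mm_cmpeq_epi8(key_vec, indices));
  bitfield &= (1 << self->numChildren) - 1; // drop lanes past numChildren
  return bitfield == 0 ? -1 : __builtin_ctz(bitfield);
}

// Position of the first entry >= `wanted`, or -1, in the spirit of
// getChildGeq(Node16 *). SSE2 has no unsigned byte >= comparison, so test
// key == min(key, x) instead, which holds exactly when key <= x.
int findGeqSse(const Toy16 *self, int wanted) {
  if (wanted > 255)
    return -1;
  __m128i key_vec = _mm_set1_epi8((char)wanted);
  __m128i indices;
  memcpy(&indices, self->index, Toy16::kMaxNodes);
  __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
  int bitfield = _mm_movemask_epi8(results) & ((1 << self->numChildren) - 1);
  return bitfield == 0 ? -1 : __builtin_ctz(bitfield);
}

The NEON paths in the diff build their mask with 4 bits per byte lane rather
than 1, which is why the valid-lane mask there shifts by numChildren * 4; the
Arm blog post linked in the diff describes that movemask conversion.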