From 9f5a68e2c066400b19e36e3a2237221b74b10a4c Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Fri, 15 Mar 2024 17:35:14 -0700
Subject: [PATCH] Use plain loop for Node3

---
 ConflictSet.cpp | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index 742abae..f4bfef0 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -599,6 +599,21 @@ struct NodeAllocators {
 
 template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
   static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+
+  // cachegrind says the plain loop is fewer instructions and more mis-predicted
+  // branches. Microbenchmark says plain loop is faster. It's written in this
+  // weird "generic" way though in case someday we can use the simd
+  // implementation easily if we want.
+  if constexpr (std::is_same_v<NodeT, Node3>) {
+    Node3 *n = (Node3 *)self;
+    for (int i = 0; i < n->numChildren; ++i) {
+      if (n->index[i] == index) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
 #ifdef HAS_AVX
   // Based on https://www.the-paper-trail.org/post/art-paper-notes/
 
@@ -721,6 +736,21 @@ Node *getChild(Node *self, uint8_t index) {
 
 template <class NodeT> int getChildGeqSimd(NodeT *self, int child) {
   static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+
+  // cachegrind says the plain loop is fewer instructions and more mis-predicted
+  // branches. Microbenchmark says plain loop is faster. It's written in this
+  // weird "generic" way though in case someday we can use the simd
+  // implementation easily if we want.
+  if constexpr (std::is_same_v<NodeT, Node3>) {
+    Node3 *n = (Node3 *)self;
+    for (int i = 0; i < n->numChildren; ++i) {
+      if (n->index[i] >= child) {
+        return n->index[i];
+      }
+    }
+    return -1;
+  }
+
 #ifdef HAS_AVX
   __m128i key_vec = _mm_set1_epi8(child);
   __m128i indices;