From 0a9ac5967680f25720ad629fb99c8098961aa080 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Tue, 20 Aug 2024 10:36:04 -0700
Subject: [PATCH] Commit to non-simd Node3 implementations

---
 ConflictSet.cpp | 64 ++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 41 deletions(-)

diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index cef5990..8fb1bcb 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -768,22 +768,17 @@ private:
   BoundedFreeListAllocator<Node256> node256;
 };
 
-template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
-  static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
-
-  // cachegrind says the plain loop is fewer instructions and more mis-predicted
-  // branches. Microbenchmark says plain loop is faster. It's written in this
-  // weird "generic" way though in case someday we can use the simd
-  // implementation easily if we want.
-  if constexpr (std::is_same_v<NodeT, Node3>) {
-    Node3 *n = (Node3 *)self;
-    for (int i = 0; i < n->numChildren; ++i) {
-      if (n->index[i] == index) {
-        return i;
-      }
+int getNodeIndex(Node3 *self, uint8_t index) {
+  Node3 *n = (Node3 *)self;
+  for (int i = 0; i < n->numChildren; ++i) {
+    if (n->index[i] == index) {
+      return i;
     }
-    return -1;
   }
+  return -1;
+}
+
+int getNodeIndex(Node16 *self, uint8_t index) {
 #ifdef HAS_AVX
 
   // Based on https://www.the-paper-trail.org/post/art-paper-notes/
@@ -796,7 +791,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
   // keys aren't valid, we'll mask the results to only consider the valid ones
   // below.
   __m128i indices;
-  memcpy(&indices, self->index, NodeT::kMaxNodes);
+  memcpy(&indices, self->index, Node16::kMaxNodes);
   __m128i results = _mm_cmpeq_epi8(key_vec, indices);
 
   // Build a mask to select only the first node->num_children values from the
@@ -819,12 +814,11 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
   // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
   uint8x16_t indices;
-  memcpy(&indices, self->index, NodeT::kMaxNodes);
+  memcpy(&indices, self->index, Node16::kMaxNodes);
   // 0xff for each match
   uint16x8_t results =
       vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
-  static_assert(NodeT::kMaxNodes <= 16);
-  assume(self->numChildren <= NodeT::kMaxNodes);
+  assume(self->numChildren <= Node16::kMaxNodes);
   uint64_t mask = self->numChildren == 16
                       ? uint64_t(-1)
                       : (uint64_t(1) << (self->numChildren * 4)) - 1;
 
@@ -1077,22 +1071,18 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
   }
 }
 
-template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
-  static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
+Node *getChildGeq(Node0 *, int) { return nullptr; }
 
-  // cachegrind says the plain loop is fewer instructions and more mis-predicted
-  // branches. Microbenchmark says plain loop is faster. It's written in this
-  // weird "generic" way though so that someday we can use the simd
-  // implementation easily if we want.
-  if constexpr (std::is_same_v<NodeT, Node3>) {
-    Node3 *n = (Node3 *)self;
-    for (int i = 0; i < n->numChildren; ++i) {
-      if (n->index[i] >= child) {
-        return n->children[i];
-      }
+Node *getChildGeq(Node3 *n, int child) {
+  for (int i = 0; i < n->numChildren; ++i) {
+    if (n->index[i] >= child) {
+      return n->children[i];
     }
-    return nullptr;
   }
+  return nullptr;
+}
+
+Node *getChildGeq(Node16 *self, int child) {
   if (child > 255) {
     return nullptr;
   }
@@ -1100,7 +1090,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
 #ifdef HAS_AVX
   __m128i key_vec = _mm_set1_epi8(child);
   __m128i indices;
-  memcpy(&indices, self->index, NodeT::kMaxNodes);
+  memcpy(&indices, self->index, Node16::kMaxNodes);
   __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
   int mask = (1 << self->numChildren) - 1;
   uint32_t bitfield = _mm_movemask_epi8(results) & mask;
@@ -1110,8 +1100,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
   memcpy(&indices, self->index, sizeof(self->index));
   // 0xff for each leq
   auto results = vcleq_u8(vdupq_n_u8(child), indices);
-  static_assert(NodeT::kMaxNodes <= 16);
-  assume(self->numChildren <= NodeT::kMaxNodes);
+  assume(self->numChildren <= Node16::kMaxNodes);
   uint64_t mask = self->numChildren == 16
                       ? uint64_t(-1)
                       : (uint64_t(1) << (self->numChildren * 4)) - 1;
@@ -1136,13 +1125,6 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
 #endif
 }
 
-Node *getChildGeq(Node0 *, int) { return nullptr; }
-Node *getChildGeq(Node3 *self, int child) {
-  return getChildGeqSimd(self, child);
-}
-Node *getChildGeq(Node16 *self, int child) {
-  return getChildGeqSimd(self, child);
-}
 Node *getChildGeq(Node48 *self, int child) {
   int c = self->bitSet.firstSetGeq(child);
   if (c < 0) {
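
Note on the retained Node16 paths: the overloads above keep the SIMD lookups
while Node3 goes back to a plain loop. For reference, here is a minimal
standalone sketch of the two SSE tricks those Node16 overloads rely on. It is
illustrative only, not code from ConflictSet.cpp: Toy16, findKeySse, and
findGeqSse are hypothetical stand-ins, and it assumes SSE2 plus GCC/Clang's
__builtin_ctz.

#include <cstdint>
#include <cstring>
#include <emmintrin.h> // SSE2: _mm_set1_epi8, _mm_cmpeq_epi8, _mm_min_epu8,
                       //       _mm_movemask_epi8

struct Toy16 {
  static constexpr int kMaxNodes = 16;
  uint8_t index[kMaxNodes]; // child keys; only the first numChildren are valid
  int numChildren;
};

// Position of `key` among the valid entries, or -1. Mirrors the shape of
// getNodeIndex(Node16 *): broadcast the key, compare all 16 lanes at once,
// collapse to one bit per lane, mask off invalid lanes, take the first hit.
int findKeySse(const Toy16 *self, uint8_t key) {
  __m128i key_vec = _mm_set1_epi8(key); // 16 copies of the key
  __m128i indices;
  memcpy(&indices, self->index, Toy16::kMaxNodes);
  int bitfield = _mm_movemask_epi8(_mm_cmpeq_epi8(key_vec, indices));
  bitfield &= (1 << self->numChildren) - 1; // drop lanes past numChildren
  return bitfield == 0 ? -1 : __builtin_ctz(bitfield);
}

// Position of the first entry >= `wanted`, or -1, in the spirit of
// getChildGeq(Node16 *). SSE2 has no unsigned byte >= comparison, so test
// key == min(key, x) instead, which holds exactly when key <= x.
int findGeqSse(const Toy16 *self, int wanted) {
  if (wanted > 255)
    return -1;
  __m128i key_vec = _mm_set1_epi8((char)wanted);
  __m128i indices;
  memcpy(&indices, self->index, Toy16::kMaxNodes);
  __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
  int bitfield = _mm_movemask_epi8(results) & ((1 << self->numChildren) - 1);
  return bitfield == 0 ? -1 : __builtin_ctz(bitfield);
}

The NEON paths in the diff build their mask with 4 bits per byte lane rather
than 1, which is why the valid-lane mask there shifts by numChildren * 4; the
Arm blog post linked in the diff describes that movemask conversion.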