5 Commits

Author SHA1 Message Date
0a9ac59676 Commit to non-simd Node3 implementations
Some checks failed
Tests / Clang total: 2620, passed: 2620
Clang

|Total|New|Outstanding|Fixed|Trend|
|:-:|:-:|:-:|:-:|:-:|
|0|0|0|0|:clap:|
Tests / Debug total: 2618, passed: 2618
Tests / SIMD fallback total: 2620, passed: 2620
Tests / Release [gcc] total: 2620, passed: 2620
GNU C Compiler (gcc)

|Total|New|Outstanding|Fixed|Trend|
|:-:|:-:|:-:|:-:|:-:|
|0|0|0|0|:clap:|
Tests / Release [gcc,aarch64] total: 1957, passed: 1957
Tests / Coverage total: 1967, failed: 1, passed: 1966
weaselab/conflict-set/pipeline/head There was a failure building this commit
2024-08-20 10:36:04 -07:00
e3a77ed773 Remove unnecessary casts 2024-08-20 10:30:27 -07:00
cdf9a8a7b0 Save 8 bytes in Node3 2024-08-20 10:30:07 -07:00
305dfdd52f Change whitespace in node structs for consistency 2024-08-20 09:57:44 -07:00
7261c91492 Remove Node48::nextFree, and improve padding to save 8 bytes 2024-08-20 09:51:29 -07:00

View File

@@ -197,7 +197,6 @@ struct Node {
/* end section that's copied to the next node */
uint8_t *partialKey();
Type getType() const { return type; }
int32_t getCapacity() const { return partialKeyCapacity; }
@@ -221,84 +220,83 @@ constexpr int kNodeCopySize =
struct Node0 : Node {
constexpr static auto kType = Type_Node0;
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node0 &other);
void copyChildrenAndKeyFrom(const struct Node3 &other);
size_t size() const { return sizeof(Node0) + getCapacity(); }
};
struct Node3 : Node {
constexpr static auto kMaxNodes = 3;
constexpr static auto kType = Type_Node3;
// Sorted
uint8_t index[kMaxNodes];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
// Sorted
uint8_t index[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node0 &other);
void copyChildrenAndKeyFrom(const Node3 &other);
void copyChildrenAndKeyFrom(const struct Node16 &other);
size_t size() const { return sizeof(Node3) + getCapacity(); }
};
struct Node16 : Node {
constexpr static auto kType = Type_Node16;
constexpr static auto kMaxNodes = 16;
// Sorted
uint8_t index[kMaxNodes];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
// Sorted
uint8_t index[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node3 &other);
void copyChildrenAndKeyFrom(const Node16 &other);
void copyChildrenAndKeyFrom(const struct Node48 &other);
size_t size() const { return sizeof(Node16) + getCapacity(); }
};
struct Node48 : Node {
constexpr static auto kType = Type_Node48;
constexpr static auto kMaxNodes = 48;
BitSet bitSet;
int8_t nextFree;
int8_t index[256];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
uint8_t reverseIndex[kMaxNodes];
constexpr static int kMaxOfMaxPageSize = 16;
constexpr static int kMaxOfMaxShift =
std::countr_zero(uint32_t(kMaxOfMaxPageSize));
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
BitSet bitSet;
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
uint8_t reverseIndex[kMaxNodes];
int8_t index[256];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node16 &other);
void copyChildrenAndKeyFrom(const Node48 &other);
void copyChildrenAndKeyFrom(const struct Node256 &other);
size_t size() const { return sizeof(Node48) + getCapacity(); }
};
struct Node256 : Node {
constexpr static auto kType = Type_Node256;
BitSet bitSet;
Node *children[256];
InternalVersionT childMaxVersion[256];
constexpr static auto kMaxNodes = 256;
constexpr static int kMaxOfMaxPageSize = 16;
constexpr static int kMaxOfMaxShift =
std::countr_zero(uint32_t(kMaxOfMaxPageSize));
constexpr static int kMaxOfMaxTotalPages = 256 / kMaxOfMaxPageSize;
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
BitSet bitSet;
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node48 &other);
void copyChildrenAndKeyFrom(const Node256 &other);
size_t size() const { return sizeof(Node256) + getCapacity(); }
};
@@ -323,7 +321,7 @@ inline void Node3::copyChildrenAndKeyFrom(const Node0 &other) {
inline void Node3::copyChildrenAndKeyFrom(const Node3 &other) {
memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
kNodeCopySize);
memcpy(index, other.index, sizeof(*this) - sizeof(Node));
memcpy(children, other.children, sizeof(*this) - sizeof(Node));
memcpy(partialKey(), &other + 1, partialKeyLen);
for (int i = 0; i < numChildren; ++i) {
assert(children[i]->parent == &other);
@@ -404,7 +402,6 @@ inline void Node48::copyChildrenAndKeyFrom(const Node16 &other) {
}
memcpy(partialKey(), &other + 1, partialKeyLen);
bitSet.init();
nextFree = Node16::kMaxNodes;
int i = 0;
for (auto x : other.index) {
bitSet.set(x);
@@ -424,7 +421,6 @@ inline void Node48::copyChildrenAndKeyFrom(const Node48 &other) {
memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
kNodeCopySize);
bitSet = other.bitSet;
nextFree = other.nextFree;
memcpy(index, other.index, sizeof(index));
memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
@@ -451,7 +447,6 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
for (auto &v : childMaxVersion) {
v = z;
}
nextFree = other.numChildren;
bitSet = other.bitSet;
int i = 0;
bitSet.forEachSet([&](int c) {
@@ -773,22 +768,17 @@ private:
BoundedFreeListAllocator<Node256> node256;
};
template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
// cachegrind says the plain loop is fewer instructions and more mis-predicted
// branches. Microbenchmark says plain loop is faster. It's written in this
// weird "generic" way though in case someday we can use the simd
// implementation easily if we want.
if constexpr (std::is_same_v<NodeT, Node3>) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
}
int getNodeIndex(Node3 *self, uint8_t index) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
}
return -1;
}
return -1;
}
int getNodeIndex(Node16 *self, uint8_t index) {
#ifdef HAS_AVX
// Based on https://www.the-paper-trail.org/post/art-paper-notes/
@@ -801,7 +791,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
// keys aren't valid, we'll mask the results to only consider the valid ones
// below.
__m128i indices;
memcpy(&indices, self->index, NodeT::kMaxNodes);
memcpy(&indices, self->index, Node16::kMaxNodes);
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
// Build a mask to select only the first node->num_children values from the
@@ -824,12 +814,11 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
uint8x16_t indices;
memcpy(&indices, self->index, NodeT::kMaxNodes);
memcpy(&indices, self->index, Node16::kMaxNodes);
// 0xff for each match
uint16x8_t results =
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
static_assert(NodeT::kMaxNodes <= 16);
assume(self->numChildren <= NodeT::kMaxNodes);
assume(self->numChildren <= Node16::kMaxNodes);
uint64_t mask = self->numChildren == 16
? uint64_t(-1)
: (uint64_t(1) << (self->numChildren * 4)) - 1;
@@ -1082,22 +1071,18 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
}
}
template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
Node *getChildGeq(Node0 *, int) { return nullptr; }
// cachegrind says the plain loop is fewer instructions and more mis-predicted
// branches. Microbenchmark says plain loop is faster. It's written in this
// weird "generic" way though so that someday we can use the simd
// implementation easily if we want.
if constexpr (std::is_same_v<NodeT, Node3>) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] >= child) {
return n->children[i];
}
Node *getChildGeq(Node3 *n, int child) {
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] >= child) {
return n->children[i];
}
return nullptr;
}
return nullptr;
}
Node *getChildGeq(Node16 *self, int child) {
if (child > 255) {
return nullptr;
}
@@ -1105,7 +1090,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
#ifdef HAS_AVX
__m128i key_vec = _mm_set1_epi8(child);
__m128i indices;
memcpy(&indices, self->index, NodeT::kMaxNodes);
memcpy(&indices, self->index, Node16::kMaxNodes);
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
int mask = (1 << self->numChildren) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
@@ -1115,8 +1100,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
memcpy(&indices, self->index, sizeof(self->index));
// 0xff for each leq
auto results = vcleq_u8(vdupq_n_u8(child), indices);
static_assert(NodeT::kMaxNodes <= 16);
assume(self->numChildren <= NodeT::kMaxNodes);
assume(self->numChildren <= Node16::kMaxNodes);
uint64_t mask = self->numChildren == 16
? uint64_t(-1)
: (uint64_t(1) << (self->numChildren * 4)) - 1;
@@ -1141,13 +1125,6 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
#endif
}
Node *getChildGeq(Node0 *, int) { return nullptr; }
Node *getChildGeq(Node3 *self, int child) {
return getChildGeqSimd(self, child);
}
Node *getChildGeq(Node16 *self, int child) {
return getChildGeqSimd(self, child);
}
Node *getChildGeq(Node48 *self, int child) {
int c = self->bitSet.firstSetGeq(child);
if (c < 0) {
@@ -1360,7 +1337,7 @@ Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
auto *self3 = static_cast<Node3 *>(self);
int i = self->numChildren - 1;
for (; i >= 0; --i) {
if (int(self3->index[i]) < int(index)) {
if (self3->index[i] < index) {
break;
}
self3->index[i + 1] = self3->index[i];
@@ -1390,7 +1367,7 @@ Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
auto *self16 = static_cast<Node16 *>(self);
int i = self->numChildren - 1;
for (; i >= 0; --i) {
if (int(self16->index[i]) < int(index)) {
if (self16->index[i] < index) {
break;
}
self16->index[i + 1] = self16->index[i];
@@ -1419,9 +1396,7 @@ Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
insert48:
auto *self48 = static_cast<Node48 *>(self);
self48->bitSet.set(index);
++self->numChildren;
assert(self48->nextFree < 48);
int nextFree = self48->nextFree++;
auto nextFree = self48->numChildren++;
self48->index[index] = nextFree;
self48->reverseIndex[nextFree] = index;
auto &result = self48->children[nextFree];
@@ -1812,7 +1787,7 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
parent48->bitSet.reset(parentsIndex);
int8_t toRemoveChildrenIndex =
std::exchange(parent48->index[parentsIndex], -1);
int8_t lastChildrenIndex = --parent48->nextFree;
auto lastChildrenIndex = --parent48->numChildren;
assert(toRemoveChildrenIndex >= 0);
assert(lastChildrenIndex >= 0);
if (toRemoveChildrenIndex != lastChildrenIndex) {
@@ -1831,8 +1806,6 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
}
parent48->childMaxVersion[lastChildrenIndex] = tls->zero;
--parent->numChildren;
if (needsDownsize(parent48)) {
downsize(parent48, tls, impl, result);
}