Switch to new machine for benchmarks

Skip lcp call in SearchStepWise if no partial key
Remove len < 8 check in longestCommonPrefix
2024-08-02 18:18:36 -07:00 · 2024-08-02 18:16:55 -07:00 · 2024-08-02 18:16:55 -07:00 · 2024-08-02 18:16:55 -07:00 · 2024-08-02 18:16:55 -07:00 · 2024-08-02 18:16:55 -07:00
3 changed files with 136 additions and 213 deletions
@@ -31,8 +31,14 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
                                               "MinSizeRel" "RelWithDebInfo")
 endif()

-add_compile_options(-fdata-sections -ffunction-sections -Wswitch-enum
-                    -Werror=switch-enum -fPIC)
+add_compile_options(
+  -fdata-sections
+  -ffunction-sections
+  -Wswitch-enum
+  -Werror=switch-enum
+  -fPIC
+  -g
+  -fno-omit-frame-pointer)

 set(full_relro_flags "-pie;LINKER:-z,relro,-z,now,-z,noexecstack")
 cmake_push_check_state()
@@ -395,8 +395,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node16 &other) {
  assert(numChildren == Node16::kMaxNodes);
  memset(index, -1, sizeof(index));
  memset(children, 0, sizeof(children));
+  const auto z = InternalVersionT::zero;
  for (auto &v : childMaxVersion) {
-    v = InternalVersionT::zero;
+    v = z;
  }
  memcpy(partialKey(), &other + 1, partialKeyLen);
  bitSet.init();
@@ -423,8 +424,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node48 &other) {
  nextFree = other.nextFree;
  memcpy(index, other.index, sizeof(index));
  memset(children, 0, sizeof(children));
+  const auto z = InternalVersionT::zero;
  for (auto &v : childMaxVersion) {
-    v = InternalVersionT::zero;
+    v = z;
  }
  for (int i = 0; i < numChildren; ++i) {
    children[i] = other.children[i];
@@ -442,8 +444,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
         kNodeCopySize);
  memset(index, -1, sizeof(index));
  memset(children, 0, sizeof(children));
+  const auto z = InternalVersionT::zero;
  for (auto &v : childMaxVersion) {
-    v = InternalVersionT::zero;
+    v = z;
  }
  nextFree = other.numChildren;
  bitSet = other.bitSet;
@@ -470,11 +473,12 @@ inline void Node256::copyChildrenAndKeyFrom(const Node48 &other) {
         kNodeCopySize);
  bitSet = other.bitSet;
  memset(children, 0, sizeof(children));
+  const auto z = InternalVersionT::zero;
  for (auto &v : childMaxVersion) {
-    v = InternalVersionT::zero;
+    v = z;
  }
  for (auto &v : maxOfMax) {
-    v = InternalVersionT::zero;
+    v = z;
  }
  bitSet.forEachSet([&](int c) {
    children[c] = other.children[other.index[c]];
@@ -491,8 +495,9 @@ inline void Node256::copyChildrenAndKeyFrom(const Node256 &other) {
  memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
         kNodeCopySize);
  memset(children, 0, sizeof(children));
+  const auto z = InternalVersionT::zero;
  for (auto &v : childMaxVersion) {
-    v = InternalVersionT::zero;
+    v = z;
  }
  bitSet = other.bitSet;
  bitSet.forEachSet([&](int c) {
@@ -619,13 +624,15 @@ template <class T> struct BoundedFreeListAllocator {
    T *result = allocate_helper(partialKeyCapacity);
    if constexpr (!std::is_same_v<T, Node0>) {
      memset(result->children, 0, sizeof(result->children));
+      const auto z = InternalVersionT::zero;
      for (auto &v : result->childMaxVersion) {
-        v = InternalVersionT::zero;
+        v = z;
      }
    }
    if constexpr (std::is_same_v<T, Node48> || std::is_same_v<T, Node256>) {
+      const auto z = InternalVersionT::zero;
      for (auto &v : result->maxOfMax) {
-        v = InternalVersionT::zero;
+        v = z;
      }
    }
    return result;
@@ -712,6 +719,9 @@ struct ReadContext {
  int64_t prefix_read_iterations_accum = 0;
  int64_t range_read_iterations_accum = 0;
  int64_t range_read_node_scan_accum = 0;
+  int64_t commits_accum = 0;
+  int64_t conflicts_accum = 0;
+  int64_t too_olds_accum = 0;
  ConflictSet::Impl *impl;
 };

@@ -730,6 +740,10 @@ struct WriteContext {
    int64_t write_bytes;
  } accum;

+  // Cache a copy of InternalVersionT::zero, so we don't need to do the TLS
+  // lookup as often.
+  InternalVersionT zero;
+
  WriteContext() { memset(&accum, 0, sizeof(accum)); }

  template <class T> T *allocate(int c) {
@@ -1122,24 +1136,19 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, WriteContext *tls) {

  insert3:
    auto *self3 = static_cast<Node3 *>(self);
-    ++self->numChildren;
-    int i = 0;
-    for (; i < self->numChildren - 1; ++i) {
-      if (int(self3->index[i]) > int(index)) {
-        memmove(self3->index + i + 1, self3->index + i,
-                self->numChildren - (i + 1));
-        memmove(self3->children + i + 1, self3->children + i,
-                (self->numChildren - (i + 1)) *
-                    sizeof(self3->children[0])); // NOLINT
-        memmove(self3->childMaxVersion + i + 1, self3->childMaxVersion + i,
-                (self->numChildren - (i + 1)) *
-                    sizeof(self3->childMaxVersion[0]));
+    int i = self->numChildren - 1;
+    for (; i >= 0; --i) {
+      if (int(self3->index[i]) < int(index)) {
        break;
      }
+      self3->index[i + 1] = self3->index[i];
+      self3->children[i + 1] = self3->children[i];
+      self3->childMaxVersion[i + 1] = self3->childMaxVersion[i];
    }
-    self3->index[i] = index;
-    auto &result = self3->children[i];
+    self3->index[i + 1] = index;
+    auto &result = self3->children[i + 1];
    result = nullptr;
+    ++self->numChildren;
    return result;
  }
  case Type_Node16: {
@@ -1153,71 +1162,21 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, WriteContext *tls) {
    }

  insert16:
-    auto *self16 = static_cast<Node16 *>(self);
    assert(self->getType() == Type_Node16);
-
-    ++self->numChildren;
-#ifdef HAS_AVX
-    __m128i key_vec = _mm_set1_epi8(index);
-    __m128i indices;
-    memcpy(&indices, self16->index, sizeof(self16->index));
-    __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
-    int mask = (1 << (self->numChildren - 1)) - 1;
-    uint32_t bitfield = _mm_movemask_epi8(results) & mask;
-    bitfield |= uint32_t(1) << (self->numChildren - 1);
-    int i = std::countr_zero(bitfield);
-    if (i < self->numChildren - 1) {
-      memmove(self16->index + i + 1, self16->index + i,
-              self->numChildren - (i + 1));
-      memmove(self16->children + i + 1, self16->children + i,
-              (self->numChildren - (i + 1)) *
-                  sizeof(self16->children[0])); // NOLINT
-      memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
-              (self->numChildren - (i + 1)) *
-                  sizeof(self16->childMaxVersion[0]));
-    }
-#elif defined(HAS_ARM_NEON)
-    uint8x16_t indices;
-    memcpy(&indices, self16->index, sizeof(self16->index));
-    // 0xff for each leq
-    auto results = vcleq_u8(vdupq_n_u8(index), indices);
-    uint64_t mask = (uint64_t(1) << ((self->numChildren - 1) * 4)) - 1;
-    // 0xf for each 0xff (within mask)
-    uint64_t bitfield =
-        vget_lane_u64(
-            vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)),
-            0) &
-        mask;
-    bitfield |= uint64_t(0xf) << ((self->numChildren - 1) * 4);
-    int i = std::countr_zero(bitfield) / 4;
-    if (i < self->numChildren - 1) {
-      memmove(self16->index + i + 1, self16->index + i,
-              self->numChildren - (i + 1));
-      memmove(self16->children + i + 1, self16->children + i,
-              (self->numChildren - (i + 1)) *
-                  sizeof(self16->children[0])); // NOLINT
-      memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
-              (self->numChildren - (i + 1)) *
-                  sizeof(self16->childMaxVersion[0]));
-    }
-#else
-    int i = 0;
-    for (; i < int(self->numChildren) - 1; ++i) {
-      if (int(self16->index[i]) > int(index)) {
-        memmove(self16->index + i + 1, self16->index + i,
-                self->numChildren - (i + 1));
-        memmove(self16->children + i + 1, self16->children + i,
-                (self->numChildren - (i + 1)) * sizeof(self16->children[0]));
-        memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
-                (self->numChildren - (i + 1)) *
-                    sizeof(self16->childMaxVersion[0]));
+    auto *self16 = static_cast<Node16 *>(self);
+    int i = self->numChildren - 1;
+    for (; i >= 0; --i) {
+      if (int(self16->index[i]) < int(index)) {
        break;
      }
+      self16->index[i + 1] = self16->index[i];
+      self16->children[i + 1] = self16->children[i];
+      self16->childMaxVersion[i + 1] = self16->childMaxVersion[i];
    }
-#endif
-    self16->index[i] = index;
-    auto &result = self16->children[i];
+    self16->index[i + 1] = index;
+    auto &result = self16->children[i + 1];
    result = nullptr;
+    ++self->numChildren;
    return result;
  }
  case Type_Node48: {
@@ -1472,7 +1431,7 @@ void maybeDownsize(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
      // that we have a new parent.
      setMaxVersion(child, impl, childMaxVersion);
      if (child->parent) {
-        rezero(child->parent, InternalVersionT::zero);
+        rezero(child->parent, tls->zero);
      }

      getInTree(self, impl) = child;
@@ -1552,36 +1511,24 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
    auto *parent3 = static_cast<Node3 *>(parent);
    int nodeIndex = getNodeIndex(parent3, parentsIndex);
    assert(nodeIndex >= 0);
-    memmove(parent3->index + nodeIndex, parent3->index + nodeIndex + 1,
-            sizeof(parent3->index[0]) *
-                (parent->numChildren - (nodeIndex + 1)));
-    memmove(parent3->children + nodeIndex, parent3->children + nodeIndex + 1,
-            sizeof(parent3->children[0]) * // NOLINT
-                (parent->numChildren - (nodeIndex + 1)));
-    memmove(parent3->childMaxVersion + nodeIndex,
-            parent3->childMaxVersion + nodeIndex + 1,
-            sizeof(parent3->childMaxVersion[0]) *
-                (parent->numChildren - (nodeIndex + 1)));
-
    --parent->numChildren;
+    for (int i = nodeIndex; i < parent->numChildren; ++i) {
+      parent3->index[i] = parent3->index[i + 1];
+      parent3->children[i] = parent3->children[i + 1];
+      parent3->childMaxVersion[i] = parent3->childMaxVersion[i + 1];
+    }
    assert(parent->numChildren > 0 || parent->entryPresent);
  } break;
  case Type_Node16: {
    auto *parent16 = static_cast<Node16 *>(parent);
    int nodeIndex = getNodeIndex(parent16, parentsIndex);
    assert(nodeIndex >= 0);
-    memmove(parent16->index + nodeIndex, parent16->index + nodeIndex + 1,
-            sizeof(parent16->index[0]) *
-                (parent->numChildren - (nodeIndex + 1)));
-    memmove(parent16->children + nodeIndex, parent16->children + nodeIndex + 1,
-            sizeof(parent16->children[0]) * // NOLINT
-                (parent->numChildren - (nodeIndex + 1)));
-    memmove(parent16->childMaxVersion + nodeIndex,
-            parent16->childMaxVersion + nodeIndex + 1,
-            sizeof(parent16->childMaxVersion[0]) *
-                (parent->numChildren - (nodeIndex + 1)));
-
    --parent->numChildren;
+    for (int i = nodeIndex; i < parent->numChildren; ++i) {
+      parent16->index[i] = parent16->index[i + 1];
+      parent16->children[i] = parent16->children[i + 1];
+      parent16->childMaxVersion[i] = parent16->childMaxVersion[i + 1];
+    }

    // By kMinChildrenNode16
    assert(parent->numChildren > 0);
@@ -1608,7 +1555,7 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
      parent48->index[parentIndex] = toRemoveChildrenIndex;
      parent48->reverseIndex[toRemoveChildrenIndex] = parentIndex;
    }
-    parent48->childMaxVersion[lastChildrenIndex] = InternalVersionT::zero;
+    parent48->childMaxVersion[lastChildrenIndex] = tls->zero;

    --parent->numChildren;

@@ -1726,15 +1673,38 @@ int firstNeqStride(const uint8_t *ap, const uint8_t *bp) {
 #endif
 }

+#if defined(__x86_64__) && !defined(__SANITIZE_THREAD__)
+__attribute__((target("avx512bw"))) int
+longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
+  int i = 0;
+  int end = cl & ~63;
+  while (i < end) {
+    const uint64_t eq =
+        _mm512_cmpeq_epi8_mask(_mm512_loadu_epi8(ap), _mm512_loadu_epi8(bp));
+    if (eq != uint64_t(-1)) {
+      return i + std::countr_one(eq);
+    }
+    i += 64;
+    ap += 64;
+    bp += 64;
+  }
+  if (i < cl) {
+    const uint64_t mask = (uint64_t(1) << (cl - i)) - 1;
+    const uint64_t eq = _mm512_cmpeq_epi8_mask(
+        _mm512_maskz_loadu_epi8(mask, ap), _mm512_maskz_loadu_epi8(mask, bp));
+    return i + std::countr_one(eq & mask);
+  }
+  assert(i == cl);
+  return i;
+}
+__attribute__((target("default")))
+#endif
+
 int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
  assume(cl >= 0);
  int i = 0;
  int end;

-  if (cl < 8) {
-    goto bytes;
-  }
-
  // kStride * kUnrollCount at a time
  end = cl & ~(kStride * kUnrollFactor - 1);
  while (i < end) {
@@ -1775,7 +1745,6 @@ int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
    bp += 8;
  }

-bytes:
  // byte at a time
  while (i < cl) {
    if (*ap != *bp) {
@@ -1808,10 +1777,13 @@ struct SearchStepWise {
    if (child == nullptr) {
      return true;
    }
-    int cl = std::min<int>(child->partialKeyLen, remaining.size() - 1);
-    int i = longestCommonPrefix(child->partialKey(), remaining.data() + 1, cl);
-    if (i != child->partialKeyLen) {
-      return true;
+    if (child->partialKeyLen > 0) {
+      int cl = std::min<int>(child->partialKeyLen, remaining.size() - 1);
+      int i =
+          longestCommonPrefix(child->partialKey(), remaining.data() + 1, cl);
+      if (i != child->partialKeyLen) {
+        return true;
+      }
    }
    n = child;
    remaining =
@@ -2949,8 +2921,7 @@ template <bool kBegin>
      child->partialKeyLen = 0;
      child->parent = *self;
      child->parentsIndex = key.front();
-      setMaxVersion(child, impl,
-                    kBegin ? writeVersion : InternalVersionT::zero);
+      setMaxVersion(child, impl, kBegin ? writeVersion : tls->zero);
    }

    self = &child;
@@ -2998,8 +2969,7 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
    n->entry.pointVersion = writeVersion;
    setMaxVersion(n, impl, writeVersion);
    n->entry.rangeVersion =
-        p == nullptr ? InternalVersionT::zero
-                     : std::max(p->entry.rangeVersion, InternalVersionT::zero);
+        p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
  } else {
    assert(writeVersion >= n->entry.pointVersion);
    n->entry.pointVersion = writeVersion;
@@ -3063,8 +3033,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
    ++tls->accum.entries_inserted;
    auto *p = nextLogical(beginNode);
    beginNode->entry.rangeVersion =
-        p == nullptr ? InternalVersionT::zero
-                     : std::max(p->entry.rangeVersion, InternalVersionT::zero);
+        p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
    beginNode->entry.pointVersion = writeVersion;
    assert(maxVersion(beginNode, impl) <= writeVersion);
    setMaxVersion(beginNode, impl, writeVersion);
@@ -3084,8 +3053,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
    ++tls->accum.entries_inserted;
    auto *p = nextLogical(endNode);
    endNode->entry.pointVersion =
-        p == nullptr ? InternalVersionT::zero
-                     : std::max(p->entry.rangeVersion, InternalVersionT::zero);
+        p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
    auto m = maxVersion(endNode, impl);
    setMaxVersion(endNode, impl,
                  std::max<InternalVersionT>(m, endNode->entry.pointVersion));
@@ -3162,9 +3130,6 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
  void check(const ReadRange *reads, Result *result, int count) {
    ReadContext tls;
    tls.impl = this;
-    int commits_accum = 0;
-    int conflicts_accum = 0;
-    int too_olds_accum = 0;
    int64_t check_byte_accum = 0;
    for (int i = 0; i < count; ++i) {
      const auto &r = reads[i];
@@ -3182,9 +3147,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
                                  InternalVersionT(reads[i].readVersion), &tls))
              ? Commit
              : Conflict;
-      commits_accum += result[i] == Commit;
-      conflicts_accum += result[i] == Conflict;
-      too_olds_accum += result[i] == TooOld;
+      tls.commits_accum += result[i] == Commit;
+      tls.conflicts_accum += result[i] == Conflict;
+      tls.too_olds_accum += result[i] == TooOld;
    }
    point_read_total.add(tls.point_read_accum);
    prefix_read_total.add(tls.prefix_read_accum);
@@ -3196,9 +3161,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
    point_read_iterations_total.add(tls.point_read_iterations_accum);
    prefix_read_iterations_total.add(tls.prefix_read_iterations_accum);
    range_read_iterations_total.add(tls.range_read_iterations_accum);
-    commits_total.add(commits_accum);
-    conflicts_total.add(conflicts_accum);
-    too_olds_total.add(too_olds_accum);
+    commits_total.add(tls.commits_accum);
+    conflicts_total.add(tls.conflicts_accum);
+    too_olds_total.add(tls.too_olds_accum);
    check_bytes_total.add(check_byte_accum);
  }

@@ -3206,7 +3171,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
    // There could be other conflict sets in the same thread. We need
    // InternalVersionT::zero to be correct for this conflict set for the
    // lifetime of the current call frame.
-    InternalVersionT::zero = oldestVersion;
+    InternalVersionT::zero = tls.zero = oldestVersion;

    assert(writeVersion >= newestVersionFullPrecision);

@@ -3317,7 +3282,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
    InternalVersionT oldestVersion{o};
    this->oldestVersionFullPrecision = o;
    this->oldestVersion = oldestVersion;
-    InternalVersionT::zero = oldestVersion;
+    InternalVersionT::zero = tls.zero = oldestVersion;
 #ifdef NDEBUG
    // This is here for performance reasons, since we want to amortize the cost
    // of storing the search path as a string. In tests, we want to exercise the
@@ -3367,7 +3332,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
    root->entry.pointVersion = this->oldestVersion;
    root->entry.rangeVersion = this->oldestVersion;

-    InternalVersionT::zero = this->oldestVersion;
+    InternalVersionT::zero = tls.zero = this->oldestVersion;

    // Intentionally not resetting totalBytes
  }
@@ -1,86 +1,38 @@
 A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys.

-Intended to replace FoundationDB's skip list.
+Intended as an alternative to FoundationDB's skip list.

-Hardware for all benchmarks is a mac m1 2020.
+Hardware for all benchmarks is an AMD Ryzen 9 7900 with (2x32GB) 5600MT/s CL28-34-34-89 1.35V RAM

-# FoundationDB's benchmark
+# Microbenchmark

 ## Skip list

-```
-New conflict set: 1.957 sec
-                  0.639 Mtransactions/sec
-                  2.555 Mkeys/sec
-Detect only:      1.845 sec
-                  0.678 Mtransactions/sec
-                  2.710 Mkeys/sec
-Skiplist only:    1.263 sec
-                  0.990 Mtransactions/sec
-                  3.960 Mkeys/sec
-Performance counters:
-               Build: 0.0546
-                 Add: 0.0563
-              Detect: 1.84
-              D.Sort: 0.412
-           D.Combine: 0.0141
-         D.CheckRead: 0.671
-   D.CheckIntraBatch: 0.0068
-        D.MergeWrite: 0.592
-      D.RemoveBefore: 0.146
-```
+|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
+|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
+|              172.03 |        5,812,791.77 |    0.4% |        3,130.62 |          879.00 |  3.562 |         509.23 |    0.0% |      0.01 | `point reads`
+|              167.44 |        5,972,130.71 |    0.2% |        3,065.14 |          862.27 |  3.555 |         494.30 |    0.0% |      0.01 | `prefix reads`
+|              238.77 |        4,188,130.84 |    0.9% |        3,589.93 |        1,259.30 |  2.851 |         637.12 |    0.0% |      0.01 | `range reads`
+|              424.01 |        2,358,426.70 |    0.2% |        5,620.05 |        2,242.35 |  2.506 |         854.80 |    1.7% |      0.01 | `point writes`
+|              418.45 |        2,389,780.56 |    0.4% |        5,525.07 |        2,211.05 |  2.499 |         831.71 |    1.7% |      0.01 | `prefix writes`
+|              254.87 |        3,923,568.88 |    2.6% |        3,187.01 |        1,366.50 |  2.332 |         529.11 |    2.7% |      0.02 | `range writes`
+|              675.96 |        1,479,374.50 |    3.3% |        7,735.41 |        3,468.60 |  2.230 |       1,386.02 |    1.8% |      0.01 | `monotonic increasing point writes`
+|          137,986.20 |            7,247.10 |    0.6% |      789,752.33 |      699,462.00 |  1.129 |     144,824.14 |    0.0% |      0.01 | `worst case for radix tree`
+|               21.63 |       46,231,564.03 |    1.0% |          448.00 |          107.14 |  4.181 |          84.00 |    0.0% |      0.01 | `create and destroy`

 ## Radix tree (this implementation)

-```
-New conflict set: 1.366 sec
-                  0.915 Mtransactions/sec
-                  3.660 Mkeys/sec
-Detect only:      1.248 sec
-                  1.002 Mtransactions/sec
-                  4.007 Mkeys/sec
-Skiplist only:    0.573 sec
-                  2.182 Mtransactions/sec
-                  8.730 Mkeys/sec
-Performance counters:
-               Build: 0.0594
-                 Add: 0.0572
-              Detect: 1.25
-              D.Sort: 0.418
-           D.Combine: 0.0149
-         D.CheckRead: 0.232
-   D.CheckIntraBatch: 0.0067
-        D.MergeWrite: 0.341
-      D.RemoveBefore: 0.232
-```
-
-# Our benchmark
-
-## Skip list
-
-|               ns/op |                op/s |    err% |     total | benchmark
-|--------------------:|--------------------:|--------:|----------:|:----------
-|              245.99 |        4,065,232.81 |    0.3% |      0.01 | `point reads`
-|              265.93 |        3,760,430.49 |    0.2% |      0.01 | `prefix reads`
-|              485.30 |        2,060,569.50 |    0.2% |      0.01 | `range reads`
-|              449.60 |        2,224,195.17 |    0.4% |      0.01 | `point writes`
-|              441.76 |        2,263,688.18 |    1.1% |      0.01 | `prefix writes`
-|              245.42 |        4,074,647.54 |    2.4% |      0.02 | `range writes`
-|              572.80 |        1,745,810.06 |    1.3% |      0.01 | `monotonic increasing point writes`
-|          154,819.33 |            6,459.14 |    0.9% |      0.01 | `worst case for radix tree`
-
-## Radix tree (this implementation)
-
-|               ns/op |                op/s |    err% |     total | benchmark
-|--------------------:|--------------------:|--------:|----------:|:----------
-|               20.25 |       49,372,759.86 |    0.3% |      0.01 | `point reads`
-|               23.58 |       42,401,298.00 |    0.3% |      0.01 | `prefix reads`
-|               64.12 |       15,595,463.14 |    0.8% |      0.01 | `range reads`
-|               29.50 |       33,903,101.20 |    0.7% |      0.01 | `point writes`
-|               46.76 |       21,384,036.19 |    1.2% |      0.01 | `prefix writes`
-|               51.25 |       19,512,195.12 |    0.0% |      0.01 | `range writes`
-|              109.51 |        9,131,469.31 |    3.6% |      0.01 | `monotonic increasing point writes`
-|        1,153,875.00 |              866.65 |    1.6% |      0.01 | `worst case for radix tree`
+|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
+|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
+|               17.03 |       58,732,967.93 |    0.6% |          276.28 |           87.96 |  3.141 |          52.15 |    0.4% |      0.01 | `point reads`
+|               19.52 |       51,239,158.04 |    0.3% |          367.16 |          101.50 |  3.617 |          61.92 |    0.3% |      0.01 | `prefix reads`
+|               47.74 |       20,947,676.63 |    0.5% |          998.16 |          247.43 |  4.034 |         161.64 |    0.2% |      0.01 | `range reads`
+|               23.14 |       43,207,824.89 |    0.4% |          408.18 |          121.64 |  3.356 |          70.20 |    0.3% |      0.01 | `point writes`
+|               38.02 |       26,302,115.66 |    0.1% |          709.72 |          199.70 |  3.554 |         134.26 |    0.3% |      0.01 | `prefix writes`
+|               44.28 |       22,583,559.17 |    0.9% |          825.19 |          233.10 |  3.540 |         141.48 |    0.2% |      0.01 | `range writes`
+|               85.50 |       11,695,990.63 |    0.5% |        1,488.16 |          455.68 |  3.266 |         289.22 |    0.1% |      0.01 | `monotonic increasing point writes`
+|          338,388.50 |            2,955.18 |    3.3% |    4,097,087.00 |    1,809,996.00 |  2.264 |     759,645.00 |    0.0% |      0.01 | `worst case for radix tree`
+|               84.84 |       11,787,313.59 |    1.4% |        1,716.02 |          440.50 |  3.896 |         271.00 |    0.0% |      0.01 | `create and destroy`

 # "Real data" test

@@ -89,13 +41,13 @@ Point queries only, best of three runs. Gc ratio is the ratio of time spent doin
 ## skip list

 ```
-Check: 11.3385 seconds, 329.718 MB/s, Add: 5.35612 seconds, 131.072 MB/s, Gc ratio: 45.7173%
+Check: 4.47891 seconds, 364.05 MB/s, Add: 4.55599 seconds, 123.058 MB/s, Gc ratio: 37.1145%
 ```

 ## radix tree

 ```
-Check: 2.60639 seconds, 1434.36 MB/s, Add: 2.10911 seconds, 332.86 MB/s, Gc ratio: 46.3071%
+Check: 1.05813 seconds, 1540.97 MB/s, Add: 1.32071 seconds, 424.508 MB/s, Gc ratio: 42.2067%
 ```

 ## hash table
@@ -103,5 +55,5 @@ Check: 2.60639 seconds, 1434.36 MB/s, Add: 2.10911 seconds, 332.86 MB/s, Gc rati
 (The hash table implementation doesn't work on range queries, and its purpose is to provide an idea of how fast point queries can be)

 ```
-Check: 1.83386 seconds, 2038.6 MB/s, Add: 0.601411 seconds, 1167.32 MB/s, Gc ratio: 48.9776%
+Check: 0.804094 seconds, 2027.81 MB/s, Add: 0.652952 seconds, 858.645 MB/s, Gc ratio: 35.3885%
 ```
Author	SHA1	Message	Date
andrew	d1dc1247e1	Switch to new machine for benchmarks Tests / Clang total: 1479, passed: 1479 Details Clang \|Total\|New\|Outstanding\|Fixed\|Trend \|:-:\|:-:\|:-:\|:-:\|:-: \|0\|0\|0\|0\|:clap: Details Tests / Debug total: 1477, passed: 1477 Details weaselab/conflict-set/pipeline/head There was a failure building this commit Details	2024-08-02 18:18:36 -07:00
andrew	f1ad68109a	Skip lcp call in SearchStepWise if no partial key	2024-08-02 18:16:55 -07:00
andrew	c4443bc355	Remove len < 8 check in longestCommonPrefix Micro benchmarks look a tiny bit better /shrug	2024-08-02 18:16:55 -07:00
andrew	857b402fe2	Add an avx512bw longestCommonPrefix	2024-08-02 18:16:55 -07:00
andrew	9b3e1b219b	Remove small memmoves in erase	2024-08-02 18:16:55 -07:00
andrew	ab52c63935	Move local accum's into ReadContext	2024-08-02 18:16:55 -07:00
andrew	bad9d7ced8	Avoid some tls lookups for InternalVersionT::zero	2024-08-02 18:16:55 -07:00
andrew	c8d9dc034d	Remove memmove call for inserting into Node16	2024-08-02 18:16:55 -07:00
andrew	72168ef6a3	Avoid memmove when inserting into Node3	2024-08-02 18:16:55 -07:00
andrew	620a0afd2a	Add debug symbols and frame pointer So that perf works out of the box. Leave it in the release artifacts/do whatever cpack does by default for now.	2024-08-02 18:16:55 -07:00