Reordering seems to improve codegen

WIP
Use preserve_none and put continuation array in CheckAll
2024-09-23 15:28:51 -07:00 · 2024-09-23 15:11:48 -07:00 · 2024-09-23 14:53:16 -07:00 · 2024-09-23 13:32:56 -07:00 · 2024-09-23 12:52:30 -07:00 · 2024-09-21 14:28:15 -07:00
18 changed files with 466 additions and 238 deletions
@@ -24,6 +24,14 @@ repos:
        entry: "^#define SHOW_MEMORY 1$"
        language: pygrep
        types: [c++]
+  - repo: local
+    hooks:
+      - id: sim cache misses check
+        name: disallow checking in SIM_CACHE_MISSES=1
+        description: disallow checking in SIM_CACHE_MISSES=1
+        entry: "^#define SIM_CACHE_MISSES 1$"
+        language: pygrep
+        types: [c++]
  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: a23f6b85d0fdd5bb9d564e2579e678033debbdff # frozen: v0.10.0.1
    hooks:
@@ -350,6 +350,11 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
  set_target_properties(server_bench PROPERTIES SKIP_BUILD_RPATH ON)

  add_executable(interleaving_test InterleavingTest.cpp)
+  # work around lack of musttail for gcc
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+    target_compile_options(interleaving_test PRIVATE -Og
+                                                     -foptimize-sibling-calls)
+  endif()
  target_link_libraries(interleaving_test PRIVATE nanobench)
 endif()

@@ -48,6 +48,17 @@ limitations under the License.
 #endif
 #endif

+#define SIM_CACHE_MISSES 0
+#if SIM_CACHE_MISSES
+constexpr void simCacheMiss(void *x) {
+  if (x) {
+    _mm_clflush(x);
+  }
+}
+#else
+constexpr void simCacheMiss(void *) {}
+#endif
+
 #include <memcheck.h>

 using namespace weaselab;
@@ -766,8 +777,6 @@ private:

 int getNodeIndex(Node3 *self, uint8_t index) {
  Node3 *n = (Node3 *)self;
-  assume(n->numChildren >= 1);
-  assume(n->numChildren <= 3);
  for (int i = 0; i < n->numChildren; ++i) {
    if (n->index[i] == index) {
      return i;
@@ -776,18 +785,6 @@ int getNodeIndex(Node3 *self, uint8_t index) {
  return -1;
 }

-int getNodeIndexExists(Node3 *self, uint8_t index) {
-  Node3 *n = (Node3 *)self;
-  assume(n->numChildren >= 1);
-  assume(n->numChildren <= 3);
-  for (int i = 0; i < n->numChildren; ++i) {
-    if (n->index[i] == index) {
-      return i;
-    }
-  }
-  __builtin_unreachable(); // GCOVR_EXCL_LINE
-}
-
 int getNodeIndex(Node16 *self, uint8_t index) {

 #ifdef HAS_AVX
@@ -848,62 +845,31 @@ int getNodeIndex(Node16 *self, uint8_t index) {
 #endif
 }

-int getNodeIndexExists(Node16 *self, uint8_t index) {
-
-#ifdef HAS_AVX
-  __m128i key_vec = _mm_set1_epi8(index);
-  __m128i indices;
-  memcpy(&indices, self->index, Node16::kMaxNodes);
-  __m128i results = _mm_cmpeq_epi8(key_vec, indices);
-  uint32_t mask = (1 << self->numChildren) - 1;
-  uint32_t bitfield = _mm_movemask_epi8(results) & mask;
-  assume(bitfield != 0);
-  return std::countr_zero(bitfield);
-#elif defined(HAS_ARM_NEON)
-  // Based on
-  // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
-
-  uint8x16_t indices;
-  memcpy(&indices, self->index, Node16::kMaxNodes);
-  // 0xff for each match
-  uint16x8_t results =
-      vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
-  assume(self->numChildren <= Node16::kMaxNodes);
-  uint64_t mask = self->numChildren == 16
-                      ? uint64_t(-1)
-                      : (uint64_t(1) << (self->numChildren * 4)) - 1;
-  // 0xf for each match in valid range
-  uint64_t bitfield =
-      vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) & mask;
-  assume(bitfield != 0);
-  return std::countr_zero(bitfield) / 4;
-#else
-  for (int i = 0; i < self->numChildren; ++i) {
-    if (self->index[i] == index) {
-      return i;
-    }
-  }
-  __builtin_unreachable(); // GCOVR_EXCL_LINE
-#endif
-}
-
 // Precondition - an entry for index must exist in the node
 Node *&getChildExists(Node3 *self, uint8_t index) {
-  return self->children[getNodeIndexExists(self, index)];
+  auto &result = self->children[getNodeIndex(self, index)];
+  simCacheMiss(result);
+  return result;
 }
 // Precondition - an entry for index must exist in the node
 Node *&getChildExists(Node16 *self, uint8_t index) {
-  return self->children[getNodeIndexExists(self, index)];
+  auto &result = self->children[getNodeIndex(self, index)];
+  simCacheMiss(result);
+  return result;
 }
 // Precondition - an entry for index must exist in the node
 Node *&getChildExists(Node48 *self, uint8_t index) {
  assert(self->bitSet.test(index));
-  return self->children[self->index[index]];
+  auto &result = self->children[self->index[index]];
+  simCacheMiss(result);
+  return result;
 }
 // Precondition - an entry for index must exist in the node
 Node *&getChildExists(Node256 *self, uint8_t index) {
  assert(self->bitSet.test(index));
-  return self->children[index];
+  auto &result = self->children[index];
+  simCacheMiss(result);
+  return result;
 }

 // Precondition - an entry for index must exist in the node
@@ -938,12 +904,12 @@ InternalVersionT maxVersion(Node *n) {
    __builtin_unreachable(); // GCOVR_EXCL_LINE
  case Type_Node3: {
    auto *n3 = static_cast<Node3 *>(n);
-    int i = getNodeIndexExists(n3, index);
+    int i = getNodeIndex(n3, index);
    return n3->childMaxVersion[i];
  }
  case Type_Node16: {
    auto *n16 = static_cast<Node16 *>(n);
-    int i = getNodeIndexExists(n16, index);
+    int i = getNodeIndex(n16, index);
    return n16->childMaxVersion[i];
  }
  case Type_Node48: {
@@ -971,12 +937,12 @@ InternalVersionT exchangeMaxVersion(Node *n, InternalVersionT newMax) {
    __builtin_unreachable(); // GCOVR_EXCL_LINE
  case Type_Node3: {
    auto *n3 = static_cast<Node3 *>(n);
-    int i = getNodeIndexExists(n3, index);
+    int i = getNodeIndex(n3, index);
    return std::exchange(n3->childMaxVersion[i], newMax);
  }
  case Type_Node16: {
    auto *n16 = static_cast<Node16 *>(n);
-    int i = getNodeIndexExists(n16, index);
+    int i = getNodeIndex(n16, index);
    return std::exchange(n16->childMaxVersion[i], newMax);
  }
  case Type_Node48: {
@@ -1005,13 +971,13 @@ void setMaxVersion(Node *n, InternalVersionT newMax) {
    __builtin_unreachable(); // GCOVR_EXCL_LINE
  case Type_Node3: {
    auto *n3 = static_cast<Node3 *>(n);
-    int i = getNodeIndexExists(n3, index);
+    int i = getNodeIndex(n3, index);
    n3->childMaxVersion[i] = newMax;
    return;
  }
  case Type_Node16: {
    auto *n16 = static_cast<Node16 *>(n);
-    int i = getNodeIndexExists(n16, index);
+    int i = getNodeIndex(n16, index);
    n16->childMaxVersion[i] = newMax;
    return;
  }
@@ -1083,6 +1049,7 @@ ChildAndMaxVersion getChildAndMaxVersion(Node3 *self, uint8_t index) {
  if (i < 0) {
    return {};
  }
+  simCacheMiss(self->children[i]);
  return {self->children[i], self->childMaxVersion[i]};
 }
 ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) {
@@ -1090,6 +1057,7 @@ ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) {
  if (i < 0) {
    return {};
  }
+  simCacheMiss(self->children[i]);
  return {self->children[i], self->childMaxVersion[i]};
 }
 ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) {
@@ -1097,9 +1065,11 @@ ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) {
  if (i < 0) {
    return {};
  }
+  simCacheMiss(self->children[i]);
  return {self->children[i], self->childMaxVersion[i]};
 }
 ChildAndMaxVersion getChildAndMaxVersion(Node256 *self, uint8_t index) {
+  simCacheMiss(self->children[index]);
  return {self->children[index], self->childMaxVersion[index]};
 }

@@ -1123,10 +1093,9 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
 Node *getChildGeq(Node0 *, int) { return nullptr; }

 Node *getChildGeq(Node3 *n, int child) {
-  assume(n->numChildren >= 1);
-  assume(n->numChildren <= 3);
  for (int i = 0; i < n->numChildren; ++i) {
    if (n->index[i] >= child) {
+      simCacheMiss(n->children[i]);
      return n->children[i];
    }
  }
@@ -1145,7 +1114,10 @@ Node *getChildGeq(Node16 *self, int child) {
  __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
  int mask = (1 << self->numChildren) - 1;
  uint32_t bitfield = _mm_movemask_epi8(results) & mask;
-  return bitfield == 0 ? nullptr : self->children[std::countr_zero(bitfield)];
+  auto *result =
+      bitfield == 0 ? nullptr : self->children[std::countr_zero(bitfield)];
+  simCacheMiss(result);
+  return result;
 #elif defined(HAS_ARM_NEON)
  uint8x16_t indices;
  memcpy(&indices, self->index, sizeof(self->index));
@@ -1181,13 +1153,16 @@ Node *getChildGeq(Node48 *self, int child) {
  if (c < 0) {
    return nullptr;
  }
-  return self->children[self->index[c]];
+  auto *result = self->children[self->index[c]];
+  simCacheMiss(result);
+  return result;
 }
 Node *getChildGeq(Node256 *self, int child) {
  int c = self->bitSet.firstSetGeq(child);
  if (c < 0) {
    return nullptr;
  }
+  simCacheMiss(self->children[c]);
  return self->children[c];
 }

@@ -1211,20 +1186,26 @@ Node *getChildGeq(Node *self, int child) {
 // Precondition: self has a child
 Node *getFirstChildExists(Node3 *self) {
  assert(self->numChildren > 0);
+  simCacheMiss(self->children[0]);
  return self->children[0];
 }
 // Precondition: self has a child
 Node *getFirstChildExists(Node16 *self) {
  assert(self->numChildren > 0);
+  simCacheMiss(self->children[0]);
  return self->children[0];
 }
 // Precondition: self has a child
 Node *getFirstChildExists(Node48 *self) {
-  return self->children[self->index[self->bitSet.firstSetGeq(0)]];
+  auto *result = self->children[self->index[self->bitSet.firstSetGeq(0)]];
+  simCacheMiss(result);
+  return result;
 }
 // Precondition: self has a child
 Node *getFirstChildExists(Node256 *self) {
-  return self->children[self->bitSet.firstSetGeq(0)];
+  auto *result = self->children[self->bitSet.firstSetGeq(0)];
+  simCacheMiss(result);
+  return result;
 }

 // Precondition: self has a child
@@ -1604,6 +1585,7 @@ __attribute__((target("avx512f"))) void rezero16(InternalVersionT *vs,
      _mm512_sub_epi32(_mm512_loadu_epi32(vs), zvec), _mm512_setzero_epi32());
  _mm512_mask_storeu_epi32(vs, m, zvec);
 }
+
 __attribute__((target("default")))
 #endif

@@ -2470,7 +2452,6 @@ checkMaxBetweenExclusive(Node *n, int begin, int end,
 }
 __attribute__((target("default")))
 #endif
-
 bool checkMaxBetweenExclusive(Node *n, int begin, int end,
                         InternalVersionT readVersion, ReadContext *tls) {
  return checkMaxBetweenExclusiveImpl<false>(n, begin, end, readVersion, tls);
@@ -2910,72 +2891,6 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
  }
 }

-#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
-__attribute__((target("avx512f"))) InternalVersionT
-horizontalMaxUpTo16(InternalVersionT *vs, InternalVersionT z, int len) {
-  assume(len <= 16);
-#if USE_64_BIT
-  // Hope it gets vectorized
-  InternalVersionT max = vs[0];
-  for (int i = 1; i < len; ++i) {
-    max = std::max(vs[i], max);
-  }
-  return max;
-#else
-  uint32_t zero;
-  memcpy(&zero, &z, sizeof(zero));
-  auto zeroVec = _mm512_set1_epi32(zero);
-  auto max = InternalVersionT(
-      zero +
-      _mm512_reduce_max_epi32(_mm512_sub_epi32(
-          _mm512_mask_loadu_epi32(zeroVec, _mm512_int2mask((1 << len) - 1), vs),
-          zeroVec)));
-  return max;
-#endif
-}
-__attribute__((target("default")))
-#endif
-
-InternalVersionT
-horizontalMaxUpTo16(InternalVersionT *vs, InternalVersionT, int len) {
-  assume(len <= 16);
-  InternalVersionT max = vs[0];
-  for (int i = 1; i < len; ++i) {
-    max = std::max(vs[i], max);
-  }
-  return max;
-}
-
-#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
-__attribute__((target("avx512f"))) InternalVersionT
-horizontalMax16(InternalVersionT *vs, InternalVersionT z) {
-#if USE_64_BIT
-  // Hope it gets vectorized
-  InternalVersionT max = vs[0];
-  for (int i = 1; i < 16; ++i) {
-    max = std::max(vs[i], max);
-  }
-  return max;
-#else
-  uint32_t zero;
-  memcpy(&zero, &z, sizeof(zero));
-  auto zeroVec = _mm512_set1_epi32(zero);
-  return InternalVersionT(zero + _mm512_reduce_max_epi32(_mm512_sub_epi32(
-                                     _mm512_loadu_epi32(vs), zeroVec)));
-#endif
-}
-__attribute__((target("default")))
-#endif
-
-InternalVersionT
-horizontalMax16(InternalVersionT *vs, InternalVersionT) {
-  InternalVersionT max = vs[0];
-  for (int i = 1; i < 16; ++i) {
-    max = std::max(vs[i], max);
-  }
-  return max;
-}
-
 // Precondition: `node->entryPresent`, and node is not the root
 void fixupMaxVersion(Node *node, WriteContext *tls) {
  assert(node->parent);
@@ -2987,13 +2902,15 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
    break;
  case Type_Node3: {
    auto *self3 = static_cast<Node3 *>(node);
-    max = std::max(max, horizontalMaxUpTo16(self3->childMaxVersion, tls->zero,
-                                            self3->numChildren));
+    for (int i = 0; i < self3->numChildren; ++i) {
+      max = std::max(self3->childMaxVersion[i], max);
+    }
  } break;
  case Type_Node16: {
    auto *self16 = static_cast<Node16 *>(node);
-    max = std::max(max, horizontalMaxUpTo16(self16->childMaxVersion, tls->zero,
-                                            self16->numChildren));
+    for (int i = 0; i < self16->numChildren; ++i) {
+      max = std::max(self16->childMaxVersion[i], max);
+    }
  } break;
  case Type_Node48: {
    auto *self48 = static_cast<Node48 *>(node);
@@ -3003,7 +2920,9 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
  } break;
  case Type_Node256: {
    auto *self256 = static_cast<Node256 *>(node);
-    max = std::max(max, horizontalMax16(self256->childMaxVersion, tls->zero));
+    for (auto v : self256->maxOfMax) {
+      max = std::max(v, max);
+    }
  } break;
  default:                   // GCOVR_EXCL_LINE
    __builtin_unreachable(); // GCOVR_EXCL_LINE
@@ -3126,34 +3045,288 @@ Node *firstGeqPhysical(Node *n, const std::span<const uint8_t> key) {
  }
 }

+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+
+#if __has_attribute(musttail)
+#define MUSTTAIL __attribute__((musttail))
+#else
+#define MUSTTAIL
+#endif
+
+#if __has_attribute(preserve_none)
+#define CONTINUATION_CALLING_CONVENTION __attribute__((preserve_none))
+#else
+#define CONTINUATION_CALLING_CONVENTION
+#endif
+
+typedef CONTINUATION_CALLING_CONVENTION void (*continuation)(struct CheckAll *,
+                                                             int64_t prevJob,
+                                                             int64_t job,
+                                                             int64_t started,
+                                                             int64_t count);
+
+// State relevant to a particular query
+struct CheckJob {
+  void setResult(bool ok) {
+    *result = ok ? ConflictSet::Commit : ConflictSet::Conflict;
+  }
+
+  [[nodiscard]] continuation init(const ConflictSet::ReadRange *read,
+                                  ConflictSet::Result *result, Node *root,
+                                  int64_t oldestVersionFullPrecision,
+                                  ReadContext *tls);
+
+  Node *n;
+  ChildAndMaxVersion childAndVersion;
+  std::span<const uint8_t> begin;
+  InternalVersionT readVersion;
+  ConflictSet::Result *result;
+};
+
+// State relevant to all queries
+struct CheckAll {
+  constexpr static int kConcurrent = 32;
+  CheckJob inProgress[kConcurrent];
+  continuation next[kConcurrent];
+  int nextJob[kConcurrent];
+  Node *root;
+  int64_t oldestVersionFullPrecision;
+  ReadContext *tls;
+  const ConflictSet::ReadRange *queries;
+  ConflictSet::Result *results;
+};
+
+CONTINUATION_CALLING_CONVENTION void keepGoing(CheckAll *context,
+                                               int64_t prevJob, int64_t job,
+                                               int64_t started, int64_t count) {
+  prevJob = job;
+  job = context->nextJob[job];
+  MUSTTAIL return context->next[job](context, prevJob, job, started, count);
+}
+
+CONTINUATION_CALLING_CONVENTION void complete(CheckAll *context,
+                                              int64_t prevJob, int64_t job,
+                                              int64_t started, int64_t count) {
+  if (started == count) {
+    if (prevJob == job) {
+      return;
+    }
+    context->nextJob[prevJob] = context->nextJob[job];
+    job = prevJob;
+  } else {
+    int temp = started++;
+    context->next[job] = context->inProgress[job].init(
+        context->queries + temp, context->results + temp, context->root,
+        context->oldestVersionFullPrecision, context->tls);
+  }
+  MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+}
+
+namespace check_point_read_state_machine {
+
+CONTINUATION_CALLING_CONVENTION void
+down_left_spine(struct CheckAll *, int64_t prevJob, int64_t job,
+                int64_t started, int64_t count);
+CONTINUATION_CALLING_CONVENTION void iter(struct CheckAll *, int64_t prevJob,
+                                          int64_t job, int64_t started,
+                                          int64_t count);
+CONTINUATION_CALLING_CONVENTION void begin(struct CheckAll *, int64_t prevJob,
+                                           int64_t job, int64_t started,
+                                           int64_t count);
+
+void begin(struct CheckAll *context, int64_t prevJob, int64_t job,
+           int64_t started, int64_t count) {
+  ++context->tls->point_read_accum;
+#if DEBUG_VERBOSE && !defined(NDEBUG)
+  fprintf(stderr, "Check point read: %s\n", printable(key).c_str());
+#endif
+  auto *j = context->inProgress + job;
+
+  if (j->begin.size() == 0) {
+    if (j->n->entryPresent) {
+      j->setResult(j->n->entry.pointVersion <= j->readVersion);
+      MUSTTAIL return complete(context, prevJob, job, started, count);
+    }
+    j->n = getFirstChildExists(j->n);
+    context->next[job] = down_left_spine;
+    __builtin_prefetch(j->n);
+    MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+  }
+
+  j->childAndVersion = getChildAndMaxVersion(j->n, j->begin[0]);
+  context->next[job] = iter;
+  __builtin_prefetch(j->childAndVersion.child);
+  MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+}
+
+void iter(struct CheckAll *context, int64_t prevJob, int64_t job,
+          int64_t started, int64_t count) {
+  auto *j = context->inProgress + job;
+  if (j->childAndVersion.child == nullptr) {
+    auto c = getChildGeq(j->n, j->begin[0]);
+    if (c != nullptr) {
+      j->n = c;
+      context->next[job] = down_left_spine;
+      __builtin_prefetch(j->n);
+      MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+    } else {
+      j->n = nextSibling(j->n);
+      if (j->n == nullptr) {
+        j->setResult(true);
+        MUSTTAIL return complete(context, prevJob, job, started, count);
+      }
+      context->next[job] = down_left_spine;
+      __builtin_prefetch(j->n);
+      MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+    }
+  }
+
+  j->n = j->childAndVersion.child;
+  j->begin = j->begin.subspan(1, j->begin.size() - 1);
+
+  if (j->n->partialKeyLen > 0) {
+    int commonLen = std::min<int>(j->n->partialKeyLen, j->begin.size());
+    int i = longestCommonPrefix(j->n->partialKey(), j->begin.data(), commonLen);
+    if (i < commonLen) {
+      auto c = j->n->partialKey()[i] <=> j->begin[i];
+      if (c > 0) {
+        context->next[job] = down_left_spine;
+        MUSTTAIL return down_left_spine(context, prevJob, job, started, count);
+      } else {
+        j->n = nextSibling(j->n);
+        if (j->n == nullptr) {
+          j->setResult(true);
+          MUSTTAIL return complete(context, prevJob, job, started, count);
+        }
+        context->next[job] = down_left_spine;
+        __builtin_prefetch(j->n);
+        MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+      }
+    }
+    if (commonLen == j->n->partialKeyLen) {
+      // partial key matches
+      j->begin = j->begin.subspan(commonLen, j->begin.size() - commonLen);
+    } else if (j->n->partialKeyLen > int(j->begin.size())) {
+      // n is the first physical node greater than remaining, and there's no
+      // eq node
+      context->next[job] = down_left_spine;
+      MUSTTAIL return down_left_spine(context, prevJob, job, started, count);
+    }
+  }
+
+  if (j->childAndVersion.maxVersion <= j->readVersion) {
+    ++context->tls->point_read_short_circuit_accum;
+    j->setResult(true);
+    MUSTTAIL return complete(context, prevJob, job, started, count);
+  }
+
+  ++context->tls->point_read_iterations_accum;
+
+  if (j->begin.size() == 0) {
+    if (j->n->entryPresent) {
+      j->setResult(j->n->entry.pointVersion <= j->readVersion);
+      MUSTTAIL return complete(context, prevJob, job, started, count);
+    }
+    j->n = getFirstChildExists(j->n);
+    context->next[job] = down_left_spine;
+    __builtin_prefetch(j->n);
+    MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+  }
+
+  j->childAndVersion = getChildAndMaxVersion(j->n, j->begin[0]);
+  __builtin_prefetch(j->childAndVersion.child);
+  // j->next is already iter
+  MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+}
+
+void down_left_spine(struct CheckAll *context, int64_t prevJob, int64_t job,
+                     int64_t started, int64_t count) {
+  auto *j = context->inProgress + job;
+  if (j->n->entryPresent) {
+    j->setResult(j->n->entry.rangeVersion <= j->readVersion);
+    MUSTTAIL return complete(context, prevJob, job, started, count);
+  }
+  j->n = getFirstChildExists(j->n);
+  __builtin_prefetch(j->n);
+  // j->next is already down_left_spine
+  MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+}
+
+} // namespace check_point_read_state_machine
+
+continuation CheckJob::init(const ConflictSet::ReadRange *read,
+                            ConflictSet::Result *result, Node *root,
+                            int64_t oldestVersionFullPrecision,
+                            ReadContext *tls) {
+  auto begin = std::span<const uint8_t>(read->begin.p, read->begin.len);
+  auto end = std::span<const uint8_t>(read->end.p, read->end.len);
+  if (read->readVersion < oldestVersionFullPrecision) {
+    *result = ConflictSet::TooOld;
+    return complete;
+  } else if (end.size() == 0) {
+    this->begin = begin;
+    this->n = root;
+    this->readVersion = InternalVersionT(read->readVersion);
+    this->result = result;
+    return check_point_read_state_machine::begin;
+    // *result =
+    //     checkPointRead(root, begin, InternalVersionT(read->readVersion), tls)
+    //         ? ConflictSet::Commit
+    //         : ConflictSet::Conflict;
+    // return complete;
+  } else {
+    *result = checkRangeRead(root, begin, end,
+                             InternalVersionT(read->readVersion), tls)
+                  ? ConflictSet::Commit
+                  : ConflictSet::Conflict;
+    return complete;
+  }
+}
+
 struct __attribute__((visibility("hidden"))) ConflictSet::Impl {

  void check(const ReadRange *reads, Result *result, int count) {
+    assert(oldestVersionFullPrecision >=
+           newestVersionFullPrecision - kNominalVersionWindow);
+
+    if (count == 0) {
+      return;
+    }
+
    ReadContext tls;
    tls.impl = this;
    int64_t check_byte_accum = 0;
+
+    CheckAll context;
+    context.oldestVersionFullPrecision = oldestVersionFullPrecision;
+    context.queries = reads;
+    context.results = result;
+    context.root = root;
+    context.tls = &tls;
+
+    int64_t started = std::min(context.kConcurrent, count);
+    for (int i = 0; i < started; i++) {
+      context.next[i] = context.inProgress[i].init(
+          reads + i, result + i, root, oldestVersionFullPrecision, &tls);
+      context.nextJob[i] = i + 1;
+    }
+    context.nextJob[started - 1] = 0;
+    int prevJob = started - 1;
+    int job = 0;
+    context.next[job](&context, prevJob, job, started, count);
+
    for (int i = 0; i < count; ++i) {
      assert(reads[i].readVersion >= 0);
      assert(reads[i].readVersion <= newestVersionFullPrecision);
      const auto &r = reads[i];
      check_byte_accum += r.begin.len + r.end.len;
-      auto begin = std::span<const uint8_t>(r.begin.p, r.begin.len);
-      auto end = std::span<const uint8_t>(r.end.p, r.end.len);
-      assert(oldestVersionFullPrecision >=
-             newestVersionFullPrecision - kNominalVersionWindow);
-      result[i] =
-          reads[i].readVersion < oldestVersionFullPrecision ? TooOld
-          : (end.size() > 0
-                 ? checkRangeRead(root, begin, end,
-                                  InternalVersionT(reads[i].readVersion), &tls)
-                 : checkPointRead(root, begin,
-                                  InternalVersionT(reads[i].readVersion), &tls))
-              ? Commit
-              : Conflict;
      tls.commits_accum += result[i] == Commit;
      tls.conflicts_accum += result[i] == Conflict;
      tls.too_olds_accum += result[i] == TooOld;
    }
+
    point_read_total.add(tls.point_read_accum);
    prefix_read_total.add(tls.prefix_read_accum);
    range_read_total.add(tls.range_read_accum);
@@ -4095,24 +4268,6 @@ template <int kN> void benchScan2() {
  });
 }

-void benchHorizontal16() {
-  ankerl::nanobench::Bench bench;
-  InternalVersionT vs[16];
-  for (int i = 0; i < 16; ++i) {
-    vs[i] = InternalVersionT(rand() % 1000 + 1000);
-  }
-#if !USE_64_BIT
-  InternalVersionT::zero = InternalVersionT(rand() % 1000);
-#endif
-  bench.run("horizontal16", [&]() {
-    bench.doNotOptimizeAway(horizontalMax16(vs, InternalVersionT::zero));
-  });
-  int x = rand() % 15 + 1;
-  bench.run("horizontalUpTo16", [&]() {
-    bench.doNotOptimizeAway(horizontalMaxUpTo16(vs, InternalVersionT::zero, x));
-  });
-}
-
 void benchLCP(int len) {
  ankerl::nanobench::Bench bench;
  std::vector<uint8_t> lhs(len);
@@ -4145,7 +4300,11 @@ void printTree() {
  debugPrintDot(stdout, cs.root, &cs);
 }

-int main(void) { benchHorizontal16(); }
+int main(void) {
+  for (int i = 0; i < 256; ++i) {
+    benchLCP(i);
+  }
+}
 #endif

 #ifdef ENABLE_FUZZ
@@ -22,9 +22,6 @@ void *stepJob(Job *j) {
  return done ? nullptr : (void *)stepJob;
 }

-// So we can look at the disassembly more easily
-
-extern "C" {
 void sequential(Job **jobs, int count) {
  for (int i = 0; i < count; ++i) {
    do {
@@ -94,6 +91,87 @@ void interleaveBoundedCyclicList(Job **jobs, int count) {
  }
 }

+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+
+#if __has_attribute(musttail)
+#define MUSTTAIL __attribute__((musttail))
+#else
+#define MUSTTAIL
+#endif
+
+struct Context {
+  constexpr static int kConcurrent = 32;
+  Job **jobs;
+  Job *inProgress[kConcurrent];
+  void (*continuation[kConcurrent])(Context *, int64_t prevJob, int64_t job,
+                                    int64_t started, int64_t count);
+  int nextJob[kConcurrent];
+};
+
+void keepGoing(Context *context, int64_t prevJob, int64_t job, int64_t started,
+               int64_t count) {
+  prevJob = job;
+  job = context->nextJob[job];
+  MUSTTAIL return context->continuation[job](context, prevJob, job, started,
+                                             count);
+}
+
+void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
+                     int64_t started, int64_t count);
+
+void complete(Context *context, int64_t prevJob, int64_t job, int64_t started,
+              int64_t count) {
+  if (started == count) {
+    if (prevJob == job) {
+      return;
+    }
+    context->nextJob[prevJob] = context->nextJob[job];
+    job = prevJob;
+  } else {
+    context->inProgress[job] = context->jobs[started++];
+    context->continuation[job] = stepJobTailCall;
+  }
+  prevJob = job;
+  job = context->nextJob[job];
+  MUSTTAIL return context->continuation[job](context, prevJob, job, started,
+                                             count);
+}
+
+void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
+                     int64_t started, int64_t count) {
+  auto *j = context->inProgress[job];
+  auto done = --(*j->input) == 0;
+#ifdef __x86_64__
+  _mm_clflush(j->input);
+#endif
+  if (done) {
+    MUSTTAIL return complete(context, prevJob, job, started, count);
+  } else {
+    context->continuation[job] = stepJobTailCall;
+    MUSTTAIL return keepGoing(context, prevJob, job, started, count);
+  }
+}
+
+void useTailCalls(Job **jobs, int count) {
+  if (count == 0) {
+    return;
+  }
+  Context context;
+  context.jobs = jobs;
+  int64_t started = std::min(Context::kConcurrent, count);
+  for (int i = 0; i < started; i++) {
+    context.inProgress[i] = jobs[i];
+    context.nextJob[i] = i + 1;
+    context.continuation[i] = stepJobTailCall;
+  }
+  context.nextJob[started - 1] = 0;
+  int prevJob = started - 1;
+  int job = 0;
+  return context.continuation[job](&context, prevJob, job, started, count);
+}
+
 void interleaveCyclicList(Job **jobs, int count) {
  auto *nextJob = (int *)alloca(sizeof(int) * count);

@@ -117,12 +195,11 @@ void interleaveCyclicList(Job **jobs, int count) {
    job = nextJob[job];
  }
 }
-}

 int main() {
  ankerl::nanobench::Bench bench;

-  constexpr int kNumJobs = 100;
+  constexpr int kNumJobs = 10000;
  bench.relative(true);

  Job jobs[kNumJobs];
@@ -140,6 +217,7 @@ int main() {
  for (auto [scheduler, name] :
       {std::make_pair(sequentialNoFuncPtr, "sequentialNoFuncPtr"),
        std::make_pair(sequential, "sequential"),
+        std::make_pair(useTailCalls, "useTailCalls"),
        std::make_pair(interleaveSwapping, "interleavingSwapping"),
        std::make_pair(interleaveBoundedCyclicList,
                       "interleaveBoundedCyclicList"),
@@ -1,4 +1,5 @@
 #include <atomic>
+#include <cstdint>
 #include <errno.h>
 #include <netdb.h>
 #include <stdio.h>
@@ -21,78 +22,55 @@

 std::atomic<int64_t> transactions;

-constexpr int kBaseSearchDepth = 115;
 constexpr int kWindowSize = 10000000;

-std::string numToKey(int64_t num) {
+constexpr int kNumPrefixes = 250000;
+
+std::string makeKey(int64_t num, int suffixLen) {
  std::string result;
-  result.resize(kBaseSearchDepth + sizeof(int64_t));
-  memset(result.data(), 0, kBaseSearchDepth);
+  result.resize(sizeof(int64_t) + suffixLen);
  int64_t be = __builtin_bswap64(num);
-  memcpy(result.data() + kBaseSearchDepth, &be, sizeof(int64_t));
+  memcpy(result.data(), &be, sizeof(int64_t));
+  memset(result.data() + sizeof(int64_t), 0, suffixLen);
  return result;
 }

 void workload(weaselab::ConflictSet *cs) {
  int64_t version = kWindowSize;
-  cs->addWrites(nullptr, 0, version);
+  for (int i = 0; i < kNumPrefixes; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      weaselab::ConflictSet::WriteRange wr;
+      auto k = makeKey(i, j);
+      wr.begin.p = (const uint8_t *)k.data();
+      wr.begin.len = k.size();
+      wr.end.len = 0;
+      cs->addWrites(&wr, 1, version);
+    }
+  }
+  ++version;
+  for (int i = 0; i < kNumPrefixes; ++i) {
+    weaselab::ConflictSet::WriteRange wr;
+    auto k = makeKey(i, 50);
+    wr.begin.p = (const uint8_t *)k.data();
+    wr.begin.len = k.size();
+    wr.end.len = 0;
+    cs->addWrites(&wr, 1, version);
+  }
+
+  std::vector<weaselab::ConflictSet::Result> results(10);
  for (;; transactions.fetch_add(1, std::memory_order_relaxed)) {
-    // Reads
-    {
-      auto beginK = numToKey(version - kWindowSize);
-      auto endK = numToKey(version - 1);
-      auto pointRv = version - kWindowSize + rand() % kWindowSize + 1;
-      auto pointK = numToKey(pointRv);
-      weaselab::ConflictSet::ReadRange reads[] = {
-          {
-              {(const uint8_t *)pointK.data(), int(pointK.size())},
-              {nullptr, 0},
-              pointRv,
-          },
-          {
-              {(const uint8_t *)beginK.data(), int(beginK.size())},
-              {(const uint8_t *)endK.data(), int(endK.size())},
-              version - 2,
-          },
-      };
-      weaselab::ConflictSet::Result result[sizeof(reads) / sizeof(reads[0])];
-      cs->check(reads, result, sizeof(reads) / sizeof(reads[0]));
-      // for (int i = 0; i < sizeof(reads) / sizeof(reads[0]); ++i) {
-      //   if (result[i] != weaselab::ConflictSet::Commit) {
-      //     fprintf(stderr, "Unexpected conflict: [%s, %s) @ %" PRId64 "\n",
-      //             printable(reads[i].begin).c_str(),
-      //             printable(reads[i].end).c_str(), reads[i].readVersion);
-      //     abort();
-      //   }
-      // }
+    std::vector<std::string> keys(10);
+    for (auto &k : keys) {
+      k = makeKey(rand() % kNumPrefixes, 49);
    }
-    // Writes
-    {
-      weaselab::ConflictSet::WriteRange w;
-      auto k = numToKey(version);
-      w.begin.p = (const uint8_t *)k.data();
-      w.end.len = 0;
-      if (version % (kWindowSize / 2) == 0) {
-        for (int l = 0; l <= k.size(); ++l) {
-          w.begin.len = l;
-          cs->addWrites(&w, 1, version);
-        }
-      } else {
-        w.begin.len = k.size();
-        cs->addWrites(&w, 1, version);
-        int64_t beginN = version - kWindowSize + rand() % kWindowSize;
-        auto b = numToKey(beginN);
-        auto e = numToKey(beginN + 1000);
-        w.begin.p = (const uint8_t *)b.data();
-        w.begin.len = b.size();
-        w.end.p = (const uint8_t *)e.data();
-        w.end.len = e.size();
-        cs->addWrites(&w, 1, version);
-      }
+    std::vector<weaselab::ConflictSet::ReadRange> reads(10);
+    for (int i = 0; i < reads.size(); ++i) {
+      reads[i].begin.p = (const uint8_t *)(keys[i].data());
+      reads[i].begin.len = keys[i].size();
+      reads[i].end.len = 0;
+      reads[i].readVersion = version - 1;
    }
-    // GC
-    cs->setOldestVersion(version - kWindowSize);
-    ++version;
+    cs->check(reads.data(), results.data(), 10);
  }
 }
Author	SHA1	Message	Date
andrew	ed67486077	Reordering seems to improve codegen Tests / Clang total: 3244, passed: 3244 Details Clang \|Total\|New\|Outstanding\|Fixed\|Trend \|:-:\|:-:\|:-:\|:-:\|:-: \|0\|0\|0\|0\|:clap: Details Tests / 64 bit versions total: 3244, passed: 3244 Details Tests / Debug total: 3242, passed: 3242 Details Tests / SIMD fallback total: 3244, passed: 3244 Details Tests / Release [gcc] total: 3244, passed: 3244 Details GNU C Compiler (gcc) \|Total\|New\|Outstanding\|Fixed\|Trend \|:-:\|:-:\|:-:\|:-:\|:-: \|0\|0\|0\|0\|:clap: Details Tests / Release [gcc,aarch64] total: 2419, passed: 2419 Details Tests / Coverage total: 2437, passed: 2437 Details Code Coverage #### Project Overview No changes detected, that affect the code coverage. * Line Coverage: 98.98% (1938/1958) * Branch Coverage: 68.67% (1497/2180) * Complexity Density: 0.00 * Lines of Code: 1958 #### Quality Gates Summary Output truncated. Details weaselab/conflict-set/pipeline/head There was a failure building this commit Details	2024-09-23 15:28:51 -07:00
andrew	b376f6fdd5	WIP Tests / Clang total: 3244, passed: 3244 Details Clang \|Total\|New\|Outstanding\|Fixed\|Trend \|:-:\|:-:\|:-:\|:-:\|:-: \|0\|0\|0\|0\|:clap: Details Tests / 64 bit versions total: 3244, passed: 3244 Details Tests / Debug total: 3242, passed: 3242 Details Tests / SIMD fallback total: 3244, passed: 3244 Details Tests / Release [gcc] total: 3244, passed: 3244 Details GNU C Compiler (gcc) \|Total\|New\|Outstanding\|Fixed\|Trend \|:-:\|:-:\|:-:\|:-:\|:-: \|0\|0\|0\|0\|:clap: Details Tests / Release [gcc,aarch64] total: 2419, passed: 2419 Details Tests / Coverage total: 2437, passed: 2437 Details Code Coverage #### Project Overview No changes detected, that affect the code coverage. * Line Coverage: 98.98% (1938/1958) * Branch Coverage: 68.67% (1497/2180) * Complexity Density: 0.00 * Lines of Code: 1958 #### Quality Gates Summary Output truncated. Details weaselab/conflict-set/pipeline/head There was a failure building this commit Details	2024-09-23 15:11:48 -07:00
andrew	6de63dd3fe	Use preserve_none and put continuation array in CheckAll	2024-09-23 14:53:16 -07:00
andrew	3e5f13bf54	WIP - tests pass	2024-09-23 13:32:56 -07:00
andrew	e7e1d1f7f5	Add tail-call based interleaving approach	2024-09-23 12:52:30 -07:00
andrew	442658e983	Target ~1GB memory usage in server bench	2024-09-21 14:28:15 -07:00
andrew	26f602215e	Accentuate cache misses for point reads in server_bench	2024-09-14 22:42:40 -07:00
andrew	98236f81cb	Add missing __builtin_prefetch	2024-09-14 22:41:58 -07:00
andrew	3593b72880	Disallow checking SIM_CACHE_MISSES=1	2024-09-10 22:23:37 -07:00
andrew	814aac4ea7	Experiment with causing cache misses	2024-09-10 22:06:00 -07:00
andrew	0550fa0016	Add "iter" state	2024-09-10 17:22:10 -07:00
andrew	fe5cfb0336	Remove redundant cast	2024-09-10 17:06:45 -07:00
andrew	82203515a0	check_point_read_state_machine::down_left_spine	2024-09-10 17:05:09 -07:00
andrew	465372c734	Scaffolding to prepare for interleaving checks	2024-09-10 16:10:57 -07:00