From a2a55c9717df1a7ff5d419f47c6ec003706432bb Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Fri, 19 Jan 2024 14:50:36 -0800
Subject: [PATCH] Prepare for fuzzing

---
 CMakeLists.txt  |   2 +-
 ConflictSet.cpp | 335 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 295 insertions(+), 42 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc5dee7..12e6cd6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ target_compile_definitions(conflict_set_test PRIVATE ENABLE_TESTS)
 # keep asserts for test
 target_compile_options(conflict_set_test PRIVATE -UNDEBUG)
 # Only emit compile warnings for test
-target_compile_options(conflict_set_test PRIVATE -Wall -Wextra -Wpedantic -Wunreachable-code -Werror)
+target_compile_options(conflict_set_test PRIVATE -Wall -Wextra -Wpedantic -Wunreachable-code)
 
 include(CTest)
 add_test(NAME conflict_set_test COMMAND conflict_set_test)

diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index 728a69f..f902695 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -12,17 +12,6 @@
 #include
 #include
 
-#define SHOW_PRIORITY 0
-#define DEBUG 0
-
-using Key = ConflictSet::Key;
-
-static auto operator<=>(const Key &lhs, const Key &rhs) {
-  const int minLen = std::min(lhs.len, rhs.len);
-  const int c = memcmp(lhs.p, rhs.p, minLen);
-  return c != 0 ? c <=> 0 : lhs.len <=> rhs.len;
-}
-
 // ==================== BEGIN ARENA IMPL ====================
 
 /// Group allocations with similar lifetimes to amortize the cost of malloc/free
@@ -189,6 +178,280 @@ bool operator!=(const ArenaAlloc &lhs, const ArenaAlloc &rhs) {
 
 // ==================== END ARENA IMPL ====================
 
+// ==================== BEGIN RANDOM IMPL ====================
+
+struct Random {
+  // *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / pcg-random.org
+  // Licensed under Apache License 2.0 (NO WARRANTY, etc. see website)
+  //
+  // Modified - mostly c -> c++
+  Random() = default;
+
+  Random(uint64_t initState, uint64_t initSeq) {
+    pcg32_srandom_r(initState, initSeq);
+    next();
+  }
+
+  /// Draws from a uniform distribution of uint32_t's
+  uint32_t next() {
+    auto result = next_;
+    next_ = pcg32_random_r();
+    return result;
+  }
+
+  /// Draws from a uniform distribution over [0, s). From
+  /// https://arxiv.org/pdf/1805.10941.pdf
+  uint32_t bounded(uint32_t s) {
+    assert(s != 0);
+    uint32_t x = next();
+    auto m = uint64_t(x) * uint64_t(s);
+    auto l = uint32_t(m);
+    if (l < s) {
+      uint32_t t = -s % s;
+      while (l < t) {
+        x = next();
+        m = uint64_t(x) * uint64_t(s);
+        l = uint32_t(m);
+      }
+    }
+    uint32_t result = m >> 32;
+    return result;
+  }
+
+  /// Fill `bytes` with `size` random bytes
+  void randomBytes(uint8_t *bytes, int size);
+
+  /// Fill `bytes` with `size` random hex bytes
+  void randomHex(uint8_t *bytes, int size);
+
+  template <class T, class = std::enable_if_t<std::is_trivially_copyable_v<T>>>
+  T randT() {
+    T t;
+    randomBytes((uint8_t *)&t, sizeof(T));
+    return t;
+  }
+
+private:
+  uint32_t pcg32_random_r() {
+    uint64_t oldState = state;
+    // Advance internal state
+    state = oldState * 6364136223846793005ULL + inc;
+    // Calculate output function (XSH RR), uses old state for max ILP
+    uint32_t xorShifted = ((oldState >> 18u) ^ oldState) >> 27u;
+    uint32_t rot = oldState >> 59u;
+    return (xorShifted >> rot) | (xorShifted << ((-rot) & 31));
+  }
+
+  // Seed the rng. Specified in two parts: a state initializer and a
+  // sequence selection constant (a.k.a. stream id)
+  void pcg32_srandom_r(uint64_t initState, uint64_t initSeq) {
+    state = 0U;
+    inc = (initSeq << 1u) | 1u;
+    pcg32_random_r();
+    state += initState;
+    pcg32_random_r();
+  }
+
+  uint32_t next_{};
+  // RNG state. All values are possible.
+  uint64_t state{};
+  // Controls which RNG sequence (stream) is selected. Must *always* be odd.
+  uint64_t inc{};
+};
+
+// TODO provide a way to seed this
+thread_local inline Random gRandom{0, 0};
+
+template <class Container> void shuffle(Container &x) {
+  using std::swap;
+  for (int i = x.size() - 1; i > 0; --i) {
+    int j = gRandom.bounded(i + 1);
+    if (i != j) {
+      swap(x[i], x[j]);
+    }
+  }
+}
+
+void Random::randomBytes(uint8_t *bytes, int size) {
+  int i = 0;
+  for (; i + 4 < size; i += 4) {
+    uint32_t random = next();
+    memcpy(bytes + i, &random, 4);
+  }
+  if (i < size) {
+    uint32_t random = next();
+    memcpy(bytes + i, &random, size - i);
+  }
+}
+
+void Random::randomHex(uint8_t *bytes, int size) {
+  int i = 0;
+  while (i + 8 < size) {
+    // One 32-bit draw yields eight hex digits.
+    uint32_t r = next();
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+  }
+  uint32_t r = next();
+  while (i < size) {
+    bytes[i++] = "0123456789abcdef"[r & 0b1111];
+    r >>= 4;
+  }
+}
+
+// ==================== END RANDOM IMPL ====================
+
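Aside: `bounded` above is Lemire's multiply-then-reject method from the cited paper. A minimal standalone sketch of the same technique, with a hypothetical `nextU32` standing in for any uniform 32-bit source, to make the rejection step explicit:

    #include <cassert>
    #include <cstdint>

    // Sketch, not part of the patch: Lemire's bounded sampling as used by
    // Random::bounded. `nextU32` is a hypothetical uniform 32-bit source.
    inline uint32_t boundedSketch(uint32_t (*nextU32)(), uint32_t s) {
      assert(s != 0);
      uint64_t m = uint64_t(nextU32()) * uint64_t(s);
      if (uint32_t(m) < s) {
        // Unsigned wraparound makes -s equal 2^32 - s, so t == 2^32 mod s:
        // the count of "extra" low values that would bias the result.
        uint32_t t = -s % s;
        while (uint32_t(m) < t) {
          m = uint64_t(nextU32()) * uint64_t(s);
        }
      }
      return uint32_t(m >> 32); // high word is uniform over [0, s)
    }

The rejection loop runs with probability less than s / 2^32, so for small bounds it almost never draws a second value.
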
+// ==================== BEGIN ARBITRARY IMPL ====================
+
+/// Think of `Arbitrary` as an attacker-controlled random number generator.
+/// Usually you want your random number generator to be fair, so that you can
+/// sensibly analyze probabilities. E.g. the analysis showing that quicksort
+/// is expected O(n log n) with a random pivot relies on the pivot being
+/// drawn uniformly from a fair distribution.
+///
+/// Other times you want your randomness to be diabolically unfair, like when
+/// looking for bugs by fuzzing. The random-number-like interface is still
+/// convenient here, but you can potentially get much better coverage by
+/// allowing the possibility of e.g. flipping heads 100 times in a row.
+///
+/// When it runs out of entropy, it always returns 0.
+struct Arbitrary {
+  Arbitrary() = default;
+
+  explicit Arbitrary(std::span<const uint8_t> bytecode) : bytecode(bytecode) {}
+
+  /// Draws an arbitrary uint32_t
+  uint32_t next() { return consume<4>(); }
+
+  /// Draws an arbitrary element from [0, s)
+  uint32_t bounded(uint32_t s);
+
+  /// Fill `bytes` with `size` arbitrary bytes
+  void randomBytes(uint8_t *bytes, int size) {
+    int toFill = std::min<int>(size, bytecode.size());
+    if (toFill > 0) {
+      memcpy(bytes, bytecode.data(), toFill);
+    }
+    bytecode = bytecode.subspan(toFill, bytecode.size() - toFill);
+    memset(bytes + toFill, 0, size - toFill);
+  }
+
+  /// Fill `bytes` with `size` arbitrary hex bytes
+  void randomHex(uint8_t *bytes, int size) {
+    for (int i = 0; i < size;) {
+      uint8_t arbitrary = consume<1>();
+      bytes[i++] = "0123456789abcdef"[arbitrary & 0xf];
+      arbitrary >>= 4;
+      if (i < size) {
+        bytes[i++] = "0123456789abcdef"[arbitrary & 0xf];
+      }
+    }
+  }
+
+  template <class T, class = std::enable_if_t<std::is_trivially_copyable_v<T>>>
+  T randT() {
+    T t;
+    randomBytes((uint8_t *)&t, sizeof(T));
+    return t;
+  }
+
+  bool hasEntropy() const { return bytecode.size() != 0; }
+
+private:
+  uint8_t consumeByte() {
+    if (bytecode.size() == 0) {
+      return 0;
+    }
+    auto result = bytecode[0];
+    bytecode = bytecode.subspan(1, bytecode.size() - 1);
+    return result;
+  }
+
+  /// Consumes `kBytes` bytes of entropy and assembles them into a uint32_t
+  template <int kBytes> uint32_t consume() {
+    static_assert(kBytes <= 4);
+    uint32_t result = 0;
+    for (int i = 0; i < kBytes; ++i) {
+      result <<= 8;
+      result |= consumeByte();
+    }
+    return result;
+  }
+
+  std::span<const uint8_t> bytecode;
+};
+
+inline Arbitrary gArbitrary;
+
+void initFuzz(const uint8_t *data, size_t size);
+
+uint32_t Arbitrary::bounded(uint32_t s) {
+  if (s == 1) {
+    return 0;
+  }
+  // Consume only as many input bytes as the bit width of s - 1 requires,
+  // so small bounds don't burn through the fuzz input.
+  switch (32 - __builtin_clz(s - 1)) {
+  case 1:
+  case 2:
+  case 3:
+  case 4:
+  case 5:
+  case 6:
+  case 7:
+  case 8:
+    return consume<1>() % s;
+  case 9:
+  case 10:
+  case 11:
+  case 12:
+  case 13:
+  case 14:
+  case 15:
+  case 16:
+    return consume<2>() % s;
+  case 17:
+  case 18:
+  case 19:
+  case 20:
+  case 21:
+  case 22:
+  case 23:
+  case 24:
+    return consume<3>() % s;
+  default:
+    return consume<4>() % s;
+  }
+}
+
+void initFuzz(const uint8_t *data, size_t size) {
+  gArbitrary = Arbitrary{{data, size}};
+  uint64_t state = gArbitrary.next();
+  uint64_t seq = gArbitrary.next();
+  gRandom = Random{state, seq};
+}
+
+// ==================== END ARBITRARY IMPL ====================
+
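Aside: a sketch of how these globals are meant to be driven. This assumes a libFuzzer-style entry point; the actual harness is not part of this patch, and the key-generation loop is purely illustrative:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical fuzz entry point (not part of this patch). All downstream
    // "randomness" now flows from the fuzz input: structural decisions come
    // from gArbitrary, while anything that only needs fairness (e.g. treap
    // priorities) comes from gRandom, which initFuzz seeds from the first
    // eight input bytes.
    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
      initFuzz(data, size);
      while (gArbitrary.hasEntropy()) {
        uint8_t key[16];
        int keyLen = int(gArbitrary.bounded(sizeof(key) + 1));
        gArbitrary.randomHex(key, keyLen);
        // ... apply an operation built from `key` to the code under test ...
      }
      return 0;
    }

Because an exhausted Arbitrary returns zeros, a truncated input still decodes to a valid (if boring) operation sequence rather than an error path.
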
+#define SHOW_PRIORITY 0
+#define DEBUG 0
+
+using Key = ConflictSet::Key;
+
+static auto operator<=>(const Key &lhs, const Key &rhs) {
+  const int minLen = std::min(lhs.len, rhs.len);
+  const int c = memcmp(lhs.p, rhs.p, minLen);
+  return c != 0 ? c <=> 0 : lhs.len <=> rhs.len;
+}
+
 namespace {
 // A node in the tree representing write conflict history. This tree maintains
 // several invariants:
@@ -236,16 +499,6 @@ struct Node {
   }
 };
 
-// TODO: use a better prng. This is technically vulnerable to a
-// denial-of-service attack that can make conflict-checking linear in the
-// number of nodes in the tree.
-thread_local uint32_t gSeed = 1013904223L;
-uint32_t fastRand() {
-  auto result = gSeed;
-  gSeed = gSeed * 1664525L + 1013904223L;
-  return result;
-}
-
 // Note: `rangeVersion` is left uninitialized.
 Node *createNode(const Key &key, Node *parent, int64_t pointVersion) {
   assert(key.len <= std::numeric_limits::max());
@@ -255,7 +508,7 @@ Node *createNode(const Key &key, Node *parent, int64_t pointVersion) {
   result->child[0] = nullptr;
   result->child[1] = nullptr;
   result->parent = parent;
-  result->priority = fastRand();
+  result->priority = gRandom.next();
 #if SHOW_PRIORITY
   result->priority &= 0xff;
 #endif
@@ -601,7 +854,8 @@ int64_t checkMaxVersion(Node *node, bool &success) {
   return expected;
 }
 
-bool checkInvariants(Node *node) {
+template <class ReferenceImpl>
+bool checkCorrectness(Node *node, ReferenceImpl &refImpl) {
   bool success = true;
   // Check bst invariant
   Arena arena;
@@ -626,6 +880,18 @@
 
   checkMaxVersion(node, success);
   checkParentPointers(node, success);
+  std::string logicalMap;
+  std::string referenceLogicalMap;
+  printLogical(logicalMap, node);
+  refImpl.printLogical(referenceLogicalMap);
+  if (logicalMap != referenceLogicalMap) {
+    fprintf(stderr,
+            "Logical map not equal to reference logical map.\n\nActual:\n"
+            "%s\nExpected:\n%s\n",
+            logicalMap.c_str(), referenceLogicalMap.c_str());
+    success = false;
+  }
+
   return success;
 }
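Aside: `checkCorrectness` is a template so the tree can be diffed against any model that exposes `printLogical`. The concrete `ReferenceImpl` lives further down in this file; purely to illustrate the shape the template expects (hypothetical names and map format throughout):

    #include <cstdint>
    #include <map>
    #include <string>

    // Hypothetical stand-in with the interface checkCorrectness needs: a
    // trivially-correct store that renders the same "logical map" text the
    // tree renders, so any byte of divergence flags a bug in the tree.
    struct MapModelSketch {
      std::map<std::string, int64_t> pointVersions;

      void printLogical(std::string &out) const {
        for (const auto &[key, version] : pointVersions) {
          out += key;
          out += " -> ";
          out += std::to_string(version);
          out += '\n';
        }
      }
    };
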
@@ -706,7 +972,8 @@ struct __attribute__((__visibility__("hidden"))) ConflictSet::Impl {
 
   void addWrites(const WriteRange *writes, int count) {
     Arena arena;
-    auto *stepwiseInserts = new (arena) StepwiseInsert[count];
+    auto stepwiseInserts =
+        std::span(new (arena) StepwiseInsert[count], count);
     for (int i = 0; i < count; ++i) {
       // TODO handle non-singleton writes lol
       assert(writes[i].end.len == 0);
@@ -720,11 +987,9 @@
     // Mitigate potential n^2 behavior of insertion by shuffling the insertion
     // order. Not sure how this interacts with interleaved insertion, but it's
    // probably fine.
-    // TODO better/faster RNG?
-    std::mt19937 g(fastRand());
-    std::shuffle(stepwiseInserts, stepwiseInserts + count, g);
+    shuffle(stepwiseInserts);
 
-    runInterleaved(std::span(stepwiseInserts, count));
+    runInterleaved(stepwiseInserts);
 
     std::vector<Node *, ArenaAlloc<Node *>> workList{
         ArenaAlloc<Node *>(&arena)};
@@ -929,7 +1194,6 @@ struct ReferenceImpl {
 }
 } // namespace
 #ifdef ENABLE_TESTS
-
 int main(void) {
   int64_t writeVersion = 0;
   ConflictSet::Impl cs{writeVersion};
@@ -946,18 +1210,7 @@ int main(void) {
   cs.addWrites(write, kNumKeys);
   refImpl.addWrites(write, kNumKeys);
   debugPrintDot(stdout, cs.root);
-  bool success = checkInvariants(cs.root);
-  std::string logicalMap;
-  std::string referenceLogicalMap;
-  printLogical(logicalMap, cs.root);
-  refImpl.printLogical(referenceLogicalMap);
-  if (logicalMap != referenceLogicalMap) {
-    fprintf(stderr,
-            "Logical map not equal to reference logical map.\n\nActual:\n"
-            "%s\nExpected:\n%s\n",
-            logicalMap.c_str(), referenceLogicalMap.c_str());
-    success = false;
-  }
+  bool success = checkCorrectness(cs.root, refImpl);
   return success ? 0 : 1;
 }
 #endif
\ No newline at end of file
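Aside: the new `shuffle` replaces the `std::mt19937` + `fastRand` combination in `addWrites`, and because it draws from `gRandom` the shuffled order is reproducible for a given fuzz input. A small usage sketch (hypothetical data):

    #include <span>

    // shuffle() works on anything indexable with a size(), including the
    // std::span that addWrites now builds over its StepwiseInsert batch.
    void shuffleSketch() {
      int batch[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // e.g. keys arriving sorted
      std::span<int> view{batch};
      shuffle(view); // Fisher-Yates, j drawn uniformly from [0, i] via gRandom
    }

That reproducibility is the point of routing everything through initFuzz: a crashing input replays the same insertion order, shuffle included.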