diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index 82c090b..74faf0f 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -152,8 +152,14 @@ struct BitSet {
     }
   }
 
+  void init() {
+    for (auto &w : words) {
+      w = 0;
+    }
+  }
+
 private:
-  uint64_t words[4] = {};
+  uint64_t words[4];
 };
 
 bool BitSet::test(int i) const {
@@ -196,28 +202,38 @@ enum Type {
   Type_Node256,
 };
 
+template <class T> struct BoundedFreeListAllocator;
+
 struct Node {
   /* begin section that's copied to the next node */
-  Node *parent = nullptr;
   Entry entry;
-  int32_t partialKeyLen = 0;
-  int16_t numChildren = 0;
-  bool entryPresent = false;
-  uint8_t parentsIndex = 0;
+  Node *parent;
+  int32_t partialKeyLen;
+  int16_t numChildren;
+  bool entryPresent;
+  uint8_t parentsIndex;
   /* end section that's copied to the next node */
-  Type type;
-
-  // Leaving this uninitialized is intentional and necessary for correctness.
-  // Basically it needs to be preserved when going to the free list and back.
-  int32_t partialKeyCapacity;
 
+  // These survive the free list round trip unscathed, and are only written by
+  // BoundedFreeListAllocator
   uint8_t *partialKey();
+
+  Type getType() const { return type; }
+  int32_t getCapacity() const { return partialKeyCapacity; }
+
+private:
+  template <class T> friend struct BoundedFreeListAllocator;
+  // These are publicly readable, but should only be written by
+  // BoundedFreeListAllocator
+  Type type;
+  int32_t partialKeyCapacity;
 };
 
-constexpr int kNodeCopyBegin = offsetof(Node, parent);
-constexpr int kNodeCopySize = offsetof(Node, type) - kNodeCopyBegin;
+constexpr int kNodeCopyBegin = offsetof(Node, entry);
+constexpr int kNodeCopySize =
+    offsetof(Node, parentsIndex) + sizeof(Node::parentsIndex) - kNodeCopyBegin;
 
 struct Child {
   int64_t childMaxVersion;
@@ -225,49 +241,41 @@ struct Child {
 };
 
 struct Node0 : Node {
-  Node0() { this->type = Type_Node0; }
+  constexpr static auto kType = Type_Node0;
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node3 : Node {
   constexpr static auto kMaxNodes = 3;
+  constexpr static auto kType = Type_Node3;
 
   // Sorted
   uint8_t index[kMaxNodes];
   Child children[kMaxNodes];
 
-  Node3() { this->type = Type_Node3; }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node16 : Node {
+  constexpr static auto kType = Type_Node16;
   constexpr static auto kMaxNodes = 16;
 
   // Sorted
   uint8_t index[kMaxNodes];
   Child children[kMaxNodes];
 
-  Node16() { this->type = Type_Node16; }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node48 : Node {
+  constexpr static auto kType = Type_Node48;
   BitSet bitSet;
   Child children[48];
-  int8_t nextFree = 0;
+  int8_t nextFree;
   int8_t index[256];
-  Node48() {
-    memset(index, -1, 256);
-    this->type = Type_Node48;
-  }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node256 : Node {
+  constexpr static auto kType = Type_Node256;
   BitSet bitSet;
   Child children[256];
-  Node256() {
-    this->type = Type_Node256;
-    for (int i = 0; i < 256; ++i) {
-      children[i].child = nullptr;
-    }
-  }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
@@ -311,22 +319,27 @@ static_assert(kBytesPerKey - sizeof(Node0) >= kMinNodeSurplus);
 // Which should give us the budget to pay for the key bytes. (children +
 // entryPresent) is a lower bound on how many keys these bytes are a prefix of
 
-template <class T, int64_t kMemoryBound>
-struct BoundedFreeListAllocator {
+constexpr int64_t kFreeListMaxMemory = 1 << 20;
+
+template <class T> struct BoundedFreeListAllocator {
   static_assert(sizeof(T) >= sizeof(void *));
   static_assert(std::derived_from<T, Node>);
+  static_assert(std::is_trivial_v<T>);
 
   T *allocate(int partialKeyCapacity) {
     if (freeList != nullptr) {
       T *n = (T *)freeList;
+      VALGRIND_MAKE_MEM_DEFINED(freeList, sizeof(freeList));
+      memcpy(&freeList, freeList, sizeof(freeList));
       VALGRIND_MAKE_MEM_UNDEFINED(n, sizeof(T));
       VALGRIND_MAKE_MEM_DEFINED(&n->partialKeyCapacity,
                                 sizeof(n->partialKeyCapacity));
-      VALGRIND_MAKE_MEM_DEFINED(freeList, sizeof(freeList));
-      memcpy(&freeList, freeList, sizeof(freeList));
+      VALGRIND_MAKE_MEM_DEFINED(&n->type, sizeof(n->type));
+      assert(n->type == T::kType);
+      VALGRIND_MAKE_MEM_UNDEFINED(n + 1, n->partialKeyCapacity);
       freeListBytes -= sizeof(T) + n->partialKeyCapacity;
       if (n->partialKeyCapacity >= partialKeyCapacity) {
-        return new (n) T;
+        return n;
       } else {
         // The intent is to filter out too-small nodes in the freelist
         removeNode(n);
@@ -334,7 +347,8 @@ struct BoundedFreeListAllocator {
       }
     }
 
-    auto *result = new (safe_malloc(sizeof(T) + partialKeyCapacity)) T;
+    auto *result = (T *)safe_malloc(sizeof(T) + partialKeyCapacity);
+    result->type = T::kType;
     result->partialKeyCapacity = partialKeyCapacity;
    addNode(result);
     return result;
@@ -342,14 +356,14 @@ struct BoundedFreeListAllocator {
 
   void release(T *p) {
     static_assert(std::is_trivially_destructible_v<T>);
-    if (freeListBytes >= kMemoryBound) {
+    if (freeListBytes >= kFreeListMaxMemory) {
       removeNode(p);
       return safe_free(p);
     }
     memcpy((void *)p, &freeList, sizeof(freeList));
     freeList = p;
     freeListBytes += sizeof(T) + p->partialKeyCapacity;
-    VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(T));
+    VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(T) + p->partialKeyCapacity);
   }
 
   ~BoundedFreeListAllocator() {
@@ -455,7 +469,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
 
 // Precondition - an entry for index must exist in the node
 Node *&getChildExists(Node *self, uint8_t index) {
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -487,7 +501,7 @@ int64_t &maxVersion(Node *n, ConflictSet::Impl *);
 Node *&getInTree(Node *n, ConflictSet::Impl *);
 
 Node *getChild(Node *self, uint8_t index) {
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     return nullptr;
   case Type_Node3: {
@@ -576,7 +590,7 @@ int getChildGeq(Node *self, int child) {
   if (child > 255) {
     return -1;
   }
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     return -1;
   case Type_Node3:
@@ -623,7 +637,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
                         NodeAllocators *allocators) {
 
   // Fast path for if it exists already
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     break;
   case Type_Node3: {
@@ -657,7 +671,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   }
 
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0: {
     auto *self0 = static_cast<Node0 *>(self);
@@ -710,9 +724,11 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
   if (self->numChildren == Node16::kMaxNodes) {
     auto *self16 = static_cast<Node16 *>(self);
     auto *newSelf = allocators->node48.allocate(self->partialKeyLen);
+    memset(newSelf->index, -1, sizeof(newSelf->index));
     memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen);
+    newSelf->bitSet.init();
     newSelf->nextFree = Node16::kMaxNodes;
     int i = 0;
     for (auto x : self16->index) {
@@ -730,7 +746,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
 
 insert16:
   auto *self16 = static_cast<Node16 *>(self);
-  assert(self->type == Type_Node16);
+  assert(self->getType() == Type_Node16);
   ++self->numChildren;
 
   int i = 0;
@@ -753,6 +769,9 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
   if (self->numChildren == 48) {
     auto *self48 = static_cast<Node48 *>(self);
     auto *newSelf = allocators->node256.allocate(self->partialKeyLen);
+    for (int i = 0; i < 256; ++i) {
+      newSelf->children[i].child = nullptr;
+    }
     memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
@@ -819,7 +838,7 @@ template
 void freeAndMakeCapacityAtLeast(Node *&self, int capacity,
                                 NodeAllocators *allocators,
                                 ConflictSet::Impl *impl) {
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0: {
     auto *self0 = (Node0 *)self;
     auto *newSelf = allocators->node0.allocate(capacity);
@@ -884,6 +903,8 @@ void freeAndMakeCapacityAtLeast(Node *&self, int capacity,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
     newSelf->bitSet = self48->bitSet;
+    // TODO check codegen here?
+    newSelf->nextFree = 0;
     newSelf->bitSet.forEachInRange(
         [&](int c) {
           int index = newSelf->nextFree;
@@ -933,7 +954,7 @@ void maybeDecreaseCapacity(Node *&self, NodeAllocators *allocators,
                            ConflictSet::Impl *impl) {
   const int maxCapacity =
       (self->numChildren + int(self->entryPresent)) * self->partialKeyLen;
-  if (self->partialKeyCapacity <= maxCapacity) {
+  if (self->getCapacity() <= maxCapacity) {
     return;
   }
   freeAndMakeCapacityAtLeast(self, maxCapacity,
@@ -948,7 +969,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
   fprintf(stderr, "maybeDownsize: %s\n",
           getSearchPathPrintable(self).c_str());
 #endif
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -966,7 +987,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
     auto *child = self3->children[0].child;
     int minCapacity =
         self3->partialKeyLen + 1 + child->partialKeyLen;
-    if (minCapacity > child->partialKeyCapacity) {
+    if (minCapacity > child->getCapacity()) {
       const bool update = child == dontInvalidate;
       freeAndMakeCapacityAtLeast(child, minCapacity,
                                  allocators, impl);
@@ -1011,6 +1032,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen);
     // TODO replace with memcpy?
+    static_assert(Node3::kMaxNodes == kMinChildrenNode16 - 1);
     for (int i = 0; i < Node3::kMaxNodes; ++i) {
       newSelf->index[i] = self16->index[i];
       newSelf->children[i] = self16->children[i];
     }
@@ -1028,6 +1050,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
+    static_assert(Node16::kMaxNodes == kMinChildrenNode48 - 1);
     int i = 0;
     self48->bitSet.forEachInRange(
         [&](int c) {
@@ -1051,10 +1074,12 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
   if (self->numChildren + int(self->entryPresent) < kMinChildrenNode256) {
     auto *self256 = (Node256 *)self;
     auto *newSelf = allocators->node48.allocate(self->partialKeyLen);
+    memset(newSelf->index, -1, sizeof(newSelf->index));
     memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self256->partialKey(),
            self->partialKeyLen);
+    newSelf->nextFree = 0;
     newSelf->bitSet = self256->bitSet;
     newSelf->bitSet.forEachInRange(
         [&](int c) {
@@ -1103,7 +1128,7 @@ Node *erase(Node *self, NodeAllocators *allocators, ConflictSet::Impl *impl,
     return result;
   }
 
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     allocators->node0.release((Node0 *)self);
     break;
@@ -1123,7 +1148,7 @@ Node *erase(Node *self, NodeAllocators *allocators, ConflictSet::Impl *impl,
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   }
 
-  switch (parent->type) {
+  switch (parent->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -1481,7 +1506,7 @@ int64_t maxBetweenExclusive(Node *n, int begin, int end) {
       return result;
     }
   }
-  switch (n->type) {
+  switch (n->getType()) {
   case Type_Node0:
     // We would have returned above, after not finding a child
     __builtin_unreachable(); // GCOVR_EXCL_LINE
@@ -2027,7 +2052,7 @@ template
   } else {
     // Consider adding a partial key
     if ((*self)->numChildren == 0 && !(*self)->entryPresent) {
-      assert((*self)->partialKeyCapacity >= int(key.size()));
+      assert((*self)->getCapacity() >= int(key.size()));
       (*self)->partialKeyLen = key.size();
       memcpy((*self)->partialKey(), key.data(), (*self)->partialKeyLen);
       key = key.subspan((*self)->partialKeyLen,
@@ -2054,6 +2079,9 @@ template
     auto &child = getOrCreateChild(*self, key.front(), allocators);
     if (!child) {
       child = allocators->node0.allocate(key.size() - 1);
+      child->numChildren = 0;
+      child->entryPresent = false;
+      child->partialKeyLen = 0;
       child->parent = *self;
       child->parentsIndex = key.front();
       maxVersion(child, impl) =
@@ -2348,12 +2376,17 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
 
   explicit Impl(int64_t oldestVersion) : oldestVersion(oldestVersion) {
     // Insert ""
     root = allocators.node0.allocate(0);
+    root->numChildren = 0;
+    root->parent = nullptr;
     rootMaxVersion = oldestVersion;
-    root->entry.pointVersion = oldestVersion;
-    root->entry.rangeVersion = oldestVersion;
+    root->entryPresent = false;
+    root->partialKeyLen = 0;
     addKey(root);
+    root->entryPresent = true;
+    root->entry.pointVersion = oldestVersion;
+    root->entry.rangeVersion = oldestVersion;
   }
 
   ~Impl() { destroyTree(root); }
@@ -2375,7 +2408,7 @@ int64_t &maxVersion(Node *n, ConflictSet::Impl *impl) {
   if (n == nullptr) {
     return impl->rootMaxVersion;
   }
-  switch (n->type) {
+  switch (n->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -2654,7 +2687,7 @@ Iterator firstGeq(Node *n, std::string_view key) {
 
 [[maybe_unused]] void checkMemoryBoundInvariants(Node *node, bool &success) {
   int minNumChildren;
-  switch (node->type) {
+  switch (node->getType()) {
   case Type_Node0:
     minNumChildren = 0;
     break;
@@ -2718,7 +2751,7 @@ int64_t keyBytes = 0;
 int64_t peakKeyBytes = 0;
 
 int64_t getNodeSize(struct Node *n) {
-  switch (n->type) {
+  switch (n->getType()) {
   case Type_Node0:
     return sizeof(Node0);
   case Type_Node3:
@@ -2750,7 +2783,7 @@ int64_t getSearchPathLength(Node *n) {
 
 void addNode(Node *n) {
   nodeBytes += getNodeSize(n);
-  partialCapacityBytes += n->partialKeyCapacity;
+  partialCapacityBytes += n->getCapacity();
   if (nodeBytes > peakNodeBytes) {
     peakNodeBytes = nodeBytes;
   }
@@ -2761,7 +2794,7 @@ void addNode(Node *n) {
 
 void removeNode(Node *n) {
   nodeBytes -= getNodeSize(n);
-  partialCapacityBytes -= n->partialKeyCapacity;
+  partialCapacityBytes -= n->getCapacity();
 }
 
 void addKey(Node *n) {
@@ -2824,25 +2857,7 @@ void printTree() {
   debugPrintDot(stdout, cs.root, &cs);
 }
 
-#define ANKERL_NANOBENCH_IMPLEMENT
-#include "third_party/nanobench.h"
-
-int main(void) {
-  printTree();
-  return 0;
-  ankerl::nanobench::Bench bench;
-  ConflictSet::Impl cs{0};
-  for (int j = 0; j < 256; ++j) {
-    getOrCreateChild(cs.root, j, &cs.allocators) =
-        cs.allocators.node0.allocate(0);
-    if (j % 10 == 0) {
-      bench.run("MaxExclusive " + std::to_string(j), [&]() {
-        bench.doNotOptimizeAway(maxBetweenExclusive(cs.root, 0, 256));
-      });
-    }
-  }
-  return 0;
-}
+int main(void) { printTree(); }
 
 #endif
 #ifdef ENABLE_FUZZ
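Reviewer note on the allocator change above: the patch makes the node types trivial, so `type` and `partialKeyCapacity` are written exactly once by `BoundedFreeListAllocator` and then survive free-list round trips, because the intrusive free list only overwrites the first `sizeof(void *)` bytes of a released node. Below is a minimal standalone sketch of that pattern for orientation, not the patch's exact code: `DemoNode`, `kDemoTag`, and `kMaxFreeBytes` are invented names, and the Valgrind client requests and `addNode`/`removeNode` accounting hooks of the real code are omitted.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <type_traits>

// Stand-in for a node. The first sizeof(void *) bytes (here `parent`) are
// clobbered while the node sits on the free list; `typeTag` and `capacity`
// live after them and survive the round trip untouched.
struct DemoNode {
  DemoNode *parent;
  int32_t typeTag;
  int32_t capacity; // bytes of payload allocated after the struct
};
static_assert(std::is_trivial_v<DemoNode>);
static_assert(sizeof(DemoNode) >= sizeof(void *));

constexpr int32_t kDemoTag = 0xD0;

struct BoundedFreeList {
  void *freeList = nullptr;
  int64_t freeListBytes = 0;
  static constexpr int64_t kMaxFreeBytes = 1 << 20;

  DemoNode *allocate(int32_t capacity) {
    if (freeList != nullptr) {
      auto *n = (DemoNode *)freeList;
      // Pop: a free node's first bytes hold the next-free pointer.
      memcpy(&freeList, freeList, sizeof(freeList));
      freeListBytes -= sizeof(DemoNode) + n->capacity;
      assert(n->typeTag == kDemoTag); // metadata survived the free list
      if (n->capacity >= capacity) {
        return n; // no constructor runs: the type is trivial
      }
      free(n); // too small for this request; fall through to malloc
    }
    auto *n = (DemoNode *)malloc(sizeof(DemoNode) + capacity);
    n->typeTag = kDemoTag; // written once, preserved forever after
    n->capacity = capacity;
    return n;
  }

  void release(DemoNode *p) {
    if (freeListBytes >= kMaxFreeBytes) {
      free(p); // bound reached: give memory back to the system
      return;
    }
    // Push: overwrite only the first bytes with the old head pointer.
    memcpy((void *)p, &freeList, sizeof(freeList));
    freeList = p;
    freeListBytes += sizeof(DemoNode) + p->capacity;
  }
};
```

Keeping the nodes trivial is what makes the patch's `return n;` (instead of the old `return new (n) T;`) legal: there is no constructor whose effects could be skipped, so a recycled allocation is indistinguishable from a fresh one as long as callers initialize the fields the allocator does not own.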
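A second note, on why `kNodeCopyBegin`/`kNodeCopySize` had to be recomputed: the copyable run of fields now starts at `entry` and ends at `parentsIndex`, with the allocator-owned `type`/`partialKeyCapacity` moved behind it, so the constants are derived from the first and last copied members rather than from `parent` and `type`. Here is a sketch of the general offsetof-range-copy idiom, using a hypothetical `Header` type rather than the patch's `Node`:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

struct Header {
  /* begin fields shared by every node size, copied on grow/shrink */
  int64_t entry;
  Header *parent;
  int32_t partialKeyLen;
  int16_t numChildren;
  bool entryPresent;
  uint8_t parentsIndex;
  /* end copied fields; everything below is owned by the allocator */
  int32_t type;
  int32_t capacity;
};

// The copied range is [offsetof(first), offsetof(last) + sizeof(last)).
constexpr size_t kCopyBegin = offsetof(Header, entry);
constexpr size_t kCopySize =
    offsetof(Header, parentsIndex) + sizeof(Header::parentsIndex) - kCopyBegin;

// Moves the shared prefix between two differently-sized node layouts while
// leaving the destination's allocator-owned fields untouched.
void copyHeader(Header *dst, const Header *src) {
  memcpy((char *)dst + kCopyBegin, (const char *)src + kCopyBegin, kCopySize);
}
```

Ending the range at the last copied member instead of at the first excluded one keeps the copy from depending on where padding lands, which matters in the patch because the bytes after `parentsIndex` now belong to the allocator and must never be stamped with another node's `type`.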