From ee36bda8f83e0754ea0417bc0594ffcc3a100c1f Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Thu, 14 Mar 2024 14:47:11 -0700
Subject: [PATCH] Track initializedness of Node memory more precisely

By not initializing Node members with dummy default values. This has
performance/code size benefits, and improves debugging when running under
valgrind. Unfortunately this also makes it easy to write code that uses
uninitialized memory, so if valgrind doesn't have good coverage then we
might let some uninit usages sneak through. We plan to have good coverage
for valgrind, so I think it's ok. If writing correct code becomes too
tedious then we can go back to initializing Node fields with dummy default
values.
---
 ConflictSet.cpp | 165 ++++++++++++++++++++++++++----------------------
 1 file changed, 90 insertions(+), 75 deletions(-)

diff --git a/ConflictSet.cpp b/ConflictSet.cpp
index 82c090b..74faf0f 100644
--- a/ConflictSet.cpp
+++ b/ConflictSet.cpp
@@ -152,8 +152,14 @@ struct BitSet {
     }
   }
 
+  void init() {
+    for (auto &w : words) {
+      w = 0;
+    }
+  }
+
 private:
-  uint64_t words[4] = {};
+  uint64_t words[4];
 };
 
 bool BitSet::test(int i) const {
@@ -196,28 +202,38 @@ enum Type {
   Type_Node256,
 };
 
+template <class T> struct BoundedFreeListAllocator;
+
 struct Node {
   /* begin section that's copied to the next node */
-  Node *parent = nullptr;
   Entry entry;
-  int32_t partialKeyLen = 0;
-  int16_t numChildren = 0;
-  bool entryPresent = false;
-  uint8_t parentsIndex = 0;
+  Node *parent;
+  int32_t partialKeyLen;
+  int16_t numChildren;
+  bool entryPresent;
+  uint8_t parentsIndex;
   /* end section that's copied to the next node */
-  Type type;
-
-  // Leaving this uninitialized is intentional and necessary for correctness.
-  // Basically it needs to be preserved when going to the free list and back.
-  int32_t partialKeyCapacity;
+  // These survive the free list round trip unscathed, and are only written by
+  // BoundedFreeListAllocator
   uint8_t *partialKey();
+
+  Type getType() const { return type; }
+  int32_t getCapacity() const { return partialKeyCapacity; }
+
+private:
+  template <class T> friend struct BoundedFreeListAllocator;
+  // These are publicly readable, but should only be written by
+  // BoundedFreeListAllocator
+  Type type;
+  int32_t partialKeyCapacity;
 };
 
-constexpr int kNodeCopyBegin = offsetof(Node, parent);
-constexpr int kNodeCopySize = offsetof(Node, type) - kNodeCopyBegin;
+constexpr int kNodeCopyBegin = offsetof(Node, entry);
+constexpr int kNodeCopySize =
+    offsetof(Node, parentsIndex) + sizeof(Node::parentsIndex) - kNodeCopyBegin;
 
 struct Child {
   int64_t childMaxVersion;
 };
@@ -225,49 +241,41 @@
 struct Node0 : Node {
-  Node0() { this->type = Type_Node0; }
+  constexpr static auto kType = Type_Node0;
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node3 : Node {
   constexpr static auto kMaxNodes = 3;
+  constexpr static auto kType = Type_Node3;
   // Sorted
   uint8_t index[kMaxNodes];
   Child children[kMaxNodes];
-  Node3() { this->type = Type_Node3; }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node16 : Node {
+  constexpr static auto kType = Type_Node16;
   constexpr static auto kMaxNodes = 16;
   // Sorted
   uint8_t index[kMaxNodes];
   Child children[kMaxNodes];
-  Node16() { this->type = Type_Node16; }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node48 : Node {
+  constexpr static auto kType = Type_Node48;
   BitSet bitSet;
   Child children[48];
-  int8_t nextFree = 0;
+  int8_t nextFree;
   int8_t index[256];
-  Node48() {
-    memset(index, -1, 256);
-    this->type = Type_Node48;
-  }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
 struct Node256 : Node {
+  constexpr static auto kType = Type_Node256;
   BitSet bitSet;
   Child children[256];
-  Node256() {
-    this->type = Type_Node256;
-    for (int i = 0; i < 256; ++i) {
-      children[i].child = nullptr;
-    }
-  }
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
@@ -311,22 +319,27 @@ static_assert(kBytesPerKey - sizeof(Node0) >= kMinNodeSurplus);
 // Which should give us the budget to pay for the key bytes. (children +
 // entryPresent) is a lower bound on how many keys these bytes are a prefix of
 
-template <class T, int64_t kMemoryBound>
-struct BoundedFreeListAllocator {
+constexpr int64_t kFreeListMaxMemory = 1 << 20;
+
+template <class T> struct BoundedFreeListAllocator {
   static_assert(sizeof(T) >= sizeof(void *));
   static_assert(std::derived_from<T, Node>);
+  static_assert(std::is_trivial_v<T>);
 
   T *allocate(int partialKeyCapacity) {
     if (freeList != nullptr) {
      T *n = (T *)freeList;
+      VALGRIND_MAKE_MEM_DEFINED(freeList, sizeof(freeList));
+      memcpy(&freeList, freeList, sizeof(freeList));
       VALGRIND_MAKE_MEM_UNDEFINED(n, sizeof(T));
       VALGRIND_MAKE_MEM_DEFINED(&n->partialKeyCapacity,
                                 sizeof(n->partialKeyCapacity));
-      VALGRIND_MAKE_MEM_DEFINED(freeList, sizeof(freeList));
-      memcpy(&freeList, freeList, sizeof(freeList));
+      VALGRIND_MAKE_MEM_DEFINED(&n->type, sizeof(n->type));
+      assert(n->type == T::kType);
+      VALGRIND_MAKE_MEM_UNDEFINED(n + 1, n->partialKeyCapacity);
       freeListBytes -= sizeof(T) + n->partialKeyCapacity;
       if (n->partialKeyCapacity >= partialKeyCapacity) {
-        return new (n) T;
+        return n;
       } else {
         // The intent is to filter out too-small nodes in the freelist
         removeNode(n);
@@ -334,7 +347,8 @@ struct BoundedFreeListAllocator {
       }
     }
 
-    auto *result = new (safe_malloc(sizeof(T) + partialKeyCapacity)) T;
+    auto *result = (T *)safe_malloc(sizeof(T) + partialKeyCapacity);
+    result->type = T::kType;
     result->partialKeyCapacity = partialKeyCapacity;
     addNode(result);
     return result;
@@ -342,14 +356,14 @@ struct BoundedFreeListAllocator {
 
   void release(T *p) {
     static_assert(std::is_trivially_destructible_v<T>);
-    if (freeListBytes >= kMemoryBound) {
+    if (freeListBytes >= kFreeListMaxMemory) {
       removeNode(p);
       return safe_free(p);
     }
     memcpy((void *)p, &freeList, sizeof(freeList));
     freeList = p;
     freeListBytes += sizeof(T) + p->partialKeyCapacity;
-    VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(T));
+    VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(T) + p->partialKeyCapacity);
   }
 
   ~BoundedFreeListAllocator() {
@@ -455,7 +469,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
 
 // Precondition - an entry for index must exist in the node
 Node *&getChildExists(Node *self, uint8_t index) {
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -487,7 +501,7 @@ int64_t &maxVersion(Node *n, ConflictSet::Impl *);
 Node *&getInTree(Node *n, ConflictSet::Impl *);
 
 Node *getChild(Node *self, uint8_t index) {
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     return nullptr;
   case Type_Node3: {
@@ -576,7 +590,7 @@ int getChildGeq(Node *self, int child) {
   if (child > 255) {
     return -1;
   }
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     return -1;
   case Type_Node3:
@@ -623,7 +637,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
                         NodeAllocators *allocators) {
 
   // Fast path for if it exists already
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     break;
   case Type_Node3: {
@@ -657,7 +671,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   }
 
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0: {
 
     auto *self0 = static_cast<Node0 *>(self);
@@ -710,9 +724,11 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
     if (self->numChildren == Node16::kMaxNodes) {
       auto *self16 = static_cast<Node16 *>(self);
       auto *newSelf = allocators->node48.allocate(self->partialKeyLen);
+      memset(newSelf->index, -1, sizeof(newSelf->index));
       memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
              kNodeCopySize);
       memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen);
+      newSelf->bitSet.init();
       newSelf->nextFree = Node16::kMaxNodes;
       int i = 0;
       for (auto x : self16->index) {
@@ -730,7 +746,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
 
 insert16:
   auto *self16 = static_cast<Node16 *>(self);
-  assert(self->type == Type_Node16);
+  assert(self->getType() == Type_Node16);
   ++self->numChildren;
 
   int i = 0;
@@ -753,6 +769,9 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
     if (self->numChildren == 48) {
       auto *self48 = static_cast<Node48 *>(self);
       auto *newSelf = allocators->node256.allocate(self->partialKeyLen);
+      for (int i = 0; i < 256; ++i) {
+        newSelf->children[i].child = nullptr;
+      }
       memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
              kNodeCopySize);
       memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
@@ -819,7 +838,7 @@ template
 void freeAndMakeCapacityAtLeast(Node *&self, int capacity,
                                 NodeAllocators *allocators,
                                 ConflictSet::Impl *impl) {
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0: {
     auto *self0 = (Node0 *)self;
     auto *newSelf = allocators->node0.allocate(capacity);
@@ -884,6 +903,8 @@ void freeAndMakeCapacityAtLeast(Node *&self, int capacity,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
     newSelf->bitSet = self48->bitSet;
+    // TODO check codegen here?
+    newSelf->nextFree = 0;
     newSelf->bitSet.forEachInRange(
         [&](int c) {
          int index = newSelf->nextFree;
@@ -933,7 +954,7 @@ void maybeDecreaseCapacity(Node *&self, NodeAllocators *allocators,
                            ConflictSet::Impl *impl) {
   const int maxCapacity =
       (self->numChildren + int(self->entryPresent)) * self->partialKeyLen;
-  if (self->partialKeyCapacity <= maxCapacity) {
+  if (self->getCapacity() <= maxCapacity) {
     return;
   }
   freeAndMakeCapacityAtLeast(self, maxCapacity,
@@ -948,7 +969,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
   fprintf(stderr, "maybeDownsize: %s\n",
           getSearchPathPrintable(self).c_str());
 #endif
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -966,7 +987,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
 
     auto *child = self3->children[0].child;
     int minCapacity = self3->partialKeyLen + 1 + child->partialKeyLen;
-    if (minCapacity > child->partialKeyCapacity) {
+    if (minCapacity > child->getCapacity()) {
       const bool update = child == dontInvalidate;
       freeAndMakeCapacityAtLeast(child, minCapacity, allocators, impl);
 
@@ -1011,6 +1032,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen);
     // TODO replace with memcpy?
+    static_assert(Node3::kMaxNodes == kMinChildrenNode16 - 1);
     for (int i = 0; i < Node3::kMaxNodes; ++i) {
       newSelf->index[i] = self16->index[i];
       newSelf->children[i] = self16->children[i];
     }
@@ -1028,6 +1050,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
 
+    static_assert(Node16::kMaxNodes == kMinChildrenNode48 - 1);
     int i = 0;
     self48->bitSet.forEachInRange(
         [&](int c) {
@@ -1051,10 +1074,12 @@ void maybeDownsize(Node *self, NodeAllocators *allocators,
     if (self->numChildren + int(self->entryPresent) < kMinChildrenNode256) {
       auto *self256 = (Node256 *)self;
       auto *newSelf = allocators->node48.allocate(self->partialKeyLen);
+      memset(newSelf->index, -1, sizeof(newSelf->index));
       memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
              kNodeCopySize);
       memcpy(newSelf->partialKey(), self256->partialKey(), self->partialKeyLen);
+      newSelf->nextFree = 0;
       newSelf->bitSet = self256->bitSet;
       newSelf->bitSet.forEachInRange(
           [&](int c) {
@@ -1103,7 +1128,7 @@ Node *erase(Node *self, NodeAllocators *allocators, ConflictSet::Impl *impl,
     return result;
   }
 
-  switch (self->type) {
+  switch (self->getType()) {
   case Type_Node0:
     allocators->node0.release((Node0 *)self);
     break;
@@ -1123,7 +1148,7 @@ Node *erase(Node *self, NodeAllocators *allocators, ConflictSet::Impl *impl,
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   }
 
-  switch (parent->type) {
+  switch (parent->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -1481,7 +1506,7 @@ int64_t maxBetweenExclusive(Node *n, int begin, int end) {
       return result;
     }
   }
-  switch (n->type) {
+  switch (n->getType()) {
   case Type_Node0:
     // We would have returned above, after not finding a child
     __builtin_unreachable(); // GCOVR_EXCL_LINE
@@ -2027,7 +2052,7 @@ template
   } else {
     // Consider adding a partial key
     if ((*self)->numChildren == 0 && !(*self)->entryPresent) {
-      assert((*self)->partialKeyCapacity >= int(key.size()));
+      assert((*self)->getCapacity() >= int(key.size()));
       (*self)->partialKeyLen = key.size();
       memcpy((*self)->partialKey(), key.data(), (*self)->partialKeyLen);
       key = key.subspan((*self)->partialKeyLen,
@@ -2054,6 +2079,9 @@ template
     auto &child = getOrCreateChild(*self, key.front(), allocators);
     if (!child) {
       child = allocators->node0.allocate(key.size() - 1);
+      child->numChildren = 0;
+      child->entryPresent = false;
+      child->partialKeyLen = 0;
       child->parent = *self;
       child->parentsIndex = key.front();
       maxVersion(child, impl) =
@@ -2348,12 +2376,17 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
   explicit Impl(int64_t oldestVersion) : oldestVersion(oldestVersion) {
     // Insert ""
     root = allocators.node0.allocate(0);
+    root->numChildren = 0;
+    root->parent = nullptr;
     rootMaxVersion = oldestVersion;
-    root->entry.pointVersion = oldestVersion;
-    root->entry.rangeVersion = oldestVersion;
+    root->entryPresent = false;
+    root->partialKeyLen = 0;
 
     addKey(root);
 
+    root->entryPresent = true;
+    root->entry.pointVersion = oldestVersion;
+    root->entry.rangeVersion = oldestVersion;
   }
 
   ~Impl() { destroyTree(root); }
@@ -2375,7 +2408,7 @@ int64_t &maxVersion(Node *n, ConflictSet::Impl *impl) {
   if (n == nullptr) {
     return impl->rootMaxVersion;
   }
-  switch (n->type) {
+  switch (n->getType()) {
   case Type_Node0:
     __builtin_unreachable(); // GCOVR_EXCL_LINE
   case Type_Node3: {
@@ -2654,7 +2687,7 @@ Iterator firstGeq(Node *n, std::string_view key) {
 
 [[maybe_unused]] void checkMemoryBoundInvariants(Node *node, bool
                                                  &success) {
   int minNumChildren;
-  switch (node->type) {
+  switch (node->getType()) {
   case Type_Node0:
     minNumChildren = 0;
     break;
@@ -2718,7 +2751,7 @@ int64_t keyBytes = 0;
 int64_t peakKeyBytes = 0;
 
 int64_t getNodeSize(struct Node *n) {
-  switch (n->type) {
+  switch (n->getType()) {
   case Type_Node0:
     return sizeof(Node0);
   case Type_Node3:
@@ -2750,7 +2783,7 @@ int64_t getSearchPathLength(Node *n) {
 
 void addNode(Node *n) {
   nodeBytes += getNodeSize(n);
-  partialCapacityBytes += n->partialKeyCapacity;
+  partialCapacityBytes += n->getCapacity();
   if (nodeBytes > peakNodeBytes) {
     peakNodeBytes = nodeBytes;
   }
@@ -2761,7 +2794,7 @@ void addNode(Node *n) {
 
 void removeNode(Node *n) {
   nodeBytes -= getNodeSize(n);
-  partialCapacityBytes -= n->partialKeyCapacity;
+  partialCapacityBytes -= n->getCapacity();
 }
 
 void addKey(Node *n) {
@@ -2824,25 +2857,7 @@
 void printTree() { debugPrintDot(stdout, cs.root, &cs); }
 
-#define ANKERL_NANOBENCH_IMPLEMENT
-#include "third_party/nanobench.h"
-
-int main(void) {
-  printTree();
-  return 0;
-  ankerl::nanobench::Bench bench;
-  ConflictSet::Impl cs{0};
-  for (int j = 0; j < 256; ++j) {
-    getOrCreateChild(cs.root, j, &cs.allocators) =
-        cs.allocators.node0.allocate(0);
-    if (j % 10 == 0) {
-      bench.run("MaxExclusive " + std::to_string(j), [&]() {
-        bench.doNotOptimizeAway(maxBetweenExclusive(cs.root, 0, 256));
-      });
-    }
-  }
-  return 0;
-}
+int main(void) { printTree(); }
 
 #endif
 
 #ifdef ENABLE_FUZZ
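
The hunks above rely on valgrind's memcheck client requests to keep the tool's
notion of which Node bytes are defined accurate across free list round trips.
Below is a minimal standalone sketch of that pattern, not part of the patch:
it assumes <valgrind/memcheck.h>, and MiniNode, MiniFreeList, and capacity are
invented stand-ins for Node, BoundedFreeListAllocator, and partialKeyCapacity.

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <valgrind/memcheck.h>

// Invented stand-in for Node: `capacity` survives free-list round trips,
// everything else must be treated as uninitialized after allocate().
struct MiniNode {
  int64_t payload;  // overwritten by the free-list link while on the list
  int32_t capacity; // written once by the allocator, preserved thereafter
};
static_assert(sizeof(MiniNode) >= sizeof(void *));

// Invented stand-in for BoundedFreeListAllocator (no memory bound, no node
// accounting), showing only the memcheck bookkeeping.
struct MiniFreeList {
  void *freeList = nullptr;

  MiniNode *allocate(int32_t capacity) {
    if (freeList != nullptr) {
      MiniNode *n = (MiniNode *)freeList;
      // The whole node is NOACCESS while on the list; expose just the
      // embedded next pointer long enough to pop it.
      VALGRIND_MAKE_MEM_DEFINED(freeList, sizeof(freeList));
      memcpy(&freeList, freeList, sizeof(freeList));
      // Hand the node back as "uninitialized"...
      VALGRIND_MAKE_MEM_UNDEFINED(n, sizeof(MiniNode));
      // ...except the one field documented to survive the round trip.
      VALGRIND_MAKE_MEM_DEFINED(&n->capacity, sizeof(n->capacity));
      if (n->capacity >= capacity) {
        return n;
      }
      free(n); // too small; fall through and allocate fresh
    }
    auto *n = (MiniNode *)malloc(sizeof(MiniNode));
    n->capacity = capacity; // the allocator is the only writer of capacity
    return n;
  }

  void release(MiniNode *p) {
    memcpy((void *)p, &freeList, sizeof(freeList)); // reuse p as the link
    freeList = p;
    // Any read of the node while it sits on the free list is a bug.
    VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(MiniNode));
  }
};

As in the patch, the only field that may be read after allocate() without
first being written is capacity; everything else is deliberately left
undefined so that memcheck can flag stray reads of recycled nodes.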