From 3fb8bf7c3b59f7447ce4c7e322cd596e57fb6d47 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 8 Mar 2024 14:43:18 -0800 Subject: [PATCH] Bring back custom allocator --- ConflictSet.cpp | 186 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 149 insertions(+), 37 deletions(-) diff --git a/ConflictSet.cpp b/ConflictSet.cpp index e0856b8..76e8ca6 100644 --- a/ConflictSet.cpp +++ b/ConflictSet.cpp @@ -244,13 +244,73 @@ struct Node256 : Node { uint8_t *partialKey() { return (uint8_t *)(this + 1); } }; -template NodeT *newNode(int partialKeyCapacity) { - auto *result = new (safe_malloc(sizeof(NodeT) + partialKeyCapacity)) NodeT; -#ifndef NDEBUG - result->partialKeyCapacity = partialKeyCapacity; +// Bounds memory usage in free list, but does not account for memory for partial +// keys. +template +struct BoundedFreeListAllocator { + static_assert(sizeof(T) >= sizeof(void *)); + static_assert(std::derived_from); + + T *allocate(int partialKeyCapacity) { +#if SHOW_MEMORY + ++liveAllocations; + maxLiveAllocations = std::max(maxLiveAllocations, liveAllocations); #endif - return result; -} + if (freeList != nullptr) { + T *n = (T *)freeList; + VALGRIND_MAKE_MEM_DEFINED(n, sizeof(T)); + if (n->partialKeyLen >= partialKeyCapacity) { + memcpy(&freeList, freeList, sizeof(freeList)); + --freeListSize; + VALGRIND_MAKE_MEM_UNDEFINED(n, sizeof(T)); + return new (n) T; + } + VALGRIND_MAKE_MEM_NOACCESS(n, sizeof(T)); + } + + auto *result = new (safe_malloc(sizeof(T) + partialKeyCapacity)) T; +#ifndef NDEBUG + result->partialKeyCapacity = partialKeyCapacity; +#endif + return result; + } + + void release(T *p) { +#if SHOW_MEMORY + --liveAllocations; +#endif + p->~T(); + if (freeListSize == kMaxFreeListSize) { + return free(p); + } + memcpy((void *)p, &freeList, sizeof(freeList)); + freeList = p; + ++freeListSize; + VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(T)); + } + + ~BoundedFreeListAllocator() { + for (void *iter = freeList; iter != nullptr;) { + VALGRIND_MAKE_MEM_DEFINED(iter, sizeof(iter)); + auto *tmp = iter; + memcpy(&iter, iter, sizeof(void *)); + free(tmp); + } + } + +#if SHOW_MEMORY + int64_t highWaterMarkBytes() const { return maxLiveAllocations * sizeof(T); } +#endif + +private: + static constexpr int kMaxFreeListSize = kMemoryBound / sizeof(T); + int freeListSize = 0; + void *freeList = nullptr; +#if SHOW_MEMORY + int64_t maxLiveAllocations = 0; + int64_t liveAllocations = 0; +#endif +}; uint8_t *Node::partialKey() { switch (type) { @@ -267,6 +327,14 @@ uint8_t *Node::partialKey() { } } +struct NodeAllocators { + BoundedFreeListAllocator node0; + BoundedFreeListAllocator node4; + BoundedFreeListAllocator node16; + BoundedFreeListAllocator node48; + BoundedFreeListAllocator node256; +}; + int getNodeIndex(Node16 *self, uint8_t index) { #ifdef HAS_AVX // Based on https://www.the-paper-trail.org/post/art-paper-notes/ @@ -473,7 +541,8 @@ void setChildrenParents(Node256 *n) { // Caller is responsible for assigning a non-null pointer to the returned // reference if null -Node *&getOrCreateChild(Node *&self, uint8_t index) { +Node *&getOrCreateChild(Node *&self, uint8_t index, + NodeAllocators *allocators) { // Fast path for if it exists already if (self->type <= Type::Node16) { @@ -498,11 +567,11 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { if (self->type == Type::Node0) { auto *self0 = static_cast(self); - auto *newSelf = newNode(self->partialKeyLen); + auto *newSelf = allocators->node4.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self0->partialKey(), self->partialKeyLen); - free(self0); + allocators->node0.release(self0); self = newSelf; goto insert16; @@ -511,7 +580,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { auto *self4 = static_cast(self); if (self->numChildren == 4) { - auto *newSelf = newNode(self->partialKeyLen); + auto *newSelf = allocators->node16.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self4->partialKey(), self->partialKeyLen); @@ -520,7 +589,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { newSelf->index[i] = self4->index[i]; newSelf->children[i] = self4->children[i]; } - free(self4); + allocators->node4.release(self4); setChildrenParents(newSelf); self = newSelf; } @@ -531,7 +600,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { if (self->numChildren == 16) { auto *self16 = static_cast(self); - auto *newSelf = newNode(self->partialKeyLen); + auto *newSelf = allocators->node48.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen); @@ -544,7 +613,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { ++i; } assert(i == 16); - free(self16); + allocators->node16.release(self16); setChildrenParents(newSelf); self = newSelf; goto insert48; @@ -572,7 +641,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { if (self->numChildren == 48) { auto *self48 = static_cast(self); - auto *newSelf = newNode(self->partialKeyLen); + auto *newSelf = allocators->node256.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen); @@ -582,7 +651,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { newSelf->children[i] = self48->children[self48->index[i]]; }, 0, 256); - free(self48); + allocators->node48.release(self48); setChildrenParents(newSelf); self = newSelf; goto insert256; @@ -609,9 +678,25 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) { } // Precondition - an entry for index must exist in the node -void eraseChild(Node *self, uint8_t index) { +void eraseChild(Node *self, uint8_t index, NodeAllocators *allocators) { auto *child = getChildExists(self, index); - free(child); + switch (child->type) { + case Type::Node0: + allocators->node0.release((Node0 *)child); + break; + case Type::Node4: + allocators->node4.release((Node4 *)child); + break; + case Type::Node16: + allocators->node16.release((Node16 *)child); + break; + case Type::Node48: + allocators->node48.release((Node48 *)child); + break; + case Type::Node256: + allocators->node256.release((Node256 *)child); + break; + } if (self->type <= Type::Node16) { auto *self16 = static_cast(self); @@ -643,7 +728,7 @@ void eraseChild(Node *self, uint8_t index) { --self->numChildren; if (self->numChildren == 0 && !self->entryPresent && self->parent != nullptr) { - eraseChild(self->parent, self->parentsIndex); + eraseChild(self->parent, self->parentsIndex, allocators); } } @@ -1464,7 +1549,8 @@ bool checkRangeRead(Node *n, std::span begin, // a postcondition. template [[nodiscard]] Node *insert(Node **self, std::span key, - int64_t writeVersion, ConflictSet::Impl *impl) { + int64_t writeVersion, NodeAllocators *allocators, + ConflictSet::Impl *impl) { for (;;) { @@ -1477,7 +1563,7 @@ template auto *old = *self; int64_t oldMaxVersion = maxVersion(old, impl); - *self = newNode(partialKeyIndex); + *self = allocators->node4.allocate(partialKeyIndex); memcpy((char *)*self + kNodeCopyBegin, (char *)old + kNodeCopyBegin, kNodeCopySize); @@ -1487,7 +1573,8 @@ template memcpy((*self)->partialKey(), old->partialKey(), (*self)->partialKeyLen); - getOrCreateChild(*self, old->partialKey()[partialKeyIndex]) = old; + getOrCreateChild(*self, old->partialKey()[partialKeyIndex], + allocators) = old; old->parent = *self; old->parentsIndex = old->partialKey()[partialKeyIndex]; maxVersion(old, impl) = oldMaxVersion; @@ -1501,7 +1588,7 @@ template } else { // Consider adding a partial key if ((*self)->numChildren == 0 && !(*self)->entryPresent) { - assert((*self)->partialKeyCapacity == int(key.size())); + assert((*self)->partialKeyCapacity >= int(key.size())); (*self)->partialKeyLen = key.size(); memcpy((*self)->partialKey(), key.data(), (*self)->partialKeyLen); key = key.subspan((*self)->partialKeyLen, @@ -1525,9 +1612,9 @@ template m = writeVersion; } - auto &child = getOrCreateChild(*self, key.front()); + auto &child = getOrCreateChild(*self, key.front(), allocators); if (!child) { - child = newNode(key.size() - 1); + child = allocators->node0.allocate(key.size() - 1); child->parent = *self; child->parentsIndex = key.front(); maxVersion(child, impl) = @@ -1559,8 +1646,8 @@ void destroyTree(Node *root) { void addPointWrite(Node *&root, int64_t oldestVersion, std::span key, int64_t writeVersion, - ConflictSet::Impl *impl) { - auto *n = insert(&root, key, writeVersion, impl); + NodeAllocators *allocators, ConflictSet::Impl *impl) { + auto *n = insert(&root, key, writeVersion, allocators, impl); if (!n->entryPresent) { auto *p = nextLogical(n); n->entryPresent = true; @@ -1576,13 +1663,15 @@ void addPointWrite(Node *&root, int64_t oldestVersion, void addWriteRange(Node *&root, int64_t oldestVersion, std::span begin, std::span end, - int64_t writeVersion, ConflictSet::Impl *impl) { + int64_t writeVersion, NodeAllocators *allocators, + ConflictSet::Impl *impl) { int lcp = longestCommonPrefix(begin.data(), end.data(), std::min(begin.size(), end.size())); if (lcp == int(begin.size()) && end.size() == begin.size() + 1 && end.back() == 0) { - return addPointWrite(root, oldestVersion, begin, writeVersion, impl); + return addPointWrite(root, oldestVersion, begin, writeVersion, allocators, + impl); } auto remaining = begin.subspan(0, lcp); @@ -1621,7 +1710,8 @@ void addWriteRange(Node *&root, int64_t oldestVersion, begin = begin.subspan(consumed, begin.size() - consumed); end = end.subspan(consumed, end.size() - consumed); - auto *beginNode = insert(useAsRoot, begin, writeVersion, impl); + auto *beginNode = + insert(useAsRoot, begin, writeVersion, allocators, impl); const bool insertedBegin = !beginNode->entryPresent; beginNode->entryPresent = true; @@ -1639,7 +1729,7 @@ void addWriteRange(Node *&root, int64_t oldestVersion, assert(writeVersion >= beginNode->entry.pointVersion); beginNode->entry.pointVersion = writeVersion; - auto *endNode = insert(useAsRoot, end, writeVersion, impl); + auto *endNode = insert(useAsRoot, end, writeVersion, allocators, impl); const bool insertedEnd = !endNode->entryPresent; endNode->entryPresent = true; @@ -1655,7 +1745,7 @@ void addWriteRange(Node *&root, int64_t oldestVersion, if (insertedEnd) { // beginNode may have been invalidated - beginNode = insert(useAsRoot, begin, writeVersion, impl); + beginNode = insert(useAsRoot, begin, writeVersion, allocators, impl); assert(beginNode->entryPresent); } @@ -1664,7 +1754,7 @@ void addWriteRange(Node *&root, int64_t oldestVersion, beginNode = nextLogical(beginNode); old->entryPresent = false; if (old->numChildren == 0 && old->parent != nullptr) { - eraseChild(old->parent, old->parentsIndex); + eraseChild(old->parent, old->parentsIndex, allocators); } } } @@ -1758,10 +1848,12 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl { auto end = std::span(w.end.p, w.end.len); if (w.end.len > 0) { keyUpdates += 3; - addWriteRange(root, oldestVersion, begin, end, writeVersion, this); + addWriteRange(root, oldestVersion, begin, end, writeVersion, + &allocators, this); } else { keyUpdates += 2; - addPointWrite(root, oldestVersion, begin, writeVersion, this); + addPointWrite(root, oldestVersion, begin, writeVersion, &allocators, + this); } } } @@ -1794,7 +1886,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl { assert(n->entry.rangeVersion <= oldestVersion); prev->entryPresent = false; if (prev->numChildren == 0 && prev->parent != nullptr) { - eraseChild(prev->parent, prev->parentsIndex); + eraseChild(prev->parent, prev->parentsIndex, &allocators); } } @@ -1806,7 +1898,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl { explicit Impl(int64_t oldestVersion) : oldestVersion(oldestVersion) { // Insert "" - root = newNode(0); + root = allocators.node0.allocate(0); rootMaxVersion = oldestVersion; root->entry.pointVersion = oldestVersion; root->entry.rangeVersion = oldestVersion; @@ -1814,6 +1906,8 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl { } ~Impl() { destroyTree(root); } + NodeAllocators allocators; + Arena removalKeyArena; std::span removalKey; int64_t keyUpdates = 0; @@ -1873,6 +1967,23 @@ ConflictSet::~ConflictSet() { } } +#if SHOW_MEMORY +__attribute__((visibility("default"))) void showMemory(const ConflictSet &cs) { + ConflictSet::Impl *impl; + memcpy(&impl, &cs, sizeof(impl)); // NOLINT + fprintf(stderr, "Max Node0 memory usage: %" PRId64 "\n", + impl->allocators.node0.highWaterMarkBytes()); + fprintf(stderr, "Max Node4 memory usage: %" PRId64 "\n", + impl->allocators.node4.highWaterMarkBytes()); + fprintf(stderr, "Max Node16 memory usage: %" PRId64 "\n", + impl->allocators.node16.highWaterMarkBytes()); + fprintf(stderr, "Max Node48 memory usage: %" PRId64 "\n", + impl->allocators.node48.highWaterMarkBytes()); + fprintf(stderr, "Max Node256 memory usage: %" PRId64 "\n", + impl->allocators.node256.highWaterMarkBytes()); +} +#endif + ConflictSet::ConflictSet(ConflictSet &&other) noexcept : impl(std::exchange(other.impl, nullptr)) {} @@ -2148,7 +2259,8 @@ int main(void) { ankerl::nanobench::Bench bench; ConflictSet::Impl cs{0}; for (int j = 0; j < 256; ++j) { - getOrCreateChild(cs.root, j) = newNode(0); + getOrCreateChild(cs.root, j, &cs.allocators) = + cs.allocators.node0.allocate(0); if (j % 10 == 0) { bench.run("MaxExclusive " + std::to_string(j), [&]() { bench.doNotOptimizeAway(maxBetweenExclusive(cs.root, 0, 256));