Bring back custom allocator

2024-03-08 14:43:18 -08:00
parent 0c8cb8faa5
commit 3fb8bf7c3b


@@ -244,14 +244,74 @@ struct Node256 : Node {
   uint8_t *partialKey() { return (uint8_t *)(this + 1); }
 };
 
-template <class NodeT> NodeT *newNode(int partialKeyCapacity) {
-  auto *result = new (safe_malloc(sizeof(NodeT) + partialKeyCapacity)) NodeT;
-#ifndef NDEBUG
-  result->partialKeyCapacity = partialKeyCapacity;
-#endif
-  return result;
-}
+// Bounds the memory usage of the free list, but does not account for the
+// memory used by partial keys.
+template <class T, size_t kMemoryBound = (1 << 20)>
+struct BoundedFreeListAllocator {
+  static_assert(sizeof(T) >= sizeof(void *));
+  static_assert(std::derived_from<T, Node>);
+
+  T *allocate(int partialKeyCapacity) {
+#if SHOW_MEMORY
+    ++liveAllocations;
+    maxLiveAllocations = std::max(maxLiveAllocations, liveAllocations);
+#endif
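+    // Try to recycle from the intrusive free list: each free node stores the
+    // next free-list entry in its first sizeof(void *) bytes.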
+    if (freeList != nullptr) {
+      T *n = (T *)freeList;
+      VALGRIND_MAKE_MEM_DEFINED(n, sizeof(T));
+      if (n->partialKeyLen >= partialKeyCapacity) {
+        memcpy(&freeList, freeList, sizeof(freeList));
+        --freeListSize;
+        VALGRIND_MAKE_MEM_UNDEFINED(n, sizeof(T));
+        return new (n) T;
+      }
+      VALGRIND_MAKE_MEM_NOACCESS(n, sizeof(T));
+    }
+    auto *result = new (safe_malloc(sizeof(T) + partialKeyCapacity)) T;
+#ifndef NDEBUG
+    result->partialKeyCapacity = partialKeyCapacity;
+#endif
+    return result;
+  }
+
+  void release(T *p) {
+#if SHOW_MEMORY
+    --liveAllocations;
+#endif
+    p->~T();
+    if (freeListSize == kMaxFreeListSize) {
+      return free(p);
+    }
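+    // Push onto the free list: stash the current head in the node's first
+    // bytes and make this node the new head.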
+    memcpy((void *)p, &freeList, sizeof(freeList));
+    freeList = p;
+    ++freeListSize;
+    VALGRIND_MAKE_MEM_NOACCESS(freeList, sizeof(T));
+  }
+  ~BoundedFreeListAllocator() {
+    for (void *iter = freeList; iter != nullptr;) {
+      VALGRIND_MAKE_MEM_DEFINED(iter, sizeof(iter));
+      auto *tmp = iter;
+      memcpy(&iter, iter, sizeof(void *));
+      free(tmp);
+    }
+  }
+
+#if SHOW_MEMORY
+  int64_t highWaterMarkBytes() const { return maxLiveAllocations * sizeof(T); }
+#endif
+
+private:
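+  // Bound the list length so that cached nodes account for at most
+  // kMemoryBound bytes (not counting their partial key storage).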
+  static constexpr int kMaxFreeListSize = kMemoryBound / sizeof(T);
+  int freeListSize = 0;
+  void *freeList = nullptr;
+#if SHOW_MEMORY
+  int64_t maxLiveAllocations = 0;
+  int64_t liveAllocations = 0;
+#endif
+};
 
 uint8_t *Node::partialKey() {
   switch (type) {
   case Type::Node0:
@@ -267,6 +327,14 @@ uint8_t *Node::partialKey() {
   }
 }
 
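+// One bounded free list per concrete node type. A node released to one
+// allocator is only reused for that same type.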
+struct NodeAllocators {
+  BoundedFreeListAllocator<Node0> node0;
+  BoundedFreeListAllocator<Node4> node4;
+  BoundedFreeListAllocator<Node16> node16;
+  BoundedFreeListAllocator<Node48> node48;
+  BoundedFreeListAllocator<Node256> node256;
+};
+
 int getNodeIndex(Node16 *self, uint8_t index) {
 #ifdef HAS_AVX
   // Based on https://www.the-paper-trail.org/post/art-paper-notes/
@@ -473,7 +541,8 @@ void setChildrenParents(Node256 *n) {
 // Caller is responsible for assigning a non-null pointer to the returned
 // reference if null
-Node *&getOrCreateChild(Node *&self, uint8_t index) {
+Node *&getOrCreateChild(Node *&self, uint8_t index,
+                        NodeAllocators *allocators) {
   // Fast path for if it exists already
   if (self->type <= Type::Node16) {
@@ -498,11 +567,11 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
   if (self->type == Type::Node0) {
     auto *self0 = static_cast<Node0 *>(self);
-    auto *newSelf = newNode<Node4>(self->partialKeyLen);
+    auto *newSelf = allocators->node4.allocate(self->partialKeyLen);
     memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
            kNodeCopySize);
     memcpy(newSelf->partialKey(), self0->partialKey(), self->partialKeyLen);
-    free(self0);
+    allocators->node0.release(self0);
     self = newSelf;
     goto insert16;
@@ -511,7 +580,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
     auto *self4 = static_cast<Node4 *>(self);
     if (self->numChildren == 4) {
-      auto *newSelf = newNode<Node16>(self->partialKeyLen);
+      auto *newSelf = allocators->node16.allocate(self->partialKeyLen);
       memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
              kNodeCopySize);
       memcpy(newSelf->partialKey(), self4->partialKey(), self->partialKeyLen);
@@ -520,7 +589,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
         newSelf->index[i] = self4->index[i];
         newSelf->children[i] = self4->children[i];
       }
-      free(self4);
+      allocators->node4.release(self4);
       setChildrenParents(newSelf);
       self = newSelf;
     }
@@ -531,7 +600,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
     if (self->numChildren == 16) {
       auto *self16 = static_cast<Node16 *>(self);
-      auto *newSelf = newNode<Node48>(self->partialKeyLen);
+      auto *newSelf = allocators->node48.allocate(self->partialKeyLen);
       memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
              kNodeCopySize);
       memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen);
@@ -544,7 +613,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
         ++i;
       }
       assert(i == 16);
-      free(self16);
+      allocators->node16.release(self16);
       setChildrenParents(newSelf);
       self = newSelf;
       goto insert48;
@@ -572,7 +641,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
     if (self->numChildren == 48) {
       auto *self48 = static_cast<Node48 *>(self);
-      auto *newSelf = newNode<Node256>(self->partialKeyLen);
+      auto *newSelf = allocators->node256.allocate(self->partialKeyLen);
       memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin,
              kNodeCopySize);
       memcpy(newSelf->partialKey(), self48->partialKey(), self->partialKeyLen);
@@ -582,7 +651,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
             newSelf->children[i] = self48->children[self48->index[i]];
           },
           0, 256);
-      free(self48);
+      allocators->node48.release(self48);
       setChildrenParents(newSelf);
       self = newSelf;
       goto insert256;
@@ -609,9 +678,25 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
 }
 
 // Precondition - an entry for index must exist in the node
-void eraseChild(Node *self, uint8_t index) {
+void eraseChild(Node *self, uint8_t index, NodeAllocators *allocators) {
   auto *child = getChildExists(self, index);
-  free(child);
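+  // The free lists are per node type, so releasing must dispatch on the
+  // child's dynamic type.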
+  switch (child->type) {
+  case Type::Node0:
+    allocators->node0.release((Node0 *)child);
+    break;
+  case Type::Node4:
+    allocators->node4.release((Node4 *)child);
+    break;
+  case Type::Node16:
+    allocators->node16.release((Node16 *)child);
+    break;
+  case Type::Node48:
+    allocators->node48.release((Node48 *)child);
+    break;
+  case Type::Node256:
+    allocators->node256.release((Node256 *)child);
+    break;
+  }
   if (self->type <= Type::Node16) {
     auto *self16 = static_cast<Node16 *>(self);
@@ -643,7 +728,7 @@ void eraseChild(Node *self, uint8_t index) {
   --self->numChildren;
   if (self->numChildren == 0 && !self->entryPresent &&
       self->parent != nullptr) {
-    eraseChild(self->parent, self->parentsIndex);
+    eraseChild(self->parent, self->parentsIndex, allocators);
   }
 }
@@ -1464,7 +1549,8 @@ bool checkRangeRead(Node *n, std::span<const uint8_t> begin,
 // a postcondition.
 template <bool kBegin>
 [[nodiscard]] Node *insert(Node **self, std::span<const uint8_t> key,
-                           int64_t writeVersion, ConflictSet::Impl *impl) {
+                           int64_t writeVersion, NodeAllocators *allocators,
+                           ConflictSet::Impl *impl) {
   for (;;) {
@@ -1477,7 +1563,7 @@ template <bool kBegin>
       auto *old = *self;
       int64_t oldMaxVersion = maxVersion(old, impl);
-      *self = newNode<Node4>(partialKeyIndex);
+      *self = allocators->node4.allocate(partialKeyIndex);
       memcpy((char *)*self + kNodeCopyBegin, (char *)old + kNodeCopyBegin,
              kNodeCopySize);
@@ -1487,7 +1573,8 @@ template <bool kBegin>
       memcpy((*self)->partialKey(), old->partialKey(),
              (*self)->partialKeyLen);
-      getOrCreateChild(*self, old->partialKey()[partialKeyIndex]) = old;
+      getOrCreateChild(*self, old->partialKey()[partialKeyIndex],
+                       allocators) = old;
       old->parent = *self;
       old->parentsIndex = old->partialKey()[partialKeyIndex];
       maxVersion(old, impl) = oldMaxVersion;
@@ -1501,7 +1588,7 @@ template <bool kBegin>
     } else {
       // Consider adding a partial key
       if ((*self)->numChildren == 0 && !(*self)->entryPresent) {
-        assert((*self)->partialKeyCapacity == int(key.size()));
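+        // A node recycled from the free list may have more partial key
+        // capacity than was requested, so equality no longer holds here.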
+        assert((*self)->partialKeyCapacity >= int(key.size()));
         (*self)->partialKeyLen = key.size();
         memcpy((*self)->partialKey(), key.data(), (*self)->partialKeyLen);
         key = key.subspan((*self)->partialKeyLen,
@@ -1525,9 +1612,9 @@ template <bool kBegin>
       m = writeVersion;
     }
-    auto &child = getOrCreateChild(*self, key.front());
+    auto &child = getOrCreateChild(*self, key.front(), allocators);
     if (!child) {
-      child = newNode<Node0>(key.size() - 1);
+      child = allocators->node0.allocate(key.size() - 1);
       child->parent = *self;
       child->parentsIndex = key.front();
       maxVersion(child, impl) =
@@ -1559,8 +1646,8 @@ void destroyTree(Node *root) {
 void addPointWrite(Node *&root, int64_t oldestVersion,
                    std::span<const uint8_t> key, int64_t writeVersion,
-                   ConflictSet::Impl *impl) {
-  auto *n = insert<true>(&root, key, writeVersion, impl);
+                   NodeAllocators *allocators, ConflictSet::Impl *impl) {
+  auto *n = insert<true>(&root, key, writeVersion, allocators, impl);
   if (!n->entryPresent) {
     auto *p = nextLogical(n);
     n->entryPresent = true;
@@ -1576,13 +1663,15 @@ void addPointWrite(Node *&root, int64_t oldestVersion,
 void addWriteRange(Node *&root, int64_t oldestVersion,
                    std::span<const uint8_t> begin, std::span<const uint8_t> end,
-                   int64_t writeVersion, ConflictSet::Impl *impl) {
+                   int64_t writeVersion, NodeAllocators *allocators,
+                   ConflictSet::Impl *impl) {
   int lcp = longestCommonPrefix(begin.data(), end.data(),
                                 std::min(begin.size(), end.size()));
   if (lcp == int(begin.size()) && end.size() == begin.size() + 1 &&
       end.back() == 0) {
-    return addPointWrite(root, oldestVersion, begin, writeVersion, impl);
+    return addPointWrite(root, oldestVersion, begin, writeVersion, allocators,
+                         impl);
   }
 
   auto remaining = begin.subspan(0, lcp);
@@ -1621,7 +1710,8 @@ void addWriteRange(Node *&root, int64_t oldestVersion,
   begin = begin.subspan(consumed, begin.size() - consumed);
   end = end.subspan(consumed, end.size() - consumed);
 
-  auto *beginNode = insert<true>(useAsRoot, begin, writeVersion, impl);
+  auto *beginNode =
+      insert<true>(useAsRoot, begin, writeVersion, allocators, impl);
   const bool insertedBegin = !beginNode->entryPresent;
   beginNode->entryPresent = true;
@@ -1639,7 +1729,7 @@ void addWriteRange(Node *&root, int64_t oldestVersion,
   assert(writeVersion >= beginNode->entry.pointVersion);
   beginNode->entry.pointVersion = writeVersion;
 
-  auto *endNode = insert<false>(useAsRoot, end, writeVersion, impl);
+  auto *endNode = insert<false>(useAsRoot, end, writeVersion, allocators, impl);
   const bool insertedEnd = !endNode->entryPresent;
   endNode->entryPresent = true;
@@ -1655,7 +1745,7 @@ void addWriteRange(Node *&root, int64_t oldestVersion,
   if (insertedEnd) {
     // beginNode may have been invalidated
-    beginNode = insert<true>(useAsRoot, begin, writeVersion, impl);
+    beginNode = insert<true>(useAsRoot, begin, writeVersion, allocators, impl);
     assert(beginNode->entryPresent);
   }
@@ -1664,7 +1754,7 @@ void addWriteRange(Node *&root, int64_t oldestVersion,
     beginNode = nextLogical(beginNode);
     old->entryPresent = false;
     if (old->numChildren == 0 && old->parent != nullptr) {
-      eraseChild(old->parent, old->parentsIndex);
+      eraseChild(old->parent, old->parentsIndex, allocators);
     }
   }
 }
@@ -1758,10 +1848,12 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
       auto end = std::span<const uint8_t>(w.end.p, w.end.len);
       if (w.end.len > 0) {
         keyUpdates += 3;
-        addWriteRange(root, oldestVersion, begin, end, writeVersion, this);
+        addWriteRange(root, oldestVersion, begin, end, writeVersion,
+                      &allocators, this);
       } else {
         keyUpdates += 2;
-        addPointWrite(root, oldestVersion, begin, writeVersion, this);
+        addPointWrite(root, oldestVersion, begin, writeVersion, &allocators,
+                      this);
       }
     }
   }
@@ -1794,7 +1886,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
       assert(n->entry.rangeVersion <= oldestVersion);
       prev->entryPresent = false;
       if (prev->numChildren == 0 && prev->parent != nullptr) {
-        eraseChild(prev->parent, prev->parentsIndex);
+        eraseChild(prev->parent, prev->parentsIndex, &allocators);
       }
     }
@@ -1806,7 +1898,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
   explicit Impl(int64_t oldestVersion) : oldestVersion(oldestVersion) {
     // Insert ""
-    root = newNode<Node4>(0);
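+    // The root now starts as a Node0 and is grown on demand by
+    // getOrCreateChild like any other node.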
+    root = allocators.node0.allocate(0);
     rootMaxVersion = oldestVersion;
     root->entry.pointVersion = oldestVersion;
     root->entry.rangeVersion = oldestVersion;
@@ -1814,6 +1906,8 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
   }
   ~Impl() { destroyTree(root); }
 
+  NodeAllocators allocators;
   Arena removalKeyArena;
   std::span<const uint8_t> removalKey;
   int64_t keyUpdates = 0;
@@ -1873,6 +1967,23 @@ ConflictSet::~ConflictSet() {
   }
 }
 
+#if SHOW_MEMORY
+__attribute__((visibility("default"))) void showMemory(const ConflictSet &cs) {
+  ConflictSet::Impl *impl;
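+  // Read the Impl pointer back out of the opaque ConflictSet handle.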
+  memcpy(&impl, &cs, sizeof(impl)); // NOLINT
+  fprintf(stderr, "Max Node0 memory usage: %" PRId64 "\n",
+          impl->allocators.node0.highWaterMarkBytes());
+  fprintf(stderr, "Max Node4 memory usage: %" PRId64 "\n",
+          impl->allocators.node4.highWaterMarkBytes());
+  fprintf(stderr, "Max Node16 memory usage: %" PRId64 "\n",
+          impl->allocators.node16.highWaterMarkBytes());
+  fprintf(stderr, "Max Node48 memory usage: %" PRId64 "\n",
+          impl->allocators.node48.highWaterMarkBytes());
+  fprintf(stderr, "Max Node256 memory usage: %" PRId64 "\n",
+          impl->allocators.node256.highWaterMarkBytes());
+}
+#endif
 
 ConflictSet::ConflictSet(ConflictSet &&other) noexcept
     : impl(std::exchange(other.impl, nullptr)) {}
@@ -2148,7 +2259,8 @@ int main(void) {
   ankerl::nanobench::Bench bench;
   ConflictSet::Impl cs{0};
   for (int j = 0; j < 256; ++j) {
-    getOrCreateChild(cs.root, j) = newNode<Node0>(0);
+    getOrCreateChild(cs.root, j, &cs.allocators) =
+        cs.allocators.node0.allocate(0);
     if (j % 10 == 0) {
       bench.run("MaxExclusive " + std::to_string(j), [&]() {
         bench.doNotOptimizeAway(maxBetweenExclusive(cs.root, 0, 256));