Remove freeList, min/max capacity tracking

The freelist doesn't seem to get a good hit rate, so remove it. Policies
other than capacity = minCapacity did not improve the rate at which we
were resizing nodes, but did increase memory usage, so get rid of min/max
capacity tracking too. Add a nodes_resized_total counter.
This commit is contained in:
2024-11-20 14:43:10 -08:00
parent 3f4d3b685a
commit e125b599b5

View File

@@ -702,27 +702,13 @@ constexpr int getMaxCapacity(Node *self) {
self->partialKeyLen);
}
#ifdef __APPLE__
// Disabling the free list altogether is faster on my mac m1
constexpr int64_t kMaxFreeListBytes = 0;
#else
constexpr int64_t kMaxFreeListBytes = 1 << 20;
#endif
// Maintains a free list up to kMaxFreeListBytes. If the top element of the list
// doesn't meet the capacity constraints, it's freed and a new node is allocated
// with the minimum capacity. The hope is that "unfit" nodes don't get stuck in
// the free list.
template <class T> struct NodeAllocator {
static_assert(std::derived_from<T, Node>);
static_assert(std::is_trivial_v<T>);
T *allocate(int minCapacity, int maxCapacity) {
assert(minCapacity <= maxCapacity);
assert(freeListSize >= 0);
assert(freeListSize <= kMaxFreeListBytes);
T *result = allocate_helper(minCapacity, maxCapacity);
T *allocate(int capacity) {
T *result = allocate_helper(capacity);
result->endOfRange = false;
result->releaseDeferred = false;
if constexpr (!std::is_same_v<T, Node0>) {
@@ -742,47 +728,19 @@ template <class T> struct NodeAllocator {
}
void release(T *p) {
assume(p->partialKeyCapacity >= 0);
assume(freeListSize >= 0);
if (freeListSize + sizeof(T) + p->partialKeyCapacity > kMaxFreeListBytes) {
removeNode(p);
return safe_free(p, sizeof(T) + p->partialKeyCapacity);
}
p->parent = freeList;
freeList = p;
freeListSize += sizeof(T) + p->partialKeyCapacity;
VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(T) + p->partialKeyCapacity);
removeNode(p);
return safe_free(p, sizeof(T) + p->partialKeyCapacity);
}
void deferRelease(T *p, Node *forwardTo) {
p->releaseDeferred = true;
p->forwardTo = forwardTo;
if (freeListSize + sizeof(T) + p->partialKeyCapacity > kMaxFreeListBytes) {
p->parent = deferredListOverflow;
deferredListOverflow = p;
} else {
if (deferredList == nullptr) {
deferredListFront = p;
}
p->parent = deferredList;
deferredList = p;
freeListSize += sizeof(T) + p->partialKeyCapacity;
}
p->parent = deferredList;
deferredList = p;
}
void releaseDeferred() {
if (deferredList != nullptr) {
deferredListFront->parent = freeList;
#ifndef NVALGRIND
for (auto *iter = deferredList; iter != freeList;) {
auto *tmp = iter;
iter = (T *)iter->parent;
VALGRIND_MAKE_MEM_NOACCESS(tmp, sizeof(T) + tmp->partialKeyCapacity);
}
#endif
freeList = std::exchange(deferredList, nullptr);
}
for (T *n = std::exchange(deferredListOverflow, nullptr); n != nullptr;) {
for (T *n = std::exchange(deferredList, nullptr); n != nullptr;) {
auto *tmp = n;
n = (T *)n->parent;
release(tmp);
@@ -796,51 +754,12 @@ template <class T> struct NodeAllocator {
NodeAllocator(NodeAllocator &&) = delete;
NodeAllocator &operator=(NodeAllocator &&) = delete;
~NodeAllocator() {
assert(deferredList == nullptr);
assert(deferredListOverflow == nullptr);
for (T *iter = freeList; iter != nullptr;) {
VALGRIND_MAKE_MEM_DEFINED(iter, sizeof(T));
auto *tmp = iter;
iter = (T *)iter->parent;
removeNode(tmp);
safe_free(tmp, sizeof(T) + tmp->partialKeyCapacity);
}
}
~NodeAllocator() { assert(deferredList == nullptr); }
private:
int64_t freeListSize = 0;
T *freeList = nullptr;
T *deferredList = nullptr;
// Used to concatenate deferredList to freeList
T *deferredListFront;
T *deferredListOverflow = nullptr;
T *allocate_helper(int minCapacity, int maxCapacity) {
if (freeList != nullptr) {
VALGRIND_MAKE_MEM_DEFINED(freeList, sizeof(T));
freeListSize -= sizeof(T) + freeList->partialKeyCapacity;
assume(freeList->partialKeyCapacity >= 0);
assume(minCapacity >= 0);
assume(minCapacity <= maxCapacity);
if (freeList->partialKeyCapacity >= minCapacity &&
freeList->partialKeyCapacity <= maxCapacity) {
auto *result = freeList;
freeList = (T *)freeList->parent;
VALGRIND_MAKE_MEM_UNDEFINED(result,
sizeof(T) + result->partialKeyCapacity);
VALGRIND_MAKE_MEM_DEFINED(&result->partialKeyCapacity,
sizeof(result->partialKeyCapacity));
VALGRIND_MAKE_MEM_DEFINED(&result->type, sizeof(result->type));
return result;
} else {
auto *p = freeList;
freeList = (T *)p->parent;
removeNode(p);
safe_free(p, sizeof(T) + p->partialKeyCapacity);
}
}
int capacity = maxCapacity;
T *allocate_helper(int capacity) {
auto *result = (T *)safe_malloc(sizeof(T) + capacity);
result->type = T::kType;
result->partialKeyCapacity = capacity;
@@ -899,6 +818,7 @@ struct WriteContext {
int64_t point_writes;
int64_t range_writes;
int64_t write_bytes;
int64_t nodes_resized;
} accum;
#if USE_64_BIT
@@ -911,19 +831,19 @@ struct WriteContext {
WriteContext() { memset(&accum, 0, sizeof(accum)); }
template <class T> T *allocate(int minCapacity, int maxCapacity) {
template <class T> T *allocate(int capacity) {
static_assert(!std::is_same_v<T, Node>);
++accum.nodes_allocated;
if constexpr (std::is_same_v<T, Node0>) {
return node0.allocate(minCapacity, maxCapacity);
return node0.allocate(capacity);
} else if constexpr (std::is_same_v<T, Node3>) {
return node3.allocate(minCapacity, maxCapacity);
return node3.allocate(capacity);
} else if constexpr (std::is_same_v<T, Node16>) {
return node16.allocate(minCapacity, maxCapacity);
return node16.allocate(capacity);
} else if constexpr (std::is_same_v<T, Node48>) {
return node48.allocate(minCapacity, maxCapacity);
return node48.allocate(capacity);
} else if constexpr (std::is_same_v<T, Node256>) {
return node256.allocate(minCapacity, maxCapacity);
return node256.allocate(capacity);
}
}
template <class T> void release(T *c) {
@@ -1479,8 +1399,7 @@ void consumePartialKeyFull(TaggedNodePointer &self, TrivialSpan &key,
InternalVersionT oldMaxVersion = exchangeMaxVersion(old, writeVersion);
// *self will have one child (old)
auto *newSelf = writeContext->allocate<Node3>(
partialKeyIndex, getMaxCapacity(1, 0, partialKeyIndex));
auto *newSelf = writeContext->allocate<Node3>(partialKeyIndex);
newSelf->parent = old->parent;
newSelf->parentsIndex = old->parentsIndex;
@@ -1584,8 +1503,7 @@ TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self, TrivialSpan &key,
__builtin_unreachable(); // GCOVR_EXCL_LINE
}
auto *newChild = writeContext->allocate<Node0>(
key.size(), getMaxCapacity(0, 1, key.size()));
auto *newChild = writeContext->allocate<Node0>(key.size());
newChild->numChildren = 0;
newChild->entryPresent = false; // Will be set to true by the caller
newChild->partialKeyLen = key.size();
@@ -1597,8 +1515,7 @@ TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self, TrivialSpan &key,
case Type_Node0: {
auto *self0 = static_cast<Node0 *>(self);
auto *newSelf = writeContext->allocate<Node3>(
self->partialKeyLen, getMaxCapacity(1, 1, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node3>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self0);
writeContext->deferRelease(self0, newSelf);
self = newSelf;
@@ -1608,9 +1525,7 @@ TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self, TrivialSpan &key,
case Type_Node3: {
if (self->numChildren == Node3::kMaxNodes) {
auto *self3 = static_cast<Node3 *>(self);
auto *newSelf = writeContext->allocate<Node16>(
self->partialKeyLen,
getMaxCapacity(4, self->entryPresent, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node16>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self3);
writeContext->deferRelease(self3, newSelf);
self = newSelf;
@@ -1640,9 +1555,7 @@ TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self, TrivialSpan &key,
case Type_Node16: {
if (self->numChildren == Node16::kMaxNodes) {
auto *self16 = static_cast<Node16 *>(self);
auto *newSelf = writeContext->allocate<Node48>(
self->partialKeyLen,
getMaxCapacity(17, self->entryPresent, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node48>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self16);
writeContext->deferRelease(self16, newSelf);
self = newSelf;
@@ -1674,9 +1587,7 @@ TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self, TrivialSpan &key,
if (self->numChildren == 48) {
auto *self48 = static_cast<Node48 *>(self);
auto *newSelf = writeContext->allocate<Node256>(
self->partialKeyLen,
getMaxCapacity(49, self->entryPresent, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node256>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self48);
writeContext->deferRelease(self48, newSelf);
self = newSelf;
@@ -1760,13 +1671,13 @@ downLeftSpine:
return node;
}
void freeAndMakeCapacityBetween(Node *&self, int minCapacity, int maxCapacity,
WriteContext *writeContext,
ConflictSet::Impl *impl) {
void freeAndMakeCapacity(Node *&self, int capacity, WriteContext *writeContext,
ConflictSet::Impl *impl) {
++writeContext->accum.nodes_resized;
switch (self->getType()) {
case Type_Node0: {
auto *self0 = (Node0 *)self;
auto *newSelf = writeContext->allocate<Node0>(minCapacity, maxCapacity);
auto *newSelf = writeContext->allocate<Node0>(capacity);
newSelf->copyChildrenAndKeyFrom(*self0);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self0, newSelf);
@@ -1774,7 +1685,7 @@ void freeAndMakeCapacityBetween(Node *&self, int minCapacity, int maxCapacity,
} break;
case Type_Node3: {
auto *self3 = (Node3 *)self;
auto *newSelf = writeContext->allocate<Node3>(minCapacity, maxCapacity);
auto *newSelf = writeContext->allocate<Node3>(capacity);
newSelf->copyChildrenAndKeyFrom(*self3);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self3, newSelf);
@@ -1782,7 +1693,7 @@ void freeAndMakeCapacityBetween(Node *&self, int minCapacity, int maxCapacity,
} break;
case Type_Node16: {
auto *self16 = (Node16 *)self;
auto *newSelf = writeContext->allocate<Node16>(minCapacity, maxCapacity);
auto *newSelf = writeContext->allocate<Node16>(capacity);
newSelf->copyChildrenAndKeyFrom(*self16);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self16, newSelf);
@@ -1790,7 +1701,7 @@ void freeAndMakeCapacityBetween(Node *&self, int minCapacity, int maxCapacity,
} break;
case Type_Node48: {
auto *self48 = (Node48 *)self;
auto *newSelf = writeContext->allocate<Node48>(minCapacity, maxCapacity);
auto *newSelf = writeContext->allocate<Node48>(capacity);
newSelf->copyChildrenAndKeyFrom(*self48);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self48, newSelf);
@@ -1798,7 +1709,7 @@ void freeAndMakeCapacityBetween(Node *&self, int minCapacity, int maxCapacity,
} break;
case Type_Node256: {
auto *self256 = (Node256 *)self;
auto *newSelf = writeContext->allocate<Node256>(minCapacity, maxCapacity);
auto *newSelf = writeContext->allocate<Node256>(capacity);
newSelf->copyChildrenAndKeyFrom(*self256);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self256, newSelf);
@@ -1823,8 +1734,7 @@ void maybeDecreaseCapacity(Node *&self, WriteContext *writeContext,
if (self->getCapacity() <= maxCapacity) {
return;
}
freeAndMakeCapacityBetween(self, self->partialKeyLen, maxCapacity,
writeContext, impl);
freeAndMakeCapacity(self, self->partialKeyLen, writeContext, impl);
}
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
@@ -1898,12 +1808,9 @@ void mergeWithChild(TaggedNodePointer &self, WriteContext *writeContext,
assert(!self3->entryPresent);
Node *child = self3->children[0];
const int minCapacity = self3->partialKeyLen + 1 + child->partialKeyLen;
const int maxCapacity =
getMaxCapacity(child->numChildren, child->entryPresent, minCapacity);
if (minCapacity > child->getCapacity()) {
freeAndMakeCapacityBetween(child, minCapacity, maxCapacity, writeContext,
impl);
freeAndMakeCapacity(child, minCapacity, writeContext, impl);
}
// Merge partial key with child
@@ -1942,8 +1849,7 @@ bool needsDownsize(Node *n) {
void downsize(Node3 *self, WriteContext *writeContext,
ConflictSet::Impl *impl) {
if (self->numChildren == 0) {
auto *newSelf = writeContext->allocate<Node0>(
self->partialKeyLen, getMaxCapacity(0, 1, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node0>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self, newSelf);
@@ -1956,9 +1862,7 @@ void downsize(Node3 *self, WriteContext *writeContext,
void downsize(Node16 *self, WriteContext *writeContext,
ConflictSet::Impl *impl) {
assert(self->numChildren + int(self->entryPresent) < kMinChildrenNode16);
auto *newSelf = writeContext->allocate<Node3>(
self->partialKeyLen,
getMaxCapacity(kMinChildrenNode16 - 1, 0, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node3>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self, newSelf);
@@ -1967,9 +1871,7 @@ void downsize(Node16 *self, WriteContext *writeContext,
void downsize(Node48 *self, WriteContext *writeContext,
ConflictSet::Impl *impl) {
assert(self->numChildren + int(self->entryPresent) < kMinChildrenNode48);
auto *newSelf = writeContext->allocate<Node16>(
self->partialKeyLen,
getMaxCapacity(kMinChildrenNode48 - 1, 0, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node16>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self, newSelf);
@@ -1979,9 +1881,7 @@ void downsize(Node256 *self, WriteContext *writeContext,
ConflictSet::Impl *impl) {
assert(self->numChildren + int(self->entryPresent) < kMinChildrenNode256);
auto *self256 = (Node256 *)self;
auto *newSelf = writeContext->allocate<Node48>(
self->partialKeyLen,
getMaxCapacity(kMinChildrenNode256 - 1, 0, self->partialKeyLen));
auto *newSelf = writeContext->allocate<Node48>(self->partialKeyLen);
newSelf->copyChildrenAndKeyFrom(*self256);
getInTree(self, impl) = newSelf;
writeContext->deferRelease(self256, newSelf);
@@ -5268,6 +5168,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
range_writes_total.add(writeContext.accum.range_writes);
nodes_allocated_total.add(writeContext.accum.nodes_allocated);
nodes_released_total.add(writeContext.accum.nodes_released);
nodes_resized_total.add(writeContext.accum.nodes_resized);
entries_inserted_total.add(writeContext.accum.entries_inserted);
entries_erased_total.add(writeContext.accum.entries_erased);
insert_iterations_total.add(writeContext.accum.insert_iterations);
@@ -5396,6 +5297,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
std::exchange(writeContext.accum.nodes_allocated, 0));
nodes_released_total.add(
std::exchange(writeContext.accum.nodes_released, 0));
nodes_resized_total.add(std::exchange(writeContext.accum.nodes_resized, 0));
entries_inserted_total.add(
std::exchange(writeContext.accum.entries_inserted, 0));
entries_erased_total.add(
@@ -5421,7 +5323,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
keyUpdates = 10;
// Insert ""
root = writeContext.allocate<Node0>(0, 0);
root = writeContext.allocate<Node0>(0);
root->numChildren = 0;
root->parent = nullptr;
root->entryPresent = false;
@@ -5514,6 +5416,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
"The total number of physical tree nodes allocated");
COUNTER(nodes_released_total,
"The total number of physical tree nodes released");
COUNTER(nodes_resized_total,
"The total number of physical tree nodes that have been resized to "
"account for partial key capacity changes");
COUNTER(insert_iterations_total,
"The total number of iterations of the main loop for insertion. "
"Includes searches where the entry already existed, and so insertion "