10 Commits

Author SHA1 Message Date
d1dc1247e1 Switch to new machine for benchmarks
Some checks failed
Tests / Clang total: 1479, passed: 1479
Clang |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Debug total: 1477, passed: 1477
weaselab/conflict-set/pipeline/head There was a failure building this commit
2024-08-02 18:18:36 -07:00
f1ad68109a Skip lcp call in SearchStepWise if no partial key 2024-08-02 18:16:55 -07:00
c4443bc355 Remove len < 8 check in longestCommonPrefix
Micro benchmarks look a tiny bit better /shrug
2024-08-02 18:16:55 -07:00
857b402fe2 Add an avx512bw longestCommonPrefix 2024-08-02 18:16:55 -07:00
9b3e1b219b Remove small memmoves in erase 2024-08-02 18:16:55 -07:00
ab52c63935 Move local accum's into ReadContext 2024-08-02 18:16:55 -07:00
bad9d7ced8 Avoid some tls lookups for InternalVersionT::zero 2024-08-02 18:16:55 -07:00
c8d9dc034d Remove memmove call for inserting into Node16 2024-08-02 18:16:55 -07:00
72168ef6a3 Avoid memmove when inserting into Node3 2024-08-02 18:16:55 -07:00
620a0afd2a Add debug symbols and frame pointer
So that perf works out of the box. Leave it in the release artifacts/do
whatever cpack does by default for now.
2024-08-02 18:16:55 -07:00
3 changed files with 136 additions and 213 deletions

View File

@@ -31,8 +31,14 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
"MinSizeRel" "RelWithDebInfo") "MinSizeRel" "RelWithDebInfo")
endif() endif()
add_compile_options(-fdata-sections -ffunction-sections -Wswitch-enum add_compile_options(
-Werror=switch-enum -fPIC) -fdata-sections
-ffunction-sections
-Wswitch-enum
-Werror=switch-enum
-fPIC
-g
-fno-omit-frame-pointer)
set(full_relro_flags "-pie;LINKER:-z,relro,-z,now,-z,noexecstack") set(full_relro_flags "-pie;LINKER:-z,relro,-z,now,-z,noexecstack")
cmake_push_check_state() cmake_push_check_state()

View File

@@ -395,8 +395,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node16 &other) {
assert(numChildren == Node16::kMaxNodes); assert(numChildren == Node16::kMaxNodes);
memset(index, -1, sizeof(index)); memset(index, -1, sizeof(index));
memset(children, 0, sizeof(children)); memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) { for (auto &v : childMaxVersion) {
v = InternalVersionT::zero; v = z;
} }
memcpy(partialKey(), &other + 1, partialKeyLen); memcpy(partialKey(), &other + 1, partialKeyLen);
bitSet.init(); bitSet.init();
@@ -423,8 +424,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node48 &other) {
nextFree = other.nextFree; nextFree = other.nextFree;
memcpy(index, other.index, sizeof(index)); memcpy(index, other.index, sizeof(index));
memset(children, 0, sizeof(children)); memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) { for (auto &v : childMaxVersion) {
v = InternalVersionT::zero; v = z;
} }
for (int i = 0; i < numChildren; ++i) { for (int i = 0; i < numChildren; ++i) {
children[i] = other.children[i]; children[i] = other.children[i];
@@ -442,8 +444,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
kNodeCopySize); kNodeCopySize);
memset(index, -1, sizeof(index)); memset(index, -1, sizeof(index));
memset(children, 0, sizeof(children)); memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) { for (auto &v : childMaxVersion) {
v = InternalVersionT::zero; v = z;
} }
nextFree = other.numChildren; nextFree = other.numChildren;
bitSet = other.bitSet; bitSet = other.bitSet;
@@ -470,11 +473,12 @@ inline void Node256::copyChildrenAndKeyFrom(const Node48 &other) {
kNodeCopySize); kNodeCopySize);
bitSet = other.bitSet; bitSet = other.bitSet;
memset(children, 0, sizeof(children)); memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) { for (auto &v : childMaxVersion) {
v = InternalVersionT::zero; v = z;
} }
for (auto &v : maxOfMax) { for (auto &v : maxOfMax) {
v = InternalVersionT::zero; v = z;
} }
bitSet.forEachSet([&](int c) { bitSet.forEachSet([&](int c) {
children[c] = other.children[other.index[c]]; children[c] = other.children[other.index[c]];
@@ -491,8 +495,9 @@ inline void Node256::copyChildrenAndKeyFrom(const Node256 &other) {
memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin, memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
kNodeCopySize); kNodeCopySize);
memset(children, 0, sizeof(children)); memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) { for (auto &v : childMaxVersion) {
v = InternalVersionT::zero; v = z;
} }
bitSet = other.bitSet; bitSet = other.bitSet;
bitSet.forEachSet([&](int c) { bitSet.forEachSet([&](int c) {
@@ -619,13 +624,15 @@ template <class T> struct BoundedFreeListAllocator {
T *result = allocate_helper(partialKeyCapacity); T *result = allocate_helper(partialKeyCapacity);
if constexpr (!std::is_same_v<T, Node0>) { if constexpr (!std::is_same_v<T, Node0>) {
memset(result->children, 0, sizeof(result->children)); memset(result->children, 0, sizeof(result->children));
const auto z = InternalVersionT::zero;
for (auto &v : result->childMaxVersion) { for (auto &v : result->childMaxVersion) {
v = InternalVersionT::zero; v = z;
} }
} }
if constexpr (std::is_same_v<T, Node48> || std::is_same_v<T, Node256>) { if constexpr (std::is_same_v<T, Node48> || std::is_same_v<T, Node256>) {
const auto z = InternalVersionT::zero;
for (auto &v : result->maxOfMax) { for (auto &v : result->maxOfMax) {
v = InternalVersionT::zero; v = z;
} }
} }
return result; return result;
@@ -712,6 +719,9 @@ struct ReadContext {
int64_t prefix_read_iterations_accum = 0; int64_t prefix_read_iterations_accum = 0;
int64_t range_read_iterations_accum = 0; int64_t range_read_iterations_accum = 0;
int64_t range_read_node_scan_accum = 0; int64_t range_read_node_scan_accum = 0;
int64_t commits_accum = 0;
int64_t conflicts_accum = 0;
int64_t too_olds_accum = 0;
ConflictSet::Impl *impl; ConflictSet::Impl *impl;
}; };
@@ -730,6 +740,10 @@ struct WriteContext {
int64_t write_bytes; int64_t write_bytes;
} accum; } accum;
// Cache a copy of InternalVersionT::zero, so we don't need to do the TLS
// lookup as often.
InternalVersionT zero;
WriteContext() { memset(&accum, 0, sizeof(accum)); } WriteContext() { memset(&accum, 0, sizeof(accum)); }
template <class T> T *allocate(int c) { template <class T> T *allocate(int c) {
@@ -1122,24 +1136,19 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, WriteContext *tls) {
insert3: insert3:
auto *self3 = static_cast<Node3 *>(self); auto *self3 = static_cast<Node3 *>(self);
++self->numChildren; int i = self->numChildren - 1;
int i = 0; for (; i >= 0; --i) {
for (; i < self->numChildren - 1; ++i) { if (int(self3->index[i]) < int(index)) {
if (int(self3->index[i]) > int(index)) {
memmove(self3->index + i + 1, self3->index + i,
self->numChildren - (i + 1));
memmove(self3->children + i + 1, self3->children + i,
(self->numChildren - (i + 1)) *
sizeof(self3->children[0])); // NOLINT
memmove(self3->childMaxVersion + i + 1, self3->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self3->childMaxVersion[0]));
break; break;
} }
self3->index[i + 1] = self3->index[i];
self3->children[i + 1] = self3->children[i];
self3->childMaxVersion[i + 1] = self3->childMaxVersion[i];
} }
self3->index[i] = index; self3->index[i + 1] = index;
auto &result = self3->children[i]; auto &result = self3->children[i + 1];
result = nullptr; result = nullptr;
++self->numChildren;
return result; return result;
} }
case Type_Node16: { case Type_Node16: {
@@ -1153,71 +1162,21 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, WriteContext *tls) {
} }
insert16: insert16:
auto *self16 = static_cast<Node16 *>(self);
assert(self->getType() == Type_Node16); assert(self->getType() == Type_Node16);
auto *self16 = static_cast<Node16 *>(self);
++self->numChildren; int i = self->numChildren - 1;
#ifdef HAS_AVX for (; i >= 0; --i) {
__m128i key_vec = _mm_set1_epi8(index); if (int(self16->index[i]) < int(index)) {
__m128i indices;
memcpy(&indices, self16->index, sizeof(self16->index));
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
int mask = (1 << (self->numChildren - 1)) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
bitfield |= uint32_t(1) << (self->numChildren - 1);
int i = std::countr_zero(bitfield);
if (i < self->numChildren - 1) {
memmove(self16->index + i + 1, self16->index + i,
self->numChildren - (i + 1));
memmove(self16->children + i + 1, self16->children + i,
(self->numChildren - (i + 1)) *
sizeof(self16->children[0])); // NOLINT
memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self16->childMaxVersion[0]));
}
#elif defined(HAS_ARM_NEON)
uint8x16_t indices;
memcpy(&indices, self16->index, sizeof(self16->index));
// 0xff for each leq
auto results = vcleq_u8(vdupq_n_u8(index), indices);
uint64_t mask = (uint64_t(1) << ((self->numChildren - 1) * 4)) - 1;
// 0xf for each 0xff (within mask)
uint64_t bitfield =
vget_lane_u64(
vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)),
0) &
mask;
bitfield |= uint64_t(0xf) << ((self->numChildren - 1) * 4);
int i = std::countr_zero(bitfield) / 4;
if (i < self->numChildren - 1) {
memmove(self16->index + i + 1, self16->index + i,
self->numChildren - (i + 1));
memmove(self16->children + i + 1, self16->children + i,
(self->numChildren - (i + 1)) *
sizeof(self16->children[0])); // NOLINT
memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self16->childMaxVersion[0]));
}
#else
int i = 0;
for (; i < int(self->numChildren) - 1; ++i) {
if (int(self16->index[i]) > int(index)) {
memmove(self16->index + i + 1, self16->index + i,
self->numChildren - (i + 1));
memmove(self16->children + i + 1, self16->children + i,
(self->numChildren - (i + 1)) * sizeof(self16->children[0]));
memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self16->childMaxVersion[0]));
break; break;
} }
self16->index[i + 1] = self16->index[i];
self16->children[i + 1] = self16->children[i];
self16->childMaxVersion[i + 1] = self16->childMaxVersion[i];
} }
#endif self16->index[i + 1] = index;
self16->index[i] = index; auto &result = self16->children[i + 1];
auto &result = self16->children[i];
result = nullptr; result = nullptr;
++self->numChildren;
return result; return result;
} }
case Type_Node48: { case Type_Node48: {
@@ -1472,7 +1431,7 @@ void maybeDownsize(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
// that we have a new parent. // that we have a new parent.
setMaxVersion(child, impl, childMaxVersion); setMaxVersion(child, impl, childMaxVersion);
if (child->parent) { if (child->parent) {
rezero(child->parent, InternalVersionT::zero); rezero(child->parent, tls->zero);
} }
getInTree(self, impl) = child; getInTree(self, impl) = child;
@@ -1552,36 +1511,24 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
auto *parent3 = static_cast<Node3 *>(parent); auto *parent3 = static_cast<Node3 *>(parent);
int nodeIndex = getNodeIndex(parent3, parentsIndex); int nodeIndex = getNodeIndex(parent3, parentsIndex);
assert(nodeIndex >= 0); assert(nodeIndex >= 0);
memmove(parent3->index + nodeIndex, parent3->index + nodeIndex + 1,
sizeof(parent3->index[0]) *
(parent->numChildren - (nodeIndex + 1)));
memmove(parent3->children + nodeIndex, parent3->children + nodeIndex + 1,
sizeof(parent3->children[0]) * // NOLINT
(parent->numChildren - (nodeIndex + 1)));
memmove(parent3->childMaxVersion + nodeIndex,
parent3->childMaxVersion + nodeIndex + 1,
sizeof(parent3->childMaxVersion[0]) *
(parent->numChildren - (nodeIndex + 1)));
--parent->numChildren; --parent->numChildren;
for (int i = nodeIndex; i < parent->numChildren; ++i) {
parent3->index[i] = parent3->index[i + 1];
parent3->children[i] = parent3->children[i + 1];
parent3->childMaxVersion[i] = parent3->childMaxVersion[i + 1];
}
assert(parent->numChildren > 0 || parent->entryPresent); assert(parent->numChildren > 0 || parent->entryPresent);
} break; } break;
case Type_Node16: { case Type_Node16: {
auto *parent16 = static_cast<Node16 *>(parent); auto *parent16 = static_cast<Node16 *>(parent);
int nodeIndex = getNodeIndex(parent16, parentsIndex); int nodeIndex = getNodeIndex(parent16, parentsIndex);
assert(nodeIndex >= 0); assert(nodeIndex >= 0);
memmove(parent16->index + nodeIndex, parent16->index + nodeIndex + 1,
sizeof(parent16->index[0]) *
(parent->numChildren - (nodeIndex + 1)));
memmove(parent16->children + nodeIndex, parent16->children + nodeIndex + 1,
sizeof(parent16->children[0]) * // NOLINT
(parent->numChildren - (nodeIndex + 1)));
memmove(parent16->childMaxVersion + nodeIndex,
parent16->childMaxVersion + nodeIndex + 1,
sizeof(parent16->childMaxVersion[0]) *
(parent->numChildren - (nodeIndex + 1)));
--parent->numChildren; --parent->numChildren;
for (int i = nodeIndex; i < parent->numChildren; ++i) {
parent16->index[i] = parent16->index[i + 1];
parent16->children[i] = parent16->children[i + 1];
parent16->childMaxVersion[i] = parent16->childMaxVersion[i + 1];
}
// By kMinChildrenNode16 // By kMinChildrenNode16
assert(parent->numChildren > 0); assert(parent->numChildren > 0);
@@ -1608,7 +1555,7 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
parent48->index[parentIndex] = toRemoveChildrenIndex; parent48->index[parentIndex] = toRemoveChildrenIndex;
parent48->reverseIndex[toRemoveChildrenIndex] = parentIndex; parent48->reverseIndex[toRemoveChildrenIndex] = parentIndex;
} }
parent48->childMaxVersion[lastChildrenIndex] = InternalVersionT::zero; parent48->childMaxVersion[lastChildrenIndex] = tls->zero;
--parent->numChildren; --parent->numChildren;
@@ -1726,15 +1673,38 @@ int firstNeqStride(const uint8_t *ap, const uint8_t *bp) {
#endif #endif
} }
#if defined(__x86_64__) && !defined(__SANITIZE_THREAD__)
__attribute__((target("avx512bw"))) int
longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
int i = 0;
int end = cl & ~63;
while (i < end) {
const uint64_t eq =
_mm512_cmpeq_epi8_mask(_mm512_loadu_epi8(ap), _mm512_loadu_epi8(bp));
if (eq != uint64_t(-1)) {
return i + std::countr_one(eq);
}
i += 64;
ap += 64;
bp += 64;
}
if (i < cl) {
const uint64_t mask = (uint64_t(1) << (cl - i)) - 1;
const uint64_t eq = _mm512_cmpeq_epi8_mask(
_mm512_maskz_loadu_epi8(mask, ap), _mm512_maskz_loadu_epi8(mask, bp));
return i + std::countr_one(eq & mask);
}
assert(i == cl);
return i;
}
__attribute__((target("default")))
#endif
int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) { int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
assume(cl >= 0); assume(cl >= 0);
int i = 0; int i = 0;
int end; int end;
if (cl < 8) {
goto bytes;
}
// kStride * kUnrollCount at a time // kStride * kUnrollCount at a time
end = cl & ~(kStride * kUnrollFactor - 1); end = cl & ~(kStride * kUnrollFactor - 1);
while (i < end) { while (i < end) {
@@ -1775,7 +1745,6 @@ int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
bp += 8; bp += 8;
} }
bytes:
// byte at a time // byte at a time
while (i < cl) { while (i < cl) {
if (*ap != *bp) { if (*ap != *bp) {
@@ -1808,10 +1777,13 @@ struct SearchStepWise {
if (child == nullptr) { if (child == nullptr) {
return true; return true;
} }
int cl = std::min<int>(child->partialKeyLen, remaining.size() - 1); if (child->partialKeyLen > 0) {
int i = longestCommonPrefix(child->partialKey(), remaining.data() + 1, cl); int cl = std::min<int>(child->partialKeyLen, remaining.size() - 1);
if (i != child->partialKeyLen) { int i =
return true; longestCommonPrefix(child->partialKey(), remaining.data() + 1, cl);
if (i != child->partialKeyLen) {
return true;
}
} }
n = child; n = child;
remaining = remaining =
@@ -2949,8 +2921,7 @@ template <bool kBegin>
child->partialKeyLen = 0; child->partialKeyLen = 0;
child->parent = *self; child->parent = *self;
child->parentsIndex = key.front(); child->parentsIndex = key.front();
setMaxVersion(child, impl, setMaxVersion(child, impl, kBegin ? writeVersion : tls->zero);
kBegin ? writeVersion : InternalVersionT::zero);
} }
self = &child; self = &child;
@@ -2998,8 +2969,7 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
n->entry.pointVersion = writeVersion; n->entry.pointVersion = writeVersion;
setMaxVersion(n, impl, writeVersion); setMaxVersion(n, impl, writeVersion);
n->entry.rangeVersion = n->entry.rangeVersion =
p == nullptr ? InternalVersionT::zero p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
: std::max(p->entry.rangeVersion, InternalVersionT::zero);
} else { } else {
assert(writeVersion >= n->entry.pointVersion); assert(writeVersion >= n->entry.pointVersion);
n->entry.pointVersion = writeVersion; n->entry.pointVersion = writeVersion;
@@ -3063,8 +3033,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
++tls->accum.entries_inserted; ++tls->accum.entries_inserted;
auto *p = nextLogical(beginNode); auto *p = nextLogical(beginNode);
beginNode->entry.rangeVersion = beginNode->entry.rangeVersion =
p == nullptr ? InternalVersionT::zero p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
: std::max(p->entry.rangeVersion, InternalVersionT::zero);
beginNode->entry.pointVersion = writeVersion; beginNode->entry.pointVersion = writeVersion;
assert(maxVersion(beginNode, impl) <= writeVersion); assert(maxVersion(beginNode, impl) <= writeVersion);
setMaxVersion(beginNode, impl, writeVersion); setMaxVersion(beginNode, impl, writeVersion);
@@ -3084,8 +3053,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
++tls->accum.entries_inserted; ++tls->accum.entries_inserted;
auto *p = nextLogical(endNode); auto *p = nextLogical(endNode);
endNode->entry.pointVersion = endNode->entry.pointVersion =
p == nullptr ? InternalVersionT::zero p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
: std::max(p->entry.rangeVersion, InternalVersionT::zero);
auto m = maxVersion(endNode, impl); auto m = maxVersion(endNode, impl);
setMaxVersion(endNode, impl, setMaxVersion(endNode, impl,
std::max<InternalVersionT>(m, endNode->entry.pointVersion)); std::max<InternalVersionT>(m, endNode->entry.pointVersion));
@@ -3162,9 +3130,6 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
void check(const ReadRange *reads, Result *result, int count) { void check(const ReadRange *reads, Result *result, int count) {
ReadContext tls; ReadContext tls;
tls.impl = this; tls.impl = this;
int commits_accum = 0;
int conflicts_accum = 0;
int too_olds_accum = 0;
int64_t check_byte_accum = 0; int64_t check_byte_accum = 0;
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const auto &r = reads[i]; const auto &r = reads[i];
@@ -3182,9 +3147,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
InternalVersionT(reads[i].readVersion), &tls)) InternalVersionT(reads[i].readVersion), &tls))
? Commit ? Commit
: Conflict; : Conflict;
commits_accum += result[i] == Commit; tls.commits_accum += result[i] == Commit;
conflicts_accum += result[i] == Conflict; tls.conflicts_accum += result[i] == Conflict;
too_olds_accum += result[i] == TooOld; tls.too_olds_accum += result[i] == TooOld;
} }
point_read_total.add(tls.point_read_accum); point_read_total.add(tls.point_read_accum);
prefix_read_total.add(tls.prefix_read_accum); prefix_read_total.add(tls.prefix_read_accum);
@@ -3196,9 +3161,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
point_read_iterations_total.add(tls.point_read_iterations_accum); point_read_iterations_total.add(tls.point_read_iterations_accum);
prefix_read_iterations_total.add(tls.prefix_read_iterations_accum); prefix_read_iterations_total.add(tls.prefix_read_iterations_accum);
range_read_iterations_total.add(tls.range_read_iterations_accum); range_read_iterations_total.add(tls.range_read_iterations_accum);
commits_total.add(commits_accum); commits_total.add(tls.commits_accum);
conflicts_total.add(conflicts_accum); conflicts_total.add(tls.conflicts_accum);
too_olds_total.add(too_olds_accum); too_olds_total.add(tls.too_olds_accum);
check_bytes_total.add(check_byte_accum); check_bytes_total.add(check_byte_accum);
} }
@@ -3206,7 +3171,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
// There could be other conflict sets in the same thread. We need // There could be other conflict sets in the same thread. We need
// InternalVersionT::zero to be correct for this conflict set for the // InternalVersionT::zero to be correct for this conflict set for the
// lifetime of the current call frame. // lifetime of the current call frame.
InternalVersionT::zero = oldestVersion; InternalVersionT::zero = tls.zero = oldestVersion;
assert(writeVersion >= newestVersionFullPrecision); assert(writeVersion >= newestVersionFullPrecision);
@@ -3317,7 +3282,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
InternalVersionT oldestVersion{o}; InternalVersionT oldestVersion{o};
this->oldestVersionFullPrecision = o; this->oldestVersionFullPrecision = o;
this->oldestVersion = oldestVersion; this->oldestVersion = oldestVersion;
InternalVersionT::zero = oldestVersion; InternalVersionT::zero = tls.zero = oldestVersion;
#ifdef NDEBUG #ifdef NDEBUG
// This is here for performance reasons, since we want to amortize the cost // This is here for performance reasons, since we want to amortize the cost
// of storing the search path as a string. In tests, we want to exercise the // of storing the search path as a string. In tests, we want to exercise the
@@ -3367,7 +3332,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
root->entry.pointVersion = this->oldestVersion; root->entry.pointVersion = this->oldestVersion;
root->entry.rangeVersion = this->oldestVersion; root->entry.rangeVersion = this->oldestVersion;
InternalVersionT::zero = this->oldestVersion; InternalVersionT::zero = tls.zero = this->oldestVersion;
// Intentionally not resetting totalBytes // Intentionally not resetting totalBytes
} }

104
README.md
View File

@@ -1,86 +1,38 @@
A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys. A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys.
Intended to replace FoundationDB's skip list. Intended as an alternative to FoundationDB's skip list.
Hardware for all benchmarks is a mac m1 2020. Hardware for all benchmarks is an AMD Ryzen 9 7900 with (2x32GB) 5600MT/s CL28-34-34-89 1.35V RAM
# FoundationDB's benchmark # Microbenchmark
## Skip list ## Skip list
``` | ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark
New conflict set: 1.957 sec |--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
0.639 Mtransactions/sec | 172.03 | 5,812,791.77 | 0.4% | 3,130.62 | 879.00 | 3.562 | 509.23 | 0.0% | 0.01 | `point reads`
2.555 Mkeys/sec | 167.44 | 5,972,130.71 | 0.2% | 3,065.14 | 862.27 | 3.555 | 494.30 | 0.0% | 0.01 | `prefix reads`
Detect only: 1.845 sec | 238.77 | 4,188,130.84 | 0.9% | 3,589.93 | 1,259.30 | 2.851 | 637.12 | 0.0% | 0.01 | `range reads`
0.678 Mtransactions/sec | 424.01 | 2,358,426.70 | 0.2% | 5,620.05 | 2,242.35 | 2.506 | 854.80 | 1.7% | 0.01 | `point writes`
2.710 Mkeys/sec | 418.45 | 2,389,780.56 | 0.4% | 5,525.07 | 2,211.05 | 2.499 | 831.71 | 1.7% | 0.01 | `prefix writes`
Skiplist only: 1.263 sec | 254.87 | 3,923,568.88 | 2.6% | 3,187.01 | 1,366.50 | 2.332 | 529.11 | 2.7% | 0.02 | `range writes`
0.990 Mtransactions/sec | 675.96 | 1,479,374.50 | 3.3% | 7,735.41 | 3,468.60 | 2.230 | 1,386.02 | 1.8% | 0.01 | `monotonic increasing point writes`
3.960 Mkeys/sec | 137,986.20 | 7,247.10 | 0.6% | 789,752.33 | 699,462.00 | 1.129 | 144,824.14 | 0.0% | 0.01 | `worst case for radix tree`
Performance counters: | 21.63 | 46,231,564.03 | 1.0% | 448.00 | 107.14 | 4.181 | 84.00 | 0.0% | 0.01 | `create and destroy`
Build: 0.0546
Add: 0.0563
Detect: 1.84
D.Sort: 0.412
D.Combine: 0.0141
D.CheckRead: 0.671
D.CheckIntraBatch: 0.0068
D.MergeWrite: 0.592
D.RemoveBefore: 0.146
```
## Radix tree (this implementation) ## Radix tree (this implementation)
``` | ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark
New conflict set: 1.366 sec |--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
0.915 Mtransactions/sec | 17.03 | 58,732,967.93 | 0.6% | 276.28 | 87.96 | 3.141 | 52.15 | 0.4% | 0.01 | `point reads`
3.660 Mkeys/sec | 19.52 | 51,239,158.04 | 0.3% | 367.16 | 101.50 | 3.617 | 61.92 | 0.3% | 0.01 | `prefix reads`
Detect only: 1.248 sec | 47.74 | 20,947,676.63 | 0.5% | 998.16 | 247.43 | 4.034 | 161.64 | 0.2% | 0.01 | `range reads`
1.002 Mtransactions/sec | 23.14 | 43,207,824.89 | 0.4% | 408.18 | 121.64 | 3.356 | 70.20 | 0.3% | 0.01 | `point writes`
4.007 Mkeys/sec | 38.02 | 26,302,115.66 | 0.1% | 709.72 | 199.70 | 3.554 | 134.26 | 0.3% | 0.01 | `prefix writes`
Skiplist only: 0.573 sec | 44.28 | 22,583,559.17 | 0.9% | 825.19 | 233.10 | 3.540 | 141.48 | 0.2% | 0.01 | `range writes`
2.182 Mtransactions/sec | 85.50 | 11,695,990.63 | 0.5% | 1,488.16 | 455.68 | 3.266 | 289.22 | 0.1% | 0.01 | `monotonic increasing point writes`
8.730 Mkeys/sec | 338,388.50 | 2,955.18 | 3.3% | 4,097,087.00 | 1,809,996.00 | 2.264 | 759,645.00 | 0.0% | 0.01 | `worst case for radix tree`
Performance counters: | 84.84 | 11,787,313.59 | 1.4% | 1,716.02 | 440.50 | 3.896 | 271.00 | 0.0% | 0.01 | `create and destroy`
Build: 0.0594
Add: 0.0572
Detect: 1.25
D.Sort: 0.418
D.Combine: 0.0149
D.CheckRead: 0.232
D.CheckIntraBatch: 0.0067
D.MergeWrite: 0.341
D.RemoveBefore: 0.232
```
# Our benchmark
## Skip list
| ns/op | op/s | err% | total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
| 245.99 | 4,065,232.81 | 0.3% | 0.01 | `point reads`
| 265.93 | 3,760,430.49 | 0.2% | 0.01 | `prefix reads`
| 485.30 | 2,060,569.50 | 0.2% | 0.01 | `range reads`
| 449.60 | 2,224,195.17 | 0.4% | 0.01 | `point writes`
| 441.76 | 2,263,688.18 | 1.1% | 0.01 | `prefix writes`
| 245.42 | 4,074,647.54 | 2.4% | 0.02 | `range writes`
| 572.80 | 1,745,810.06 | 1.3% | 0.01 | `monotonic increasing point writes`
| 154,819.33 | 6,459.14 | 0.9% | 0.01 | `worst case for radix tree`
## Radix tree (this implementation)
| ns/op | op/s | err% | total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
| 20.25 | 49,372,759.86 | 0.3% | 0.01 | `point reads`
| 23.58 | 42,401,298.00 | 0.3% | 0.01 | `prefix reads`
| 64.12 | 15,595,463.14 | 0.8% | 0.01 | `range reads`
| 29.50 | 33,903,101.20 | 0.7% | 0.01 | `point writes`
| 46.76 | 21,384,036.19 | 1.2% | 0.01 | `prefix writes`
| 51.25 | 19,512,195.12 | 0.0% | 0.01 | `range writes`
| 109.51 | 9,131,469.31 | 3.6% | 0.01 | `monotonic increasing point writes`
| 1,153,875.00 | 866.65 | 1.6% | 0.01 | `worst case for radix tree`
# "Real data" test # "Real data" test
@@ -89,13 +41,13 @@ Point queries only, best of three runs. Gc ratio is the ratio of time spent doin
## skip list ## skip list
``` ```
Check: 11.3385 seconds, 329.718 MB/s, Add: 5.35612 seconds, 131.072 MB/s, Gc ratio: 45.7173% Check: 4.47891 seconds, 364.05 MB/s, Add: 4.55599 seconds, 123.058 MB/s, Gc ratio: 37.1145%
``` ```
## radix tree ## radix tree
``` ```
Check: 2.60639 seconds, 1434.36 MB/s, Add: 2.10911 seconds, 332.86 MB/s, Gc ratio: 46.3071% Check: 1.05813 seconds, 1540.97 MB/s, Add: 1.32071 seconds, 424.508 MB/s, Gc ratio: 42.2067%
``` ```
## hash table ## hash table
@@ -103,5 +55,5 @@ Check: 2.60639 seconds, 1434.36 MB/s, Add: 2.10911 seconds, 332.86 MB/s, Gc rati
(The hash table implementation doesn't work on range queries, and its purpose is to provide an idea of how fast point queries can be) (The hash table implementation doesn't work on range queries, and its purpose is to provide an idea of how fast point queries can be)
``` ```
Check: 1.83386 seconds, 2038.6 MB/s, Add: 0.601411 seconds, 1167.32 MB/s, Gc ratio: 48.9776% Check: 0.804094 seconds, 2027.81 MB/s, Add: 0.652952 seconds, 858.645 MB/s, Gc ratio: 35.3885%
``` ```