10 Commits

Author SHA1 Message Date
d1dc1247e1 Switch to new machine for benchmarks
Some checks failed
Tests / Clang total: 1479, passed: 1479
Clang |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Debug total: 1477, passed: 1477
weaselab/conflict-set/pipeline/head There was a failure building this commit
2024-08-02 18:18:36 -07:00
f1ad68109a Skip lcp call in SearchStepWise if no partial key 2024-08-02 18:16:55 -07:00
c4443bc355 Remove len < 8 check in longestCommonPrefix
Micro benchmarks look a tiny bit better /shrug
2024-08-02 18:16:55 -07:00
857b402fe2 Add an avx512bw longestCommonPrefix 2024-08-02 18:16:55 -07:00
9b3e1b219b Remove small memmoves in erase 2024-08-02 18:16:55 -07:00
ab52c63935 Move local accum's into ReadContext 2024-08-02 18:16:55 -07:00
bad9d7ced8 Avoid some tls lookups for InternalVersionT::zero 2024-08-02 18:16:55 -07:00
c8d9dc034d Remove memmove call for inserting into Node16 2024-08-02 18:16:55 -07:00
72168ef6a3 Avoid memmove when inserting into Node3 2024-08-02 18:16:55 -07:00
620a0afd2a Add debug symbols and frame pointer
So that perf works out of the box. Leave it in the release artifacts/do
whatever cpack does by default for now.
2024-08-02 18:16:55 -07:00
3 changed files with 136 additions and 213 deletions

View File

@@ -31,8 +31,14 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
"MinSizeRel" "RelWithDebInfo")
endif()
add_compile_options(-fdata-sections -ffunction-sections -Wswitch-enum
-Werror=switch-enum -fPIC)
add_compile_options(
-fdata-sections
-ffunction-sections
-Wswitch-enum
-Werror=switch-enum
-fPIC
-g
-fno-omit-frame-pointer)
set(full_relro_flags "-pie;LINKER:-z,relro,-z,now,-z,noexecstack")
cmake_push_check_state()

View File

@@ -395,8 +395,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node16 &other) {
assert(numChildren == Node16::kMaxNodes);
memset(index, -1, sizeof(index));
memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) {
v = InternalVersionT::zero;
v = z;
}
memcpy(partialKey(), &other + 1, partialKeyLen);
bitSet.init();
@@ -423,8 +424,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node48 &other) {
nextFree = other.nextFree;
memcpy(index, other.index, sizeof(index));
memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) {
v = InternalVersionT::zero;
v = z;
}
for (int i = 0; i < numChildren; ++i) {
children[i] = other.children[i];
@@ -442,8 +444,9 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
kNodeCopySize);
memset(index, -1, sizeof(index));
memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) {
v = InternalVersionT::zero;
v = z;
}
nextFree = other.numChildren;
bitSet = other.bitSet;
@@ -470,11 +473,12 @@ inline void Node256::copyChildrenAndKeyFrom(const Node48 &other) {
kNodeCopySize);
bitSet = other.bitSet;
memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) {
v = InternalVersionT::zero;
v = z;
}
for (auto &v : maxOfMax) {
v = InternalVersionT::zero;
v = z;
}
bitSet.forEachSet([&](int c) {
children[c] = other.children[other.index[c]];
@@ -491,8 +495,9 @@ inline void Node256::copyChildrenAndKeyFrom(const Node256 &other) {
memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
kNodeCopySize);
memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero;
for (auto &v : childMaxVersion) {
v = InternalVersionT::zero;
v = z;
}
bitSet = other.bitSet;
bitSet.forEachSet([&](int c) {
@@ -619,13 +624,15 @@ template <class T> struct BoundedFreeListAllocator {
T *result = allocate_helper(partialKeyCapacity);
if constexpr (!std::is_same_v<T, Node0>) {
memset(result->children, 0, sizeof(result->children));
const auto z = InternalVersionT::zero;
for (auto &v : result->childMaxVersion) {
v = InternalVersionT::zero;
v = z;
}
}
if constexpr (std::is_same_v<T, Node48> || std::is_same_v<T, Node256>) {
const auto z = InternalVersionT::zero;
for (auto &v : result->maxOfMax) {
v = InternalVersionT::zero;
v = z;
}
}
return result;
@@ -712,6 +719,9 @@ struct ReadContext {
int64_t prefix_read_iterations_accum = 0;
int64_t range_read_iterations_accum = 0;
int64_t range_read_node_scan_accum = 0;
int64_t commits_accum = 0;
int64_t conflicts_accum = 0;
int64_t too_olds_accum = 0;
ConflictSet::Impl *impl;
};
@@ -730,6 +740,10 @@ struct WriteContext {
int64_t write_bytes;
} accum;
// Cache a copy of InternalVersionT::zero, so we don't need to do the TLS
// lookup as often.
InternalVersionT zero;
WriteContext() { memset(&accum, 0, sizeof(accum)); }
template <class T> T *allocate(int c) {
@@ -1122,24 +1136,19 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, WriteContext *tls) {
insert3:
auto *self3 = static_cast<Node3 *>(self);
++self->numChildren;
int i = 0;
for (; i < self->numChildren - 1; ++i) {
if (int(self3->index[i]) > int(index)) {
memmove(self3->index + i + 1, self3->index + i,
self->numChildren - (i + 1));
memmove(self3->children + i + 1, self3->children + i,
(self->numChildren - (i + 1)) *
sizeof(self3->children[0])); // NOLINT
memmove(self3->childMaxVersion + i + 1, self3->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self3->childMaxVersion[0]));
int i = self->numChildren - 1;
for (; i >= 0; --i) {
if (int(self3->index[i]) < int(index)) {
break;
}
self3->index[i + 1] = self3->index[i];
self3->children[i + 1] = self3->children[i];
self3->childMaxVersion[i + 1] = self3->childMaxVersion[i];
}
self3->index[i] = index;
auto &result = self3->children[i];
self3->index[i + 1] = index;
auto &result = self3->children[i + 1];
result = nullptr;
++self->numChildren;
return result;
}
case Type_Node16: {
@@ -1153,71 +1162,21 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, WriteContext *tls) {
}
insert16:
auto *self16 = static_cast<Node16 *>(self);
assert(self->getType() == Type_Node16);
++self->numChildren;
#ifdef HAS_AVX
__m128i key_vec = _mm_set1_epi8(index);
__m128i indices;
memcpy(&indices, self16->index, sizeof(self16->index));
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
int mask = (1 << (self->numChildren - 1)) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
bitfield |= uint32_t(1) << (self->numChildren - 1);
int i = std::countr_zero(bitfield);
if (i < self->numChildren - 1) {
memmove(self16->index + i + 1, self16->index + i,
self->numChildren - (i + 1));
memmove(self16->children + i + 1, self16->children + i,
(self->numChildren - (i + 1)) *
sizeof(self16->children[0])); // NOLINT
memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self16->childMaxVersion[0]));
}
#elif defined(HAS_ARM_NEON)
uint8x16_t indices;
memcpy(&indices, self16->index, sizeof(self16->index));
// 0xff for each leq
auto results = vcleq_u8(vdupq_n_u8(index), indices);
uint64_t mask = (uint64_t(1) << ((self->numChildren - 1) * 4)) - 1;
// 0xf for each 0xff (within mask)
uint64_t bitfield =
vget_lane_u64(
vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)),
0) &
mask;
bitfield |= uint64_t(0xf) << ((self->numChildren - 1) * 4);
int i = std::countr_zero(bitfield) / 4;
if (i < self->numChildren - 1) {
memmove(self16->index + i + 1, self16->index + i,
self->numChildren - (i + 1));
memmove(self16->children + i + 1, self16->children + i,
(self->numChildren - (i + 1)) *
sizeof(self16->children[0])); // NOLINT
memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self16->childMaxVersion[0]));
}
#else
int i = 0;
for (; i < int(self->numChildren) - 1; ++i) {
if (int(self16->index[i]) > int(index)) {
memmove(self16->index + i + 1, self16->index + i,
self->numChildren - (i + 1));
memmove(self16->children + i + 1, self16->children + i,
(self->numChildren - (i + 1)) * sizeof(self16->children[0]));
memmove(self16->childMaxVersion + i + 1, self16->childMaxVersion + i,
(self->numChildren - (i + 1)) *
sizeof(self16->childMaxVersion[0]));
auto *self16 = static_cast<Node16 *>(self);
int i = self->numChildren - 1;
for (; i >= 0; --i) {
if (int(self16->index[i]) < int(index)) {
break;
}
self16->index[i + 1] = self16->index[i];
self16->children[i + 1] = self16->children[i];
self16->childMaxVersion[i + 1] = self16->childMaxVersion[i];
}
#endif
self16->index[i] = index;
auto &result = self16->children[i];
self16->index[i + 1] = index;
auto &result = self16->children[i + 1];
result = nullptr;
++self->numChildren;
return result;
}
case Type_Node48: {
@@ -1472,7 +1431,7 @@ void maybeDownsize(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
// that we have a new parent.
setMaxVersion(child, impl, childMaxVersion);
if (child->parent) {
rezero(child->parent, InternalVersionT::zero);
rezero(child->parent, tls->zero);
}
getInTree(self, impl) = child;
@@ -1552,36 +1511,24 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
auto *parent3 = static_cast<Node3 *>(parent);
int nodeIndex = getNodeIndex(parent3, parentsIndex);
assert(nodeIndex >= 0);
memmove(parent3->index + nodeIndex, parent3->index + nodeIndex + 1,
sizeof(parent3->index[0]) *
(parent->numChildren - (nodeIndex + 1)));
memmove(parent3->children + nodeIndex, parent3->children + nodeIndex + 1,
sizeof(parent3->children[0]) * // NOLINT
(parent->numChildren - (nodeIndex + 1)));
memmove(parent3->childMaxVersion + nodeIndex,
parent3->childMaxVersion + nodeIndex + 1,
sizeof(parent3->childMaxVersion[0]) *
(parent->numChildren - (nodeIndex + 1)));
--parent->numChildren;
for (int i = nodeIndex; i < parent->numChildren; ++i) {
parent3->index[i] = parent3->index[i + 1];
parent3->children[i] = parent3->children[i + 1];
parent3->childMaxVersion[i] = parent3->childMaxVersion[i + 1];
}
assert(parent->numChildren > 0 || parent->entryPresent);
} break;
case Type_Node16: {
auto *parent16 = static_cast<Node16 *>(parent);
int nodeIndex = getNodeIndex(parent16, parentsIndex);
assert(nodeIndex >= 0);
memmove(parent16->index + nodeIndex, parent16->index + nodeIndex + 1,
sizeof(parent16->index[0]) *
(parent->numChildren - (nodeIndex + 1)));
memmove(parent16->children + nodeIndex, parent16->children + nodeIndex + 1,
sizeof(parent16->children[0]) * // NOLINT
(parent->numChildren - (nodeIndex + 1)));
memmove(parent16->childMaxVersion + nodeIndex,
parent16->childMaxVersion + nodeIndex + 1,
sizeof(parent16->childMaxVersion[0]) *
(parent->numChildren - (nodeIndex + 1)));
--parent->numChildren;
for (int i = nodeIndex; i < parent->numChildren; ++i) {
parent16->index[i] = parent16->index[i + 1];
parent16->children[i] = parent16->children[i + 1];
parent16->childMaxVersion[i] = parent16->childMaxVersion[i + 1];
}
// By kMinChildrenNode16
assert(parent->numChildren > 0);
@@ -1608,7 +1555,7 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
parent48->index[parentIndex] = toRemoveChildrenIndex;
parent48->reverseIndex[toRemoveChildrenIndex] = parentIndex;
}
parent48->childMaxVersion[lastChildrenIndex] = InternalVersionT::zero;
parent48->childMaxVersion[lastChildrenIndex] = tls->zero;
--parent->numChildren;
@@ -1726,15 +1673,38 @@ int firstNeqStride(const uint8_t *ap, const uint8_t *bp) {
#endif
}
#if defined(__x86_64__) && !defined(__SANITIZE_THREAD__)
// AVX-512BW variant of longestCommonPrefix: returns how many leading bytes of
// ap and bp are equal, looking at no more than cl bytes.
__attribute__((target("avx512bw"))) int
longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
  // cl rounded down to a multiple of 64 (one full 512-bit vector per step).
  const int fullBlocksEnd = cl & ~63;
  int matched = 0;
  // Whole 64-byte blocks: the compare yields one mask bit per byte, set where
  // the two inputs agree.
  for (; matched < fullBlocksEnd; matched += 64, ap += 64, bp += 64) {
    const uint64_t equalLanes =
        _mm512_cmpeq_epi8_mask(_mm512_loadu_epi8(ap), _mm512_loadu_epi8(bp));
    if (~equalLanes != 0) {
      // The run of low set bits is the matching prefix within this block.
      return matched + std::countr_one(equalLanes);
    }
  }
  if (matched >= cl) {
    assert(matched == cl);
    return matched;
  }
  // Between 1 and 63 bytes remain. Load only those lanes; the maskz loads zero
  // the rest, which therefore compare equal and must be stripped again with
  // tailMask before counting.
  const int tailLen = cl - matched;
  const uint64_t tailMask = ~uint64_t(0) >> (64 - tailLen);
  const __m512i tailA = _mm512_maskz_loadu_epi8(tailMask, ap);
  const __m512i tailB = _mm512_maskz_loadu_epi8(tailMask, bp);
  const uint64_t equalLanes = _mm512_cmpeq_epi8_mask(tailA, tailB);
  return matched + std::countr_one(equalLanes & tailMask);
}
__attribute__((target("default")))
#endif
int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
assume(cl >= 0);
int i = 0;
int end;
if (cl < 8) {
goto bytes;
}
// kStride * kUnrollFactor bytes at a time
end = cl & ~(kStride * kUnrollFactor - 1);
while (i < end) {
@@ -1775,7 +1745,6 @@ int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
bp += 8;
}
bytes:
// byte at a time
while (i < cl) {
if (*ap != *bp) {
@@ -1808,10 +1777,13 @@ struct SearchStepWise {
if (child == nullptr) {
return true;
}
int cl = std::min<int>(child->partialKeyLen, remaining.size() - 1);
int i = longestCommonPrefix(child->partialKey(), remaining.data() + 1, cl);
if (i != child->partialKeyLen) {
return true;
if (child->partialKeyLen > 0) {
int cl = std::min<int>(child->partialKeyLen, remaining.size() - 1);
int i =
longestCommonPrefix(child->partialKey(), remaining.data() + 1, cl);
if (i != child->partialKeyLen) {
return true;
}
}
n = child;
remaining =
@@ -2949,8 +2921,7 @@ template <bool kBegin>
child->partialKeyLen = 0;
child->parent = *self;
child->parentsIndex = key.front();
setMaxVersion(child, impl,
kBegin ? writeVersion : InternalVersionT::zero);
setMaxVersion(child, impl, kBegin ? writeVersion : tls->zero);
}
self = &child;
@@ -2998,8 +2969,7 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
n->entry.pointVersion = writeVersion;
setMaxVersion(n, impl, writeVersion);
n->entry.rangeVersion =
p == nullptr ? InternalVersionT::zero
: std::max(p->entry.rangeVersion, InternalVersionT::zero);
p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
} else {
assert(writeVersion >= n->entry.pointVersion);
n->entry.pointVersion = writeVersion;
@@ -3063,8 +3033,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
++tls->accum.entries_inserted;
auto *p = nextLogical(beginNode);
beginNode->entry.rangeVersion =
p == nullptr ? InternalVersionT::zero
: std::max(p->entry.rangeVersion, InternalVersionT::zero);
p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
beginNode->entry.pointVersion = writeVersion;
assert(maxVersion(beginNode, impl) <= writeVersion);
setMaxVersion(beginNode, impl, writeVersion);
@@ -3084,8 +3053,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
++tls->accum.entries_inserted;
auto *p = nextLogical(endNode);
endNode->entry.pointVersion =
p == nullptr ? InternalVersionT::zero
: std::max(p->entry.rangeVersion, InternalVersionT::zero);
p == nullptr ? tls->zero : std::max(p->entry.rangeVersion, tls->zero);
auto m = maxVersion(endNode, impl);
setMaxVersion(endNode, impl,
std::max<InternalVersionT>(m, endNode->entry.pointVersion));
@@ -3162,9 +3130,6 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
void check(const ReadRange *reads, Result *result, int count) {
ReadContext tls;
tls.impl = this;
int commits_accum = 0;
int conflicts_accum = 0;
int too_olds_accum = 0;
int64_t check_byte_accum = 0;
for (int i = 0; i < count; ++i) {
const auto &r = reads[i];
@@ -3182,9 +3147,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
InternalVersionT(reads[i].readVersion), &tls))
? Commit
: Conflict;
commits_accum += result[i] == Commit;
conflicts_accum += result[i] == Conflict;
too_olds_accum += result[i] == TooOld;
tls.commits_accum += result[i] == Commit;
tls.conflicts_accum += result[i] == Conflict;
tls.too_olds_accum += result[i] == TooOld;
}
point_read_total.add(tls.point_read_accum);
prefix_read_total.add(tls.prefix_read_accum);
@@ -3196,9 +3161,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
point_read_iterations_total.add(tls.point_read_iterations_accum);
prefix_read_iterations_total.add(tls.prefix_read_iterations_accum);
range_read_iterations_total.add(tls.range_read_iterations_accum);
commits_total.add(commits_accum);
conflicts_total.add(conflicts_accum);
too_olds_total.add(too_olds_accum);
commits_total.add(tls.commits_accum);
conflicts_total.add(tls.conflicts_accum);
too_olds_total.add(tls.too_olds_accum);
check_bytes_total.add(check_byte_accum);
}
@@ -3206,7 +3171,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
// There could be other conflict sets in the same thread. We need
// InternalVersionT::zero to be correct for this conflict set for the
// lifetime of the current call frame.
InternalVersionT::zero = oldestVersion;
InternalVersionT::zero = tls.zero = oldestVersion;
assert(writeVersion >= newestVersionFullPrecision);
@@ -3317,7 +3282,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
InternalVersionT oldestVersion{o};
this->oldestVersionFullPrecision = o;
this->oldestVersion = oldestVersion;
InternalVersionT::zero = oldestVersion;
InternalVersionT::zero = tls.zero = oldestVersion;
#ifdef NDEBUG
// This is here for performance reasons, since we want to amortize the cost
// of storing the search path as a string. In tests, we want to exercise the
@@ -3367,7 +3332,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
root->entry.pointVersion = this->oldestVersion;
root->entry.rangeVersion = this->oldestVersion;
InternalVersionT::zero = this->oldestVersion;
InternalVersionT::zero = tls.zero = this->oldestVersion;
// Intentionally not resetting totalBytes
}

104
README.md
View File

@@ -1,86 +1,38 @@
A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys.
Intended to replace FoundationDB's skip list.
Intended as an alternative to FoundationDB's skip list.
Hardware for all benchmarks is a mac m1 2020.
Hardware for all benchmarks is an AMD Ryzen 9 7900 with 2x32GB of 5600MT/s CL28-34-34-89 1.35V RAM.
# FoundationDB's benchmark
# Microbenchmark
## Skip list
```
New conflict set: 1.957 sec
0.639 Mtransactions/sec
2.555 Mkeys/sec
Detect only: 1.845 sec
0.678 Mtransactions/sec
2.710 Mkeys/sec
Skiplist only: 1.263 sec
0.990 Mtransactions/sec
3.960 Mkeys/sec
Performance counters:
Build: 0.0546
Add: 0.0563
Detect: 1.84
D.Sort: 0.412
D.Combine: 0.0141
D.CheckRead: 0.671
D.CheckIntraBatch: 0.0068
D.MergeWrite: 0.592
D.RemoveBefore: 0.146
```
| ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
| 172.03 | 5,812,791.77 | 0.4% | 3,130.62 | 879.00 | 3.562 | 509.23 | 0.0% | 0.01 | `point reads`
| 167.44 | 5,972,130.71 | 0.2% | 3,065.14 | 862.27 | 3.555 | 494.30 | 0.0% | 0.01 | `prefix reads`
| 238.77 | 4,188,130.84 | 0.9% | 3,589.93 | 1,259.30 | 2.851 | 637.12 | 0.0% | 0.01 | `range reads`
| 424.01 | 2,358,426.70 | 0.2% | 5,620.05 | 2,242.35 | 2.506 | 854.80 | 1.7% | 0.01 | `point writes`
| 418.45 | 2,389,780.56 | 0.4% | 5,525.07 | 2,211.05 | 2.499 | 831.71 | 1.7% | 0.01 | `prefix writes`
| 254.87 | 3,923,568.88 | 2.6% | 3,187.01 | 1,366.50 | 2.332 | 529.11 | 2.7% | 0.02 | `range writes`
| 675.96 | 1,479,374.50 | 3.3% | 7,735.41 | 3,468.60 | 2.230 | 1,386.02 | 1.8% | 0.01 | `monotonic increasing point writes`
| 137,986.20 | 7,247.10 | 0.6% | 789,752.33 | 699,462.00 | 1.129 | 144,824.14 | 0.0% | 0.01 | `worst case for radix tree`
| 21.63 | 46,231,564.03 | 1.0% | 448.00 | 107.14 | 4.181 | 84.00 | 0.0% | 0.01 | `create and destroy`
## Radix tree (this implementation)
```
New conflict set: 1.366 sec
0.915 Mtransactions/sec
3.660 Mkeys/sec
Detect only: 1.248 sec
1.002 Mtransactions/sec
4.007 Mkeys/sec
Skiplist only: 0.573 sec
2.182 Mtransactions/sec
8.730 Mkeys/sec
Performance counters:
Build: 0.0594
Add: 0.0572
Detect: 1.25
D.Sort: 0.418
D.Combine: 0.0149
D.CheckRead: 0.232
D.CheckIntraBatch: 0.0067
D.MergeWrite: 0.341
D.RemoveBefore: 0.232
```
# Our benchmark
## Skip list
| ns/op | op/s | err% | total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
| 245.99 | 4,065,232.81 | 0.3% | 0.01 | `point reads`
| 265.93 | 3,760,430.49 | 0.2% | 0.01 | `prefix reads`
| 485.30 | 2,060,569.50 | 0.2% | 0.01 | `range reads`
| 449.60 | 2,224,195.17 | 0.4% | 0.01 | `point writes`
| 441.76 | 2,263,688.18 | 1.1% | 0.01 | `prefix writes`
| 245.42 | 4,074,647.54 | 2.4% | 0.02 | `range writes`
| 572.80 | 1,745,810.06 | 1.3% | 0.01 | `monotonic increasing point writes`
| 154,819.33 | 6,459.14 | 0.9% | 0.01 | `worst case for radix tree`
## Radix tree (this implementation)
| ns/op | op/s | err% | total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
| 20.25 | 49,372,759.86 | 0.3% | 0.01 | `point reads`
| 23.58 | 42,401,298.00 | 0.3% | 0.01 | `prefix reads`
| 64.12 | 15,595,463.14 | 0.8% | 0.01 | `range reads`
| 29.50 | 33,903,101.20 | 0.7% | 0.01 | `point writes`
| 46.76 | 21,384,036.19 | 1.2% | 0.01 | `prefix writes`
| 51.25 | 19,512,195.12 | 0.0% | 0.01 | `range writes`
| 109.51 | 9,131,469.31 | 3.6% | 0.01 | `monotonic increasing point writes`
| 1,153,875.00 | 866.65 | 1.6% | 0.01 | `worst case for radix tree`
| ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
| 17.03 | 58,732,967.93 | 0.6% | 276.28 | 87.96 | 3.141 | 52.15 | 0.4% | 0.01 | `point reads`
| 19.52 | 51,239,158.04 | 0.3% | 367.16 | 101.50 | 3.617 | 61.92 | 0.3% | 0.01 | `prefix reads`
| 47.74 | 20,947,676.63 | 0.5% | 998.16 | 247.43 | 4.034 | 161.64 | 0.2% | 0.01 | `range reads`
| 23.14 | 43,207,824.89 | 0.4% | 408.18 | 121.64 | 3.356 | 70.20 | 0.3% | 0.01 | `point writes`
| 38.02 | 26,302,115.66 | 0.1% | 709.72 | 199.70 | 3.554 | 134.26 | 0.3% | 0.01 | `prefix writes`
| 44.28 | 22,583,559.17 | 0.9% | 825.19 | 233.10 | 3.540 | 141.48 | 0.2% | 0.01 | `range writes`
| 85.50 | 11,695,990.63 | 0.5% | 1,488.16 | 455.68 | 3.266 | 289.22 | 0.1% | 0.01 | `monotonic increasing point writes`
| 338,388.50 | 2,955.18 | 3.3% | 4,097,087.00 | 1,809,996.00 | 2.264 | 759,645.00 | 0.0% | 0.01 | `worst case for radix tree`
| 84.84 | 11,787,313.59 | 1.4% | 1,716.02 | 440.50 | 3.896 | 271.00 | 0.0% | 0.01 | `create and destroy`
# "Real data" test
@@ -89,13 +41,13 @@ Point queries only, best of three runs. Gc ratio is the ratio of time spent doin
## skip list
```
Check: 11.3385 seconds, 329.718 MB/s, Add: 5.35612 seconds, 131.072 MB/s, Gc ratio: 45.7173%
Check: 4.47891 seconds, 364.05 MB/s, Add: 4.55599 seconds, 123.058 MB/s, Gc ratio: 37.1145%
```
## radix tree
```
Check: 2.60639 seconds, 1434.36 MB/s, Add: 2.10911 seconds, 332.86 MB/s, Gc ratio: 46.3071%
Check: 1.05813 seconds, 1540.97 MB/s, Add: 1.32071 seconds, 424.508 MB/s, Gc ratio: 42.2067%
```
## hash table
@@ -103,5 +55,5 @@ Check: 2.60639 seconds, 1434.36 MB/s, Add: 2.10911 seconds, 332.86 MB/s, Gc rati
(The hash table implementation doesn't work on range queries, and its purpose is to provide an idea of how fast point queries can be)
```
Check: 1.83386 seconds, 2038.6 MB/s, Add: 0.601411 seconds, 1167.32 MB/s, Gc ratio: 48.9776%
Check: 0.804094 seconds, 2027.81 MB/s, Add: 0.652952 seconds, 858.645 MB/s, Gc ratio: 35.3885%
```