Compare commits
20 Commits
v0.0.11
...
0a850f22e9
| Author | SHA1 | Date | |
|---|---|---|---|
| 0a850f22e9 | |||
| 479b39d055 | |||
| 482408d725 | |||
| 45995e3307 | |||
| 359b0b29ff | |||
| 54e47ebd40 | |||
| 1c9dda68a6 | |||
| 142455dd28 | |||
| 567d385fbd | |||
| 8a44055533 | |||
| 62516825d1 | |||
| 3d592bd6a9 | |||
| f5f5fb620b | |||
| e3d1b2e842 | |||
| 9f8800af16 | |||
| 182c065c8e | |||
| 2dba0d5be3 | |||
| a1dfdf355c | |||
| 15919cb1c4 | |||
| 5ed9003a83 |
+1
-1
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(
|
||||
conflict-set
|
||||
VERSION 0.0.11
|
||||
VERSION 0.0.12
|
||||
DESCRIPTION
|
||||
"A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys."
|
||||
HOMEPAGE_URL "https://git.weaselab.dev/weaselab/conflict-set"
|
||||
|
||||
+127
-303
@@ -16,12 +16,12 @@ limitations under the License.
|
||||
|
||||
#include "ConflictSet.h"
|
||||
#include "Internal.h"
|
||||
#include "LongestCommonPrefix.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <bit>
|
||||
#include <cassert>
|
||||
#include <compare>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
@@ -40,11 +40,13 @@ limitations under the License.
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#ifndef __SANITIZE_THREAD__
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(thread_sanitizer)
|
||||
#define __SANITIZE_THREAD__
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <memcheck.h>
|
||||
|
||||
@@ -1123,12 +1125,68 @@ Node *getFirstChildExists(Node *self) {
|
||||
}
|
||||
}
|
||||
|
||||
// Caller is responsible for assigning a non-null pointer to the returned
|
||||
// reference if null. Updates child's max version to `newMaxVersion` if child
|
||||
// exists but does not have a partial key.
|
||||
Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
void consumePartialKeyFull(Node *&self, std::span<const uint8_t> &key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
// Handle an existing partial key
|
||||
int commonLen = std::min<int>(self->partialKeyLen, key.size());
|
||||
int partialKeyIndex =
|
||||
longestCommonPrefix(self->partialKey(), key.data(), commonLen);
|
||||
if (partialKeyIndex < self->partialKeyLen) {
|
||||
auto *old = self;
|
||||
// Since root cannot have a partial key
|
||||
assert(old->parent != nullptr);
|
||||
InternalVersionT oldMaxVersion = exchangeMaxVersion(old, writeVersion);
|
||||
|
||||
// *self will have one child (old)
|
||||
auto *newSelf = tls->allocate<Node3>(partialKeyIndex);
|
||||
|
||||
newSelf->parent = old->parent;
|
||||
newSelf->parentsIndex = old->parentsIndex;
|
||||
newSelf->partialKeyLen = partialKeyIndex;
|
||||
newSelf->entryPresent = false;
|
||||
newSelf->numChildren = 1;
|
||||
|
||||
memcpy(newSelf->partialKey(), old->partialKey(), newSelf->partialKeyLen);
|
||||
|
||||
uint8_t oldDistinguishingByte = old->partialKey()[partialKeyIndex];
|
||||
old->parent = newSelf;
|
||||
old->parentsIndex = oldDistinguishingByte;
|
||||
newSelf->index[0] = oldDistinguishingByte;
|
||||
newSelf->children[0] = old;
|
||||
newSelf->childMaxVersion[0] = oldMaxVersion;
|
||||
self = newSelf;
|
||||
|
||||
memmove(old->partialKey(), old->partialKey() + partialKeyIndex + 1,
|
||||
old->partialKeyLen - (partialKeyIndex + 1));
|
||||
old->partialKeyLen -= partialKeyIndex + 1;
|
||||
|
||||
// We would consider decreasing capacity here, but we can't invalidate
|
||||
// old since it's not on the search path. setOldestVersion will clean it
|
||||
// up.
|
||||
}
|
||||
key = key.subspan(partialKeyIndex, key.size() - partialKeyIndex);
|
||||
}
|
||||
|
||||
// Consume any partial key of `self`, and update `self` and
|
||||
// `key` such that `self` is along the search path of `key`
|
||||
inline __attribute__((always_inline)) void
|
||||
consumePartialKey(Node *&self, std::span<const uint8_t> &key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
if (self->partialKeyLen > 0) {
|
||||
consumePartialKeyFull(self, key, writeVersion, tls);
|
||||
}
|
||||
}
|
||||
|
||||
// Return the next node along the search path of key, consuming bytes of key
|
||||
// such that the search path of the result + key is the same as the search path
|
||||
// of self + key before the call. Creates a node if necessary. Updates
|
||||
// `maxVersion` for result.
|
||||
Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
|
||||
InternalVersionT newMaxVersion, WriteContext *tls) {
|
||||
|
||||
int index = key.front();
|
||||
key = key.subspan(1, key.size() - 1);
|
||||
|
||||
// Fast path for if it exists already
|
||||
switch (self->getType()) {
|
||||
case Type_Node0:
|
||||
@@ -1137,9 +1195,8 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
auto *self3 = static_cast<Node3 *>(self);
|
||||
int i = getNodeIndex(self3, index);
|
||||
if (i >= 0) {
|
||||
if (self3->children[i]->partialKeyLen == 0) {
|
||||
self3->childMaxVersion[i] = newMaxVersion;
|
||||
}
|
||||
consumePartialKey(self3->children[i], key, newMaxVersion, tls);
|
||||
self3->childMaxVersion[i] = newMaxVersion;
|
||||
return self3->children[i];
|
||||
}
|
||||
} break;
|
||||
@@ -1147,9 +1204,8 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
auto *self16 = static_cast<Node16 *>(self);
|
||||
int i = getNodeIndex(self16, index);
|
||||
if (i >= 0) {
|
||||
if (self16->children[i]->partialKeyLen == 0) {
|
||||
self16->childMaxVersion[i] = newMaxVersion;
|
||||
}
|
||||
consumePartialKey(self16->children[i], key, newMaxVersion, tls);
|
||||
self16->childMaxVersion[i] = newMaxVersion;
|
||||
return self16->children[i];
|
||||
}
|
||||
} break;
|
||||
@@ -1157,23 +1213,21 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
auto *self48 = static_cast<Node48 *>(self);
|
||||
int secondIndex = self48->index[index];
|
||||
if (secondIndex >= 0) {
|
||||
if (self48->children[secondIndex]->partialKeyLen == 0) {
|
||||
self48->childMaxVersion[secondIndex] = newMaxVersion;
|
||||
self48->maxOfMax[secondIndex >> Node48::kMaxOfMaxShift] =
|
||||
std::max(self48->maxOfMax[secondIndex >> Node48::kMaxOfMaxShift],
|
||||
newMaxVersion);
|
||||
}
|
||||
consumePartialKey(self48->children[secondIndex], key, newMaxVersion, tls);
|
||||
self48->childMaxVersion[secondIndex] = newMaxVersion;
|
||||
self48->maxOfMax[secondIndex >> Node48::kMaxOfMaxShift] =
|
||||
std::max(self48->maxOfMax[secondIndex >> Node48::kMaxOfMaxShift],
|
||||
newMaxVersion);
|
||||
return self48->children[secondIndex];
|
||||
}
|
||||
} break;
|
||||
case Type_Node256: {
|
||||
auto *self256 = static_cast<Node256 *>(self);
|
||||
if (auto &result = self256->children[index]; result != nullptr) {
|
||||
if (self256->children[index]->partialKeyLen == 0) {
|
||||
self256->childMaxVersion[index] = newMaxVersion;
|
||||
self256->maxOfMax[index >> Node256::kMaxOfMaxShift] = std::max(
|
||||
self256->maxOfMax[index >> Node256::kMaxOfMaxShift], newMaxVersion);
|
||||
}
|
||||
consumePartialKey(result, key, newMaxVersion, tls);
|
||||
self256->childMaxVersion[index] = newMaxVersion;
|
||||
self256->maxOfMax[index >> Node256::kMaxOfMaxShift] = std::max(
|
||||
self256->maxOfMax[index >> Node256::kMaxOfMaxShift], newMaxVersion);
|
||||
return result;
|
||||
}
|
||||
} break;
|
||||
@@ -1181,6 +1235,14 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
}
|
||||
|
||||
auto *newChild = tls->allocate<Node0>(key.size());
|
||||
newChild->numChildren = 0;
|
||||
newChild->entryPresent = false;
|
||||
newChild->partialKeyLen = key.size();
|
||||
newChild->parentsIndex = index;
|
||||
memcpy(newChild->partialKey(), key.data(), key.size());
|
||||
key = {};
|
||||
|
||||
switch (self->getType()) {
|
||||
case Type_Node0: {
|
||||
auto *self0 = static_cast<Node0 *>(self);
|
||||
@@ -1215,8 +1277,10 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
}
|
||||
self3->index[i + 1] = index;
|
||||
auto &result = self3->children[i + 1];
|
||||
result = nullptr;
|
||||
self3->childMaxVersion[i + 1] = newMaxVersion;
|
||||
result = newChild;
|
||||
++self->numChildren;
|
||||
newChild->parent = self;
|
||||
return result;
|
||||
}
|
||||
case Type_Node16: {
|
||||
@@ -1243,8 +1307,10 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
}
|
||||
self16->index[i + 1] = index;
|
||||
auto &result = self16->children[i + 1];
|
||||
result = nullptr;
|
||||
self16->childMaxVersion[i + 1] = newMaxVersion;
|
||||
result = newChild;
|
||||
++self->numChildren;
|
||||
newChild->parent = self;
|
||||
return result;
|
||||
}
|
||||
case Type_Node48: {
|
||||
@@ -1267,7 +1333,11 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
self48->index[index] = nextFree;
|
||||
self48->reverseIndex[nextFree] = index;
|
||||
auto &result = self48->children[nextFree];
|
||||
result = nullptr;
|
||||
self48->childMaxVersion[nextFree] = newMaxVersion;
|
||||
self48->maxOfMax[nextFree >> Node48::kMaxOfMaxShift] = std::max(
|
||||
newMaxVersion, self48->maxOfMax[nextFree >> Node48::kMaxOfMaxShift]);
|
||||
result = newChild;
|
||||
newChild->parent = self;
|
||||
return result;
|
||||
}
|
||||
case Type_Node256: {
|
||||
@@ -1276,7 +1346,13 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
||||
auto *self256 = static_cast<Node256 *>(self);
|
||||
++self->numChildren;
|
||||
self256->bitSet.set(index);
|
||||
return self256->children[index];
|
||||
auto &result = self256->children[index];
|
||||
self256->childMaxVersion[index] = newMaxVersion;
|
||||
self256->maxOfMax[index >> Node256::kMaxOfMaxShift] = std::max(
|
||||
newMaxVersion, self256->maxOfMax[index >> Node256::kMaxOfMaxShift]);
|
||||
result = newChild;
|
||||
newChild->parent = self;
|
||||
return result;
|
||||
}
|
||||
default: // GCOVR_EXCL_LINE
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
@@ -1687,167 +1763,6 @@ Node *nextSibling(Node *node) {
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(HAS_AVX) || defined(HAS_ARM_NEON)
|
||||
constexpr int kStride = 64;
|
||||
#else
|
||||
constexpr int kStride = 16;
|
||||
#endif
|
||||
|
||||
constexpr int kUnrollFactor = 4;
|
||||
|
||||
bool compareStride(const uint8_t *ap, const uint8_t *bp) {
|
||||
#if defined(HAS_ARM_NEON)
|
||||
static_assert(kStride == 64);
|
||||
uint8x16_t x[4]; // GCOVR_EXCL_LINE
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
x[i] = vceqq_u8(vld1q_u8(ap + i * 16), vld1q_u8(bp + i * 16));
|
||||
}
|
||||
auto results = vreinterpretq_u16_u8(
|
||||
vandq_u8(vandq_u8(x[0], x[1]), vandq_u8(x[2], x[3])));
|
||||
bool eq = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) ==
|
||||
uint64_t(-1);
|
||||
#elif defined(HAS_AVX)
|
||||
static_assert(kStride == 64);
|
||||
__m128i x[4]; // GCOVR_EXCL_LINE
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
x[i] = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)(ap + i * 16)),
|
||||
_mm_loadu_si128((__m128i *)(bp + i * 16)));
|
||||
}
|
||||
auto eq =
|
||||
_mm_movemask_epi8(_mm_and_si128(_mm_and_si128(x[0], x[1]),
|
||||
_mm_and_si128(x[2], x[3]))) == 0xffff;
|
||||
#else
|
||||
// Hope it gets vectorized
|
||||
auto eq = memcmp(ap, bp, kStride) == 0;
|
||||
#endif
|
||||
return eq;
|
||||
}
|
||||
|
||||
// Precondition: ap[:kStride] != bp[:kStride]
|
||||
int firstNeqStride(const uint8_t *ap, const uint8_t *bp) {
|
||||
#if defined(HAS_AVX)
|
||||
static_assert(kStride == 64);
|
||||
uint64_t c[kStride / 16]; // GCOVR_EXCL_LINE
|
||||
for (int i = 0; i < kStride; i += 16) {
|
||||
const auto a = _mm_loadu_si128((__m128i *)(ap + i));
|
||||
const auto b = _mm_loadu_si128((__m128i *)(bp + i));
|
||||
const auto compared = _mm_cmpeq_epi8(a, b);
|
||||
c[i / 16] = _mm_movemask_epi8(compared) & 0xffff;
|
||||
}
|
||||
return std::countr_zero(~(c[0] | c[1] << 16 | c[2] << 32 | c[3] << 48));
|
||||
#elif defined(HAS_ARM_NEON)
|
||||
static_assert(kStride == 64);
|
||||
for (int i = 0; i < kStride; i += 16) {
|
||||
// 0xff for each match
|
||||
uint16x8_t results =
|
||||
vreinterpretq_u16_u8(vceqq_u8(vld1q_u8(ap + i), vld1q_u8(bp + i)));
|
||||
// 0xf for each mismatch
|
||||
uint64_t bitfield =
|
||||
~vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0);
|
||||
if (bitfield) {
|
||||
return i + (std::countr_zero(bitfield) >> 2);
|
||||
}
|
||||
}
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
#else
|
||||
int i = 0;
|
||||
for (; i < kStride - 1; ++i) {
|
||||
if (*ap++ != *bp++) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
#endif
|
||||
}
|
||||
|
||||
// This gets covered in local development
|
||||
// GCOVR_EXCL_START
|
||||
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
|
||||
__attribute__((target("avx512f,avx512bw"))) int
|
||||
longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
|
||||
int i = 0;
|
||||
int end = cl & ~63;
|
||||
while (i < end) {
|
||||
const uint64_t eq =
|
||||
_mm512_cmpeq_epi8_mask(_mm512_loadu_epi8(ap), _mm512_loadu_epi8(bp));
|
||||
if (eq != uint64_t(-1)) {
|
||||
return i + std::countr_one(eq);
|
||||
}
|
||||
i += 64;
|
||||
ap += 64;
|
||||
bp += 64;
|
||||
}
|
||||
if (i < cl) {
|
||||
const uint64_t mask = (uint64_t(1) << (cl - i)) - 1;
|
||||
const uint64_t eq = _mm512_cmpeq_epi8_mask(
|
||||
_mm512_maskz_loadu_epi8(mask, ap), _mm512_maskz_loadu_epi8(mask, bp));
|
||||
return i + std::countr_one(eq & mask);
|
||||
}
|
||||
assert(i == cl);
|
||||
return i;
|
||||
}
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
// GCOVR_EXCL_STOP
|
||||
|
||||
int longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
|
||||
assume(cl >= 0);
|
||||
int i = 0;
|
||||
int end;
|
||||
|
||||
// kStride * kUnrollCount at a time
|
||||
end = cl & ~(kStride * kUnrollFactor - 1);
|
||||
while (i < end) {
|
||||
for (int j = 0; j < kUnrollFactor; ++j) {
|
||||
if (!compareStride(ap, bp)) {
|
||||
return i + firstNeqStride(ap, bp);
|
||||
}
|
||||
i += kStride;
|
||||
ap += kStride;
|
||||
bp += kStride;
|
||||
}
|
||||
}
|
||||
|
||||
// kStride at a time
|
||||
end = cl & ~(kStride - 1);
|
||||
while (i < end) {
|
||||
if (!compareStride(ap, bp)) {
|
||||
return i + firstNeqStride(ap, bp);
|
||||
}
|
||||
i += kStride;
|
||||
ap += kStride;
|
||||
bp += kStride;
|
||||
}
|
||||
|
||||
// word at a time
|
||||
end = cl & ~(sizeof(uint64_t) - 1);
|
||||
while (i < end) {
|
||||
uint64_t a; // GCOVR_EXCL_LINE
|
||||
uint64_t b; // GCOVR_EXCL_LINE
|
||||
memcpy(&a, ap, 8);
|
||||
memcpy(&b, bp, 8);
|
||||
const auto mismatched = a ^ b;
|
||||
if (mismatched) {
|
||||
return i + std::countr_zero(mismatched) / 8;
|
||||
}
|
||||
i += 8;
|
||||
ap += 8;
|
||||
bp += 8;
|
||||
}
|
||||
|
||||
// byte at a time
|
||||
while (i < cl) {
|
||||
if (*ap != *bp) {
|
||||
break;
|
||||
}
|
||||
++ap;
|
||||
++bp;
|
||||
++i;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
// Logically this is the same as performing firstGeq and then checking against
|
||||
// point or range version according to cmp, but this version short circuits as
|
||||
// soon as it can prove that there's no conflict.
|
||||
@@ -2877,94 +2792,22 @@ checkMaxBetweenExclusiveImpl<true>(Node *n, int begin, int end,
|
||||
InternalVersionT readVersion, ReadContext *);
|
||||
#endif
|
||||
|
||||
// Consume the partial key of `self` (which must exist), and update `self` and
|
||||
// `key` such that `self` is along the search path of `key`
|
||||
void consumePartialKey(Node *&self, std::span<const uint8_t> &key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
assert(self->partialKeyLen > 0);
|
||||
// Handle an existing partial key
|
||||
int commonLen = std::min<int>(self->partialKeyLen, key.size());
|
||||
int partialKeyIndex =
|
||||
longestCommonPrefix(self->partialKey(), key.data(), commonLen);
|
||||
if (partialKeyIndex < self->partialKeyLen) {
|
||||
auto *old = self;
|
||||
// Since root cannot have a partial key
|
||||
assert(old->parent != nullptr);
|
||||
InternalVersionT oldMaxVersion = exchangeMaxVersion(old, writeVersion);
|
||||
|
||||
// *self will have one child (old)
|
||||
auto *newSelf = tls->allocate<Node3>(partialKeyIndex);
|
||||
|
||||
newSelf->parent = old->parent;
|
||||
newSelf->parentsIndex = old->parentsIndex;
|
||||
newSelf->partialKeyLen = partialKeyIndex;
|
||||
newSelf->entryPresent = false;
|
||||
newSelf->numChildren = 1;
|
||||
|
||||
memcpy(newSelf->partialKey(), old->partialKey(), newSelf->partialKeyLen);
|
||||
|
||||
uint8_t oldDistinguishingByte = old->partialKey()[partialKeyIndex];
|
||||
old->parent = newSelf;
|
||||
old->parentsIndex = oldDistinguishingByte;
|
||||
newSelf->index[0] = oldDistinguishingByte;
|
||||
newSelf->children[0] = old;
|
||||
newSelf->childMaxVersion[0] = oldMaxVersion;
|
||||
self = newSelf;
|
||||
|
||||
memmove(old->partialKey(), old->partialKey() + partialKeyIndex + 1,
|
||||
old->partialKeyLen - (partialKeyIndex + 1));
|
||||
old->partialKeyLen -= partialKeyIndex + 1;
|
||||
|
||||
// We would consider decreasing capacity here, but we can't invalidate
|
||||
// old since it's not on the search path. setOldestVersion will clean it
|
||||
// up.
|
||||
}
|
||||
key = key.subspan(partialKeyIndex, key.size() - partialKeyIndex);
|
||||
}
|
||||
|
||||
// Returns a pointer to the newly inserted node. Caller must set
|
||||
// `entryPresent`, and `entry` fields. All nodes along the search path of the
|
||||
// result will have `maxVersion` set to `writeVersion` as a postcondition. Nodes
|
||||
// along the search path may be invalidated.
|
||||
// Returns a pointer the pointer to the newly inserted node in the tree. Caller
|
||||
// must set `entryPresent`, and `entry` fields. All nodes along the search path
|
||||
// of the result will have `maxVersion` set to `writeVersion` as a
|
||||
// postcondition. Nodes along the search path may be invalidated.
|
||||
[[nodiscard]]
|
||||
Node *insert(Node **self, std::span<const uint8_t> key,
|
||||
InternalVersionT writeVersion, WriteContext *tls,
|
||||
ConflictSet::Impl *impl) {
|
||||
Node **insert(Node **self, std::span<const uint8_t> key,
|
||||
InternalVersionT writeVersion, WriteContext *tls,
|
||||
ConflictSet::Impl *impl) {
|
||||
|
||||
if ((*self)->partialKeyLen > 0) {
|
||||
consumePartialKey(*self, key, writeVersion, tls);
|
||||
}
|
||||
assert(maxVersion(*self, impl) <= writeVersion);
|
||||
setMaxVersion(*self, impl, writeVersion);
|
||||
|
||||
for (;; ++tls->accum.insert_iterations) {
|
||||
|
||||
if (key.size() == 0) {
|
||||
return *self;
|
||||
}
|
||||
|
||||
auto &child = getOrCreateChild(*self, key.front(), writeVersion, tls);
|
||||
if (!child) {
|
||||
child = tls->allocate<Node0>(key.size() - 1);
|
||||
child->numChildren = 0;
|
||||
child->entryPresent = false;
|
||||
child->partialKeyLen = key.size() - 1;
|
||||
child->parent = *self;
|
||||
child->parentsIndex = key.front();
|
||||
setMaxVersion(child, impl, writeVersion);
|
||||
memcpy(child->partialKey(), key.data() + 1, child->partialKeyLen);
|
||||
return child;
|
||||
}
|
||||
|
||||
self = &child;
|
||||
key = key.subspan(1, key.size() - 1);
|
||||
|
||||
if ((*self)->partialKeyLen > 0) {
|
||||
consumePartialKey(*self, key, writeVersion, tls);
|
||||
assert(maxVersion(*self, impl) <= writeVersion);
|
||||
setMaxVersion(*self, impl, writeVersion);
|
||||
}
|
||||
for (; key.size() != 0; ++tls->accum.insert_iterations) {
|
||||
self = &getOrCreateChild(*self, key, writeVersion, tls);
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
void destroyTree(Node *root) {
|
||||
@@ -2996,7 +2839,7 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
|
||||
InternalVersionT writeVersion, WriteContext *tls,
|
||||
ConflictSet::Impl *impl) {
|
||||
++tls->accum.point_writes;
|
||||
auto *n = insert(&root, key, writeVersion, tls, impl);
|
||||
auto *n = *insert(&root, key, writeVersion, tls, impl);
|
||||
if (!n->entryPresent) {
|
||||
++tls->accum.entries_inserted;
|
||||
auto *p = nextLogical(n);
|
||||
@@ -3063,41 +2906,16 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
}
|
||||
++tls->accum.range_writes;
|
||||
const bool beginIsPrefix = lcp == int(begin.size());
|
||||
auto remaining = begin.subspan(0, lcp);
|
||||
|
||||
auto *n = root;
|
||||
Node **useAsRoot =
|
||||
insert(&root, begin.subspan(0, lcp), writeVersion, tls, impl);
|
||||
|
||||
for (;;) {
|
||||
if (int(remaining.size()) <= n->partialKeyLen) {
|
||||
break;
|
||||
}
|
||||
int i = longestCommonPrefix(n->partialKey(), remaining.data(),
|
||||
n->partialKeyLen);
|
||||
if (i != n->partialKeyLen) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto *child = getChild(n, remaining[n->partialKeyLen]);
|
||||
if (child == nullptr) {
|
||||
break;
|
||||
}
|
||||
|
||||
assert(maxVersion(n, impl) <= writeVersion);
|
||||
setMaxVersion(n, impl, writeVersion);
|
||||
|
||||
remaining = remaining.subspan(n->partialKeyLen + 1,
|
||||
remaining.size() - (n->partialKeyLen + 1));
|
||||
n = child;
|
||||
}
|
||||
|
||||
Node **useAsRoot = &getInTree(n, impl);
|
||||
|
||||
int consumed = lcp - remaining.size();
|
||||
int consumed = lcp;
|
||||
|
||||
begin = begin.subspan(consumed, begin.size() - consumed);
|
||||
end = end.subspan(consumed, end.size() - consumed);
|
||||
|
||||
auto *beginNode = insert(useAsRoot, begin, writeVersion, tls, impl);
|
||||
auto *beginNode = *insert(useAsRoot, begin, writeVersion, tls, impl);
|
||||
|
||||
const bool insertedBegin = !beginNode->entryPresent;
|
||||
|
||||
@@ -3114,7 +2932,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
assert(writeVersion >= beginNode->entry.pointVersion);
|
||||
beginNode->entry.pointVersion = writeVersion;
|
||||
|
||||
auto *endNode = insert(useAsRoot, end, writeVersion, tls, impl);
|
||||
auto *endNode = *insert(useAsRoot, end, writeVersion, tls, impl);
|
||||
|
||||
const bool insertedEnd = !endNode->entryPresent;
|
||||
|
||||
@@ -3132,7 +2950,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
if (beginIsPrefix && insertedEnd) {
|
||||
// beginNode may have been invalidated when inserting end. TODO can we do
|
||||
// better?
|
||||
beginNode = insert(useAsRoot, begin, writeVersion, tls, impl);
|
||||
beginNode = *insert(useAsRoot, begin, writeVersion, tls, impl);
|
||||
assert(beginNode->entryPresent);
|
||||
}
|
||||
|
||||
@@ -3246,6 +3064,8 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
InternalVersionT::zero = tls.zero = oldestVersion;
|
||||
|
||||
assert(writeVersion >= newestVersionFullPrecision);
|
||||
assert(tls.accum.entries_erased == 0);
|
||||
assert(tls.accum.entries_inserted == 0);
|
||||
|
||||
if (oldestExtantVersion < writeVersion - kMaxCorrectVersionWindow)
|
||||
[[unlikely]] {
|
||||
@@ -3273,15 +3093,19 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
auto begin = std::span<const uint8_t>(w.begin.p, w.begin.len);
|
||||
auto end = std::span<const uint8_t>(w.end.p, w.end.len);
|
||||
if (w.end.len > 0) {
|
||||
keyUpdates += 3;
|
||||
addWriteRange(root, begin, end, InternalVersionT(writeVersion), &tls,
|
||||
this);
|
||||
} else {
|
||||
keyUpdates += 2;
|
||||
addPointWrite(root, begin, InternalVersionT(writeVersion), &tls, this);
|
||||
}
|
||||
}
|
||||
|
||||
// Run gc at least 200% the rate we're inserting entries
|
||||
keyUpdates +=
|
||||
std::max<int64_t>(tls.accum.entries_inserted - tls.accum.entries_erased,
|
||||
0) *
|
||||
2;
|
||||
|
||||
memory_bytes.set(totalBytes);
|
||||
point_writes_total.add(tls.accum.point_writes);
|
||||
range_writes_total.add(tls.accum.range_writes);
|
||||
|
||||
Vendored
+10
-7
@@ -117,15 +117,18 @@ pipeline {
|
||||
}
|
||||
}
|
||||
steps {
|
||||
script {
|
||||
filter_args = "-f ConflictSet.cpp -f LongestCommonPrefix.h"
|
||||
}
|
||||
CleanBuildAndTest("-DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_FLAGS=--coverage -DCMAKE_CXX_FLAGS=--coverage -DCMAKE_BUILD_TYPE=Debug -DDISABLE_TSAN=ON")
|
||||
sh '''
|
||||
gcovr -f ConflictSet.cpp --cobertura > build/coverage.xml
|
||||
'''
|
||||
sh """
|
||||
gcovr ${filter_args} --cobertura > build/coverage.xml
|
||||
"""
|
||||
recordCoverage qualityGates: [[criticality: 'NOTE', metric: 'MODULE']], tools: [[parser: 'COBERTURA', pattern: 'build/coverage.xml']]
|
||||
sh '''
|
||||
gcovr -f ConflictSet.cpp
|
||||
gcovr -f ConflictSet.cpp --fail-under-line 100 > /dev/null
|
||||
'''
|
||||
sh """
|
||||
gcovr ${filter_args}
|
||||
gcovr ${filter_args} --fail-under-line 100 > /dev/null
|
||||
"""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,185 @@
|
||||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <bit>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef HAS_AVX
|
||||
#include <immintrin.h>
|
||||
#elif HAS_ARM_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#ifndef __SANITIZE_THREAD__
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(thread_sanitizer)
|
||||
#define __SANITIZE_THREAD__
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(HAS_AVX) || defined(HAS_ARM_NEON)
|
||||
constexpr int kStride = 64;
|
||||
#else
|
||||
constexpr int kStride = 16;
|
||||
#endif
|
||||
|
||||
constexpr int kUnrollFactor = 4;
|
||||
|
||||
inline bool compareStride(const uint8_t *ap, const uint8_t *bp) {
|
||||
#if defined(HAS_ARM_NEON)
|
||||
static_assert(kStride == 64);
|
||||
uint8x16_t x[4]; // GCOVR_EXCL_LINE
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
x[i] = vceqq_u8(vld1q_u8(ap + i * 16), vld1q_u8(bp + i * 16));
|
||||
}
|
||||
auto results = vreinterpretq_u16_u8(
|
||||
vandq_u8(vandq_u8(x[0], x[1]), vandq_u8(x[2], x[3])));
|
||||
bool eq = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) ==
|
||||
uint64_t(-1);
|
||||
#elif defined(HAS_AVX)
|
||||
static_assert(kStride == 64);
|
||||
__m128i x[4]; // GCOVR_EXCL_LINE
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
x[i] = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)(ap + i * 16)),
|
||||
_mm_loadu_si128((__m128i *)(bp + i * 16)));
|
||||
}
|
||||
auto eq =
|
||||
_mm_movemask_epi8(_mm_and_si128(_mm_and_si128(x[0], x[1]),
|
||||
_mm_and_si128(x[2], x[3]))) == 0xffff;
|
||||
#else
|
||||
// Hope it gets vectorized
|
||||
auto eq = memcmp(ap, bp, kStride) == 0;
|
||||
#endif
|
||||
return eq;
|
||||
}
|
||||
|
||||
// Precondition: ap[:kStride] != bp[:kStride]
|
||||
inline int firstNeqStride(const uint8_t *ap, const uint8_t *bp) {
|
||||
#if defined(HAS_AVX)
|
||||
static_assert(kStride == 64);
|
||||
uint64_t c[kStride / 16]; // GCOVR_EXCL_LINE
|
||||
for (int i = 0; i < kStride; i += 16) {
|
||||
const auto a = _mm_loadu_si128((__m128i *)(ap + i));
|
||||
const auto b = _mm_loadu_si128((__m128i *)(bp + i));
|
||||
const auto compared = _mm_cmpeq_epi8(a, b);
|
||||
c[i / 16] = _mm_movemask_epi8(compared) & 0xffff;
|
||||
}
|
||||
return std::countr_zero(~(c[0] | c[1] << 16 | c[2] << 32 | c[3] << 48));
|
||||
#elif defined(HAS_ARM_NEON)
|
||||
static_assert(kStride == 64);
|
||||
for (int i = 0; i < kStride; i += 16) {
|
||||
// 0xff for each match
|
||||
uint16x8_t results =
|
||||
vreinterpretq_u16_u8(vceqq_u8(vld1q_u8(ap + i), vld1q_u8(bp + i)));
|
||||
// 0xf for each mismatch
|
||||
uint64_t bitfield =
|
||||
~vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0);
|
||||
if (bitfield) {
|
||||
return i + (std::countr_zero(bitfield) >> 2);
|
||||
}
|
||||
}
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
#else
|
||||
int i = 0;
|
||||
for (; i < kStride - 1; ++i) {
|
||||
if (*ap++ != *bp++) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
#endif
|
||||
}
|
||||
|
||||
// This gets covered in local development
|
||||
// GCOVR_EXCL_START
|
||||
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
|
||||
__attribute__((target("avx512f,avx512bw"))) inline int
|
||||
longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
|
||||
int i = 0;
|
||||
int end = cl & ~63;
|
||||
while (i < end) {
|
||||
const uint64_t eq =
|
||||
_mm512_cmpeq_epi8_mask(_mm512_loadu_epi8(ap), _mm512_loadu_epi8(bp));
|
||||
if (eq != uint64_t(-1)) {
|
||||
return i + std::countr_one(eq);
|
||||
}
|
||||
i += 64;
|
||||
ap += 64;
|
||||
bp += 64;
|
||||
}
|
||||
if (i < cl) {
|
||||
const uint64_t mask = (uint64_t(1) << (cl - i)) - 1;
|
||||
const uint64_t eq = _mm512_cmpeq_epi8_mask(
|
||||
_mm512_maskz_loadu_epi8(mask, ap), _mm512_maskz_loadu_epi8(mask, bp));
|
||||
return i + std::countr_one(eq & mask);
|
||||
}
|
||||
assert(i == cl);
|
||||
return i;
|
||||
}
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
// GCOVR_EXCL_STOP
|
||||
|
||||
inline int
|
||||
longestCommonPrefix(const uint8_t *ap, const uint8_t *bp, int cl) {
|
||||
if (!(cl >= 0)) {
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
int end;
|
||||
|
||||
// kStride * kUnrollCount at a time
|
||||
end = cl & ~(kStride * kUnrollFactor - 1);
|
||||
while (i < end) {
|
||||
for (int j = 0; j < kUnrollFactor; ++j) {
|
||||
if (!compareStride(ap, bp)) {
|
||||
return i + firstNeqStride(ap, bp);
|
||||
}
|
||||
i += kStride;
|
||||
ap += kStride;
|
||||
bp += kStride;
|
||||
}
|
||||
}
|
||||
|
||||
// kStride at a time
|
||||
end = cl & ~(kStride - 1);
|
||||
while (i < end) {
|
||||
if (!compareStride(ap, bp)) {
|
||||
return i + firstNeqStride(ap, bp);
|
||||
}
|
||||
i += kStride;
|
||||
ap += kStride;
|
||||
bp += kStride;
|
||||
}
|
||||
|
||||
// word at a time
|
||||
end = cl & ~(sizeof(uint64_t) - 1);
|
||||
while (i < end) {
|
||||
uint64_t a; // GCOVR_EXCL_LINE
|
||||
uint64_t b; // GCOVR_EXCL_LINE
|
||||
memcpy(&a, ap, 8);
|
||||
memcpy(&b, bp, 8);
|
||||
const auto mismatched = a ^ b;
|
||||
if (mismatched) {
|
||||
return i + std::countr_zero(mismatched) / 8;
|
||||
}
|
||||
i += 8;
|
||||
ap += 8;
|
||||
bp += 8;
|
||||
}
|
||||
|
||||
// byte at a time
|
||||
while (i < cl) {
|
||||
if (*ap != *bp) {
|
||||
break;
|
||||
}
|
||||
++ap;
|
||||
++bp;
|
||||
++i;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
@@ -24,15 +24,16 @@ Hardware for all benchmarks is an AMD Ryzen 9 7900 with (2x32GB) 5600MT/s CL28-3
|
||||
|
||||
| ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark
|
||||
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|
||||
| 12.42 | 80,500,398.66 | 0.8% | 180.38 | 61.57 | 2.930 | 41.51 | 0.4% | 0.01 | `point reads`
|
||||
| 15.17 | 65,917,580.99 | 0.2% | 279.47 | 74.95 | 3.729 | 55.54 | 0.3% | 0.01 | `prefix reads`
|
||||
| 38.16 | 26,202,393.91 | 0.1% | 803.07 | 189.13 | 4.246 | 141.68 | 0.2% | 0.01 | `range reads`
|
||||
| 20.20 | 49,504,615.44 | 0.4% | 363.00 | 100.35 | 3.617 | 49.81 | 0.3% | 0.01 | `point writes`
|
||||
| 41.99 | 23,816,559.99 | 0.3% | 799.27 | 209.63 | 3.813 | 154.32 | 0.1% | 0.01 | `prefix writes`
|
||||
| 46.28 | 21,607,605.88 | 1.5% | 953.79 | 231.47 | 4.121 | 168.34 | 0.0% | 0.01 | `range writes`
|
||||
| 80.99 | 12,347,449.98 | 0.9% | 1,501.97 | 406.50 | 3.695 | 281.89 | 0.1% | 0.01 | `monotonic increasing point writes`
|
||||
| 318,010.00 | 3,144.56 | 1.0% | 3,994,511.50 | 1,657,831.50 | 2.409 | 805,969.50 | 0.0% | 0.01 | `worst case for radix tree`
|
||||
| 75.85 | 13,183,612.56 | 0.5% | 1,590.01 | 385.64 | 4.123 | 258.00 | 0.0% | 0.01 | `create and destroy`
|
||||
| 10.80 | 92,600,541.52 | 0.6% | 180.38 | 54.49 | 3.310 | 41.51 | 0.4% | 0.01 | `point reads`
|
||||
| 15.00 | 66,687,691.68 | 0.4% | 278.44 | 76.44 | 3.642 | 55.56 | 0.3% | 0.01 | `prefix reads`
|
||||
| 36.81 | 27,163,394.61 | 0.4% | 795.06 | 187.91 | 4.231 | 142.67 | 0.2% | 0.01 | `range reads`
|
||||
| 18.14 | 55,137,674.01 | 1.2% | 338.19 | 92.86 | 3.642 | 42.81 | 0.4% | 0.01 | `point writes`
|
||||
| 33.19 | 30,127,119.71 | 0.1% | 681.03 | 170.05 | 4.005 | 98.68 | 0.2% | 0.01 | `prefix writes`
|
||||
| 37.37 | 26,759,432.70 | 1.9% | 779.70 | 195.45 | 3.989 | 114.21 | 0.0% | 0.01 | `range writes`
|
||||
| 74.36 | 13,448,582.47 | 1.9% | 1,425.68 | 389.08 | 3.664 | 258.88 | 0.1% | 0.01 | `monotonic increasing point writes`
|
||||
| 316,928.00 | 3,155.29 | 1.5% | 3,992,986.00 | 1,699,813.00 | 2.349 | 806,226.50 | 0.0% | 0.01 | `worst case for radix tree`
|
||||
| 75.26 | 13,286,517.16 | 0.5% | 1,590.01 | 386.67 | 4.112 | 258.00 | 0.0% | 0.01 | `create and destroy`
|
||||
|
||||
|
||||
# "Real data" test
|
||||
|
||||
@@ -47,7 +48,7 @@ Check: 4.47891 seconds, 364.05 MB/s, Add: 4.55599 seconds, 123.058 MB/s, Gc rati
|
||||
## radix tree
|
||||
|
||||
```
|
||||
Check: 0.963721 seconds, 1691.93 MB/s, Add: 1.3288 seconds, 421.924 MB/s, Gc ratio: 42.8819%
|
||||
Check: 0.910234 seconds, 1791.35 MB/s, Add: 1.25908 seconds, 445.287 MB/s, Gc ratio: 44.0415%
|
||||
```
|
||||
|
||||
## hash table
|
||||
|
||||
@@ -76,6 +76,14 @@ void workload(weaselab::ConflictSet *cs) {
|
||||
} else {
|
||||
w.begin.len = k.size();
|
||||
cs->addWrites(&w, 1, version);
|
||||
int64_t beginN = version - kWindowSize + rand() % kWindowSize;
|
||||
auto b = numToKey(beginN);
|
||||
auto e = numToKey(beginN + 1000);
|
||||
w.begin.p = b.data();
|
||||
w.begin.len = b.size();
|
||||
w.end.p = e.data();
|
||||
w.end.len = e.size();
|
||||
cs->addWrites(&w, 1, version);
|
||||
}
|
||||
}
|
||||
// GC
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
||||
:ゥゥゥゥゥゥゥゥゥゥ:::::
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user