3 Commits

24 changed files with 267 additions and 394 deletions

View File

@@ -248,19 +248,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
add_test(NAME conflict_set_blackbox_${hash} COMMAND driver ${TEST})
endforeach()
find_program(VALGRIND_EXE valgrind)
if(VALGRIND_EXE AND NOT CMAKE_CROSSCOMPILING)
list(LENGTH CORPUS_TESTS len)
math(EXPR last "${len} - 1")
set(partition_size 100)
foreach(i RANGE 0 ${last} ${partition_size})
list(SUBLIST CORPUS_TESTS ${i} ${partition_size} partition)
add_test(NAME conflict_set_blackbox_valgrind_${i}
COMMAND ${VALGRIND_EXE} --error-exitcode=99 --
$<TARGET_FILE:driver> ${partition})
endforeach()
endif()
# scripted tests. Written manually to fill in anything libfuzzer couldn't
# find.
if(NOT CMAKE_CROSSCOMPILING)
@@ -281,14 +268,19 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
${Python3_EXECUTABLE}
${CMAKE_CURRENT_SOURCE_DIR}/test_conflict_set.py test ${TEST}
--build-dir ${CMAKE_CURRENT_BINARY_DIR})
if(VALGRIND_EXE AND NOT CMAKE_CROSSCOMPILING)
add_test(
NAME script_test_${TEST}_valgrind
COMMAND
${VALGRIND_EXE} ${Python3_EXECUTABLE}
${CMAKE_CURRENT_SOURCE_DIR}/test_conflict_set.py test ${TEST}
--build-dir ${CMAKE_CURRENT_BINARY_DIR})
endif()
endforeach()
endif()
find_program(VALGRIND_EXE valgrind)
if(VALGRIND_EXE AND NOT CMAKE_CROSSCOMPILING)
list(LENGTH CORPUS_TESTS len)
math(EXPR last "${len} - 1")
set(partition_size 100)
foreach(i RANGE 0 ${last} ${partition_size})
list(SUBLIST CORPUS_TESTS ${i} ${partition_size} partition)
add_test(NAME conflict_set_blackbox_valgrind_${i}
COMMAND ${VALGRIND_EXE} --error-exitcode=99 --
$<TARGET_FILE:driver> ${partition})
endforeach()
endif()
@@ -358,11 +350,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
set_target_properties(server_bench PROPERTIES SKIP_BUILD_RPATH ON)
add_executable(interleaving_test InterleavingTest.cpp)
# work around lack of musttail for gcc
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_options(interleaving_test PRIVATE -Og
-foptimize-sibling-calls)
endif()
target_link_libraries(interleaving_test PRIVATE nanobench)
endif()

View File

@@ -203,60 +203,6 @@ enum Type : int8_t {
template <class T> struct BoundedFreeListAllocator;
struct TaggedNodePointer {
TaggedNodePointer() = default;
operator struct Node *() { return (struct Node *)(uintptr_t)p; }
operator struct Node0 *() {
assert(t == Type_Node0);
return (struct Node0 *)(uintptr_t)p;
}
operator struct Node3 *() {
assert(t == Type_Node3);
return (struct Node3 *)(uintptr_t)p;
}
operator struct Node16 *() {
assert(t == Type_Node16);
return (struct Node16 *)(uintptr_t)p;
}
operator struct Node48 *() {
assert(t == Type_Node48);
return (struct Node48 *)(uintptr_t)p;
}
operator struct Node256 *() {
assert(t == Type_Node256);
return (struct Node256 *)(uintptr_t)p;
}
/*implicit*/ TaggedNodePointer(std::nullptr_t) : p(0), t(Type_Node0) {}
/*implicit*/ TaggedNodePointer(Node0 *x)
: TaggedNodePointer((struct Node *)x, Type_Node0) {}
/*implicit*/ TaggedNodePointer(Node3 *x)
: TaggedNodePointer((struct Node *)x, Type_Node3) {}
/*implicit*/ TaggedNodePointer(Node16 *x)
: TaggedNodePointer((struct Node *)x, Type_Node16) {}
/*implicit*/ TaggedNodePointer(Node48 *x)
: TaggedNodePointer((struct Node *)x, Type_Node48) {}
/*implicit*/ TaggedNodePointer(Node256 *x)
: TaggedNodePointer((struct Node *)x, Type_Node256) {}
bool operator!=(std::nullptr_t) { return p != 0 || t != Type_Node0; }
bool operator==(std::nullptr_t) { return p == 0 && t == Type_Node0; }
bool operator==(const TaggedNodePointer &) const = default;
bool operator==(Node *n) const { return (uintptr_t)n == p; }
Node *operator->() { return (Node *)(uintptr_t)p; }
Type getType() { return t; }
TaggedNodePointer(const TaggedNodePointer &) = default;
TaggedNodePointer &operator=(const TaggedNodePointer &) = default;
/*implicit*/ TaggedNodePointer(Node *n);
private:
TaggedNodePointer(struct Node *p, Type t) : p((uintptr_t)p), t(t) {
assume(p != 0);
}
uintptr_t p : 56;
Type t : 8;
};
struct Node {
/* begin section that's copied to the next node */
@@ -282,9 +228,6 @@ private:
int32_t partialKeyCapacity;
};
TaggedNodePointer::TaggedNodePointer(Node *n)
: TaggedNodePointer(n, n->getType()) {}
constexpr int kNodeCopyBegin = offsetof(Node, entry);
constexpr int kNodeCopySize =
offsetof(Node, parentsIndex) + sizeof(Node::parentsIndex) - kNodeCopyBegin;
@@ -308,7 +251,7 @@ struct Node3 : Node {
constexpr static auto kMaxNodes = 3;
constexpr static auto kType = Type_Node3;
TaggedNodePointer children[kMaxNodes];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
// Sorted
uint8_t index[kMaxNodes];
@@ -324,7 +267,7 @@ struct Node16 : Node {
constexpr static auto kType = Type_Node16;
constexpr static auto kMaxNodes = 16;
TaggedNodePointer children[kMaxNodes];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
// Sorted
uint8_t index[kMaxNodes];
@@ -345,7 +288,7 @@ struct Node48 : Node {
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
BitSet bitSet;
TaggedNodePointer children[kMaxNodes];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
uint8_t reverseIndex[kMaxNodes];
@@ -367,7 +310,7 @@ struct Node256 : Node {
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
BitSet bitSet;
TaggedNodePointer children[kMaxNodes];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
@@ -823,8 +766,6 @@ private:
int getNodeIndex(Node3 *self, uint8_t index) {
Node3 *n = (Node3 *)self;
assume(n->numChildren >= 1);
assume(n->numChildren <= 3);
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
@@ -833,18 +774,6 @@ int getNodeIndex(Node3 *self, uint8_t index) {
return -1;
}
int getNodeIndexExists(Node3 *self, uint8_t index) {
Node3 *n = (Node3 *)self;
assume(n->numChildren >= 1);
assume(n->numChildren <= 3);
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
}
}
__builtin_unreachable(); // GCOVR_EXCL_LINE
}
int getNodeIndex(Node16 *self, uint8_t index) {
#ifdef HAS_AVX
@@ -905,66 +834,27 @@ int getNodeIndex(Node16 *self, uint8_t index) {
#endif
}
int getNodeIndexExists(Node16 *self, uint8_t index) {
#ifdef HAS_AVX
__m128i key_vec = _mm_set1_epi8(index);
__m128i indices;
memcpy(&indices, self->index, Node16::kMaxNodes);
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
uint32_t mask = (1 << self->numChildren) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
assume(bitfield != 0);
return std::countr_zero(bitfield);
#elif defined(HAS_ARM_NEON)
// Based on
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
uint8x16_t indices;
memcpy(&indices, self->index, Node16::kMaxNodes);
// 0xff for each match
uint16x8_t results =
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
assume(self->numChildren <= Node16::kMaxNodes);
uint64_t mask = self->numChildren == 16
? uint64_t(-1)
: (uint64_t(1) << (self->numChildren * 4)) - 1;
// 0xf for each match in valid range
uint64_t bitfield =
vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) & mask;
assume(bitfield != 0);
return std::countr_zero(bitfield) / 4;
#else
for (int i = 0; i < self->numChildren; ++i) {
if (self->index[i] == index) {
return i;
}
}
__builtin_unreachable(); // GCOVR_EXCL_LINE
#endif
}
// Precondition - an entry for index must exist in the node
TaggedNodePointer &getChildExists(Node3 *self, uint8_t index) {
return self->children[getNodeIndexExists(self, index)];
Node *&getChildExists(Node3 *self, uint8_t index) {
return self->children[getNodeIndex(self, index)];
}
// Precondition - an entry for index must exist in the node
TaggedNodePointer &getChildExists(Node16 *self, uint8_t index) {
return self->children[getNodeIndexExists(self, index)];
Node *&getChildExists(Node16 *self, uint8_t index) {
return self->children[getNodeIndex(self, index)];
}
// Precondition - an entry for index must exist in the node
TaggedNodePointer &getChildExists(Node48 *self, uint8_t index) {
Node *&getChildExists(Node48 *self, uint8_t index) {
assert(self->bitSet.test(index));
return self->children[self->index[index]];
}
// Precondition - an entry for index must exist in the node
TaggedNodePointer &getChildExists(Node256 *self, uint8_t index) {
Node *&getChildExists(Node256 *self, uint8_t index) {
assert(self->bitSet.test(index));
return self->children[index];
}
// Precondition - an entry for index must exist in the node
TaggedNodePointer &getChildExists(Node *self, uint8_t index) {
Node *&getChildExists(Node *self, uint8_t index) {
switch (self->getType()) {
case Type_Node0: // GCOVR_EXCL_LINE
__builtin_unreachable(); // GCOVR_EXCL_LINE
@@ -995,12 +885,12 @@ InternalVersionT maxVersion(Node *n) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
int i = getNodeIndexExists(n3, index);
int i = getNodeIndex(n3, index);
return n3->childMaxVersion[i];
}
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
int i = getNodeIndexExists(n16, index);
int i = getNodeIndex(n16, index);
return n16->childMaxVersion[i];
}
case Type_Node48: {
@@ -1028,12 +918,12 @@ InternalVersionT exchangeMaxVersion(Node *n, InternalVersionT newMax) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
int i = getNodeIndexExists(n3, index);
int i = getNodeIndex(n3, index);
return std::exchange(n3->childMaxVersion[i], newMax);
}
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
int i = getNodeIndexExists(n16, index);
int i = getNodeIndex(n16, index);
return std::exchange(n16->childMaxVersion[i], newMax);
}
case Type_Node48: {
@@ -1062,13 +952,13 @@ void setMaxVersion(Node *n, InternalVersionT newMax) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
int i = getNodeIndexExists(n3, index);
int i = getNodeIndex(n3, index);
n3->childMaxVersion[i] = newMax;
return;
}
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
int i = getNodeIndexExists(n16, index);
int i = getNodeIndex(n16, index);
n16->childMaxVersion[i] = newMax;
return;
}
@@ -1095,7 +985,7 @@ void setMaxVersion(Node *n, InternalVersionT newMax) {
}
}
TaggedNodePointer &getInTree(Node *n, ConflictSet::Impl *);
Node *&getInTree(Node *n, ConflictSet::Impl *);
Node *getChild(Node0 *, uint8_t) { return nullptr; }
Node *getChild(Node3 *self, uint8_t index) {
@@ -1132,7 +1022,6 @@ Node *getChild(Node *self, uint8_t index) {
struct ChildAndMaxVersion {
Node *child;
InternalVersionT maxVersion;
Type childType;
};
ChildAndMaxVersion getChildAndMaxVersion(Node0 *, uint8_t) { return {}; }
@@ -1141,28 +1030,24 @@ ChildAndMaxVersion getChildAndMaxVersion(Node3 *self, uint8_t index) {
if (i < 0) {
return {};
}
return {self->children[i], self->childMaxVersion[i],
self->children[i].getType()};
return {self->children[i], self->childMaxVersion[i]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) {
int i = getNodeIndex(self, index);
if (i < 0) {
return {};
}
return {self->children[i], self->childMaxVersion[i],
self->children[i].getType()};
return {self->children[i], self->childMaxVersion[i]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) {
int i = self->index[index];
if (i < 0) {
return {};
}
return {self->children[i], self->childMaxVersion[i],
self->children[i].getType()};
return {self->children[i], self->childMaxVersion[i]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node256 *self, uint8_t index) {
return {self->children[index], self->childMaxVersion[index],
self->children[index].getType()};
return {self->children[index], self->childMaxVersion[index]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
@@ -1185,8 +1070,6 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
Node *getChildGeq(Node0 *, int) { return nullptr; }
Node *getChildGeq(Node3 *n, int child) {
assume(n->numChildren >= 1);
assume(n->numChildren <= 3);
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] >= child) {
return n->children[i];
@@ -1307,15 +1190,14 @@ Node *getFirstChildExists(Node *self) {
}
}
void consumePartialKeyFull(TaggedNodePointer &self,
std::span<const uint8_t> &key,
void consumePartialKeyFull(Node *&self, std::span<const uint8_t> &key,
InternalVersionT writeVersion, WriteContext *tls) {
// Handle an existing partial key
int commonLen = std::min<int>(self->partialKeyLen, key.size());
int partialKeyIndex =
longestCommonPrefix(self->partialKey(), key.data(), commonLen);
if (partialKeyIndex < self->partialKeyLen) {
Node *old = self;
auto *old = self;
// Since root cannot have a partial key
assert(old->parent != nullptr);
InternalVersionT oldMaxVersion = exchangeMaxVersion(old, writeVersion);
@@ -1353,7 +1235,7 @@ void consumePartialKeyFull(TaggedNodePointer &self,
// Consume any partial key of `self`, and update `self` and
// `key` such that `self` is along the search path of `key`
inline __attribute__((always_inline)) void
consumePartialKey(TaggedNodePointer &self, std::span<const uint8_t> &key,
consumePartialKey(Node *&self, std::span<const uint8_t> &key,
InternalVersionT writeVersion, WriteContext *tls) {
if (self->partialKeyLen > 0) {
consumePartialKeyFull(self, key, writeVersion, tls);
@@ -1364,10 +1246,8 @@ consumePartialKey(TaggedNodePointer &self, std::span<const uint8_t> &key,
// such that the search path of the result + key is the same as the search path
// of self + key before the call. Creates a node if necessary. Updates
// `maxVersion` for result.
TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self,
std::span<const uint8_t> &key,
InternalVersionT newMaxVersion,
WriteContext *tls) {
Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
InternalVersionT newMaxVersion, WriteContext *tls) {
int index = key.front();
key = key.subspan(1, key.size() - 1);
@@ -1669,6 +1549,7 @@ __attribute__((target("avx512f"))) void rezero16(InternalVersionT *vs,
_mm512_sub_epi32(_mm512_loadu_epi32(vs), zvec), _mm512_setzero_epi32());
_mm512_mask_storeu_epi32(vs, m, zvec);
}
__attribute__((target("default")))
#endif
@@ -1725,11 +1606,10 @@ void rezero(Node *n, InternalVersionT z) {
}
#endif
void mergeWithChild(TaggedNodePointer &self, WriteContext *tls,
ConflictSet::Impl *impl, Node *&dontInvalidate,
Node3 *self3) {
void mergeWithChild(Node *&self, WriteContext *tls, ConflictSet::Impl *impl,
Node *&dontInvalidate, Node3 *self3) {
assert(!self3->entryPresent);
Node *child = self3->children[0];
auto *child = self3->children[0];
int minCapacity = self3->partialKeyLen + 1 + child->partialKeyLen;
if (minCapacity > child->getCapacity()) {
@@ -1978,7 +1858,7 @@ bool checkPointRead(Node *n, const std::span<const uint8_t> key,
goto downLeftSpine;
}
auto [child, maxV, childT] = getChildAndMaxVersion(n, remaining[0]);
auto [child, maxV] = getChildAndMaxVersion(n, remaining[0]);
if (child == nullptr) {
auto c = getChildGeq(n, remaining[0]);
if (c != nullptr) {
@@ -2048,7 +1928,7 @@ bool checkPrefixRead(Node *n, const std::span<const uint8_t> key,
return maxVersion(n) <= readVersion;
}
auto [child, maxV, childT] = getChildAndMaxVersion(n, remaining[0]);
auto [child, maxV] = getChildAndMaxVersion(n, remaining[0]);
if (child == nullptr) {
auto c = getChildGeq(n, remaining[0]);
if (c != nullptr) {
@@ -2321,7 +2201,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
if (!mask) {
return true;
}
Node *child = self->children[std::countr_zero(mask)];
auto *child = self->children[std::countr_zero(mask)];
const bool firstRangeOk =
!child->entryPresent || child->entry.rangeVersion <= readVersion;
uint32_t compared = 0;
@@ -2396,7 +2276,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
if (!mask) {
return true;
}
Node *child = self->children[std::countr_zero(mask)];
auto *child = self->children[std::countr_zero(mask)];
const bool firstRangeOk =
!child->entryPresent || child->entry.rangeVersion <= readVersion;
@@ -2441,7 +2321,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
{
int c = self->bitSet.firstSetGeq(begin + 1);
if (c >= 0 && c < end) {
Node *child = self->children[self->index[c]];
auto *child = self->children[self->index[c]];
if (child->entryPresent && child->entry.rangeVersion > readVersion) {
return false;
}
@@ -2475,7 +2355,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
{
int c = self->bitSet.firstSetGeq(begin + 1);
if (c >= 0 && c < end) {
Node *child = self->children[c];
auto *child = self->children[c];
if (child->entryPresent && child->entry.rangeVersion > readVersion) {
return false;
}
@@ -2536,7 +2416,6 @@ checkMaxBetweenExclusive(Node *n, int begin, int end,
}
__attribute__((target("default")))
#endif
bool checkMaxBetweenExclusive(Node *n, int begin, int end,
InternalVersionT readVersion, ReadContext *tls) {
return checkMaxBetweenExclusiveImpl<false>(n, begin, end, readVersion, tls);
@@ -2647,7 +2526,7 @@ bool checkRangeLeftSide(Node *n, std::span<const uint8_t> key, int prefixLen,
}
}
auto [child, maxV, childT] = getChildAndMaxVersion(n, remaining[0]);
auto [child, maxV] = getChildAndMaxVersion(n, remaining[0]);
if (child == nullptr) {
auto c = getChildGeq(n, remaining[0]);
if (c != nullptr) {
@@ -2835,7 +2714,7 @@ bool checkRangeRead(Node *n, std::span<const uint8_t> begin,
if (remaining.size() == 0) {
break;
}
auto [child, v, childT] = getChildAndMaxVersion(n, remaining[0]);
auto [child, v] = getChildAndMaxVersion(n, remaining[0]);
if (child == nullptr) {
break;
}
@@ -2901,10 +2780,8 @@ checkMaxBetweenExclusiveImpl<true>(Node *n, int begin, int end,
// of the result will have `maxVersion` set to `writeVersion` as a
// postcondition. Nodes along the search path may be invalidated. Callers must
// ensure that the max version of the self argument is updated.
[[nodiscard]] TaggedNodePointer *insert(TaggedNodePointer *self,
std::span<const uint8_t> key,
InternalVersionT writeVersion,
WriteContext *tls) {
[[nodiscard]] Node **insert(Node **self, std::span<const uint8_t> key,
InternalVersionT writeVersion, WriteContext *tls) {
for (; key.size() != 0; ++tls->accum.insert_iterations) {
self = &getOrCreateChild(*self, key, writeVersion, tls);
@@ -2932,23 +2809,17 @@ void eraseTree(Node *root, WriteContext *tls) {
} break;
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
for (int i = 0; i < n3->numChildren; ++i) {
toFree.push_back(n3->children[i]);
}
toFree.append(std::span<Node *>(n3->children, n3->numChildren));
tls->release(n3);
} break;
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
for (int i = 0; i < n16->numChildren; ++i) {
toFree.push_back(n16->children[i]);
}
toFree.append(std::span<Node *>(n16->children, n16->numChildren));
tls->release(n16);
} break;
case Type_Node48: {
auto *n48 = static_cast<Node48 *>(n);
for (int i = 0; i < n48->numChildren; ++i) {
toFree.push_back(n48->children[i]);
}
toFree.append(std::span<Node *>(n48->children, n48->numChildren));
tls->release(n48);
} break;
case Type_Node256: {
@@ -2964,10 +2835,10 @@ void eraseTree(Node *root, WriteContext *tls) {
}
}
void addPointWrite(TaggedNodePointer &root, std::span<const uint8_t> key,
void addPointWrite(Node *&root, std::span<const uint8_t> key,
InternalVersionT writeVersion, WriteContext *tls) {
++tls->accum.point_writes;
auto n = *insert(&root, key, writeVersion, tls);
auto *n = *insert(&root, key, writeVersion, tls);
if (!n->entryPresent) {
++tls->accum.entries_inserted;
auto *p = nextLogical(n);
@@ -2984,72 +2855,6 @@ void addPointWrite(TaggedNodePointer &root, std::span<const uint8_t> key,
}
}
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
__attribute__((target("avx512f"))) InternalVersionT horizontalMaxUpTo16(
InternalVersionT *vs, [[maybe_unused]] InternalVersionT z, int len) {
assume(len <= 16);
#if USE_64_BIT
// Hope it gets vectorized
InternalVersionT max = vs[0];
for (int i = 1; i < len; ++i) {
max = std::max(vs[i], max);
}
return max;
#else
uint32_t zero;
memcpy(&zero, &z, sizeof(zero));
auto zeroVec = _mm512_set1_epi32(zero);
auto max = InternalVersionT(
zero +
_mm512_reduce_max_epi32(_mm512_sub_epi32(
_mm512_mask_loadu_epi32(zeroVec, _mm512_int2mask((1 << len) - 1), vs),
zeroVec)));
return max;
#endif
}
__attribute__((target("default")))
#endif
InternalVersionT
horizontalMaxUpTo16(InternalVersionT *vs, InternalVersionT, int len) {
assume(len <= 16);
InternalVersionT max = vs[0];
for (int i = 1; i < len; ++i) {
max = std::max(vs[i], max);
}
return max;
}
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
__attribute__((target("avx512f"))) InternalVersionT
horizontalMax16(InternalVersionT *vs, [[maybe_unused]] InternalVersionT z) {
#if USE_64_BIT
// Hope it gets vectorized
InternalVersionT max = vs[0];
for (int i = 1; i < 16; ++i) {
max = std::max(vs[i], max);
}
return max;
#else
uint32_t zero; // GCOVR_EXCL_LINE
memcpy(&zero, &z, sizeof(zero));
auto zeroVec = _mm512_set1_epi32(zero);
return InternalVersionT(zero + _mm512_reduce_max_epi32(_mm512_sub_epi32(
_mm512_loadu_epi32(vs), zeroVec)));
#endif
}
__attribute__((target("default")))
#endif
InternalVersionT
horizontalMax16(InternalVersionT *vs, InternalVersionT) {
InternalVersionT max = vs[0];
for (int i = 1; i < 16; ++i) {
max = std::max(vs[i], max);
}
return max;
}
// Precondition: `node->entryPresent`, and node is not the root
void fixupMaxVersion(Node *node, WriteContext *tls) {
assert(node->parent);
@@ -3061,13 +2866,15 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
break;
case Type_Node3: {
auto *self3 = static_cast<Node3 *>(node);
max = std::max(max, horizontalMaxUpTo16(self3->childMaxVersion, tls->zero,
self3->numChildren));
for (int i = 0; i < self3->numChildren; ++i) {
max = std::max(self3->childMaxVersion[i], max);
}
} break;
case Type_Node16: {
auto *self16 = static_cast<Node16 *>(node);
max = std::max(max, horizontalMaxUpTo16(self16->childMaxVersion, tls->zero,
self16->numChildren));
for (int i = 0; i < self16->numChildren; ++i) {
max = std::max(self16->childMaxVersion[i], max);
}
} break;
case Type_Node48: {
auto *self48 = static_cast<Node48 *>(node);
@@ -3077,7 +2884,9 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
} break;
case Type_Node256: {
auto *self256 = static_cast<Node256 *>(node);
max = std::max(max, horizontalMax16(self256->childMaxVersion, tls->zero));
for (auto v : self256->maxOfMax) {
max = std::max(v, max);
}
} break;
default: // GCOVR_EXCL_LINE
__builtin_unreachable(); // GCOVR_EXCL_LINE
@@ -3085,7 +2894,7 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
setMaxVersion(node, max);
}
void addWriteRange(TaggedNodePointer &root, std::span<const uint8_t> begin,
void addWriteRange(Node *&root, std::span<const uint8_t> begin,
std::span<const uint8_t> end, InternalVersionT writeVersion,
WriteContext *tls, ConflictSet::Impl *impl) {
@@ -3098,12 +2907,12 @@ void addWriteRange(TaggedNodePointer &root, std::span<const uint8_t> begin,
++tls->accum.range_writes;
const bool beginIsPrefix = lcp == int(begin.size());
auto useAsRoot = insert(&root, begin.subspan(0, lcp), writeVersion, tls);
Node **useAsRoot = insert(&root, begin.subspan(0, lcp), writeVersion, tls);
begin = begin.subspan(lcp, begin.size() - lcp);
end = end.subspan(lcp, end.size() - lcp);
Node *beginNode = *insert(useAsRoot, begin, writeVersion, tls);
auto *beginNode = *insert(useAsRoot, begin, writeVersion, tls);
addKey(beginNode);
if (!beginNode->entryPresent) {
++tls->accum.entries_inserted;
@@ -3114,7 +2923,7 @@ void addWriteRange(TaggedNodePointer &root, std::span<const uint8_t> begin,
}
beginNode->entry.pointVersion = writeVersion;
Node *endNode = *insert(useAsRoot, end, writeVersion, tls);
auto *endNode = *insert(useAsRoot, end, writeVersion, tls);
addKey(endNode);
if (!endNode->entryPresent) {
++tls->accum.entries_inserted;
@@ -3134,7 +2943,7 @@ void addWriteRange(TaggedNodePointer &root, std::span<const uint8_t> begin,
assert(!beginNode->endOfRange);
assert(!endNode->endOfRange);
endNode->endOfRange = true;
Node *iter = beginNode;
auto *iter = beginNode;
for (iter = nextLogical(iter); !iter->endOfRange;
iter = erase(iter, tls, impl, /*logical*/ true)) {
assert(!iter->endOfRange);
@@ -3200,34 +3009,203 @@ Node *firstGeqPhysical(Node *n, const std::span<const uint8_t> key) {
}
}
struct CheckJob {
Node *n;
std::span<const uint8_t> begin;
InternalVersionT readVersion;
ReadContext *tls;
ConflictSet::Result *result;
void setResult(bool ok) {
*result = ok ? ConflictSet::Commit : ConflictSet::Conflict;
}
typedef void (*typeErasedContinuation)(void *);
// The type of a function that takes a CheckJob* and returns its own type
struct continuation {
typedef continuation (*functionPtrType)(CheckJob *);
functionPtrType func;
continuation operator()(CheckJob *job) { return func(job); }
/*implicit*/ continuation(functionPtrType func) : func(func) {}
continuation() = default;
operator bool() { return func != nullptr; }
};
continuation next;
void init(const ConflictSet::ReadRange *read, ConflictSet::Result *result,
Node *root, int64_t oldestVersionFullPrecision, ReadContext *tls);
};
namespace check_point_read_state_machine {
CheckJob::continuation down_left_spine(CheckJob *job);
// Logically this is the same as performing firstGeq and then checking against
// point or range version according to cmp, but this version short circuits as
// soon as it can prove that there's no conflict.
CheckJob::continuation begin(CheckJob *job) {
++job->tls->point_read_accum;
#if DEBUG_VERBOSE && !defined(NDEBUG)
fprintf(stderr, "Check point read: %s\n", printable(key).c_str());
#endif
for (;; ++job->tls->point_read_iterations_accum) {
if (job->begin.size() == 0) {
if (job->n->entryPresent) {
job->setResult(job->n->entry.pointVersion <= job->readVersion);
return nullptr; // Done
}
job->n = getFirstChildExists(job->n);
return down_left_spine;
}
auto [child, maxV] = getChildAndMaxVersion(job->n, job->begin[0]);
if (child == nullptr) {
auto c = getChildGeq(job->n, job->begin[0]);
if (c != nullptr) {
job->n = c;
return down_left_spine;
} else {
job->n = nextSibling(job->n);
if (job->n == nullptr) {
job->setResult(true);
return nullptr; // Done
}
return down_left_spine;
}
}
job->n = child;
job->begin = job->begin.subspan(1, job->begin.size() - 1);
if (job->n->partialKeyLen > 0) {
int commonLen = std::min<int>(job->n->partialKeyLen, job->begin.size());
int i = longestCommonPrefix(job->n->partialKey(), job->begin.data(),
commonLen);
if (i < commonLen) {
auto c = job->n->partialKey()[i] <=> job->begin[i];
if (c > 0) {
return down_left_spine;
} else {
job->n = nextSibling(job->n);
if (job->n == nullptr) {
job->setResult(true);
return nullptr; // Done
}
return down_left_spine;
}
}
if (commonLen == job->n->partialKeyLen) {
// partial key matches
job->begin =
job->begin.subspan(commonLen, job->begin.size() - commonLen);
} else if (job->n->partialKeyLen > int(job->begin.size())) {
// n is the first physical node greater than remaining, and there's no
// eq node
return down_left_spine;
}
}
if (maxV <= job->readVersion) {
++job->tls->point_read_short_circuit_accum;
job->setResult(true);
return nullptr; // Done
}
}
}
CheckJob::continuation down_left_spine(CheckJob *job) {
if (job->n->entryPresent) {
job->setResult(job->n->entry.rangeVersion <= job->readVersion);
return nullptr; // Done
}
job->n = getFirstChildExists(job->n);
return down_left_spine;
}
} // namespace check_point_read_state_machine
void CheckJob::init(const ConflictSet::ReadRange *read,
ConflictSet::Result *result, Node *root,
int64_t oldestVersionFullPrecision, ReadContext *tls) {
auto begin = std::span<const uint8_t>(read->begin.p, read->begin.len);
auto end = std::span<const uint8_t>(read->end.p, read->end.len);
if (read->readVersion < oldestVersionFullPrecision) {
*result = ConflictSet::TooOld;
next = +[](CheckJob *) -> continuation { return nullptr; };
} else if (end.size() == 0) {
this->begin = begin;
this->n = root;
this->readVersion = InternalVersionT(read->readVersion);
this->result = result;
this->tls = tls;
this->next = check_point_read_state_machine::begin;
} else {
*result = checkRangeRead(root, begin, end,
InternalVersionT(read->readVersion), tls)
? ConflictSet::Commit
: ConflictSet::Conflict;
next = +[](CheckJob *) -> continuation { return nullptr; };
}
}
struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
void check(const ReadRange *reads, Result *result, int count) {
assert(oldestVersionFullPrecision >=
newestVersionFullPrecision - kNominalVersionWindow);
if (count == 0) {
return;
}
ReadContext tls;
tls.impl = this;
int64_t check_byte_accum = 0;
constexpr int kConcurrent = 32;
CheckJob inProgress[kConcurrent];
int nextJob[kConcurrent];
int started = std::min(kConcurrent, count);
for (int i = 0; i < started; i++) {
inProgress[i].init(reads + i, result + i, root,
oldestVersionFullPrecision, &tls);
nextJob[i] = i + 1;
}
nextJob[started - 1] = 0;
int prevJob = started - 1;
int job = 0;
for (;;) {
auto next = inProgress[job].next(inProgress + job);
inProgress[job].next = next;
if (!next) {
if (started == count) {
if (prevJob == job)
break;
nextJob[prevJob] = nextJob[job];
job = prevJob;
} else {
int temp = started++;
inProgress[job].init(reads + temp, result + temp, root,
oldestVersionFullPrecision, &tls);
}
}
prevJob = job;
job = nextJob[job];
}
for (int i = 0; i < count; ++i) {
assert(reads[i].readVersion >= 0);
assert(reads[i].readVersion <= newestVersionFullPrecision);
const auto &r = reads[i];
check_byte_accum += r.begin.len + r.end.len;
auto begin = std::span<const uint8_t>(r.begin.p, r.begin.len);
auto end = std::span<const uint8_t>(r.end.p, r.end.len);
assert(oldestVersionFullPrecision >=
newestVersionFullPrecision - kNominalVersionWindow);
result[i] =
reads[i].readVersion < oldestVersionFullPrecision ? TooOld
: (end.size() > 0
? checkRangeRead(root, begin, end,
InternalVersionT(reads[i].readVersion), &tls)
: checkPointRead(root, begin,
InternalVersionT(reads[i].readVersion), &tls))
? Commit
: Conflict;
tls.commits_accum += result[i] == Commit;
tls.conflicts_accum += result[i] == Conflict;
tls.too_olds_accum += result[i] == TooOld;
}
point_read_total.add(tls.point_read_accum);
prefix_read_total.add(tls.prefix_read_accum);
range_read_total.add(tls.range_read_accum);
@@ -3449,7 +3427,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
std::span<const uint8_t> removalKey;
int64_t keyUpdates;
TaggedNodePointer root;
Node *root;
InternalVersionT oldestVersion;
int64_t oldestVersionFullPrecision;
int64_t oldestExtantVersion;
@@ -3530,7 +3508,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
}
};
TaggedNodePointer &getInTree(Node *n, ConflictSet::Impl *impl) {
Node *&getInTree(Node *n, ConflictSet::Impl *impl) {
return n->parent == nullptr ? impl->root
: getChildExists(n->parent, n->parentsIndex);
}
@@ -4169,24 +4147,6 @@ template <int kN> void benchScan2() {
});
}
void benchHorizontal16() {
ankerl::nanobench::Bench bench;
InternalVersionT vs[16];
for (int i = 0; i < 16; ++i) {
vs[i] = InternalVersionT(rand() % 1000 + 1000);
}
#if !USE_64_BIT
InternalVersionT::zero = InternalVersionT(rand() % 1000);
#endif
bench.run("horizontal16", [&]() {
bench.doNotOptimizeAway(horizontalMax16(vs, InternalVersionT::zero));
});
int x = rand() % 15 + 1;
bench.run("horizontalUpTo16", [&]() {
bench.doNotOptimizeAway(horizontalMaxUpTo16(vs, InternalVersionT::zero, x));
});
}
void benchLCP(int len) {
ankerl::nanobench::Bench bench;
std::vector<uint8_t> lhs(len);
@@ -4219,7 +4179,11 @@ void printTree() {
debugPrintDot(stdout, cs.root, &cs);
}
int main(void) { benchHorizontal16(); }
int main(void) {
for (int i = 0; i < 256; ++i) {
benchLCP(i);
}
}
#endif
#ifdef ENABLE_FUZZ

View File

@@ -22,6 +22,9 @@ void *stepJob(Job *j) {
return done ? nullptr : (void *)stepJob;
}
// So we can look at the disassembly more easily
extern "C" {
void sequential(Job **jobs, int count) {
for (int i = 0; i < count; ++i) {
do {
@@ -91,87 +94,6 @@ void interleaveBoundedCyclicList(Job **jobs, int count) {
}
}
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif
struct Context {
constexpr static int kConcurrent = 32;
Job **jobs;
Job *inProgress[kConcurrent];
void (*continuation[kConcurrent])(Context *, int64_t prevJob, int64_t job,
int64_t started, int64_t count);
int nextJob[kConcurrent];
};
void keepGoing(Context *context, int64_t prevJob, int64_t job, int64_t started,
int64_t count) {
prevJob = job;
job = context->nextJob[job];
MUSTTAIL return context->continuation[job](context, prevJob, job, started,
count);
}
void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count);
void complete(Context *context, int64_t prevJob, int64_t job, int64_t started,
int64_t count) {
if (started == count) {
if (prevJob == job) {
return;
}
context->nextJob[prevJob] = context->nextJob[job];
job = prevJob;
} else {
context->inProgress[job] = context->jobs[started++];
context->continuation[job] = stepJobTailCall;
}
prevJob = job;
job = context->nextJob[job];
MUSTTAIL return context->continuation[job](context, prevJob, job, started,
count);
}
void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
auto *j = context->inProgress[job];
auto done = --(*j->input) == 0;
#ifdef __x86_64__
_mm_clflush(j->input);
#endif
if (done) {
MUSTTAIL return complete(context, prevJob, job, started, count);
} else {
context->continuation[job] = stepJobTailCall;
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
}
void useTailCalls(Job **jobs, int count) {
if (count == 0) {
return;
}
Context context;
context.jobs = jobs;
int64_t started = std::min(Context::kConcurrent, count);
for (int i = 0; i < started; i++) {
context.inProgress[i] = jobs[i];
context.nextJob[i] = i + 1;
context.continuation[i] = stepJobTailCall;
}
context.nextJob[started - 1] = 0;
int prevJob = started - 1;
int job = 0;
return context.continuation[job](&context, prevJob, job, started, count);
}
void interleaveCyclicList(Job **jobs, int count) {
auto *nextJob = (int *)alloca(sizeof(int) * count);
@@ -195,11 +117,12 @@ void interleaveCyclicList(Job **jobs, int count) {
job = nextJob[job];
}
}
}
int main() {
ankerl::nanobench::Bench bench;
constexpr int kNumJobs = 10000;
constexpr int kNumJobs = 100;
bench.relative(true);
Job jobs[kNumJobs];
@@ -217,7 +140,6 @@ int main() {
for (auto [scheduler, name] :
{std::make_pair(sequentialNoFuncPtr, "sequentialNoFuncPtr"),
std::make_pair(sequential, "sequential"),
std::make_pair(useTailCalls, "useTailCalls"),
std::make_pair(interleaveSwapping, "interleavingSwapping"),
std::make_pair(interleaveBoundedCyclicList,
"interleaveBoundedCyclicList"),