Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| af1e2299de | |||
| 230e96063d | |||
| f41a62471b | |||
| d8f85dedc4 | |||
| 656939560b | |||
| 5580f9b71d | |||
| 628d16b7e6 | |||
| d9e4a7d1b6 | |||
| 52201fa4c7 | |||
| 0814822d82 | |||
| 41df2398e8 | |||
| 84c4d0fcba | |||
| 6241533dfb | |||
| 0abf6a1ecf | |||
| 867136ff1b | |||
| 4b8f7320d3 | |||
| 6628092384 | |||
| a0a4f1afea | |||
| ca479c03ce | |||
| 0a2e133ab9 |
@@ -7,7 +7,6 @@
|
||||
void showMemory(const ConflictSet &cs);
|
||||
#endif
|
||||
|
||||
#define ANKERL_NANOBENCH_IMPLEMENT
|
||||
#include "third_party/nanobench.h"
|
||||
|
||||
constexpr int kNumKeys = 1000000;
|
||||
|
||||
+33
-14
@@ -138,6 +138,8 @@ include(CTest)
|
||||
# disable tests if this is being used through e.g. FetchContent
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
|
||||
|
||||
add_library(nanobench ${CMAKE_CURRENT_SOURCE_DIR}/nanobench.cpp)
|
||||
|
||||
set(TEST_FLAGS -Wall -Wextra -Wunreachable-code -Wpedantic -UNDEBUG)
|
||||
|
||||
# corpus tests, which are tests curated by libfuzzer. The goal is to get broad
|
||||
@@ -185,6 +187,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
|
||||
target_include_directories(conflict_set_main
|
||||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
|
||||
target_compile_definitions(conflict_set_main PRIVATE ENABLE_MAIN)
|
||||
target_link_libraries(conflict_set_main PRIVATE nanobench)
|
||||
|
||||
if(NOT APPLE)
|
||||
# libfuzzer target, to generate/manage corpus
|
||||
@@ -245,6 +248,19 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
|
||||
add_test(NAME conflict_set_blackbox_${hash} COMMAND driver ${TEST})
|
||||
endforeach()
|
||||
|
||||
find_program(VALGRIND_EXE valgrind)
|
||||
if(VALGRIND_EXE AND NOT CMAKE_CROSSCOMPILING)
|
||||
list(LENGTH CORPUS_TESTS len)
|
||||
math(EXPR last "${len} - 1")
|
||||
set(partition_size 100)
|
||||
foreach(i RANGE 0 ${last} ${partition_size})
|
||||
list(SUBLIST CORPUS_TESTS ${i} ${partition_size} partition)
|
||||
add_test(NAME conflict_set_blackbox_valgrind_${i}
|
||||
COMMAND ${VALGRIND_EXE} --error-exitcode=99 --
|
||||
$<TARGET_FILE:driver> ${partition})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# scripted tests. Written manually to fill in anything libfuzzer couldn't
|
||||
# find.
|
||||
if(NOT CMAKE_CROSSCOMPILING)
|
||||
@@ -265,19 +281,14 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
|
||||
${Python3_EXECUTABLE}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_conflict_set.py test ${TEST}
|
||||
--build-dir ${CMAKE_CURRENT_BINARY_DIR})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
find_program(VALGRIND_EXE valgrind)
|
||||
if(VALGRIND_EXE AND NOT CMAKE_CROSSCOMPILING)
|
||||
list(LENGTH CORPUS_TESTS len)
|
||||
math(EXPR last "${len} - 1")
|
||||
set(partition_size 100)
|
||||
foreach(i RANGE 0 ${last} ${partition_size})
|
||||
list(SUBLIST CORPUS_TESTS ${i} ${partition_size} partition)
|
||||
add_test(NAME conflict_set_blackbox_valgrind_${i}
|
||||
COMMAND ${VALGRIND_EXE} --error-exitcode=99 --
|
||||
$<TARGET_FILE:driver> ${partition})
|
||||
if(VALGRIND_EXE AND NOT CMAKE_CROSSCOMPILING)
|
||||
add_test(
|
||||
NAME script_test_${TEST}_valgrind
|
||||
COMMAND
|
||||
${VALGRIND_EXE} ${Python3_EXECUTABLE}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_conflict_set.py test ${TEST}
|
||||
--build-dir ${CMAKE_CURRENT_BINARY_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
@@ -330,7 +341,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
|
||||
|
||||
# bench
|
||||
add_executable(conflict_set_bench Bench.cpp)
|
||||
target_link_libraries(conflict_set_bench PRIVATE ${PROJECT_NAME})
|
||||
target_link_libraries(conflict_set_bench PRIVATE ${PROJECT_NAME} nanobench)
|
||||
set_target_properties(conflict_set_bench PROPERTIES SKIP_BUILD_RPATH ON)
|
||||
add_executable(real_data_bench RealDataBench.cpp)
|
||||
target_link_libraries(real_data_bench PRIVATE ${PROJECT_NAME})
|
||||
@@ -345,6 +356,14 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
|
||||
add_executable(server_bench ServerBench.cpp)
|
||||
target_link_libraries(server_bench PRIVATE ${PROJECT_NAME})
|
||||
set_target_properties(server_bench PROPERTIES SKIP_BUILD_RPATH ON)
|
||||
|
||||
add_executable(interleaving_test InterleavingTest.cpp)
|
||||
# work around lack of musttail for gcc
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
target_compile_options(interleaving_test PRIVATE -Og
|
||||
-foptimize-sibling-calls)
|
||||
endif()
|
||||
target_link_libraries(interleaving_test PRIVATE nanobench)
|
||||
endif()
|
||||
|
||||
# packaging
|
||||
|
||||
+272
-68
@@ -203,6 +203,60 @@ enum Type : int8_t {
|
||||
|
||||
template <class T> struct BoundedFreeListAllocator;
|
||||
|
||||
struct TaggedNodePointer {
|
||||
TaggedNodePointer() = default;
|
||||
operator struct Node *() { return (struct Node *)(uintptr_t)p; }
|
||||
operator struct Node0 *() {
|
||||
assert(t == Type_Node0);
|
||||
return (struct Node0 *)(uintptr_t)p;
|
||||
}
|
||||
operator struct Node3 *() {
|
||||
assert(t == Type_Node3);
|
||||
return (struct Node3 *)(uintptr_t)p;
|
||||
}
|
||||
operator struct Node16 *() {
|
||||
assert(t == Type_Node16);
|
||||
return (struct Node16 *)(uintptr_t)p;
|
||||
}
|
||||
operator struct Node48 *() {
|
||||
assert(t == Type_Node48);
|
||||
return (struct Node48 *)(uintptr_t)p;
|
||||
}
|
||||
operator struct Node256 *() {
|
||||
assert(t == Type_Node256);
|
||||
return (struct Node256 *)(uintptr_t)p;
|
||||
}
|
||||
/*implicit*/ TaggedNodePointer(std::nullptr_t) : p(0), t(Type_Node0) {}
|
||||
/*implicit*/ TaggedNodePointer(Node0 *x)
|
||||
: TaggedNodePointer((struct Node *)x, Type_Node0) {}
|
||||
/*implicit*/ TaggedNodePointer(Node3 *x)
|
||||
: TaggedNodePointer((struct Node *)x, Type_Node3) {}
|
||||
/*implicit*/ TaggedNodePointer(Node16 *x)
|
||||
: TaggedNodePointer((struct Node *)x, Type_Node16) {}
|
||||
/*implicit*/ TaggedNodePointer(Node48 *x)
|
||||
: TaggedNodePointer((struct Node *)x, Type_Node48) {}
|
||||
/*implicit*/ TaggedNodePointer(Node256 *x)
|
||||
: TaggedNodePointer((struct Node *)x, Type_Node256) {}
|
||||
|
||||
bool operator!=(std::nullptr_t) { return p != 0 || t != Type_Node0; }
|
||||
bool operator==(std::nullptr_t) { return p == 0 && t == Type_Node0; }
|
||||
bool operator==(const TaggedNodePointer &) const = default;
|
||||
bool operator==(Node *n) const { return (uintptr_t)n == p; }
|
||||
Node *operator->() { return (Node *)(uintptr_t)p; }
|
||||
Type getType() { return t; }
|
||||
|
||||
TaggedNodePointer(const TaggedNodePointer &) = default;
|
||||
TaggedNodePointer &operator=(const TaggedNodePointer &) = default;
|
||||
/*implicit*/ TaggedNodePointer(Node *n);
|
||||
|
||||
private:
|
||||
TaggedNodePointer(struct Node *p, Type t) : p((uintptr_t)p), t(t) {
|
||||
assume(p != 0);
|
||||
}
|
||||
uintptr_t p : 56;
|
||||
Type t : 8;
|
||||
};
|
||||
|
||||
struct Node {
|
||||
|
||||
/* begin section that's copied to the next node */
|
||||
@@ -228,6 +282,9 @@ private:
|
||||
int32_t partialKeyCapacity;
|
||||
};
|
||||
|
||||
TaggedNodePointer::TaggedNodePointer(Node *n)
|
||||
: TaggedNodePointer(n, n->getType()) {}
|
||||
|
||||
constexpr int kNodeCopyBegin = offsetof(Node, entry);
|
||||
constexpr int kNodeCopySize =
|
||||
offsetof(Node, parentsIndex) + sizeof(Node::parentsIndex) - kNodeCopyBegin;
|
||||
@@ -251,7 +308,7 @@ struct Node3 : Node {
|
||||
constexpr static auto kMaxNodes = 3;
|
||||
constexpr static auto kType = Type_Node3;
|
||||
|
||||
Node *children[kMaxNodes];
|
||||
TaggedNodePointer children[kMaxNodes];
|
||||
InternalVersionT childMaxVersion[kMaxNodes];
|
||||
// Sorted
|
||||
uint8_t index[kMaxNodes];
|
||||
@@ -267,7 +324,7 @@ struct Node16 : Node {
|
||||
constexpr static auto kType = Type_Node16;
|
||||
constexpr static auto kMaxNodes = 16;
|
||||
|
||||
Node *children[kMaxNodes];
|
||||
TaggedNodePointer children[kMaxNodes];
|
||||
InternalVersionT childMaxVersion[kMaxNodes];
|
||||
// Sorted
|
||||
uint8_t index[kMaxNodes];
|
||||
@@ -288,7 +345,7 @@ struct Node48 : Node {
|
||||
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
|
||||
|
||||
BitSet bitSet;
|
||||
Node *children[kMaxNodes];
|
||||
TaggedNodePointer children[kMaxNodes];
|
||||
InternalVersionT childMaxVersion[kMaxNodes];
|
||||
InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
|
||||
uint8_t reverseIndex[kMaxNodes];
|
||||
@@ -310,7 +367,7 @@ struct Node256 : Node {
|
||||
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
|
||||
|
||||
BitSet bitSet;
|
||||
Node *children[kMaxNodes];
|
||||
TaggedNodePointer children[kMaxNodes];
|
||||
InternalVersionT childMaxVersion[kMaxNodes];
|
||||
InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
|
||||
|
||||
@@ -766,6 +823,8 @@ private:
|
||||
|
||||
int getNodeIndex(Node3 *self, uint8_t index) {
|
||||
Node3 *n = (Node3 *)self;
|
||||
assume(n->numChildren >= 1);
|
||||
assume(n->numChildren <= 3);
|
||||
for (int i = 0; i < n->numChildren; ++i) {
|
||||
if (n->index[i] == index) {
|
||||
return i;
|
||||
@@ -774,6 +833,18 @@ int getNodeIndex(Node3 *self, uint8_t index) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int getNodeIndexExists(Node3 *self, uint8_t index) {
|
||||
Node3 *n = (Node3 *)self;
|
||||
assume(n->numChildren >= 1);
|
||||
assume(n->numChildren <= 3);
|
||||
for (int i = 0; i < n->numChildren; ++i) {
|
||||
if (n->index[i] == index) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
}
|
||||
|
||||
int getNodeIndex(Node16 *self, uint8_t index) {
|
||||
|
||||
#ifdef HAS_AVX
|
||||
@@ -834,27 +905,66 @@ int getNodeIndex(Node16 *self, uint8_t index) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int getNodeIndexExists(Node16 *self, uint8_t index) {
|
||||
|
||||
#ifdef HAS_AVX
|
||||
__m128i key_vec = _mm_set1_epi8(index);
|
||||
__m128i indices;
|
||||
memcpy(&indices, self->index, Node16::kMaxNodes);
|
||||
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
|
||||
uint32_t mask = (1 << self->numChildren) - 1;
|
||||
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
|
||||
assume(bitfield != 0);
|
||||
return std::countr_zero(bitfield);
|
||||
#elif defined(HAS_ARM_NEON)
|
||||
// Based on
|
||||
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
|
||||
|
||||
uint8x16_t indices;
|
||||
memcpy(&indices, self->index, Node16::kMaxNodes);
|
||||
// 0xff for each match
|
||||
uint16x8_t results =
|
||||
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
|
||||
assume(self->numChildren <= Node16::kMaxNodes);
|
||||
uint64_t mask = self->numChildren == 16
|
||||
? uint64_t(-1)
|
||||
: (uint64_t(1) << (self->numChildren * 4)) - 1;
|
||||
// 0xf for each match in valid range
|
||||
uint64_t bitfield =
|
||||
vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) & mask;
|
||||
assume(bitfield != 0);
|
||||
return std::countr_zero(bitfield) / 4;
|
||||
#else
|
||||
for (int i = 0; i < self->numChildren; ++i) {
|
||||
if (self->index[i] == index) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
#endif
|
||||
}
|
||||
|
||||
// Precondition - an entry for index must exist in the node
|
||||
Node *&getChildExists(Node3 *self, uint8_t index) {
|
||||
return self->children[getNodeIndex(self, index)];
|
||||
TaggedNodePointer &getChildExists(Node3 *self, uint8_t index) {
|
||||
return self->children[getNodeIndexExists(self, index)];
|
||||
}
|
||||
// Precondition - an entry for index must exist in the node
|
||||
Node *&getChildExists(Node16 *self, uint8_t index) {
|
||||
return self->children[getNodeIndex(self, index)];
|
||||
TaggedNodePointer &getChildExists(Node16 *self, uint8_t index) {
|
||||
return self->children[getNodeIndexExists(self, index)];
|
||||
}
|
||||
// Precondition - an entry for index must exist in the node
|
||||
Node *&getChildExists(Node48 *self, uint8_t index) {
|
||||
TaggedNodePointer &getChildExists(Node48 *self, uint8_t index) {
|
||||
assert(self->bitSet.test(index));
|
||||
return self->children[self->index[index]];
|
||||
}
|
||||
// Precondition - an entry for index must exist in the node
|
||||
Node *&getChildExists(Node256 *self, uint8_t index) {
|
||||
TaggedNodePointer &getChildExists(Node256 *self, uint8_t index) {
|
||||
assert(self->bitSet.test(index));
|
||||
return self->children[index];
|
||||
}
|
||||
|
||||
// Precondition - an entry for index must exist in the node
|
||||
Node *&getChildExists(Node *self, uint8_t index) {
|
||||
TaggedNodePointer &getChildExists(Node *self, uint8_t index) {
|
||||
switch (self->getType()) {
|
||||
case Type_Node0: // GCOVR_EXCL_LINE
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
@@ -885,12 +995,12 @@ InternalVersionT maxVersion(Node *n) {
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
case Type_Node3: {
|
||||
auto *n3 = static_cast<Node3 *>(n);
|
||||
int i = getNodeIndex(n3, index);
|
||||
int i = getNodeIndexExists(n3, index);
|
||||
return n3->childMaxVersion[i];
|
||||
}
|
||||
case Type_Node16: {
|
||||
auto *n16 = static_cast<Node16 *>(n);
|
||||
int i = getNodeIndex(n16, index);
|
||||
int i = getNodeIndexExists(n16, index);
|
||||
return n16->childMaxVersion[i];
|
||||
}
|
||||
case Type_Node48: {
|
||||
@@ -918,12 +1028,12 @@ InternalVersionT exchangeMaxVersion(Node *n, InternalVersionT newMax) {
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
case Type_Node3: {
|
||||
auto *n3 = static_cast<Node3 *>(n);
|
||||
int i = getNodeIndex(n3, index);
|
||||
int i = getNodeIndexExists(n3, index);
|
||||
return std::exchange(n3->childMaxVersion[i], newMax);
|
||||
}
|
||||
case Type_Node16: {
|
||||
auto *n16 = static_cast<Node16 *>(n);
|
||||
int i = getNodeIndex(n16, index);
|
||||
int i = getNodeIndexExists(n16, index);
|
||||
return std::exchange(n16->childMaxVersion[i], newMax);
|
||||
}
|
||||
case Type_Node48: {
|
||||
@@ -952,13 +1062,13 @@ void setMaxVersion(Node *n, InternalVersionT newMax) {
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
case Type_Node3: {
|
||||
auto *n3 = static_cast<Node3 *>(n);
|
||||
int i = getNodeIndex(n3, index);
|
||||
int i = getNodeIndexExists(n3, index);
|
||||
n3->childMaxVersion[i] = newMax;
|
||||
return;
|
||||
}
|
||||
case Type_Node16: {
|
||||
auto *n16 = static_cast<Node16 *>(n);
|
||||
int i = getNodeIndex(n16, index);
|
||||
int i = getNodeIndexExists(n16, index);
|
||||
n16->childMaxVersion[i] = newMax;
|
||||
return;
|
||||
}
|
||||
@@ -985,7 +1095,7 @@ void setMaxVersion(Node *n, InternalVersionT newMax) {
|
||||
}
|
||||
}
|
||||
|
||||
Node *&getInTree(Node *n, ConflictSet::Impl *);
|
||||
TaggedNodePointer &getInTree(Node *n, ConflictSet::Impl *);
|
||||
|
||||
Node *getChild(Node0 *, uint8_t) { return nullptr; }
|
||||
Node *getChild(Node3 *self, uint8_t index) {
|
||||
@@ -1022,6 +1132,7 @@ Node *getChild(Node *self, uint8_t index) {
|
||||
struct ChildAndMaxVersion {
|
||||
Node *child;
|
||||
InternalVersionT maxVersion;
|
||||
Type childType;
|
||||
};
|
||||
|
||||
ChildAndMaxVersion getChildAndMaxVersion(Node0 *, uint8_t) { return {}; }
|
||||
@@ -1030,24 +1141,28 @@ ChildAndMaxVersion getChildAndMaxVersion(Node3 *self, uint8_t index) {
|
||||
if (i < 0) {
|
||||
return {};
|
||||
}
|
||||
return {self->children[i], self->childMaxVersion[i]};
|
||||
return {self->children[i], self->childMaxVersion[i],
|
||||
self->children[i].getType()};
|
||||
}
|
||||
ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) {
|
||||
int i = getNodeIndex(self, index);
|
||||
if (i < 0) {
|
||||
return {};
|
||||
}
|
||||
return {self->children[i], self->childMaxVersion[i]};
|
||||
return {self->children[i], self->childMaxVersion[i],
|
||||
self->children[i].getType()};
|
||||
}
|
||||
ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) {
|
||||
int i = self->index[index];
|
||||
if (i < 0) {
|
||||
return {};
|
||||
}
|
||||
return {self->children[i], self->childMaxVersion[i]};
|
||||
return {self->children[i], self->childMaxVersion[i],
|
||||
self->children[i].getType()};
|
||||
}
|
||||
ChildAndMaxVersion getChildAndMaxVersion(Node256 *self, uint8_t index) {
|
||||
return {self->children[index], self->childMaxVersion[index]};
|
||||
return {self->children[index], self->childMaxVersion[index],
|
||||
self->children[index].getType()};
|
||||
}
|
||||
|
||||
ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
|
||||
@@ -1070,6 +1185,8 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
|
||||
Node *getChildGeq(Node0 *, int) { return nullptr; }
|
||||
|
||||
Node *getChildGeq(Node3 *n, int child) {
|
||||
assume(n->numChildren >= 1);
|
||||
assume(n->numChildren <= 3);
|
||||
for (int i = 0; i < n->numChildren; ++i) {
|
||||
if (n->index[i] >= child) {
|
||||
return n->children[i];
|
||||
@@ -1190,14 +1307,15 @@ Node *getFirstChildExists(Node *self) {
|
||||
}
|
||||
}
|
||||
|
||||
void consumePartialKeyFull(Node *&self, std::span<const uint8_t> &key,
|
||||
void consumePartialKeyFull(TaggedNodePointer &self,
|
||||
std::span<const uint8_t> &key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
// Handle an existing partial key
|
||||
int commonLen = std::min<int>(self->partialKeyLen, key.size());
|
||||
int partialKeyIndex =
|
||||
longestCommonPrefix(self->partialKey(), key.data(), commonLen);
|
||||
if (partialKeyIndex < self->partialKeyLen) {
|
||||
auto *old = self;
|
||||
Node *old = self;
|
||||
// Since root cannot have a partial key
|
||||
assert(old->parent != nullptr);
|
||||
InternalVersionT oldMaxVersion = exchangeMaxVersion(old, writeVersion);
|
||||
@@ -1235,7 +1353,7 @@ void consumePartialKeyFull(Node *&self, std::span<const uint8_t> &key,
|
||||
// Consume any partial key of `self`, and update `self` and
|
||||
// `key` such that `self` is along the search path of `key`
|
||||
inline __attribute__((always_inline)) void
|
||||
consumePartialKey(Node *&self, std::span<const uint8_t> &key,
|
||||
consumePartialKey(TaggedNodePointer &self, std::span<const uint8_t> &key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
if (self->partialKeyLen > 0) {
|
||||
consumePartialKeyFull(self, key, writeVersion, tls);
|
||||
@@ -1246,8 +1364,10 @@ consumePartialKey(Node *&self, std::span<const uint8_t> &key,
|
||||
// such that the search path of the result + key is the same as the search path
|
||||
// of self + key before the call. Creates a node if necessary. Updates
|
||||
// `maxVersion` for result.
|
||||
Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
|
||||
InternalVersionT newMaxVersion, WriteContext *tls) {
|
||||
TaggedNodePointer &getOrCreateChild(TaggedNodePointer &self,
|
||||
std::span<const uint8_t> &key,
|
||||
InternalVersionT newMaxVersion,
|
||||
WriteContext *tls) {
|
||||
|
||||
int index = key.front();
|
||||
key = key.subspan(1, key.size() - 1);
|
||||
@@ -1549,7 +1669,6 @@ __attribute__((target("avx512f"))) void rezero16(InternalVersionT *vs,
|
||||
_mm512_sub_epi32(_mm512_loadu_epi32(vs), zvec), _mm512_setzero_epi32());
|
||||
_mm512_mask_storeu_epi32(vs, m, zvec);
|
||||
}
|
||||
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
|
||||
@@ -1606,10 +1725,11 @@ void rezero(Node *n, InternalVersionT z) {
|
||||
}
|
||||
#endif
|
||||
|
||||
void mergeWithChild(Node *&self, WriteContext *tls, ConflictSet::Impl *impl,
|
||||
Node *&dontInvalidate, Node3 *self3) {
|
||||
void mergeWithChild(TaggedNodePointer &self, WriteContext *tls,
|
||||
ConflictSet::Impl *impl, Node *&dontInvalidate,
|
||||
Node3 *self3) {
|
||||
assert(!self3->entryPresent);
|
||||
auto *child = self3->children[0];
|
||||
Node *child = self3->children[0];
|
||||
int minCapacity = self3->partialKeyLen + 1 + child->partialKeyLen;
|
||||
|
||||
if (minCapacity > child->getCapacity()) {
|
||||
@@ -1858,7 +1978,7 @@ bool checkPointRead(Node *n, const std::span<const uint8_t> key,
|
||||
goto downLeftSpine;
|
||||
}
|
||||
|
||||
auto [child, maxV] = getChildAndMaxVersion(n, remaining[0]);
|
||||
auto [child, maxV, childT] = getChildAndMaxVersion(n, remaining[0]);
|
||||
if (child == nullptr) {
|
||||
auto c = getChildGeq(n, remaining[0]);
|
||||
if (c != nullptr) {
|
||||
@@ -1928,7 +2048,7 @@ bool checkPrefixRead(Node *n, const std::span<const uint8_t> key,
|
||||
return maxVersion(n) <= readVersion;
|
||||
}
|
||||
|
||||
auto [child, maxV] = getChildAndMaxVersion(n, remaining[0]);
|
||||
auto [child, maxV, childT] = getChildAndMaxVersion(n, remaining[0]);
|
||||
if (child == nullptr) {
|
||||
auto c = getChildGeq(n, remaining[0]);
|
||||
if (c != nullptr) {
|
||||
@@ -2201,7 +2321,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
|
||||
if (!mask) {
|
||||
return true;
|
||||
}
|
||||
auto *child = self->children[std::countr_zero(mask)];
|
||||
Node *child = self->children[std::countr_zero(mask)];
|
||||
const bool firstRangeOk =
|
||||
!child->entryPresent || child->entry.rangeVersion <= readVersion;
|
||||
uint32_t compared = 0;
|
||||
@@ -2276,7 +2396,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
|
||||
if (!mask) {
|
||||
return true;
|
||||
}
|
||||
auto *child = self->children[std::countr_zero(mask)];
|
||||
Node *child = self->children[std::countr_zero(mask)];
|
||||
const bool firstRangeOk =
|
||||
!child->entryPresent || child->entry.rangeVersion <= readVersion;
|
||||
|
||||
@@ -2321,7 +2441,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
|
||||
{
|
||||
int c = self->bitSet.firstSetGeq(begin + 1);
|
||||
if (c >= 0 && c < end) {
|
||||
auto *child = self->children[self->index[c]];
|
||||
Node *child = self->children[self->index[c]];
|
||||
if (child->entryPresent && child->entry.rangeVersion > readVersion) {
|
||||
return false;
|
||||
}
|
||||
@@ -2355,7 +2475,7 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
|
||||
{
|
||||
int c = self->bitSet.firstSetGeq(begin + 1);
|
||||
if (c >= 0 && c < end) {
|
||||
auto *child = self->children[c];
|
||||
Node *child = self->children[c];
|
||||
if (child->entryPresent && child->entry.rangeVersion > readVersion) {
|
||||
return false;
|
||||
}
|
||||
@@ -2416,6 +2536,7 @@ checkMaxBetweenExclusive(Node *n, int begin, int end,
|
||||
}
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
|
||||
bool checkMaxBetweenExclusive(Node *n, int begin, int end,
|
||||
InternalVersionT readVersion, ReadContext *tls) {
|
||||
return checkMaxBetweenExclusiveImpl<false>(n, begin, end, readVersion, tls);
|
||||
@@ -2526,7 +2647,7 @@ bool checkRangeLeftSide(Node *n, std::span<const uint8_t> key, int prefixLen,
|
||||
}
|
||||
}
|
||||
|
||||
auto [child, maxV] = getChildAndMaxVersion(n, remaining[0]);
|
||||
auto [child, maxV, childT] = getChildAndMaxVersion(n, remaining[0]);
|
||||
if (child == nullptr) {
|
||||
auto c = getChildGeq(n, remaining[0]);
|
||||
if (c != nullptr) {
|
||||
@@ -2714,7 +2835,7 @@ bool checkRangeRead(Node *n, std::span<const uint8_t> begin,
|
||||
if (remaining.size() == 0) {
|
||||
break;
|
||||
}
|
||||
auto [child, v] = getChildAndMaxVersion(n, remaining[0]);
|
||||
auto [child, v, childT] = getChildAndMaxVersion(n, remaining[0]);
|
||||
if (child == nullptr) {
|
||||
break;
|
||||
}
|
||||
@@ -2780,8 +2901,10 @@ checkMaxBetweenExclusiveImpl<true>(Node *n, int begin, int end,
|
||||
// of the result will have `maxVersion` set to `writeVersion` as a
|
||||
// postcondition. Nodes along the search path may be invalidated. Callers must
|
||||
// ensure that the max version of the self argument is updated.
|
||||
[[nodiscard]] Node **insert(Node **self, std::span<const uint8_t> key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
[[nodiscard]] TaggedNodePointer *insert(TaggedNodePointer *self,
|
||||
std::span<const uint8_t> key,
|
||||
InternalVersionT writeVersion,
|
||||
WriteContext *tls) {
|
||||
|
||||
for (; key.size() != 0; ++tls->accum.insert_iterations) {
|
||||
self = &getOrCreateChild(*self, key, writeVersion, tls);
|
||||
@@ -2809,17 +2932,23 @@ void eraseTree(Node *root, WriteContext *tls) {
|
||||
} break;
|
||||
case Type_Node3: {
|
||||
auto *n3 = static_cast<Node3 *>(n);
|
||||
toFree.append(std::span<Node *>(n3->children, n3->numChildren));
|
||||
for (int i = 0; i < n3->numChildren; ++i) {
|
||||
toFree.push_back(n3->children[i]);
|
||||
}
|
||||
tls->release(n3);
|
||||
} break;
|
||||
case Type_Node16: {
|
||||
auto *n16 = static_cast<Node16 *>(n);
|
||||
toFree.append(std::span<Node *>(n16->children, n16->numChildren));
|
||||
for (int i = 0; i < n16->numChildren; ++i) {
|
||||
toFree.push_back(n16->children[i]);
|
||||
}
|
||||
tls->release(n16);
|
||||
} break;
|
||||
case Type_Node48: {
|
||||
auto *n48 = static_cast<Node48 *>(n);
|
||||
toFree.append(std::span<Node *>(n48->children, n48->numChildren));
|
||||
for (int i = 0; i < n48->numChildren; ++i) {
|
||||
toFree.push_back(n48->children[i]);
|
||||
}
|
||||
tls->release(n48);
|
||||
} break;
|
||||
case Type_Node256: {
|
||||
@@ -2835,10 +2964,10 @@ void eraseTree(Node *root, WriteContext *tls) {
|
||||
}
|
||||
}
|
||||
|
||||
void addPointWrite(Node *&root, std::span<const uint8_t> key,
|
||||
void addPointWrite(TaggedNodePointer &root, std::span<const uint8_t> key,
|
||||
InternalVersionT writeVersion, WriteContext *tls) {
|
||||
++tls->accum.point_writes;
|
||||
auto *n = *insert(&root, key, writeVersion, tls);
|
||||
auto n = *insert(&root, key, writeVersion, tls);
|
||||
if (!n->entryPresent) {
|
||||
++tls->accum.entries_inserted;
|
||||
auto *p = nextLogical(n);
|
||||
@@ -2855,6 +2984,72 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
|
||||
__attribute__((target("avx512f"))) InternalVersionT horizontalMaxUpTo16(
|
||||
InternalVersionT *vs, [[maybe_unused]] InternalVersionT z, int len) {
|
||||
assume(len <= 16);
|
||||
#if USE_64_BIT
|
||||
// Hope it gets vectorized
|
||||
InternalVersionT max = vs[0];
|
||||
for (int i = 1; i < len; ++i) {
|
||||
max = std::max(vs[i], max);
|
||||
}
|
||||
return max;
|
||||
#else
|
||||
uint32_t zero;
|
||||
memcpy(&zero, &z, sizeof(zero));
|
||||
auto zeroVec = _mm512_set1_epi32(zero);
|
||||
auto max = InternalVersionT(
|
||||
zero +
|
||||
_mm512_reduce_max_epi32(_mm512_sub_epi32(
|
||||
_mm512_mask_loadu_epi32(zeroVec, _mm512_int2mask((1 << len) - 1), vs),
|
||||
zeroVec)));
|
||||
return max;
|
||||
#endif
|
||||
}
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
|
||||
InternalVersionT
|
||||
horizontalMaxUpTo16(InternalVersionT *vs, InternalVersionT, int len) {
|
||||
assume(len <= 16);
|
||||
InternalVersionT max = vs[0];
|
||||
for (int i = 1; i < len; ++i) {
|
||||
max = std::max(vs[i], max);
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
|
||||
__attribute__((target("avx512f"))) InternalVersionT
|
||||
horizontalMax16(InternalVersionT *vs, [[maybe_unused]] InternalVersionT z) {
|
||||
#if USE_64_BIT
|
||||
// Hope it gets vectorized
|
||||
InternalVersionT max = vs[0];
|
||||
for (int i = 1; i < 16; ++i) {
|
||||
max = std::max(vs[i], max);
|
||||
}
|
||||
return max;
|
||||
#else
|
||||
uint32_t zero; // GCOVR_EXCL_LINE
|
||||
memcpy(&zero, &z, sizeof(zero));
|
||||
auto zeroVec = _mm512_set1_epi32(zero);
|
||||
return InternalVersionT(zero + _mm512_reduce_max_epi32(_mm512_sub_epi32(
|
||||
_mm512_loadu_epi32(vs), zeroVec)));
|
||||
#endif
|
||||
}
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
|
||||
InternalVersionT
|
||||
horizontalMax16(InternalVersionT *vs, InternalVersionT) {
|
||||
InternalVersionT max = vs[0];
|
||||
for (int i = 1; i < 16; ++i) {
|
||||
max = std::max(vs[i], max);
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
// Precondition: `node->entryPresent`, and node is not the root
|
||||
void fixupMaxVersion(Node *node, WriteContext *tls) {
|
||||
assert(node->parent);
|
||||
@@ -2866,15 +3061,13 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
|
||||
break;
|
||||
case Type_Node3: {
|
||||
auto *self3 = static_cast<Node3 *>(node);
|
||||
for (int i = 0; i < self3->numChildren; ++i) {
|
||||
max = std::max(self3->childMaxVersion[i], max);
|
||||
}
|
||||
max = std::max(max, horizontalMaxUpTo16(self3->childMaxVersion, tls->zero,
|
||||
self3->numChildren));
|
||||
} break;
|
||||
case Type_Node16: {
|
||||
auto *self16 = static_cast<Node16 *>(node);
|
||||
for (int i = 0; i < self16->numChildren; ++i) {
|
||||
max = std::max(self16->childMaxVersion[i], max);
|
||||
}
|
||||
max = std::max(max, horizontalMaxUpTo16(self16->childMaxVersion, tls->zero,
|
||||
self16->numChildren));
|
||||
} break;
|
||||
case Type_Node48: {
|
||||
auto *self48 = static_cast<Node48 *>(node);
|
||||
@@ -2884,9 +3077,7 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
|
||||
} break;
|
||||
case Type_Node256: {
|
||||
auto *self256 = static_cast<Node256 *>(node);
|
||||
for (auto v : self256->maxOfMax) {
|
||||
max = std::max(v, max);
|
||||
}
|
||||
max = std::max(max, horizontalMax16(self256->childMaxVersion, tls->zero));
|
||||
} break;
|
||||
default: // GCOVR_EXCL_LINE
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
@@ -2894,7 +3085,7 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
|
||||
setMaxVersion(node, max);
|
||||
}
|
||||
|
||||
void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
void addWriteRange(TaggedNodePointer &root, std::span<const uint8_t> begin,
|
||||
std::span<const uint8_t> end, InternalVersionT writeVersion,
|
||||
WriteContext *tls, ConflictSet::Impl *impl) {
|
||||
|
||||
@@ -2907,12 +3098,12 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
++tls->accum.range_writes;
|
||||
const bool beginIsPrefix = lcp == int(begin.size());
|
||||
|
||||
Node **useAsRoot = insert(&root, begin.subspan(0, lcp), writeVersion, tls);
|
||||
auto useAsRoot = insert(&root, begin.subspan(0, lcp), writeVersion, tls);
|
||||
|
||||
begin = begin.subspan(lcp, begin.size() - lcp);
|
||||
end = end.subspan(lcp, end.size() - lcp);
|
||||
|
||||
auto *beginNode = *insert(useAsRoot, begin, writeVersion, tls);
|
||||
Node *beginNode = *insert(useAsRoot, begin, writeVersion, tls);
|
||||
addKey(beginNode);
|
||||
if (!beginNode->entryPresent) {
|
||||
++tls->accum.entries_inserted;
|
||||
@@ -2923,7 +3114,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
}
|
||||
beginNode->entry.pointVersion = writeVersion;
|
||||
|
||||
auto *endNode = *insert(useAsRoot, end, writeVersion, tls);
|
||||
Node *endNode = *insert(useAsRoot, end, writeVersion, tls);
|
||||
addKey(endNode);
|
||||
if (!endNode->entryPresent) {
|
||||
++tls->accum.entries_inserted;
|
||||
@@ -2943,7 +3134,7 @@ void addWriteRange(Node *&root, std::span<const uint8_t> begin,
|
||||
assert(!beginNode->endOfRange);
|
||||
assert(!endNode->endOfRange);
|
||||
endNode->endOfRange = true;
|
||||
auto *iter = beginNode;
|
||||
Node *iter = beginNode;
|
||||
for (iter = nextLogical(iter); !iter->endOfRange;
|
||||
iter = erase(iter, tls, impl, /*logical*/ true)) {
|
||||
assert(!iter->endOfRange);
|
||||
@@ -3258,7 +3449,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
std::span<const uint8_t> removalKey;
|
||||
int64_t keyUpdates;
|
||||
|
||||
Node *root;
|
||||
TaggedNodePointer root;
|
||||
InternalVersionT oldestVersion;
|
||||
int64_t oldestVersionFullPrecision;
|
||||
int64_t oldestExtantVersion;
|
||||
@@ -3339,7 +3530,7 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
}
|
||||
};
|
||||
|
||||
Node *&getInTree(Node *n, ConflictSet::Impl *impl) {
|
||||
TaggedNodePointer &getInTree(Node *n, ConflictSet::Impl *impl) {
|
||||
return n->parent == nullptr ? impl->root
|
||||
: getChildExists(n->parent, n->parentsIndex);
|
||||
}
|
||||
@@ -3923,7 +4114,6 @@ struct __attribute__((visibility("default"))) PeakPrinter {
|
||||
|
||||
#ifdef ENABLE_MAIN
|
||||
|
||||
#define ANKERL_NANOBENCH_IMPLEMENT
|
||||
#include "third_party/nanobench.h"
|
||||
|
||||
template <int kN> void benchRezero() {
|
||||
@@ -3979,6 +4169,24 @@ template <int kN> void benchScan2() {
|
||||
});
|
||||
}
|
||||
|
||||
void benchHorizontal16() {
|
||||
ankerl::nanobench::Bench bench;
|
||||
InternalVersionT vs[16];
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
vs[i] = InternalVersionT(rand() % 1000 + 1000);
|
||||
}
|
||||
#if !USE_64_BIT
|
||||
InternalVersionT::zero = InternalVersionT(rand() % 1000);
|
||||
#endif
|
||||
bench.run("horizontal16", [&]() {
|
||||
bench.doNotOptimizeAway(horizontalMax16(vs, InternalVersionT::zero));
|
||||
});
|
||||
int x = rand() % 15 + 1;
|
||||
bench.run("horizontalUpTo16", [&]() {
|
||||
bench.doNotOptimizeAway(horizontalMaxUpTo16(vs, InternalVersionT::zero, x));
|
||||
});
|
||||
}
|
||||
|
||||
void benchLCP(int len) {
|
||||
ankerl::nanobench::Bench bench;
|
||||
std::vector<uint8_t> lhs(len);
|
||||
@@ -4011,11 +4219,7 @@ void printTree() {
|
||||
debugPrintDot(stdout, cs.root, &cs);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
benchLCP(i);
|
||||
}
|
||||
}
|
||||
int main(void) { benchHorizontal16(); }
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FUZZ
|
||||
|
||||
@@ -0,0 +1,256 @@
|
||||
#include <alloca.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <utility>
#ifdef __x86_64__
#include <immintrin.h>
#endif

#include "third_party/nanobench.h"
|
||||
|
||||
// A unit of benchmark work: a heap-allocated counter that is decremented once
// per step until it reaches zero. Schedulers drive a job by repeatedly
// invoking its `next` continuation.
struct Job {
  // Counter decremented by each step; the job is finished when it hits 0.
  int *input;
  // Returned void* is a function pointer to the next continuation. We have to
  // use void* because otherwise the type would be recursive.
  typedef void *(*continuation)(Job *);
  // The next step to run, or nullptr once the job has completed.
  continuation next;
};
|
||||
|
||||
// Performs one step of a job: decrement its counter, then (on x86-64) flush
// the counter's cache line so the next touch is a cache miss.
// Returns the continuation to run next, or nullptr when the job is finished.
void *stepJob(Job *j) {
  int remaining = --(*j->input);
#ifdef __x86_64__
  _mm_clflush(j->input);
#endif
  if (remaining == 0) {
    return nullptr;
  }
  return (void *)stepJob;
}
|
||||
|
||||
// Baseline scheduler: run each job to completion, one after another, by
// chasing its continuation function pointer until it returns nullptr.
void sequential(Job **jobs, int count) {
  for (int idx = 0; idx < count; ++idx) {
    Job *job = jobs[idx];
    for (;;) {
      auto cont = (Job::continuation)job->next(job);
      job->next = cont;
      if (cont == nullptr) {
        break;
      }
    }
  }
}
|
||||
|
||||
// Baseline scheduler without indirect calls: invoke stepJob directly (the
// callee is visible to the compiler) until each job reports completion.
void sequentialNoFuncPtr(Job **jobs, int count) {
  for (int k = 0; k < count; ++k) {
    Job *job = jobs[k];
    while (stepJob(job) != nullptr) {
    }
  }
}
|
||||
|
||||
// Round-robin scheduler over a shrinking array: step each live job in turn;
// when one finishes, overwrite its slot with the last live job and shrink the
// live region. Interleaving gives flushed counters time to be re-fetched.
void interleaveSwapping(Job **jobs, int remaining) {
  for (int cur = 0; remaining > 0;) {
    Job *job = jobs[cur];
    auto cont = (Job::continuation)job->next(job);
    job->next = cont;
    if (cont != nullptr) {
      ++cur;
    } else {
      // Compact: move the last live job into the vacated slot.
      jobs[cur] = jobs[--remaining];
    }
    // Wrap around once we pass the end of the live region.
    if (cur == remaining) {
      cur = 0;
    }
  }
}
|
||||
|
||||
// Round-robin scheduler that interleaves at most kConcurrent jobs at a time
// using a cyclic singly linked list of slot indices. When a job in a slot
// finishes, the slot is refilled with the next not-yet-started job; once all
// jobs have started, finished slots are spliced out of the ring instead.
void interleaveBoundedCyclicList(Job **jobs, int count) {
  if (count == 0) {
    return;
  }

  // Upper bound on how many jobs are in flight simultaneously.
  constexpr int kConcurrent = 32;
  Job *inProgress[kConcurrent];
  // nextJob[i] is the slot scheduled after slot i (a ring over live slots).
  int nextJob[kConcurrent];

  int started = std::min(kConcurrent, count);
  for (int i = 0; i < started; i++) {
    inProgress[i] = jobs[i];
    nextJob[i] = i + 1;
  }
  // Close the ring.
  nextJob[started - 1] = 0;

  int prevJob = started - 1;
  int job = 0;
  for (;;) {
    auto next = (Job::continuation)inProgress[job]->next(inProgress[job]);
    inProgress[job]->next = next;
    if (next == nullptr) {
      if (started == count) {
        // No jobs left to start: stop if this was the only live slot, else
        // unlink the slot from the ring and resume from its predecessor.
        if (prevJob == job)
          break;
        nextJob[prevJob] = nextJob[job];
        job = prevJob;
      } else {
        // Refill the freed slot with the next unstarted job.
        int temp = started++;
        inProgress[job] = jobs[temp];
      }
    }
    prevJob = job;
    job = nextJob[job];
  }
}
|
||||
|
||||
#ifndef __has_attribute
|
||||
#define __has_attribute(x) 0
|
||||
#endif
|
||||
|
||||
#if __has_attribute(musttail)
|
||||
#define MUSTTAIL __attribute__((musttail))
|
||||
#else
|
||||
#define MUSTTAIL
|
||||
#endif
|
||||
|
||||
// Shared state for the tail-call based scheduler (useTailCalls). Mirrors the
// locals of interleaveBoundedCyclicList, but lives in a struct so each
// musttail continuation can reach it through a single pointer argument.
struct Context {
  // Upper bound on how many jobs are in flight simultaneously.
  constexpr static int kConcurrent = 32;
  // All jobs; jobs[started..count) have not begun yet.
  Job **jobs;
  // The job currently occupying each slot.
  Job *inProgress[kConcurrent];
  // Per-slot continuation to run next (stepJobTailCall while the job lives).
  void (*continuation[kConcurrent])(Context *, int64_t prevJob, int64_t job,
                                    int64_t started, int64_t count);
  // nextJob[i] is the slot scheduled after slot i (a ring over live slots).
  int nextJob[kConcurrent];
};
|
||||
|
||||
// Continuation: advance the ring to the next slot and tail-call its
// continuation. The scheduler "loop" is a chain of tail calls that does not
// grow the stack when the musttail attribute is honored.
void keepGoing(Context *context, int64_t prevJob, int64_t job, int64_t started,
               int64_t count) {
  prevJob = job;
  job = context->nextJob[job];
  MUSTTAIL return context->continuation[job](context, prevJob, job, started,
                                             count);
}
|
||||
|
||||
void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
|
||||
int64_t started, int64_t count);
|
||||
|
||||
// Continuation run when the job in `job`'s slot has finished. Either refills
// the slot with the next unstarted job, splices the slot out of the ring, or
// returns (terminating the whole tail-call chain) when no live slots remain.
void complete(Context *context, int64_t prevJob, int64_t job, int64_t started,
              int64_t count) {
  if (started == count) {
    // Nothing left to start: stop if this was the only live slot, otherwise
    // unlink it from the ring and resume from its predecessor.
    if (prevJob == job) {
      return;
    }
    context->nextJob[prevJob] = context->nextJob[job];
    job = prevJob;
  } else {
    // Refill the freed slot with the next unstarted job.
    context->inProgress[job] = context->jobs[started++];
    context->continuation[job] = stepJobTailCall;
  }
  prevJob = job;
  job = context->nextJob[job];
  MUSTTAIL return context->continuation[job](context, prevJob, job, started,
                                             count);
}
|
||||
|
||||
// Continuation that performs one step of the job in `job`'s slot (same work
// as stepJob: decrement and flush the counter), then tail-calls either the
// completion path or the keep-going path.
void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
                     int64_t started, int64_t count) {
  auto *j = context->inProgress[job];
  auto done = --(*j->input) == 0;
#ifdef __x86_64__
  // Evict the counter's cache line so the next touch is a cache miss.
  _mm_clflush(j->input);
#endif
  if (done) {
    MUSTTAIL return complete(context, prevJob, job, started, count);
  } else {
    context->continuation[job] = stepJobTailCall;
    MUSTTAIL return keepGoing(context, prevJob, job, started, count);
  }
}
|
||||
|
||||
// Scheduler that drives jobs through musttail continuations (see
// stepJobTailCall): per-slot state lives in a Context, and each step
// tail-calls into the next slot's continuation. Same scheduling policy as
// interleaveBoundedCyclicList.
void useTailCalls(Job **jobs, int count) {
  if (count == 0) {
    return;
  }
  Context ctx;
  ctx.jobs = jobs;
  const int64_t initial = std::min(Context::kConcurrent, count);
  for (int slot = 0; slot < initial; ++slot) {
    ctx.inProgress[slot] = jobs[slot];
    ctx.nextJob[slot] = slot + 1;
    ctx.continuation[slot] = stepJobTailCall;
  }
  // Close the ring: the last slot links back to slot 0.
  ctx.nextJob[initial - 1] = 0;
  return ctx.continuation[0](&ctx, /*prevJob=*/initial - 1, /*job=*/0, initial,
                             count);
}
|
||||
|
||||
// Round-robin scheduler over a cyclic singly linked list of job indices.
// Unlike interleaveBoundedCyclicList this links *all* jobs into the ring at
// once (unbounded concurrency); finished jobs are unlinked in O(1).
//
// nextJob[i] is the index of the job scheduled after job i.
void interleaveCyclicList(Job **jobs, int count) {
  // Guard the empty case: without it, `nextJob[count - 1] = 0` below writes
  // out of bounds (and alloca(0) is dubious). Matches the guard in
  // interleaveBoundedCyclicList.
  if (count == 0) {
    return;
  }
  auto *nextJob = (int *)alloca(sizeof(int) * count);

  for (int i = 0; i < count - 1; ++i) {
    nextJob[i] = i + 1;
  }
  // Close the ring.
  nextJob[count - 1] = 0;

  int prevJob = count - 1;
  int job = 0;
  for (;;) {
    auto next = (Job::continuation)jobs[job]->next(jobs[job]);
    jobs[job]->next = next;
    if (next == nullptr) {
      // Job finished: stop if it was the last one left, otherwise splice it
      // out of the ring and continue from its predecessor.
      if (prevJob == job)
        break;
      nextJob[prevJob] = nextJob[job];
      job = prevJob;
    }
    prevJob = job;
    job = nextJob[job];
  }
}
|
||||
|
||||
// Benchmark harness: builds kNumJobs jobs whose counters each need a random
// 3..7 steps, verifies every scheduler actually drives all jobs to
// completion, then measures each scheduler with nanobench (results reported
// relative to the first entry via bench.relative(true)).
int main() {
  ankerl::nanobench::Bench bench;

  constexpr int kNumJobs = 10000;
  bench.relative(true);

  Job jobs[kNumJobs];
  // Pristine copy source; each run operates on jobsCopy so `jobs` keeps the
  // original `next` pointers.
  Job jobsCopy[kNumJobs];
  int iters = 0;
  int originalInput[kNumJobs];
  for (int i = 0; i < kNumJobs; ++i) {
    // 3..7 steps per job; counters live on the heap and stepJob flushes them
    // from cache after every decrement.
    originalInput[i] = rand() % 5 + 3;
    jobs[i].input = new int{originalInput[i]};
    jobs[i].next = stepJob;
    iters += *jobs[i].input;
  }
  // One batch = total number of steps, so nanobench reports per-step cost.
  bench.batch(iters);

  for (auto [scheduler, name] :
       {std::make_pair(sequentialNoFuncPtr, "sequentialNoFuncPtr"),
        std::make_pair(sequential, "sequential"),
        std::make_pair(useTailCalls, "useTailCalls"),
        std::make_pair(interleaveSwapping, "interleavingSwapping"),
        std::make_pair(interleaveBoundedCyclicList,
                       "interleaveBoundedCyclicList"),
        std::make_pair(interleaveCyclicList, "interleaveCyclicList")}) {
    // Correctness check first: reset the counters, run the scheduler on a
    // fresh copy of the jobs, and require every counter to reach zero.
    for (int i = 0; i < kNumJobs; ++i) {
      *jobs[i].input = originalInput[i];
    }
    memcpy(jobsCopy, jobs, sizeof(jobs));
    Job *ps[kNumJobs];
    for (int i = 0; i < kNumJobs; ++i) {
      ps[i] = jobsCopy + i;
    }
    scheduler(ps, kNumJobs);
    for (int i = 0; i < kNumJobs; ++i) {
      if (*jobsCopy[i].input != 0) {
        fprintf(stderr, "%s failed\n", name);
        abort();
      }
    }

    // Timed run: identical reset + copy + schedule inside the measured
    // lambda (setup cost is included for every scheduler equally).
    bench.run(name, [&]() {
      for (int i = 0; i < kNumJobs; ++i) {
        *jobs[i].input = originalInput[i];
      }
      memcpy(jobsCopy, jobs, sizeof(jobs));
      Job *ps[kNumJobs];
      for (int i = 0; i < kNumJobs; ++i) {
        ps[i] = jobsCopy + i;
      }
      scheduler(ps, kNumJobs);
    });
  }
  for (int i = 0; i < kNumJobs; ++i) {
    delete jobs[i].input;
  }
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,2 @@
|
||||
#define ANKERL_NANOBENCH_IMPLEMENT
|
||||
#include "third_party/nanobench.h"
|
||||
Reference in New Issue
Block a user