14 Commits

Author SHA1 Message Date
andrew ed67486077 Reordering seems to improve codegen
Tests / Clang total: 3244, passed: 3244
Clang |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / 64 bit versions total: 3244, passed: 3244
Tests / Debug total: 3242, passed: 3242
Tests / SIMD fallback total: 3244, passed: 3244
Tests / Release [gcc] total: 3244, passed: 3244
GNU C Compiler (gcc) |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 2419, passed: 2419
Tests / Coverage total: 2437, passed: 2437
Code Coverage #### Project Overview No changes detected, that affect the code coverage. * Line Coverage: 98.98% (1938/1958) * Branch Coverage: 68.67% (1497/2180) * Complexity Density: 0.00 * Lines of Code: 1958 #### Quality Gates Summary Output truncated.
weaselab/conflict-set/pipeline/head There was a failure building this commit
2024-09-23 15:28:51 -07:00
andrew b376f6fdd5 WIP
Tests / Clang total: 3244, passed: 3244
Clang |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / 64 bit versions total: 3244, passed: 3244
Tests / Debug total: 3242, passed: 3242
Tests / SIMD fallback total: 3244, passed: 3244
Tests / Release [gcc] total: 3244, passed: 3244
GNU C Compiler (gcc) |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 2419, passed: 2419
Tests / Coverage total: 2437, passed: 2437
Code Coverage #### Project Overview No changes detected, that affect the code coverage. * Line Coverage: 98.98% (1938/1958) * Branch Coverage: 68.67% (1497/2180) * Complexity Density: 0.00 * Lines of Code: 1958 #### Quality Gates Summary Output truncated.
weaselab/conflict-set/pipeline/head There was a failure building this commit
2024-09-23 15:11:48 -07:00
andrew 6de63dd3fe Use preserve_none and put continuation array in CheckAll 2024-09-23 14:53:16 -07:00
andrew 3e5f13bf54 WIP - tests pass 2024-09-23 13:32:56 -07:00
andrew e7e1d1f7f5 Add tail-call based interleaving approach 2024-09-23 12:52:30 -07:00
andrew 442658e983 Target ~1GB memory usage in server bench 2024-09-21 14:28:15 -07:00
andrew 26f602215e Accentuate cache misses for point reads in server_bench 2024-09-14 22:42:40 -07:00
andrew 98236f81cb Add missing __builtin_prefetch 2024-09-14 22:41:58 -07:00
andrew 3593b72880 Disallow checking SIM_CACHE_MISSES=1 2024-09-10 22:23:37 -07:00
andrew 814aac4ea7 Experiment with causing cache misses 2024-09-10 22:06:00 -07:00
andrew 0550fa0016 Add "iter" state 2024-09-10 17:22:10 -07:00
andrew fe5cfb0336 Remove redundant cast 2024-09-10 17:06:45 -07:00
andrew 82203515a0 check_point_read_state_machine::down_left_spine 2024-09-10 17:05:09 -07:00
andrew 465372c734 Scaffolding to prepare for interleaving checks 2024-09-10 16:10:57 -07:00
18 changed files with 466 additions and 238 deletions
+8
View File
@@ -24,6 +24,14 @@ repos:
entry: "^#define SHOW_MEMORY 1$"
language: pygrep
types: [c++]
- repo: local
hooks:
- id: sim cache misses check
name: disallow checking in SIM_CACHE_MISSES=1
description: disallow checking in SIM_CACHE_MISSES=1
entry: "^#define SIM_CACHE_MISSES 1$"
language: pygrep
types: [c++]
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: a23f6b85d0fdd5bb9d564e2579e678033debbdff # frozen: v0.10.0.1
hooks:
+5
View File
@@ -350,6 +350,11 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND BUILD_TESTING)
set_target_properties(server_bench PROPERTIES SKIP_BUILD_RPATH ON)
add_executable(interleaving_test InterleavingTest.cpp)
# work around lack of musttail for gcc
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_options(interleaving_test PRIVATE -Og
-foptimize-sibling-calls)
endif()
target_link_libraries(interleaving_test PRIVATE nanobench)
endif()
+332 -173
View File
@@ -48,6 +48,17 @@ limitations under the License.
#endif
#endif
#define SIM_CACHE_MISSES 0
#if SIM_CACHE_MISSES
constexpr void simCacheMiss(void *x) {
if (x) {
_mm_clflush(x);
}
}
#else
constexpr void simCacheMiss(void *) {}
#endif
#include <memcheck.h>
using namespace weaselab;
@@ -766,8 +777,6 @@ private:
int getNodeIndex(Node3 *self, uint8_t index) {
Node3 *n = (Node3 *)self;
assume(n->numChildren >= 1);
assume(n->numChildren <= 3);
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
@@ -776,18 +785,6 @@ int getNodeIndex(Node3 *self, uint8_t index) {
return -1;
}
int getNodeIndexExists(Node3 *self, uint8_t index) {
Node3 *n = (Node3 *)self;
assume(n->numChildren >= 1);
assume(n->numChildren <= 3);
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
}
}
__builtin_unreachable(); // GCOVR_EXCL_LINE
}
int getNodeIndex(Node16 *self, uint8_t index) {
#ifdef HAS_AVX
@@ -848,62 +845,31 @@ int getNodeIndex(Node16 *self, uint8_t index) {
#endif
}
int getNodeIndexExists(Node16 *self, uint8_t index) {
#ifdef HAS_AVX
__m128i key_vec = _mm_set1_epi8(index);
__m128i indices;
memcpy(&indices, self->index, Node16::kMaxNodes);
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
uint32_t mask = (1 << self->numChildren) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
assume(bitfield != 0);
return std::countr_zero(bitfield);
#elif defined(HAS_ARM_NEON)
// Based on
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
uint8x16_t indices;
memcpy(&indices, self->index, Node16::kMaxNodes);
// 0xff for each match
uint16x8_t results =
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
assume(self->numChildren <= Node16::kMaxNodes);
uint64_t mask = self->numChildren == 16
? uint64_t(-1)
: (uint64_t(1) << (self->numChildren * 4)) - 1;
// 0xf for each match in valid range
uint64_t bitfield =
vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0) & mask;
assume(bitfield != 0);
return std::countr_zero(bitfield) / 4;
#else
for (int i = 0; i < self->numChildren; ++i) {
if (self->index[i] == index) {
return i;
}
}
__builtin_unreachable(); // GCOVR_EXCL_LINE
#endif
}
// Precondition - an entry for index must exist in the node
Node *&getChildExists(Node3 *self, uint8_t index) {
return self->children[getNodeIndexExists(self, index)];
auto &result = self->children[getNodeIndex(self, index)];
simCacheMiss(result);
return result;
}
// Precondition - an entry for index must exist in the node
Node *&getChildExists(Node16 *self, uint8_t index) {
return self->children[getNodeIndexExists(self, index)];
auto &result = self->children[getNodeIndex(self, index)];
simCacheMiss(result);
return result;
}
// Precondition - an entry for index must exist in the node
Node *&getChildExists(Node48 *self, uint8_t index) {
assert(self->bitSet.test(index));
return self->children[self->index[index]];
auto &result = self->children[self->index[index]];
simCacheMiss(result);
return result;
}
// Precondition - an entry for index must exist in the node
Node *&getChildExists(Node256 *self, uint8_t index) {
assert(self->bitSet.test(index));
return self->children[index];
auto &result = self->children[index];
simCacheMiss(result);
return result;
}
// Precondition - an entry for index must exist in the node
@@ -938,12 +904,12 @@ InternalVersionT maxVersion(Node *n) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
int i = getNodeIndexExists(n3, index);
int i = getNodeIndex(n3, index);
return n3->childMaxVersion[i];
}
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
int i = getNodeIndexExists(n16, index);
int i = getNodeIndex(n16, index);
return n16->childMaxVersion[i];
}
case Type_Node48: {
@@ -971,12 +937,12 @@ InternalVersionT exchangeMaxVersion(Node *n, InternalVersionT newMax) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
int i = getNodeIndexExists(n3, index);
int i = getNodeIndex(n3, index);
return std::exchange(n3->childMaxVersion[i], newMax);
}
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
int i = getNodeIndexExists(n16, index);
int i = getNodeIndex(n16, index);
return std::exchange(n16->childMaxVersion[i], newMax);
}
case Type_Node48: {
@@ -1005,13 +971,13 @@ void setMaxVersion(Node *n, InternalVersionT newMax) {
__builtin_unreachable(); // GCOVR_EXCL_LINE
case Type_Node3: {
auto *n3 = static_cast<Node3 *>(n);
int i = getNodeIndexExists(n3, index);
int i = getNodeIndex(n3, index);
n3->childMaxVersion[i] = newMax;
return;
}
case Type_Node16: {
auto *n16 = static_cast<Node16 *>(n);
int i = getNodeIndexExists(n16, index);
int i = getNodeIndex(n16, index);
n16->childMaxVersion[i] = newMax;
return;
}
@@ -1083,6 +1049,7 @@ ChildAndMaxVersion getChildAndMaxVersion(Node3 *self, uint8_t index) {
if (i < 0) {
return {};
}
simCacheMiss(self->children[i]);
return {self->children[i], self->childMaxVersion[i]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) {
@@ -1090,6 +1057,7 @@ ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) {
if (i < 0) {
return {};
}
simCacheMiss(self->children[i]);
return {self->children[i], self->childMaxVersion[i]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) {
@@ -1097,9 +1065,11 @@ ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) {
if (i < 0) {
return {};
}
simCacheMiss(self->children[i]);
return {self->children[i], self->childMaxVersion[i]};
}
ChildAndMaxVersion getChildAndMaxVersion(Node256 *self, uint8_t index) {
simCacheMiss(self->children[index]);
return {self->children[index], self->childMaxVersion[index]};
}
@@ -1123,10 +1093,9 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
Node *getChildGeq(Node0 *, int) { return nullptr; }
Node *getChildGeq(Node3 *n, int child) {
assume(n->numChildren >= 1);
assume(n->numChildren <= 3);
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] >= child) {
simCacheMiss(n->children[i]);
return n->children[i];
}
}
@@ -1145,7 +1114,10 @@ Node *getChildGeq(Node16 *self, int child) {
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
int mask = (1 << self->numChildren) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
return bitfield == 0 ? nullptr : self->children[std::countr_zero(bitfield)];
auto *result =
bitfield == 0 ? nullptr : self->children[std::countr_zero(bitfield)];
simCacheMiss(result);
return result;
#elif defined(HAS_ARM_NEON)
uint8x16_t indices;
memcpy(&indices, self->index, sizeof(self->index));
@@ -1181,13 +1153,16 @@ Node *getChildGeq(Node48 *self, int child) {
if (c < 0) {
return nullptr;
}
return self->children[self->index[c]];
auto *result = self->children[self->index[c]];
simCacheMiss(result);
return result;
}
Node *getChildGeq(Node256 *self, int child) {
int c = self->bitSet.firstSetGeq(child);
if (c < 0) {
return nullptr;
}
simCacheMiss(self->children[c]);
return self->children[c];
}
@@ -1211,20 +1186,26 @@ Node *getChildGeq(Node *self, int child) {
// Precondition: self has a child
Node *getFirstChildExists(Node3 *self) {
assert(self->numChildren > 0);
simCacheMiss(self->children[0]);
return self->children[0];
}
// Precondition: self has a child
Node *getFirstChildExists(Node16 *self) {
assert(self->numChildren > 0);
simCacheMiss(self->children[0]);
return self->children[0];
}
// Precondition: self has a child
Node *getFirstChildExists(Node48 *self) {
return self->children[self->index[self->bitSet.firstSetGeq(0)]];
auto *result = self->children[self->index[self->bitSet.firstSetGeq(0)]];
simCacheMiss(result);
return result;
}
// Precondition: self has a child
Node *getFirstChildExists(Node256 *self) {
return self->children[self->bitSet.firstSetGeq(0)];
auto *result = self->children[self->bitSet.firstSetGeq(0)];
simCacheMiss(result);
return result;
}
// Precondition: self has a child
@@ -1604,6 +1585,7 @@ __attribute__((target("avx512f"))) void rezero16(InternalVersionT *vs,
_mm512_sub_epi32(_mm512_loadu_epi32(vs), zvec), _mm512_setzero_epi32());
_mm512_mask_storeu_epi32(vs, m, zvec);
}
__attribute__((target("default")))
#endif
@@ -2470,7 +2452,6 @@ checkMaxBetweenExclusive(Node *n, int begin, int end,
}
__attribute__((target("default")))
#endif
bool checkMaxBetweenExclusive(Node *n, int begin, int end,
InternalVersionT readVersion, ReadContext *tls) {
return checkMaxBetweenExclusiveImpl<false>(n, begin, end, readVersion, tls);
@@ -2910,72 +2891,6 @@ void addPointWrite(Node *&root, std::span<const uint8_t> key,
}
}
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
__attribute__((target("avx512f"))) InternalVersionT
horizontalMaxUpTo16(InternalVersionT *vs, InternalVersionT z, int len) {
assume(len <= 16);
#if USE_64_BIT
// Hope it gets vectorized
InternalVersionT max = vs[0];
for (int i = 1; i < len; ++i) {
max = std::max(vs[i], max);
}
return max;
#else
uint32_t zero;
memcpy(&zero, &z, sizeof(zero));
auto zeroVec = _mm512_set1_epi32(zero);
auto max = InternalVersionT(
zero +
_mm512_reduce_max_epi32(_mm512_sub_epi32(
_mm512_mask_loadu_epi32(zeroVec, _mm512_int2mask((1 << len) - 1), vs),
zeroVec)));
return max;
#endif
}
__attribute__((target("default")))
#endif
InternalVersionT
horizontalMaxUpTo16(InternalVersionT *vs, InternalVersionT, int len) {
assume(len <= 16);
InternalVersionT max = vs[0];
for (int i = 1; i < len; ++i) {
max = std::max(vs[i], max);
}
return max;
}
#if defined(HAS_AVX) && !defined(__SANITIZE_THREAD__)
__attribute__((target("avx512f"))) InternalVersionT
horizontalMax16(InternalVersionT *vs, InternalVersionT z) {
#if USE_64_BIT
// Hope it gets vectorized
InternalVersionT max = vs[0];
for (int i = 1; i < 16; ++i) {
max = std::max(vs[i], max);
}
return max;
#else
uint32_t zero;
memcpy(&zero, &z, sizeof(zero));
auto zeroVec = _mm512_set1_epi32(zero);
return InternalVersionT(zero + _mm512_reduce_max_epi32(_mm512_sub_epi32(
_mm512_loadu_epi32(vs), zeroVec)));
#endif
}
__attribute__((target("default")))
#endif
InternalVersionT
horizontalMax16(InternalVersionT *vs, InternalVersionT) {
InternalVersionT max = vs[0];
for (int i = 1; i < 16; ++i) {
max = std::max(vs[i], max);
}
return max;
}
// Precondition: `node->entryPresent`, and node is not the root
void fixupMaxVersion(Node *node, WriteContext *tls) {
assert(node->parent);
@@ -2987,13 +2902,15 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
break;
case Type_Node3: {
auto *self3 = static_cast<Node3 *>(node);
max = std::max(max, horizontalMaxUpTo16(self3->childMaxVersion, tls->zero,
self3->numChildren));
for (int i = 0; i < self3->numChildren; ++i) {
max = std::max(self3->childMaxVersion[i], max);
}
} break;
case Type_Node16: {
auto *self16 = static_cast<Node16 *>(node);
max = std::max(max, horizontalMaxUpTo16(self16->childMaxVersion, tls->zero,
self16->numChildren));
for (int i = 0; i < self16->numChildren; ++i) {
max = std::max(self16->childMaxVersion[i], max);
}
} break;
case Type_Node48: {
auto *self48 = static_cast<Node48 *>(node);
@@ -3003,7 +2920,9 @@ void fixupMaxVersion(Node *node, WriteContext *tls) {
} break;
case Type_Node256: {
auto *self256 = static_cast<Node256 *>(node);
max = std::max(max, horizontalMax16(self256->childMaxVersion, tls->zero));
for (auto v : self256->maxOfMax) {
max = std::max(v, max);
}
} break;
default: // GCOVR_EXCL_LINE
__builtin_unreachable(); // GCOVR_EXCL_LINE
@@ -3126,34 +3045,288 @@ Node *firstGeqPhysical(Node *n, const std::span<const uint8_t> key) {
}
}
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif
#if __has_attribute(preserve_none)
#define CONTINUATION_CALLING_CONVENTION __attribute__((preserve_none))
#else
#define CONTINUATION_CALLING_CONVENTION
#endif
typedef CONTINUATION_CALLING_CONVENTION void (*continuation)(struct CheckAll *,
int64_t prevJob,
int64_t job,
int64_t started,
int64_t count);
// State relevant to a particular query
struct CheckJob {
void setResult(bool ok) {
*result = ok ? ConflictSet::Commit : ConflictSet::Conflict;
}
[[nodiscard]] continuation init(const ConflictSet::ReadRange *read,
ConflictSet::Result *result, Node *root,
int64_t oldestVersionFullPrecision,
ReadContext *tls);
Node *n;
ChildAndMaxVersion childAndVersion;
std::span<const uint8_t> begin;
InternalVersionT readVersion;
ConflictSet::Result *result;
};
// State relevant to all queries
struct CheckAll {
constexpr static int kConcurrent = 32;
CheckJob inProgress[kConcurrent];
continuation next[kConcurrent];
int nextJob[kConcurrent];
Node *root;
int64_t oldestVersionFullPrecision;
ReadContext *tls;
const ConflictSet::ReadRange *queries;
ConflictSet::Result *results;
};
CONTINUATION_CALLING_CONVENTION void keepGoing(CheckAll *context,
int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
prevJob = job;
job = context->nextJob[job];
MUSTTAIL return context->next[job](context, prevJob, job, started, count);
}
CONTINUATION_CALLING_CONVENTION void complete(CheckAll *context,
int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
if (started == count) {
if (prevJob == job) {
return;
}
context->nextJob[prevJob] = context->nextJob[job];
job = prevJob;
} else {
int temp = started++;
context->next[job] = context->inProgress[job].init(
context->queries + temp, context->results + temp, context->root,
context->oldestVersionFullPrecision, context->tls);
}
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
namespace check_point_read_state_machine {
CONTINUATION_CALLING_CONVENTION void
down_left_spine(struct CheckAll *, int64_t prevJob, int64_t job,
int64_t started, int64_t count);
CONTINUATION_CALLING_CONVENTION void iter(struct CheckAll *, int64_t prevJob,
int64_t job, int64_t started,
int64_t count);
CONTINUATION_CALLING_CONVENTION void begin(struct CheckAll *, int64_t prevJob,
int64_t job, int64_t started,
int64_t count);
void begin(struct CheckAll *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
++context->tls->point_read_accum;
#if DEBUG_VERBOSE && !defined(NDEBUG)
fprintf(stderr, "Check point read: %s\n", printable(key).c_str());
#endif
auto *j = context->inProgress + job;
if (j->begin.size() == 0) {
if (j->n->entryPresent) {
j->setResult(j->n->entry.pointVersion <= j->readVersion);
MUSTTAIL return complete(context, prevJob, job, started, count);
}
j->n = getFirstChildExists(j->n);
context->next[job] = down_left_spine;
__builtin_prefetch(j->n);
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
j->childAndVersion = getChildAndMaxVersion(j->n, j->begin[0]);
context->next[job] = iter;
__builtin_prefetch(j->childAndVersion.child);
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
void iter(struct CheckAll *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
auto *j = context->inProgress + job;
if (j->childAndVersion.child == nullptr) {
auto c = getChildGeq(j->n, j->begin[0]);
if (c != nullptr) {
j->n = c;
context->next[job] = down_left_spine;
__builtin_prefetch(j->n);
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
} else {
j->n = nextSibling(j->n);
if (j->n == nullptr) {
j->setResult(true);
MUSTTAIL return complete(context, prevJob, job, started, count);
}
context->next[job] = down_left_spine;
__builtin_prefetch(j->n);
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
}
j->n = j->childAndVersion.child;
j->begin = j->begin.subspan(1, j->begin.size() - 1);
if (j->n->partialKeyLen > 0) {
int commonLen = std::min<int>(j->n->partialKeyLen, j->begin.size());
int i = longestCommonPrefix(j->n->partialKey(), j->begin.data(), commonLen);
if (i < commonLen) {
auto c = j->n->partialKey()[i] <=> j->begin[i];
if (c > 0) {
context->next[job] = down_left_spine;
MUSTTAIL return down_left_spine(context, prevJob, job, started, count);
} else {
j->n = nextSibling(j->n);
if (j->n == nullptr) {
j->setResult(true);
MUSTTAIL return complete(context, prevJob, job, started, count);
}
context->next[job] = down_left_spine;
__builtin_prefetch(j->n);
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
}
if (commonLen == j->n->partialKeyLen) {
// partial key matches
j->begin = j->begin.subspan(commonLen, j->begin.size() - commonLen);
} else if (j->n->partialKeyLen > int(j->begin.size())) {
// n is the first physical node greater than remaining, and there's no
// eq node
context->next[job] = down_left_spine;
MUSTTAIL return down_left_spine(context, prevJob, job, started, count);
}
}
if (j->childAndVersion.maxVersion <= j->readVersion) {
++context->tls->point_read_short_circuit_accum;
j->setResult(true);
MUSTTAIL return complete(context, prevJob, job, started, count);
}
++context->tls->point_read_iterations_accum;
if (j->begin.size() == 0) {
if (j->n->entryPresent) {
j->setResult(j->n->entry.pointVersion <= j->readVersion);
MUSTTAIL return complete(context, prevJob, job, started, count);
}
j->n = getFirstChildExists(j->n);
context->next[job] = down_left_spine;
__builtin_prefetch(j->n);
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
j->childAndVersion = getChildAndMaxVersion(j->n, j->begin[0]);
__builtin_prefetch(j->childAndVersion.child);
// j->next is already iter
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
void down_left_spine(struct CheckAll *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
auto *j = context->inProgress + job;
if (j->n->entryPresent) {
j->setResult(j->n->entry.rangeVersion <= j->readVersion);
MUSTTAIL return complete(context, prevJob, job, started, count);
}
j->n = getFirstChildExists(j->n);
__builtin_prefetch(j->n);
// j->next is already down_left_spine
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
} // namespace check_point_read_state_machine
continuation CheckJob::init(const ConflictSet::ReadRange *read,
ConflictSet::Result *result, Node *root,
int64_t oldestVersionFullPrecision,
ReadContext *tls) {
auto begin = std::span<const uint8_t>(read->begin.p, read->begin.len);
auto end = std::span<const uint8_t>(read->end.p, read->end.len);
if (read->readVersion < oldestVersionFullPrecision) {
*result = ConflictSet::TooOld;
return complete;
} else if (end.size() == 0) {
this->begin = begin;
this->n = root;
this->readVersion = InternalVersionT(read->readVersion);
this->result = result;
return check_point_read_state_machine::begin;
// *result =
// checkPointRead(root, begin, InternalVersionT(read->readVersion), tls)
// ? ConflictSet::Commit
// : ConflictSet::Conflict;
// return complete;
} else {
*result = checkRangeRead(root, begin, end,
InternalVersionT(read->readVersion), tls)
? ConflictSet::Commit
: ConflictSet::Conflict;
return complete;
}
}
struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
void check(const ReadRange *reads, Result *result, int count) {
assert(oldestVersionFullPrecision >=
newestVersionFullPrecision - kNominalVersionWindow);
if (count == 0) {
return;
}
ReadContext tls;
tls.impl = this;
int64_t check_byte_accum = 0;
CheckAll context;
context.oldestVersionFullPrecision = oldestVersionFullPrecision;
context.queries = reads;
context.results = result;
context.root = root;
context.tls = &tls;
int64_t started = std::min(context.kConcurrent, count);
for (int i = 0; i < started; i++) {
context.next[i] = context.inProgress[i].init(
reads + i, result + i, root, oldestVersionFullPrecision, &tls);
context.nextJob[i] = i + 1;
}
context.nextJob[started - 1] = 0;
int prevJob = started - 1;
int job = 0;
context.next[job](&context, prevJob, job, started, count);
for (int i = 0; i < count; ++i) {
assert(reads[i].readVersion >= 0);
assert(reads[i].readVersion <= newestVersionFullPrecision);
const auto &r = reads[i];
check_byte_accum += r.begin.len + r.end.len;
auto begin = std::span<const uint8_t>(r.begin.p, r.begin.len);
auto end = std::span<const uint8_t>(r.end.p, r.end.len);
assert(oldestVersionFullPrecision >=
newestVersionFullPrecision - kNominalVersionWindow);
result[i] =
reads[i].readVersion < oldestVersionFullPrecision ? TooOld
: (end.size() > 0
? checkRangeRead(root, begin, end,
InternalVersionT(reads[i].readVersion), &tls)
: checkPointRead(root, begin,
InternalVersionT(reads[i].readVersion), &tls))
? Commit
: Conflict;
tls.commits_accum += result[i] == Commit;
tls.conflicts_accum += result[i] == Conflict;
tls.too_olds_accum += result[i] == TooOld;
}
point_read_total.add(tls.point_read_accum);
prefix_read_total.add(tls.prefix_read_accum);
range_read_total.add(tls.range_read_accum);
@@ -4095,24 +4268,6 @@ template <int kN> void benchScan2() {
});
}
void benchHorizontal16() {
ankerl::nanobench::Bench bench;
InternalVersionT vs[16];
for (int i = 0; i < 16; ++i) {
vs[i] = InternalVersionT(rand() % 1000 + 1000);
}
#if !USE_64_BIT
InternalVersionT::zero = InternalVersionT(rand() % 1000);
#endif
bench.run("horizontal16", [&]() {
bench.doNotOptimizeAway(horizontalMax16(vs, InternalVersionT::zero));
});
int x = rand() % 15 + 1;
bench.run("horizontalUpTo16", [&]() {
bench.doNotOptimizeAway(horizontalMaxUpTo16(vs, InternalVersionT::zero, x));
});
}
void benchLCP(int len) {
ankerl::nanobench::Bench bench;
std::vector<uint8_t> lhs(len);
@@ -4145,7 +4300,11 @@ void printTree() {
debugPrintDot(stdout, cs.root, &cs);
}
int main(void) { benchHorizontal16(); }
int main(void) {
for (int i = 0; i < 256; ++i) {
benchLCP(i);
}
}
#endif
#ifdef ENABLE_FUZZ
+83 -5
View File
@@ -22,9 +22,6 @@ void *stepJob(Job *j) {
return done ? nullptr : (void *)stepJob;
}
// So we can look at the disassembly more easily
extern "C" {
void sequential(Job **jobs, int count) {
for (int i = 0; i < count; ++i) {
do {
@@ -94,6 +91,87 @@ void interleaveBoundedCyclicList(Job **jobs, int count) {
}
}
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif
struct Context {
constexpr static int kConcurrent = 32;
Job **jobs;
Job *inProgress[kConcurrent];
void (*continuation[kConcurrent])(Context *, int64_t prevJob, int64_t job,
int64_t started, int64_t count);
int nextJob[kConcurrent];
};
void keepGoing(Context *context, int64_t prevJob, int64_t job, int64_t started,
int64_t count) {
prevJob = job;
job = context->nextJob[job];
MUSTTAIL return context->continuation[job](context, prevJob, job, started,
count);
}
void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count);
void complete(Context *context, int64_t prevJob, int64_t job, int64_t started,
int64_t count) {
if (started == count) {
if (prevJob == job) {
return;
}
context->nextJob[prevJob] = context->nextJob[job];
job = prevJob;
} else {
context->inProgress[job] = context->jobs[started++];
context->continuation[job] = stepJobTailCall;
}
prevJob = job;
job = context->nextJob[job];
MUSTTAIL return context->continuation[job](context, prevJob, job, started,
count);
}
void stepJobTailCall(Context *context, int64_t prevJob, int64_t job,
int64_t started, int64_t count) {
auto *j = context->inProgress[job];
auto done = --(*j->input) == 0;
#ifdef __x86_64__
_mm_clflush(j->input);
#endif
if (done) {
MUSTTAIL return complete(context, prevJob, job, started, count);
} else {
context->continuation[job] = stepJobTailCall;
MUSTTAIL return keepGoing(context, prevJob, job, started, count);
}
}
void useTailCalls(Job **jobs, int count) {
if (count == 0) {
return;
}
Context context;
context.jobs = jobs;
int64_t started = std::min(Context::kConcurrent, count);
for (int i = 0; i < started; i++) {
context.inProgress[i] = jobs[i];
context.nextJob[i] = i + 1;
context.continuation[i] = stepJobTailCall;
}
context.nextJob[started - 1] = 0;
int prevJob = started - 1;
int job = 0;
return context.continuation[job](&context, prevJob, job, started, count);
}
void interleaveCyclicList(Job **jobs, int count) {
auto *nextJob = (int *)alloca(sizeof(int) * count);
@@ -117,12 +195,11 @@ void interleaveCyclicList(Job **jobs, int count) {
job = nextJob[job];
}
}
}
int main() {
ankerl::nanobench::Bench bench;
constexpr int kNumJobs = 100;
constexpr int kNumJobs = 10000;
bench.relative(true);
Job jobs[kNumJobs];
@@ -140,6 +217,7 @@ int main() {
for (auto [scheduler, name] :
{std::make_pair(sequentialNoFuncPtr, "sequentialNoFuncPtr"),
std::make_pair(sequential, "sequential"),
std::make_pair(useTailCalls, "useTailCalls"),
std::make_pair(interleaveSwapping, "interleavingSwapping"),
std::make_pair(interleaveBoundedCyclicList,
"interleaveBoundedCyclicList"),
+38 -60
View File
@@ -1,4 +1,5 @@
#include <atomic>
#include <cstdint>
#include <errno.h>
#include <netdb.h>
#include <stdio.h>
@@ -21,78 +22,55 @@
std::atomic<int64_t> transactions;
constexpr int kBaseSearchDepth = 115;
constexpr int kWindowSize = 10000000;
std::string numToKey(int64_t num) {
constexpr int kNumPrefixes = 250000;
std::string makeKey(int64_t num, int suffixLen) {
std::string result;
result.resize(kBaseSearchDepth + sizeof(int64_t));
memset(result.data(), 0, kBaseSearchDepth);
result.resize(sizeof(int64_t) + suffixLen);
int64_t be = __builtin_bswap64(num);
memcpy(result.data() + kBaseSearchDepth, &be, sizeof(int64_t));
memcpy(result.data(), &be, sizeof(int64_t));
memset(result.data() + sizeof(int64_t), 0, suffixLen);
return result;
}
void workload(weaselab::ConflictSet *cs) {
int64_t version = kWindowSize;
cs->addWrites(nullptr, 0, version);
for (int i = 0; i < kNumPrefixes; ++i) {
for (int j = 0; j < 50; ++j) {
weaselab::ConflictSet::WriteRange wr;
auto k = makeKey(i, j);
wr.begin.p = (const uint8_t *)k.data();
wr.begin.len = k.size();
wr.end.len = 0;
cs->addWrites(&wr, 1, version);
}
}
++version;
for (int i = 0; i < kNumPrefixes; ++i) {
weaselab::ConflictSet::WriteRange wr;
auto k = makeKey(i, 50);
wr.begin.p = (const uint8_t *)k.data();
wr.begin.len = k.size();
wr.end.len = 0;
cs->addWrites(&wr, 1, version);
}
std::vector<weaselab::ConflictSet::Result> results(10);
for (;; transactions.fetch_add(1, std::memory_order_relaxed)) {
// Reads
{
auto beginK = numToKey(version - kWindowSize);
auto endK = numToKey(version - 1);
auto pointRv = version - kWindowSize + rand() % kWindowSize + 1;
auto pointK = numToKey(pointRv);
weaselab::ConflictSet::ReadRange reads[] = {
{
{(const uint8_t *)pointK.data(), int(pointK.size())},
{nullptr, 0},
pointRv,
},
{
{(const uint8_t *)beginK.data(), int(beginK.size())},
{(const uint8_t *)endK.data(), int(endK.size())},
version - 2,
},
};
weaselab::ConflictSet::Result result[sizeof(reads) / sizeof(reads[0])];
cs->check(reads, result, sizeof(reads) / sizeof(reads[0]));
// for (int i = 0; i < sizeof(reads) / sizeof(reads[0]); ++i) {
// if (result[i] != weaselab::ConflictSet::Commit) {
// fprintf(stderr, "Unexpected conflict: [%s, %s) @ %" PRId64 "\n",
// printable(reads[i].begin).c_str(),
// printable(reads[i].end).c_str(), reads[i].readVersion);
// abort();
// }
// }
std::vector<std::string> keys(10);
for (auto &k : keys) {
k = makeKey(rand() % kNumPrefixes, 49);
}
// Writes
{
weaselab::ConflictSet::WriteRange w;
auto k = numToKey(version);
w.begin.p = (const uint8_t *)k.data();
w.end.len = 0;
if (version % (kWindowSize / 2) == 0) {
for (int l = 0; l <= k.size(); ++l) {
w.begin.len = l;
cs->addWrites(&w, 1, version);
}
} else {
w.begin.len = k.size();
cs->addWrites(&w, 1, version);
int64_t beginN = version - kWindowSize + rand() % kWindowSize;
auto b = numToKey(beginN);
auto e = numToKey(beginN + 1000);
w.begin.p = (const uint8_t *)b.data();
w.begin.len = b.size();
w.end.p = (const uint8_t *)e.data();
w.end.len = e.size();
cs->addWrites(&w, 1, version);
}
std::vector<weaselab::ConflictSet::ReadRange> reads(10);
for (int i = 0; i < reads.size(); ++i) {
reads[i].begin.p = (const uint8_t *)(keys[i].data());
reads[i].begin.len = keys[i].size();
reads[i].end.len = 0;
reads[i].readVersion = version - 1;
}
// GC
cs->setOldestVersion(version - kWindowSize);
++version;
cs->check(reads.data(), results.data(), 10);
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.