Compare commits
3 Commits
v0.0.12
...
7f86fdee66
| Author | SHA1 | Date | |
|---|---|---|---|
| 7f86fdee66 | |||
| 442755d0a6 | |||
| e15b3bb137 |
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(
|
||||
conflict-set
|
||||
VERSION 0.0.12
|
||||
VERSION 0.0.13
|
||||
DESCRIPTION
|
||||
"A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys."
|
||||
HOMEPAGE_URL "https://git.weaselab.dev/weaselab/conflict-set"
|
||||
|
||||
@@ -87,22 +87,42 @@ constexpr int64_t kMaxCorrectVersionWindow =
|
||||
std::numeric_limits<int32_t>::max();
|
||||
static_assert(kNominalVersionWindow <= kMaxCorrectVersionWindow);
|
||||
|
||||
#ifndef USE_64_BIT
|
||||
#define USE_64_BIT 0
|
||||
#endif
|
||||
|
||||
struct InternalVersionT {
|
||||
constexpr InternalVersionT() = default;
|
||||
constexpr explicit InternalVersionT(int64_t value) : value(value) {}
|
||||
constexpr int64_t toInt64() const { return value; } // GCOVR_EXCL_LINE
|
||||
constexpr auto operator<=>(const InternalVersionT &rhs) const {
|
||||
#if USE_64_BIT
|
||||
return value <=> rhs.value;
|
||||
#else
|
||||
// Maintains ordering after overflow, as long as the full-precision versions
|
||||
// are within `kMaxCorrectVersionWindow` of eachother.
|
||||
return int32_t(value - rhs.value) <=> 0;
|
||||
#endif
|
||||
}
|
||||
constexpr bool operator==(const InternalVersionT &) const = default;
|
||||
#if USE_64_BIT
|
||||
static const InternalVersionT zero;
|
||||
#else
|
||||
static thread_local InternalVersionT zero;
|
||||
#endif
|
||||
|
||||
private:
|
||||
#if USE_64_BIT
|
||||
int64_t value;
|
||||
#else
|
||||
uint32_t value;
|
||||
#endif
|
||||
};
|
||||
#if USE_64_BIT
|
||||
const InternalVersionT InternalVersionT::zero{0};
|
||||
#else
|
||||
thread_local InternalVersionT InternalVersionT::zero;
|
||||
#endif
|
||||
|
||||
struct Entry {
|
||||
InternalVersionT pointVersion;
|
||||
@@ -518,8 +538,13 @@ std::string getSearchPath(Node *n);
|
||||
// Each node with an entry present gets a budget of kBytesPerKey. Node0 always
|
||||
// has an entry present.
|
||||
// Induction hypothesis is that each node's surplus is >= kMinNodeSurplus
|
||||
#if USE_64_BIT
|
||||
constexpr int kBytesPerKey = 144;
|
||||
constexpr int kMinNodeSurplus = 104;
|
||||
#else
|
||||
constexpr int kBytesPerKey = 112;
|
||||
constexpr int kMinNodeSurplus = 80;
|
||||
#endif
|
||||
// Cound the entry itself as a child
|
||||
constexpr int kMinChildrenNode0 = 1;
|
||||
constexpr int kMinChildrenNode3 = 2;
|
||||
@@ -724,9 +749,13 @@ struct WriteContext {
|
||||
int64_t write_bytes;
|
||||
} accum;
|
||||
|
||||
#if USE_64_BIT
|
||||
static constexpr InternalVersionT zero{0};
|
||||
#else
|
||||
// Cache a copy of InternalVersionT::zero, so we don't need to do the TLS
|
||||
// lookup as often.
|
||||
InternalVersionT zero;
|
||||
#endif
|
||||
|
||||
WriteContext() { memset(&accum, 0, sizeof(accum)); }
|
||||
|
||||
@@ -1563,6 +1592,9 @@ void rezero16(InternalVersionT *vs, InternalVersionT zero) {
|
||||
}
|
||||
}
|
||||
|
||||
#if USE_64_BIT
|
||||
void rezero(Node *, InternalVersionT) {}
|
||||
#else
|
||||
void rezero(Node *n, InternalVersionT z) {
|
||||
#if DEBUG_VERBOSE && !defined(NDEBUG)
|
||||
fprintf(stderr, "rezero to %" PRId64 ": %s\n", z.toInt64(),
|
||||
@@ -1605,6 +1637,7 @@ void rezero(Node *n, InternalVersionT z) {
|
||||
__builtin_unreachable(); // GCOVR_EXCL_LINE
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void mergeWithChild(Node *&self, WriteContext *tls, ConflictSet::Impl *impl,
|
||||
Node *&dontInvalidate, Node3 *self3) {
|
||||
@@ -1987,7 +2020,14 @@ downLeftSpine:
|
||||
}
|
||||
|
||||
#ifdef HAS_AVX
|
||||
uint32_t compare16_32bit(const InternalVersionT *vs, InternalVersionT rv) {
|
||||
uint32_t compare16(const InternalVersionT *vs, InternalVersionT rv) {
|
||||
#if USE_64_BIT
|
||||
uint32_t compared = 0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
compared |= (vs[i] > rv) << i;
|
||||
}
|
||||
return compared;
|
||||
#else
|
||||
uint32_t compared = 0;
|
||||
__m128i w[4]; // GCOVR_EXCL_LINE
|
||||
memcpy(w, vs, sizeof(w));
|
||||
@@ -2001,15 +2041,26 @@ uint32_t compare16_32bit(const InternalVersionT *vs, InternalVersionT rv) {
|
||||
<< (i * 4);
|
||||
}
|
||||
return compared;
|
||||
#endif
|
||||
}
|
||||
|
||||
__attribute__((target("avx512f"))) uint32_t
|
||||
compare16_32bit_avx512(const InternalVersionT *vs, InternalVersionT rv) {
|
||||
compare16_avx512(const InternalVersionT *vs, InternalVersionT rv) {
|
||||
#if USE_64_BIT
|
||||
int64_t r;
|
||||
memcpy(&r, &rv, sizeof(r));
|
||||
uint32_t low =
|
||||
_mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs), _mm512_set1_epi64(r));
|
||||
uint32_t high =
|
||||
_mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs + 8), _mm512_set1_epi64(r));
|
||||
return low | (high << 8);
|
||||
#else
|
||||
uint32_t r;
|
||||
memcpy(&r, &rv, sizeof(r));
|
||||
return _mm512_cmpgt_epi32_mask(
|
||||
_mm512_sub_epi32(_mm512_loadu_epi32(vs), _mm512_set1_epi32(r)),
|
||||
_mm512_setzero_epi32());
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2066,9 +2117,9 @@ bool scan16(const InternalVersionT *vs, const uint8_t *is, int begin, int end,
|
||||
|
||||
uint32_t compared = 0;
|
||||
if constexpr (kAVX512) {
|
||||
compared = compare16_32bit_avx512(vs, readVersion); // GCOVR_EXCL_LINE
|
||||
compared = compare16_avx512(vs, readVersion);
|
||||
} else {
|
||||
compared = compare16_32bit(vs, readVersion); // GCOVR_EXCL_LINE
|
||||
compared = compare16(vs, readVersion);
|
||||
}
|
||||
return !(compared & mask);
|
||||
|
||||
@@ -2127,9 +2178,9 @@ bool scan16(const InternalVersionT *vs, int begin, int end,
|
||||
#elif defined(HAS_AVX)
|
||||
uint32_t conflict;
|
||||
if constexpr (kAVX512) {
|
||||
conflict = compare16_32bit_avx512(vs, readVersion); // GCOVR_EXCL_LINE
|
||||
conflict = compare16_avx512(vs, readVersion);
|
||||
} else {
|
||||
conflict = compare16_32bit(vs, readVersion); // GCOVR_EXCL_LINE
|
||||
conflict = compare16(vs, readVersion);
|
||||
}
|
||||
conflict &= (1 << end) - 1;
|
||||
conflict >>= begin;
|
||||
@@ -2264,12 +2315,9 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
|
||||
|
||||
uint32_t compared = 0;
|
||||
if constexpr (kAVX512) {
|
||||
compared = // GCOVR_EXCL_LINE
|
||||
compare16_32bit_avx512(self->childMaxVersion, // GCOVR_EXCL_LINE
|
||||
readVersion); // GCOVR_EXCL_LINE
|
||||
compared = compare16_avx512(self->childMaxVersion, readVersion);
|
||||
} else {
|
||||
compared = compare16_32bit(self->childMaxVersion,
|
||||
readVersion); // GCOVR_EXCL_LINE
|
||||
compared = compare16(self->childMaxVersion, readVersion);
|
||||
}
|
||||
return !(compared & mask) && firstRangeOk;
|
||||
|
||||
@@ -3137,10 +3185,12 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
}
|
||||
|
||||
void addWrites(const WriteRange *writes, int count, int64_t writeVersion) {
|
||||
#if !USE_64_BIT
|
||||
// There could be other conflict sets in the same thread. We need
|
||||
// InternalVersionT::zero to be correct for this conflict set for the
|
||||
// lifetime of the current call frame.
|
||||
InternalVersionT::zero = tls.zero = oldestVersion;
|
||||
#endif
|
||||
|
||||
assert(writeVersion >= newestVersionFullPrecision);
|
||||
assert(tls.accum.entries_erased == 0);
|
||||
@@ -3255,7 +3305,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
InternalVersionT oldestVersion{o};
|
||||
this->oldestVersionFullPrecision = o;
|
||||
this->oldestVersion = oldestVersion;
|
||||
#if !USE_64_BIT
|
||||
InternalVersionT::zero = tls.zero = oldestVersion;
|
||||
#endif
|
||||
#ifdef NDEBUG
|
||||
// This is here for performance reasons, since we want to amortize the cost
|
||||
// of storing the search path as a string. In tests, we want to exercise the
|
||||
@@ -3304,7 +3356,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
|
||||
root->entry.pointVersion = this->oldestVersion;
|
||||
root->entry.rangeVersion = this->oldestVersion;
|
||||
|
||||
#if !USE_64_BIT
|
||||
InternalVersionT::zero = tls.zero = this->oldestVersion;
|
||||
#endif
|
||||
|
||||
// Intentionally not resetting totalBytes
|
||||
}
|
||||
@@ -3751,6 +3805,9 @@ Node *firstGeq(Node *n, std::string_view key) {
|
||||
n, std::span<const uint8_t>((const uint8_t *)key.data(), key.size()));
|
||||
}
|
||||
|
||||
#if USE_64_BIT
|
||||
void checkVersionsGeqOldestExtant(Node *, InternalVersionT) {}
|
||||
#else
|
||||
void checkVersionsGeqOldestExtant(Node *n,
|
||||
InternalVersionT oldestExtantVersion) {
|
||||
if (n->entryPresent) {
|
||||
@@ -3794,6 +3851,7 @@ void checkVersionsGeqOldestExtant(Node *n,
|
||||
abort();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
[[maybe_unused]] InternalVersionT
|
||||
checkMaxVersion(Node *root, Node *node, InternalVersionT oldestVersion,
|
||||
|
||||
11
Jenkinsfile
vendored
11
Jenkinsfile
vendored
@@ -48,6 +48,17 @@ pipeline {
|
||||
recordIssues(tools: [clang()])
|
||||
}
|
||||
}
|
||||
stage('64 bit versions') {
|
||||
agent {
|
||||
dockerfile {
|
||||
args '-v /home/jenkins/ccache:/ccache'
|
||||
reuseNode true
|
||||
}
|
||||
}
|
||||
steps {
|
||||
CleanBuildAndTest("-DCMAKE_CXX_FLAGS=-DUSE_64_BIT=1")
|
||||
}
|
||||
}
|
||||
stage('Debug') {
|
||||
agent {
|
||||
dockerfile {
|
||||
|
||||
@@ -206,8 +206,11 @@ until we end at $a_{i} + 1$, adjacent to the first inner range.
|
||||
|
||||
A few notes on implementation:
|
||||
\begin{itemize}
|
||||
\item{For clarity, the above algorithm decouples the logical partitioning from the physical structure of the tree. An optimized implementation would merge adjacent prefix ranges that don't correspond to nodes in the tree as it scans, so that it only calculates the version of such merged ranges once. Additionally, our implementation stores an index of which child pointers are valid as a bitset for Node48 and Node256 to speed up this scan using techniques inspired by \cite{Lemire_2018}.}
|
||||
\item{In order to avoid many costly pointer indirections, we can store the max version not in each node itself but next to each node's parent pointer. Without this, the range read performance is not competetive with the skip list.}
|
||||
\item{For clarity, the above algorithm decouples the logical partitioning from the physical structure of the tree.
|
||||
An optimized implementation would merge adjacent prefix ranges that don't correspond to nodes in the tree as it scans, so that it only calculates the version of such merged ranges once.
|
||||
Additionally, our implementation uses SIMD instructions and instruction-level parallelism to compare many prefix ranges to the read version $r$ in parallel.}
|
||||
\item{In order to avoid many costly pointer indirections, and to take advantage of SIMD, we can store the max version of child nodes as a dense array directly in the parent node.
|
||||
Without this, the range read performance is not competetive with the skip list.}
|
||||
\item{An optimized implementation would visit the partition of $[a_{i}\dots a_{m}, a_{i} + 1)$ in reverse order, as it descends along the search path to $a_{i}\dots a_{m}$}
|
||||
\item{An optimized implementation would search for the common prefix first, and return early if any prefix of the common prefix has a $max \leq r$.}
|
||||
\end{itemize}
|
||||
|
||||
Reference in New Issue
Block a user