3 Commits

Author SHA1 Message Date
7f86fdee66 Test 64 bit versions
All checks were successful
Tests / Clang total: 2710, passed: 2710
Clang |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / 64 bit versions total: 2710, passed: 2710
Tests / Debug total: 2708, passed: 2708
Tests / SIMD fallback total: 2710, passed: 2710
Tests / Release [gcc] total: 2710, passed: 2710
GNU C Compiler (gcc) |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 2020, passed: 2020
Tests / Coverage total: 2036, passed: 2036
Code Coverage #### Project Overview No changes detected that affect the code coverage. * Line Coverage: 98.92% (1825/1845) * Branch Coverage: 66.85% (1480/2214) * Complexity Density: 0.00 * Lines of Code: 1845 #### Quality Gates Summary Output truncated.
weaselab/conflict-set/pipeline/head This commit looks good
Keep 32 bit versions the default though
2024-08-21 14:00:00 -07:00
442755d0a6 Update implementation notes
This doesn't really capture the complexity, but at least it's more
accurate
2024-08-21 14:00:00 -07:00
e15b3bb137 Bump version
All checks were successful
Tests / Clang total: 2710, passed: 2710
Clang |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Debug total: 2708, passed: 2708
Tests / SIMD fallback total: 2710, passed: 2710
Tests / Release [gcc] total: 2710, passed: 2710
GNU C Compiler (gcc) |Total|New|Outstanding|Fixed|Trend |:-:|:-:|:-:|:-:|:-: |0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 2020, passed: 2020
Tests / Coverage total: 2036, passed: 2036
Code Coverage #### Project Overview No changes detected that affect the code coverage. * Line Coverage: 98.97% (1821/1840) * Branch Coverage: 67.00% (1478/2206) * Complexity Density: 0.00 * Lines of Code: 1840 #### Quality Gates Summary Output truncated.
weaselab/conflict-set/pipeline/head This commit looks good
2024-08-20 16:58:39 -07:00
4 changed files with 86 additions and 14 deletions

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.18) cmake_minimum_required(VERSION 3.18)
project( project(
conflict-set conflict-set
VERSION 0.0.12 VERSION 0.0.13
DESCRIPTION DESCRIPTION
"A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys." "A data structure for optimistic concurrency control on ranges of bitwise-lexicographically-ordered keys."
HOMEPAGE_URL "https://git.weaselab.dev/weaselab/conflict-set" HOMEPAGE_URL "https://git.weaselab.dev/weaselab/conflict-set"

View File

@@ -87,22 +87,42 @@ constexpr int64_t kMaxCorrectVersionWindow =
std::numeric_limits<int32_t>::max(); std::numeric_limits<int32_t>::max();
static_assert(kNominalVersionWindow <= kMaxCorrectVersionWindow); static_assert(kNominalVersionWindow <= kMaxCorrectVersionWindow);
#ifndef USE_64_BIT
#define USE_64_BIT 0
#endif
struct InternalVersionT { struct InternalVersionT {
constexpr InternalVersionT() = default; constexpr InternalVersionT() = default;
constexpr explicit InternalVersionT(int64_t value) : value(value) {} constexpr explicit InternalVersionT(int64_t value) : value(value) {}
constexpr int64_t toInt64() const { return value; } // GCOVR_EXCL_LINE constexpr int64_t toInt64() const { return value; } // GCOVR_EXCL_LINE
constexpr auto operator<=>(const InternalVersionT &rhs) const { constexpr auto operator<=>(const InternalVersionT &rhs) const {
#if USE_64_BIT
return value <=> rhs.value;
#else
// Maintains ordering after overflow, as long as the full-precision versions // Maintains ordering after overflow, as long as the full-precision versions
// are within `kMaxCorrectVersionWindow` of each other. // are within `kMaxCorrectVersionWindow` of each other.
return int32_t(value - rhs.value) <=> 0; return int32_t(value - rhs.value) <=> 0;
#endif
} }
constexpr bool operator==(const InternalVersionT &) const = default; constexpr bool operator==(const InternalVersionT &) const = default;
#if USE_64_BIT
static const InternalVersionT zero;
#else
static thread_local InternalVersionT zero; static thread_local InternalVersionT zero;
#endif
private: private:
#if USE_64_BIT
int64_t value;
#else
uint32_t value; uint32_t value;
#endif
}; };
#if USE_64_BIT
const InternalVersionT InternalVersionT::zero{0};
#else
thread_local InternalVersionT InternalVersionT::zero; thread_local InternalVersionT InternalVersionT::zero;
#endif
struct Entry { struct Entry {
InternalVersionT pointVersion; InternalVersionT pointVersion;
@@ -518,8 +538,13 @@ std::string getSearchPath(Node *n);
// Each node with an entry present gets a budget of kBytesPerKey. Node0 always // Each node with an entry present gets a budget of kBytesPerKey. Node0 always
// has an entry present. // has an entry present.
// Induction hypothesis is that each node's surplus is >= kMinNodeSurplus // Induction hypothesis is that each node's surplus is >= kMinNodeSurplus
#if USE_64_BIT
constexpr int kBytesPerKey = 144;
constexpr int kMinNodeSurplus = 104;
#else
constexpr int kBytesPerKey = 112; constexpr int kBytesPerKey = 112;
constexpr int kMinNodeSurplus = 80; constexpr int kMinNodeSurplus = 80;
#endif
// Count the entry itself as a child // Count the entry itself as a child
constexpr int kMinChildrenNode0 = 1; constexpr int kMinChildrenNode0 = 1;
constexpr int kMinChildrenNode3 = 2; constexpr int kMinChildrenNode3 = 2;
@@ -724,9 +749,13 @@ struct WriteContext {
int64_t write_bytes; int64_t write_bytes;
} accum; } accum;
#if USE_64_BIT
static constexpr InternalVersionT zero{0};
#else
// Cache a copy of InternalVersionT::zero, so we don't need to do the TLS // Cache a copy of InternalVersionT::zero, so we don't need to do the TLS
// lookup as often. // lookup as often.
InternalVersionT zero; InternalVersionT zero;
#endif
WriteContext() { memset(&accum, 0, sizeof(accum)); } WriteContext() { memset(&accum, 0, sizeof(accum)); }
@@ -1563,6 +1592,9 @@ void rezero16(InternalVersionT *vs, InternalVersionT zero) {
} }
} }
#if USE_64_BIT
void rezero(Node *, InternalVersionT) {}
#else
void rezero(Node *n, InternalVersionT z) { void rezero(Node *n, InternalVersionT z) {
#if DEBUG_VERBOSE && !defined(NDEBUG) #if DEBUG_VERBOSE && !defined(NDEBUG)
fprintf(stderr, "rezero to %" PRId64 ": %s\n", z.toInt64(), fprintf(stderr, "rezero to %" PRId64 ": %s\n", z.toInt64(),
@@ -1605,6 +1637,7 @@ void rezero(Node *n, InternalVersionT z) {
__builtin_unreachable(); // GCOVR_EXCL_LINE __builtin_unreachable(); // GCOVR_EXCL_LINE
} }
} }
#endif
void mergeWithChild(Node *&self, WriteContext *tls, ConflictSet::Impl *impl, void mergeWithChild(Node *&self, WriteContext *tls, ConflictSet::Impl *impl,
Node *&dontInvalidate, Node3 *self3) { Node *&dontInvalidate, Node3 *self3) {
@@ -1987,7 +2020,14 @@ downLeftSpine:
} }
#ifdef HAS_AVX #ifdef HAS_AVX
uint32_t compare16_32bit(const InternalVersionT *vs, InternalVersionT rv) { uint32_t compare16(const InternalVersionT *vs, InternalVersionT rv) {
#if USE_64_BIT
uint32_t compared = 0;
for (int i = 0; i < 16; ++i) {
compared |= (vs[i] > rv) << i;
}
return compared;
#else
uint32_t compared = 0; uint32_t compared = 0;
__m128i w[4]; // GCOVR_EXCL_LINE __m128i w[4]; // GCOVR_EXCL_LINE
memcpy(w, vs, sizeof(w)); memcpy(w, vs, sizeof(w));
@@ -2001,15 +2041,26 @@ uint32_t compare16_32bit(const InternalVersionT *vs, InternalVersionT rv) {
<< (i * 4); << (i * 4);
} }
return compared; return compared;
#endif
} }
__attribute__((target("avx512f"))) uint32_t __attribute__((target("avx512f"))) uint32_t
compare16_32bit_avx512(const InternalVersionT *vs, InternalVersionT rv) { compare16_avx512(const InternalVersionT *vs, InternalVersionT rv) {
#if USE_64_BIT
int64_t r;
memcpy(&r, &rv, sizeof(r));
uint32_t low =
_mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs), _mm512_set1_epi64(r));
uint32_t high =
_mm512_cmpgt_epi64_mask(_mm512_loadu_epi64(vs + 8), _mm512_set1_epi64(r));
return low | (high << 8);
#else
uint32_t r; uint32_t r;
memcpy(&r, &rv, sizeof(r)); memcpy(&r, &rv, sizeof(r));
return _mm512_cmpgt_epi32_mask( return _mm512_cmpgt_epi32_mask(
_mm512_sub_epi32(_mm512_loadu_epi32(vs), _mm512_set1_epi32(r)), _mm512_sub_epi32(_mm512_loadu_epi32(vs), _mm512_set1_epi32(r)),
_mm512_setzero_epi32()); _mm512_setzero_epi32());
#endif
} }
#endif #endif
@@ -2066,9 +2117,9 @@ bool scan16(const InternalVersionT *vs, const uint8_t *is, int begin, int end,
uint32_t compared = 0; uint32_t compared = 0;
if constexpr (kAVX512) { if constexpr (kAVX512) {
compared = compare16_32bit_avx512(vs, readVersion); // GCOVR_EXCL_LINE compared = compare16_avx512(vs, readVersion);
} else { } else {
compared = compare16_32bit(vs, readVersion); // GCOVR_EXCL_LINE compared = compare16(vs, readVersion);
} }
return !(compared & mask); return !(compared & mask);
@@ -2127,9 +2178,9 @@ bool scan16(const InternalVersionT *vs, int begin, int end,
#elif defined(HAS_AVX) #elif defined(HAS_AVX)
uint32_t conflict; uint32_t conflict;
if constexpr (kAVX512) { if constexpr (kAVX512) {
conflict = compare16_32bit_avx512(vs, readVersion); // GCOVR_EXCL_LINE conflict = compare16_avx512(vs, readVersion);
} else { } else {
conflict = compare16_32bit(vs, readVersion); // GCOVR_EXCL_LINE conflict = compare16(vs, readVersion);
} }
conflict &= (1 << end) - 1; conflict &= (1 << end) - 1;
conflict >>= begin; conflict >>= begin;
@@ -2264,12 +2315,9 @@ bool checkMaxBetweenExclusiveImpl(Node *n, int begin, int end,
uint32_t compared = 0; uint32_t compared = 0;
if constexpr (kAVX512) { if constexpr (kAVX512) {
compared = // GCOVR_EXCL_LINE compared = compare16_avx512(self->childMaxVersion, readVersion);
compare16_32bit_avx512(self->childMaxVersion, // GCOVR_EXCL_LINE
readVersion); // GCOVR_EXCL_LINE
} else { } else {
compared = compare16_32bit(self->childMaxVersion, compared = compare16(self->childMaxVersion, readVersion);
readVersion); // GCOVR_EXCL_LINE
} }
return !(compared & mask) && firstRangeOk; return !(compared & mask) && firstRangeOk;
@@ -3137,10 +3185,12 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
} }
void addWrites(const WriteRange *writes, int count, int64_t writeVersion) { void addWrites(const WriteRange *writes, int count, int64_t writeVersion) {
#if !USE_64_BIT
// There could be other conflict sets in the same thread. We need // There could be other conflict sets in the same thread. We need
// InternalVersionT::zero to be correct for this conflict set for the // InternalVersionT::zero to be correct for this conflict set for the
// lifetime of the current call frame. // lifetime of the current call frame.
InternalVersionT::zero = tls.zero = oldestVersion; InternalVersionT::zero = tls.zero = oldestVersion;
#endif
assert(writeVersion >= newestVersionFullPrecision); assert(writeVersion >= newestVersionFullPrecision);
assert(tls.accum.entries_erased == 0); assert(tls.accum.entries_erased == 0);
@@ -3255,7 +3305,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
InternalVersionT oldestVersion{o}; InternalVersionT oldestVersion{o};
this->oldestVersionFullPrecision = o; this->oldestVersionFullPrecision = o;
this->oldestVersion = oldestVersion; this->oldestVersion = oldestVersion;
#if !USE_64_BIT
InternalVersionT::zero = tls.zero = oldestVersion; InternalVersionT::zero = tls.zero = oldestVersion;
#endif
#ifdef NDEBUG #ifdef NDEBUG
// This is here for performance reasons, since we want to amortize the cost // This is here for performance reasons, since we want to amortize the cost
// of storing the search path as a string. In tests, we want to exercise the // of storing the search path as a string. In tests, we want to exercise the
@@ -3304,7 +3356,9 @@ struct __attribute__((visibility("hidden"))) ConflictSet::Impl {
root->entry.pointVersion = this->oldestVersion; root->entry.pointVersion = this->oldestVersion;
root->entry.rangeVersion = this->oldestVersion; root->entry.rangeVersion = this->oldestVersion;
#if !USE_64_BIT
InternalVersionT::zero = tls.zero = this->oldestVersion; InternalVersionT::zero = tls.zero = this->oldestVersion;
#endif
// Intentionally not resetting totalBytes // Intentionally not resetting totalBytes
} }
@@ -3751,6 +3805,9 @@ Node *firstGeq(Node *n, std::string_view key) {
n, std::span<const uint8_t>((const uint8_t *)key.data(), key.size())); n, std::span<const uint8_t>((const uint8_t *)key.data(), key.size()));
} }
#if USE_64_BIT
void checkVersionsGeqOldestExtant(Node *, InternalVersionT) {}
#else
void checkVersionsGeqOldestExtant(Node *n, void checkVersionsGeqOldestExtant(Node *n,
InternalVersionT oldestExtantVersion) { InternalVersionT oldestExtantVersion) {
if (n->entryPresent) { if (n->entryPresent) {
@@ -3794,6 +3851,7 @@ void checkVersionsGeqOldestExtant(Node *n,
abort(); abort();
} }
} }
#endif
[[maybe_unused]] InternalVersionT [[maybe_unused]] InternalVersionT
checkMaxVersion(Node *root, Node *node, InternalVersionT oldestVersion, checkMaxVersion(Node *root, Node *node, InternalVersionT oldestVersion,

11
Jenkinsfile vendored
View File

@@ -48,6 +48,17 @@ pipeline {
recordIssues(tools: [clang()]) recordIssues(tools: [clang()])
} }
} }
stage('64 bit versions') {
agent {
dockerfile {
args '-v /home/jenkins/ccache:/ccache'
reuseNode true
}
}
steps {
CleanBuildAndTest("-DCMAKE_CXX_FLAGS=-DUSE_64_BIT=1")
}
}
stage('Debug') { stage('Debug') {
agent { agent {
dockerfile { dockerfile {

View File

@@ -206,8 +206,11 @@ until we end at $a_{i} + 1$, adjacent to the first inner range.
A few notes on implementation: A few notes on implementation:
\begin{itemize} \begin{itemize}
\item{For clarity, the above algorithm decouples the logical partitioning from the physical structure of the tree. An optimized implementation would merge adjacent prefix ranges that don't correspond to nodes in the tree as it scans, so that it only calculates the version of such merged ranges once. Additionally, our implementation stores an index of which child pointers are valid as a bitset for Node48 and Node256 to speed up this scan using techniques inspired by \cite{Lemire_2018}.} \item{For clarity, the above algorithm decouples the logical partitioning from the physical structure of the tree.
\item{In order to avoid many costly pointer indirections, we can store the max version not in each node itself but next to each node's parent pointer. Without this, the range read performance is not competitive with the skip list.} An optimized implementation would merge adjacent prefix ranges that don't correspond to nodes in the tree as it scans, so that it only calculates the version of such merged ranges once.
Additionally, our implementation uses SIMD instructions and instruction-level parallelism to compare many prefix ranges to the read version $r$ in parallel.}
\item{In order to avoid many costly pointer indirections, and to take advantage of SIMD, we can store the max version of child nodes as a dense array directly in the parent node.
Without this, the range read performance is not competitive with the skip list.}
\item{An optimized implementation would visit the partition of $[a_{i}\dots a_{m}, a_{i} + 1)$ in reverse order, as it descends along the search path to $a_{i}\dots a_{m}$} \item{An optimized implementation would visit the partition of $[a_{i}\dots a_{m}, a_{i} + 1)$ in reverse order, as it descends along the search path to $a_{i}\dots a_{m}$}
\item{An optimized implementation would search for the common prefix first, and return early if any prefix of the common prefix has a $max \leq r$.} \item{An optimized implementation would search for the common prefix first, and return early if any prefix of the common prefix has a $max \leq r$.}
\end{itemize} \end{itemize}