From 814aac4ea79bfb26594fe770a15a7d8f5705d59e Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Tue, 10 Sep 2024 22:06:00 -0700 Subject: [PATCH] Experiment with causing cache misses --- ConflictSet.cpp | 114 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 29 deletions(-) diff --git a/ConflictSet.cpp b/ConflictSet.cpp index 2100a7c..325d05c 100644 --- a/ConflictSet.cpp +++ b/ConflictSet.cpp @@ -48,6 +48,17 @@ limitations under the License. #endif #endif +#define SIM_CACHE_MISSES 0 +#if SIM_CACHE_MISSES +constexpr void simCacheMiss(void *x) { + if (x) { + _mm_clflush(x); + } +} +#else +constexpr void simCacheMiss(void *) {} +#endif + #include using namespace weaselab; @@ -836,21 +847,29 @@ int getNodeIndex(Node16 *self, uint8_t index) { // Precondition - an entry for index must exist in the node Node *&getChildExists(Node3 *self, uint8_t index) { - return self->children[getNodeIndex(self, index)]; + auto &result = self->children[getNodeIndex(self, index)]; + simCacheMiss(result); + return result; } // Precondition - an entry for index must exist in the node Node *&getChildExists(Node16 *self, uint8_t index) { - return self->children[getNodeIndex(self, index)]; + auto &result = self->children[getNodeIndex(self, index)]; + simCacheMiss(result); + return result; } // Precondition - an entry for index must exist in the node Node *&getChildExists(Node48 *self, uint8_t index) { assert(self->bitSet.test(index)); - return self->children[self->index[index]]; + auto &result = self->children[self->index[index]]; + simCacheMiss(result); + return result; } // Precondition - an entry for index must exist in the node Node *&getChildExists(Node256 *self, uint8_t index) { assert(self->bitSet.test(index)); - return self->children[index]; + auto &result = self->children[index]; + simCacheMiss(result); + return result; } // Precondition - an entry for index must exist in the node @@ -1030,6 +1049,7 @@ ChildAndMaxVersion getChildAndMaxVersion(Node3 *self, uint8_t index) { if (i < 0) { return {}; } + simCacheMiss(self->children[i]); return {self->children[i], self->childMaxVersion[i]}; } ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) { @@ -1037,6 +1057,7 @@ ChildAndMaxVersion getChildAndMaxVersion(Node16 *self, uint8_t index) { if (i < 0) { return {}; } + simCacheMiss(self->children[i]); return {self->children[i], self->childMaxVersion[i]}; } ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) { @@ -1044,9 +1065,11 @@ ChildAndMaxVersion getChildAndMaxVersion(Node48 *self, uint8_t index) { if (i < 0) { return {}; } + simCacheMiss(self->children[i]); return {self->children[i], self->childMaxVersion[i]}; } ChildAndMaxVersion getChildAndMaxVersion(Node256 *self, uint8_t index) { + simCacheMiss(self->children[index]); return {self->children[index], self->childMaxVersion[index]}; } @@ -1072,6 +1095,7 @@ Node *getChildGeq(Node0 *, int) { return nullptr; } Node *getChildGeq(Node3 *n, int child) { for (int i = 0; i < n->numChildren; ++i) { if (n->index[i] >= child) { + simCacheMiss(n->children[i]); return n->children[i]; } } @@ -1090,7 +1114,10 @@ Node *getChildGeq(Node16 *self, int child) { __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices)); int mask = (1 << self->numChildren) - 1; uint32_t bitfield = _mm_movemask_epi8(results) & mask; - return bitfield == 0 ? nullptr : self->children[std::countr_zero(bitfield)]; + auto *result = + bitfield == 0 ? nullptr : self->children[std::countr_zero(bitfield)]; + simCacheMiss(result); + return result; #elif defined(HAS_ARM_NEON) uint8x16_t indices; memcpy(&indices, self->index, sizeof(self->index)); @@ -1126,13 +1153,16 @@ Node *getChildGeq(Node48 *self, int child) { if (c < 0) { return nullptr; } - return self->children[self->index[c]]; + auto *result = self->children[self->index[c]]; + simCacheMiss(result); + return result; } Node *getChildGeq(Node256 *self, int child) { int c = self->bitSet.firstSetGeq(child); if (c < 0) { return nullptr; } + simCacheMiss(self->children[c]); return self->children[c]; } @@ -1156,20 +1186,26 @@ Node *getChildGeq(Node *self, int child) { // Precondition: self has a child Node *getFirstChildExists(Node3 *self) { assert(self->numChildren > 0); + simCacheMiss(self->children[0]); return self->children[0]; } // Precondition: self has a child Node *getFirstChildExists(Node16 *self) { assert(self->numChildren > 0); + simCacheMiss(self->children[0]); return self->children[0]; } // Precondition: self has a child Node *getFirstChildExists(Node48 *self) { - return self->children[self->index[self->bitSet.firstSetGeq(0)]]; + auto *result = self->children[self->index[self->bitSet.firstSetGeq(0)]]; + simCacheMiss(result); + return result; } // Precondition: self has a child Node *getFirstChildExists(Node256 *self) { - return self->children[self->bitSet.firstSetGeq(0)]; + auto *result = self->children[self->bitSet.firstSetGeq(0)]; + simCacheMiss(result); + return result; } // Precondition: self has a child @@ -3010,18 +3046,10 @@ Node *firstGeqPhysical(Node *n, const std::span key) { } struct CheckJob { - Node *n; - std::span begin; - InternalVersionT readVersion; - ReadContext *tls; - ConflictSet::Result *result; - void setResult(bool ok) { *result = ok ? ConflictSet::Commit : ConflictSet::Conflict; } - typedef void (*typeErasedContinuation)(void *); - // The type of a function that takes a CheckJob* and returns its own type struct continuation { typedef continuation (*functionPtrType)(CheckJob *); @@ -3032,9 +3060,16 @@ struct CheckJob { operator bool() { return func != nullptr; } }; - continuation next; void init(const ConflictSet::ReadRange *read, ConflictSet::Result *result, Node *root, int64_t oldestVersionFullPrecision, ReadContext *tls); + + continuation next; + Node *n; + ChildAndMaxVersion childAndVersion; + std::span begin; + InternalVersionT readVersion; + ReadContext *tls; + ConflictSet::Result *result; }; namespace check_point_read_state_machine { @@ -3042,18 +3077,12 @@ namespace check_point_read_state_machine { CheckJob::continuation down_left_spine(CheckJob *job); CheckJob::continuation iter(CheckJob *job); -// Logically this is the same as performing firstGeq and then checking against -// point or range version according to cmp, but this version short circuits as -// soon as it can prove that there's no conflict. CheckJob::continuation begin(CheckJob *job) { ++job->tls->point_read_accum; #if DEBUG_VERBOSE && !defined(NDEBUG) fprintf(stderr, "Check point read: %s\n", printable(key).c_str()); #endif - return iter(job); -} -CheckJob::continuation iter(CheckJob *job) { if (job->begin.size() == 0) { if (job->n->entryPresent) { job->setResult(job->n->entry.pointVersion <= job->readVersion); @@ -3063,11 +3092,17 @@ CheckJob::continuation iter(CheckJob *job) { return down_left_spine; } - auto [child, maxV] = getChildAndMaxVersion(job->n, job->begin[0]); - if (child == nullptr) { + job->childAndVersion = getChildAndMaxVersion(job->n, job->begin[0]); + __builtin_prefetch(job->childAndVersion.child); + return iter; +} + +CheckJob::continuation iter(CheckJob *job) { + if (job->childAndVersion.child == nullptr) { auto c = getChildGeq(job->n, job->begin[0]); if (c != nullptr) { job->n = c; + __builtin_prefetch(job->n); return down_left_spine; } else { job->n = nextSibling(job->n); @@ -3075,11 +3110,12 @@ CheckJob::continuation iter(CheckJob *job) { job->setResult(true); return nullptr; // Done } + __builtin_prefetch(job->n); return down_left_spine; } } - job->n = child; + job->n = job->childAndVersion.child; job->begin = job->begin.subspan(1, job->begin.size() - 1); if (job->n->partialKeyLen > 0) { @@ -3089,13 +3125,14 @@ CheckJob::continuation iter(CheckJob *job) { if (i < commonLen) { auto c = job->n->partialKey()[i] <=> job->begin[i]; if (c > 0) { - return down_left_spine; + return down_left_spine(job); } else { job->n = nextSibling(job->n); if (job->n == nullptr) { job->setResult(true); return nullptr; // Done } + __builtin_prefetch(job->n); return down_left_spine; } } @@ -3105,17 +3142,30 @@ CheckJob::continuation iter(CheckJob *job) { } else if (job->n->partialKeyLen > int(job->begin.size())) { // n is the first physical node greater than remaining, and there's no // eq node - return down_left_spine; + return down_left_spine(job); } } - if (maxV <= job->readVersion) { + if (job->childAndVersion.maxVersion <= job->readVersion) { ++job->tls->point_read_short_circuit_accum; job->setResult(true); return nullptr; // Done } ++job->tls->point_read_iterations_accum; + + if (job->begin.size() == 0) { + if (job->n->entryPresent) { + job->setResult(job->n->entry.pointVersion <= job->readVersion); + return nullptr; // Done + } + job->n = getFirstChildExists(job->n); + __builtin_prefetch(job->n); + return down_left_spine; + } + + job->childAndVersion = getChildAndMaxVersion(job->n, job->begin[0]); + __builtin_prefetch(job->childAndVersion.child); return iter; } @@ -3125,6 +3175,7 @@ CheckJob::continuation down_left_spine(CheckJob *job) { return nullptr; // Done } job->n = getFirstChildExists(job->n); + __builtin_prefetch(job->n); return down_left_spine; } @@ -3145,6 +3196,11 @@ void CheckJob::init(const ConflictSet::ReadRange *read, this->result = result; this->tls = tls; this->next = check_point_read_state_machine::begin; + // *result = + // checkPointRead(root, begin, InternalVersionT(read->readVersion), tls) + // ? ConflictSet::Commit + // : ConflictSet::Conflict; + // next = +[](CheckJob *) -> continuation { return nullptr; }; } else { *result = checkRangeRead(root, begin, end, InternalVersionT(read->readVersion), tls)