5 Commits

Author SHA1 Message Date
0a9ac59676 Commit to non-simd Node3 implementations
Some checks failed
Tests / Clang total: 2620, passed: 2620
Clang

|Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|0|0|0|0|:clap:
Tests / Debug total: 2618, passed: 2618
Tests / SIMD fallback total: 2620, passed: 2620
Tests / Release [gcc] total: 2620, passed: 2620
GNU C Compiler (gcc)

|Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 1957, passed: 1957
Tests / Coverage total: 1967, failed: 1, passed: 1966
weaselab/conflict-set/pipeline/head There was a failure building this commit
2024-08-20 10:36:04 -07:00
e3a77ed773 Remove unnecessary casts 2024-08-20 10:30:27 -07:00
cdf9a8a7b0 Save 8 bytes in Node3 2024-08-20 10:30:07 -07:00
305dfdd52f Change whitespace in node structs for consistency 2024-08-20 09:57:44 -07:00
7261c91492 Remove Node48::nextFree, and improve padding to save 8 bytes 2024-08-20 09:51:29 -07:00

View File

@@ -197,7 +197,6 @@ struct Node {
/* end section that's copied to the next node */ /* end section that's copied to the next node */
uint8_t *partialKey(); uint8_t *partialKey();
Type getType() const { return type; } Type getType() const { return type; }
int32_t getCapacity() const { return partialKeyCapacity; } int32_t getCapacity() const { return partialKeyCapacity; }
@@ -221,84 +220,83 @@ constexpr int kNodeCopySize =
struct Node0 : Node { struct Node0 : Node {
constexpr static auto kType = Type_Node0; constexpr static auto kType = Type_Node0;
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node0 &other); void copyChildrenAndKeyFrom(const Node0 &other);
void copyChildrenAndKeyFrom(const struct Node3 &other); void copyChildrenAndKeyFrom(const struct Node3 &other);
size_t size() const { return sizeof(Node0) + getCapacity(); } size_t size() const { return sizeof(Node0) + getCapacity(); }
}; };
struct Node3 : Node { struct Node3 : Node {
constexpr static auto kMaxNodes = 3; constexpr static auto kMaxNodes = 3;
constexpr static auto kType = Type_Node3; constexpr static auto kType = Type_Node3;
// Sorted
uint8_t index[kMaxNodes];
Node *children[kMaxNodes]; Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes]; InternalVersionT childMaxVersion[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); } // Sorted
uint8_t index[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node0 &other); void copyChildrenAndKeyFrom(const Node0 &other);
void copyChildrenAndKeyFrom(const Node3 &other); void copyChildrenAndKeyFrom(const Node3 &other);
void copyChildrenAndKeyFrom(const struct Node16 &other); void copyChildrenAndKeyFrom(const struct Node16 &other);
size_t size() const { return sizeof(Node3) + getCapacity(); } size_t size() const { return sizeof(Node3) + getCapacity(); }
}; };
struct Node16 : Node { struct Node16 : Node {
constexpr static auto kType = Type_Node16; constexpr static auto kType = Type_Node16;
constexpr static auto kMaxNodes = 16; constexpr static auto kMaxNodes = 16;
// Sorted
uint8_t index[kMaxNodes];
Node *children[kMaxNodes]; Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes]; InternalVersionT childMaxVersion[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); } // Sorted
uint8_t index[kMaxNodes];
uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node3 &other); void copyChildrenAndKeyFrom(const Node3 &other);
void copyChildrenAndKeyFrom(const Node16 &other); void copyChildrenAndKeyFrom(const Node16 &other);
void copyChildrenAndKeyFrom(const struct Node48 &other); void copyChildrenAndKeyFrom(const struct Node48 &other);
size_t size() const { return sizeof(Node16) + getCapacity(); } size_t size() const { return sizeof(Node16) + getCapacity(); }
}; };
struct Node48 : Node { struct Node48 : Node {
constexpr static auto kType = Type_Node48; constexpr static auto kType = Type_Node48;
constexpr static auto kMaxNodes = 48; constexpr static auto kMaxNodes = 48;
BitSet bitSet;
int8_t nextFree;
int8_t index[256];
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
uint8_t reverseIndex[kMaxNodes];
constexpr static int kMaxOfMaxPageSize = 16; constexpr static int kMaxOfMaxPageSize = 16;
constexpr static int kMaxOfMaxShift = constexpr static int kMaxOfMaxShift =
std::countr_zero(uint32_t(kMaxOfMaxPageSize)); std::countr_zero(uint32_t(kMaxOfMaxPageSize));
constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize; constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
BitSet bitSet;
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
InternalVersionT maxOfMax[kMaxOfMaxTotalPages]; InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
uint8_t reverseIndex[kMaxNodes];
int8_t index[256];
uint8_t *partialKey() { return (uint8_t *)(this + 1); } uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node16 &other); void copyChildrenAndKeyFrom(const Node16 &other);
void copyChildrenAndKeyFrom(const Node48 &other); void copyChildrenAndKeyFrom(const Node48 &other);
void copyChildrenAndKeyFrom(const struct Node256 &other); void copyChildrenAndKeyFrom(const struct Node256 &other);
size_t size() const { return sizeof(Node48) + getCapacity(); } size_t size() const { return sizeof(Node48) + getCapacity(); }
}; };
struct Node256 : Node { struct Node256 : Node {
constexpr static auto kType = Type_Node256; constexpr static auto kType = Type_Node256;
BitSet bitSet; constexpr static auto kMaxNodes = 256;
Node *children[256];
InternalVersionT childMaxVersion[256];
constexpr static int kMaxOfMaxPageSize = 16; constexpr static int kMaxOfMaxPageSize = 16;
constexpr static int kMaxOfMaxShift = constexpr static int kMaxOfMaxShift =
std::countr_zero(uint32_t(kMaxOfMaxPageSize)); std::countr_zero(uint32_t(kMaxOfMaxPageSize));
constexpr static int kMaxOfMaxTotalPages = 256 / kMaxOfMaxPageSize; constexpr static int kMaxOfMaxTotalPages = kMaxNodes / kMaxOfMaxPageSize;
BitSet bitSet;
Node *children[kMaxNodes];
InternalVersionT childMaxVersion[kMaxNodes];
InternalVersionT maxOfMax[kMaxOfMaxTotalPages]; InternalVersionT maxOfMax[kMaxOfMaxTotalPages];
uint8_t *partialKey() { return (uint8_t *)(this + 1); } uint8_t *partialKey() { return (uint8_t *)(this + 1); }
void copyChildrenAndKeyFrom(const Node48 &other); void copyChildrenAndKeyFrom(const Node48 &other);
void copyChildrenAndKeyFrom(const Node256 &other); void copyChildrenAndKeyFrom(const Node256 &other);
size_t size() const { return sizeof(Node256) + getCapacity(); } size_t size() const { return sizeof(Node256) + getCapacity(); }
}; };
@@ -323,7 +321,7 @@ inline void Node3::copyChildrenAndKeyFrom(const Node0 &other) {
inline void Node3::copyChildrenAndKeyFrom(const Node3 &other) { inline void Node3::copyChildrenAndKeyFrom(const Node3 &other) {
memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin, memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
kNodeCopySize); kNodeCopySize);
memcpy(index, other.index, sizeof(*this) - sizeof(Node)); memcpy(children, other.children, sizeof(*this) - sizeof(Node));
memcpy(partialKey(), &other + 1, partialKeyLen); memcpy(partialKey(), &other + 1, partialKeyLen);
for (int i = 0; i < numChildren; ++i) { for (int i = 0; i < numChildren; ++i) {
assert(children[i]->parent == &other); assert(children[i]->parent == &other);
@@ -404,7 +402,6 @@ inline void Node48::copyChildrenAndKeyFrom(const Node16 &other) {
} }
memcpy(partialKey(), &other + 1, partialKeyLen); memcpy(partialKey(), &other + 1, partialKeyLen);
bitSet.init(); bitSet.init();
nextFree = Node16::kMaxNodes;
int i = 0; int i = 0;
for (auto x : other.index) { for (auto x : other.index) {
bitSet.set(x); bitSet.set(x);
@@ -424,7 +421,6 @@ inline void Node48::copyChildrenAndKeyFrom(const Node48 &other) {
memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin, memcpy((char *)this + kNodeCopyBegin, (char *)&other + kNodeCopyBegin,
kNodeCopySize); kNodeCopySize);
bitSet = other.bitSet; bitSet = other.bitSet;
nextFree = other.nextFree;
memcpy(index, other.index, sizeof(index)); memcpy(index, other.index, sizeof(index));
memset(children, 0, sizeof(children)); memset(children, 0, sizeof(children));
const auto z = InternalVersionT::zero; const auto z = InternalVersionT::zero;
@@ -451,7 +447,6 @@ inline void Node48::copyChildrenAndKeyFrom(const Node256 &other) {
for (auto &v : childMaxVersion) { for (auto &v : childMaxVersion) {
v = z; v = z;
} }
nextFree = other.numChildren;
bitSet = other.bitSet; bitSet = other.bitSet;
int i = 0; int i = 0;
bitSet.forEachSet([&](int c) { bitSet.forEachSet([&](int c) {
@@ -773,22 +768,17 @@ private:
BoundedFreeListAllocator<Node256> node256; BoundedFreeListAllocator<Node256> node256;
}; };
template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) { int getNodeIndex(Node3 *self, uint8_t index) {
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>); Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
// cachegrind says the plain loop is fewer instructions and more mis-predicted if (n->index[i] == index) {
// branches. Microbenchmark says plain loop is faster. It's written in this return i;
// weird "generic" way though in case someday we can use the simd
// implementation easily if we want.
if constexpr (std::is_same_v<NodeT, Node3>) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] == index) {
return i;
}
} }
return -1;
} }
return -1;
}
int getNodeIndex(Node16 *self, uint8_t index) {
#ifdef HAS_AVX #ifdef HAS_AVX
// Based on https://www.the-paper-trail.org/post/art-paper-notes/ // Based on https://www.the-paper-trail.org/post/art-paper-notes/
@@ -801,7 +791,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
// keys aren't valid, we'll mask the results to only consider the valid ones // keys aren't valid, we'll mask the results to only consider the valid ones
// below. // below.
__m128i indices; __m128i indices;
memcpy(&indices, self->index, NodeT::kMaxNodes); memcpy(&indices, self->index, Node16::kMaxNodes);
__m128i results = _mm_cmpeq_epi8(key_vec, indices); __m128i results = _mm_cmpeq_epi8(key_vec, indices);
// Build a mask to select only the first node->num_children values from the // Build a mask to select only the first node->num_children values from the
@@ -824,12 +814,11 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
uint8x16_t indices; uint8x16_t indices;
memcpy(&indices, self->index, NodeT::kMaxNodes); memcpy(&indices, self->index, Node16::kMaxNodes);
// 0xff for each match // 0xff for each match
uint16x8_t results = uint16x8_t results =
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices)); vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
static_assert(NodeT::kMaxNodes <= 16); assume(self->numChildren <= Node16::kMaxNodes);
assume(self->numChildren <= NodeT::kMaxNodes);
uint64_t mask = self->numChildren == 16 uint64_t mask = self->numChildren == 16
? uint64_t(-1) ? uint64_t(-1)
: (uint64_t(1) << (self->numChildren * 4)) - 1; : (uint64_t(1) << (self->numChildren * 4)) - 1;
@@ -1082,22 +1071,18 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
} }
} }
template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) { Node *getChildGeq(Node0 *, int) { return nullptr; }
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
// cachegrind says the plain loop is fewer instructions and more mis-predicted Node *getChildGeq(Node3 *n, int child) {
// branches. Microbenchmark says plain loop is faster. It's written in this for (int i = 0; i < n->numChildren; ++i) {
// weird "generic" way though so that someday we can use the simd if (n->index[i] >= child) {
// implementation easily if we want. return n->children[i];
if constexpr (std::is_same_v<NodeT, Node3>) {
Node3 *n = (Node3 *)self;
for (int i = 0; i < n->numChildren; ++i) {
if (n->index[i] >= child) {
return n->children[i];
}
} }
return nullptr;
} }
return nullptr;
}
Node *getChildGeq(Node16 *self, int child) {
if (child > 255) { if (child > 255) {
return nullptr; return nullptr;
} }
@@ -1105,7 +1090,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
#ifdef HAS_AVX #ifdef HAS_AVX
__m128i key_vec = _mm_set1_epi8(child); __m128i key_vec = _mm_set1_epi8(child);
__m128i indices; __m128i indices;
memcpy(&indices, self->index, NodeT::kMaxNodes); memcpy(&indices, self->index, Node16::kMaxNodes);
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices)); __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
int mask = (1 << self->numChildren) - 1; int mask = (1 << self->numChildren) - 1;
uint32_t bitfield = _mm_movemask_epi8(results) & mask; uint32_t bitfield = _mm_movemask_epi8(results) & mask;
@@ -1115,8 +1100,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
memcpy(&indices, self->index, sizeof(self->index)); memcpy(&indices, self->index, sizeof(self->index));
// 0xff for each leq // 0xff for each leq
auto results = vcleq_u8(vdupq_n_u8(child), indices); auto results = vcleq_u8(vdupq_n_u8(child), indices);
static_assert(NodeT::kMaxNodes <= 16); assume(self->numChildren <= Node16::kMaxNodes);
assume(self->numChildren <= NodeT::kMaxNodes);
uint64_t mask = self->numChildren == 16 uint64_t mask = self->numChildren == 16
? uint64_t(-1) ? uint64_t(-1)
: (uint64_t(1) << (self->numChildren * 4)) - 1; : (uint64_t(1) << (self->numChildren * 4)) - 1;
@@ -1141,13 +1125,6 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
#endif #endif
} }
Node *getChildGeq(Node0 *, int) { return nullptr; }
Node *getChildGeq(Node3 *self, int child) {
return getChildGeqSimd(self, child);
}
Node *getChildGeq(Node16 *self, int child) {
return getChildGeqSimd(self, child);
}
Node *getChildGeq(Node48 *self, int child) { Node *getChildGeq(Node48 *self, int child) {
int c = self->bitSet.firstSetGeq(child); int c = self->bitSet.firstSetGeq(child);
if (c < 0) { if (c < 0) {
@@ -1360,7 +1337,7 @@ Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
auto *self3 = static_cast<Node3 *>(self); auto *self3 = static_cast<Node3 *>(self);
int i = self->numChildren - 1; int i = self->numChildren - 1;
for (; i >= 0; --i) { for (; i >= 0; --i) {
if (int(self3->index[i]) < int(index)) { if (self3->index[i] < index) {
break; break;
} }
self3->index[i + 1] = self3->index[i]; self3->index[i + 1] = self3->index[i];
@@ -1390,7 +1367,7 @@ Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
auto *self16 = static_cast<Node16 *>(self); auto *self16 = static_cast<Node16 *>(self);
int i = self->numChildren - 1; int i = self->numChildren - 1;
for (; i >= 0; --i) { for (; i >= 0; --i) {
if (int(self16->index[i]) < int(index)) { if (self16->index[i] < index) {
break; break;
} }
self16->index[i + 1] = self16->index[i]; self16->index[i + 1] = self16->index[i];
@@ -1419,9 +1396,7 @@ Node *&getOrCreateChild(Node *&self, std::span<const uint8_t> &key,
insert48: insert48:
auto *self48 = static_cast<Node48 *>(self); auto *self48 = static_cast<Node48 *>(self);
self48->bitSet.set(index); self48->bitSet.set(index);
++self->numChildren; auto nextFree = self48->numChildren++;
assert(self48->nextFree < 48);
int nextFree = self48->nextFree++;
self48->index[index] = nextFree; self48->index[index] = nextFree;
self48->reverseIndex[nextFree] = index; self48->reverseIndex[nextFree] = index;
auto &result = self48->children[nextFree]; auto &result = self48->children[nextFree];
@@ -1812,7 +1787,7 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
parent48->bitSet.reset(parentsIndex); parent48->bitSet.reset(parentsIndex);
int8_t toRemoveChildrenIndex = int8_t toRemoveChildrenIndex =
std::exchange(parent48->index[parentsIndex], -1); std::exchange(parent48->index[parentsIndex], -1);
int8_t lastChildrenIndex = --parent48->nextFree; auto lastChildrenIndex = --parent48->numChildren;
assert(toRemoveChildrenIndex >= 0); assert(toRemoveChildrenIndex >= 0);
assert(lastChildrenIndex >= 0); assert(lastChildrenIndex >= 0);
if (toRemoveChildrenIndex != lastChildrenIndex) { if (toRemoveChildrenIndex != lastChildrenIndex) {
@@ -1831,8 +1806,6 @@ Node *erase(Node *self, WriteContext *tls, ConflictSet::Impl *impl,
} }
parent48->childMaxVersion[lastChildrenIndex] = tls->zero; parent48->childMaxVersion[lastChildrenIndex] = tls->zero;
--parent->numChildren;
if (needsDownsize(parent48)) { if (needsDownsize(parent48)) {
downsize(parent48, tls, impl, result); downsize(parent48, tls, impl, result);
} }