Commit to non-simd Node3 implementations
Some checks failed
Tests / Clang total: 2620, passed: 2620
Clang |Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|0|0|0|0|:clap:
Tests / Debug total: 2618, passed: 2618
Tests / SIMD fallback total: 2620, passed: 2620
Tests / Release [gcc] total: 2620, passed: 2620
GNU C Compiler (gcc) |Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 1957, passed: 1957
Tests / Coverage total: 1967, failed: 1, passed: 1966
weaselab/conflict-set/pipeline/head There was a failure building this commit
Some checks failed
Tests / Clang total: 2620, passed: 2620
Clang |Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|0|0|0|0|:clap:
Tests / Debug total: 2618, passed: 2618
Tests / SIMD fallback total: 2620, passed: 2620
Tests / Release [gcc] total: 2620, passed: 2620
GNU C Compiler (gcc) |Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|0|0|0|0|:clap:
Tests / Release [gcc,aarch64] total: 1957, passed: 1957
Tests / Coverage total: 1967, failed: 1, passed: 1966
weaselab/conflict-set/pipeline/head There was a failure building this commit
This commit is contained in:
@@ -768,14 +768,7 @@ private:
|
|||||||
BoundedFreeListAllocator<Node256> node256;
|
BoundedFreeListAllocator<Node256> node256;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
|
int getNodeIndex(Node3 *self, uint8_t index) {
|
||||||
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
|
|
||||||
|
|
||||||
// cachegrind says the plain loop is fewer instructions and more mis-predicted
|
|
||||||
// branches. Microbenchmark says plain loop is faster. It's written in this
|
|
||||||
// weird "generic" way though in case someday we can use the simd
|
|
||||||
// implementation easily if we want.
|
|
||||||
if constexpr (std::is_same_v<NodeT, Node3>) {
|
|
||||||
Node3 *n = (Node3 *)self;
|
Node3 *n = (Node3 *)self;
|
||||||
for (int i = 0; i < n->numChildren; ++i) {
|
for (int i = 0; i < n->numChildren; ++i) {
|
||||||
if (n->index[i] == index) {
|
if (n->index[i] == index) {
|
||||||
@@ -783,7 +776,9 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int getNodeIndex(Node16 *self, uint8_t index) {
|
||||||
|
|
||||||
#ifdef HAS_AVX
|
#ifdef HAS_AVX
|
||||||
// Based on https://www.the-paper-trail.org/post/art-paper-notes/
|
// Based on https://www.the-paper-trail.org/post/art-paper-notes/
|
||||||
@@ -796,7 +791,7 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
|
|||||||
// keys aren't valid, we'll mask the results to only consider the valid ones
|
// keys aren't valid, we'll mask the results to only consider the valid ones
|
||||||
// below.
|
// below.
|
||||||
__m128i indices;
|
__m128i indices;
|
||||||
memcpy(&indices, self->index, NodeT::kMaxNodes);
|
memcpy(&indices, self->index, Node16::kMaxNodes);
|
||||||
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
|
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
|
||||||
|
|
||||||
// Build a mask to select only the first node->num_children values from the
|
// Build a mask to select only the first node->num_children values from the
|
||||||
@@ -819,12 +814,11 @@ template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) {
|
|||||||
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
|
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
|
||||||
|
|
||||||
uint8x16_t indices;
|
uint8x16_t indices;
|
||||||
memcpy(&indices, self->index, NodeT::kMaxNodes);
|
memcpy(&indices, self->index, Node16::kMaxNodes);
|
||||||
// 0xff for each match
|
// 0xff for each match
|
||||||
uint16x8_t results =
|
uint16x8_t results =
|
||||||
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
|
vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices));
|
||||||
static_assert(NodeT::kMaxNodes <= 16);
|
assume(self->numChildren <= Node16::kMaxNodes);
|
||||||
assume(self->numChildren <= NodeT::kMaxNodes);
|
|
||||||
uint64_t mask = self->numChildren == 16
|
uint64_t mask = self->numChildren == 16
|
||||||
? uint64_t(-1)
|
? uint64_t(-1)
|
||||||
: (uint64_t(1) << (self->numChildren * 4)) - 1;
|
: (uint64_t(1) << (self->numChildren * 4)) - 1;
|
||||||
@@ -1077,22 +1071,18 @@ ChildAndMaxVersion getChildAndMaxVersion(Node *self, uint8_t index) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
|
Node *getChildGeq(Node0 *, int) { return nullptr; }
|
||||||
static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>);
|
|
||||||
|
|
||||||
// cachegrind says the plain loop is fewer instructions and more mis-predicted
|
Node *getChildGeq(Node3 *n, int child) {
|
||||||
// branches. Microbenchmark says plain loop is faster. It's written in this
|
|
||||||
// weird "generic" way though so that someday we can use the simd
|
|
||||||
// implementation easily if we want.
|
|
||||||
if constexpr (std::is_same_v<NodeT, Node3>) {
|
|
||||||
Node3 *n = (Node3 *)self;
|
|
||||||
for (int i = 0; i < n->numChildren; ++i) {
|
for (int i = 0; i < n->numChildren; ++i) {
|
||||||
if (n->index[i] >= child) {
|
if (n->index[i] >= child) {
|
||||||
return n->children[i];
|
return n->children[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Node *getChildGeq(Node16 *self, int child) {
|
||||||
if (child > 255) {
|
if (child > 255) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@@ -1100,7 +1090,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
|
|||||||
#ifdef HAS_AVX
|
#ifdef HAS_AVX
|
||||||
__m128i key_vec = _mm_set1_epi8(child);
|
__m128i key_vec = _mm_set1_epi8(child);
|
||||||
__m128i indices;
|
__m128i indices;
|
||||||
memcpy(&indices, self->index, NodeT::kMaxNodes);
|
memcpy(&indices, self->index, Node16::kMaxNodes);
|
||||||
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
|
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
|
||||||
int mask = (1 << self->numChildren) - 1;
|
int mask = (1 << self->numChildren) - 1;
|
||||||
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
|
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
|
||||||
@@ -1110,8 +1100,7 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
|
|||||||
memcpy(&indices, self->index, sizeof(self->index));
|
memcpy(&indices, self->index, sizeof(self->index));
|
||||||
// 0xff for each leq
|
// 0xff for each leq
|
||||||
auto results = vcleq_u8(vdupq_n_u8(child), indices);
|
auto results = vcleq_u8(vdupq_n_u8(child), indices);
|
||||||
static_assert(NodeT::kMaxNodes <= 16);
|
assume(self->numChildren <= Node16::kMaxNodes);
|
||||||
assume(self->numChildren <= NodeT::kMaxNodes);
|
|
||||||
uint64_t mask = self->numChildren == 16
|
uint64_t mask = self->numChildren == 16
|
||||||
? uint64_t(-1)
|
? uint64_t(-1)
|
||||||
: (uint64_t(1) << (self->numChildren * 4)) - 1;
|
: (uint64_t(1) << (self->numChildren * 4)) - 1;
|
||||||
@@ -1136,13 +1125,6 @@ template <class NodeT> Node *getChildGeqSimd(NodeT *self, int child) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
Node *getChildGeq(Node0 *, int) { return nullptr; }
|
|
||||||
Node *getChildGeq(Node3 *self, int child) {
|
|
||||||
return getChildGeqSimd(self, child);
|
|
||||||
}
|
|
||||||
Node *getChildGeq(Node16 *self, int child) {
|
|
||||||
return getChildGeqSimd(self, child);
|
|
||||||
}
|
|
||||||
Node *getChildGeq(Node48 *self, int child) {
|
Node *getChildGeq(Node48 *self, int child) {
|
||||||
int c = self->bitSet.firstSetGeq(child);
|
int c = self->bitSet.firstSetGeq(child);
|
||||||
if (c < 0) {
|
if (c < 0) {
|
||||||
|
Reference in New Issue
Block a user