Use bitset index for Node48 too

This commit is contained in:
2024-01-31 16:04:03 -08:00
parent bafe1edfa4
commit 31a555c44c
3 changed files with 25 additions and 130 deletions

View File

@@ -50,7 +50,7 @@ add_library(${PROJECT_NAME} SHARED ConflictSet.cpp)
target_compile_options(${PROJECT_NAME} PRIVATE -fPIC -fno-exceptions
-fvisibility=hidden)
target_link_options(${PROJECT_NAME} PRIVATE $<$<NOT:$<CONFIG:Debug>>:
-nodefaultlibs -lc>)
-nodefaultlibs -lc -lgcc_s>)
if(NOT APPLE)
target_link_options(${PROJECT_NAME} PRIVATE

View File

@@ -64,17 +64,7 @@ struct Node16 : Node {
Node16() { this->type = Type::Node16; }
};
// Node variant with room for 48 children, addressed indirectly: a 256-entry
// table maps each key byte to a slot in `children`.
struct Node48 : Node {
// Slot in `children` that the insertion path assigns to the next new child
// (that path asserts it is below 48 before using it).
int8_t nextFree = 0;
// index[b] is the slot in `children` holding the child for key byte b; a
// negative entry means byte b has no child.
int8_t index[256];
Node *children[48] = {};
Node48() {
this->type = Type::Node48;
// Fill every entry with 0xff, which reads back as -1 in an int8_t, so the
// table starts out with no key byte mapped.
memset(index, -1, 256);
}
};
struct PointerSet {
struct BitSet {
bool test(int i) const {
assert(0 <= i);
assert(i < 256);
@@ -132,8 +122,19 @@ private:
__uint128_t hi = 0;
};
// Node variant with room for 48 children, addressed indirectly: a 256-entry
// table maps each key byte to a slot in `children`, and a bit set records
// which key bytes are occupied.
struct Node48 : Node {
// Bit b is set by the insertion path whenever key byte b receives a child;
// the ordered lookups query it via firstSetGeq / lastSetLeq instead of
// scanning `index`.
BitSet bitSet;
Node *children[48] = {};
// Slot in `children` that the insertion path assigns to the next new child
// (that path asserts it is below 48 before using it).
int8_t nextFree = 0;
// index[b] is the slot in `children` holding the child for key byte b; a
// negative entry means byte b has no child.
int8_t index[256];
Node48() {
this->type = Type::Node48;
// Fill every entry with 0xff, which reads back as -1 in an int8_t, so the
// table starts out with no key byte mapped.
memset(index, -1, 256);
}
};
struct Node256 : Node {
PointerSet pointerSet;
BitSet bitSet;
Node *children[256] = {};
Node256() { this->type = Type::Node256; }
};
@@ -205,54 +206,6 @@ int getNodeIndex(Node16 *self, uint8_t index) {
#endif
}
#ifdef HAS_AVX
// Returns the lowest position (0-15) in x whose byte is not -1, or -1 when all
// 16 bytes are -1. x must point at 16 readable bytes.
int firstNonNeg1(const int8_t x[16]) {
__m128i key_vec = _mm_set1_epi8(-1);
__m128i indices;
// Copy through memcpy, which places no alignment requirement on x.
memcpy(&indices, x, 16);
// Each byte lane becomes 0xff where x equals -1 and 0x00 elsewhere.
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
// movemask gathers one bit per lane (set where the byte is -1); flipping the
// low 16 bits leaves a set bit for every byte that is NOT -1.
uint32_t bitfield = _mm_movemask_epi8(results) ^ 0xffff;
if (bitfield == 0)
return -1;
// Lowest set bit == lowest matching byte position.
return std::countr_zero(bitfield);
}
// Returns the highest position (0-15) in x whose byte is not -1, or -1 when
// all 16 bytes are -1. x must point at 16 readable bytes.
int lastNonNeg1(const int8_t x[16]) {
__m128i key_vec = _mm_set1_epi8(-1);
__m128i indices;
// Copy through memcpy, which places no alignment requirement on x.
memcpy(&indices, x, 16);
// Each byte lane becomes 0xff where x equals -1 and 0x00 elsewhere.
__m128i results = _mm_cmpeq_epi8(key_vec, indices);
// movemask gathers one bit per lane (set where the byte is -1); flipping the
// low 16 bits leaves a set bit for every byte that is NOT -1.
uint32_t bitfield = _mm_movemask_epi8(results) ^ 0xffff;
if (bitfield == 0)
return -1;
// bitfield is 32 bits wide, so 31 - (leading zeros) is the position of its
// highest set bit, i.e. the highest matching byte position.
return 31 - std::countl_zero(bitfield);
}
#endif
#ifdef HAS_ARM_NEON
// Returns the lowest position (0-15) in x whose byte is not -1, or -1 when all
// 16 bytes are -1. x must point at 16 readable bytes.
int firstNonNeg1(const int8_t x[16]) {
uint8x16_t indices;
// Copy through memcpy, which places no alignment requirement on x.
memcpy(&indices, x, 16);
// vceqq_u8 yields 0xff in each byte lane equal to 0xff (-1) and 0x00
// elsewhere; the result is then viewed as eight 16-bit lanes.
uint16x8_t results = vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(-1), indices));
// Shift-right-by-4-and-narrow keeps 4 bits per original byte, packing the
// comparison into 64 bits (one nibble per byte). Inverting leaves a nonzero
// nibble for every byte that is NOT -1.
uint64_t bitfield =
~vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0);
if (bitfield == 0)
return -1;
// Four mask bits per byte, so divide the bit position by 4.
return std::countr_zero(bitfield) / 4;
}
// Returns the highest position (0-15) in x whose byte is not -1, or -1 when
// all 16 bytes are -1. x must point at 16 readable bytes.
int lastNonNeg1(const int8_t x[16]) {
uint8x16_t indices;
// Copy through memcpy, which places no alignment requirement on x.
memcpy(&indices, x, 16);
// vceqq_u8 yields 0xff in each byte lane equal to 0xff (-1) and 0x00
// elsewhere; the result is then viewed as eight 16-bit lanes.
uint16x8_t results = vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(-1), indices));
// Shift-right-by-4-and-narrow keeps 4 bits per original byte, packing the
// comparison into 64 bits (one nibble per byte). Inverting leaves a nonzero
// nibble for every byte that is NOT -1.
uint64_t bitfield =
~vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(results, 4)), 0);
if (bitfield == 0)
return -1;
// Four mask bits per byte: leading zeros / 4 counts the trailing -1 bytes,
// and subtracting from 15 gives the highest matching byte position.
return 15 - std::countl_zero(bitfield) / 4;
}
#endif
[[maybe_unused]] Node *getChild(Node *self, uint8_t index) {
if (self->type == Type::Node4) {
auto *self4 = static_cast<Node4 *>(self);
@@ -372,36 +325,10 @@ int getChildGeq(Node *self, int child) {
#endif
} else if (self->type == Type::Node48) {
auto *self48 = static_cast<Node48 *>(self);
#if defined(HAS_AVX) || defined(HAS_ARM_NEON)
int i = child;
for (; (i & 0xf) != 0; ++i) {
if (self48->index[i] >= 0) {
assert(self48->children[self48->index[i]] != nullptr);
return i;
}
}
for (; i < 256; i += 16) {
auto result = firstNonNeg1(self48->index + i);
if (result != -1) {
return i + result;
}
}
#else
for (int i = child; i < 256; ++i) {
if (self48->index[i] >= 0) {
assert(self48->children[self48->index[i]] != nullptr);
return i;
}
}
#endif
return self48->bitSet.firstSetGeq(child);
} else {
auto *self256 = static_cast<Node256 *>(self);
#ifndef NDEBUG
for (int i = 0; i < 256; ++i) {
assert(self256->pointerSet.test(i) == (self256->children[i] != nullptr));
}
#endif
return self256->pointerSet.firstSetGeq(child);
return self256->bitSet.firstSetGeq(child);
}
return -1;
}
@@ -475,44 +402,10 @@ int getChildLeq(Node *self, int child) {
#endif
} else if (self->type == Type::Node48) {
auto *self48 = static_cast<Node48 *>(self);
#if defined(HAS_AVX) || defined(HAS_ARM_NEON)
int i = child;
if (i < 0) {
return -1;
}
for (; (i & 0xf) != 0; --i) {
if (self48->index[i] >= 0) {
assert(self48->children[self48->index[i]] != nullptr);
return i;
}
}
if (self48->index[i] >= 0) {
assert(self48->children[self48->index[i]] != nullptr);
return i;
}
i -= 16;
for (; i >= 0; i -= 16) {
auto result = lastNonNeg1(self48->index + i);
if (result != -1) {
return i + result;
}
}
#else
for (int i = child; i >= 0; --i) {
if (self48->index[i] >= 0) {
assert(self48->children[self48->index[i]] != nullptr);
return i;
}
}
#endif
return self48->bitSet.lastSetLeq(child);
} else {
auto *self256 = static_cast<Node256 *>(self);
#ifndef NDEBUG
for (int i = 0; i < 256; ++i) {
assert(self256->pointerSet.test(i) == (self256->children[i] != nullptr));
}
#endif
return self256->pointerSet.lastSetLeq(child);
return self256->bitSet.lastSetLeq(child);
}
return -1;
}
@@ -574,6 +467,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
newSelf->nextFree = 16;
int i = 0;
for (auto x : self16->index) {
newSelf->bitSet.set(x);
newSelf->children[i] = self16->children[i];
newSelf->index[x] = i;
++i;
@@ -611,7 +505,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
memcpy((void *)newSelf, self, offsetof(Node, type));
for (int i = 0; i < 256; ++i) {
if (self48->index[i] >= 0) {
newSelf->pointerSet.set(i);
newSelf->bitSet.set(i);
newSelf->children[i] = self48->children[self48->index[i]];
}
}
@@ -620,6 +514,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
setChildrenParents(self);
goto insert256;
} else {
self48->bitSet.set(index);
++self->numChildren;
assert(self48->nextFree < 48);
self48->index[index] = self48->nextFree;
@@ -632,7 +527,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index) {
if (!self256->children[index]) {
++self->numChildren;
}
self256->pointerSet.set(index);
self256->bitSet.set(index);
return self256->children[index];
}
}
@@ -1174,8 +1069,8 @@ void printTree() {
}
int main(void) {
// bench();
printTree();
bench();
// printTree();
return 0;
}
#endif

View File

@@ -3,4 +3,4 @@
set -euo pipefail
diff -u "$2" <(nm "$1" | grep " T " | cut -f3 -d " " | sort)
nm "$1" | grep " U " | (! grep -Pv 'abort|free|malloc|mem[a-z]*')
nm "$1" | grep " U " | (! grep -Pv 'abort|free|malloc|mem[a-z]*|__ashlti3')