Compare commits
2 Commits
14515e186a
...
0a1843a161
Author | SHA1 | Date | |
---|---|---|---|
0a1843a161 | |||
4edf0315d9 |
@@ -25,6 +25,9 @@ endif()
|
|||||||
add_compile_options(-fdata-sections -ffunction-sections -Wswitch-enum
|
add_compile_options(-fdata-sections -ffunction-sections -Wswitch-enum
|
||||||
-Werror=switch-enum)
|
-Werror=switch-enum)
|
||||||
|
|
||||||
|
option(USE_SIMD_FALLBACK
|
||||||
|
"Use fallback implementations of functions that use SIMD" OFF)
|
||||||
|
|
||||||
# This is encouraged according to
|
# This is encouraged according to
|
||||||
# https://valgrind.org/docs/manual/manual-core-adv.html#manual-core-adv.clientreq
|
# https://valgrind.org/docs/manual/manual-core-adv.html#manual-core-adv.clientreq
|
||||||
include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/valgrind)
|
include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/valgrind)
|
||||||
@@ -43,6 +46,7 @@ endif()
|
|||||||
include(CheckIncludeFileCXX)
|
include(CheckIncludeFileCXX)
|
||||||
include(CMakePushCheckState)
|
include(CMakePushCheckState)
|
||||||
|
|
||||||
|
if(NOT USE_SIMD_FALLBACK)
|
||||||
cmake_push_check_state()
|
cmake_push_check_state()
|
||||||
list(APPEND CMAKE_REQUIRED_FLAGS -mavx)
|
list(APPEND CMAKE_REQUIRED_FLAGS -mavx)
|
||||||
check_include_file_cxx("immintrin.h" HAS_AVX)
|
check_include_file_cxx("immintrin.h" HAS_AVX)
|
||||||
@@ -56,6 +60,7 @@ check_include_file_cxx("arm_neon.h" HAS_ARM_NEON)
|
|||||||
if(HAS_ARM_NEON)
|
if(HAS_ARM_NEON)
|
||||||
add_compile_definitions(HAS_ARM_NEON)
|
add_compile_definitions(HAS_ARM_NEON)
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "")
|
set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "")
|
||||||
|
|
||||||
|
@@ -29,6 +29,7 @@ limitations under the License.
|
|||||||
#include <span>
|
#include <span>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
#include <type_traits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#ifdef HAS_AVX
|
#ifdef HAS_AVX
|
||||||
@@ -954,6 +955,42 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
|||||||
assert(self->getType() == Type_Node16);
|
assert(self->getType() == Type_Node16);
|
||||||
|
|
||||||
++self->numChildren;
|
++self->numChildren;
|
||||||
|
#ifdef HAS_AVX
|
||||||
|
__m128i key_vec = _mm_set1_epi8(index);
|
||||||
|
__m128i indices;
|
||||||
|
memcpy(&indices, self16->index, sizeof(self16->index));
|
||||||
|
__m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices));
|
||||||
|
int mask = (1 << (self->numChildren - 1)) - 1;
|
||||||
|
uint32_t bitfield = _mm_movemask_epi8(results) & mask;
|
||||||
|
bitfield |= uint32_t(1) << (self->numChildren - 1);
|
||||||
|
int i = std::countr_zero(bitfield);
|
||||||
|
if (i < self->numChildren - 1) {
|
||||||
|
memmove(self16->index + i + 1, self16->index + i,
|
||||||
|
self->numChildren - (i + 1));
|
||||||
|
memmove(self16->children + i + 1, self16->children + i,
|
||||||
|
(self->numChildren - (i + 1)) * sizeof(Child));
|
||||||
|
}
|
||||||
|
#elif defined(HAS_ARM_NEON)
|
||||||
|
uint8x16_t indices;
|
||||||
|
memcpy(&indices, self16->index, sizeof(self16->index));
|
||||||
|
// 0xff for each leq
|
||||||
|
auto results = vcleq_u8(vdupq_n_u8(index), indices);
|
||||||
|
uint64_t mask = (uint64_t(1) << ((self->numChildren - 1) * 4)) - 1;
|
||||||
|
// 0xf for each 0xff (within mask)
|
||||||
|
uint64_t bitfield =
|
||||||
|
vget_lane_u64(
|
||||||
|
vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)),
|
||||||
|
0) &
|
||||||
|
mask;
|
||||||
|
bitfield |= uint64_t(0xf) << ((self->numChildren - 1) * 4);
|
||||||
|
int i = std::countr_zero(bitfield) / 4;
|
||||||
|
if (i < self->numChildren - 1) {
|
||||||
|
memmove(self16->index + i + 1, self16->index + i,
|
||||||
|
self->numChildren - (i + 1));
|
||||||
|
memmove(self16->children + i + 1, self16->children + i,
|
||||||
|
(self->numChildren - (i + 1)) * sizeof(Child));
|
||||||
|
}
|
||||||
|
#else
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < int(self->numChildren) - 1; ++i) {
|
for (; i < int(self->numChildren) - 1; ++i) {
|
||||||
if (int(self16->index[i]) > int(index)) {
|
if (int(self16->index[i]) > int(index)) {
|
||||||
@@ -964,6 +1001,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
self16->index[i] = index;
|
self16->index[i] = index;
|
||||||
auto &result = self16->children[i].child;
|
auto &result = self16->children[i].child;
|
||||||
result = nullptr;
|
result = nullptr;
|
||||||
@@ -979,8 +1017,8 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
|||||||
self = newSelf;
|
self = newSelf;
|
||||||
goto insert256;
|
goto insert256;
|
||||||
}
|
}
|
||||||
insert48:
|
|
||||||
|
|
||||||
|
insert48:
|
||||||
auto *self48 = static_cast<Node48 *>(self);
|
auto *self48 = static_cast<Node48 *>(self);
|
||||||
self48->bitSet.set(index);
|
self48->bitSet.set(index);
|
||||||
++self->numChildren;
|
++self->numChildren;
|
||||||
@@ -992,6 +1030,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
case Type_Node256: {
|
case Type_Node256: {
|
||||||
|
|
||||||
insert256:
|
insert256:
|
||||||
auto *self256 = static_cast<Node256 *>(self);
|
auto *self256 = static_cast<Node256 *>(self);
|
||||||
++self->numChildren;
|
++self->numChildren;
|
||||||
|
Reference in New Issue
Block a user