Use new simd library for scanning string literals

This commit is contained in:
2025-06-04 13:49:19 -04:00
parent b5f41768a7
commit 3e2f830d0b
4 changed files with 112165 additions and 82 deletions

View File

@@ -1,6 +1,5 @@
#pragma once #pragma once
#include <bit>
#include <cassert> #include <cassert>
#include <cctype> #include <cctype>
#include <cstdint> #include <cstdint>
@@ -9,15 +8,9 @@
#include <initializer_list> #include <initializer_list>
#include <tuple> #include <tuple>
#ifdef __x86_64__
#include <immintrin.h>
#endif
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#include "musttail.h" #include "musttail.h"
#include "preserve_none.h" #include "preserve_none.h"
#include "simd.h"
#include "tables.h" #include "tables.h"
#include "weaseljson.h" #include "weaseljson.h"
@@ -409,64 +402,26 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self) {
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self) { inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self) {
const auto before = self->buf; const auto before = self->buf;
// Advance self->buf to the first "non-normal" character // Advance self->buf to the first "non-normal" character
#ifdef __x86_64__
for (;;) { for (;;) {
if (self->bufEnd - self->buf < 16) [[unlikely]] { constexpr int kStride = 64;
if (self->bufEnd - self->buf < kStride) [[unlikely]] {
while (self->buf != self->bufEnd && while (self->buf != self->bufEnd &&
tables.stringByteMeaning[uint8_t(*self->buf)] == Tables::NORMAL) { tables.stringByteMeaning[uint8_t(*self->buf)] == Tables::NORMAL) {
++self->buf; ++self->buf;
} }
break; break;
} }
__m128i x; using V = simd<int8_t, kStride>;
memcpy(&x, self->buf, 16); auto v = V{(int8_t *)self->buf};
const uint32_t dubquote = int normal =
_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8('"'), x)); (v != V::splat('"') & v != V::splat('\\') & v >= V::splat(0x20))
const uint32_t backslash = .count_leading_nonzero_lanes();
_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8('\\'), x)); self->buf += normal;
const uint32_t control_or_negative = if (normal < kStride) {
_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_set1_epi8(0x20), x));
const uint32_t non_normal = dubquote | backslash | control_or_negative;
if (non_normal) {
self->buf += std::countr_zero(non_normal);
break; break;
} }
self->buf += 16;
} }
#elif defined(__aarch64__)
for (;;) {
if (self->bufEnd - self->buf < 16) [[unlikely]] {
while (self->buf != self->bufEnd &&
tables.stringByteMeaning[uint8_t(*self->buf)] == Tables::NORMAL) {
++self->buf;
}
break;
}
int8x16_t x;
memcpy(&x, self->buf, 16);
const auto dubquote = vreinterpretq_s8_u8(vceqq_s8(vdupq_n_s8('"'), x));
const auto backslash = vreinterpretq_s8_u8(vceqq_s8(vdupq_n_s8('\\'), x));
const auto control_or_negative =
vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(0x20), x));
const auto non_normal = vget_lane_u64(
vreinterpret_u64_u8(vshrn_n_u16(
vreinterpretq_u16_s8(
vorrq_s8(vorrq_s8(dubquote, backslash), control_or_negative)),
4)),
0);
if (non_normal) {
self->buf += std::countr_zero(non_normal) / 4;
break;
}
self->buf += 16;
}
#else
while (self->buf != self->bufEnd &&
tables.stringByteMeaning[uint8_t(*self->buf)] == Tables::NORMAL) {
++self->buf;
}
#endif
int len = self->buf - before; int len = self->buf - before;
memmove(self->writeBuf, before, len); memmove(self->writeBuf, before, len);

1266
src/simd.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -2,6 +2,7 @@
#include <cctype> #include <cctype>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <fstream>
#include <limits> #include <limits>
#include <string> #include <string>
@@ -119,32 +120,11 @@
namespace { namespace {
const std::string json = R"({ const std::string json = []() {
"a number": 12345, std::ifstream infile{"test.json"};
"true": true, return std::string{std::istreambuf_iterator<char>(infile),
"false": false, std::istreambuf_iterator<char>()};
"null": null, }();
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
})";
void testStreaming(std::string const &json) { void testStreaming(std::string const &json) {
SerializeState streaming; SerializeState streaming;

110882
test.json Normal file

File diff suppressed because it is too large Load Diff