Use SIMD in maybeSkipWs. Not sure if it is faster.

This commit is contained in:
2025-05-13 21:17:37 -04:00
parent 36f8df4201
commit 280962cdd0

View File

@@ -1,7 +1,9 @@
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <immintrin.h>
#include <initializer_list>
#include <string>
@@ -155,12 +157,68 @@ const char *symbolNames[] = {
"N_OBJECT_MAYBE_CONTINUE",
};
namespace {
// Scalar predicate: true iff x is space (0x20), line feed (0x0A), carriage
// return (0x0D) or horizontal tab (0x09). Must stay in sync with charSet in
// leadingWhitespaceCount below.
bool whitespace(char x) {
  return x == 0x20 || x == 0x0A || x == 0x0D || x == 0x09;
}

// Returns the number of consecutive whitespace bytes (' ', '\t', '\n', '\r')
// at the start of buf[0, len). Returns 0 when len <= 0.
//
// Full 16-byte chunks are classified with SSSE3 nibble lookups; the remaining
// (< 16) bytes are checked one at a time. The target attribute lets the
// SSSE3 intrinsics compile even if the translation unit itself is not built
// with -mssse3 (GCC/Clang only, like __builtin_ctz below).
__attribute__((target("ssse3")))
int leadingWhitespaceCount(const char *buf, int len) {
  // Based on
  // http://0x80.pl/articles/simd-byte-lookup.html#special-case-1-small-sets
  //
  // static: the local Table type reads charSet in its constructor, which is
  // only unconditionally allowed for variables with static storage duration.
  static constexpr uint8_t charSet[] = {' ', '\t', '\n', '\r'};
  static constexpr struct Table {
    constexpr Table() {
      // Each member of charSet owns one bit of a byte.
      static_assert(sizeof(charSet) <= 8, "one bit per set member");
      for (int i = 0; i < int(sizeof(charSet)); ++i) {
        const uint8_t c = charSet[i];
        const uint8_t bit = uint8_t(1u << i);
        lowNibbleTable[c & 0xf] |= bit;
        highNibbleTable[c >> 4] |= bit;
      }
    }
    // lowNibbleTable[i] is the set of chars with i as its low nibble
    alignas(16) uint8_t lowNibbleTable[16]{};
    // highNibbleTable[i] is the set of chars with i as its high nibble
    alignas(16) uint8_t highNibbleTable[16]{};
  } table;

  const __m128i nibbleMask = _mm_set1_epi8(0x0f);
  const __m128i lowTable =
      _mm_load_si128(reinterpret_cast<const __m128i *>(table.lowNibbleTable));
  const __m128i highTable =
      _mm_load_si128(reinterpret_cast<const __m128i *>(table.highNibbleTable));

  int i = 0;
  // `len - i >= 16` rather than `i + 16 <= len`: the latter overflows int
  // when len is within 16 of INT_MAX.
  for (; len - i >= 16; i += 16) {
    const __m128i input =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf + i));
    // Masking the low nibble also clears bit 7, which would otherwise make
    // the shuffle produce 0 for that lane.
    const __m128i lowNibbles = _mm_and_si128(input, nibbleMask);
    // There is no 8-bit shift; shift 16-bit lanes and mask off the bits
    // that crossed over from the neighbouring byte.
    const __m128i highNibbles =
        _mm_and_si128(_mm_srli_epi16(input, 4), nibbleMask);
    const __m128i lowSets = _mm_shuffle_epi8(lowTable, lowNibbles);
    const __m128i highSets = _mm_shuffle_epi8(highTable, highNibbles);
    // A byte is in charSet iff some member matches both of its nibbles,
    // i.e. the two per-nibble sets share a bit.
    const __m128i intersection = _mm_and_si128(lowSets, highSets);
    const uint32_t notInCharSet = uint32_t(
        _mm_movemask_epi8(_mm_cmpeq_epi8(intersection, _mm_setzero_si128())));
    if (notInCharSet != 0) {
      // Lowest set bit == first non-whitespace byte within this chunk.
      return i + __builtin_ctz(notInCharSet);
    }
  }
  // Scalar tail for the final partial chunk.
  while (i < len && whitespace(buf[i])) {
    ++i;
  }
  return i;
}
namespace {
// Straightforward recursive descent that doesn't handle string escaping and
// treats numbers as [0-9.]+. May stack overflow on deeply nested json documents
struct Parser1 {
@@ -183,10 +241,9 @@ private:
// Helpers
// Advances buf past any leading whitespace and shrinks len by the number of
// bytes skipped. The count comes from leadingWhitespaceCount, so no separate
// byte-at-a-time loop is needed here.
void maybeSkipWs() {
  const int leadingWs = leadingWhitespaceCount(buf, len);
  buf += leadingWs;
  len -= leadingWs;
}
bool parseLiteral(const char *literal) {
const int litLen = strlen(literal);
@@ -427,10 +484,9 @@ struct Parser2 {
private:
// Helpers
// Advances buf past any leading whitespace and shrinks len by the number of
// bytes skipped. The count comes from leadingWhitespaceCount, so no separate
// byte-at-a-time loop is needed here.
void maybeSkipWs() {
  const int leadingWs = leadingWhitespaceCount(buf, len);
  buf += leadingWs;
  len -= leadingWs;
}
bool parseLiteral(const char *literal) {
const int litLen = strlen(literal);
@@ -666,6 +722,13 @@ Callbacks printCallbacks() {
} // namespace
// Checks leadingWhitespaceCount on inputs both shorter and longer than the
// 16-byte chunk size it processes at a time.
TEST_CASE("leadingWhitespaceCount") {
  // Shorter than one 16-byte chunk.
  {
    const char *s = " \r\t\n x ";
    CHECK(leadingWhitespaceCount(s, int(strlen(s))) == 5);
  }
  // Empty input.
  {
    CHECK(leadingWhitespaceCount("", 0) == 0);
  }
  // Exactly one full chunk of whitespace.
  {
    const std::string s(16, '\t');
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 16);
  }
  // Several chunks plus a partial one, all whitespace.
  {
    const std::string s(40, ' ');
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 40);
  }
  // First non-whitespace byte in the first, second, and partial chunk.
  {
    std::string s(40, ' ');
    s[37] = 'x';
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 37);
    s[21] = 'x';
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 21);
    s[3] = 'x';
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 3);
  }
  // ')' (0x29) shares its high nibble with ' ' and its low nibble with '\t'
  // but is not whitespace.
  {
    std::string s(32, ' ');
    s[5] = ')';
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 5);
  }
  // 0xA0 has the same low 7 bits as ' ' but is not whitespace.
  {
    const std::string s(32, char(0xA0));
    CHECK(leadingWhitespaceCount(s.data(), int(s.size())) == 0);
  }
}
TEST_CASE("parser1") {
Callbacks c = printCallbacks();
auto copy = json;