Switch to dfa for strings

This commit is contained in:
2025-06-21 17:12:48 -04:00
parent 6c48c40d67
commit 229a68bfdd
2 changed files with 18 additions and 167 deletions

View File

@@ -667,8 +667,6 @@ enum Symbol : uint8_t {
T_L, T_L,
T_S, T_S,
T_COLON, T_COLON,
T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE,
T_HEX, T_HEX,
T_HEX2, T_HEX2,
T_HEX3, T_HEX3,
@@ -748,6 +746,7 @@ struct Parser3 {
int const stackSize; int const stackSize;
bool complete; bool complete;
NumDfa numDfa; NumDfa numDfa;
Utf8Dfa strDfa;
}; };
inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf, inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf,
@@ -811,6 +810,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_value(Parser3 *self, char *buf,
++buf; ++buf;
self->dataBegin = self->writeBuf = buf; self->dataBegin = self->writeBuf = buf;
self->pop(); self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2})) { if (auto s = self->push({N_STRING2})) {
return s; return s;
} }
@@ -915,6 +915,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_object2(Parser3 *self, char *buf,
++buf; ++buf;
self->dataBegin = self->writeBuf = buf; self->dataBegin = self->writeBuf = buf;
self->pop(); self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) { if (auto s = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) {
return s; return s;
} }
@@ -1017,35 +1018,20 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
++buf; ++buf;
self->dataBegin = self->writeBuf = buf; self->dataBegin = self->writeBuf = buf;
self->pop(); self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2})) { if (auto s = self->push({N_STRING2})) {
return s; return s;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
} }
template <class V> inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf,
char *bufEnd) { char *bufEnd) {
const auto before = buf; const auto before = buf;
// Advance buf to the first "non-normal" character // Advance buf until double quote, backslash, invalid utf8, or codepoint <
for (;;) { // 0x20
if (bufEnd - buf < V::lanes) [[unlikely]] { buf = (char *)self->strDfa.scan(buf, bufEnd);
while (buf != bufEnd &&
tables.stringByteMeaning[uint8_t(*buf)] == Tables::NORMAL) {
++buf;
}
break;
}
auto v = V{(int8_t *)buf};
int normal =
(v != V::splat('"') & v != V::splat('\\') & v >= V::splat(0x20))
.count_leading_nonzero_lanes();
buf += normal;
if (normal < V::lanes) {
break;
}
}
int len = buf - before; int len = buf - before;
memmove(self->writeBuf, before, len); memmove(self->writeBuf, before, len);
@@ -1056,86 +1042,28 @@ PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf,
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
switch (tables.stringByteMeaning[uint8_t(*buf)]) { if (!self->strDfa.accept()) [[unlikely]] {
case Tables::NORMAL: return WeaselJson_REJECT;
__builtin_unreachable(); }
case Tables::DUBQUOTE:
switch (*buf) {
case '"':
self->flushString(true); self->flushString(true);
++buf; ++buf;
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::BACKSLASH: case '\\':
++buf; ++buf;
self->pop(); self->pop();
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) { if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
return s; return s;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::TWO_BYTE_UTF8:
// two byte utf-8 encoding
self->utf8Codepoint = *buf & 0b00011111;
self->minCodepoint = 0x80;
*self->writeBuf++ = *buf++;
self->pop();
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::THREE_BYTE_UTF8:
// three byte utf-8 encoding
self->utf8Codepoint = *buf & 0b00001111;
self->minCodepoint = 0x800;
*self->writeBuf++ = *buf++;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::FOUR_BYTE_UTF8:
// four byte utf-8 encoding
self->utf8Codepoint = *buf & 0b00000111;
self->minCodepoint = 0x10000;
*self->writeBuf++ = *buf++;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::CONTINUATION_BYTE:
case Tables::INVALID:
[[unlikely]] return WeaselJson_REJECT;
default: default:
__builtin_unreachable(); __builtin_unreachable();
} }
} }
#ifdef __x86_64__
template WeaselJsonStatus
n_string2_impl<simd<int8_t, 64, sse::Simd_x86_SSE>>(Parser3 *, char *, char *);
template __attribute__((target("avx2"))) WeaselJsonStatus
n_string2_impl<simd<int8_t, 64, sse::Simd_x86_AVX2>>(Parser3 *, char *, char *);
__attribute__((target("default"))) inline PRESERVE_NONE WeaselJsonStatus
n_string2(Parser3 *self, char *buf, char *bufEnd) {
MUSTTAIL return n_string2_impl<simd<int8_t, 64, sse::Simd_x86_SSE>>(self, buf,
bufEnd);
}
__attribute__((target("avx2"))) inline PRESERVE_NONE WeaselJsonStatus
n_string2(Parser3 *self, char *buf, char *bufEnd) {
MUSTTAIL return n_string2_impl<simd<int8_t, 64, sse::Simd_x86_AVX2>>(
self, buf, bufEnd);
}
#else
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
char *bufEnd) {
MUSTTAIL return n_string2_impl<simd<int8_t, 32>>(self, buf, bufEnd);
}
#endif
inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
char *buf, char *buf,
char *bufEnd) { char *bufEnd) {
@@ -1150,6 +1078,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
case 't': case 't':
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)]; *self->writeBuf++ = tables.unescape[uint8_t(*buf++)];
self->pop(); self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2})) { if (auto s = self->push({N_STRING2})) {
return s; return s;
} }
@@ -1158,6 +1087,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
++buf; ++buf;
self->utf8Codepoint = 0; self->utf8Codepoint = 0;
self->pop(); self->pop();
self->strDfa.reset();
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) { if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
return s; return s;
} }
@@ -1167,40 +1097,6 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
} }
} }
/// Continuation for a non-final continuation byte of a multi-byte UTF-8
/// sequence inside a string.
///
/// Rejects unless the byte at `buf` is classified as
/// `Tables::CONTINUATION_BYTE`. Otherwise its low six bits are appended to
/// `self->utf8Codepoint`, the raw byte is copied to `self->writeBuf`, the
/// current symbol is popped and parsing resumes via `Parser3::keepGoing`.
///
/// NOTE(review): `*buf` is read without comparing against `bufEnd` here;
/// presumably the caller guarantees at least one readable byte - confirm.
inline PRESERVE_NONE WeaselJsonStatus t_utf8_continuation_byte(Parser3 *self,
                                                               char *buf,
                                                               char *bufEnd) {
  const uint8_t byte = uint8_t(*buf);
  const bool isContinuation =
      tables.stringByteMeaning[byte] == Tables::CONTINUATION_BYTE;
  if (!isContinuation) [[unlikely]] {
    return WeaselJson_REJECT;
  }

  // Make room for, then merge in, the six payload bits of this byte.
  self->utf8Codepoint = (self->utf8Codepoint << 6) | (byte & 0b00111111);

  // The byte is passed through to the output unchanged.
  *self->writeBuf = *buf;
  ++self->writeBuf;
  ++buf;

  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
/// Continuation for the final continuation byte of a multi-byte UTF-8
/// sequence inside a string.
///
/// Rejects unless the byte at `buf` is classified as
/// `Tables::CONTINUATION_BYTE`. After merging its low six bits into
/// `self->utf8Codepoint`, the completed code point is rejected when it is
///   - below `self->minCodepoint` (an overlong encoding),
///   - above 0x10ffff, or
///   - within the surrogate range 0xd800..0xdfff.
/// On success the raw byte is copied to `self->writeBuf`, the current symbol
/// is popped and parsing resumes via `Parser3::keepGoing`.
///
/// NOTE(review): `*buf` is read without comparing against `bufEnd` here;
/// presumably the caller guarantees at least one readable byte - confirm.
inline PRESERVE_NONE WeaselJsonStatus
t_utf8_last_continuation_byte(Parser3 *self, char *buf, char *bufEnd) {
  const uint8_t byte = uint8_t(*buf);
  const bool isContinuation =
      tables.stringByteMeaning[byte] == Tables::CONTINUATION_BYTE;
  if (!isContinuation) [[unlikely]] {
    return WeaselJson_REJECT;
  }

  // Make room for, then merge in, the six payload bits of this byte.
  self->utf8Codepoint = (self->utf8Codepoint << 6) | (byte & 0b00111111);

  const bool overlong = self->utf8Codepoint < self->minCodepoint;
  const bool beyondUnicode = self->utf8Codepoint > 0x10ffff;
  const bool surrogate =
      0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff;
  if (overlong || beyondUnicode || surrogate) [[unlikely]] {
    return WeaselJson_REJECT;
  }

  // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
  // The byte is passed through to the output unchanged.
  *self->writeBuf = *buf;
  ++self->writeBuf;
  ++buf;

  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf, inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf,
char *bufEnd) { char *bufEnd) {
if ('0' <= *buf && *buf <= '9') { if ('0' <= *buf && *buf <= '9') {
@@ -1461,9 +1357,6 @@ constexpr inline struct ContinuationTable {
continuations[T_L] = singleChar<'l'>; continuations[T_L] = singleChar<'l'>;
continuations[T_S] = singleChar<'s'>; continuations[T_S] = singleChar<'s'>;
continuations[T_COLON] = singleChar<':', true>; continuations[T_COLON] = singleChar<':', true>;
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
t_utf8_last_continuation_byte;
continuations[T_HEX] = t_hex; continuations[T_HEX] = t_hex;
continuations[T_HEX2] = t_hex2; continuations[T_HEX2] = t_hex2;
continuations[T_HEX3] = t_hex3; continuations[T_HEX3] = t_hex3;
@@ -1490,7 +1383,6 @@ constexpr inline struct ContinuationTable {
symbolNames[T_L] = "singleChar<'l'>"; symbolNames[T_L] = "singleChar<'l'>";
symbolNames[T_S] = "singleChar<'s'>"; symbolNames[T_S] = "singleChar<'s'>";
symbolNames[T_COLON] = "singleChar<':'>"; symbolNames[T_COLON] = "singleChar<':'>";
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
symbolNames[T_HEX] = "t_hex"; symbolNames[T_HEX] = "t_hex";
symbolNames[T_HEX2] = "t_hex2"; symbolNames[T_HEX2] = "t_hex2";
symbolNames[T_HEX3] = "t_hex3"; symbolNames[T_HEX3] = "t_hex3";
@@ -1515,8 +1407,6 @@ inline PRESERVE_NONE WeaselJsonStatus Parser3::keepGoing(Parser3 *self,
switch (self->top()) { switch (self->top()) {
case N_STRING2: case N_STRING2:
case N_STRING_FOLLOWING_ESCAPE: case N_STRING_FOLLOWING_ESCAPE:
case T_UTF8_CONTINUATION_BYTE:
case T_UTF8_LAST_CONTINUATION_BYTE:
case T_HEX: case T_HEX:
case T_HEX2: case T_HEX2:
case T_HEX3: case T_HEX3:

View File

@@ -1,50 +1,12 @@
#pragma once #pragma once
constexpr inline struct Tables { constexpr inline struct Tables {
// Classification of a single byte encountered inside a JSON string, as
// assigned by the table-building loops in the Tables constructor.
enum StringByteMeaning {
  // Must stay the zero enumerator: the lookup table is value-initialized, so
  // every byte the constructor never assigns (0x00-0x1f, 0xf8-0xff) and the
  // bytes it explicitly resets (0xc0, 0xc1, 0xf5-0xf7) end up INVALID.
  INVALID = 0,
  // 0x20-0x7f, except '"' and '\\'.
  NORMAL = 1,
  // The double-quote character '"'.
  DUBQUOTE = 2,
  // The backslash character '\\'.
  BACKSLASH = 3,
  // Lead byte matching 110xxxxx, excluding 0xc0 and 0xc1.
  TWO_BYTE_UTF8 = 4,
  // Lead byte matching 1110xxxx.
  THREE_BYTE_UTF8 = 5,
  // Lead byte matching 11110xxx, excluding 0xf5 and above.
  FOUR_BYTE_UTF8 = 6,
  // Byte matching 10xxxxxx.
  CONTINUATION_BYTE = 7,
};
constexpr Tables() { constexpr Tables() {
whitespace[' '] = true; whitespace[' '] = true;
whitespace['\n'] = true; whitespace['\n'] = true;
whitespace['\r'] = true; whitespace['\r'] = true;
whitespace['\t'] = true; whitespace['\t'] = true;
for (int i = 0; i < 256; ++i) {
if ((i & 0b11000000) == 0b10000000) {
stringByteMeaning[i] = CONTINUATION_BYTE;
}
if ((i & 0b11100000) == 0b11000000) {
stringByteMeaning[i] = TWO_BYTE_UTF8;
}
if ((i & 0b11110000) == 0b11100000) {
stringByteMeaning[i] = THREE_BYTE_UTF8;
}
if ((i & 0b11111000) == 0b11110000) {
stringByteMeaning[i] = FOUR_BYTE_UTF8;
}
}
for (int i = 0x20; i < 128; ++i) {
stringByteMeaning[i] = NORMAL;
}
stringByteMeaning['"'] = DUBQUOTE;
stringByteMeaning['\\'] = BACKSLASH;
stringByteMeaning[0xc0] = INVALID;
stringByteMeaning[0xc1] = INVALID;
for (int i = 0xF5; i < 0x100; ++i) {
stringByteMeaning[i] = INVALID;
}
unescape['n'] = '\n'; unescape['n'] = '\n';
unescape['r'] = '\r'; unescape['r'] = '\r';
unescape['t'] = '\t'; unescape['t'] = '\t';
@@ -55,6 +17,5 @@ constexpr inline struct Tables {
unescape['/'] = '/'; unescape['/'] = '/';
} }
bool whitespace[256]{}; bool whitespace[256]{};
StringByteMeaning stringByteMeaning[256]{};
char unescape[256]{}; char unescape[256]{};
} tables; } tables;