Switch to dfa for strings
This commit is contained in:
144
src/parser3.h
144
src/parser3.h
@@ -667,8 +667,6 @@ enum Symbol : uint8_t {
|
|||||||
T_L,
|
T_L,
|
||||||
T_S,
|
T_S,
|
||||||
T_COLON,
|
T_COLON,
|
||||||
T_UTF8_CONTINUATION_BYTE,
|
|
||||||
T_UTF8_LAST_CONTINUATION_BYTE,
|
|
||||||
T_HEX,
|
T_HEX,
|
||||||
T_HEX2,
|
T_HEX2,
|
||||||
T_HEX3,
|
T_HEX3,
|
||||||
@@ -748,6 +746,7 @@ struct Parser3 {
|
|||||||
int const stackSize;
|
int const stackSize;
|
||||||
bool complete;
|
bool complete;
|
||||||
NumDfa numDfa;
|
NumDfa numDfa;
|
||||||
|
Utf8Dfa strDfa;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf,
|
inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf,
|
||||||
@@ -811,6 +810,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_value(Parser3 *self, char *buf,
|
|||||||
++buf;
|
++buf;
|
||||||
self->dataBegin = self->writeBuf = buf;
|
self->dataBegin = self->writeBuf = buf;
|
||||||
self->pop();
|
self->pop();
|
||||||
|
self->strDfa.reset();
|
||||||
if (auto s = self->push({N_STRING2})) {
|
if (auto s = self->push({N_STRING2})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@@ -915,6 +915,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_object2(Parser3 *self, char *buf,
|
|||||||
++buf;
|
++buf;
|
||||||
self->dataBegin = self->writeBuf = buf;
|
self->dataBegin = self->writeBuf = buf;
|
||||||
self->pop();
|
self->pop();
|
||||||
|
self->strDfa.reset();
|
||||||
if (auto s = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) {
|
if (auto s = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@@ -1017,35 +1018,20 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
|
|||||||
++buf;
|
++buf;
|
||||||
self->dataBegin = self->writeBuf = buf;
|
self->dataBegin = self->writeBuf = buf;
|
||||||
self->pop();
|
self->pop();
|
||||||
|
self->strDfa.reset();
|
||||||
if (auto s = self->push({N_STRING2})) {
|
if (auto s = self->push({N_STRING2})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class V>
|
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
|
||||||
PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf,
|
|
||||||
char *bufEnd) {
|
char *bufEnd) {
|
||||||
const auto before = buf;
|
const auto before = buf;
|
||||||
|
|
||||||
// Advance buf to the first "non-normal" character
|
// Advance buf until double quote, backslash, invalid utf8, or codepoint <
|
||||||
for (;;) {
|
// 0x20
|
||||||
if (bufEnd - buf < V::lanes) [[unlikely]] {
|
buf = (char *)self->strDfa.scan(buf, bufEnd);
|
||||||
while (buf != bufEnd &&
|
|
||||||
tables.stringByteMeaning[uint8_t(*buf)] == Tables::NORMAL) {
|
|
||||||
++buf;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
auto v = V{(int8_t *)buf};
|
|
||||||
int normal =
|
|
||||||
(v != V::splat('"') & v != V::splat('\\') & v >= V::splat(0x20))
|
|
||||||
.count_leading_nonzero_lanes();
|
|
||||||
buf += normal;
|
|
||||||
if (normal < V::lanes) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int len = buf - before;
|
int len = buf - before;
|
||||||
memmove(self->writeBuf, before, len);
|
memmove(self->writeBuf, before, len);
|
||||||
@@ -1056,86 +1042,28 @@ PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf,
|
|||||||
return WeaselJson_AGAIN;
|
return WeaselJson_AGAIN;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (tables.stringByteMeaning[uint8_t(*buf)]) {
|
if (!self->strDfa.accept()) [[unlikely]] {
|
||||||
case Tables::NORMAL:
|
return WeaselJson_REJECT;
|
||||||
__builtin_unreachable();
|
}
|
||||||
case Tables::DUBQUOTE:
|
|
||||||
|
switch (*buf) {
|
||||||
|
case '"':
|
||||||
self->flushString(true);
|
self->flushString(true);
|
||||||
++buf;
|
++buf;
|
||||||
self->pop();
|
self->pop();
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
||||||
case Tables::BACKSLASH:
|
case '\\':
|
||||||
++buf;
|
++buf;
|
||||||
self->pop();
|
self->pop();
|
||||||
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
|
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
||||||
case Tables::TWO_BYTE_UTF8:
|
|
||||||
// two byte utf-8 encoding
|
|
||||||
self->utf8Codepoint = *buf & 0b00011111;
|
|
||||||
self->minCodepoint = 0x80;
|
|
||||||
*self->writeBuf++ = *buf++;
|
|
||||||
self->pop();
|
|
||||||
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
|
||||||
case Tables::THREE_BYTE_UTF8:
|
|
||||||
// three byte utf-8 encoding
|
|
||||||
self->utf8Codepoint = *buf & 0b00001111;
|
|
||||||
self->minCodepoint = 0x800;
|
|
||||||
*self->writeBuf++ = *buf++;
|
|
||||||
self->pop();
|
|
||||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
|
|
||||||
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
|
||||||
case Tables::FOUR_BYTE_UTF8:
|
|
||||||
// four byte utf-8 encoding
|
|
||||||
self->utf8Codepoint = *buf & 0b00000111;
|
|
||||||
self->minCodepoint = 0x10000;
|
|
||||||
*self->writeBuf++ = *buf++;
|
|
||||||
self->pop();
|
|
||||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
|
|
||||||
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
|
||||||
case Tables::CONTINUATION_BYTE:
|
|
||||||
case Tables::INVALID:
|
|
||||||
[[unlikely]] return WeaselJson_REJECT;
|
|
||||||
default:
|
default:
|
||||||
__builtin_unreachable();
|
__builtin_unreachable();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __x86_64__
|
|
||||||
template WeaselJsonStatus
|
|
||||||
n_string2_impl<simd<int8_t, 64, sse::Simd_x86_SSE>>(Parser3 *, char *, char *);
|
|
||||||
|
|
||||||
template __attribute__((target("avx2"))) WeaselJsonStatus
|
|
||||||
n_string2_impl<simd<int8_t, 64, sse::Simd_x86_AVX2>>(Parser3 *, char *, char *);
|
|
||||||
|
|
||||||
__attribute__((target("default"))) inline PRESERVE_NONE WeaselJsonStatus
|
|
||||||
n_string2(Parser3 *self, char *buf, char *bufEnd) {
|
|
||||||
MUSTTAIL return n_string2_impl<simd<int8_t, 64, sse::Simd_x86_SSE>>(self, buf,
|
|
||||||
bufEnd);
|
|
||||||
}
|
|
||||||
|
|
||||||
__attribute__((target("avx2"))) inline PRESERVE_NONE WeaselJsonStatus
|
|
||||||
n_string2(Parser3 *self, char *buf, char *bufEnd) {
|
|
||||||
MUSTTAIL return n_string2_impl<simd<int8_t, 64, sse::Simd_x86_AVX2>>(
|
|
||||||
self, buf, bufEnd);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
|
|
||||||
char *bufEnd) {
|
|
||||||
MUSTTAIL return n_string2_impl<simd<int8_t, 32>>(self, buf, bufEnd);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
|
inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
|
||||||
char *buf,
|
char *buf,
|
||||||
char *bufEnd) {
|
char *bufEnd) {
|
||||||
@@ -1150,6 +1078,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
|
|||||||
case 't':
|
case 't':
|
||||||
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)];
|
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)];
|
||||||
self->pop();
|
self->pop();
|
||||||
|
self->strDfa.reset();
|
||||||
if (auto s = self->push({N_STRING2})) {
|
if (auto s = self->push({N_STRING2})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@@ -1158,6 +1087,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
|
|||||||
++buf;
|
++buf;
|
||||||
self->utf8Codepoint = 0;
|
self->utf8Codepoint = 0;
|
||||||
self->pop();
|
self->pop();
|
||||||
|
self->strDfa.reset();
|
||||||
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
|
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@@ -1167,40 +1097,6 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus t_utf8_continuation_byte(Parser3 *self,
|
|
||||||
char *buf,
|
|
||||||
char *bufEnd) {
|
|
||||||
if (tables.stringByteMeaning[uint8_t(*buf)] != Tables::CONTINUATION_BYTE)
|
|
||||||
[[unlikely]] {
|
|
||||||
return WeaselJson_REJECT;
|
|
||||||
}
|
|
||||||
self->utf8Codepoint <<= 6;
|
|
||||||
self->utf8Codepoint |= *buf & 0b00111111;
|
|
||||||
*self->writeBuf++ = *buf++;
|
|
||||||
self->pop();
|
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus
|
|
||||||
t_utf8_last_continuation_byte(Parser3 *self, char *buf, char *bufEnd) {
|
|
||||||
if (tables.stringByteMeaning[uint8_t(*buf)] != Tables::CONTINUATION_BYTE)
|
|
||||||
[[unlikely]] {
|
|
||||||
return WeaselJson_REJECT;
|
|
||||||
}
|
|
||||||
self->utf8Codepoint <<= 6;
|
|
||||||
self->utf8Codepoint |= *buf & 0b00111111;
|
|
||||||
if (self->utf8Codepoint < self->minCodepoint ||
|
|
||||||
self->utf8Codepoint > 0x10ffff ||
|
|
||||||
(0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff))
|
|
||||||
[[unlikely]] {
|
|
||||||
return WeaselJson_REJECT;
|
|
||||||
}
|
|
||||||
// TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
|
|
||||||
*self->writeBuf++ = *buf++;
|
|
||||||
self->pop();
|
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf,
|
inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf,
|
||||||
char *bufEnd) {
|
char *bufEnd) {
|
||||||
if ('0' <= *buf && *buf <= '9') {
|
if ('0' <= *buf && *buf <= '9') {
|
||||||
@@ -1461,9 +1357,6 @@ constexpr inline struct ContinuationTable {
|
|||||||
continuations[T_L] = singleChar<'l'>;
|
continuations[T_L] = singleChar<'l'>;
|
||||||
continuations[T_S] = singleChar<'s'>;
|
continuations[T_S] = singleChar<'s'>;
|
||||||
continuations[T_COLON] = singleChar<':', true>;
|
continuations[T_COLON] = singleChar<':', true>;
|
||||||
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
|
|
||||||
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
|
|
||||||
t_utf8_last_continuation_byte;
|
|
||||||
continuations[T_HEX] = t_hex;
|
continuations[T_HEX] = t_hex;
|
||||||
continuations[T_HEX2] = t_hex2;
|
continuations[T_HEX2] = t_hex2;
|
||||||
continuations[T_HEX3] = t_hex3;
|
continuations[T_HEX3] = t_hex3;
|
||||||
@@ -1490,7 +1383,6 @@ constexpr inline struct ContinuationTable {
|
|||||||
symbolNames[T_L] = "singleChar<'l'>";
|
symbolNames[T_L] = "singleChar<'l'>";
|
||||||
symbolNames[T_S] = "singleChar<'s'>";
|
symbolNames[T_S] = "singleChar<'s'>";
|
||||||
symbolNames[T_COLON] = "singleChar<':'>";
|
symbolNames[T_COLON] = "singleChar<':'>";
|
||||||
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
|
|
||||||
symbolNames[T_HEX] = "t_hex";
|
symbolNames[T_HEX] = "t_hex";
|
||||||
symbolNames[T_HEX2] = "t_hex2";
|
symbolNames[T_HEX2] = "t_hex2";
|
||||||
symbolNames[T_HEX3] = "t_hex3";
|
symbolNames[T_HEX3] = "t_hex3";
|
||||||
@@ -1515,8 +1407,6 @@ inline PRESERVE_NONE WeaselJsonStatus Parser3::keepGoing(Parser3 *self,
|
|||||||
switch (self->top()) {
|
switch (self->top()) {
|
||||||
case N_STRING2:
|
case N_STRING2:
|
||||||
case N_STRING_FOLLOWING_ESCAPE:
|
case N_STRING_FOLLOWING_ESCAPE:
|
||||||
case T_UTF8_CONTINUATION_BYTE:
|
|
||||||
case T_UTF8_LAST_CONTINUATION_BYTE:
|
|
||||||
case T_HEX:
|
case T_HEX:
|
||||||
case T_HEX2:
|
case T_HEX2:
|
||||||
case T_HEX3:
|
case T_HEX3:
|
||||||
|
|||||||
39
src/tables.h
39
src/tables.h
@@ -1,50 +1,12 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
constexpr inline struct Tables {
|
constexpr inline struct Tables {
|
||||||
enum StringByteMeaning {
|
|
||||||
INVALID,
|
|
||||||
NORMAL,
|
|
||||||
DUBQUOTE,
|
|
||||||
BACKSLASH,
|
|
||||||
TWO_BYTE_UTF8,
|
|
||||||
THREE_BYTE_UTF8,
|
|
||||||
FOUR_BYTE_UTF8,
|
|
||||||
CONTINUATION_BYTE,
|
|
||||||
};
|
|
||||||
|
|
||||||
constexpr Tables() {
|
constexpr Tables() {
|
||||||
whitespace[' '] = true;
|
whitespace[' '] = true;
|
||||||
whitespace['\n'] = true;
|
whitespace['\n'] = true;
|
||||||
whitespace['\r'] = true;
|
whitespace['\r'] = true;
|
||||||
whitespace['\t'] = true;
|
whitespace['\t'] = true;
|
||||||
|
|
||||||
for (int i = 0; i < 256; ++i) {
|
|
||||||
if ((i & 0b11000000) == 0b10000000) {
|
|
||||||
stringByteMeaning[i] = CONTINUATION_BYTE;
|
|
||||||
}
|
|
||||||
if ((i & 0b11100000) == 0b11000000) {
|
|
||||||
stringByteMeaning[i] = TWO_BYTE_UTF8;
|
|
||||||
}
|
|
||||||
if ((i & 0b11110000) == 0b11100000) {
|
|
||||||
stringByteMeaning[i] = THREE_BYTE_UTF8;
|
|
||||||
}
|
|
||||||
if ((i & 0b11111000) == 0b11110000) {
|
|
||||||
stringByteMeaning[i] = FOUR_BYTE_UTF8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0x20; i < 128; ++i) {
|
|
||||||
stringByteMeaning[i] = NORMAL;
|
|
||||||
}
|
|
||||||
stringByteMeaning['"'] = DUBQUOTE;
|
|
||||||
stringByteMeaning['\\'] = BACKSLASH;
|
|
||||||
|
|
||||||
stringByteMeaning[0xc0] = INVALID;
|
|
||||||
stringByteMeaning[0xc1] = INVALID;
|
|
||||||
for (int i = 0xF5; i < 0x100; ++i) {
|
|
||||||
stringByteMeaning[i] = INVALID;
|
|
||||||
}
|
|
||||||
|
|
||||||
unescape['n'] = '\n';
|
unescape['n'] = '\n';
|
||||||
unescape['r'] = '\r';
|
unescape['r'] = '\r';
|
||||||
unescape['t'] = '\t';
|
unescape['t'] = '\t';
|
||||||
@@ -55,6 +17,5 @@ constexpr inline struct Tables {
|
|||||||
unescape['/'] = '/';
|
unescape['/'] = '/';
|
||||||
}
|
}
|
||||||
bool whitespace[256]{};
|
bool whitespace[256]{};
|
||||||
StringByteMeaning stringByteMeaning[256]{};
|
|
||||||
char unescape[256]{};
|
char unescape[256]{};
|
||||||
} tables;
|
} tables;
|
||||||
|
|||||||
Reference in New Issue
Block a user