diff --git a/src/parser3.h b/src/parser3.h index 53e3e76..bc0482f 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -667,8 +667,6 @@ enum Symbol : uint8_t { T_L, T_S, T_COLON, - T_UTF8_CONTINUATION_BYTE, - T_UTF8_LAST_CONTINUATION_BYTE, T_HEX, T_HEX2, T_HEX3, @@ -748,6 +746,7 @@ struct Parser3 { int const stackSize; bool complete; NumDfa numDfa; + Utf8Dfa strDfa; }; inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf, @@ -811,6 +810,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_value(Parser3 *self, char *buf, ++buf; self->dataBegin = self->writeBuf = buf; self->pop(); + self->strDfa.reset(); if (auto s = self->push({N_STRING2})) { return s; } @@ -915,6 +915,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_object2(Parser3 *self, char *buf, ++buf; self->dataBegin = self->writeBuf = buf; self->pop(); + self->strDfa.reset(); if (auto s = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) { return s; } @@ -1017,35 +1018,20 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf, ++buf; self->dataBegin = self->writeBuf = buf; self->pop(); + self->strDfa.reset(); if (auto s = self->push({N_STRING2})) { return s; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); } -template -PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf, - char *bufEnd) { +inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf, + char *bufEnd) { const auto before = buf; - // Advance buf to the first "non-normal" character - for (;;) { - if (bufEnd - buf < V::lanes) [[unlikely]] { - while (buf != bufEnd && - tables.stringByteMeaning[uint8_t(*buf)] == Tables::NORMAL) { - ++buf; - } - break; - } - auto v = V{(int8_t *)buf}; - int normal = - (v != V::splat('"') & v != V::splat('\\') & v >= V::splat(0x20)) - .count_leading_nonzero_lanes(); - buf += normal; - if (normal < V::lanes) { - break; - } - } + // Advance buf until double quote, backslash, invalid utf8, or codepoint < + // 0x20 + buf = (char *)self->strDfa.scan(buf, bufEnd); int len = buf - before; memmove(self->writeBuf, before, len); @@ -1056,86 +1042,28 @@ PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf, return WeaselJson_AGAIN; } - switch (tables.stringByteMeaning[uint8_t(*buf)]) { - case Tables::NORMAL: - __builtin_unreachable(); - case Tables::DUBQUOTE: + if (!self->strDfa.accept()) [[unlikely]] { + return WeaselJson_REJECT; + } + + switch (*buf) { + case '"': self->flushString(true); ++buf; self->pop(); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); - case Tables::BACKSLASH: + case '\\': ++buf; self->pop(); if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) { return s; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); - case Tables::TWO_BYTE_UTF8: - // two byte utf-8 encoding - self->utf8Codepoint = *buf & 0b00011111; - self->minCodepoint = 0x80; - *self->writeBuf++ = *buf++; - self->pop(); - if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { - return s; - } - MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); - case Tables::THREE_BYTE_UTF8: - // three byte utf-8 encoding - self->utf8Codepoint = *buf & 0b00001111; - self->minCodepoint = 0x800; - *self->writeBuf++ = *buf++; - self->pop(); - if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, - T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { - return s; - } - MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); - case Tables::FOUR_BYTE_UTF8: - // four byte utf-8 encoding - self->utf8Codepoint = *buf & 0b00000111; - self->minCodepoint = 0x10000; - *self->writeBuf++ = *buf++; - self->pop(); - if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, - T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { - return s; - } - MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); - case Tables::CONTINUATION_BYTE: - case Tables::INVALID: - [[unlikely]] return WeaselJson_REJECT; default: __builtin_unreachable(); } } -#ifdef __x86_64__ -template WeaselJsonStatus -n_string2_impl>(Parser3 *, char *, char *); - -template __attribute__((target("avx2"))) WeaselJsonStatus -n_string2_impl>(Parser3 *, char *, char *); - -__attribute__((target("default"))) inline PRESERVE_NONE WeaselJsonStatus -n_string2(Parser3 *self, char *buf, char *bufEnd) { - MUSTTAIL return n_string2_impl>(self, buf, - bufEnd); -} - -__attribute__((target("avx2"))) inline PRESERVE_NONE WeaselJsonStatus -n_string2(Parser3 *self, char *buf, char *bufEnd) { - MUSTTAIL return n_string2_impl>( - self, buf, bufEnd); -} -#else -inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf, - char *bufEnd) { - MUSTTAIL return n_string2_impl>(self, buf, bufEnd); -} -#endif - inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, char *buf, char *bufEnd) { @@ -1150,6 +1078,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, case 't': *self->writeBuf++ = tables.unescape[uint8_t(*buf++)]; self->pop(); + self->strDfa.reset(); if (auto s = self->push({N_STRING2})) { return s; } @@ -1158,6 +1087,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, ++buf; self->utf8Codepoint = 0; self->pop(); + self->strDfa.reset(); if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) { return s; } @@ -1167,40 +1097,6 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, } } -inline PRESERVE_NONE WeaselJsonStatus t_utf8_continuation_byte(Parser3 *self, - char *buf, - char *bufEnd) { - if (tables.stringByteMeaning[uint8_t(*buf)] != Tables::CONTINUATION_BYTE) - [[unlikely]] { - return WeaselJson_REJECT; - } - self->utf8Codepoint <<= 6; - self->utf8Codepoint |= *buf & 0b00111111; - *self->writeBuf++ = *buf++; - self->pop(); - MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); -} - -inline PRESERVE_NONE WeaselJsonStatus -t_utf8_last_continuation_byte(Parser3 *self, char *buf, char *bufEnd) { - if (tables.stringByteMeaning[uint8_t(*buf)] != Tables::CONTINUATION_BYTE) - [[unlikely]] { - return WeaselJson_REJECT; - } - self->utf8Codepoint <<= 6; - self->utf8Codepoint |= *buf & 0b00111111; - if (self->utf8Codepoint < self->minCodepoint || - self->utf8Codepoint > 0x10ffff || - (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) - [[unlikely]] { - return WeaselJson_REJECT; - } - // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized - *self->writeBuf++ = *buf++; - self->pop(); - MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); -} - inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf, char *bufEnd) { if ('0' <= *buf && *buf <= '9') { @@ -1461,9 +1357,6 @@ constexpr inline struct ContinuationTable { continuations[T_L] = singleChar<'l'>; continuations[T_S] = singleChar<'s'>; continuations[T_COLON] = singleChar<':', true>; - continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte; - continuations[T_UTF8_LAST_CONTINUATION_BYTE] = - t_utf8_last_continuation_byte; continuations[T_HEX] = t_hex; continuations[T_HEX2] = t_hex2; continuations[T_HEX3] = t_hex3; @@ -1490,7 +1383,6 @@ constexpr inline struct ContinuationTable { symbolNames[T_L] = "singleChar<'l'>"; symbolNames[T_S] = "singleChar<'s'>"; symbolNames[T_COLON] = "singleChar<':'>"; - symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte"; symbolNames[T_HEX] = "t_hex"; symbolNames[T_HEX2] = "t_hex2"; symbolNames[T_HEX3] = "t_hex3"; @@ -1515,8 +1407,6 @@ inline PRESERVE_NONE WeaselJsonStatus Parser3::keepGoing(Parser3 *self, switch (self->top()) { case N_STRING2: case N_STRING_FOLLOWING_ESCAPE: - case T_UTF8_CONTINUATION_BYTE: - case T_UTF8_LAST_CONTINUATION_BYTE: case T_HEX: case T_HEX2: case T_HEX3: diff --git a/src/tables.h b/src/tables.h index 632ad52..a5a996a 100644 --- a/src/tables.h +++ b/src/tables.h @@ -1,50 +1,12 @@ #pragma once constexpr inline struct Tables { - enum StringByteMeaning { - INVALID, - NORMAL, - DUBQUOTE, - BACKSLASH, - TWO_BYTE_UTF8, - THREE_BYTE_UTF8, - FOUR_BYTE_UTF8, - CONTINUATION_BYTE, - }; constexpr Tables() { whitespace[' '] = true; whitespace['\n'] = true; whitespace['\r'] = true; whitespace['\t'] = true; - - for (int i = 0; i < 256; ++i) { - if ((i & 0b11000000) == 0b10000000) { - stringByteMeaning[i] = CONTINUATION_BYTE; - } - if ((i & 0b11100000) == 0b11000000) { - stringByteMeaning[i] = TWO_BYTE_UTF8; - } - if ((i & 0b11110000) == 0b11100000) { - stringByteMeaning[i] = THREE_BYTE_UTF8; - } - if ((i & 0b11111000) == 0b11110000) { - stringByteMeaning[i] = FOUR_BYTE_UTF8; - } - } - - for (int i = 0x20; i < 128; ++i) { - stringByteMeaning[i] = NORMAL; - } - stringByteMeaning['"'] = DUBQUOTE; - stringByteMeaning['\\'] = BACKSLASH; - - stringByteMeaning[0xc0] = INVALID; - stringByteMeaning[0xc1] = INVALID; - for (int i = 0xF5; i < 0x100; ++i) { - stringByteMeaning[i] = INVALID; - } - unescape['n'] = '\n'; unescape['r'] = '\r'; unescape['t'] = '\t'; @@ -55,6 +17,5 @@ constexpr inline struct Tables { unescape['/'] = '/'; } bool whitespace[256]{}; - StringByteMeaning stringByteMeaning[256]{}; char unescape[256]{}; } tables;