From a4d7d1f91e55c72634907b4c21c3ae90bf56cdd1 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 19 May 2025 17:50:48 -0400 Subject: [PATCH] Loop in string2 in normal case --- src/parser3.h | 93 ++++++++++++++++++++++++--------------------------- src/tables.h | 43 ++++++++++++++++++++---- 2 files changed, 80 insertions(+), 56 deletions(-) diff --git a/src/parser3.h b/src/parser3.h index e7ddf11..916ce06 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -327,30 +327,28 @@ inline Status n_string(Parser3 *self) { } inline Status n_string2(Parser3 *self) { - if (tables.invalidStringByte[uint8_t(*self->buf)]) { - return S_REJECT; - } - if (int8_t(*self->buf) > 0) { - // one byte utf-8 encoding - switch (*self->buf) { - case '"': - self->flushString(); - self->callbacks->on_end_string(self->data); - ++self->buf; - self->pop(); - MUSTTAIL return Parser3::keepGoing(self); - case '\\': - ++self->buf; - self->pop(); - if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) { - return s; - } - MUSTTAIL return Parser3::keepGoing(self); - default: - *self->writeBuf++ = *self->buf++; +begin: + switch (tables.stringByteMeaning[uint8_t(*self->buf)]) { + case Tables::NORMAL: + *self->writeBuf++ = *self->buf++; + if (self->buf == self->bufEnd) { MUSTTAIL return Parser3::keepGoing(self); } - } else if ((*self->buf & 0b11100000) == 0b11000000) { + goto begin; + case Tables::DUBQUOTE: + self->flushString(); + self->callbacks->on_end_string(self->data); + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + case Tables::BACKSLASH: + ++self->buf; + self->pop(); + if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + case Tables::TWO_BYTE_UTF8: // two byte utf-8 encoding self->utf8Codepoint = *self->buf & 0b00011111; self->minCodepoint = 0x80; @@ -360,8 +358,7 @@ inline Status n_string2(Parser3 *self) { return s; } MUSTTAIL return Parser3::keepGoing(self); - } - if ((*self->buf & 0b11110000) == 0b11100000) { + case Tables::THREE_BYTE_UTF8: // three byte utf-8 encoding self->utf8Codepoint = *self->buf & 0b00001111; self->minCodepoint = 0x800; @@ -372,7 +369,7 @@ inline Status n_string2(Parser3 *self) { return s; } MUSTTAIL return Parser3::keepGoing(self); - } else if ((*self->buf & 0b11111000) == 0b11110000) { + case Tables::FOUR_BYTE_UTF8: // four byte utf-8 encoding self->utf8Codepoint = *self->buf & 0b00000111; self->minCodepoint = 0x10000; @@ -383,8 +380,10 @@ inline Status n_string2(Parser3 *self) { return s; } MUSTTAIL return Parser3::keepGoing(self); + case Tables::CONTINUATION_BYTE: + case Tables::INVALID: + return S_REJECT; } - return S_REJECT; } inline Status n_string_following_escape(Parser3 *self) { @@ -417,37 +416,33 @@ inline Status n_string_following_escape(Parser3 *self) { } inline Status t_utf8_continuation_byte(Parser3 *self) { - if (tables.invalidStringByte[uint8_t(*self->buf)]) { + if (tables.stringByteMeaning[uint8_t(*self->buf)] != + Tables::CONTINUATION_BYTE) { return S_REJECT; } - if ((*self->buf & 0b11000000) == 0b10000000) { - self->utf8Codepoint <<= 6; - self->utf8Codepoint |= *self->buf & 0b00111111; - *self->writeBuf++ = *self->buf++; - self->pop(); - MUSTTAIL return Parser3::keepGoing(self); - } - return S_REJECT; + self->utf8Codepoint <<= 6; + self->utf8Codepoint |= *self->buf & 0b00111111; + *self->writeBuf++ = *self->buf++; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); } inline Status t_utf8_last_continuation_byte(Parser3 *self) { - if (tables.invalidStringByte[uint8_t(*self->buf)]) { + if (tables.stringByteMeaning[uint8_t(*self->buf)] != + Tables::CONTINUATION_BYTE) { return S_REJECT; } - if ((*self->buf & 0b11000000) == 0b10000000) { - self->utf8Codepoint <<= 6; - self->utf8Codepoint |= *self->buf & 0b00111111; - if (self->utf8Codepoint < self->minCodepoint || - self->utf8Codepoint > 0x10ffff || - (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) { - return S_REJECT; - } - // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized - *self->writeBuf++ = *self->buf++; - self->pop(); - MUSTTAIL return Parser3::keepGoing(self); + self->utf8Codepoint <<= 6; + self->utf8Codepoint |= *self->buf & 0b00111111; + if (self->utf8Codepoint < self->minCodepoint || + self->utf8Codepoint > 0x10ffff || + (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) { + return S_REJECT; } - return S_REJECT; + // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized + *self->writeBuf++ = *self->buf++; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); } inline Status t_digit(Parser3 *self) { diff --git a/src/tables.h b/src/tables.h index b00d5f6..5a11225 100644 --- a/src/tables.h +++ b/src/tables.h @@ -1,19 +1,48 @@ #pragma once constexpr inline struct Tables { + enum StringByteMeaning { + INVALID, + NORMAL, + DUBQUOTE, + BACKSLASH, + TWO_BYTE_UTF8, + THREE_BYTE_UTF8, + FOUR_BYTE_UTF8, + CONTINUATION_BYTE, + }; + constexpr Tables() { whitespace[' '] = true; whitespace['\n'] = true; whitespace['\r'] = true; whitespace['\t'] = true; - for (int i = 0; i < 0x20; ++i) { - invalidStringByte[i] = true; + for (int i = 0; i < 256; ++i) { + if ((i & 0b11000000) == 0b10000000) { + stringByteMeaning[i] = CONTINUATION_BYTE; + } + if ((i & 0b11100000) == 0b11000000) { + stringByteMeaning[i] = TWO_BYTE_UTF8; + } + if ((i & 0b11110000) == 0b11100000) { + stringByteMeaning[i] = THREE_BYTE_UTF8; + } + if ((i & 0b11111000) == 0b11110000) { + stringByteMeaning[i] = FOUR_BYTE_UTF8; + } } - invalidStringByte[0xc0] = true; - invalidStringByte[0xc1] = true; - for (int i = 0xf5; i <= 0xff; ++i) { - invalidStringByte[i] = true; + + for (int i = 0x20; i < 128; ++i) { + stringByteMeaning[i] = NORMAL; + } + stringByteMeaning['"'] = DUBQUOTE; + stringByteMeaning['\\'] = BACKSLASH; + + stringByteMeaning[0xc0] = INVALID; + stringByteMeaning[0xc1] = INVALID; + for (int i = 0xF5; i < 0x100; ++i) { + stringByteMeaning[i] = INVALID; } unescape['n'] = '\n'; @@ -26,6 +55,6 @@ constexpr inline struct Tables { unescape['/'] = '/'; } alignas(16) bool whitespace[256]{}; - alignas(16) bool invalidStringByte[256]{}; + alignas(16) StringByteMeaning stringByteMeaning[256]{}; alignas(16) char unescape[256]{}; } tables;