Loop inside n_string2 for the normal-byte case

This commit is contained in:
2025-05-19 17:50:48 -04:00
parent 918950d7f8
commit a4d7d1f91e
2 changed files with 80 additions and 56 deletions

View File

@@ -327,30 +327,28 @@ inline Status n_string(Parser3 *self) {
} }
inline Status n_string2(Parser3 *self) { inline Status n_string2(Parser3 *self) {
if (tables.invalidStringByte[uint8_t(*self->buf)]) { begin:
return S_REJECT; switch (tables.stringByteMeaning[uint8_t(*self->buf)]) {
case Tables::NORMAL:
*self->writeBuf++ = *self->buf++;
if (self->buf == self->bufEnd) {
MUSTTAIL return Parser3::keepGoing(self);
} }
if (int8_t(*self->buf) > 0) { goto begin;
// one byte utf-8 encoding case Tables::DUBQUOTE:
switch (*self->buf) {
case '"':
self->flushString(); self->flushString();
self->callbacks->on_end_string(self->data); self->callbacks->on_end_string(self->data);
++self->buf; ++self->buf;
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
case '\\': case Tables::BACKSLASH:
++self->buf; ++self->buf;
self->pop(); self->pop();
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) { if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
return s; return s;
} }
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
default: case Tables::TWO_BYTE_UTF8:
*self->writeBuf++ = *self->buf++;
MUSTTAIL return Parser3::keepGoing(self);
}
} else if ((*self->buf & 0b11100000) == 0b11000000) {
// two byte utf-8 encoding // two byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00011111; self->utf8Codepoint = *self->buf & 0b00011111;
self->minCodepoint = 0x80; self->minCodepoint = 0x80;
@@ -360,8 +358,7 @@ inline Status n_string2(Parser3 *self) {
return s; return s;
} }
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} case Tables::THREE_BYTE_UTF8:
if ((*self->buf & 0b11110000) == 0b11100000) {
// three byte utf-8 encoding // three byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00001111; self->utf8Codepoint = *self->buf & 0b00001111;
self->minCodepoint = 0x800; self->minCodepoint = 0x800;
@@ -372,7 +369,7 @@ inline Status n_string2(Parser3 *self) {
return s; return s;
} }
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} else if ((*self->buf & 0b11111000) == 0b11110000) { case Tables::FOUR_BYTE_UTF8:
// four byte utf-8 encoding // four byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00000111; self->utf8Codepoint = *self->buf & 0b00000111;
self->minCodepoint = 0x10000; self->minCodepoint = 0x10000;
@@ -383,9 +380,11 @@ inline Status n_string2(Parser3 *self) {
return s; return s;
} }
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} case Tables::CONTINUATION_BYTE:
case Tables::INVALID:
return S_REJECT; return S_REJECT;
} }
}
inline Status n_string_following_escape(Parser3 *self) { inline Status n_string_following_escape(Parser3 *self) {
switch (*self->buf) { switch (*self->buf) {
@@ -417,24 +416,22 @@ inline Status n_string_following_escape(Parser3 *self) {
} }
inline Status t_utf8_continuation_byte(Parser3 *self) { inline Status t_utf8_continuation_byte(Parser3 *self) {
if (tables.invalidStringByte[uint8_t(*self->buf)]) { if (tables.stringByteMeaning[uint8_t(*self->buf)] !=
Tables::CONTINUATION_BYTE) {
return S_REJECT; return S_REJECT;
} }
if ((*self->buf & 0b11000000) == 0b10000000) {
self->utf8Codepoint <<= 6; self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111; self->utf8Codepoint |= *self->buf & 0b00111111;
*self->writeBuf++ = *self->buf++; *self->writeBuf++ = *self->buf++;
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} }
return S_REJECT;
}
inline Status t_utf8_last_continuation_byte(Parser3 *self) { inline Status t_utf8_last_continuation_byte(Parser3 *self) {
if (tables.invalidStringByte[uint8_t(*self->buf)]) { if (tables.stringByteMeaning[uint8_t(*self->buf)] !=
Tables::CONTINUATION_BYTE) {
return S_REJECT; return S_REJECT;
} }
if ((*self->buf & 0b11000000) == 0b10000000) {
self->utf8Codepoint <<= 6; self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111; self->utf8Codepoint |= *self->buf & 0b00111111;
if (self->utf8Codepoint < self->minCodepoint || if (self->utf8Codepoint < self->minCodepoint ||
@@ -447,8 +444,6 @@ inline Status t_utf8_last_continuation_byte(Parser3 *self) {
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} }
return S_REJECT;
}
inline Status t_digit(Parser3 *self) { inline Status t_digit(Parser3 *self) {
if ('0' <= *self->buf && *self->buf <= '9') { if ('0' <= *self->buf && *self->buf <= '9') {

View File

@@ -1,19 +1,48 @@
#pragma once #pragma once
constexpr inline struct Tables { constexpr inline struct Tables {
enum StringByteMeaning {
INVALID,
NORMAL,
DUBQUOTE,
BACKSLASH,
TWO_BYTE_UTF8,
THREE_BYTE_UTF8,
FOUR_BYTE_UTF8,
CONTINUATION_BYTE,
};
constexpr Tables() { constexpr Tables() {
whitespace[' '] = true; whitespace[' '] = true;
whitespace['\n'] = true; whitespace['\n'] = true;
whitespace['\r'] = true; whitespace['\r'] = true;
whitespace['\t'] = true; whitespace['\t'] = true;
for (int i = 0; i < 0x20; ++i) { for (int i = 0; i < 256; ++i) {
invalidStringByte[i] = true; if ((i & 0b11000000) == 0b10000000) {
stringByteMeaning[i] = CONTINUATION_BYTE;
} }
invalidStringByte[0xc0] = true; if ((i & 0b11100000) == 0b11000000) {
invalidStringByte[0xc1] = true; stringByteMeaning[i] = TWO_BYTE_UTF8;
for (int i = 0xf5; i <= 0xff; ++i) { }
invalidStringByte[i] = true; if ((i & 0b11110000) == 0b11100000) {
stringByteMeaning[i] = THREE_BYTE_UTF8;
}
if ((i & 0b11111000) == 0b11110000) {
stringByteMeaning[i] = FOUR_BYTE_UTF8;
}
}
for (int i = 0x20; i < 128; ++i) {
stringByteMeaning[i] = NORMAL;
}
stringByteMeaning['"'] = DUBQUOTE;
stringByteMeaning['\\'] = BACKSLASH;
stringByteMeaning[0xc0] = INVALID;
stringByteMeaning[0xc1] = INVALID;
for (int i = 0xF5; i < 0x100; ++i) {
stringByteMeaning[i] = INVALID;
} }
unescape['n'] = '\n'; unescape['n'] = '\n';
@@ -26,6 +55,6 @@ constexpr inline struct Tables {
unescape['/'] = '/'; unescape['/'] = '/';
} }
alignas(16) bool whitespace[256]{}; alignas(16) bool whitespace[256]{};
alignas(16) bool invalidStringByte[256]{}; alignas(16) StringByteMeaning stringByteMeaning[256]{};
alignas(16) char unescape[256]{}; alignas(16) char unescape[256]{};
} tables; } tables;