Handle unescaping directly in n_string2 if space permits

This commit is contained in:
2025-06-24 10:43:40 -04:00
parent 9803364adb
commit 330101a937

View File

@@ -461,6 +461,11 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Packs the four bytes at buf[0..3] into one value: each byte is used as an
// index into tables.hex, and the looked-up entries are combined most
// significant first (buf[0] ends up in bits 12-15, buf[3] in bits 0-3).
//
// The caller must guarantee that four bytes are readable at buf; no bounds
// check is performed here.
//
// NOTE(review): the call sites visible in this file treat a negative result
// as "not four hex digits", so tables.hex presumably maps non-hex bytes to a
// negative value - confirm against the table definition.
inline int32_t read4_hex(const char *buf) {
  const auto nibble3 = tables.hex[uint8_t(buf[0])];
  const auto nibble2 = tables.hex[uint8_t(buf[1])];
  const auto nibble1 = tables.hex[uint8_t(buf[2])];
  const auto nibble0 = tables.hex[uint8_t(buf[3])];
  return (nibble3 << 12) | (nibble2 << 8) | (nibble1 << 4) | (nibble0 << 0);
}
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
char *bufEnd) {
if (auto s = scan_string(self, buf, bufEnd)) {
@@ -474,10 +479,69 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case '\\':
++buf;
if (bufEnd - buf < /*strlen("u0000\\u0000")*/ 11) {
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
} else {
if (*buf == 'u') {
++buf;
int32_t codepoint = read4_hex(buf);
if (codepoint < 0) [[unlikely]] {
return WeaselJson_REJECT;
}
buf += 4;
if (0xd800 <= codepoint && codepoint <= 0xdfff) {
// utf-16 surrogate
int32_t codepoint2 = read4_hex(buf + 2);
if (!(buf[0] == '\\' && buf[1] == 'u' && codepoint2 >= 0))
[[unlikely]] {
return WeaselJson_REJECT;
}
codepoint =
0x10000 + (codepoint - 0xd800) * 0x400 + (codepoint2 - 0xdc00);
assert(codepoint >= 0x10000);
if (codepoint > 0x10FFFF) [[unlikely]] {
return WeaselJson_REJECT;
}
buf += 6;
assert(codepoint < 0x10ffff);
self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000;
self->writeBuf += 4;
} else {
if (codepoint < 0x80) {
*self->writeBuf++ = codepoint;
} else if (codepoint < 0x800) {
self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
self->writeBuf[0] = (0b00011111 & codepoint) | 0b11000000;
self->writeBuf += 2;
} else {
assert(codepoint < 0x10000);
self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
self->writeBuf[0] = (0b00001111 & codepoint) | 0b11100000;
self->writeBuf += 3;
}
}
} else {
auto unescaped = tables.unescape[uint8_t(*buf++)];
if (unescaped == 0) [[unlikely]] {
return WeaselJson_REJECT;
}
*self->writeBuf++ = unescaped;
}
MUSTTAIL return n_string2(self, buf, bufEnd);
}
default:
[[unlikely]] return WeaselJson_REJECT;
}
@@ -760,7 +824,8 @@ constexpr inline struct ContinuationTable {
symbolNames[T_EOF] = "t_eof";
symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
// All others can assume that there's at least one byte when they're called
// All others can assume that there's at least one byte when they're
// called
acceptsEmptyString[N_NUMBER] = true;
acceptsEmptyString[N_WHITESPACE] = true;
acceptsEmptyString[T_EOF] = true;