Handle unescaping directly in n_string2 if space permits
This commit is contained in:
@@ -461,6 +461,11 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
|
|||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline int32_t read4_hex(const char *buf) {
|
||||||
|
return tables.hex[uint8_t(buf[0])] << 12 | tables.hex[uint8_t(buf[1])] << 8 |
|
||||||
|
tables.hex[uint8_t(buf[2])] << 4 | tables.hex[uint8_t(buf[3])] << 0;
|
||||||
|
}
|
||||||
|
|
||||||
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
|
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
|
||||||
char *bufEnd) {
|
char *bufEnd) {
|
||||||
if (auto s = scan_string(self, buf, bufEnd)) {
|
if (auto s = scan_string(self, buf, bufEnd)) {
|
||||||
@@ -474,10 +479,69 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
|
|||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
||||||
case '\\':
|
case '\\':
|
||||||
++buf;
|
++buf;
|
||||||
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
|
if (bufEnd - buf < /*strlen("u0000\\u0000")*/ 11) {
|
||||||
return s;
|
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
||||||
|
} else {
|
||||||
|
if (*buf == 'u') {
|
||||||
|
++buf;
|
||||||
|
int32_t codepoint = read4_hex(buf);
|
||||||
|
if (codepoint < 0) [[unlikely]] {
|
||||||
|
return WeaselJson_REJECT;
|
||||||
|
}
|
||||||
|
buf += 4;
|
||||||
|
if (0xd800 <= codepoint && codepoint <= 0xdfff) {
|
||||||
|
// utf-16 surrogate
|
||||||
|
int32_t codepoint2 = read4_hex(buf + 2);
|
||||||
|
if (!(buf[0] == '\\' && buf[1] == 'u' && codepoint2 >= 0))
|
||||||
|
[[unlikely]] {
|
||||||
|
return WeaselJson_REJECT;
|
||||||
|
}
|
||||||
|
codepoint =
|
||||||
|
0x10000 + (codepoint - 0xd800) * 0x400 + (codepoint2 - 0xdc00);
|
||||||
|
assert(codepoint >= 0x10000);
|
||||||
|
if (codepoint > 0x10FFFF) [[unlikely]] {
|
||||||
|
return WeaselJson_REJECT;
|
||||||
|
}
|
||||||
|
buf += 6;
|
||||||
|
assert(codepoint < 0x10ffff);
|
||||||
|
self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000;
|
||||||
|
codepoint >>= 6;
|
||||||
|
self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000;
|
||||||
|
codepoint >>= 6;
|
||||||
|
self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
|
||||||
|
codepoint >>= 6;
|
||||||
|
self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000;
|
||||||
|
self->writeBuf += 4;
|
||||||
|
} else {
|
||||||
|
if (codepoint < 0x80) {
|
||||||
|
*self->writeBuf++ = codepoint;
|
||||||
|
} else if (codepoint < 0x800) {
|
||||||
|
self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
|
||||||
|
codepoint >>= 6;
|
||||||
|
self->writeBuf[0] = (0b00011111 & codepoint) | 0b11000000;
|
||||||
|
self->writeBuf += 2;
|
||||||
|
} else {
|
||||||
|
assert(codepoint < 0x10000);
|
||||||
|
self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000;
|
||||||
|
codepoint >>= 6;
|
||||||
|
self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
|
||||||
|
codepoint >>= 6;
|
||||||
|
self->writeBuf[0] = (0b00001111 & codepoint) | 0b11100000;
|
||||||
|
self->writeBuf += 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto unescaped = tables.unescape[uint8_t(*buf++)];
|
||||||
|
if (unescaped == 0) [[unlikely]] {
|
||||||
|
return WeaselJson_REJECT;
|
||||||
|
}
|
||||||
|
*self->writeBuf++ = unescaped;
|
||||||
|
}
|
||||||
|
MUSTTAIL return n_string2(self, buf, bufEnd);
|
||||||
}
|
}
|
||||||
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
|
|
||||||
default:
|
default:
|
||||||
[[unlikely]] return WeaselJson_REJECT;
|
[[unlikely]] return WeaselJson_REJECT;
|
||||||
}
|
}
|
||||||
@@ -760,7 +824,8 @@ constexpr inline struct ContinuationTable {
|
|||||||
symbolNames[T_EOF] = "t_eof";
|
symbolNames[T_EOF] = "t_eof";
|
||||||
symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
|
symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
|
||||||
|
|
||||||
// All others can assume that there's at least one byte when they're called
|
// All others can assume that there's at least one byte when they're
|
||||||
|
// called
|
||||||
acceptsEmptyString[N_NUMBER] = true;
|
acceptsEmptyString[N_NUMBER] = true;
|
||||||
acceptsEmptyString[N_WHITESPACE] = true;
|
acceptsEmptyString[N_WHITESPACE] = true;
|
||||||
acceptsEmptyString[T_EOF] = true;
|
acceptsEmptyString[T_EOF] = true;
|
||||||
|
|||||||
Reference in New Issue
Block a user