From 330101a93733a387c565c9a4a5847f84f9cb71e9 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Tue, 24 Jun 2025 10:43:40 -0400
Subject: [PATCH] Handle unescaping directly in n_string2 if space permits

---
 src/parser3.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/src/parser3.h b/src/parser3.h
index fc3ce9e..335fb09 100644
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -461,6 +461,14 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
   MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
 }
 
+// Combines the tables.hex entries for the four bytes at buf as hex nibbles,
+// most significant first. Callers treat a negative result as "not four hex
+// digits"; presumably tables.hex maps non-hex bytes to a negative entry.
+inline int32_t read4_hex(const char *buf) {
+  return tables.hex[uint8_t(buf[0])] << 12 | tables.hex[uint8_t(buf[1])] << 8 |
+         tables.hex[uint8_t(buf[2])] << 4 | tables.hex[uint8_t(buf[3])] << 0;
+}
+
 inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
                                                 char *bufEnd) {
   if (auto s = scan_string(self, buf, bufEnd)) {
@@ -474,10 +482,71 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
     MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
   case '\\':
     ++buf;
-    if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
-      return s;
+    if (bufEnd - buf < /*strlen("u0000\\u0000")*/ 11) {
+      // Not enough bytes left for the longest escape (a surrogate pair):
+      // push N_STRING_FOLLOWING_ESCAPE rather than decoding inline.
+      if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
+        return s;
+      }
+      MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
+    } else {
+      if (*buf == 'u') {
+        ++buf;
+        int32_t codepoint = read4_hex(buf);
+        if (codepoint < 0) [[unlikely]] {
+          return WeaselJson_REJECT;
+        }
+        buf += 4;
+        if (0xd800 <= codepoint && codepoint <= 0xdfff) {
+          // UTF-16 surrogate: only a high surrogate (D800-DBFF) followed
+          // directly by an escaped low surrogate (DC00-DFFF) is accepted.
+          int32_t codepoint2 = read4_hex(buf + 2);
+          if (!(codepoint <= 0xdbff && buf[0] == '\\' && buf[1] == 'u' &&
+                0xdc00 <= codepoint2 && codepoint2 <= 0xdfff)) [[unlikely]] {
+            return WeaselJson_REJECT;
+          }
+          codepoint =
+              0x10000 + (codepoint - 0xd800) * 0x400 + (codepoint2 - 0xdc00);
+          // The range checks above bound the result to [0x10000, 0x10FFFF].
+          assert(0x10000 <= codepoint && codepoint <= 0x10FFFF);
+          buf += 6;
+          // Emit the 4-byte UTF-8 encoding, last byte first.
+          self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000;
+          codepoint >>= 6;
+          self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000;
+          codepoint >>= 6;
+          self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
+          codepoint >>= 6;
+          self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000;
+          self->writeBuf += 4;
+        } else {
+          if (codepoint < 0x80) {
+            *self->writeBuf++ = codepoint;
+          } else if (codepoint < 0x800) {
+            self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
+            codepoint >>= 6;
+            self->writeBuf[0] = (0b00011111 & codepoint) | 0b11000000;
+            self->writeBuf += 2;
+          } else {
+            assert(codepoint < 0x10000);
+            self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000;
+            codepoint >>= 6;
+            self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000;
+            codepoint >>= 6;
+            self->writeBuf[0] = (0b00001111 & codepoint) | 0b11100000;
+            self->writeBuf += 3;
+          }
+        }
+      } else {
+        // Single-character escape; a zero table entry is rejected.
+        auto unescaped = tables.unescape[uint8_t(*buf++)];
+        if (unescaped == 0) [[unlikely]] {
+          return WeaselJson_REJECT;
+        }
+        *self->writeBuf++ = unescaped;
+      }
+      MUSTTAIL return n_string2(self, buf, bufEnd);
     }
-    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
   default:
     [[unlikely]] return WeaselJson_REJECT;
   }
@@ -760,7 +829,8 @@ constexpr inline struct ContinuationTable {
     symbolNames[T_EOF] = "t_eof";
     symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
 
-    // All others can assume that there's at least one byte when they're called
+    // All others can assume that there's at least one byte when they're
+    // called
     acceptsEmptyString[N_NUMBER] = true;
     acceptsEmptyString[N_WHITESPACE] = true;
     acceptsEmptyString[T_EOF] = true;