diff --git a/src/parser3.h b/src/parser3.h
index 4a5f260..3475c4e 100644
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -58,6 +58,8 @@ enum Symbol : uint8_t {
   N_NULL,
   T_R,
   T_U,
+  // u inside of a string
+  T_U2,
   T_A,
   T_L,
   T_S,
@@ -66,10 +68,12 @@ enum Symbol : uint8_t {
   T_UTF8_LAST_CONTINUATION_BYTE,
   T_HEX,
   T_HEX2,
+  T_HEX3,
   T_DIGIT,
   T_ONENINE,
   T_EOF,
   T_END_NUMBER,
+  T_BACKSLASH,
   N_SYMBOL_COUNT, // Must be last
 };
 struct Parser3 {
@@ -143,6 +147,7 @@ struct Parser3 {
   Symbol *stackPtr = stack;
   bool complete = false;
   uint32_t utf8Codepoint;
+  uint32_t utf16Surrogate;
   uint32_t minCodepoint;
 };
 
@@ -576,11 +581,21 @@ inline Status t_hex2(Parser3 *self) {
     w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
     w += 2;
     if (useTmp) {
-      printf("%.*s", 2, tmp);
       self->callbacks->on_string_data(self->data, tmp, 2);
     }
   } else {
     assert(self->utf8Codepoint < 0x10000);
+    if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
+      // utf-16 surrogate
+      self->utf16Surrogate = self->utf8Codepoint;
+      self->utf8Codepoint = 0;
+      self->pop();
+      if (auto s =
+              self->push({T_BACKSLASH, T_U2, T_HEX, T_HEX, T_HEX, T_HEX3})) {
+        return s;
+      }
+      MUSTTAIL return Parser3::keepGoing(self);
+    }
     bool useTmp = self->bufEnd - self->writeBuf < 3;
     char *p = tmp;
     if (useTmp) {
@@ -602,6 +617,71 @@ inline Status t_hex2(Parser3 *self) {
   MUSTTAIL return Parser3::keepGoing(self);
 }
 
+// Consumes the final hex digit of the second \uXXXX escape of a utf-16
+// surrogate pair. The first escape was saved in self->utf16Surrogate by
+// t_hex2; the first three digits of the second escape are presumably
+// accumulated into self->utf8Codepoint by the preceding T_HEX symbols (t_hex
+// is not part of this change - confirm). Rejects a malformed pair, otherwise
+// writes the decoded codepoint as 4 bytes of utf-8.
+inline Status t_hex3(Parser3 *self) {
+  if (self->len() == 0) {
+    return S_REJECT;
+  }
+  self->utf8Codepoint <<= 4;
+  if (('0' <= *self->buf && *self->buf <= '9')) {
+    self->utf8Codepoint |= *self->buf - '0';
+  } else if ('a' <= *self->buf && *self->buf <= 'f') {
+    self->utf8Codepoint |= 10 + *self->buf - 'a';
+  } else if ('A' <= *self->buf && *self->buf <= 'F') {
+    self->utf8Codepoint |= 10 + *self->buf - 'A';
+  } else {
+    return S_REJECT;
+  }
+  ++self->buf;
+
+  // The second escape must be a low surrogate. Without this check the
+  // unsigned subtraction below wraps around and e.g. "\ud801\u0041" would be
+  // accepted and decoded to U+2841.
+  if (self->utf8Codepoint < 0xdc00 || self->utf8Codepoint > 0xdfff) {
+    return S_REJECT;
+  }
+
+  // Decode utf16 surrogate pair
+  self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
+                        (self->utf8Codepoint - 0xdc00);
+
+  // t_hex2 sends every value in [0xd800, 0xdfff] here, so the first escape
+  // may itself be a low surrogate. Those decode past the end of unicode.
+  if (self->utf8Codepoint > 0x10FFFF) {
+    return S_REJECT;
+  }
+  assert(self->utf8Codepoint >= 0x10000);
+
+  // Write codepoint in utf-8 if there's room in the user provided buffer. If
+  // there's not room, flush, write into a temp buffer, and flush again. A
+  // decoded surrogate pair always needs the 4 byte encoding.
+  char tmp[4];
+  bool useTmp = self->bufEnd - self->writeBuf < 4;
+  char *p = tmp;
+  if (useTmp) {
+    self->flushString();
+  }
+  auto &w = useTmp ? p : self->writeBuf;
+  w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
+  w += 4;
+  if (useTmp) {
+    self->callbacks->on_string_data(self->data, tmp, 4);
+  }
+
+  self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
+}
+
 inline Status n_number(Parser3 *self) {
   self->pop();
   if (auto s = self->push({N_INTEGER, N_FRACTION, N_EXPONENT, T_END_NUMBER})) {
@@ -896,6 +976,7 @@ constexpr inline struct ContinuationTable {
     continuations[N_NULL] = n_null;
     continuations[T_R] = singleChar<'r'>;
     continuations[T_U] = singleChar<'u'>;
+    continuations[T_U2] = singleChar<'u'>;
     continuations[T_A] = singleChar<'a'>;
     continuations[T_L] = singleChar<'l'>;
     continuations[T_S] = singleChar<'s'>;
@@ -905,10 +986,12 @@ constexpr inline struct ContinuationTable {
         t_utf8_last_continuation_byte;
     continuations[T_HEX] = t_hex;
     continuations[T_HEX2] = t_hex2;
+    continuations[T_HEX3] = t_hex3;
     continuations[T_DIGIT] = t_digit;
     continuations[T_ONENINE] = t_onenine;
     continuations[T_EOF] = t_eof;
     continuations[T_END_NUMBER] = t_end_number;
+    continuations[T_BACKSLASH] = singleChar<'\\'>;
     symbolNames[N_JSON] = "n_json";
     symbolNames[N_VALUE] = "n_value";
     symbolNames[N_OBJECT] = "n_object";
@@ -935,6 +1018,7 @@ constexpr inline struct ContinuationTable {
     symbolNames[N_NULL] = "n_null";
     symbolNames[T_R] = "singleChar<'r'>";
     symbolNames[T_U] = "singleChar<'u'>";
+    symbolNames[T_U2] = "singleChar<'u'> (in string)";
     symbolNames[T_A] = "singleChar<'a'>";
     symbolNames[T_L] = "singleChar<'l'>";
     symbolNames[T_S] = "singleChar<'s'>";
@@ -942,9 +1026,11 @@ constexpr inline struct ContinuationTable {
     symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
     symbolNames[T_HEX] = "t_hex";
     symbolNames[T_HEX2] = "t_hex2";
+    symbolNames[T_HEX3] = "t_hex3";
     symbolNames[T_DIGIT] = "t_digit";
     symbolNames[T_ONENINE] = "t_onenine";
     symbolNames[T_EOF] = "t_eof";
+    symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
     symbolNames[T_END_NUMBER] = "t_end_number";
   }
   Continuation continuations[N_SYMBOL_COUNT]{};
@@ -974,6 +1060,9 @@ inline Status Parser3::keepGoing(Parser3 *self) {
   case T_UTF8_LAST_CONTINUATION_BYTE:
   case T_HEX:
   case T_HEX2:
+  case T_HEX3:
+  case T_BACKSLASH:
+  case T_U2:
     self->flushString();
     break;
   case N_JSON:
diff --git a/src/test.cpp b/src/test.cpp
index 297a15f..eb3dae2 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -254,7 +254,8 @@ TEST_CASE("unescaping utf-8") {
   testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234");
   // 3 byte encoding
   testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678");
-  // TODO 4 byte encoding (utf-16 surrogate pairs)
+  // 4 byte encoding (utf-16 surrogate pair)
+  testUnescapingUtf8("\"\\ud801\\udc37\"", "𐐷");
 }
 
 TEST_CASE("bench3") {