From 34ad19c22f8951abbc3aa6849d6e86f05d820f03 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Mon, 19 May 2025 14:26:12 -0400
Subject: [PATCH] Unescape basic multi-lingual plane utf8

---
 src/parser3.h | 88 +++++++++++++++++++++++++++++++++++++++++++++------
 src/test.cpp  | 55 ++++++++++++++++++++++++--------
 2 files changed, 120 insertions(+), 23 deletions(-)

diff --git a/src/parser3.h b/src/parser3.h
index 7e5d3c3..4a5f260 100644
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -65,6 +65,7 @@ enum Symbol : uint8_t {
   T_UTF8_CONTINUATION_BYTE,
   T_UTF8_LAST_CONTINUATION_BYTE,
   T_HEX,
+  T_HEX2,
   T_DIGIT,
   T_ONENINE,
   T_EOF,
@@ -96,6 +97,7 @@ struct Parser3 {
     if (len > 0) {
       callbacks->on_string_data(data, dataBegin, len);
     }
+    dataBegin = writeBuf;
   }
 
   [[nodiscard]] bool empty() const { return stackPtr == stack; }
@@ -446,9 +448,9 @@ inline Status n_string_following_escape(Parser3 *self) {
     MUSTTAIL return Parser3::keepGoing(self);
   case 'u':
     ++self->buf;
-    // TODO unescape
+    self->utf8Codepoint = 0;
     self->pop();
-    if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
+    if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
       return s;
     }
     MUSTTAIL return Parser3::keepGoing(self);
@@ -525,14 +527,79 @@ inline Status t_hex(Parser3 *self) {
   if (self->len() == 0) {
     return S_REJECT;
   }
-  if (('0' <= *self->buf && *self->buf <= '9') ||
-      ('a' <= *self->buf && *self->buf <= 'f') ||
-      ('A' <= *self->buf && *self->buf <= 'F')) {
-    *self->writeBuf++ = *self->buf++;
-    self->pop();
-    MUSTTAIL return Parser3::keepGoing(self);
+  self->utf8Codepoint <<= 4;
+  if (('0' <= *self->buf && *self->buf <= '9')) {
+    self->utf8Codepoint |= *self->buf - '0';
+  } else if ('a' <= *self->buf && *self->buf <= 'f') {
+    self->utf8Codepoint |= 10 + *self->buf - 'a';
+  } else if ('A' <= *self->buf && *self->buf <= 'F') {
+    self->utf8Codepoint |= 10 + *self->buf - 'A';
+  } else {
+    return S_REJECT;
   }
-  return S_REJECT;
+  ++self->buf;
+  self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
+}
+
+inline Status t_hex2(Parser3 *self) {
+  if (self->len() == 0) {
+    return S_REJECT;
+  }
+  self->utf8Codepoint <<= 4;
+  if (('0' <= *self->buf && *self->buf <= '9')) {
+    self->utf8Codepoint |= *self->buf - '0';
+  } else if ('a' <= *self->buf && *self->buf <= 'f') {
+    self->utf8Codepoint |= 10 + *self->buf - 'a';
+  } else if ('A' <= *self->buf && *self->buf <= 'F') {
+    self->utf8Codepoint |= 10 + *self->buf - 'A';
+  } else {
+    return S_REJECT;
+  }
+  ++self->buf;
+
+  // Write codepoint in utf-8 if there's room in the user provided buffer. If
+  // there's not room, flush, write into a temp buffer, and flush again.
+  char tmp[3];
+  if (self->utf8Codepoint < 0x80) {
+    assert(self->bufEnd - self->writeBuf >= 1);
+    *self->writeBuf++ = self->utf8Codepoint;
+  } else if (self->utf8Codepoint < 0x800) {
+    bool useTmp = self->bufEnd - self->writeBuf < 2;
+    char *p = tmp;
+    if (useTmp) {
+      self->flushString();
+    }
+    auto &w = useTmp ? p : self->writeBuf;
+    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+    self->utf8Codepoint >>= 6;
+    w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
+    w += 2;
+    if (useTmp) {
+      // No room in place: hand the 2 encoded bytes in tmp to the callback.
+      self->callbacks->on_string_data(self->data, tmp, 2);
+    }
+  } else {
+    assert(self->utf8Codepoint < 0x10000);
+    bool useTmp = self->bufEnd - self->writeBuf < 3;
+    char *p = tmp;
+    if (useTmp) {
+      self->flushString();
+    }
+    auto &w = useTmp ? p : self->writeBuf;
+    w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+    self->utf8Codepoint >>= 6;
+    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+    self->utf8Codepoint >>= 6;
+    w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
+    w += 3;
+    if (useTmp) {
+      self->callbacks->on_string_data(self->data, tmp, 3);
+    }
+  }
+
+  self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
 }
 
 inline Status n_number(Parser3 *self) {
@@ -837,6 +904,7 @@ constexpr inline struct ContinuationTable {
     continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
         t_utf8_last_continuation_byte;
     continuations[T_HEX] = t_hex;
+    continuations[T_HEX2] = t_hex2;
     continuations[T_DIGIT] = t_digit;
     continuations[T_ONENINE] = t_onenine;
     continuations[T_EOF] = t_eof;
@@ -873,6 +941,7 @@ constexpr inline struct ContinuationTable {
     symbolNames[T_COLON] = "singleChar<':'>";
     symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
     symbolNames[T_HEX] = "t_hex";
+    symbolNames[T_HEX2] = "t_hex2";
     symbolNames[T_DIGIT] = "t_digit";
     symbolNames[T_ONENINE] = "t_onenine";
     symbolNames[T_EOF] = "t_eof";
@@ -904,6 +973,7 @@ inline Status Parser3::keepGoing(Parser3 *self) {
   case T_UTF8_CONTINUATION_BYTE:
   case T_UTF8_LAST_CONTINUATION_BYTE:
   case T_HEX:
+  case T_HEX2:
     self->flushString();
     break;
   case N_JSON:
diff --git a/src/test.cpp b/src/test.cpp
index a1483ff..297a15f 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -208,26 +208,53 @@ TEST_CASE("parser3") {
 
 TEST_CASE("streaming") { testStreaming(json); }
 
-TEST_CASE("unescaping basic") {
+void doTestUnescapingUtf8(std::string const &escaped,
+                          std::string const &expected, bool streaming) {
+  CAPTURE(escaped);
+  CAPTURE(expected);
+  CAPTURE(streaming);
   auto c = noopCallbacks();
-  c.on_string_data = +[](void *, const char *buf, int len) {
-    CHECK(std::string(buf, len) == "\n");
+  std::string result;
+  c.on_string_data = +[](void *p, const char *buf, int len) {
+    auto &s = *(std::string *)p;
+    s.append(buf, len);
   };
-  std::string copy = "\"\\n\"";
-  parser3::Parser3 parser(&c, nullptr);
-  CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
+  parser3::Parser3 parser(&c, &result);
+  auto copy = escaped;
+  if (streaming) {
+    for (std::size_t i = 0; i < copy.size(); ++i) {
+      CAPTURE(i);
+      CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
+    }
+  } else {
+    CHECK(parser.parse(copy.data(), copy.size()) == parser3::S_AGAIN);
+  }
   CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
+  CHECK(result.size() == expected.size());
+  CHECK(result == expected);
+}
+
+void testUnescapingUtf8(std::string const &escaped,
+                        std::string const &expected) {
+  doTestUnescapingUtf8(escaped, expected, false);
+  doTestUnescapingUtf8(escaped, expected, true);
 }
 
 TEST_CASE("unescaping utf-8") {
-  auto c = noopCallbacks();
-  c.on_string_data = +[](void *, const char *buf, int len) {
-    CHECK(std::string(buf, len) == "\uaB34");
-  };
-  std::string copy = "\"\\uaB34\"";
-  parser3::Parser3 parser(&c, nullptr);
-  CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
-  CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
+  // Basic
+  testUnescapingUtf8("\"\\\"\"", "\"");
+  testUnescapingUtf8("\"\\\\\"", "\\");
+  testUnescapingUtf8("\"\\/\"", "/");
+  testUnescapingUtf8("\"\\b\"", "\b");
+  testUnescapingUtf8("\"\\f\"", "\f");
+  testUnescapingUtf8("\"\\n\"", "\n");
+  testUnescapingUtf8("\"\\r\"", "\r");
+  testUnescapingUtf8("\"\\t\"", "\t");
+  // 2 byte encoding
+  testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234");
+  // 3 byte encoding
+  testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678");
+  // TODO 4 byte encoding (utf-16 surrogate pairs)
 }
 
 TEST_CASE("bench3") {