From 34ad19c22f8951abbc3aa6849d6e86f05d820f03 Mon Sep 17 00:00:00 2001
From: Andrew Noyes <andrew@weaselab.dev>
Date: Mon, 19 May 2025 14:26:12 -0400
Subject: [PATCH] Unescape \u escapes in the Basic Multilingual Plane to UTF-8

---
 src/parser3.h | 88 +++++++++++++++++++++++++++++++++++++++++++++------
 src/test.cpp  | 55 ++++++++++++++++++++++++--------
 2 files changed, 120 insertions(+), 23 deletions(-)

diff --git a/src/parser3.h b/src/parser3.h
index 7e5d3c3..4a5f260 100644
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -65,6 +65,7 @@ enum Symbol : uint8_t {
   T_UTF8_CONTINUATION_BYTE,
   T_UTF8_LAST_CONTINUATION_BYTE,
   T_HEX,
+  T_HEX2,
   T_DIGIT,
   T_ONENINE,
   T_EOF,
@@ -96,6 +97,7 @@ struct Parser3 {
     if (len > 0) {
       callbacks->on_string_data(data, dataBegin, len);
     }
+    dataBegin = writeBuf;
   }
 
   [[nodiscard]] bool empty() const { return stackPtr == stack; }
@@ -446,9 +448,9 @@ inline Status n_string_following_escape(Parser3 *self) {
     MUSTTAIL return Parser3::keepGoing(self);
   case 'u':
     ++self->buf;
-    // TODO unescape
+    self->utf8Codepoint = 0;
     self->pop();
-    if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
+    if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
       return s;
     }
     MUSTTAIL return Parser3::keepGoing(self);
@@ -525,14 +527,79 @@ inline Status t_hex(Parser3 *self) {
   if (self->len() == 0) {
     return S_REJECT;
   }
-  if (('0' <= *self->buf && *self->buf <= '9') ||
-      ('a' <= *self->buf && *self->buf <= 'f') ||
-      ('A' <= *self->buf && *self->buf <= 'F')) {
-    *self->writeBuf++ = *self->buf++;
-    self->pop();
-    MUSTTAIL return Parser3::keepGoing(self);
+  self->utf8Codepoint <<= 4;
+  if (('0' <= *self->buf && *self->buf <= '9')) {
+    self->utf8Codepoint |= *self->buf - '0';
+  } else if ('a' <= *self->buf && *self->buf <= 'f') {
+    self->utf8Codepoint |= 10 + *self->buf - 'a';
+  } else if ('A' <= *self->buf && *self->buf <= 'F') {
+    self->utf8Codepoint |= 10 + *self->buf - 'A';
+  } else {
+    return S_REJECT;
   }
-  return S_REJECT;
+  ++self->buf;
+  self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
+}
+
+// Final hex digit of a \uXXXX escape: completes utf8Codepoint, then emits it
+// as 1-3 bytes of UTF-8. Surrogate halves are not paired up (BMP only).
+inline Status t_hex2(Parser3 *self) {
+  if (self->len() == 0) {
+    return S_REJECT;
+  }
+  self->utf8Codepoint <<= 4;
+  if (('0' <= *self->buf && *self->buf <= '9')) {
+    self->utf8Codepoint |= *self->buf - '0';
+  } else if ('a' <= *self->buf && *self->buf <= 'f') {
+    self->utf8Codepoint |= 10 + *self->buf - 'a';
+  } else if ('A' <= *self->buf && *self->buf <= 'F') {
+    self->utf8Codepoint |= 10 + *self->buf - 'A';
+  } else {
+    return S_REJECT;
+  }
+  ++self->buf;
+  // Encode in place only when the bytes fit in the consumed span [writeBuf,
+  // buf), so unread input is never clobbered; else flush and emit from tmp.
+  char tmp[3];
+  if (self->utf8Codepoint < 0x80) {
+    assert(self->buf - self->writeBuf >= 1);
+    *self->writeBuf++ = self->utf8Codepoint;
+  } else if (self->utf8Codepoint < 0x800) {
+    bool useTmp = self->buf - self->writeBuf < 2;
+    char *p = tmp;
+    if (useTmp) {
+      self->flushString();
+    }
+    auto &w = useTmp ? p : self->writeBuf;
+    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+    self->utf8Codepoint >>= 6;
+    w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
+    w += 2;
+    if (useTmp) {
+      // tmp is not part of the user's buffer, so hand it over directly.
+      self->callbacks->on_string_data(self->data, tmp, 2);
+    }
+  } else {
+    assert(self->utf8Codepoint < 0x10000);
+    bool useTmp = self->buf - self->writeBuf < 3;
+    char *p = tmp;
+    if (useTmp) {
+      self->flushString();
+    }
+    auto &w = useTmp ? p : self->writeBuf;
+    w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+    self->utf8Codepoint >>= 6;
+    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+    self->utf8Codepoint >>= 6;
+    w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
+    w += 3;
+    if (useTmp) {
+      self->callbacks->on_string_data(self->data, tmp, 3);
+    }
+  }
+  self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
 }
 
 inline Status n_number(Parser3 *self) {
@@ -837,6 +904,7 @@ constexpr inline struct ContinuationTable {
     continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
         t_utf8_last_continuation_byte;
     continuations[T_HEX] = t_hex;
+    continuations[T_HEX2] = t_hex2;
     continuations[T_DIGIT] = t_digit;
     continuations[T_ONENINE] = t_onenine;
     continuations[T_EOF] = t_eof;
@@ -873,6 +941,7 @@ constexpr inline struct ContinuationTable {
     symbolNames[T_COLON] = "singleChar<':'>";
     symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
     symbolNames[T_HEX] = "t_hex";
+    symbolNames[T_HEX2] = "t_hex2";
     symbolNames[T_DIGIT] = "t_digit";
     symbolNames[T_ONENINE] = "t_onenine";
     symbolNames[T_EOF] = "t_eof";
@@ -904,6 +973,7 @@ inline Status Parser3::keepGoing(Parser3 *self) {
     case T_UTF8_CONTINUATION_BYTE:
     case T_UTF8_LAST_CONTINUATION_BYTE:
     case T_HEX:
+    case T_HEX2:
       self->flushString();
       break;
     case N_JSON:
diff --git a/src/test.cpp b/src/test.cpp
index a1483ff..297a15f 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -208,26 +208,53 @@ TEST_CASE("parser3") {
 
 TEST_CASE("streaming") { testStreaming(json); }
 
-TEST_CASE("unescaping basic") {
+void doTestUnescapingUtf8(std::string const &escaped,
+                          std::string const &expected, bool streaming) {
+  CAPTURE(escaped);
+  CAPTURE(expected);
+  CAPTURE(streaming);
   auto c = noopCallbacks();
-  c.on_string_data = +[](void *, const char *buf, int len) {
-    CHECK(std::string(buf, len) == "\n");
+  std::string result; // accumulates every on_string_data chunk
+  c.on_string_data = +[](void *p, const char *buf, int len) {
+    auto &s = *static_cast<std::string *>(p);
+    s.append(buf, len);
   };
-  std::string copy = "\"\\n\"";
-  parser3::Parser3 parser(&c, nullptr);
-  CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
+  parser3::Parser3 parser(&c, &result); // &result: user pointer for callbacks
+  auto copy = escaped; // mutable copy: the parser writes into its input
+  if (streaming) {
+    for (std::size_t i = 0; i < copy.size(); ++i) { // one byte per call
+      CAPTURE(i);
+      CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
+    }
+  } else {
+    CHECK(parser.parse(copy.data(), copy.size()) == parser3::S_AGAIN);
+  }
   CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
+  CHECK(result.size() == expected.size());
+  CHECK(result == expected);
+}
+
+void testUnescapingUtf8(std::string const &escaped,
+                        std::string const &expected) {
+  for (bool streaming : {false, true}) // one-shot first, then byte-by-byte
+    doTestUnescapingUtf8(escaped, expected, streaming);
 }
 
 TEST_CASE("unescaping utf-8") {
-  auto c = noopCallbacks();
-  c.on_string_data = +[](void *, const char *buf, int len) {
-    CHECK(std::string(buf, len) == "\uaB34");
-  };
-  std::string copy = "\"\\uaB34\"";
-  parser3::Parser3 parser(&c, nullptr);
-  CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
-  CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
+  // Basic: single-character escapes, each unescaping to one byte
+  testUnescapingUtf8("\"\\\"\"", "\"");
+  testUnescapingUtf8("\"\\\\\"", "\\");
+  testUnescapingUtf8("\"\\/\"", "/");
+  testUnescapingUtf8("\"\\b\"", "\b");
+  testUnescapingUtf8("\"\\f\"", "\f");
+  testUnescapingUtf8("\"\\n\"", "\n");
+  testUnescapingUtf8("\"\\r\"", "\r");
+  testUnescapingUtf8("\"\\t\"", "\t");
+  // 2 byte encoding: code point below 0x800, followed by unescaped text
+  testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234");
+  // 3 byte encoding: code point 0x800 or above, followed by unescaped text
+  testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678");
+  // TODO 4 byte encoding (utf-16 surrogate pairs, i.e. above 0xFFFF)
 }
 
 TEST_CASE("bench3") {