Unescape Basic Multilingual Plane \uXXXX escapes to UTF-8

2025-05-19 14:26:12 -04:00
parent 19bc216458
commit 34ad19c22f
2 changed files with 120 additions and 23 deletions
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -208,26 +208,53 @@ TEST_CASE("parser3") {

 TEST_CASE("streaming") { testStreaming(json); }

-TEST_CASE("unescaping basic") {
+void doTestUnescapingUtf8(std::string const &escaped,
+                          std::string const &expected, bool streaming) {
+  CAPTURE(escaped);   // JSON text handed to the parser
+  CAPTURE(expected);  // string data the callbacks are expected to deliver
+  CAPTURE(streaming); // true: one byte per parse() call, false: single call
  auto c = noopCallbacks();
-  c.on_string_data = +[](void *, const char *buf, int len) {
-    CHECK(std::string(buf, len) == "\n");
+  std::string result; // concatenation of every on_string_data chunk
+  c.on_string_data = +[](void *p, const char *buf, int len) {
+    auto &out = *static_cast<std::string *>(p); // expects p == &result below
+    out.append(buf, len);
  };
-  std::string copy = "\"\\n\"";
-  parser3::Parser3 parser(&c, nullptr);
-  CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
+  parser3::Parser3 parser(&c, &result);
+  auto copy = escaped; // NOTE(review): presumably parse() wants a mutable buffer
+  if (streaming) {     // feed the input one byte at a time
+    for (std::size_t i = 0; i < copy.size(); ++i) {
+      CAPTURE(i);
+      CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
+    }
+  } else { // feed the whole input in one call
+    CHECK(parser.parse(copy.data(), copy.size()) == parser3::S_AGAIN);
+  }
  CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
+  CHECK(result.size() == expected.size()); // size first: clearer on mismatch
+  CHECK(result == expected);
+}
+
+void testUnescapingUtf8(std::string const &escaped,
+                        std::string const &expected) {
+  for (bool streaming : {false, true}) // single-call first, then streaming
+    doTestUnescapingUtf8(escaped, expected, streaming);
 }

 TEST_CASE("unescaping utf-8") {
-  auto c = noopCallbacks();
-  c.on_string_data = +[](void *, const char *buf, int len) {
-    CHECK(std::string(buf, len) == "\uaB34");
-  };
-  std::string copy = "\"\\uaB34\"";
-  parser3::Parser3 parser(&c, nullptr);
-  CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
-  CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
+  // Single-character escapes
+  testUnescapingUtf8("\"\\\"\"", "\"");
+  testUnescapingUtf8("\"\\\\\"", "\\");
+  testUnescapingUtf8("\"\\/\"", "/");
+  testUnescapingUtf8("\"\\b\"", "\b");
+  testUnescapingUtf8("\"\\f\"", "\f");
+  testUnescapingUtf8("\"\\n\"", "\n");
+  testUnescapingUtf8("\"\\r\"", "\r");
+  testUnescapingUtf8("\"\\t\"", "\t");
+  // 2 byte encoding (U+07AB -> DE AB); explicit bytes, independent of charset
+  testUnescapingUtf8("\"\\u07aB 1234\"", "\xDE\xAB 1234");
+  // 3 byte encoding (U+AB34 -> EA AC B4); explicit bytes, independent of charset
+  testUnescapingUtf8("\"\\uaB34 5678\"", "\xEA\xAC\xB4 5678");
+  // TODO 4 byte encoding (utf-16 surrogate pairs)
 }

 TEST_CASE("bench3") {