Unescape Basic Multilingual Plane \u escapes to UTF-8

This commit is contained in:
2025-05-19 14:26:12 -04:00
parent 19bc216458
commit 34ad19c22f
2 changed files with 120 additions and 23 deletions

View File

@@ -208,26 +208,53 @@ TEST_CASE("parser3") {
// Test case "streaming": hands `json` to the testStreaming helper (both are
// defined outside this view).
TEST_CASE("streaming") {
  testStreaming(json);
}
// NOTE(review): this span looks like a diff rendered without +/- markers. It
// appears to interleave the body of the "unescaping basic" test case with the
// doTestUnescapingUtf8 helper (two on_string_data assignments, two `copy`
// and two `parser` declarations) - confirm against the actual file.
TEST_CASE("unescaping basic") {
// Feeds `escaped` to a parser3::Parser3, concatenates every chunk delivered
// to on_string_data into one std::string, and CHECKs that the concatenation
// equals `expected`.
//   escaped   - text handed to parser.parse()
//   expected  - the string the collected chunks must equal
//   streaming - true: one parse() call per byte; false: one call for the
//               whole input
void doTestUnescapingUtf8(std::string const &escaped,
std::string const &expected, bool streaming) {
// Record the inputs so they are reported with any failing CHECK below.
CAPTURE(escaped);
CAPTURE(expected);
CAPTURE(streaming);
// Callback table to customize; presumably every entry starts as a no-op.
auto c = noopCallbacks();
// Single-chunk variant: CHECKs each delivered chunk is exactly a newline.
c.on_string_data = +[](void *, const char *buf, int len) {
CHECK(std::string(buf, len) == "\n");
// Accumulator for the chunks delivered by the callback below.
std::string result;
// Appends each chunk to the std::string that `p` points to. Presumably `p`
// is the second Parser3 constructor argument handed back - TODO confirm.
c.on_string_data = +[](void *p, const char *buf, int len) {
auto &s = *(std::string *)p;
s.append(buf, len);
};
// Hard-coded input: a quoted string containing only an escaped newline.
std::string copy = "\"\\n\"";
parser3::Parser3 parser(&c, nullptr);
CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
// Parser constructed with the accumulator's address as its second argument.
parser3::Parser3 parser(&c, &result);
// Work on a copy: `escaped` is const and parse() is given copy.data().
// NOTE(review): suggests parse() takes a mutable buffer - verify.
auto copy = escaped;
if (streaming) {
// One byte per call; every call is expected to return S_AGAIN.
// NOTE(review): `i` is int while copy.size() is unsigned, so this
// comparison mixes signedness.
for (int i = 0; i < copy.size(); ++i) {
// Report the byte offset with any failure in this iteration.
CAPTURE(i);
CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
}
} else {
// Whole input in a single call; also expected to return S_AGAIN.
CHECK(parser.parse(copy.data(), copy.size()) == parser3::S_AGAIN);
}
// A call with (nullptr, 0) is expected to return S_OK; presumably this
// signals end of input - TODO confirm.
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
// Length is checked separately from the contents.
CHECK(result.size() == expected.size());
CHECK(result == expected);
}
// Runs doTestUnescapingUtf8 twice on the same input: first with
// streaming == false (whole input in one parse call), then with
// streaming == true (one byte per parse call).
void testUnescapingUtf8(std::string const &escaped,
                        std::string const &expected) {
  const bool streamingModes[] = {false, true};
  for (const bool streamingMode : streamingModes) {
    doTestUnescapingUtf8(escaped, expected, streamingMode);
  }
}
// Test case "unescaping utf-8".
// NOTE(review): the statements before the "Basic" comment (callback table,
// single-chunk lambda, local `copy`, parser built with nullptr, two parse
// CHECKs) look like an earlier form of this test shown alongside the
// testUnescapingUtf8 calls, as in a diff without +/- markers - confirm
// against the actual file.
TEST_CASE("unescaping utf-8") {
auto c = noopCallbacks();
// CHECKs that each delivered chunk equals the literal for code point U+AB34.
c.on_string_data = +[](void *, const char *buf, int len) {
CHECK(std::string(buf, len) == "\uaB34");
};
// Input: a quoted string containing only the six-character escape for U+AB34.
std::string copy = "\"\\uaB34\"";
parser3::Parser3 parser(&c, nullptr);
// The whole input is expected to return S_AGAIN, then (nullptr, 0) S_OK.
CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
// Basic
// Two-character escapes: quote, backslash, slash, b, f, n, r, t. Each input
// is a quoted string holding one escape; the expected value is the single
// character it stands for.
testUnescapingUtf8("\"\\\"\"", "\"");
testUnescapingUtf8("\"\\\\\"", "\\");
testUnescapingUtf8("\"\\/\"", "/");
testUnescapingUtf8("\"\\b\"", "\b");
testUnescapingUtf8("\"\\f\"", "\f");
testUnescapingUtf8("\"\\n\"", "\n");
testUnescapingUtf8("\"\\r\"", "\r");
testUnescapingUtf8("\"\\t\"", "\t");
// 2 byte encoding
// U+07AB lies in U+0080..U+07FF, which UTF-8 encodes in two bytes. The
// trailing " 1234" is plain text following the escape.
// NOTE(review): the expected literal relies on the compiler encoding
// narrow string literals as UTF-8 - confirm build settings.
testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234");
// 3 byte encoding
// U+AB34 lies in U+0800..U+FFFF, which UTF-8 encodes in three bytes.
testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678");
// TODO 4 byte encoding (utf-16 surrogate pairs)
}
TEST_CASE("bench3") {