Unescape basic multi-lingual plane utf8
This commit is contained in:
@@ -65,6 +65,7 @@ enum Symbol : uint8_t {
|
|||||||
T_UTF8_CONTINUATION_BYTE,
|
T_UTF8_CONTINUATION_BYTE,
|
||||||
T_UTF8_LAST_CONTINUATION_BYTE,
|
T_UTF8_LAST_CONTINUATION_BYTE,
|
||||||
T_HEX,
|
T_HEX,
|
||||||
|
T_HEX2,
|
||||||
T_DIGIT,
|
T_DIGIT,
|
||||||
T_ONENINE,
|
T_ONENINE,
|
||||||
T_EOF,
|
T_EOF,
|
||||||
@@ -96,6 +97,7 @@ struct Parser3 {
|
|||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
callbacks->on_string_data(data, dataBegin, len);
|
callbacks->on_string_data(data, dataBegin, len);
|
||||||
}
|
}
|
||||||
|
dataBegin = writeBuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
[[nodiscard]] bool empty() const { return stackPtr == stack; }
|
[[nodiscard]] bool empty() const { return stackPtr == stack; }
|
||||||
@@ -446,9 +448,9 @@ inline Status n_string_following_escape(Parser3 *self) {
|
|||||||
MUSTTAIL return Parser3::keepGoing(self);
|
MUSTTAIL return Parser3::keepGoing(self);
|
||||||
case 'u':
|
case 'u':
|
||||||
++self->buf;
|
++self->buf;
|
||||||
// TODO unescape
|
self->utf8Codepoint = 0;
|
||||||
self->pop();
|
self->pop();
|
||||||
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
|
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
MUSTTAIL return Parser3::keepGoing(self);
|
MUSTTAIL return Parser3::keepGoing(self);
|
||||||
@@ -525,14 +527,79 @@ inline Status t_hex(Parser3 *self) {
|
|||||||
if (self->len() == 0) {
|
if (self->len() == 0) {
|
||||||
return S_REJECT;
|
return S_REJECT;
|
||||||
}
|
}
|
||||||
if (('0' <= *self->buf && *self->buf <= '9') ||
|
self->utf8Codepoint <<= 4;
|
||||||
('a' <= *self->buf && *self->buf <= 'f') ||
|
if (('0' <= *self->buf && *self->buf <= '9')) {
|
||||||
('A' <= *self->buf && *self->buf <= 'F')) {
|
self->utf8Codepoint |= *self->buf - '0';
|
||||||
*self->writeBuf++ = *self->buf++;
|
} else if ('a' <= *self->buf && *self->buf <= 'f') {
|
||||||
self->pop();
|
self->utf8Codepoint |= 10 + *self->buf - 'a';
|
||||||
MUSTTAIL return Parser3::keepGoing(self);
|
} else if ('A' <= *self->buf && *self->buf <= 'F') {
|
||||||
|
self->utf8Codepoint |= 10 + *self->buf - 'A';
|
||||||
|
} else {
|
||||||
|
return S_REJECT;
|
||||||
}
|
}
|
||||||
return S_REJECT;
|
++self->buf;
|
||||||
|
self->pop();
|
||||||
|
MUSTTAIL return Parser3::keepGoing(self);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encodes `cp`, a code point below 0x10000 (Basic Multilingual Plane), as
// UTF-8 into `out`, which must have room for 3 bytes.
// Returns the number of bytes written (1, 2 or 3).
inline int encodeBmpUtf8(uint32_t cp, char *out) {
  if (cp < 0x80) {
    out[0] = static_cast<char>(cp);
    return 1;
  }
  if (cp < 0x800) {
    out[0] = static_cast<char>(0b11000000 | ((cp >> 6) & 0b00011111));
    out[1] = static_cast<char>(0b10000000 | (cp & 0b00111111));
    return 2;
  }
  out[0] = static_cast<char>(0b11100000 | ((cp >> 12) & 0b00001111));
  out[1] = static_cast<char>(0b10000000 | ((cp >> 6) & 0b00111111));
  out[2] = static_cast<char>(0b10000000 | (cp & 0b00111111));
  return 3;
}

// Continuation for the final hex digit of a `\uXXXX` escape.
//
// Shifts the digit into self->utf8Codepoint like t_hex does, then emits the
// accumulated code point as UTF-8 string data. Returns S_REJECT if no input is
// available or the next byte is not a hex digit; otherwise pops this symbol and
// continues parsing.
//
// NOTE: only the Basic Multilingual Plane is handled. Values in the surrogate
// range (0xD800-0xDFFF) are encoded as-is into 3 bytes, which is not valid
// UTF-8; combining UTF-16 surrogate pairs into a 4-byte sequence is still TODO.
inline Status t_hex2(Parser3 *self) {
  if (self->len() == 0) {
    return S_REJECT;
  }
  self->utf8Codepoint <<= 4;
  const auto c = *self->buf;
  if ('0' <= c && c <= '9') {
    self->utf8Codepoint |= c - '0';
  } else if ('a' <= c && c <= 'f') {
    self->utf8Codepoint |= 10 + c - 'a';
  } else if ('A' <= c && c <= 'F') {
    self->utf8Codepoint |= 10 + c - 'A';
  } else {
    return S_REJECT;
  }
  ++self->buf;

  // Four hex digits can never exceed the BMP.
  assert(self->utf8Codepoint < 0x10000);

  // Encode into a scratch buffer first; unlike the previous version this
  // leaves self->utf8Codepoint holding the decoded value instead of a
  // partially shifted one.
  char encoded[3];
  const int n =
      encodeBmpUtf8(static_cast<uint32_t>(self->utf8Codepoint), encoded);

  // Unescaped output is compacted in place into the caller's buffer, so it may
  // only occupy bytes that have already been consumed (everything before
  // self->buf). Bounding by bufEnd instead could overwrite unread input when
  // the escape began in an earlier chunk.
  // NOTE(review): assumes writeBuf starts at buf on each parse() call - confirm.
  if (self->buf - self->writeBuf >= n) {
    for (int i = 0; i < n; ++i) {
      *self->writeBuf++ = encoded[i];
    }
  } else {
    // Not enough consumed room: hand over what has been written so far, then
    // deliver the encoded bytes straight from the scratch buffer.
    self->flushString();
    self->callbacks->on_string_data(self->data, encoded, n);
  }

  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
||||||
|
|
||||||
inline Status n_number(Parser3 *self) {
|
inline Status n_number(Parser3 *self) {
|
||||||
@@ -837,6 +904,7 @@ constexpr inline struct ContinuationTable {
|
|||||||
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
|
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
|
||||||
t_utf8_last_continuation_byte;
|
t_utf8_last_continuation_byte;
|
||||||
continuations[T_HEX] = t_hex;
|
continuations[T_HEX] = t_hex;
|
||||||
|
continuations[T_HEX2] = t_hex2;
|
||||||
continuations[T_DIGIT] = t_digit;
|
continuations[T_DIGIT] = t_digit;
|
||||||
continuations[T_ONENINE] = t_onenine;
|
continuations[T_ONENINE] = t_onenine;
|
||||||
continuations[T_EOF] = t_eof;
|
continuations[T_EOF] = t_eof;
|
||||||
@@ -873,6 +941,7 @@ constexpr inline struct ContinuationTable {
|
|||||||
symbolNames[T_COLON] = "singleChar<':'>";
|
symbolNames[T_COLON] = "singleChar<':'>";
|
||||||
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
|
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
|
||||||
symbolNames[T_HEX] = "t_hex";
|
symbolNames[T_HEX] = "t_hex";
|
||||||
|
symbolNames[T_HEX2] = "t_hex2";
|
||||||
symbolNames[T_DIGIT] = "t_digit";
|
symbolNames[T_DIGIT] = "t_digit";
|
||||||
symbolNames[T_ONENINE] = "t_onenine";
|
symbolNames[T_ONENINE] = "t_onenine";
|
||||||
symbolNames[T_EOF] = "t_eof";
|
symbolNames[T_EOF] = "t_eof";
|
||||||
@@ -904,6 +973,7 @@ inline Status Parser3::keepGoing(Parser3 *self) {
|
|||||||
case T_UTF8_CONTINUATION_BYTE:
|
case T_UTF8_CONTINUATION_BYTE:
|
||||||
case T_UTF8_LAST_CONTINUATION_BYTE:
|
case T_UTF8_LAST_CONTINUATION_BYTE:
|
||||||
case T_HEX:
|
case T_HEX:
|
||||||
|
case T_HEX2:
|
||||||
self->flushString();
|
self->flushString();
|
||||||
break;
|
break;
|
||||||
case N_JSON:
|
case N_JSON:
|
||||||
|
|||||||
55
src/test.cpp
55
src/test.cpp
@@ -208,26 +208,53 @@ TEST_CASE("parser3") {
|
|||||||
|
|
||||||
TEST_CASE("streaming") { testStreaming(json); }
|
TEST_CASE("streaming") { testStreaming(json); }
|
||||||
|
|
||||||
TEST_CASE("unescaping basic") {
|
void doTestUnescapingUtf8(std::string const &escaped,
|
||||||
|
std::string const &expected, bool streaming) {
|
||||||
|
CAPTURE(escaped);
|
||||||
|
CAPTURE(expected);
|
||||||
|
CAPTURE(streaming);
|
||||||
auto c = noopCallbacks();
|
auto c = noopCallbacks();
|
||||||
c.on_string_data = +[](void *, const char *buf, int len) {
|
std::string result;
|
||||||
CHECK(std::string(buf, len) == "\n");
|
c.on_string_data = +[](void *p, const char *buf, int len) {
|
||||||
|
auto &s = *(std::string *)p;
|
||||||
|
s.append(buf, len);
|
||||||
};
|
};
|
||||||
std::string copy = "\"\\n\"";
|
parser3::Parser3 parser(&c, &result);
|
||||||
parser3::Parser3 parser(&c, nullptr);
|
auto copy = escaped;
|
||||||
CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
|
if (streaming) {
|
||||||
|
for (int i = 0; i < copy.size(); ++i) {
|
||||||
|
CAPTURE(i);
|
||||||
|
CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
CHECK(parser.parse(copy.data(), copy.size()) == parser3::S_AGAIN);
|
||||||
|
}
|
||||||
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
|
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
|
||||||
|
CHECK(result.size() == expected.size());
|
||||||
|
CHECK(result == expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Checks that parsing `escaped` yields `expected` as string data, first with
// the whole input passed in a single parse call and then with the input fed
// to the parser one byte at a time.
void testUnescapingUtf8(std::string const &escaped,
                        std::string const &expected) {
  for (const bool streaming : {false, true}) {
    doTestUnescapingUtf8(escaped, expected, streaming);
  }
}
|
||||||
|
|
||||||
TEST_CASE("unescaping utf-8") {
|
TEST_CASE("unescaping utf-8") {
|
||||||
auto c = noopCallbacks();
|
// Basic
|
||||||
c.on_string_data = +[](void *, const char *buf, int len) {
|
testUnescapingUtf8("\"\\\"\"", "\"");
|
||||||
CHECK(std::string(buf, len) == "\uaB34");
|
testUnescapingUtf8("\"\\\\\"", "\\");
|
||||||
};
|
testUnescapingUtf8("\"\\/\"", "/");
|
||||||
std::string copy = "\"\\uaB34\"";
|
testUnescapingUtf8("\"\\b\"", "\b");
|
||||||
parser3::Parser3 parser(&c, nullptr);
|
testUnescapingUtf8("\"\\f\"", "\f");
|
||||||
CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
|
testUnescapingUtf8("\"\\n\"", "\n");
|
||||||
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
|
testUnescapingUtf8("\"\\r\"", "\r");
|
||||||
|
testUnescapingUtf8("\"\\t\"", "\t");
|
||||||
|
// 2 byte encoding
|
||||||
|
testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234");
|
||||||
|
// 3 byte encoding
|
||||||
|
testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678");
|
||||||
|
// TODO 4 byte encoding (utf-16 surrogate pairs)
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("bench3") {
|
TEST_CASE("bench3") {
|
||||||
|
|||||||
Reference in New Issue
Block a user