Unescape basic multi-lingual plane utf8

This commit is contained in:
2025-05-19 14:26:12 -04:00
parent 19bc216458
commit 34ad19c22f
2 changed files with 120 additions and 23 deletions

View File

@@ -65,6 +65,7 @@ enum Symbol : uint8_t {
T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE,
T_HEX,
T_HEX2,
T_DIGIT,
T_ONENINE,
T_EOF,
@@ -96,6 +97,7 @@ struct Parser3 {
if (len > 0) {
callbacks->on_string_data(data, dataBegin, len);
}
dataBegin = writeBuf;
}
[[nodiscard]] bool empty() const { return stackPtr == stack; }
@@ -446,9 +448,9 @@ inline Status n_string_following_escape(Parser3 *self) {
MUSTTAIL return Parser3::keepGoing(self);
case 'u':
++self->buf;
// TODO unescape
self->utf8Codepoint = 0;
self->pop();
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
@@ -525,14 +527,79 @@ inline Status t_hex(Parser3 *self) {
if (self->len() == 0) {
return S_REJECT;
}
if (('0' <= *self->buf && *self->buf <= '9') ||
('a' <= *self->buf && *self->buf <= 'f') ||
('A' <= *self->buf && *self->buf <= 'F')) {
*self->writeBuf++ = *self->buf++;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
self->utf8Codepoint <<= 4;
if (('0' <= *self->buf && *self->buf <= '9')) {
self->utf8Codepoint |= *self->buf - '0';
} else if ('a' <= *self->buf && *self->buf <= 'f') {
self->utf8Codepoint |= 10 + *self->buf - 'a';
} else if ('A' <= *self->buf && *self->buf <= 'F') {
self->utf8Codepoint |= 10 + *self->buf - 'A';
} else {
return S_REJECT;
}
return S_REJECT;
++self->buf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
}
// Consumes the final hex digit of a `\uXXXX` escape and emits the decoded
// code point as UTF-8 string data.
//
// The digit is folded into self->utf8Codepoint (which the preceding hex
// symbols have been accumulating). The code point is then encoded as 1, 2 or
// 3 UTF-8 bytes; only values below 0x10000 are supported (asserted).
//
// Returns S_REJECT if no input is available or the next byte is not a hex
// digit. Otherwise pops this symbol and tail-calls Parser3::keepGoing.
//
// NOTE(review): code points in the surrogate range (0xD800-0xDFFF) are
// encoded as-is into three bytes, which is not well-formed UTF-8; surrogate
// pairs are not combined here.
inline Status t_hex2(Parser3 *self) {
  if (self->len() == 0) {
    return S_REJECT;
  }

  // Fold the last nibble into the accumulated code point.
  self->utf8Codepoint <<= 4;
  const auto c = *self->buf;
  if ('0' <= c && c <= '9') {
    self->utf8Codepoint |= c - '0';
  } else if ('a' <= c && c <= 'f') {
    self->utf8Codepoint |= 10 + c - 'a';
  } else if ('A' <= c && c <= 'F') {
    self->utf8Codepoint |= 10 + c - 'A';
  } else {
    return S_REJECT;
  }
  ++self->buf;

  // Write the code point in UTF-8 directly at writeBuf if there's room. If
  // there's not room, flush what is pending, encode into a temp buffer, and
  // hand that to the callback separately.
  //
  // Only the bytes in [writeBuf, buf) have already been consumed, so that is
  // the space that can be overwritten without clobbering unread input (this
  // matters when the escape started in an earlier chunk).
  // NOTE(review): assumes writeBuf and buf point into the same buffer with
  // writeBuf <= buf - confirm. The bound is never larger than
  // bufEnd - writeBuf, so it can only select the temp-buffer path more often.
  const auto room = self->buf - self->writeBuf;
  char tmp[3];
  if (self->utf8Codepoint < 0x80) {
    // One byte: 0xxxxxxx.
    assert(self->bufEnd - self->writeBuf >= 1);
    *self->writeBuf++ = static_cast<char>(self->utf8Codepoint);
  } else if (self->utf8Codepoint < 0x800) {
    // Two bytes: 110xxxxx 10xxxxxx.
    const bool useTmp = room < 2;
    if (useTmp) {
      self->flushString();
    }
    char *w = useTmp ? tmp : self->writeBuf;
    w[1] = static_cast<char>((0b00111111 & self->utf8Codepoint) | 0b10000000);
    self->utf8Codepoint >>= 6;
    w[0] = static_cast<char>((0b00011111 & self->utf8Codepoint) | 0b11000000);
    if (useTmp) {
      self->callbacks->on_string_data(self->data, tmp, 2);
    } else {
      self->writeBuf += 2;
    }
  } else {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    assert(self->utf8Codepoint < 0x10000);
    const bool useTmp = room < 3;
    if (useTmp) {
      self->flushString();
    }
    char *w = useTmp ? tmp : self->writeBuf;
    w[2] = static_cast<char>((0b00111111 & self->utf8Codepoint) | 0b10000000);
    self->utf8Codepoint >>= 6;
    w[1] = static_cast<char>((0b00111111 & self->utf8Codepoint) | 0b10000000);
    self->utf8Codepoint >>= 6;
    w[0] = static_cast<char>((0b00001111 & self->utf8Codepoint) | 0b11100000);
    if (useTmp) {
      self->callbacks->on_string_data(self->data, tmp, 3);
    } else {
      self->writeBuf += 3;
    }
  }

  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
inline Status n_number(Parser3 *self) {
@@ -837,6 +904,7 @@ constexpr inline struct ContinuationTable {
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
t_utf8_last_continuation_byte;
continuations[T_HEX] = t_hex;
continuations[T_HEX2] = t_hex2;
continuations[T_DIGIT] = t_digit;
continuations[T_ONENINE] = t_onenine;
continuations[T_EOF] = t_eof;
@@ -873,6 +941,7 @@ constexpr inline struct ContinuationTable {
symbolNames[T_COLON] = "singleChar<':'>";
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
symbolNames[T_HEX] = "t_hex";
symbolNames[T_HEX2] = "t_hex2";
symbolNames[T_DIGIT] = "t_digit";
symbolNames[T_ONENINE] = "t_onenine";
symbolNames[T_EOF] = "t_eof";
@@ -904,6 +973,7 @@ inline Status Parser3::keepGoing(Parser3 *self) {
case T_UTF8_CONTINUATION_BYTE:
case T_UTF8_LAST_CONTINUATION_BYTE:
case T_HEX:
case T_HEX2:
self->flushString();
break;
case N_JSON: