Validate utf8

This commit is contained in:
2025-05-18 17:23:22 -04:00
parent 452bbd3d9c
commit d279173482
6 changed files with 105 additions and 95 deletions

View File

@@ -63,6 +63,7 @@ enum Symbol : uint8_t {
T_S,
T_COLON,
T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE,
T_HEX,
T_DIGIT,
T_ONENINE,
@@ -118,6 +119,8 @@ struct Parser3 {
Symbol stack[kMaxStackSize];
Symbol *stackPtr = stack;
bool complete = false;
uint32_t utf8Codepoint;
uint32_t minCodepoint;
};
inline Status n_json(Parser3 *self) {
@@ -347,6 +350,9 @@ inline Status n_string2(Parser3 *self) {
if (*self->buf != '"') {
self->callbacks->on_string_data(self->data, self->buf, 1);
}
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
return S_REJECT;
}
if (int8_t(*self->buf) > 0) {
// one byte utf-8 encoding
switch (*self->buf) {
@@ -368,28 +374,34 @@ inline Status n_string2(Parser3 *self) {
}
} else if ((*self->buf & 0b11100000) == 0b11000000) {
// two byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00011111;
self->minCodepoint = 0x80;
++self->buf;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
}
if ((*self->buf & 0b11110000) == 0b11100000) {
// three byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00001111;
self->minCodepoint = 0x800;
++self->buf;
self->pop();
if (auto s = self->push(
{T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
} else if ((*self->buf & 0b11111000) == 0b11110000) {
// four byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00000111;
self->minCodepoint = 0x10000;
++self->buf;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
@@ -433,7 +445,36 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
if (self->len() == 0) {
return S_REJECT;
}
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
return S_REJECT;
}
if ((*self->buf & 0b11000000) == 0b10000000) {
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111;
self->callbacks->on_string_data(self->data, self->buf, 1);
++self->buf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
}
return S_REJECT;
}
inline Status t_utf8_last_continuation_byte(Parser3 *self) {
if (self->len() == 0) {
return S_REJECT;
}
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
return S_REJECT;
}
if ((*self->buf & 0b11000000) == 0b10000000) {
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111;
if (self->utf8Codepoint < self->minCodepoint ||
self->utf8Codepoint > 0x10ffff ||
(0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
return S_REJECT;
}
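// TODO: now that the code point is complete, mark utf8Codepoint and
// minCodepoint as undefined for valgrind (presumably via
// VALGRIND_MAKE_MEM_UNDEFINED) so any stale read before the next lead byte
// re-initializes them gets reported.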
self->callbacks->on_string_data(self->data, self->buf, 1);
++self->buf;
self->pop();
@@ -782,6 +823,8 @@ constexpr inline struct ContinuationTable {
continuations[T_S] = singleChar<'s'>;
continuations[T_COLON] = singleChar<':'>;
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
t_utf8_last_continuation_byte;
continuations[T_HEX] = t_hex;
continuations[T_DIGIT] = t_digit;
continuations[T_ONENINE] = t_onenine;