Validate utf8
This commit is contained in:
@@ -63,6 +63,7 @@ enum Symbol : uint8_t {
|
||||
T_S,
|
||||
T_COLON,
|
||||
T_UTF8_CONTINUATION_BYTE,
|
||||
T_UTF8_LAST_CONTINUATION_BYTE,
|
||||
T_HEX,
|
||||
T_DIGIT,
|
||||
T_ONENINE,
|
||||
@@ -118,6 +119,8 @@ struct Parser3 {
|
||||
Symbol stack[kMaxStackSize];
|
||||
Symbol *stackPtr = stack;
|
||||
bool complete = false;
|
||||
uint32_t utf8Codepoint;
|
||||
uint32_t minCodepoint;
|
||||
};
|
||||
|
||||
inline Status n_json(Parser3 *self) {
|
||||
@@ -347,6 +350,9 @@ inline Status n_string2(Parser3 *self) {
|
||||
if (*self->buf != '"') {
|
||||
self->callbacks->on_string_data(self->data, self->buf, 1);
|
||||
}
|
||||
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if (int8_t(*self->buf) > 0) {
|
||||
// one byte utf-8 encoding
|
||||
switch (*self->buf) {
|
||||
@@ -368,28 +374,34 @@ inline Status n_string2(Parser3 *self) {
|
||||
}
|
||||
} else if ((*self->buf & 0b11100000) == 0b11000000) {
|
||||
// two byte utf-8 encoding
|
||||
self->utf8Codepoint = *self->buf & 0b00011111;
|
||||
self->minCodepoint = 0x80;
|
||||
++self->buf;
|
||||
self->pop();
|
||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
|
||||
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
||||
return s;
|
||||
}
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
}
|
||||
if ((*self->buf & 0b11110000) == 0b11100000) {
|
||||
// three byte utf-8 encoding
|
||||
self->utf8Codepoint = *self->buf & 0b00001111;
|
||||
self->minCodepoint = 0x800;
|
||||
++self->buf;
|
||||
self->pop();
|
||||
if (auto s = self->push(
|
||||
{T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
|
||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
|
||||
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
||||
return s;
|
||||
}
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
} else if ((*self->buf & 0b11111000) == 0b11110000) {
|
||||
// four byte utf-8 encoding
|
||||
self->utf8Codepoint = *self->buf & 0b00000111;
|
||||
self->minCodepoint = 0x10000;
|
||||
++self->buf;
|
||||
self->pop();
|
||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
|
||||
T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
|
||||
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
||||
return s;
|
||||
}
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
@@ -433,7 +445,36 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
|
||||
if (self->len() == 0) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if ((*self->buf & 0b11000000) == 0b10000000) {
|
||||
self->utf8Codepoint <<= 6;
|
||||
self->utf8Codepoint |= *self->buf & 0b00111111;
|
||||
self->callbacks->on_string_data(self->data, self->buf, 1);
|
||||
++self->buf;
|
||||
self->pop();
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
}
|
||||
return S_REJECT;
|
||||
}
|
||||
|
||||
inline Status t_utf8_last_continuation_byte(Parser3 *self) {
|
||||
if (self->len() == 0) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if ((*self->buf & 0b11000000) == 0b10000000) {
|
||||
self->utf8Codepoint <<= 6;
|
||||
self->utf8Codepoint |= *self->buf & 0b00111111;
|
||||
if (self->utf8Codepoint < self->minCodepoint ||
|
||||
self->utf8Codepoint > 0x10ffff ||
|
||||
(0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
|
||||
return S_REJECT;
|
||||
}
|
||||
// TODO: tell valgrind that utf8Codepoint and minCodepoint are uninitialized once
// the sequence has been validated (presumably via VALGRIND_MAKE_MEM_UNDEFINED).
|
||||
self->callbacks->on_string_data(self->data, self->buf, 1);
|
||||
++self->buf;
|
||||
self->pop();
|
||||
@@ -782,6 +823,8 @@ constexpr inline struct ContinuationTable {
|
||||
continuations[T_S] = singleChar<'s'>;
|
||||
continuations[T_COLON] = singleChar<':'>;
|
||||
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
|
||||
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
|
||||
t_utf8_last_continuation_byte;
|
||||
continuations[T_HEX] = t_hex;
|
||||
continuations[T_DIGIT] = t_digit;
|
||||
continuations[T_ONENINE] = t_onenine;
|
||||
|
||||
Reference in New Issue
Block a user