1087 lines
28 KiB
C++
1087 lines
28 KiB
C++
#pragma once
|
|
|
|
#include <cassert>
|
|
#include <cctype>
|
|
#include <cstdint>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <initializer_list>
|
|
#include <iterator>
|
|
#include <tuple>
|
|
#include <utility>
|
|
|
|
#include "musttail.h"
|
|
#include "tables.h"
|
|
#include "weaseljson.h"
|
|
|
|
namespace parser3 {
|
|
|
|
typedef WeaselJsonStatus (*Continuation)(struct Parser3 *);
|
|
|
|
// Symbols that can appear on the stack of the pushdown automaton. Each one
// indexes the per-symbol tables, so the numeric order below must not change.
enum Symbol : uint8_t {
  // Values and containers
  N_VALUE, N_OBJECT2, N_OBJECT3, N_ARRAY2, N_ARRAY3,

  // Strings
  N_STRING, N_STRING2, N_STRING_FOLLOWING_ESCAPE,

  // Numbers
  N_INTEGER2, N_DIGITS, N_DIGITS2, N_FRACTION, N_EXPONENT, N_SIGN,

  // Whitespace and the tails of the true/false/null literals
  N_WHITESPACE, N_TRUE, N_FALSE, N_NULL,

  // Single expected bytes
  T_R,
  T_U,
  T_U2, // u inside of a string
  T_A, T_L, T_S, T_COLON,

  // Bytes following the lead byte of a multi-byte utf-8 sequence
  T_UTF8_CONTINUATION_BYTE, T_UTF8_LAST_CONTINUATION_BYTE,

  // Hex digits of a \u escape
  T_HEX, T_HEX2, T_HEX3,

  T_DIGIT, T_ONENINE, T_EOF, T_END_NUMBER, T_BACKSLASH,

  N_SYMBOL_COUNT, // Must be last
};
|
|
struct Parser3 {
|
|
Parser3(const WeaselJsonCallbacks *callbacks, void *data)
|
|
: callbacks(callbacks), data(data) {
|
|
std::ignore = push({N_VALUE, N_WHITESPACE, T_EOF});
|
|
}
|
|
|
|
[[nodiscard]] WeaselJsonStatus parse(char *buf, int len) {
|
|
complete = len == 0;
|
|
this->buf = this->dataBegin = this->writeBuf = buf;
|
|
this->bufEnd = buf + len;
|
|
return keepGoing(this);
|
|
}
|
|
|
|
void flushNumber() {
|
|
int len = buf - dataBegin;
|
|
if (len > 0) {
|
|
callbacks->on_number_data(data, dataBegin, len);
|
|
}
|
|
}
|
|
|
|
void flushString() {
|
|
int len = writeBuf - dataBegin;
|
|
if (len > 0) {
|
|
callbacks->on_string_data(data, dataBegin, len);
|
|
}
|
|
dataBegin = writeBuf;
|
|
}
|
|
|
|
[[nodiscard]] bool empty() const { return stackPtr == stack; }
|
|
void pop() {
|
|
assert(!empty());
|
|
--stackPtr;
|
|
}
|
|
[[nodiscard]] WeaselJsonStatus push(std::initializer_list<Symbol> symbols) {
|
|
if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] {
|
|
return WeaselJson_OVERFLOW;
|
|
}
|
|
for (int i = symbols.size() - 1; i >= 0; --i) {
|
|
*stackPtr++ = *(symbols.begin() + i);
|
|
}
|
|
return WeaselJson_OK;
|
|
}
|
|
[[nodiscard]] int len() const {
|
|
auto result = bufEnd - buf;
|
|
assert(result >= 0);
|
|
return result;
|
|
}
|
|
Symbol top() const {
|
|
assert(!empty());
|
|
return *(stackPtr - 1);
|
|
}
|
|
|
|
static WeaselJsonStatus keepGoing(Parser3 *self);
|
|
|
|
constexpr static int kMaxStackSize = 1024;
|
|
|
|
[[maybe_unused]] void debugPrint();
|
|
// Pointer to the next byte in the input to consume
|
|
char *buf = nullptr;
|
|
// Pointer past the end of the last byte available to consume
|
|
char *bufEnd = nullptr;
|
|
// Used for flushing pending data with on_*_data callbacks
|
|
char *dataBegin;
|
|
// Used for unescaping string data in place
|
|
char *writeBuf;
|
|
const WeaselJsonCallbacks *const callbacks;
|
|
void *const data;
|
|
Symbol stack[kMaxStackSize];
|
|
Symbol *stackPtr = stack;
|
|
bool complete = false;
|
|
uint32_t utf8Codepoint;
|
|
uint32_t utf16Surrogate;
|
|
uint32_t minCodepoint;
|
|
};
|
|
|
|
// Consumes optional whitespace. Accepts an empty buffer, in which case it
// just steps aside. Returns WeaselJson_AGAIN if the buffer ends while still
// inside whitespace, so that it resumes on the next buffer.
inline WeaselJsonStatus n_whitespace(Parser3 *self) {
  if (self->len() != 0) {
    while (tables.whitespace[uint8_t(*self->buf)]) {
      if (++self->buf == self->bufEnd) {
        return WeaselJson_AGAIN;
      }
    }
  }
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Starts a JSON value. Skips leading whitespace, then uses the first byte to
// decide which kind of value follows, fires the matching on_begin_* or
// literal callback and replaces itself on the stack with the symbols that
// parse the rest of that value. Requires at least one byte of input.
inline WeaselJsonStatus n_value(Parser3 *self) {
  assert(self->len() != 0);
  while (tables.whitespace[uint8_t(*self->buf)]) {
    if (++self->buf == self->bufEnd) {
      return WeaselJson_AGAIN;
    }
  }

  const char first = *self->buf;

  // A leading zero cannot be followed by more integer digits.
  if (first == '0') {
    self->pop();
    if (auto status = self->push({N_FRACTION, N_EXPONENT})) {
      return status;
    }
    self->callbacks->on_begin_number(self->data);
    self->dataBegin = self->buf;
    ++self->buf;
    MUSTTAIL return Parser3::keepGoing(self);
  }

  // Any other number: after '-' an integer part is still required, after a
  // non-zero digit further digits are optional.
  const bool negative = first == '-';
  if (negative || ('1' <= first && first <= '9')) {
    self->pop();
    self->callbacks->on_begin_number(self->data);
    self->dataBegin = self->buf;
    ++self->buf;
    if (auto status =
            negative ? self->push({N_INTEGER2, N_FRACTION, N_EXPONENT})
                     : self->push({N_DIGITS2, N_FRACTION, N_EXPONENT})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }

  switch (first) {
  case '{': {
    self->callbacks->on_begin_object(self->data);
    ++self->buf;
    self->pop();
    if (auto status = self->push({N_OBJECT2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case '[': {
    self->callbacks->on_begin_array(self->data);
    ++self->buf;
    self->pop();
    if (auto status = self->push({N_ARRAY2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case '"': {
    self->callbacks->on_begin_string(self->data);
    ++self->buf;
    self->dataBegin = self->writeBuf = self->buf;
    self->pop();
    if (auto status = self->push({N_STRING2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case 't': {
    ++self->buf;
    self->pop();
    if (self->len() < 3) {
      // The rest of the literal is not in this buffer; match it byte by byte.
      if (auto status = self->push({T_R, T_U, N_TRUE})) {
        return status;
      }
      MUSTTAIL return Parser3::keepGoing(self);
    }
    if (memcmp(self->buf, "rue", 3) != 0) [[unlikely]] {
      return WeaselJson_REJECT;
    }
    self->callbacks->on_true_literal(self->data);
    self->buf += 3;
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case 'f': {
    ++self->buf;
    self->pop();
    if (self->len() < 4) {
      if (auto status = self->push({T_A, T_L, T_S, N_FALSE})) {
        return status;
      }
      MUSTTAIL return Parser3::keepGoing(self);
    }
    if (memcmp(self->buf, "alse", 4) != 0) [[unlikely]] {
      return WeaselJson_REJECT;
    }
    self->callbacks->on_false_literal(self->data);
    self->buf += 4;
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case 'n': {
    ++self->buf;
    self->pop();
    if (self->len() < 3) {
      if (auto status = self->push({T_U, T_L, N_NULL})) {
        return status;
      }
      MUSTTAIL return Parser3::keepGoing(self);
    }
    if (memcmp(self->buf, "ull", 3) != 0) [[unlikely]] {
      return WeaselJson_REJECT;
    }
    self->callbacks->on_null_literal(self->data);
    self->buf += 3;
    MUSTTAIL return Parser3::keepGoing(self);
  }
  default:
    [[unlikely]] return WeaselJson_REJECT;
  }
}
|
|
|
|
// Handles what follows '{': either '}' ending an empty object, or the
// opening quote of the first key, after which a colon, a value and the rest
// of the object are expected. Requires at least one byte of input.
inline WeaselJsonStatus n_object2(Parser3 *self) {
  assert(self->len() != 0);
  while (tables.whitespace[uint8_t(*self->buf)]) {
    if (++self->buf == self->bufEnd) {
      return WeaselJson_AGAIN;
    }
  }

  const char next = *self->buf;
  if (next == '}') {
    ++self->buf;
    self->pop();
    self->callbacks->on_end_object(self->data);
    MUSTTAIL return Parser3::keepGoing(self);
  }
  if (next != '"') [[unlikely]] {
    return WeaselJson_REJECT;
  }

  self->callbacks->on_begin_string(self->data);
  ++self->buf;
  self->dataBegin = self->writeBuf = self->buf;
  self->pop();
  if (auto status = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) {
    return status;
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Handles what follows a member's value: '}' ends the object, ',' announces
// another key/value pair. Requires at least one byte of input.
inline WeaselJsonStatus n_object3(Parser3 *self) {
  assert(self->len() != 0);
  while (tables.whitespace[uint8_t(*self->buf)]) {
    if (++self->buf == self->bufEnd) {
      return WeaselJson_AGAIN;
    }
  }

  const char next = *self->buf;
  if (next == '}') {
    ++self->buf;
    self->pop();
    self->callbacks->on_end_object(self->data);
    MUSTTAIL return Parser3::keepGoing(self);
  }
  if (next != ',') [[unlikely]] {
    return WeaselJson_REJECT;
  }

  ++self->buf;
  self->pop();
  if (auto status = self->push({N_STRING, T_COLON, N_VALUE, N_OBJECT3})) {
    return status;
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Handles what follows '[': ']' ends an empty array. Any other byte is left
// unconsumed and must start the first element. Requires at least one byte of
// input.
inline WeaselJsonStatus n_array2(Parser3 *self) {
  assert(self->len() != 0);
  while (tables.whitespace[uint8_t(*self->buf)]) {
    if (++self->buf == self->bufEnd) {
      return WeaselJson_AGAIN;
    }
  }

  if (*self->buf == ']') {
    ++self->buf;
    self->pop();
    self->callbacks->on_end_array(self->data);
    MUSTTAIL return Parser3::keepGoing(self);
  }

  self->pop();
  if (auto status = self->push({N_VALUE, N_ARRAY3})) {
    return status;
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Handles what follows an array element: ']' ends the array, ',' announces
// another element. Requires at least one byte of input.
inline WeaselJsonStatus n_array3(Parser3 *self) {
  assert(self->len() != 0);
  while (tables.whitespace[uint8_t(*self->buf)]) {
    if (++self->buf == self->bufEnd) {
      return WeaselJson_AGAIN;
    }
  }

  const char next = *self->buf;
  if (next == ']') {
    ++self->buf;
    self->pop();
    self->callbacks->on_end_array(self->data);
    MUSTTAIL return Parser3::keepGoing(self);
  }
  if (next != ',') [[unlikely]] {
    return WeaselJson_REJECT;
  }

  ++self->buf;
  self->pop();
  if (auto status = self->push({N_VALUE, N_ARRAY3})) {
    return status;
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Expects a string: skips whitespace, requires the opening quote, announces
// the string and hands over to N_STRING2 for its contents. Requires at least
// one byte of input.
inline WeaselJsonStatus n_string(Parser3 *self) {
  assert(self->len() != 0);
  while (tables.whitespace[uint8_t(*self->buf)]) {
    if (++self->buf == self->bufEnd) {
      return WeaselJson_AGAIN;
    }
  }

  if (*self->buf == '"') {
    self->callbacks->on_begin_string(self->data);
    ++self->buf;
    // String data starts right after the quote and is rewritten in place.
    self->dataBegin = self->writeBuf = self->buf;
    self->pop();
    if (auto status = self->push({N_STRING2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  } else [[unlikely]] {
    return WeaselJson_REJECT;
  }
}
|
|
|
|
// Consumes the inside of a string. Scans a run of ordinary bytes, moves the
// run down to writeBuf (earlier escapes may have left a gap between writeBuf
// and buf), then acts on the byte that ended the run: closing quote,
// backslash, or the lead byte of a multi-byte utf-8 sequence.
inline WeaselJsonStatus n_string2(Parser3 *self) {
  char *const runBegin = self->buf;
  // Appends the bytes scanned so far, [runBegin, buf), to the output.
  const auto appendRun = [self, runBegin]() {
    int runLen = self->buf - runBegin;
    if (self->writeBuf != runBegin) {
      memmove(self->writeBuf, runBegin, runLen);
    }
    self->writeBuf += runLen;
  };

  auto kind = tables.stringByteMeaning[uint8_t(*self->buf)];
  while (kind == Tables::NORMAL) {
    if (++self->buf == self->bufEnd) {
      // Out of input in the middle of the string.
      appendRun();
      MUSTTAIL return Parser3::keepGoing(self);
    }
    kind = tables.stringByteMeaning[uint8_t(*self->buf)];
  }
  appendRun();

  switch (kind) {
  case Tables::NORMAL:
    __builtin_unreachable();
  case Tables::DUBQUOTE: {
    self->flushString();
    self->callbacks->on_end_string(self->data);
    ++self->buf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case Tables::BACKSLASH: {
    ++self->buf;
    self->pop();
    if (auto status = self->push({N_STRING_FOLLOWING_ESCAPE})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case Tables::TWO_BYTE_UTF8: {
    // Lead byte carries 5 payload bits; one continuation byte follows.
    self->utf8Codepoint = *self->buf & 0b00011111;
    self->minCodepoint = 0x80;
    *self->writeBuf++ = *self->buf++;
    self->pop();
    if (auto status = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case Tables::THREE_BYTE_UTF8: {
    // Lead byte carries 4 payload bits; two continuation bytes follow.
    self->utf8Codepoint = *self->buf & 0b00001111;
    self->minCodepoint = 0x800;
    *self->writeBuf++ = *self->buf++;
    self->pop();
    if (auto status = self->push({T_UTF8_CONTINUATION_BYTE,
                                  T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case Tables::FOUR_BYTE_UTF8: {
    // Lead byte carries 3 payload bits; three continuation bytes follow.
    self->utf8Codepoint = *self->buf & 0b00000111;
    self->minCodepoint = 0x10000;
    *self->writeBuf++ = *self->buf++;
    self->pop();
    if (auto status =
            self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
                        T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  case Tables::CONTINUATION_BYTE:
  case Tables::INVALID:
    [[unlikely]] return WeaselJson_REJECT;
  }
}
|
|
|
|
// Handles the byte after a backslash inside a string. The single-character
// escapes are translated through tables.unescape and written to the output;
// 'u' starts a four-hex-digit escape. Anything else is rejected.
inline WeaselJsonStatus n_string_following_escape(Parser3 *self) {
  const char escape = *self->buf;

  if (escape == 'u') {
    ++self->buf;
    self->utf8Codepoint = 0;
    self->pop();
    if (auto status = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }

  switch (escape) {
  case '"':
  case '\\':
  case '/':
  case 'b':
  case 'f':
  case 'n':
  case 'r':
  case 't':
    break;
  default:
    [[unlikely]] return WeaselJson_REJECT;
  }

  *self->writeBuf++ = tables.unescape[escape];
  ++self->buf;
  self->pop();
  if (auto status = self->push({N_STRING2})) {
    return status;
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes a utf-8 continuation byte that is not the last one of its
// sequence: folds its six payload bits into utf8Codepoint and copies the
// byte to the output.
inline WeaselJsonStatus t_utf8_continuation_byte(Parser3 *self) {
  const bool isContinuation =
      tables.stringByteMeaning[uint8_t(*self->buf)] == Tables::CONTINUATION_BYTE;
  if (!isContinuation) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->utf8Codepoint = (self->utf8Codepoint << 6) | (*self->buf & 0b00111111);
  *self->writeBuf++ = *self->buf++;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes the final continuation byte of a utf-8 sequence and validates the
// completed code point: it must not be an overlong encoding (below
// minCodepoint), must not exceed 0x10ffff and must not be a utf-16
// surrogate.
inline WeaselJsonStatus t_utf8_last_continuation_byte(Parser3 *self) {
  const bool isContinuation =
      tables.stringByteMeaning[uint8_t(*self->buf)] == Tables::CONTINUATION_BYTE;
  if (!isContinuation) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->utf8Codepoint = (self->utf8Codepoint << 6) | (*self->buf & 0b00111111);

  const uint32_t codepoint = self->utf8Codepoint;
  const bool overlong = codepoint < self->minCodepoint;
  const bool outOfRange = codepoint > 0x10ffff;
  const bool surrogate = 0xd800 <= codepoint && codepoint <= 0xdfff;
  if (overlong || outOfRange || surrogate) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
  *self->writeBuf++ = *self->buf++;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes exactly one decimal digit, rejecting any other byte.
inline WeaselJsonStatus t_digit(Parser3 *self) {
  const char c = *self->buf;
  if (c < '0' || c > '9') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++self->buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes exactly one digit in the range 1-9, rejecting any other byte.
inline WeaselJsonStatus t_onenine(Parser3 *self) {
  const char c = *self->buf;
  if (c < '1' || c > '9') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++self->buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes one hex digit of a \u escape and shifts it into utf8Codepoint.
inline WeaselJsonStatus t_hex(Parser3 *self) {
  // The accumulator is shifted before the digit is validated.
  self->utf8Codepoint <<= 4;

  const char c = *self->buf;
  int nibble;
  if ('0' <= c && c <= '9') {
    nibble = c - '0';
  } else if ('a' <= c && c <= 'f') {
    nibble = 10 + c - 'a';
  } else if ('A' <= c && c <= 'F') {
    nibble = 10 + c - 'A';
  } else [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->utf8Codepoint |= nibble;

  ++self->buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes the fourth hex digit of a \u escape and emits the code point as
// utf-8. A code point in the surrogate range is not emitted: it is stored in
// utf16Surrogate and a second \u escape is required, finished by T_HEX3.
inline WeaselJsonStatus t_hex2(Parser3 *self) {
  self->utf8Codepoint <<= 4;
  const char c = *self->buf;
  int nibble;
  if ('0' <= c && c <= '9') {
    nibble = c - '0';
  } else if ('a' <= c && c <= 'f') {
    nibble = 10 + c - 'a';
  } else if ('A' <= c && c <= 'F') {
    nibble = 10 + c - 'A';
  } else [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->utf8Codepoint |= nibble;
  ++self->buf;

  // The encoded bytes go into the already-consumed part of the input buffer,
  // [writeBuf, buf), when they fit. Otherwise pending data is flushed and the
  // bytes are delivered from a scratch array instead.
  char scratch[3];
  if (self->utf8Codepoint < 0x80) {
    assert(self->buf - self->writeBuf >= 1);
    *self->writeBuf++ = self->utf8Codepoint;
  } else if (self->utf8Codepoint < 0x800) {
    const bool spill = self->buf - self->writeBuf < 2;
    if (spill) {
      self->flushString();
    }
    char *out = spill ? scratch : self->writeBuf;
    out[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
    self->utf8Codepoint >>= 6;
    out[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
    if (spill) {
      self->callbacks->on_string_data(self->data, scratch, 2);
    } else {
      self->writeBuf += 2;
    }
  } else {
    assert(self->utf8Codepoint < 0x10000);
    const bool surrogate =
        0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff;
    if (surrogate) {
      // utf-16 surrogate
      self->utf16Surrogate = self->utf8Codepoint;
      self->utf8Codepoint = 0;
      self->pop();
      if (auto status = self->push(
              {T_BACKSLASH, T_U2, T_HEX, T_HEX, T_HEX, T_HEX3})) {
        return status;
      }
      MUSTTAIL return Parser3::keepGoing(self);
    }
    const bool spill = self->buf - self->writeBuf < 3;
    if (spill) {
      self->flushString();
    }
    char *out = spill ? scratch : self->writeBuf;
    out[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
    self->utf8Codepoint >>= 6;
    out[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
    self->utf8Codepoint >>= 6;
    out[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
    if (spill) {
      self->callbacks->on_string_data(self->data, scratch, 3);
    } else {
      self->writeBuf += 3;
    }
  }

  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes the last hex digit of the second \u escape of a surrogate pair,
// combines it with the stored first half and emits the resulting code point
// as four utf-8 bytes.
inline WeaselJsonStatus t_hex3(Parser3 *self) {
  self->utf8Codepoint <<= 4;
  const char c = *self->buf;
  int nibble;
  if ('0' <= c && c <= '9') {
    nibble = c - '0';
  } else if ('a' <= c && c <= 'f') {
    nibble = 10 + c - 'a';
  } else if ('A' <= c && c <= 'F') {
    nibble = 10 + c - 'A';
  } else [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->utf8Codepoint |= nibble;
  ++self->buf;

  // The second escape must be a low surrogate.
  const bool lowSurrogate =
      0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff;
  if (!lowSurrogate) [[unlikely]] {
    return WeaselJson_REJECT;
  }

  // Decode utf16 surrogate pair
  self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
                        (self->utf8Codepoint - 0xdc00);

  assert(self->utf8Codepoint >= 0x10000);
  // The first half was only known to be some surrogate; if it was itself a
  // low surrogate the combined value ends up above the valid range.
  if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] {
    return WeaselJson_REJECT;
  }

  // The encoded bytes go into the already-consumed part of the input buffer,
  // [writeBuf, buf), when they fit. Otherwise pending data is flushed and the
  // bytes are delivered from a scratch array instead.
  char scratch[4];
  const bool spill = self->buf - self->writeBuf < 4;
  if (spill) {
    self->flushString();
  }
  char *out = spill ? scratch : self->writeBuf;
  out[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
  self->utf8Codepoint >>= 6;
  out[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
  self->utf8Codepoint >>= 6;
  out[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
  self->utf8Codepoint >>= 6;
  out[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
  if (spill) {
    self->callbacks->on_string_data(self->data, scratch, 4);
  } else {
    self->writeBuf += 4;
  }

  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Starts the integer part of a number: announces the number, records where
// its text begins and consumes the first byte ('0', a non-zero digit, or
// '-'). Not registered in the continuation table in this file.
inline WeaselJsonStatus n_integer(Parser3 *self) {
  self->callbacks->on_begin_number(self->data);
  self->dataBegin = self->buf;

  const char c = *self->buf;
  if (c == '0') {
    ++self->buf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }
  if ('1' <= c && c <= '9') {
    ++self->buf;
    self->pop();
    if (auto status = self->push({N_DIGITS2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  if (c == '-') {
    ++self->buf;
    self->pop();
    if (auto status = self->push({N_INTEGER2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  [[unlikely]] return WeaselJson_REJECT;
}
|
|
|
|
// Consumes the first digit of the integer part: a lone '0', or a non-zero
// digit that may be followed by more digits.
inline WeaselJsonStatus n_integer2(Parser3 *self) {
  const char c = *self->buf;
  if (c == '0') {
    ++self->buf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }
  if ('1' <= c && c <= '9') {
    ++self->buf;
    self->pop();
    if (auto status = self->push({N_DIGITS2})) {
      return status;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  }
  [[unlikely]] return WeaselJson_REJECT;
}
|
|
|
|
// Requires one decimal digit, then allows any number of further digits.
inline WeaselJsonStatus n_digits(Parser3 *self) {
  const char c = *self->buf;
  if (c < '0' || c > '9') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++self->buf;
  self->pop();
  if (auto status = self->push({N_DIGITS2})) {
    return status;
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes zero or more decimal digits. It stays on the stack while digits
// keep coming and removes itself at the first non-digit or at the end of the
// input. Accepts an empty buffer.
inline WeaselJsonStatus n_digits2(Parser3 *self) {
  const bool atDigit =
      self->len() != 0 && '0' <= *self->buf && *self->buf <= '9';
  if (atDigit) {
    ++self->buf;
  } else {
    self->pop();
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Optional fraction: if the next byte is '.', consumes it and requires
// digits; otherwise consumes nothing. Accepts an empty buffer.
inline WeaselJsonStatus n_fraction(Parser3 *self) {
  const bool hasPoint = self->len() != 0 && *self->buf == '.';
  if (hasPoint) {
    ++self->buf;
  }
  self->pop();
  if (hasPoint) {
    if (auto status = self->push({N_DIGITS})) {
      return status;
    }
  }
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Responsible for ensuring that on_end_number gets called
|
|
inline WeaselJsonStatus n_exponent(Parser3 *self) {
|
|
if (self->len() == 0) {
|
|
self->pop();
|
|
MUSTTAIL return Parser3::keepGoing(self);
|
|
}
|
|
switch (*self->buf) {
|
|
case 'e':
|
|
case 'E':
|
|
++self->buf;
|
|
self->pop();
|
|
if (auto s = self->push({N_SIGN, N_DIGITS, T_END_NUMBER})) {
|
|
return s;
|
|
}
|
|
MUSTTAIL return Parser3::keepGoing(self);
|
|
default:
|
|
self->pop();
|
|
self->flushNumber();
|
|
self->callbacks->on_end_number(self->data);
|
|
MUSTTAIL return Parser3::keepGoing(self);
|
|
}
|
|
}
|
|
|
|
// Optional sign: consumes a '+' or '-' if one is next, otherwise consumes
// nothing. Accepts an empty buffer.
inline WeaselJsonStatus n_sign(Parser3 *self) {
  if (self->len() != 0 && (*self->buf == '+' || *self->buf == '-')) {
    ++self->buf;
  }
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes the final 'e' of `true` and reports the literal.
inline WeaselJsonStatus n_true(Parser3 *self) {
  if (*self->buf != 'e') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++self->buf;
  self->pop();
  self->callbacks->on_true_literal(self->data);
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes the final 'e' of `false` and reports the literal.
inline WeaselJsonStatus n_false(Parser3 *self) {
  if (*self->buf != 'e') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++self->buf;
  self->pop();
  self->callbacks->on_false_literal(self->data);
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
// Consumes the final 'l' of `null` and reports the literal.
inline WeaselJsonStatus n_null(Parser3 *self) {
  if (*self->buf != 'l') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++self->buf;
  self->pop();
  self->callbacks->on_null_literal(self->data);
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
template <char kChar, bool kSkipWhitespace = false>
|
|
inline WeaselJsonStatus singleChar(Parser3 *self) {
|
|
if constexpr (kSkipWhitespace) {
|
|
assert(self->len() != 0);
|
|
while (tables.whitespace[uint8_t(*self->buf)]) {
|
|
++self->buf;
|
|
if (self->buf == self->bufEnd) {
|
|
return WeaselJson_AGAIN;
|
|
}
|
|
}
|
|
}
|
|
if (*self->buf == kChar) {
|
|
++self->buf;
|
|
self->pop();
|
|
MUSTTAIL return Parser3::keepGoing(self);
|
|
} else [[unlikely]] {
|
|
return WeaselJson_REJECT;
|
|
}
|
|
}
|
|
|
|
// Bottom of the stack: nothing may follow the document. Reports success once
// the caller has signalled the end of the input, and asks for more until
// then. Never removes itself from the stack.
inline WeaselJsonStatus t_eof(Parser3 *self) {
  if (self->len() > 0) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  if (self->complete) {
    return WeaselJson_OK;
  }
  return WeaselJson_AGAIN;
}
|
|
|
|
// Ends a number that had an exponent: consumes no input, reports any pending
// number text and then fires on_end_number.
inline WeaselJsonStatus t_end_number(Parser3 *self) {
  self->pop();
  // Hand over whatever part of the number is still unreported first.
  self->flushNumber();
  const auto *cb = self->callbacks;
  cb->on_end_number(self->data);
  MUSTTAIL return Parser3::keepGoing(self);
}
|
|
|
|
constexpr inline struct ContinuationTable {
|
|
constexpr ContinuationTable() {
|
|
// Defaults
|
|
for (int i = 0; i < N_SYMBOL_COUNT; ++i) {
|
|
continuations[i] = +[](struct Parser3 *) {
|
|
printf("unimplemented\n");
|
|
return WeaselJson_REJECT;
|
|
};
|
|
}
|
|
continuations[N_VALUE] = n_value;
|
|
continuations[N_OBJECT2] = n_object2;
|
|
continuations[N_OBJECT3] = n_object3;
|
|
continuations[N_ARRAY2] = n_array2;
|
|
continuations[N_ARRAY3] = n_array3;
|
|
continuations[N_STRING] = n_string;
|
|
continuations[N_STRING2] = n_string2;
|
|
continuations[N_STRING_FOLLOWING_ESCAPE] = n_string_following_escape;
|
|
continuations[N_INTEGER2] = n_integer2;
|
|
continuations[N_DIGITS] = n_digits;
|
|
continuations[N_DIGITS2] = n_digits2;
|
|
continuations[N_FRACTION] = n_fraction;
|
|
continuations[N_EXPONENT] = n_exponent;
|
|
continuations[N_SIGN] = n_sign;
|
|
continuations[N_WHITESPACE] = n_whitespace;
|
|
continuations[N_TRUE] = n_true;
|
|
continuations[N_FALSE] = n_false;
|
|
continuations[N_NULL] = n_null;
|
|
continuations[T_R] = singleChar<'r'>;
|
|
continuations[T_U] = singleChar<'u'>;
|
|
continuations[T_U2] = singleChar<'u'>;
|
|
continuations[T_A] = singleChar<'a'>;
|
|
continuations[T_L] = singleChar<'l'>;
|
|
continuations[T_S] = singleChar<'s'>;
|
|
continuations[T_COLON] = singleChar<':', true>;
|
|
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
|
|
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
|
|
t_utf8_last_continuation_byte;
|
|
continuations[T_HEX] = t_hex;
|
|
continuations[T_HEX2] = t_hex2;
|
|
continuations[T_HEX3] = t_hex3;
|
|
continuations[T_DIGIT] = t_digit;
|
|
continuations[T_ONENINE] = t_onenine;
|
|
continuations[T_EOF] = t_eof;
|
|
continuations[T_END_NUMBER] = t_end_number;
|
|
continuations[T_BACKSLASH] = singleChar<'\\'>;
|
|
|
|
symbolNames[N_VALUE] = "n_value";
|
|
symbolNames[N_OBJECT2] = "n_object2";
|
|
symbolNames[N_OBJECT3] = "n_object3";
|
|
symbolNames[N_ARRAY2] = "n_array2";
|
|
symbolNames[N_ARRAY3] = "n_array3";
|
|
symbolNames[N_STRING] = "n_string";
|
|
symbolNames[N_STRING2] = "n_string2";
|
|
symbolNames[N_STRING_FOLLOWING_ESCAPE] = "n_string_following_escape";
|
|
symbolNames[N_INTEGER2] = "n_integer2";
|
|
symbolNames[N_DIGITS] = "n_digits";
|
|
symbolNames[N_DIGITS2] = "n_digits2";
|
|
symbolNames[N_FRACTION] = "n_fraction";
|
|
symbolNames[N_EXPONENT] = "n_exponent";
|
|
symbolNames[N_SIGN] = "n_sign";
|
|
symbolNames[N_WHITESPACE] = "n_whitespace";
|
|
symbolNames[N_TRUE] = "n_true";
|
|
symbolNames[N_FALSE] = "n_false";
|
|
symbolNames[N_NULL] = "n_null";
|
|
symbolNames[T_R] = "singleChar<'r'>";
|
|
symbolNames[T_U] = "singleChar<'u'>";
|
|
symbolNames[T_U2] = "singleChar<'u'> (in string)";
|
|
symbolNames[T_A] = "singleChar<'a'>";
|
|
symbolNames[T_L] = "singleChar<'l'>";
|
|
symbolNames[T_S] = "singleChar<'s'>";
|
|
symbolNames[T_COLON] = "singleChar<':'>";
|
|
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
|
|
symbolNames[T_HEX] = "t_hex";
|
|
symbolNames[T_HEX2] = "t_hex2";
|
|
symbolNames[T_HEX3] = "t_hex3";
|
|
symbolNames[T_DIGIT] = "t_digit";
|
|
symbolNames[T_ONENINE] = "t_onenine";
|
|
symbolNames[T_EOF] = "t_eof";
|
|
symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
|
|
symbolNames[T_END_NUMBER] = "t_end_number";
|
|
|
|
// All others can assume that there's at least one byte when they're called
|
|
acceptsEmptyString[N_DIGITS2] = true;
|
|
acceptsEmptyString[N_FRACTION] = true;
|
|
acceptsEmptyString[N_EXPONENT] = true;
|
|
acceptsEmptyString[N_SIGN] = true;
|
|
acceptsEmptyString[N_WHITESPACE] = true;
|
|
acceptsEmptyString[T_EOF] = true;
|
|
acceptsEmptyString[T_END_NUMBER] = true;
|
|
}
|
|
Continuation continuations[N_SYMBOL_COUNT]{};
|
|
const char *symbolNames[N_SYMBOL_COUNT]{};
|
|
bool acceptsEmptyString[N_SYMBOL_COUNT]{};
|
|
} symbolTables;
|
|
|
|
// Central dispatch: runs the handler of the symbol on top of the stack.
//
// When the current buffer is used up there are two cases. If the caller has
// not signalled the end of the input yet, any partially scanned number or
// string text is reported (those bytes live in the caller's buffer) and
// WeaselJson_AGAIN is returned. If the input is complete, only handlers that
// accept an empty buffer may still run; otherwise the document is truncated
// and is rejected.
inline WeaselJsonStatus Parser3::keepGoing(Parser3 *self) {
  // self->debugPrint();
  const bool exhausted = self->len() == 0;

  if (exhausted && !self->complete) {
    switch (self->top()) {
    // In the middle of a number
    case N_INTEGER2:
    case N_DIGITS:
    case N_DIGITS2:
    case N_FRACTION:
    case N_EXPONENT:
    case N_SIGN:
    case T_DIGIT:
    case T_ONENINE:
    case T_END_NUMBER:
      self->flushNumber();
      break;
    // In the middle of a string
    case N_STRING:
    case N_STRING2:
    case N_STRING_FOLLOWING_ESCAPE:
    case T_UTF8_CONTINUATION_BYTE:
    case T_UTF8_LAST_CONTINUATION_BYTE:
    case T_HEX:
    case T_HEX2:
    case T_HEX3:
    case T_BACKSLASH:
    case T_U2:
      self->flushString();
      break;
    // Nothing pending
    case N_VALUE:
    case N_OBJECT2:
    case N_OBJECT3:
    case N_ARRAY2:
    case N_ARRAY3:
    case N_WHITESPACE:
    case N_TRUE:
    case N_FALSE:
    case N_NULL:
    case T_R:
    case T_U:
    case T_A:
    case T_L:
    case T_S:
    case T_COLON:
    case T_EOF:
    case N_SYMBOL_COUNT:
      break;
    }
    return WeaselJson_AGAIN;
  }

  if (exhausted && !symbolTables.acceptsEmptyString[self->top()])
      [[unlikely]] {
    return WeaselJson_REJECT;
  }

  MUSTTAIL return symbolTables.continuations[self->top()](self);
}
|
|
|
|
inline void Parser3::debugPrint() {
|
|
for (int i = 0; i < stackPtr - stack; ++i) {
|
|
printf("%s ", symbolTables.symbolNames[stack[i]]);
|
|
}
|
|
printf("\n");
|
|
for (int i = 0; i < len(); ++i) {
|
|
if (isprint(buf[i])) {
|
|
printf("%c", buf[i]);
|
|
} else {
|
|
printf("\\x%02x", uint8_t(buf[i]));
|
|
}
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
} // namespace parser3
|