diff --git a/src/parser.h b/src/parser.h deleted file mode 100644 index 511a470..0000000 --- a/src/parser.h +++ /dev/null @@ -1,448 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "musttail.h" -#include "tables.h" -#include "weaseljson.h" - -// Terminals and Nonterminals. These appear in the stack of the pushdown -// automata -enum Symbol : int8_t { - T_COLON, - T_TRUE, - T_FALSE, - T_NULL, - T_R, - T_U, - T_A, - T_L, - T_S, - T_DUBQUOTE, - T_EOF, - // Nonterminals - N_STRING, // Not including leading double quote, but including trailing quote - N_STRING_FROM_ESCAPE, // Immediately after a backslach - N_NUMBER, - N_VALUE, - N_ARRAY_VALUE_OR_END, - N_OBJECT_VALUE_OR_END, - N_ARRAY_MAYBE_CONTINUE, - N_OBJECT_MAYBE_CONTINUE, - N_WHITESPACE, - N_PAST_END, // Must be last nonterminal -}; - -inline const char *symbolNames[] = { - "T_COLON", - "T_TRUE", - "T_FALSE", - "T_NULL", - "T_R", - "T_U", - "T_A", - "T_L", - "T_S", - "T_DUBQUOTE", - "T_EOF", - "N_STRING", - "N_STRING_FROM_ESCAPE", - "N_NUMBER", - "N_VALUE", - "N_ARRAY_VALUE_OR_END", - "N_OBJECT_VALUE_OR_END", - "N_ARRAY_MAYBE_CONTINUE", - "N_OBJECT_MAYBE_CONTINUE", - "N_WHITESPACE", -}; - -static_assert(sizeof(symbolNames) / sizeof(symbolNames[0]) == N_PAST_END); - -// Table-based ll(1) parser that doesn't handle escaping and all numbers, with a -// streaming interface. Does not validate utf-8. Uses O(1) memory. -struct Parser2 { - Parser2(const Callbacks *callbacks, void *data) - : callbacks(callbacks), data(data) { - std::ignore = push({N_WHITESPACE, N_VALUE, N_WHITESPACE, T_EOF}); - } - - enum Status { - // Accept input - S_OK, - // Consumed available input. Prime more and parse again - S_AGAIN, - // Invalid json - S_REJECT, - // json is too deeply nested - S_OVERFLOW, - }; - - [[nodiscard]] Status parse(char *buf, int len) { - complete = len == 0; - this->buf = buf; - this->bufEnd = buf + len; - return keepGoing(this); - } - - Parser2(Parser2 const &) = delete; - Parser2 &operator=(Parser2 const &) = delete; - Parser2(Parser2 &&) = delete; - Parser2 &operator=(Parser2 &&) = delete; - - static constexpr int kMaxStackSize = 1 << 10; - -private: - // Helpers - void maybeSkipWs() { - while (buf != bufEnd && tables.whitespace[*buf]) { - ++buf; - } - } - Status parse_number() { - char *const bufBefore = buf; - while (len() > 0) { - if (tables.number[*buf]) { - ++buf; - } else { - break; - } - } - if (buf != bufBefore) { - callbacks->on_number_data(data, bufBefore, buf - bufBefore); - } - if (len() == 0 && !complete) { - return S_AGAIN; - } - callbacks->on_end_number(data); - return S_OK; - } - Status parse_string(bool fromEscape) { - auto *result = buf; - if (fromEscape) { - if (*result == '\"') { - ++result; - } - pop(); - if (Status s = push({N_STRING})) { - return s; - } - } - for (;;) { - result = result == nullptr ? nullptr - : (char *)memchr(result, '"', bufEnd - result); - if (result == nullptr) { - if (complete) { - return S_REJECT; - } - callbacks->on_string_data(data, buf, len()); - if (bufEnd[-1] == '\\') { - pop(); - if (Status s = push({N_STRING_FROM_ESCAPE})) { - return s; - } - } - return S_AGAIN; - } - if (result != buf && result[-1] == '\\') { - ++result; - if (result == bufEnd) { - if (complete) { - return S_REJECT; - } - callbacks->on_string_data(data, buf, len()); - return S_AGAIN; - } - continue; - } - break; - } - int stringLen = result - buf; - if (stringLen > 0) { - callbacks->on_string_data(data, buf, stringLen); - } - buf += stringLen + 1; - callbacks->on_end_string(data); - return S_OK; - } - - typedef Status (*continuation)(Parser2 *); - - [[maybe_unused]] void debugPrint() { - for (int i = 0; i < stackPtr - stack; ++i) { - printf("%s ", symbolNames[stack[i]]); - } - printf("\n"); - } - - static Status keepGoing(Parser2 *self) { - if (self->len() == 0 && !self->complete) { - return S_AGAIN; - } - // self->debugPrint(); - MUSTTAIL return table[*(self->stackPtr - 1)](self); - } - - static Status string(Parser2 *self) { - if (Status s = self->parse_string(false)) { - return s; - } - self->pop(); - MUSTTAIL return keepGoing(self); - } - static Status stringFromEscape(Parser2 *self) { - if (Status s = self->parse_string(true)) { - return s; - } - self->pop(); - MUSTTAIL return keepGoing(self); - } - static Status number(Parser2 *self) { - if (Status s = self->parse_number()) { - return s; - } - self->pop(); - MUSTTAIL return keepGoing(self); - } - static Status value(Parser2 *self) { - switch (*self->buf) { - case '{': - ++self->buf; - self->callbacks->on_begin_object(self->data); - self->pop(); - if (Status s = self->push({N_WHITESPACE, N_OBJECT_VALUE_OR_END})) { - return s; - } - break; - case '[': - ++self->buf; - self->callbacks->on_begin_array(self->data); - self->pop(); - if (Status s = self->push({N_WHITESPACE, N_ARRAY_VALUE_OR_END})) { - return s; - } - break; - case '"': - ++self->buf; - self->pop(); - self->callbacks->on_begin_string(self->data); - if (Status s = self->push({N_STRING})) { - return s; - } - break; - case 't': - ++self->buf; - self->pop(); - if (Status s = self->push({T_R, T_U, T_TRUE})) { - return s; - } - break; - case 'f': - ++self->buf; - self->pop(); - if (Status s = self->push({T_A, T_L, T_S, T_FALSE})) { - return s; - } - break; - case 'n': - ++self->buf; - self->pop(); - if (Status s = self->push({T_U, T_L, T_NULL})) { - return s; - } - break; - default: - if (tables.number[*self->buf]) { - self->pop(); - self->callbacks->on_begin_number(self->data); - if (Status s = self->push({N_NUMBER})) { - return s; - } - break; - } - return S_REJECT; - } - MUSTTAIL return keepGoing(self); - } - static Status arrayOrEnd(Parser2 *self) { - if (*self->buf == ']') { - ++self->buf; - self->pop(); - self->callbacks->on_end_array(self->data); - MUSTTAIL return keepGoing(self); - } else { - self->pop(); - if (Status s = - self->push({N_VALUE, N_WHITESPACE, N_ARRAY_MAYBE_CONTINUE})) { - return s; - } - MUSTTAIL return keepGoing(self); - } - } - static Status objectOrEnd(Parser2 *self) { - if (*self->buf == '}') { - ++self->buf; - self->pop(); - self->callbacks->on_end_object(self->data); - MUSTTAIL return keepGoing(self); - } else if (*self->buf == '"') { - self->callbacks->on_begin_string(self->data); - ++self->buf; - self->pop(); - if (Status s = - self->push({N_STRING, N_WHITESPACE, T_COLON, N_WHITESPACE, - N_VALUE, N_WHITESPACE, N_OBJECT_MAYBE_CONTINUE})) { - return s; - } - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status arrayContinue(Parser2 *self) { - if (*self->buf == ',') { - ++self->buf; - self->pop(); - if (Status s = self->push( - {N_WHITESPACE, N_VALUE, N_WHITESPACE, N_ARRAY_MAYBE_CONTINUE})) { - return s; - } - MUSTTAIL return keepGoing(self); - } else if (*self->buf == ']') { - ++self->buf; - self->pop(); - self->callbacks->on_end_array(self->data); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status objectContinue(Parser2 *self) { - if (*self->buf == ',') { - ++self->buf; - self->pop(); - if (Status s = self->push({N_WHITESPACE, T_DUBQUOTE, N_STRING, - N_WHITESPACE, T_COLON, N_WHITESPACE, N_VALUE, - N_WHITESPACE, N_OBJECT_MAYBE_CONTINUE})) { - return s; - } - MUSTTAIL return keepGoing(self); - } else if (*self->buf == '}') { - ++self->buf; - self->pop(); - self->callbacks->on_end_object(self->data); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status finishTrue(Parser2 *self) { - if (*self->buf++ == 'e') { - self->pop(); - self->callbacks->on_true_literal(self->data); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status finishFalse(Parser2 *self) { - if (*self->buf++ == 'e') { - self->pop(); - self->callbacks->on_false_literal(self->data); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status finishNull(Parser2 *self) { - if (*self->buf++ == 'l') { - self->pop(); - self->callbacks->on_null_literal(self->data); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - template static Status singleChar(Parser2 *self) { - if (*self->buf++ == kChar) { - self->pop(); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status dubquote(Parser2 *self) { - if (*self->buf++ == '"') { - self->callbacks->on_begin_string(self->data); - self->pop(); - MUSTTAIL return keepGoing(self); - } - return S_REJECT; - } - static Status whitespace(Parser2 *self) { - self->maybeSkipWs(); - if (self->len() == 0 && !self->complete) { - return S_AGAIN; - } - self->pop(); - MUSTTAIL return keepGoing(self); - } - static Status eof(Parser2 *self) { - if (self->len() > 0) { - return S_REJECT; - } - return self->complete ? S_OK : S_AGAIN; - } - - static constexpr continuation table[] = { - /*T_COLON*/ singleChar<':'>, - /*T_TRUE*/ finishTrue, - /*T_FALSE*/ finishFalse, - /*T_NULL*/ finishNull, - /*T_R*/ singleChar<'r'>, - /*T_U*/ singleChar<'u'>, - /*T_A*/ singleChar<'a'>, - /*T_L*/ singleChar<'l'>, - /*T_S*/ singleChar<'s'>, - /*T_DUBQUOTE*/ dubquote, - /*T_EOF*/ eof, - /*N_STRING*/ string, - /*N_STRING_FROM_ESCAPE*/ stringFromEscape, - /*N_NUMBER*/ number, - /*N_VALUE*/ value, - /*N_ARRAY_VALUE_OR_END*/ arrayOrEnd, - /*N_OBJECT_VALUE_OR_END*/ objectOrEnd, - /*N_ARRAY_MAYBE_CONTINUE*/ arrayContinue, - /*N_OBJECT_MAYBE_CONTINUE*/ objectContinue, - /*N_WHITESPACE*/ whitespace, - }; - - static_assert(sizeof(table) / sizeof(table[0]) == N_PAST_END); - - bool empty() const { return stackPtr == stack; } - void pop() { - assert(!empty()); - --stackPtr; - } - [[nodiscard]] Status push(std::initializer_list symbols) { - if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] { - return S_OVERFLOW; - } - for (int i = symbols.size() - 1; i >= 0; --i) { - *stackPtr++ = *(symbols.begin() + i); - } - return S_OK; - } - int len() const { - auto result = bufEnd - buf; - assert(result >= 0); - return result; - } - - char *buf = nullptr; - char *bufEnd = nullptr; - const Callbacks *const callbacks; - void *const data; - Symbol stack[kMaxStackSize]; - Symbol *stackPtr = stack; - bool complete = false; -};