From 19208c0e0aba0cc8eaba29413bb8677ade71ae56 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Sun, 18 May 2025 11:34:12 -0400 Subject: [PATCH] Pivot to simpler approach. Passes JSONTestSuite --- src/callbacks.h | 127 ++++++++ src/parser.h | 2 +- src/parser3.h | 812 +++++++++++++++++++++++++++++++++++++++++++++++ src/test.cpp | 31 +- src/validate.cpp | 39 +++ 5 files changed, 998 insertions(+), 13 deletions(-) create mode 100644 src/callbacks.h create mode 100644 src/parser3.h create mode 100644 src/validate.cpp diff --git a/src/callbacks.h b/src/callbacks.h new file mode 100644 index 0000000..156bb52 --- /dev/null +++ b/src/callbacks.h @@ -0,0 +1,127 @@ +#pragma once + +#include "weaseljson.h" +#include +#include +#include + +inline Callbacks printCallbacks() { + Callbacks result; + result.on_begin_object = +[](void *) { puts("on_begin_object"); }; + result.on_end_object = +[](void *) { puts("on_end_object"); }; + result.on_begin_string = +[](void *) { puts("on_begin_string"); }; + result.on_string_data = +[](void *, const char *buf, int len) { + printf("on_string_data `%.*s`\n", len, buf); + }; + result.on_end_string = +[](void *) { puts("on_end_string"); }; + result.on_begin_array = +[](void *) { puts("on_begin_array"); }; + result.on_end_array = +[](void *) { puts("on_end_array"); }; + result.on_begin_number = +[](void *) { puts("on_begin_number"); }; + result.on_number_data = +[](void *, const char *buf, int len) { + printf("on_number_data `%.*s`\n", len, buf); + }; + result.on_end_number = +[](void *) { puts("on_end_number"); }; + result.on_true_literal = +[](void *) { puts("on_true_literal"); }; + result.on_false_literal = +[](void *) { puts("on_false_literal"); }; + result.on_null_literal = +[](void *) { puts("on_null_literal"); }; + return result; +} + +struct MinifyState { + bool isKey = false; + struct Cursor { + int64_t index; + bool isObject; + }; + void on_begin_value() { + if (!stack.empty()) { + auto &back = stack.back(); + if (back.isObject && 
back.index % 2 == 0 && back.index > 0) { + printf(","); + } + if (back.isObject && back.index % 2 == 1 && back.index > 0) { + printf(":"); + } + if (!back.isObject && back.index > 0) { + printf(","); + } + ++back.index; + } + } + std::vector stack; +}; + +inline Callbacks minifyCallbacks() { + Callbacks result; + result.on_begin_object = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + state->stack.push_back({0, true}); + printf("{"); + }; + result.on_end_object = +[](void *p) { + auto *state = (MinifyState *)p; + state->stack.pop_back(); + printf("}"); + }; + result.on_begin_string = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + printf("\""); + }; + result.on_string_data = + +[](void *, const char *buf, int len) { printf("%.*s", len, buf); }; + result.on_end_string = +[](void *p) { printf("\""); }; + result.on_begin_array = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + state->stack.push_back({0, false}); + printf("["); + }; + result.on_end_array = +[](void *p) { + auto *state = (MinifyState *)p; + state->stack.pop_back(); + printf("]"); + }; + result.on_begin_number = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + }; + result.on_number_data = + +[](void *, const char *buf, int len) { printf("%.*s", len, buf); }; + result.on_end_number = +[](void *) {}; + result.on_true_literal = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + printf("true"); + }; + result.on_false_literal = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + printf("false"); + }; + result.on_null_literal = +[](void *p) { + auto *state = (MinifyState *)p; + state->on_begin_value(); + printf("null"); + }; + return result; +} + +inline Callbacks noopCallbacks() { + Callbacks result; + result.on_begin_object = +[](void *) {}; + result.on_end_object = +[](void *) {}; + result.on_begin_string = +[](void *) {}; + 
result.on_string_data = +[](void *, const char *buf, int len) {}; + result.on_end_string = +[](void *) {}; + result.on_begin_array = +[](void *) {}; + result.on_end_array = +[](void *) {}; + result.on_begin_number = +[](void *) {}; + result.on_number_data = +[](void *, const char *buf, int len) {}; + result.on_end_number = +[](void *) {}; + result.on_true_literal = +[](void *) {}; + result.on_false_literal = +[](void *) {}; + result.on_null_literal = +[](void *) {}; + return result; +} diff --git a/src/parser.h b/src/parser.h index 38c4bf2..511a470 100644 --- a/src/parser.h +++ b/src/parser.h @@ -88,7 +88,7 @@ struct Parser2 { complete = len == 0; this->buf = buf; this->bufEnd = buf + len; - return table[*(stackPtr - 1)](this); + return keepGoing(this); } Parser2(Parser2 const &) = delete; diff --git a/src/parser3.h b/src/parser3.h new file mode 100644 index 0000000..c2e696a --- /dev/null +++ b/src/parser3.h @@ -0,0 +1,812 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "musttail.h" +#include "tables.h" +#include "weaseljson.h" + +namespace parser3 { + +enum Status { + // Accept input + S_OK, + // Consumed all available input. 
+ S_AGAIN, + // Invalid json + S_REJECT, + // json is too deeply nested + S_OVERFLOW, +}; + +typedef Status (*Continuation)(struct Parser3 *); + +// These appear in the stack of the pushdown +// automata +enum Symbol : uint8_t { + N_JSON, + N_VALUE, + N_OBJECT, + N_OBJECT2, + N_OBJECT3, + N_ARRAY, + N_ARRAY2, + N_ARRAY3, + N_ELEMENT, + N_STRING, + N_STRING2, + N_STRING_FOLLOWING_ESCAPE, + N_NUMBER, + N_INTEGER, + N_INTEGER2, + N_DIGITS, + N_DIGITS2, + N_FRACTION, + N_EXPONENT, + N_SIGN, + N_WHITESPACE, + N_TRUE, + N_FALSE, + N_NULL, + T_R, + T_U, + T_A, + T_L, + T_S, + T_COLON, + T_UTF8_CONTINUATION_BYTE, + T_HEX, + T_DIGIT, + T_ONENINE, + T_EOF, + N_SYMBOL_COUNT, // Must be last +}; +struct Parser3 { + Parser3(const Callbacks *callbacks, void *data) + : callbacks(callbacks), data(data) { + std::ignore = push({N_JSON, T_EOF}); + } + + [[nodiscard]] Status parse(char *buf, int len) { + complete = len == 0; + this->buf = buf; + this->bufEnd = buf + len; + return keepGoing(this); + } + + [[nodiscard]] bool empty() const { return stackPtr == stack; } + void pop() { + assert(!empty()); + --stackPtr; + } + [[nodiscard]] Status push(std::initializer_list symbols) { + if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] { + return S_OVERFLOW; + } + for (int i = symbols.size() - 1; i >= 0; --i) { + *stackPtr++ = *(symbols.begin() + i); + } + return S_OK; + } + [[nodiscard]] int len() const { + auto result = bufEnd - buf; + assert(result >= 0); + return result; + } + Symbol top() const { + assert(!empty()); + return *(stackPtr - 1); + } + + static Status keepGoing(Parser3 *self); + + constexpr static int kMaxStackSize = 1024; + + [[maybe_unused]] void debugPrint(); + char *buf = nullptr; + char *bufEnd = nullptr; + const Callbacks *const callbacks; + void *const data; + Symbol stack[kMaxStackSize]; + Symbol *stackPtr = stack; + bool complete = false; +}; + +inline Status n_json(Parser3 *self) { + self->pop(); + if (auto s = self->push({N_ELEMENT})) { + return s; + 
} + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_value(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '{': + self->pop(); + if (auto s = self->push({N_OBJECT})) { + return s; + } + break; + case '[': + self->pop(); + if (auto s = self->push({N_ARRAY})) { + return s; + } + break; + case '"': + self->pop(); + if (auto s = self->push({N_STRING})) { + return s; + } + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + self->pop(); + if (auto s = self->push({N_NUMBER})) { + return s; + } + break; + case 't': + ++self->buf; + self->pop(); + if (auto s = self->push({T_R, T_U, N_TRUE})) { + return s; + } + break; + case 'f': + ++self->buf; + self->pop(); + if (auto s = self->push({T_A, T_L, T_S, N_FALSE})) { + return s; + } + break; + case 'n': + ++self->buf; + self->pop(); + if (auto s = self->push({T_U, T_L, N_NULL})) { + return s; + } + break; + default: + return S_REJECT; + } + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_object(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if (*self->buf != '{') { + return S_REJECT; + } + ++self->buf; + self->pop(); + if (auto s = self->push({N_WHITESPACE, N_OBJECT2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_object2(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '}': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + case '"': + self->pop(); + if (auto s = self->push( + {N_STRING, N_WHITESPACE, T_COLON, N_ELEMENT, N_OBJECT3})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status n_object3(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '}': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); 
+ case ',': + ++self->buf; + self->pop(); + if (auto s = self->push({N_WHITESPACE, N_STRING, N_WHITESPACE, T_COLON, + N_ELEMENT, N_OBJECT3})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status n_array(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if (*self->buf != '[') { + return S_REJECT; + } + ++self->buf; + self->pop(); + if (auto s = self->push({N_WHITESPACE, N_ARRAY2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_array2(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case ']': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + default: + self->pop(); + if (auto s = self->push({N_VALUE, N_WHITESPACE, N_ARRAY3})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + } +} + +inline Status n_array3(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case ']': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + case ',': + ++self->buf; + self->pop(); + if (auto s = self->push({N_ELEMENT, N_ARRAY3})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status n_element(Parser3 *self) { + self->pop(); + if (auto s = self->push({N_WHITESPACE, N_VALUE, N_WHITESPACE})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_string(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if (*self->buf != '"') { + return S_REJECT; + } + ++self->buf; + self->pop(); + if (auto s = self->push({N_STRING2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_string2(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + // Try subtract and unsigned compare to save a branch? 
+ if (uint8_t(*self->buf) < 0x20) { + return S_REJECT; + } + if (int8_t(*self->buf) > 0) { + // one byte utf-8 encoding + switch (*self->buf) { + case '"': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + case '\\': + ++self->buf; + self->pop(); + if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + ++self->buf; + MUSTTAIL return Parser3::keepGoing(self); + } + } else if ((*self->buf & 0b11100000) == 0b11000000) { + // two byte utf-8 encoding + ++self->buf; + self->pop(); + if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, N_STRING2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + } + if ((*self->buf & 0b11110000) == 0b11100000) { + // three byte utf-8 encoding + ++self->buf; + self->pop(); + if (auto s = self->push( + {T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, N_STRING2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + } else if ((*self->buf & 0b11111000) == 0b11110000) { + // four byte utf-8 encoding + ++self->buf; + self->pop(); + if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, + T_UTF8_CONTINUATION_BYTE, N_STRING2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + } + return S_REJECT; +} + +inline Status n_string_following_escape(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + ++self->buf; + self->pop(); + if (auto s = self->push({N_STRING2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + case 'u': + ++self->buf; + self->pop(); + if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status t_utf8_continuation_byte(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if ((*self->buf 
& 0b11000000) == 0b10000000) { + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + return S_REJECT; +} + +inline Status t_digit(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if ('0' <= *self->buf && *self->buf <= '9') { + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + return S_REJECT; +} + +inline Status t_onenine(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if ('1' <= *self->buf && *self->buf <= '9') { + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + return S_REJECT; +} + +inline Status t_hex(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if (('0' <= *self->buf && *self->buf <= '9') || + ('a' <= *self->buf && *self->buf <= 'f') || + ('A' <= *self->buf && *self->buf <= 'F')) { + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + return S_REJECT; +} + +inline Status n_number(Parser3 *self) { + self->pop(); + if (auto s = self->push({N_INTEGER, N_FRACTION, N_EXPONENT})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); +} + +inline Status n_integer(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '0': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + ++self->buf; + self->pop(); + if (auto s = self->push({N_DIGITS2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + case '-': + ++self->buf; + self->pop(); + if (auto s = self->push({N_INTEGER2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status n_integer2(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '0': + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + case '1': + case '2': + case '3': + 
case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + ++self->buf; + self->pop(); + if (auto s = self->push({N_DIGITS2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status n_digits(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + switch (*self->buf) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + ++self->buf; + self->pop(); + if (auto s = self->push({N_DIGITS2})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + return S_REJECT; + } +} + +inline Status n_digits2(Parser3 *self) { + if (self->len() == 0) { + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + switch (*self->buf) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + ++self->buf; + MUSTTAIL return Parser3::keepGoing(self); + default: + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } +} + +inline Status n_fraction(Parser3 *self) { + if (self->len() == 0) { + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + switch (*self->buf) { + case '.': + ++self->buf; + self->pop(); + if (auto s = self->push({N_DIGITS})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } +} + +inline Status n_exponent(Parser3 *self) { + if (self->len() == 0) { + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + switch (*self->buf) { + case 'e': + case 'E': + ++self->buf; + self->pop(); + if (auto s = self->push({N_SIGN, N_DIGITS})) { + return s; + } + MUSTTAIL return Parser3::keepGoing(self); + default: + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } +} + +inline Status n_sign(Parser3 *self) { + if (self->len() == 0) { + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + switch (*self->buf) { + case '+': 
+  case '-':
+    ++self->buf;
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  default:
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+}
+
+inline Status n_whitespace(Parser3 *self) { // consumes whitespace bytes one at a time, pops on anything else
+  if (self->len() == 0) {
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+  if (tables.whitespace[uint8_t(*self->buf)]) {
+    ++self->buf;
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+  self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
+}
+
+inline Status n_true(Parser3 *self) { // matches the final 'e' of "true"
+  if (self->len() > 0 && *self->buf == 'e') { // guard: at end of input the buffer is empty (buf may be null), so reject instead of reading
+    ++self->buf;
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+  return S_REJECT;
+}
+
+inline Status n_false(Parser3 *self) { // matches the final 'e' of "false"
+  if (self->len() > 0 && *self->buf == 'e') { // guard: never dereference an empty buffer at end of input
+    ++self->buf;
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+  return S_REJECT;
+}
+
+inline Status n_null(Parser3 *self) { // matches the final 'l' of "null"
+  if (self->len() > 0 && *self->buf == 'l') { // guard: never dereference an empty buffer at end of input
+    ++self->buf;
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+  return S_REJECT;
+}
+
+template <char kChar> inline Status singleChar(Parser3 *self) { // matches exactly one byte equal to kChar
+  if (self->len() > 0 && *self->buf == kChar) { // guard: a truncated literal or missing ':' at end of input is rejected
+    ++self->buf;
+    self->pop();
+    MUSTTAIL return Parser3::keepGoing(self);
+  }
+  return S_REJECT;
+}
+
+inline Status t_eof(Parser3 *self) {
+  if (self->len() > 0) {
+    return S_REJECT;
+  }
+  return self->complete ?
S_OK : S_AGAIN;
+}
+
+constexpr inline struct ContinuationTable { // per-Symbol handler table plus a parallel table of debug names
+  constexpr ContinuationTable() {
+    // Defaults
+    for (int i = 0; i < N_SYMBOL_COUNT; ++i) {
+      continuations[i] = +[](struct Parser3 *) {
+        printf("unimplemented\n");
+        return S_REJECT;
+      };
+    }
+    continuations[N_JSON] = n_json;
+    continuations[N_VALUE] = n_value;
+    continuations[N_OBJECT] = n_object;
+    continuations[N_OBJECT2] = n_object2;
+    continuations[N_OBJECT3] = n_object3;
+    continuations[N_ARRAY] = n_array;
+    continuations[N_ARRAY2] = n_array2;
+    continuations[N_ARRAY3] = n_array3;
+    continuations[N_ELEMENT] = n_element;
+    continuations[N_STRING] = n_string;
+    continuations[N_STRING2] = n_string2;
+    continuations[N_STRING_FOLLOWING_ESCAPE] = n_string_following_escape;
+    continuations[N_NUMBER] = n_number;
+    continuations[N_INTEGER] = n_integer;
+    continuations[N_INTEGER2] = n_integer2;
+    continuations[N_DIGITS] = n_digits;
+    continuations[N_DIGITS2] = n_digits2;
+    continuations[N_FRACTION] = n_fraction;
+    continuations[N_EXPONENT] = n_exponent;
+    continuations[N_SIGN] = n_sign;
+    continuations[N_WHITESPACE] = n_whitespace;
+    continuations[N_TRUE] = n_true;
+    continuations[N_FALSE] = n_false;
+    continuations[N_NULL] = n_null;
+    continuations[T_R] = singleChar<'r'>;
+    continuations[T_U] = singleChar<'u'>;
+    continuations[T_A] = singleChar<'a'>;
+    continuations[T_L] = singleChar<'l'>;
+    continuations[T_S] = singleChar<'s'>;
+    continuations[T_COLON] = singleChar<':'>;
+    continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
+    continuations[T_HEX] = t_hex;
+    continuations[T_DIGIT] = t_digit;
+    continuations[T_ONENINE] = t_onenine;
+    continuations[T_EOF] = t_eof;
+    symbolNames[N_JSON] = "n_json";
+    symbolNames[N_VALUE] = "n_value";
+    symbolNames[N_OBJECT] = "n_object";
+    symbolNames[N_OBJECT2] = "n_object2";
+    symbolNames[N_OBJECT3] = "n_object3";
+    symbolNames[N_ARRAY] = "n_array";
+    symbolNames[N_ARRAY2] = "n_array2";
+    symbolNames[N_ARRAY3] = "n_array3";
+    symbolNames[N_ELEMENT] = "n_element";
+    symbolNames[N_STRING] = "n_string";
+    symbolNames[N_STRING2] = "n_string2";
+    symbolNames[N_STRING_FOLLOWING_ESCAPE] = "n_string_following_escape";
+    symbolNames[N_NUMBER] = "n_number";
+    symbolNames[N_INTEGER] = "n_integer";
+    symbolNames[N_INTEGER2] = "n_integer2";
+    symbolNames[N_DIGITS] = "n_digits";
+    symbolNames[N_DIGITS2] = "n_digits2";
+    symbolNames[N_FRACTION] = "n_fraction";
+    symbolNames[N_EXPONENT] = "n_exponent";
+    symbolNames[N_SIGN] = "n_sign";
+    symbolNames[N_WHITESPACE] = "n_whitespace";
+    symbolNames[N_TRUE] = "n_true";
+    symbolNames[N_FALSE] = "n_false";
+    symbolNames[N_NULL] = "n_null";
+    symbolNames[T_R] = "singleChar<'r'>";
+    symbolNames[T_U] = "singleChar<'u'>";
+    symbolNames[T_A] = "singleChar<'a'>";
+    symbolNames[T_L] = "singleChar<'l'>";
+    symbolNames[T_S] = "singleChar<'s'>";
+    symbolNames[T_COLON] = "singleChar<':'>";
+    symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
+    symbolNames[T_HEX] = "t_hex";
+    symbolNames[T_DIGIT] = "t_digit";
+    symbolNames[T_ONENINE] = "t_onenine";
+    symbolNames[T_EOF] = "t_eof";
+  }
+  Continuation continuations[N_SYMBOL_COUNT]{}; // indexed by Symbol; handler run when that symbol is on top of the stack
+  const char *symbolNames[N_SYMBOL_COUNT]{}; // indexed by Symbol; printed by Parser3::debugPrint
+} symbolTables;
+
+inline Status Parser3::keepGoing(Parser3 *self) { // dispatches to the handler for the symbol on top of the stack
+  if (self->len() == 0 && !self->complete) { // buffer drained but more input may follow: ask the caller for another chunk
+    return S_AGAIN;
+  }
+  self->debugPrint(); // NOTE(review): unconditional; prints the whole stack and all remaining input on every step
+  MUSTTAIL return symbolTables.continuations[self->top()](self);
+}
+
+inline void Parser3::debugPrint() { // prints the symbol stack (bottom first), then the unconsumed input
+  for (int i = 0; i < stackPtr - stack; ++i) {
+    printf("%s ", symbolTables.symbolNames[stack[i]]);
+  }
+  printf("\n");
+  for (int i = 0; i < len(); ++i) {
+    if (isprint(uint8_t(buf[i]))) { // isprint needs an unsigned-char value; a negative char is undefined behaviour
+      printf("%c", buf[i]);
+    } else {
+      printf("\\x%02x", uint8_t(buf[i]));
+    }
+  }
+  printf("\n");
+}
+
+} // namespace parser3
diff --git a/src/test.cpp b/src/test.cpp index a846cc2..b081e2c 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -11,7 +11,7 @@ #include #include -#include "parser.h" +#include "parser3.h" // This is the JSON grammar in McKeeman Form. 
@@ -539,28 +539,35 @@ TEST_CASE("parser2") { MinifyState state; { auto copy = json; - Parser2 parser(&c, &state); + parser3::Parser3 parser(&c, &state); int i = 0; for (; i < copy.length() - 1; ++i) { - REQUIRE(parser.parse(copy.data() + i, 1) == Parser2::S_AGAIN); + REQUIRE(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN); } - CHECK(parser.parse(copy.data() + i, 1) == Parser2::S_AGAIN); - CHECK(parser.parse(nullptr, 0) == Parser2::S_OK); + CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN); + CHECK(parser.parse(nullptr, 0) == parser3::S_OK); puts(""); } { std::string copy = "{\"x\": [], \"y\": {}}"; - Parser2 parser(&c, &state); - CHECK(parser.parse(copy.data(), copy.length()) == Parser2::S_AGAIN); - CHECK(parser.parse(nullptr, 0) == Parser2::S_OK); + parser3::Parser3 parser(&c, &state); + CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN); + CHECK(parser.parse(nullptr, 0) == parser3::S_OK); puts(""); } { auto c = noopCallbacks(); std::string copy = "{\"a\":\"a"; - Parser2 parser(&c, &state); - CHECK(parser.parse(copy.data(), copy.length()) == Parser2::S_AGAIN); - CHECK(parser.parse(nullptr, 0) == Parser2::S_REJECT); + parser3::Parser3 parser(&c, &state); + CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN); + CHECK(parser.parse(nullptr, 0) == parser3::S_REJECT); + } + { + auto c = noopCallbacks(); + std::string copy = "["; + parser3::Parser3 parser(&c, &state); + CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN); + CHECK(parser.parse(nullptr, 0) == parser3::S_REJECT); } } @@ -583,7 +590,7 @@ TEST_CASE("bench2") { bench.unit("byte"); bench.run("parser2", [&]() { auto copy = json; - Parser2 parser(&c, nullptr); + parser3::Parser3 parser(&c, nullptr); bench.doNotOptimizeAway(parser.parse(copy.data(), copy.length())); }); } diff --git a/src/validate.cpp b/src/validate.cpp new file mode 100644 index 0000000..58d5fc7 --- /dev/null +++ b/src/validate.cpp @@ -0,0 +1,39 @@ +#include +#include + +#include 
"callbacks.h" +#include "parser3.h" + +int main(int argc, char **argv) { + if (argc < 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + int fd = open(argv[1], O_RDONLY); + if (fd == -1) { + perror("open"); + return 1; + } + auto c = noopCallbacks(); + parser3::Parser3 parser(&c, nullptr); + for (;;) { + char buf[1024]; + int l = read(fd, buf, sizeof(buf)); + if (l == -1) { + perror("read"); + return 1; + } + switch (parser.parse(buf, l)) { + case parser3::S_OK: + return 0; + case parser3::S_AGAIN: + continue; + case parser3::S_REJECT: + case parser3::S_OVERFLOW: + return 1; + } + if (l == 0) { + return 1; + } + } +}