diff --git a/src/test.cpp b/src/test.cpp index 0fbaebd..d00a7c3 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -114,8 +114,6 @@ // '0009' ws struct Callbacks { - void (*on_begin_value)(void *data) = noop; - void (*on_end_value)(void *data) = noop; void (*on_begin_object)(void *data) = noop; void (*on_end_object)(void *data) = noop; void (*on_begin_string)(void *data) = noop; @@ -137,28 +135,36 @@ private: // Terminals and Nonterminals. These appear in the stack of the pushdown // automata -enum Symbol : uint8_t { +enum Symbol : int8_t { // Terminals + T_INVALID, + T_EOF, T_LBRACE, T_RBRACE, T_COMMA, - T_TRUE, - T_FALSE, - T_NULL, + T_ATOM, // Multibyte! + T_STRING, // Multibyte! T_LBRACKET, T_RBRACKET, T_COLON, - T_DOUBLEQUOTE, - N_CHARACTER, // Multibyte! + T_PAST_END, // Must be last terminal // Nonterminals - N_VALUE, + N_VALUE = T_PAST_END, + N_ARRAY_MAYBE_CONTINUE, N_OBJECT, - N_ARRAY, - N_STRING, - N_NUMBER, - N_MEMBER, - N_ELEMENTS, - N_CHARACTERS, + N_OBJECT_MAYBE_CONTINUE, + N_PAST_END, // Must be last nonterminal +}; + +const char *symbolNames[] = { + "T_INVALID", "T_EOF", + "T_LBRACE", "T_RBRACE", + "T_COMMA", "T_ATOM", + "T_STRING", "T_LBRACKET", + "T_RBRACKET", "T_COLON", + "N_VALUE", "N_ARRAY_MAYBE_CONTINUE", + "N_OBJECT", "N_OBJECT_MAYBE_CONTINUE", + "N_PAST_END", }; namespace { @@ -167,14 +173,14 @@ bool whitespace(char x) { return x == 0x20 || x == 0x0A || x == 0x0D || x == 0x09; } -// Straightforward recursive descent that doesn't handle string escaping or -// non-integer or negative numbers +// Straightforward recursive descent that doesn't handle string escaping and +// treats numbers as [0-9]+ struct Parser1 { Parser1(char *buf, int len, const Callbacks *callbacks, void *data) : buf(buf), len(len), callbacks(callbacks), data(data) {} // Returns false to reject - bool parse() { return parse_element(); } + [[nodiscard]] bool parse() { return parse_element(); } Parser1(Parser1 const &) = delete; Parser1 &operator=(Parser1 const &) = delete; @@ -399,6 +405,311 @@ private: } }; +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#if __has_attribute(musttail) +#define MUSTTAIL __attribute__((musttail)) +#else +#define MUSTTAIL +#endif + +struct Parser2 { + Parser2(char *buf, int len, const Callbacks *callbacks, void *data) + : buf(buf), len(len), callbacks(callbacks), data(data) {} + + // Returns false to reject + [[nodiscard]] bool parse() { + stack.push_back(N_VALUE); + nextToken(); + return keepGoing(this); + } + + Parser2(Parser2 const &) = delete; + Parser2 &operator=(Parser2 const &) = delete; + Parser2(Parser2 &&) = delete; + Parser2 &operator=(Parser2 &&) = delete; + +private: + // Helpers + void maybeSkipWs() { + while (len > 0 && whitespace(*buf)) { + ++buf; + --len; + } + } + bool parseLiteral(const char *literal) { + const int litLen = strlen(literal); + if (len < litLen) { + return false; + } + len -= litLen; + return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0; + } + bool parse_number() { + callbacks->on_begin_number(data); + char *const bufBefore = buf; + for (;;) { + if (len == 0) { + return false; + } + if ('0' <= *buf && *buf <= '9') { + ++buf; + --len; + } else { + break; + } + } + if (buf == bufBefore) { + return false; + } + callbacks->on_number_data(data, bufBefore, buf - bufBefore); + callbacks->on_end_number(data); + return true; + } + + bool parse_string() { + callbacks->on_begin_string(data); + if (!parseLiteral("\"")) { + return false; + } + auto *result = (char *)memchr(buf, '"', len); + if (result == nullptr) { + return false; + } + int stringLen = result - buf; + callbacks->on_string_data(data, buf, stringLen); + buf += stringLen; + len -= stringLen; + if (!parseLiteral("\"")) { + return false; + } + callbacks->on_end_string(data); + return true; + } + + typedef bool (*continuation)(Parser2 *); + + void printStack() { + printf("token: %s\n", symbolNames[currentToken]); + for (auto s : stack) { + printf("%s ", symbolNames[s]); + } + printf("\n"); + } + + static bool keepGoing(Parser2 *self) { + // self->printStack(); + if (self->stack.empty()) { + assert(self->currentToken == T_EOF); + return true; + } + if (self->stack.back() == self->currentToken) { + self->stack.pop_back(); + self->nextToken(); + MUSTTAIL return keepGoing(self); + } + // If the top of the stack is a terminal that doesn't match, reject + if (self->stack.back() < T_PAST_END) { + return false; + } + MUSTTAIL return table[self->stack.back() - T_PAST_END][self->currentToken]( + self); + } + + static bool reject(Parser2 *) { return false; } + static bool object(Parser2 *self) { + assert(self->currentToken == T_LBRACE); + self->callbacks->on_begin_object(self->data); + self->nextToken(); + self->stack.pop_back(); + self->stack.push_back(N_OBJECT_MAYBE_CONTINUE); + self->stack.push_back(N_VALUE); + self->stack.push_back(T_COLON); + self->stack.push_back(T_STRING); + MUSTTAIL return keepGoing(self); + } + static bool atom(Parser2 *self) { + if (*self->bufBefore == 't') { + self->callbacks->on_true_literal(self->data); + } else if (*self->bufBefore == 'f') { + self->callbacks->on_false_literal(self->data); + } else if (*self->bufBefore == 'n') { + self->callbacks->on_null_literal(self->data); + } else { + self->callbacks->on_begin_number(self->data); + self->callbacks->on_number_data(self->data, self->bufBefore + 1, + self->buf - self->bufBefore - 2); + self->callbacks->on_end_number(self->data); + } + self->nextToken(); + self->stack.pop_back(); + MUSTTAIL return keepGoing(self); + } + static bool string(Parser2 *self) { + assert(self->currentToken == T_STRING); + self->nextToken(); + self->stack.pop_back(); + MUSTTAIL return keepGoing(self); + } + static bool array(Parser2 *self) { + assert(self->currentToken == T_LBRACKET); + self->callbacks->on_begin_array(self->data); + self->nextToken(); + self->stack.pop_back(); + self->stack.push_back(N_ARRAY_MAYBE_CONTINUE); + self->stack.push_back(N_VALUE); + MUSTTAIL return keepGoing(self); + } + static bool continueArray(Parser2 *self) { + assert(self->currentToken == T_COMMA); + self->nextToken(); + self->stack.pop_back(); + self->stack.push_back(N_ARRAY_MAYBE_CONTINUE); + self->stack.push_back(N_VALUE); + MUSTTAIL return keepGoing(self); + } + static bool continueObject(Parser2 *self) { + assert(self->currentToken == T_COMMA); + self->nextToken(); + self->stack.pop_back(); + self->stack.push_back(N_OBJECT_MAYBE_CONTINUE); + self->stack.push_back(N_VALUE); + self->stack.push_back(T_COLON); + self->stack.push_back(T_STRING); + MUSTTAIL return keepGoing(self); + } + static bool finishArray(Parser2 *self) { + assert(self->currentToken == T_RBRACKET); + self->callbacks->on_end_array(self->data); + self->nextToken(); + self->stack.pop_back(); + MUSTTAIL return keepGoing(self); + } + static bool finishObject(Parser2 *self) { + assert(self->currentToken == T_RBRACE); + self->callbacks->on_end_object(self->data); + self->nextToken(); + self->stack.pop_back(); + MUSTTAIL return keepGoing(self); + } + + // table[nonterminal][terminal] + static constexpr continuation table[N_PAST_END - T_PAST_END][T_PAST_END] = { + /*N_VALUE*/ + { + /*T_INVALID*/ reject, + /*T_EOF*/ reject, + /*T_LBRACE*/ object, + /*T_RBRACE*/ reject, + /*T_COMMA*/ reject, + /*T_ATOM*/ atom, + /*T_STRING*/ string, + /*T_LBRACKET*/ array, + /*T_RBRACKET*/ reject, + /*T_COLON*/ reject, + }, + /*N_ARRAY_MAYBE_CONTINUE*/ + { + /*T_INVALID*/ reject, + /*T_EOF*/ reject, + /*T_LBRACE*/ reject, + /*T_RBRACE*/ reject, + /*T_COMMA*/ continueArray, + /*T_ATOM*/ reject, + /*T_STRING*/ reject, + /*T_LBRACKET*/ reject, + /*T_RBRACKET*/ finishArray, + /*T_COLON*/ reject, + }, + /*N_OBJECT*/ + { + /*T_INVALID*/ reject, + /*T_EOF*/ reject, + /*T_LBRACE*/ object, + /*T_RBRACE*/ reject, + /*T_COMMA*/ reject, + /*T_ATOM*/ reject, + /*T_STRING*/ reject, + /*T_LBRACKET*/ reject, + /*T_RBRACKET*/ reject, + /*T_COLON*/ reject, + }, + /*N_OBJECT_MAYBE_CONTINUE*/ + { + /*T_INVALID*/ reject, + /*T_EOF*/ reject, + /*T_LBRACE*/ reject, + /*T_RBRACE*/ finishObject, + /*T_COMMA*/ continueObject, + /*T_ATOM*/ reject, + /*T_STRING*/ reject, + /*T_LBRACKET*/ reject, + /*T_RBRACKET*/ reject, + /*T_COLON*/ reject, + }, + }; + + Symbol currentToken; + const char *bufBefore; + Symbol nextToken() { + maybeSkipWs(); + bufBefore = buf; + if (len == 0) { + return currentToken = T_EOF; + } + if (*buf == '{') { + parseLiteral("{"); + return currentToken = T_LBRACE; + } else if (*buf == '[') { + parseLiteral("["); + return currentToken = T_LBRACKET; + } else if (*buf == '}') { + parseLiteral("}"); + return currentToken = T_RBRACE; + } else if (*buf == ']') { + parseLiteral("]"); + return currentToken = T_RBRACKET; + } else if (*buf == ':') { + parseLiteral(":"); + return currentToken = T_COLON; + } else if (*buf == ',') { + parseLiteral(","); + return currentToken = T_COMMA; + } else if (*buf == '"') { + if (!parse_string()) { + return currentToken = T_INVALID; + } + return currentToken = T_STRING; + } else if (*buf == 't') { + if (!parseLiteral("true")) { + return currentToken = T_INVALID; + } + return currentToken = T_ATOM; + } else if (*buf == 'f') { + if (!parseLiteral("false")) { + return currentToken = T_INVALID; + } + } else if (*buf == 'n') { + if (!parseLiteral("null")) { + return currentToken = T_INVALID; + } + } else { + if (!parse_number()) { + return currentToken = T_INVALID; + } + } + return currentToken = T_ATOM; + } + + char *buf; + int len; + const Callbacks *const callbacks; + void *const data; + std::vector stack; +}; + const std::string json = R"({ "glossary": { "title": "example glossary", @@ -424,8 +735,6 @@ const std::string json = R"({ Callbacks printCallbacks() { Callbacks result; - result.on_begin_value = +[](void *) { puts("on_begin_value"); }; - result.on_end_value = +[](void *) { puts("on_end_value"); }; result.on_begin_object = +[](void *) { puts("on_begin_object"); }; result.on_end_object = +[](void *) { puts("on_end_object"); }; result.on_begin_string = +[](void *) { puts("on_begin_string"); }; @@ -453,8 +762,17 @@ TEST_CASE("parser1") { auto copy = json; Parser1 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); +} - c = Callbacks{}; +TEST_CASE("parser2") { + Callbacks c = printCallbacks(); + auto copy = json; + Parser2 parser(copy.data(), copy.length(), &c, nullptr); + CHECK(parser.parse()); +} + +TEST_CASE("bench") { + auto c = Callbacks{}; ankerl::nanobench::Bench bench; bench.relative(true); bench.batch(json.size()); @@ -468,4 +786,9 @@ TEST_CASE("parser1") { Parser1 parser(copy.data(), copy.length(), &c, nullptr); bench.doNotOptimizeAway(parser.parse()); }); + bench.run("parser2", [&]() { + auto copy = json; + Parser2 parser(copy.data(), copy.length(), &c, nullptr); + bench.doNotOptimizeAway(parser.parse()); + }); }