// NOTE(review): the header names of the original nine #include directives were
// lost (only the bare `#include` tokens survived). The standard headers below
// are reconstructed from the identifiers this file uses — confirm against the
// project's history.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <initializer_list>
#include <string>
#include <utility>
// TODO(review): re-add the two non-standard headers with this project's real
// include paths: the test framework that provides TEST_CASE/CHECK and the
// header that provides ankerl::nanobench.

// This is the JSON grammar in McKeeman Form.
//
// json
//     element
//
// value
//     object
//     array
//     string
//     number
//     "true"
//     "false"
//     "null"
//
// object
//     '{' ws '}'
//     '{' members '}'
//
// members
//     member
//     member ',' members
//
// member
//     ws string ws ':' element
//
// array
//     '[' ws ']'
//     '[' elements ']'
//
// elements
//     element
//     element ',' elements
//
// element
//     ws value ws
//
// string
//     '"' characters '"'
//
// characters
//     ""
//     character characters
//
// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape
//
// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex
//
// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'
//
// number
//     integer fraction exponent
//
// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits
//
// digits
//     digit
//     digit digits
//
// digit
//     '0'
//     onenine
//
// onenine
//     '1' . '9'
//
// fraction
//     ""
//     '.' digits
//
// exponent
//     ""
//     'E' sign digits
//     'e' sign digits
//
// sign
//     ""
//     '+'
//     '-'
//
// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws

// Table of event handlers invoked while a JSON document is parsed.
//
// Every handler receives an opaque `data` pointer as its first argument. The
// `*_data` handlers additionally receive a (`buf`, `len`) byte range; use `len`
// rather than assuming `buf` is NUL-terminated.
//
// Each pointer defaults to a do-nothing function, so a default-constructed
// Callbacks can be called through safely and users only need to assign the
// events they care about.
struct Callbacks {
  // Objects: '{' ... '}'
  void (*on_begin_object)(void *data) = noop;
  void (*on_end_object)(void *data) = noop;

  // Strings: begin, then the string's bytes, then end.
  void (*on_begin_string)(void *data) = noop;
  void (*on_string_data)(void *data, const char *buf, int len) = noop;
  void (*on_end_string)(void *data) = noop;

  // Arrays: '[' ... ']'
  void (*on_begin_array)(void *data) = noop;
  void (*on_end_array)(void *data) = noop;

  // Numbers: begin, then the number's bytes, then end.
  void (*on_begin_number)(void *data) = noop;
  void (*on_number_data)(void *data, const char *buf, int len) = noop;
  void (*on_end_number)(void *data) = noop;

  // The three keyword literals.
  void (*on_true_literal)(void *data) = noop;
  void (*on_false_literal)(void *data) = noop;
  void (*on_null_literal)(void *data) = noop;

private:
  // Default handlers. The overload is selected by the type of the function
  // pointer being initialized.
  static void noop(void *) {}
  static void noop(void *, const char *, int) {}
};

// Terminals and Nonterminals.
These appear in the stack of the pushdown // automata enum Symbol : int8_t { // Terminals T_INVALID, T_EOF, T_LBRACE, T_RBRACE, T_COMMA, T_ATOM, // Multibyte! T_STRING, // Multibyte! T_LBRACKET, T_RBRACKET, T_COLON, T_PAST_END, // Must be last terminal // Nonterminals N_VALUE = T_PAST_END, N_ARRAY_MAYBE_CONTINUE, N_OBJECT, N_OBJECT_MAYBE_CONTINUE, N_PAST_END, // Must be last nonterminal }; const char *symbolNames[] = { "T_INVALID", "T_EOF", "T_LBRACE", "T_RBRACE", "T_COMMA", "T_ATOM", "T_STRING", "T_LBRACKET", "T_RBRACKET", "T_COLON", "N_VALUE", "N_ARRAY_MAYBE_CONTINUE", "N_OBJECT", "N_OBJECT_MAYBE_CONTINUE", "N_PAST_END", }; namespace { bool whitespace(char x) { return x == 0x20 || x == 0x0A || x == 0x0D || x == 0x09; } // Straightforward recursive descent that doesn't handle string escaping and // treats numbers as [0-9]+ struct Parser1 { Parser1(char *buf, int len, const Callbacks *callbacks, void *data) : buf(buf), len(len), callbacks(callbacks), data(data) {} // Returns false to reject [[nodiscard]] bool parse() { return parse_element(); } Parser1(Parser1 const &) = delete; Parser1 &operator=(Parser1 const &) = delete; Parser1(Parser1 &&) = delete; Parser1 &operator=(Parser1 &&) = delete; private: char *buf; int len; const Callbacks *const callbacks; void *const data; // Helpers void maybeSkipWs() { while (len > 0 && whitespace(*buf)) { ++buf; --len; } } bool parseLiteral(const char *literal) { const int litLen = strlen(literal); if (len < litLen) { return false; } len -= litLen; return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0; } // functions corresponding to productions bool parse_element() { maybeSkipWs(); if (len == 0) { return false; } if (*buf == '{') { if (!parse_object()) { return false; } } else if (*buf == '[') { if (!parse_array()) { return false; } } else if (*buf == '"') { if (!parse_string()) { return false; } } else if (*buf == 't') { if (!parse_true()) { return false; } } else if (*buf == 'f') { if (!parse_false()) { return 
false; } } else if (*buf == 'n') { if (!parse_null()) { return false; } } else { if (!parse_number()) { return false; } } maybeSkipWs(); return true; } bool parse_object() { if (!parseLiteral("{")) { return false; } callbacks->on_begin_object(data); maybeSkipWs(); if (len == 0) { return false; } if (*buf != '}') { if (!parse_members()) { } } if (!parseLiteral("}")) { return false; } callbacks->on_end_object(data); return true; } bool parse_members() { begin: if (!parse_member()) { return false; } if (len == 0) { return false; } if (*buf == ',') { if (!parseLiteral(",")) { return false; } goto begin; // tail call } return true; } bool parse_member() { maybeSkipWs(); if (!parse_string()) { return false; } maybeSkipWs(); if (!parseLiteral(":")) { return false; } if (!parse_element()) { return false; } return true; } bool parse_array() { if (!parseLiteral("[")) { return false; } callbacks->on_begin_array(data); maybeSkipWs(); if (len == 0) { return false; } if (*buf != ']') { if (!parse_elements()) { return false; } } if (!parseLiteral("]")) { return false; } callbacks->on_end_array(data); return true; } bool parse_elements() { begin: if (!parse_element()) { return false; } if (len == 0) { return false; } if (*buf == ',') { if (!parseLiteral(",")) { return false; } goto begin; // tail call } return true; } bool parse_string() { callbacks->on_begin_string(data); if (!parseLiteral("\"")) { return false; } auto *result = (char *)memchr(buf, '"', len); if (result == nullptr) { return false; } int stringLen = result - buf; callbacks->on_string_data(data, buf, stringLen); buf += stringLen; len -= stringLen; if (!parseLiteral("\"")) { return false; } callbacks->on_end_string(data); return true; } bool parse_number() { callbacks->on_begin_number(data); char *const bufBefore = buf; for (;;) { if (len == 0) { return false; } if ('0' <= *buf && *buf <= '9') { ++buf; --len; } else { break; } } if (buf == bufBefore) { return false; } callbacks->on_number_data(data, bufBefore, buf - 
bufBefore); callbacks->on_end_number(data); return true; } bool parse_true() { if (!parseLiteral("true")) { return false; } callbacks->on_true_literal(data); return true; } bool parse_false() { if (!parseLiteral("false")) { return false; } callbacks->on_false_literal(data); return true; } bool parse_null() { if (!parseLiteral("null")) { return false; } callbacks->on_null_literal(data); return true; } }; #ifndef __has_attribute #define __has_attribute(x) 0 #endif #if __has_attribute(musttail) #define MUSTTAIL __attribute__((musttail)) #else #define MUSTTAIL #endif #if __has_attribute(preserve_none) #define PRESERVE_NONE __attribute__((preserve_none)) #else #define PRESERVE_NONE #endif struct Parser2 { Parser2(char *buf, int len, const Callbacks *callbacks, void *data) : buf(buf), len(len), callbacks(callbacks), data(data) {} // Returns false to reject [[nodiscard]] bool parse() { if (!push({N_VALUE})) { return false; } nextToken(); return keepGoing(this); } Parser2(Parser2 const &) = delete; Parser2 &operator=(Parser2 const &) = delete; Parser2(Parser2 &&) = delete; Parser2 &operator=(Parser2 &&) = delete; static constexpr int kMaxStackSize = 1 << 10; private: // Helpers void maybeSkipWs() { while (len > 0 && whitespace(*buf)) { ++buf; --len; } } bool parseLiteral(const char *literal) { const int litLen = strlen(literal); if (len < litLen) { return false; } len -= litLen; return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0; } bool parse_number() { callbacks->on_begin_number(data); char *const bufBefore = buf; for (;;) { if (len == 0) { return false; } if ('0' <= *buf && *buf <= '9') { ++buf; --len; } else { break; } } if (buf == bufBefore) { return false; } callbacks->on_number_data(data, bufBefore, buf - bufBefore); callbacks->on_end_number(data); return true; } bool parse_string() { callbacks->on_begin_string(data); if (!parseLiteral("\"")) { return false; } auto *result = (char *)memchr(buf, '"', len); if (result == nullptr) { return false; } 
int stringLen = result - buf; callbacks->on_string_data(data, buf, stringLen); buf += stringLen; len -= stringLen; if (!parseLiteral("\"")) { return false; } callbacks->on_end_string(data); return true; } typedef PRESERVE_NONE bool (*continuation)(Parser2 *); void printStack() { printf("token: %s\n", symbolNames[currentToken]); for (int i = 0; i < stackPtr - stack; ++i) { printf("%s ", symbolNames[stack[i]]); } printf("\n"); } PRESERVE_NONE static bool tokenMatch(Parser2 *self) { self->pop(); self->nextToken(); MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool keepGoing(Parser2 *self) { // self->printStack(); if (self->empty()) { assert(self->currentToken == T_EOF); return true; } auto top = *(self->stackPtr - 1); MUSTTAIL return table[top][self->currentToken](self); } PRESERVE_NONE static bool reject(Parser2 *self) { self->pop(); return false; } PRESERVE_NONE static bool object(Parser2 *self) { self->pop(); assert(self->currentToken == T_LBRACE); self->callbacks->on_begin_object(self->data); self->nextToken(); if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool atom(Parser2 *self) { self->pop(); if (*self->bufBefore == 't') { self->callbacks->on_true_literal(self->data); } else if (*self->bufBefore == 'f') { self->callbacks->on_false_literal(self->data); } else if (*self->bufBefore == 'n') { self->callbacks->on_null_literal(self->data); } else { self->callbacks->on_begin_number(self->data); self->callbacks->on_number_data(self->data, self->bufBefore + 1, self->buf - self->bufBefore - 2); self->callbacks->on_end_number(self->data); } self->nextToken(); MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool string(Parser2 *self) { self->pop(); assert(self->currentToken == T_STRING); self->nextToken(); MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool array(Parser2 *self) { self->pop(); assert(self->currentToken == T_LBRACKET); 
self->callbacks->on_begin_array(self->data); self->nextToken(); if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool continueArray(Parser2 *self) { self->pop(); assert(self->currentToken == T_COMMA); self->nextToken(); if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool continueObject(Parser2 *self) { self->pop(); assert(self->currentToken == T_COMMA); self->nextToken(); if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool finishArray(Parser2 *self) { self->pop(); assert(self->currentToken == T_RBRACKET); self->callbacks->on_end_array(self->data); self->nextToken(); MUSTTAIL return keepGoing(self); } PRESERVE_NONE static bool finishObject(Parser2 *self) { self->pop(); assert(self->currentToken == T_RBRACE); self->callbacks->on_end_object(self->data); self->nextToken(); MUSTTAIL return keepGoing(self); } // table[nonterminal][terminal] static constexpr continuation table[N_PAST_END][T_PAST_END] = { /*T_INVALID*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_EOF*/ { /*T_INVALID*/ reject, /*T_EOF*/ tokenMatch, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_LBRACE*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ tokenMatch, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_RBRACE*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ tokenMatch, /*T_COMMA*/ reject, /*T_ATOM*/ reject, 
/*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_COMMA*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ tokenMatch, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_ATOM*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ tokenMatch, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_STRING*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ tokenMatch, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_LBRACKET*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ tokenMatch, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*T_RBRACKET*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ tokenMatch, /*T_COLON*/ reject, }, /*T_COLON*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ tokenMatch, }, /*N_VALUE*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ object, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ atom, /*T_STRING*/ string, /*T_LBRACKET*/ array, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*N_ARRAY_MAYBE_CONTINUE*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ reject, /*T_COMMA*/ continueArray, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ finishArray, /*T_COLON*/ reject, }, /*N_OBJECT*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, 
/*T_LBRACE*/ object, /*T_RBRACE*/ reject, /*T_COMMA*/ reject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, /*N_OBJECT_MAYBE_CONTINUE*/ { /*T_INVALID*/ reject, /*T_EOF*/ reject, /*T_LBRACE*/ reject, /*T_RBRACE*/ finishObject, /*T_COMMA*/ continueObject, /*T_ATOM*/ reject, /*T_STRING*/ reject, /*T_LBRACKET*/ reject, /*T_RBRACKET*/ reject, /*T_COLON*/ reject, }, }; Symbol currentToken; const char *bufBefore; void nextToken() { maybeSkipWs(); bufBefore = buf; if (len == 0) { currentToken = T_EOF; return; } if (*buf == '{') { parseLiteral("{"); currentToken = T_LBRACE; return; } else if (*buf == '[') { parseLiteral("["); currentToken = T_LBRACKET; return; } else if (*buf == '}') { parseLiteral("}"); currentToken = T_RBRACE; return; } else if (*buf == ']') { parseLiteral("]"); currentToken = T_RBRACKET; return; } else if (*buf == ':') { parseLiteral(":"); currentToken = T_COLON; return; } else if (*buf == ',') { parseLiteral(","); currentToken = T_COMMA; return; } else if (*buf == '"') { if (!parse_string()) { currentToken = T_INVALID; return; } currentToken = T_STRING; return; } else if (*buf == 't') { if (!parseLiteral("true")) { currentToken = T_INVALID; return; } currentToken = T_ATOM; return; } else if (*buf == 'f') { if (!parseLiteral("false")) { currentToken = T_INVALID; return; } } else if (*buf == 'n') { if (!parseLiteral("null")) { currentToken = T_INVALID; return; } } else { if (!parse_number()) { currentToken = T_INVALID; return; } } currentToken = T_ATOM; return; } char *buf; int len; const Callbacks *const callbacks; void *const data; Symbol stack[kMaxStackSize]; Symbol *stackPtr = stack; bool empty() { return stackPtr == stack; } void pop() { assert(!empty()); --stackPtr; } [[nodiscard]] bool push(std::initializer_list symbols) { if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] { return false; } for (int i = symbols.size() - 1; i >= 0; --i) { *stackPtr++ = *(symbols.begin() + i); 
} return true; } }; const std::string json = R"({ "glossary": { "title": "example glossary", "GlossDiv": { "title": "S", "GlossList": { "GlossEntry": { "ID": "SGML", "SortAs": "SGML", "GlossTerm": "Standard Generalized Markup Language", "Acronym": "SGML", "Abbrev": "ISO 8879:1986", "GlossDef": { "para": "A meta-markup language, used to create markup languages such as DocBook.", "GlossSeeAlso": ["GML", "XML"] }, "GlossSee": "markup" } } } } })"; Callbacks printCallbacks() { Callbacks result; result.on_begin_object = +[](void *) { puts("on_begin_object"); }; result.on_end_object = +[](void *) { puts("on_end_object"); }; result.on_begin_string = +[](void *) { puts("on_begin_string"); }; result.on_string_data = +[](void *, const char *buf, int len) { printf("on_string_data `%.*s`\n", len, buf); }; result.on_end_string = +[](void *) { puts("on_end_string"); }; result.on_begin_array = +[](void *) { puts("on_begin_array"); }; result.on_end_array = +[](void *) { puts("on_end_array"); }; result.on_begin_number = +[](void *) { puts("on_begin_number"); }; result.on_number_data = +[](void *, const char *buf, int len) { printf("on_number_data `%.*s`\n", len, buf); }; result.on_end_number = +[](void *) { puts("on_end_number"); }; result.on_true_literal = +[](void *) { puts("on_true_literal"); }; result.on_false_literal = +[](void *) { puts("on_false_literal"); }; result.on_null_literal = +[](void *) { puts("on_null_literal"); }; return result; } } // namespace TEST_CASE("parser1") { Callbacks c = printCallbacks(); auto copy = json; Parser1 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); } TEST_CASE("parser2") { Callbacks c = printCallbacks(); auto copy = json; Parser2 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); } TEST_CASE("bench1") { auto c = Callbacks{}; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("parser1", [&]() { auto copy = json; Parser1 parser(copy.data(), copy.length(), &c, 
nullptr); bench.doNotOptimizeAway(parser.parse()); }); } TEST_CASE("bench2") { auto c = Callbacks{}; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("parser2", [&]() { auto copy = json; Parser2 parser(copy.data(), copy.length(), &c, nullptr); bench.doNotOptimizeAway(parser.parse()); }); }