#include #include #include #include #include #include #include #include #include #include #include // This is the JSON grammar in McKeeman Form. // json // element // value // object // array // string // number // "true" // "false" // "null" // object // '{' ws '}' // '{' members '}' // members // member // member ',' members // member // ws string ws ':' element // array // '[' ws ']' // '[' elements ']' // elements // element // element ',' elements // element // ws value ws // string // '"' characters '"' // characters // "" // character characters // character // '0020' . '10FFFF' - '"' - '\' // '\' escape // escape // '"' // '\' // '/' // 'b' // 'f' // 'n' // 'r' // 't' // 'u' hex hex hex hex // hex // digit // 'A' . 'F' // 'a' . 'f' // number // integer fraction exponent // integer // digit // onenine digits // '-' digit // '-' onenine digits // digits // digit // digit digits // digit // '0' // onenine // onenine // '1' . '9' // fraction // "" // '.' digits // exponent // "" // 'E' sign digits // 'e' sign digits // sign // "" // '+' // '-' // ws // "" // '0020' ws // '000A' ws // '000D' ws // '0009' ws struct Callbacks { void (*on_begin_object)(void *data) = noop; void (*on_end_object)(void *data) = noop; void (*on_begin_string)(void *data) = noop; void (*on_string_data)(void *data, const char *buf, int len) = noop; void (*on_end_string)(void *data) = noop; void (*on_begin_array)(void *data) = noop; void (*on_end_array)(void *data) = noop; void (*on_begin_number)(void *data) = noop; void (*on_number_data)(void *data, const char *buf, int len) = noop; void (*on_end_number)(void *data) = noop; void (*on_true_literal)(void *data) = noop; void (*on_false_literal)(void *data) = noop; void (*on_null_literal)(void *data) = noop; private: static void noop(void *) {} static void noop(void *, const char *, int) {} }; // Terminals and Nonterminals. These appear in the stack of the pushdown // automata enum Symbol : int8_t { T_STRING, // Multibyte! T_COLON, // Nonterminals N_VALUE, N_ARRAY_VALUE_OR_END, N_OBJECT_VALUE_OR_END, N_ARRAY_MAYBE_CONTINUE, N_OBJECT_MAYBE_CONTINUE, N_PAST_END, // Must be last nonterminal }; const char *symbolNames[] = { "T_STRING", "T_COLON", "N_VALUE", "N_ARRAY_VALUE_OR_END", "N_OBJECT_VALUE_OR_END", "N_ARRAY_MAYBE_CONTINUE", "N_OBJECT_MAYBE_CONTINUE", }; constexpr static struct Tables { constexpr Tables() { whitespace[' '] = true; whitespace['\n'] = true; whitespace['\r'] = true; whitespace['\t'] = true; } alignas(16) bool whitespace[256]{}; } tables; namespace { // Straightforward recursive descent that doesn't handle string escaping and // treats numbers as [0-9.]+. May stack overflow on deeply nested json documents struct Parser1 { Parser1(char *buf, int len, const Callbacks *callbacks, void *data) : buf(buf), bufEnd(buf + len), callbacks(callbacks), data(data) {} // Returns false to reject [[nodiscard]] bool parse() { return parse_element(); } Parser1(Parser1 const &) = delete; Parser1 &operator=(Parser1 const &) = delete; Parser1(Parser1 &&) = delete; Parser1 &operator=(Parser1 &&) = delete; private: char *buf; char *bufEnd; const Callbacks *const callbacks; void *const data; int len() const { return bufEnd - buf; } // Helpers void maybeSkipWs() { while (buf != bufEnd && tables.whitespace[*buf]) { ++buf; } } bool parseLiteral(const char *literal) { const int litLen = strlen(literal); if (len() < litLen) { return false; } return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0; } // functions corresponding to productions bool parse_element() { maybeSkipWs(); if (len() == 0) { return false; } if (*buf == '{') { if (!parse_object()) { return false; } } else if (*buf == '[') { if (!parse_array()) { return false; } } else if (*buf == '"') { if (!parse_string()) { return false; } } else if (*buf == 't') { if (!parse_true()) { return false; } } else if (*buf == 'f') { if (!parse_false()) { return false; } } else if (*buf == 'n') { if (!parse_null()) { return false; } } else { if (!parse_number()) { return false; } } maybeSkipWs(); return true; } bool parse_object() { if (!parseLiteral("{")) { return false; } callbacks->on_begin_object(data); maybeSkipWs(); if (len() == 0) { return false; } if (*buf != '}') { if (!parse_members()) { } } if (!parseLiteral("}")) { return false; } callbacks->on_end_object(data); return true; } bool parse_members() { begin: if (!parse_member()) { return false; } if (len() == 0) { return false; } if (*buf == ',') { if (!parseLiteral(",")) { return false; } goto begin; // tail call } return true; } bool parse_member() { maybeSkipWs(); if (!parse_string()) { return false; } maybeSkipWs(); if (!parseLiteral(":")) { return false; } if (!parse_element()) { return false; } return true; } bool parse_array() { if (!parseLiteral("[")) { return false; } callbacks->on_begin_array(data); maybeSkipWs(); if (len() == 0) { return false; } if (*buf != ']') { if (!parse_elements()) { return false; } } if (!parseLiteral("]")) { return false; } callbacks->on_end_array(data); return true; } bool parse_elements() { begin: if (!parse_element()) { return false; } if (len() == 0) { return false; } if (*buf == ',') { if (!parseLiteral(",")) { return false; } goto begin; // tail call } return true; } bool parse_string() { callbacks->on_begin_string(data); if (!parseLiteral("\"")) { return false; } auto *result = (char *)memchr(buf, '"', len()); if (result == nullptr) { return false; } int stringLen = result - buf; callbacks->on_string_data(data, buf, stringLen); buf += stringLen; if (!parseLiteral("\"")) { return false; } callbacks->on_end_string(data); return true; } bool parse_number() { callbacks->on_begin_number(data); char *const bufBefore = buf; for (;;) { if (len() == 0) { return false; } if ('0' <= *buf && *buf <= '9' || (*buf == '.')) { ++buf; } else { break; } } if (buf == bufBefore) { return false; } callbacks->on_number_data(data, bufBefore, buf - bufBefore); callbacks->on_end_number(data); return true; } bool parse_true() { if (!parseLiteral("true")) { return false; } callbacks->on_true_literal(data); return true; } bool parse_false() { if (!parseLiteral("false")) { return false; } callbacks->on_false_literal(data); return true; } bool parse_null() { if (!parseLiteral("null")) { return false; } callbacks->on_null_literal(data); return true; } }; #ifndef __has_attribute #define __has_attribute(x) 0 #endif #if __has_attribute(musttail) #define MUSTTAIL __attribute__((musttail)) #else #define MUSTTAIL #endif // Table-based ll(1) parser that doesn't handle escaping and treats numbers as // [0-9.]+. Could be adapted to have a streaming interface. Uses O(1) memory. struct Parser2 { Parser2(char *buf, int len, const Callbacks *callbacks, void *data) : buf(buf), bufEnd(buf + len), callbacks(callbacks), data(data) {} // Returns false to reject [[nodiscard]] bool parse() { if (!push({N_VALUE})) { return false; } return keepGoing(this); } Parser2(Parser2 const &) = delete; Parser2 &operator=(Parser2 const &) = delete; Parser2(Parser2 &&) = delete; Parser2 &operator=(Parser2 &&) = delete; static constexpr int kMaxStackSize = 1 << 10; private: // Helpers void maybeSkipWs() { while (buf != bufEnd && tables.whitespace[*buf]) { ++buf; } } bool parseLiteral(const char *literal) { const int litLen = strlen(literal); if (len() < litLen) { return false; } if (memcmp(buf, literal, litLen) == 0) { buf += litLen; return true; } return false; } bool parse_number() { char *const bufBefore = buf; if (len() == 0 || !('0' <= *buf && *buf <= '9' || (*buf == '.'))) { return false; } callbacks->on_begin_number(data); ++buf; for (;;) { if ('0' <= *buf && *buf <= '9' || (*buf == '.')) { ++buf; } else { break; } } callbacks->on_number_data(data, bufBefore, buf - bufBefore); callbacks->on_end_number(data); return true; } bool parse_string() { if (!parseLiteral("\"")) { return false; } callbacks->on_begin_string(data); auto *result = (char *)memchr(buf, '"', len()); if (result == nullptr) { return false; } int stringLen = result - buf; callbacks->on_string_data(data, buf, stringLen); buf += stringLen; if (!parseLiteral("\"")) { return false; } callbacks->on_end_string(data); return true; } typedef bool (*continuation)(Parser2 *); [[maybe_unused]] void debugPrint() { for (int i = 0; i < stackPtr - stack; ++i) { printf("%s ", symbolNames[stack[i]]); } printf("\n"); } static bool keepGoing(Parser2 *self) { // self->debugPrint(); if (self->empty()) { return true; } auto top = *(self->stackPtr - 1); self->maybeSkipWs(); MUSTTAIL return table[top](self); } static bool string(Parser2 *self) { if (!self->parse_string()) { return false; } self->pop(); MUSTTAIL return keepGoing(self); } static bool colon(Parser2 *self) { if (!self->parseLiteral(":")) { return false; } self->pop(); MUSTTAIL return keepGoing(self); } static bool value(Parser2 *self) { if (self->parse_string()) { self->pop(); MUSTTAIL return keepGoing(self); } else if (self->parse_number()) { self->pop(); MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("{")) { self->pop(); self->callbacks->on_begin_object(self->data); if (!self->push({N_OBJECT_VALUE_OR_END})) { return false; } MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("[")) { self->pop(); self->callbacks->on_begin_array(self->data); if (!self->push({N_ARRAY_VALUE_OR_END})) { return false; } MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("true")) { self->pop(); self->callbacks->on_true_literal(self->data); MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("false")) { self->pop(); self->callbacks->on_false_literal(self->data); MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("null")) { self->pop(); self->callbacks->on_null_literal(self->data); MUSTTAIL return keepGoing(self); } return false; } static bool arrayOrEnd(Parser2 *self) { if (self->parseLiteral("]")) { self->pop(); self->callbacks->on_end_array(self->data); MUSTTAIL return keepGoing(self); } else { self->pop(); if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } } static bool objectOrEnd(Parser2 *self) { if (self->parseLiteral("}")) { self->pop(); self->callbacks->on_end_object(self->data); MUSTTAIL return keepGoing(self); } else { self->pop(); if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } return false; } static bool arrayContinue(Parser2 *self) { if (self->parseLiteral(",")) { self->pop(); if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("]")) { self->pop(); self->callbacks->on_end_array(self->data); MUSTTAIL return keepGoing(self); } return false; } static bool objectContinue(Parser2 *self) { if (self->parseLiteral(",")) { self->pop(); if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) { return false; } MUSTTAIL return keepGoing(self); } else if (self->parseLiteral("}")) { self->pop(); self->callbacks->on_end_object(self->data); MUSTTAIL return keepGoing(self); } return false; } static constexpr continuation table[N_PAST_END] = { /*T_STRING*/ string, /*T_COLON*/ colon, /*N_VALUE*/ value, /*N_ARRAY_VALUE_OR_END*/ arrayOrEnd, /*N_OBJECT_VALUE_OR_END*/ objectOrEnd, /*N_ARRAY_MAYBE_CONTINUE*/ arrayContinue, /*N_OBJECT_MAYBE_CONTINUE*/ objectContinue, }; char *buf; char *bufEnd; int len() const { return bufEnd - buf; } const Callbacks *const callbacks; void *const data; Symbol stack[kMaxStackSize]; Symbol *stackPtr = stack; bool empty() const { return stackPtr == stack; } void pop() { assert(!empty()); --stackPtr; } [[nodiscard]] bool push(std::initializer_list symbols) { if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] { return false; } for (int i = symbols.size() - 1; i >= 0; --i) { *stackPtr++ = *(symbols.begin() + i); } return true; } }; const std::string json = R"({ "a number": 12345, "true": true, "false": false, "null": null, "glossary": { "title": "example glossary", "GlossDiv": { "title": "S", "GlossList": { "GlossEntry": { "ID": "SGML", "SortAs": "SGML", "GlossTerm": "Standard Generalized Markup Language", "Acronym": "SGML", "Abbrev": "ISO 8879:1986", "GlossDef": { "para": "A meta-markup language, used to create markup languages such as DocBook.", "GlossSeeAlso": ["GML", "XML"] }, "GlossSee": "markup" } } } } })"; Callbacks printCallbacks() { Callbacks result; result.on_begin_object = +[](void *) { puts("on_begin_object"); }; result.on_end_object = +[](void *) { puts("on_end_object"); }; result.on_begin_string = +[](void *) { puts("on_begin_string"); }; result.on_string_data = +[](void *, const char *buf, int len) { printf("on_string_data `%.*s`\n", len, buf); }; result.on_end_string = +[](void *) { puts("on_end_string"); }; result.on_begin_array = +[](void *) { puts("on_begin_array"); }; result.on_end_array = +[](void *) { puts("on_end_array"); }; result.on_begin_number = +[](void *) { puts("on_begin_number"); }; result.on_number_data = +[](void *, const char *buf, int len) { printf("on_number_data `%.*s`\n", len, buf); }; result.on_end_number = +[](void *) { puts("on_end_number"); }; result.on_true_literal = +[](void *) { puts("on_true_literal"); }; result.on_false_literal = +[](void *) { puts("on_false_literal"); }; result.on_null_literal = +[](void *) { puts("on_null_literal"); }; return result; } } // namespace TEST_CASE("parser1") { Callbacks c = printCallbacks(); { auto copy = json; Parser1 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); } { std::string copy = "{\"x\": [], \"y\": {}}"; Parser1 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); } } TEST_CASE("parser2") { Callbacks c = printCallbacks(); { auto copy = json; Parser2 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); } { std::string copy = "{\"x\": [], \"y\": {}}"; Parser2 parser(copy.data(), copy.length(), &c, nullptr); CHECK(parser.parse()); } } TEST_CASE("bench1") { auto c = Callbacks{}; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("parser1", [&]() { auto copy = json; Parser1 parser(copy.data(), copy.length(), &c, nullptr); bench.doNotOptimizeAway(parser.parse()); }); } TEST_CASE("bench2") { auto c = Callbacks{}; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("parser2", [&]() { auto copy = json; Parser2 parser(copy.data(), copy.length(), &c, nullptr); bench.doNotOptimizeAway(parser.parse()); }); } TEST_CASE("bench3") { using namespace simdjson; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("parser3", [&]() { simdjson::padded_string my_padded_data(json.data(), json.size()); simdjson::dom::parser parser; auto doc = parser.parse(my_padded_data); bench.doNotOptimizeAway(doc); }); } TEST_CASE("bench4") { using namespace simdjson; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("parser4", [&]() { padded_string my_padded_data(json.data(), json.size()); ondemand::parser parser; auto doc = parser.iterate(my_padded_data); bench.doNotOptimizeAway(doc); }); }