From 9e56aa9612c452b83df11a9dcd37b71891ac3525 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 12 May 2025 13:16:16 -0400 Subject: [PATCH] Implement Parser1, simple recursive descent --- CMakeLists.txt | 5 +- src/test.cpp | 472 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 471 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 624e476..1545c2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,9 +31,6 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") include(CTest) include(doctest) -add_executable(bench src/bench.cpp) -target_link_libraries(bench PRIVATE nanobench) - add_executable(mytest src/test.cpp) -target_link_libraries(mytest PRIVATE doctest) +target_link_libraries(mytest PRIVATE doctest nanobench) doctest_discover_tests(mytest) diff --git a/src/test.cpp b/src/test.cpp index 03a9775..0fbaebd 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -1,3 +1,471 @@ -#include +#include +#include +#include +#include -TEST_CASE("add") { CHECK(0 + 1 == 1); } +#include +#include + +#include +#include + +// This is the JSON grammar in McKeeman Form. + +// json +// element + +// value +// object +// array +// string +// number +// "true" +// "false" +// "null" + +// object +// '{' ws '}' +// '{' members '}' + +// members +// member +// member ',' members + +// member +// ws string ws ':' element + +// array +// '[' ws ']' +// '[' elements ']' + +// elements +// element +// element ',' elements + +// element +// ws value ws + +// string +// '"' characters '"' + +// characters +// "" +// character characters + +// character +// '0020' . '10FFFF' - '"' - '\' +// '\' escape + +// escape +// '"' +// '\' +// '/' +// 'b' +// 'f' +// 'n' +// 'r' +// 't' +// 'u' hex hex hex hex + +// hex +// digit +// 'A' . 'F' +// 'a' . 
// Table of event hooks a parser invokes while walking a JSON document.
// Every hook receives the opaque `data` pointer supplied by the caller; the
// two *_data hooks additionally receive a (pointer, length) byte range.
// Each hook defaults to a function that does nothing, so a caller only has to
// assign the events it is interested in.
struct Callbacks {
private:
  // Signature of a hook that carries no payload.
  using Hook = void (*)(void *data);
  // Signature of a hook that carries a byte range.
  using DataHook = void (*)(void *data, const char *buf, int len);

  // Default targets: ignore the event.
  static void ignoreEvent(void *) {}
  static void ignoreData(void *, const char *, int) {}

public:
  Hook on_begin_value = ignoreEvent;
  Hook on_end_value = ignoreEvent;
  Hook on_begin_object = ignoreEvent;
  Hook on_end_object = ignoreEvent;
  Hook on_begin_string = ignoreEvent;
  DataHook on_string_data = ignoreData;
  Hook on_end_string = ignoreEvent;
  Hook on_begin_array = ignoreEvent;
  Hook on_end_array = ignoreEvent;
  Hook on_begin_number = ignoreEvent;
  DataHook on_number_data = ignoreData;
  Hook on_end_number = ignoreEvent;
  Hook on_true_literal = ignoreEvent;
  Hook on_false_literal = ignoreEvent;
  Hook on_null_literal = ignoreEvent;
};
+ // Nonterminals + N_VALUE, + N_OBJECT, + N_ARRAY, + N_STRING, + N_NUMBER, + N_MEMBER, + N_ELEMENTS, + N_CHARACTERS, +}; + +namespace { + +bool whitespace(char x) { + return x == 0x20 || x == 0x0A || x == 0x0D || x == 0x09; +} + +// Straightforward recursive descent that doesn't handle string escaping or +// non-integer or negative numbers +struct Parser1 { + Parser1(char *buf, int len, const Callbacks *callbacks, void *data) + : buf(buf), len(len), callbacks(callbacks), data(data) {} + + // Returns false to reject + bool parse() { return parse_element(); } + + Parser1(Parser1 const &) = delete; + Parser1 &operator=(Parser1 const &) = delete; + Parser1(Parser1 &&) = delete; + Parser1 &operator=(Parser1 &&) = delete; + +private: + char *buf; + int len; + const Callbacks *const callbacks; + void *const data; + + // Helpers + void maybeSkipWs() { + while (len > 0 && whitespace(*buf)) { + ++buf; + --len; + } + } + bool parseLiteral(const char *literal) { + const int litLen = strlen(literal); + if (len < litLen) { + return false; + } + len -= litLen; + return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0; + } + + // functions corresponding to productions + bool parse_element() { + maybeSkipWs(); + if (len == 0) { + return false; + } + if (*buf == '{') { + if (!parse_object()) { + return false; + } + } else if (*buf == '[') { + if (!parse_array()) { + return false; + } + } else if (*buf == '"') { + if (!parse_string()) { + return false; + } + } else if (*buf == 't') { + if (!parse_true()) { + return false; + } + } else if (*buf == 'f') { + if (!parse_false()) { + return false; + } + } else if (*buf == 'n') { + if (!parse_null()) { + return false; + } + } else { + if (!parse_number()) { + return false; + } + } + maybeSkipWs(); + return true; + } + + bool parse_object() { + if (!parseLiteral("{")) { + return false; + } + callbacks->on_begin_object(data); + maybeSkipWs(); + if (len == 0) { + return false; + } + if (*buf != '}') { + if (!parse_members()) { 
+ } + } + if (!parseLiteral("}")) { + return false; + } + callbacks->on_end_object(data); + return true; + } + + bool parse_members() { + begin: + if (!parse_member()) { + return false; + } + if (len == 0) { + return false; + } + if (*buf == ',') { + if (!parseLiteral(",")) { + return false; + } + goto begin; // tail call + } + return true; + } + + bool parse_member() { + maybeSkipWs(); + if (!parse_string()) { + return false; + } + maybeSkipWs(); + if (!parseLiteral(":")) { + return false; + } + if (!parse_element()) { + return false; + } + return true; + } + + bool parse_array() { + if (!parseLiteral("[")) { + return false; + } + callbacks->on_begin_array(data); + maybeSkipWs(); + if (len == 0) { + return false; + } + if (*buf != ']') { + if (!parse_elements()) { + return false; + } + } + if (!parseLiteral("]")) { + return false; + } + callbacks->on_end_array(data); + return true; + } + + bool parse_elements() { + begin: + if (!parse_element()) { + return false; + } + if (len == 0) { + return false; + } + if (*buf == ',') { + if (!parseLiteral(",")) { + return false; + } + goto begin; // tail call + } + return true; + } + + bool parse_string() { + callbacks->on_begin_string(data); + if (!parseLiteral("\"")) { + return false; + } + auto *result = (char *)memchr(buf, '"', len); + if (result == nullptr) { + return false; + } + int stringLen = result - buf; + callbacks->on_string_data(data, buf, stringLen); + buf += stringLen; + len -= stringLen; + if (!parseLiteral("\"")) { + return false; + } + callbacks->on_end_string(data); + return true; + } + + bool parse_number() { + callbacks->on_begin_number(data); + char *const bufBefore = buf; + for (;;) { + if (len == 0) { + return false; + } + if ('0' <= *buf && *buf <= '9') { + ++buf; + --len; + } else { + break; + } + } + if (buf == bufBefore) { + return false; + } + callbacks->on_number_data(data, bufBefore, buf - bufBefore); + callbacks->on_end_number(data); + return true; + } + + bool parse_true() { + if 
(!parseLiteral("true")) { + return false; + } + callbacks->on_true_literal(data); + return true; + } + + bool parse_false() { + if (!parseLiteral("false")) { + return false; + } + callbacks->on_false_literal(data); + return true; + } + + bool parse_null() { + if (!parseLiteral("null")) { + return false; + } + callbacks->on_null_literal(data); + return true; + } +}; + +const std::string json = R"({ + "glossary": { + "title": "example glossary", + "GlossDiv": { + "title": "S", + "GlossList": { + "GlossEntry": { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"] + }, + "GlossSee": "markup" + } + } + } + } +})"; + +Callbacks printCallbacks() { + Callbacks result; + result.on_begin_value = +[](void *) { puts("on_begin_value"); }; + result.on_end_value = +[](void *) { puts("on_end_value"); }; + result.on_begin_object = +[](void *) { puts("on_begin_object"); }; + result.on_end_object = +[](void *) { puts("on_end_object"); }; + result.on_begin_string = +[](void *) { puts("on_begin_string"); }; + result.on_string_data = +[](void *, const char *buf, int len) { + printf("on_string_data `%.*s`\n", len, buf); + }; + result.on_end_string = +[](void *) { puts("on_end_string"); }; + result.on_begin_array = +[](void *) { puts("on_begin_array"); }; + result.on_end_array = +[](void *) { puts("on_end_array"); }; + result.on_begin_number = +[](void *) { puts("on_begin_number"); }; + result.on_number_data = +[](void *, const char *buf, int len) { + printf("on_number_data `%.*s`\n", len, buf); + }; + result.on_end_number = +[](void *) { puts("on_end_number"); }; + result.on_true_literal = +[](void *) { puts("on_true_literal"); }; + result.on_false_literal = +[](void *) { puts("on_false_literal"); }; + result.on_null_literal = +[](void *) { puts("on_null_literal"); }; 
+ return result; +} + +} // namespace + +TEST_CASE("parser1") { + Callbacks c = printCallbacks(); + auto copy = json; + Parser1 parser(copy.data(), copy.length(), &c, nullptr); + CHECK(parser.parse()); + + c = Callbacks{}; + ankerl::nanobench::Bench bench; + bench.relative(true); + bench.batch(json.size()); + bench.unit("byte"); + bench.run("parser control", [&]() { + auto copy = json; + bench.doNotOptimizeAway(copy); + }); + bench.run("parser1", [&]() { + auto copy = json; + Parser1 parser(copy.data(), copy.length(), &c, nullptr); + bench.doNotOptimizeAway(parser.parse()); + }); +}