Implement Parser1, simple recursive descent

This commit is contained in:
2025-05-12 13:16:16 -04:00
parent f1acb2d0a0
commit 9e56aa9612
2 changed files with 471 additions and 6 deletions

View File

@@ -31,9 +31,6 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
include(CTest) include(CTest)
include(doctest) include(doctest)
add_executable(bench src/bench.cpp)
target_link_libraries(bench PRIVATE nanobench)
add_executable(mytest src/test.cpp) add_executable(mytest src/test.cpp)
target_link_libraries(mytest PRIVATE doctest) target_link_libraries(mytest PRIVATE doctest nanobench)
doctest_discover_tests(mytest) doctest_discover_tests(mytest)

View File

@@ -1,3 +1,471 @@
#include <doctest.h> #include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
TEST_CASE("add") { CHECK(0 + 1 == 1); } #include <string>
#include <utility>
#include <doctest.h>
#include <nanobench.h>
// This is the JSON grammar in McKeeman Form. Rule names start in the first
// column; each indented line below a name is one alternative of that rule.
//
// json
//     element
// value
//     object
//     array
//     string
//     number
//     "true"
//     "false"
//     "null"
// object
//     '{' ws '}'
//     '{' members '}'
// members
//     member
//     member ',' members
// member
//     ws string ws ':' element
// array
//     '[' ws ']'
//     '[' elements ']'
// elements
//     element
//     element ',' elements
// element
//     ws value ws
// string
//     '"' characters '"'
// characters
//     ""
//     character characters
// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape
// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex
// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'
// number
//     integer fraction exponent
// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits
// digits
//     digit
//     digit digits
// digit
//     '0'
//     onenine
// onenine
//     '1' . '9'
// fraction
//     ""
//     '.' digits
// exponent
//     ""
//     'E' sign digits
//     'e' sign digits
// sign
//     ""
//     '+'
//     '-'
// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws
// Table of event handlers invoked while a JSON document is being parsed.
//
// Every handler takes an opaque `void *data` as its first argument. The two
// `*_data` handlers additionally receive a `(buf, len)` pair describing a run
// of bytes. Each member defaults to a handler that does nothing, so a caller
// only needs to assign the events it is interested in.
struct Callbacks {
  // Generic value boundaries.
  void (*on_begin_value)(void *data) = ignoreEvent;
  void (*on_end_value)(void *data) = ignoreEvent;

  // Objects.
  void (*on_begin_object)(void *data) = ignoreEvent;
  void (*on_end_object)(void *data) = ignoreEvent;

  // Strings: begin, payload bytes, end.
  void (*on_begin_string)(void *data) = ignoreEvent;
  void (*on_string_data)(void *data, const char *buf, int len) = ignoreBytes;
  void (*on_end_string)(void *data) = ignoreEvent;

  // Arrays.
  void (*on_begin_array)(void *data) = ignoreEvent;
  void (*on_end_array)(void *data) = ignoreEvent;

  // Numbers: begin, payload bytes, end.
  void (*on_begin_number)(void *data) = ignoreEvent;
  void (*on_number_data)(void *data, const char *buf, int len) = ignoreBytes;
  void (*on_end_number)(void *data) = ignoreEvent;

  // Literals.
  void (*on_true_literal)(void *data) = ignoreEvent;
  void (*on_false_literal)(void *data) = ignoreEvent;
  void (*on_null_literal)(void *data) = ignoreEvent;

private:
  // Default handler for events that carry no payload.
  static void ignoreEvent(void *) {}
  // Default handler for events that carry a byte range.
  static void ignoreBytes(void *, const char *, int) {}
};
// Grammar symbols (terminals and nonterminals). These are the entries that
// appear on the stack of the pushdown automaton. One byte per symbol.
enum Symbol : uint8_t {
  // Terminals
  T_LBRACE = 0,
  T_RBRACE = 1,
  T_COMMA = 2,
  T_TRUE = 3,
  T_FALSE = 4,
  T_NULL = 5,
  T_LBRACKET = 6,
  T_RBRACKET = 7,
  T_COLON = 8,
  T_DOUBLEQUOTE = 9,
  // Multibyte!
  // NOTE(review): carries the N_ prefix but is grouped with the terminals —
  // confirm which is intended.
  N_CHARACTER = 10,
  // Nonterminals
  N_VALUE = 11,
  N_OBJECT = 12,
  N_ARRAY = 13,
  N_STRING = 14,
  N_NUMBER = 15,
  N_MEMBER = 16,
  N_ELEMENTS = 17,
  N_CHARACTERS = 18,
};
namespace {
// Reports whether `x` is one of the four bytes the JSON `ws` production
// allows: space, line feed, carriage return or horizontal tab.
bool whitespace(char x) {
  switch (x) {
  case 0x20: // space
  case 0x0A: // line feed
  case 0x0D: // carriage return
  case 0x09: // horizontal tab
    return true;
  default:
    return false;
  }
}
// Straightforward recursive-descent JSON recognizer that reports what it sees
// through a `Callbacks` table.
//
// Known limitations (deliberate simplifications of the full grammar):
//  - Strings are not unescaped: a string ends at the first '"' byte, so an
//    escaped quote terminates it early, and control characters are accepted.
//  - Numbers are unsigned decimal integers only: no sign, fraction or exponent.
//  - `on_begin_value` / `on_end_value` are never invoked.
//  - Recursion depth grows with the nesting depth of the input and is not
//    bounded.
//
// When parsing fails, events that were already delivered are not retracted, so
// a consumer may see a "begin" event without its matching "end".
struct Parser1 {
  // `buf` / `len` delimit the input; it does not need to be NUL-terminated and
  // is only read, never written. A negative `len` is treated as empty input.
  // `callbacks` must be non-null and every handler in it must be callable;
  // `data` is passed through verbatim as the first argument of each handler.
  // Both the buffer and `callbacks` must outlive the parser.
  Parser1(char *buf, int len, const Callbacks *callbacks, void *data)
      : buf(buf), len(len < 0 ? 0 : len), callbacks(callbacks), data(data) {}

  // Parses the whole input as a single JSON element.
  // Returns true when the input is accepted, false to reject. Input is
  // rejected when it is malformed or when anything other than whitespace
  // follows the element.
  bool parse() { return parse_element() && len == 0; }

  Parser1(Parser1 const &) = delete;
  Parser1 &operator=(Parser1 const &) = delete;
  Parser1(Parser1 &&) = delete;
  Parser1 &operator=(Parser1 &&) = delete;

private:
  const char *buf; // next unread byte
  int len;         // number of unread bytes, never negative
  const Callbacks *const callbacks;
  void *const data;

  // Helpers

  // Consumes zero or more whitespace bytes.
  void maybeSkipWs() {
    while (len > 0 && whitespace(*buf)) {
      ++buf;
      --len;
    }
  }

  // Consumes `literal` if the remaining input starts with it and returns
  // true; otherwise leaves the cursor untouched and returns false.
  bool parseLiteral(const char *literal) {
    const int litLen = static_cast<int>(std::strlen(literal));
    if (len < litLen || std::memcmp(buf, literal, litLen) != 0) {
      return false;
    }
    buf += litLen;
    len -= litLen;
    return true;
  }

  // Functions corresponding to productions

  // element: ws value ws. Dispatches on the first non-whitespace byte.
  bool parse_element() {
    maybeSkipWs();
    if (len == 0) {
      return false;
    }
    bool ok = false;
    switch (*buf) {
    case '{':
      ok = parse_object();
      break;
    case '[':
      ok = parse_array();
      break;
    case '"':
      ok = parse_string();
      break;
    case 't':
      ok = parse_true();
      break;
    case 'f':
      ok = parse_false();
      break;
    case 'n':
      ok = parse_null();
      break;
    default:
      ok = parse_number();
      break;
    }
    if (!ok) {
      return false;
    }
    maybeSkipWs();
    return true;
  }

  // object: '{' ws '}' | '{' members '}'
  bool parse_object() {
    if (!parseLiteral("{")) {
      return false;
    }
    callbacks->on_begin_object(data);
    maybeSkipWs();
    if (len == 0) {
      return false;
    }
    // A failure inside the member list must reject the object; otherwise
    // input such as `{"a":}` would be accepted.
    if (*buf != '}' && !parse_members()) {
      return false;
    }
    if (!parseLiteral("}")) {
      return false;
    }
    callbacks->on_end_object(data);
    return true;
  }

  // members: member | member ',' members (written as a loop).
  bool parse_members() {
    for (;;) {
      if (!parse_member()) {
        return false;
      }
      if (len == 0) {
        return false; // input ended before the closing '}'
      }
      if (*buf != ',') {
        return true;
      }
      ++buf; // consume ','
      --len;
    }
  }

  // member: ws string ws ':' element
  bool parse_member() {
    maybeSkipWs();
    if (!parse_string()) {
      return false;
    }
    maybeSkipWs();
    if (!parseLiteral(":")) {
      return false;
    }
    return parse_element();
  }

  // array: '[' ws ']' | '[' elements ']'
  bool parse_array() {
    if (!parseLiteral("[")) {
      return false;
    }
    callbacks->on_begin_array(data);
    maybeSkipWs();
    if (len == 0) {
      return false;
    }
    if (*buf != ']' && !parse_elements()) {
      return false;
    }
    if (!parseLiteral("]")) {
      return false;
    }
    callbacks->on_end_array(data);
    return true;
  }

  // elements: element | element ',' elements (written as a loop).
  bool parse_elements() {
    for (;;) {
      if (!parse_element()) {
        return false;
      }
      if (len == 0) {
        return false; // input ended before the closing ']'
      }
      if (*buf != ',') {
        return true;
      }
      ++buf; // consume ','
      --len;
    }
  }

  // string: '"' characters '"', without escape handling. The bytes between
  // the quotes are delivered in a single on_string_data call.
  bool parse_string() {
    if (!parseLiteral("\"")) {
      return false;
    }
    // Only announce the string once the opening quote has been seen.
    callbacks->on_begin_string(data);
    const auto *closing =
        static_cast<const char *>(std::memchr(buf, '"', len));
    if (closing == nullptr) {
      return false; // unterminated string
    }
    const int stringLen = static_cast<int>(closing - buf);
    callbacks->on_string_data(data, buf, stringLen);
    // Step over the payload and the closing quote.
    buf += stringLen + 1;
    len -= stringLen + 1;
    callbacks->on_end_string(data);
    return true;
  }

  // number: restricted to `digit | onenine digits`. The digits are delivered
  // in a single on_number_data call. Reaching the end of input after at least
  // one digit is fine: a bare top-level number is a valid document.
  bool parse_number() {
    const char *const first = buf;
    if (len > 0 && *buf == '0') {
      // A leading zero is a complete integer on its own; any digit that
      // follows is left unconsumed and rejected by the caller.
      ++buf;
      --len;
    } else {
      while (len > 0 && '0' <= *buf && *buf <= '9') {
        ++buf;
        --len;
      }
    }
    if (buf == first) {
      return false; // not a digit at all
    }
    callbacks->on_begin_number(data);
    callbacks->on_number_data(data, first, static_cast<int>(buf - first));
    callbacks->on_end_number(data);
    return true;
  }

  // "true"
  bool parse_true() {
    if (!parseLiteral("true")) {
      return false;
    }
    callbacks->on_true_literal(data);
    return true;
  }

  // "false"
  bool parse_false() {
    if (!parseLiteral("false")) {
      return false;
    }
    callbacks->on_false_literal(data);
    return true;
  }

  // "null"
  bool parse_null() {
    if (!parseLiteral("null")) {
      return false;
    }
    callbacks->on_null_literal(data);
    return true;
  }
};
// Sample JSON document (nested objects, strings and one array of strings).
// The "parser1" test case parses it once with the printing callbacks and then
// uses it as the benchmark payload. It contains no numbers, literals or
// string escapes.
const std::string json = R"({
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
})";
// Builds a Callbacks table in which every handler writes the name of its
// event to stdout, one line per event. The two data handlers also print the
// received bytes between backticks. The `data` argument is ignored.
Callbacks printCallbacks() {
  Callbacks tracer;

  // Generic value boundaries.
  tracer.on_begin_value = [](void *) { std::puts("on_begin_value"); };
  tracer.on_end_value = [](void *) { std::puts("on_end_value"); };

  // Containers.
  tracer.on_begin_object = [](void *) { std::puts("on_begin_object"); };
  tracer.on_end_object = [](void *) { std::puts("on_end_object"); };
  tracer.on_begin_array = [](void *) { std::puts("on_begin_array"); };
  tracer.on_end_array = [](void *) { std::puts("on_end_array"); };

  // Strings.
  tracer.on_begin_string = [](void *) { std::puts("on_begin_string"); };
  tracer.on_string_data = [](void *, const char *bytes, int count) {
    std::printf("on_string_data `%.*s`\n", count, bytes);
  };
  tracer.on_end_string = [](void *) { std::puts("on_end_string"); };

  // Numbers.
  tracer.on_begin_number = [](void *) { std::puts("on_begin_number"); };
  tracer.on_number_data = [](void *, const char *bytes, int count) {
    std::printf("on_number_data `%.*s`\n", count, bytes);
  };
  tracer.on_end_number = [](void *) { std::puts("on_end_number"); };

  // Literals.
  tracer.on_true_literal = [](void *) { std::puts("on_true_literal"); };
  tracer.on_false_literal = [](void *) { std::puts("on_false_literal"); };
  tracer.on_null_literal = [](void *) { std::puts("on_null_literal"); };

  return tracer;
}
} // namespace
// Parses the sample document once with the printing callbacks and checks it
// is accepted, then benchmarks Parser1 with no-op callbacks. The
// "parser control" run only copies the input, giving a baseline for the cost
// of the copy that the "parser1" run also pays.
TEST_CASE("parser1") {
  Callbacks handlers = printCallbacks();

  // Correctness: the sample document must be accepted.
  auto traced = json;
  Parser1 tracingParser(traced.data(), traced.length(), &handlers, nullptr);
  CHECK(tracingParser.parse());

  // From here on use the default no-op handlers so that printing does not
  // dominate the measurement. The lambdas below read `handlers` by reference.
  handlers = Callbacks{};

  ankerl::nanobench::Bench bench;
  bench.relative(true).batch(json.size()).unit("byte");

  bench.run("parser control", [&]() {
    auto input = json;
    bench.doNotOptimizeAway(input);
  });

  bench.run("parser1", [&]() {
    auto input = json;
    Parser1 benchParser(input.data(), input.length(), &handlers, nullptr);
    bench.doNotOptimizeAway(benchParser.parse());
  });
}