Parser2: pushdown automata

This commit is contained in:
2025-05-12 17:42:17 -04:00
parent 9e56aa9612
commit d90ea31ded

View File

@@ -114,8 +114,6 @@
// '0009' ws
struct Callbacks {
void (*on_begin_value)(void *data) = noop;
void (*on_end_value)(void *data) = noop;
void (*on_begin_object)(void *data) = noop;
void (*on_end_object)(void *data) = noop;
void (*on_begin_string)(void *data) = noop;
@@ -137,28 +135,36 @@ private:
// Terminals and Nonterminals. These appear in the stack of the pushdown
// automata
enum Symbol : uint8_t {
enum Symbol : int8_t {
// Terminals
T_INVALID,
T_EOF,
T_LBRACE,
T_RBRACE,
T_COMMA,
T_TRUE,
T_FALSE,
T_NULL,
T_ATOM, // Multibyte!
T_STRING, // Multibyte!
T_LBRACKET,
T_RBRACKET,
T_COLON,
T_DOUBLEQUOTE,
N_CHARACTER, // Multibyte!
T_PAST_END, // Must be last terminal
// Nonterminals
N_VALUE,
N_VALUE = T_PAST_END,
N_ARRAY_MAYBE_CONTINUE,
N_OBJECT,
N_ARRAY,
N_STRING,
N_NUMBER,
N_MEMBER,
N_ELEMENTS,
N_CHARACTERS,
N_OBJECT_MAYBE_CONTINUE,
N_PAST_END, // Must be last nonterminal
};
const char *symbolNames[] = {
"T_INVALID", "T_EOF",
"T_LBRACE", "T_RBRACE",
"T_COMMA", "T_ATOM",
"T_STRING", "T_LBRACKET",
"T_RBRACKET", "T_COLON",
"N_VALUE", "N_ARRAY_MAYBE_CONTINUE",
"N_OBJECT", "N_OBJECT_MAYBE_CONTINUE",
"N_PAST_END",
};
namespace {
@@ -167,14 +173,14 @@ bool whitespace(char x) {
return x == 0x20 || x == 0x0A || x == 0x0D || x == 0x09;
}
// Straightforward recursive descent that doesn't handle string escaping or
// non-integer or negative numbers
// Straightforward recursive descent that doesn't handle string escaping and
// treats numbers as [0-9]+
struct Parser1 {
Parser1(char *buf, int len, const Callbacks *callbacks, void *data)
: buf(buf), len(len), callbacks(callbacks), data(data) {}
// Returns false to reject
bool parse() { return parse_element(); }
[[nodiscard]] bool parse() { return parse_element(); }
Parser1(Parser1 const &) = delete;
Parser1 &operator=(Parser1 const &) = delete;
@@ -399,6 +405,311 @@ private:
}
};
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif
struct Parser2 {
Parser2(char *buf, int len, const Callbacks *callbacks, void *data)
: buf(buf), len(len), callbacks(callbacks), data(data) {}
// Returns false to reject
[[nodiscard]] bool parse() {
stack.push_back(N_VALUE);
nextToken();
return keepGoing(this);
}
Parser2(Parser2 const &) = delete;
Parser2 &operator=(Parser2 const &) = delete;
Parser2(Parser2 &&) = delete;
Parser2 &operator=(Parser2 &&) = delete;
private:
// Helpers
void maybeSkipWs() {
while (len > 0 && whitespace(*buf)) {
++buf;
--len;
}
}
bool parseLiteral(const char *literal) {
const int litLen = strlen(literal);
if (len < litLen) {
return false;
}
len -= litLen;
return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0;
}
bool parse_number() {
callbacks->on_begin_number(data);
char *const bufBefore = buf;
for (;;) {
if (len == 0) {
return false;
}
if ('0' <= *buf && *buf <= '9') {
++buf;
--len;
} else {
break;
}
}
if (buf == bufBefore) {
return false;
}
callbacks->on_number_data(data, bufBefore, buf - bufBefore);
callbacks->on_end_number(data);
return true;
}
bool parse_string() {
callbacks->on_begin_string(data);
if (!parseLiteral("\"")) {
return false;
}
auto *result = (char *)memchr(buf, '"', len);
if (result == nullptr) {
return false;
}
int stringLen = result - buf;
callbacks->on_string_data(data, buf, stringLen);
buf += stringLen;
len -= stringLen;
if (!parseLiteral("\"")) {
return false;
}
callbacks->on_end_string(data);
return true;
}
typedef bool (*continuation)(Parser2 *);
void printStack() {
printf("token: %s\n", symbolNames[currentToken]);
for (auto s : stack) {
printf("%s ", symbolNames[s]);
}
printf("\n");
}
static bool keepGoing(Parser2 *self) {
// self->printStack();
if (self->stack.empty()) {
assert(self->currentToken == T_EOF);
return true;
}
if (self->stack.back() == self->currentToken) {
self->stack.pop_back();
self->nextToken();
MUSTTAIL return keepGoing(self);
}
// If the top of the stack is a terminal that doesn't match, reject
if (self->stack.back() < T_PAST_END) {
return false;
}
MUSTTAIL return table[self->stack.back() - T_PAST_END][self->currentToken](
self);
}
static bool reject(Parser2 *) { return false; }
static bool object(Parser2 *self) {
assert(self->currentToken == T_LBRACE);
self->callbacks->on_begin_object(self->data);
self->nextToken();
self->stack.pop_back();
self->stack.push_back(N_OBJECT_MAYBE_CONTINUE);
self->stack.push_back(N_VALUE);
self->stack.push_back(T_COLON);
self->stack.push_back(T_STRING);
MUSTTAIL return keepGoing(self);
}
static bool atom(Parser2 *self) {
if (*self->bufBefore == 't') {
self->callbacks->on_true_literal(self->data);
} else if (*self->bufBefore == 'f') {
self->callbacks->on_false_literal(self->data);
} else if (*self->bufBefore == 'n') {
self->callbacks->on_null_literal(self->data);
} else {
self->callbacks->on_begin_number(self->data);
self->callbacks->on_number_data(self->data, self->bufBefore + 1,
self->buf - self->bufBefore - 2);
self->callbacks->on_end_number(self->data);
}
self->nextToken();
self->stack.pop_back();
MUSTTAIL return keepGoing(self);
}
static bool string(Parser2 *self) {
assert(self->currentToken == T_STRING);
self->nextToken();
self->stack.pop_back();
MUSTTAIL return keepGoing(self);
}
static bool array(Parser2 *self) {
assert(self->currentToken == T_LBRACKET);
self->callbacks->on_begin_array(self->data);
self->nextToken();
self->stack.pop_back();
self->stack.push_back(N_ARRAY_MAYBE_CONTINUE);
self->stack.push_back(N_VALUE);
MUSTTAIL return keepGoing(self);
}
static bool continueArray(Parser2 *self) {
assert(self->currentToken == T_COMMA);
self->nextToken();
self->stack.pop_back();
self->stack.push_back(N_ARRAY_MAYBE_CONTINUE);
self->stack.push_back(N_VALUE);
MUSTTAIL return keepGoing(self);
}
static bool continueObject(Parser2 *self) {
assert(self->currentToken == T_COMMA);
self->nextToken();
self->stack.pop_back();
self->stack.push_back(N_OBJECT_MAYBE_CONTINUE);
self->stack.push_back(N_VALUE);
self->stack.push_back(T_COLON);
self->stack.push_back(T_STRING);
MUSTTAIL return keepGoing(self);
}
static bool finishArray(Parser2 *self) {
assert(self->currentToken == T_RBRACKET);
self->callbacks->on_end_array(self->data);
self->nextToken();
self->stack.pop_back();
MUSTTAIL return keepGoing(self);
}
static bool finishObject(Parser2 *self) {
assert(self->currentToken == T_RBRACE);
self->callbacks->on_end_object(self->data);
self->nextToken();
self->stack.pop_back();
MUSTTAIL return keepGoing(self);
}
// table[nonterminal][terminal]
static constexpr continuation table[N_PAST_END - T_PAST_END][T_PAST_END] = {
/*N_VALUE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ object,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ atom,
/*T_STRING*/ string,
/*T_LBRACKET*/ array,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*N_ARRAY_MAYBE_CONTINUE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ continueArray,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ finishArray,
/*T_COLON*/ reject,
},
/*N_OBJECT*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ object,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*N_OBJECT_MAYBE_CONTINUE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ finishObject,
/*T_COMMA*/ continueObject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
};
Symbol currentToken;
const char *bufBefore;
Symbol nextToken() {
maybeSkipWs();
bufBefore = buf;
if (len == 0) {
return currentToken = T_EOF;
}
if (*buf == '{') {
parseLiteral("{");
return currentToken = T_LBRACE;
} else if (*buf == '[') {
parseLiteral("[");
return currentToken = T_LBRACKET;
} else if (*buf == '}') {
parseLiteral("}");
return currentToken = T_RBRACE;
} else if (*buf == ']') {
parseLiteral("]");
return currentToken = T_RBRACKET;
} else if (*buf == ':') {
parseLiteral(":");
return currentToken = T_COLON;
} else if (*buf == ',') {
parseLiteral(",");
return currentToken = T_COMMA;
} else if (*buf == '"') {
if (!parse_string()) {
return currentToken = T_INVALID;
}
return currentToken = T_STRING;
} else if (*buf == 't') {
if (!parseLiteral("true")) {
return currentToken = T_INVALID;
}
return currentToken = T_ATOM;
} else if (*buf == 'f') {
if (!parseLiteral("false")) {
return currentToken = T_INVALID;
}
} else if (*buf == 'n') {
if (!parseLiteral("null")) {
return currentToken = T_INVALID;
}
} else {
if (!parse_number()) {
return currentToken = T_INVALID;
}
}
return currentToken = T_ATOM;
}
char *buf;
int len;
const Callbacks *const callbacks;
void *const data;
std::vector<Symbol> stack;
};
const std::string json = R"({
"glossary": {
"title": "example glossary",
@@ -424,8 +735,6 @@ const std::string json = R"({
Callbacks printCallbacks() {
Callbacks result;
result.on_begin_value = +[](void *) { puts("on_begin_value"); };
result.on_end_value = +[](void *) { puts("on_end_value"); };
result.on_begin_object = +[](void *) { puts("on_begin_object"); };
result.on_end_object = +[](void *) { puts("on_end_object"); };
result.on_begin_string = +[](void *) { puts("on_begin_string"); };
@@ -453,8 +762,17 @@ TEST_CASE("parser1") {
auto copy = json;
Parser1 parser(copy.data(), copy.length(), &c, nullptr);
CHECK(parser.parse());
}
c = Callbacks{};
TEST_CASE("parser2") {
Callbacks c = printCallbacks();
auto copy = json;
Parser2 parser(copy.data(), copy.length(), &c, nullptr);
CHECK(parser.parse());
}
TEST_CASE("bench") {
auto c = Callbacks{};
ankerl::nanobench::Bench bench;
bench.relative(true);
bench.batch(json.size());
@@ -468,4 +786,9 @@ TEST_CASE("parser1") {
Parser1 parser(copy.data(), copy.length(), &c, nullptr);
bench.doNotOptimizeAway(parser.parse());
});
bench.run("parser2", [&]() {
auto copy = json;
Parser2 parser(copy.data(), copy.length(), &c, nullptr);
bench.doNotOptimizeAway(parser.parse());
});
}