Parser2: pushdown automaton
This commit is contained in:
365
src/test.cpp
365
src/test.cpp
@@ -114,8 +114,6 @@
|
||||
// '0009' ws
|
||||
|
||||
struct Callbacks {
|
||||
void (*on_begin_value)(void *data) = noop;
|
||||
void (*on_end_value)(void *data) = noop;
|
||||
void (*on_begin_object)(void *data) = noop;
|
||||
void (*on_end_object)(void *data) = noop;
|
||||
void (*on_begin_string)(void *data) = noop;
|
||||
@@ -137,28 +135,36 @@ private:
|
||||
|
||||
// Terminals and nonterminals. These appear in the stack of the pushdown
// automaton.
//
// Layout contract relied on by the parse table and by symbolNames:
//   * terminals occupy the values [T_INVALID, T_PAST_END)
//   * nonterminals occupy the values [N_VALUE, N_PAST_END)
//   * N_VALUE == T_PAST_END, so `symbol < T_PAST_END` identifies a terminal
//     and `symbol - T_PAST_END` is a zero-based nonterminal index.
enum Symbol : int8_t {
  // Terminals
  T_INVALID,
  T_EOF,
  T_LBRACE,
  T_RBRACE,
  T_COMMA,
  T_ATOM,   // Multibyte!
  T_STRING, // Multibyte!
  T_LBRACKET,
  T_RBRACKET,
  T_COLON,
  T_PAST_END, // Must be last terminal
  // Nonterminals
  N_VALUE = T_PAST_END, // First nonterminal; deliberately aliases T_PAST_END
  N_ARRAY_MAYBE_CONTINUE,
  N_OBJECT,
  N_OBJECT_MAYBE_CONTINUE,
  N_PAST_END, // Must be last nonterminal
};
|
||||
|
||||
// Printable name for each Symbol, indexed by the symbol's numeric value.
// There is no "T_PAST_END" entry because N_VALUE is declared with the same
// value, so index 10 is reported as "N_VALUE". Both the pointers and the
// array itself are const: this is a read-only lookup table.
const char *const symbolNames[] = {
    // Terminals
    "T_INVALID",  // 0
    "T_EOF",      // 1
    "T_LBRACE",   // 2
    "T_RBRACE",   // 3
    "T_COMMA",    // 4
    "T_ATOM",     // 5
    "T_STRING",   // 6
    "T_LBRACKET", // 7
    "T_RBRACKET", // 8
    "T_COLON",    // 9
    // Nonterminals
    "N_VALUE",                 // 10
    "N_ARRAY_MAYBE_CONTINUE",  // 11
    "N_OBJECT",                // 12
    "N_OBJECT_MAYBE_CONTINUE", // 13
    "N_PAST_END",              // 14
};
|
||||
|
||||
namespace {
|
||||
@@ -167,14 +173,14 @@ bool whitespace(char x) {
|
||||
return x == 0x20 || x == 0x0A || x == 0x0D || x == 0x09;
|
||||
}
|
||||
|
||||
// Straightforward recursive descent that doesn't handle string escaping or
|
||||
// non-integer or negative numbers
|
||||
// Straightforward recursive descent that doesn't handle string escaping and
|
||||
// treats numbers as [0-9]+
|
||||
struct Parser1 {
|
||||
Parser1(char *buf, int len, const Callbacks *callbacks, void *data)
|
||||
: buf(buf), len(len), callbacks(callbacks), data(data) {}
|
||||
|
||||
// Returns false to reject
|
||||
bool parse() { return parse_element(); }
|
||||
[[nodiscard]] bool parse() { return parse_element(); }
|
||||
|
||||
Parser1(Parser1 const &) = delete;
|
||||
Parser1 &operator=(Parser1 const &) = delete;
|
||||
@@ -399,6 +405,311 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
// Compilers without __has_attribute get a stand-in that reports every
// attribute as unsupported.
#if !defined(__has_attribute)
#define __has_attribute(x) 0
#endif

// MUSTTAIL is placed in front of a `return f(...)` statement. It expands to
// the musttail attribute when the compiler reports support for it and to
// nothing otherwise.
#if !__has_attribute(musttail)
#define MUSTTAIL
#else
#define MUSTTAIL __attribute__((musttail))
#endif
|
||||
|
||||
struct Parser2 {
|
||||
Parser2(char *buf, int len, const Callbacks *callbacks, void *data)
|
||||
: buf(buf), len(len), callbacks(callbacks), data(data) {}
|
||||
|
||||
// Returns false to reject
|
||||
[[nodiscard]] bool parse() {
|
||||
stack.push_back(N_VALUE);
|
||||
nextToken();
|
||||
return keepGoing(this);
|
||||
}
|
||||
|
||||
Parser2(Parser2 const &) = delete;
|
||||
Parser2 &operator=(Parser2 const &) = delete;
|
||||
Parser2(Parser2 &&) = delete;
|
||||
Parser2 &operator=(Parser2 &&) = delete;
|
||||
|
||||
private:
|
||||
// Helpers
|
||||
void maybeSkipWs() {
|
||||
while (len > 0 && whitespace(*buf)) {
|
||||
++buf;
|
||||
--len;
|
||||
}
|
||||
}
|
||||
bool parseLiteral(const char *literal) {
|
||||
const int litLen = strlen(literal);
|
||||
if (len < litLen) {
|
||||
return false;
|
||||
}
|
||||
len -= litLen;
|
||||
return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0;
|
||||
}
|
||||
bool parse_number() {
|
||||
callbacks->on_begin_number(data);
|
||||
char *const bufBefore = buf;
|
||||
for (;;) {
|
||||
if (len == 0) {
|
||||
return false;
|
||||
}
|
||||
if ('0' <= *buf && *buf <= '9') {
|
||||
++buf;
|
||||
--len;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (buf == bufBefore) {
|
||||
return false;
|
||||
}
|
||||
callbacks->on_number_data(data, bufBefore, buf - bufBefore);
|
||||
callbacks->on_end_number(data);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_string() {
|
||||
callbacks->on_begin_string(data);
|
||||
if (!parseLiteral("\"")) {
|
||||
return false;
|
||||
}
|
||||
auto *result = (char *)memchr(buf, '"', len);
|
||||
if (result == nullptr) {
|
||||
return false;
|
||||
}
|
||||
int stringLen = result - buf;
|
||||
callbacks->on_string_data(data, buf, stringLen);
|
||||
buf += stringLen;
|
||||
len -= stringLen;
|
||||
if (!parseLiteral("\"")) {
|
||||
return false;
|
||||
}
|
||||
callbacks->on_end_string(data);
|
||||
return true;
|
||||
}
|
||||
|
||||
typedef bool (*continuation)(Parser2 *);
|
||||
|
||||
void printStack() {
|
||||
printf("token: %s\n", symbolNames[currentToken]);
|
||||
for (auto s : stack) {
|
||||
printf("%s ", symbolNames[s]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static bool keepGoing(Parser2 *self) {
|
||||
// self->printStack();
|
||||
if (self->stack.empty()) {
|
||||
assert(self->currentToken == T_EOF);
|
||||
return true;
|
||||
}
|
||||
if (self->stack.back() == self->currentToken) {
|
||||
self->stack.pop_back();
|
||||
self->nextToken();
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
// If the top of the stack is a terminal that doesn't match, reject
|
||||
if (self->stack.back() < T_PAST_END) {
|
||||
return false;
|
||||
}
|
||||
MUSTTAIL return table[self->stack.back() - T_PAST_END][self->currentToken](
|
||||
self);
|
||||
}
|
||||
|
||||
static bool reject(Parser2 *) { return false; }
|
||||
static bool object(Parser2 *self) {
|
||||
assert(self->currentToken == T_LBRACE);
|
||||
self->callbacks->on_begin_object(self->data);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
self->stack.push_back(N_OBJECT_MAYBE_CONTINUE);
|
||||
self->stack.push_back(N_VALUE);
|
||||
self->stack.push_back(T_COLON);
|
||||
self->stack.push_back(T_STRING);
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool atom(Parser2 *self) {
|
||||
if (*self->bufBefore == 't') {
|
||||
self->callbacks->on_true_literal(self->data);
|
||||
} else if (*self->bufBefore == 'f') {
|
||||
self->callbacks->on_false_literal(self->data);
|
||||
} else if (*self->bufBefore == 'n') {
|
||||
self->callbacks->on_null_literal(self->data);
|
||||
} else {
|
||||
self->callbacks->on_begin_number(self->data);
|
||||
self->callbacks->on_number_data(self->data, self->bufBefore + 1,
|
||||
self->buf - self->bufBefore - 2);
|
||||
self->callbacks->on_end_number(self->data);
|
||||
}
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool string(Parser2 *self) {
|
||||
assert(self->currentToken == T_STRING);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool array(Parser2 *self) {
|
||||
assert(self->currentToken == T_LBRACKET);
|
||||
self->callbacks->on_begin_array(self->data);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
self->stack.push_back(N_ARRAY_MAYBE_CONTINUE);
|
||||
self->stack.push_back(N_VALUE);
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool continueArray(Parser2 *self) {
|
||||
assert(self->currentToken == T_COMMA);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
self->stack.push_back(N_ARRAY_MAYBE_CONTINUE);
|
||||
self->stack.push_back(N_VALUE);
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool continueObject(Parser2 *self) {
|
||||
assert(self->currentToken == T_COMMA);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
self->stack.push_back(N_OBJECT_MAYBE_CONTINUE);
|
||||
self->stack.push_back(N_VALUE);
|
||||
self->stack.push_back(T_COLON);
|
||||
self->stack.push_back(T_STRING);
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool finishArray(Parser2 *self) {
|
||||
assert(self->currentToken == T_RBRACKET);
|
||||
self->callbacks->on_end_array(self->data);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
static bool finishObject(Parser2 *self) {
|
||||
assert(self->currentToken == T_RBRACE);
|
||||
self->callbacks->on_end_object(self->data);
|
||||
self->nextToken();
|
||||
self->stack.pop_back();
|
||||
MUSTTAIL return keepGoing(self);
|
||||
}
|
||||
|
||||
// table[nonterminal][terminal]
|
||||
static constexpr continuation table[N_PAST_END - T_PAST_END][T_PAST_END] = {
|
||||
/*N_VALUE*/
|
||||
{
|
||||
/*T_INVALID*/ reject,
|
||||
/*T_EOF*/ reject,
|
||||
/*T_LBRACE*/ object,
|
||||
/*T_RBRACE*/ reject,
|
||||
/*T_COMMA*/ reject,
|
||||
/*T_ATOM*/ atom,
|
||||
/*T_STRING*/ string,
|
||||
/*T_LBRACKET*/ array,
|
||||
/*T_RBRACKET*/ reject,
|
||||
/*T_COLON*/ reject,
|
||||
},
|
||||
/*N_ARRAY_MAYBE_CONTINUE*/
|
||||
{
|
||||
/*T_INVALID*/ reject,
|
||||
/*T_EOF*/ reject,
|
||||
/*T_LBRACE*/ reject,
|
||||
/*T_RBRACE*/ reject,
|
||||
/*T_COMMA*/ continueArray,
|
||||
/*T_ATOM*/ reject,
|
||||
/*T_STRING*/ reject,
|
||||
/*T_LBRACKET*/ reject,
|
||||
/*T_RBRACKET*/ finishArray,
|
||||
/*T_COLON*/ reject,
|
||||
},
|
||||
/*N_OBJECT*/
|
||||
{
|
||||
/*T_INVALID*/ reject,
|
||||
/*T_EOF*/ reject,
|
||||
/*T_LBRACE*/ object,
|
||||
/*T_RBRACE*/ reject,
|
||||
/*T_COMMA*/ reject,
|
||||
/*T_ATOM*/ reject,
|
||||
/*T_STRING*/ reject,
|
||||
/*T_LBRACKET*/ reject,
|
||||
/*T_RBRACKET*/ reject,
|
||||
/*T_COLON*/ reject,
|
||||
},
|
||||
/*N_OBJECT_MAYBE_CONTINUE*/
|
||||
{
|
||||
/*T_INVALID*/ reject,
|
||||
/*T_EOF*/ reject,
|
||||
/*T_LBRACE*/ reject,
|
||||
/*T_RBRACE*/ finishObject,
|
||||
/*T_COMMA*/ continueObject,
|
||||
/*T_ATOM*/ reject,
|
||||
/*T_STRING*/ reject,
|
||||
/*T_LBRACKET*/ reject,
|
||||
/*T_RBRACKET*/ reject,
|
||||
/*T_COLON*/ reject,
|
||||
},
|
||||
};
|
||||
|
||||
Symbol currentToken;
|
||||
const char *bufBefore;
|
||||
Symbol nextToken() {
|
||||
maybeSkipWs();
|
||||
bufBefore = buf;
|
||||
if (len == 0) {
|
||||
return currentToken = T_EOF;
|
||||
}
|
||||
if (*buf == '{') {
|
||||
parseLiteral("{");
|
||||
return currentToken = T_LBRACE;
|
||||
} else if (*buf == '[') {
|
||||
parseLiteral("[");
|
||||
return currentToken = T_LBRACKET;
|
||||
} else if (*buf == '}') {
|
||||
parseLiteral("}");
|
||||
return currentToken = T_RBRACE;
|
||||
} else if (*buf == ']') {
|
||||
parseLiteral("]");
|
||||
return currentToken = T_RBRACKET;
|
||||
} else if (*buf == ':') {
|
||||
parseLiteral(":");
|
||||
return currentToken = T_COLON;
|
||||
} else if (*buf == ',') {
|
||||
parseLiteral(",");
|
||||
return currentToken = T_COMMA;
|
||||
} else if (*buf == '"') {
|
||||
if (!parse_string()) {
|
||||
return currentToken = T_INVALID;
|
||||
}
|
||||
return currentToken = T_STRING;
|
||||
} else if (*buf == 't') {
|
||||
if (!parseLiteral("true")) {
|
||||
return currentToken = T_INVALID;
|
||||
}
|
||||
return currentToken = T_ATOM;
|
||||
} else if (*buf == 'f') {
|
||||
if (!parseLiteral("false")) {
|
||||
return currentToken = T_INVALID;
|
||||
}
|
||||
} else if (*buf == 'n') {
|
||||
if (!parseLiteral("null")) {
|
||||
return currentToken = T_INVALID;
|
||||
}
|
||||
} else {
|
||||
if (!parse_number()) {
|
||||
return currentToken = T_INVALID;
|
||||
}
|
||||
}
|
||||
return currentToken = T_ATOM;
|
||||
}
|
||||
|
||||
char *buf;
|
||||
int len;
|
||||
const Callbacks *const callbacks;
|
||||
void *const data;
|
||||
std::vector<Symbol> stack;
|
||||
};
|
||||
|
||||
const std::string json = R"({
|
||||
"glossary": {
|
||||
"title": "example glossary",
|
||||
@@ -424,8 +735,6 @@ const std::string json = R"({
|
||||
|
||||
Callbacks printCallbacks() {
|
||||
Callbacks result;
|
||||
result.on_begin_value = +[](void *) { puts("on_begin_value"); };
|
||||
result.on_end_value = +[](void *) { puts("on_end_value"); };
|
||||
result.on_begin_object = +[](void *) { puts("on_begin_object"); };
|
||||
result.on_end_object = +[](void *) { puts("on_end_object"); };
|
||||
result.on_begin_string = +[](void *) { puts("on_begin_string"); };
|
||||
@@ -453,8 +762,17 @@ TEST_CASE("parser1") {
|
||||
auto copy = json;
|
||||
Parser1 parser(copy.data(), copy.length(), &c, nullptr);
|
||||
CHECK(parser.parse());
|
||||
}
|
||||
|
||||
c = Callbacks{};
|
||||
// Smoke test: Parser2 must accept the sample `json` document when driven
// with the callbacks returned by printCallbacks().
TEST_CASE("parser2") {
  Callbacks printing = printCallbacks();
  // The parser takes a mutable buffer, so work on a private copy.
  auto input = json;
  Parser2 underTest(input.data(), input.length(), &printing, nullptr);
  CHECK(underTest.parse());
}
|
||||
|
||||
TEST_CASE("bench") {
|
||||
auto c = Callbacks{};
|
||||
ankerl::nanobench::Bench bench;
|
||||
bench.relative(true);
|
||||
bench.batch(json.size());
|
||||
@@ -468,4 +786,9 @@ TEST_CASE("parser1") {
|
||||
Parser1 parser(copy.data(), copy.length(), &c, nullptr);
|
||||
bench.doNotOptimizeAway(parser.parse());
|
||||
});
|
||||
bench.run("parser2", [&]() {
|
||||
auto copy = json;
|
||||
Parser2 parser(copy.data(), copy.length(), &c, nullptr);
|
||||
bench.doNotOptimizeAway(parser.parse());
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user