Start organizing code

This commit is contained in:
2025-05-17 12:24:07 -04:00
parent 6f0315e00c
commit 733dc576b7
6 changed files with 524 additions and 479 deletions

View File

@@ -4,7 +4,6 @@
#include <cstdio>
#include <cstring>
#include <initializer_list>
#include <string>
#include <utility>
@@ -12,6 +11,8 @@
#include <nanobench.h>
#include <simdjson.h>
#include "parser.h"
// This is the JSON grammar in McKeeman Form.
// json
@@ -116,93 +117,6 @@
// '000D' ws
// '0009' ws
// Set of event handlers invoked by the parsers in this file. Every handler
// receives the opaque `data` pointer supplied by the caller. Each slot starts
// out pointing at a do-nothing function, so a caller only has to assign the
// events it cares about.
struct Callbacks {
  // Object delimiters.
  void (*on_begin_object)(void *data){ignoreEvent};
  void (*on_end_object)(void *data){ignoreEvent};

  // Strings: a begin event, any number of data chunks (`buf` holds `len`
  // bytes), then an end event.
  void (*on_begin_string)(void *data){ignoreEvent};
  void (*on_string_data)(void *data, const char *buf, int len){ignoreChunk};
  void (*on_end_string)(void *data){ignoreEvent};

  // Array delimiters.
  void (*on_begin_array)(void *data){ignoreEvent};
  void (*on_end_array)(void *data){ignoreEvent};

  // Numbers: same begin / data chunks / end shape as strings.
  void (*on_begin_number)(void *data){ignoreEvent};
  void (*on_number_data)(void *data, const char *buf, int len){ignoreChunk};
  void (*on_end_number)(void *data){ignoreEvent};

  // Literals are reported with a single event each.
  void (*on_true_literal)(void *data){ignoreEvent};
  void (*on_false_literal)(void *data){ignoreEvent};
  void (*on_null_literal)(void *data){ignoreEvent};

private:
  // Default targets for the slots above; both intentionally do nothing.
  static void ignoreEvent(void *) {}
  static void ignoreChunk(void *, const char *, int) {}
};
// Grammar symbols that live on the stack of the pushdown automaton. The
// numeric value of each enumerator is used as an index into per-symbol tables
// (symbolNames, Parser2::table), so the order here must match those tables.
enum Symbol : int8_t {
  // Terminals
  T_COLON = 0, // ':'
  T_TRUE,      // last letter of "true"
  T_FALSE,     // last letter of "false"
  T_NULL,      // last letter of "null"
  T_R,         // 'r'
  T_U,         // 'u'
  T_A,         // 'a'
  T_L,         // 'l'
  T_S,         // 's'
  T_DUBQUOTE,  // '"'

  // Nonterminals
  N_STRING, // Excludes the leading double quote, includes the trailing one
  N_STRING_FROM_ESCAPE, // Immediately after a backslash
  N_NUMBER,
  N_VALUE,
  N_ARRAY_VALUE_OR_END,
  N_OBJECT_VALUE_OR_END,
  N_ARRAY_MAYBE_CONTINUE,
  N_OBJECT_MAYBE_CONTINUE,
  N_WHITESPACE,

  N_PAST_END, // Count of real symbols; must stay the last enumerator
};
// Printable name of every Symbol, indexed by the enumerator's numeric value.
// Used for debug output of the parser stack.
static const char *symbolNames[] = {
    // Terminals
    "T_COLON", "T_TRUE", "T_FALSE", "T_NULL",
    "T_R", "T_U", "T_A", "T_L", "T_S",
    "T_DUBQUOTE",
    // Nonterminals
    "N_STRING", "N_STRING_FROM_ESCAPE",
    "N_NUMBER", "N_VALUE",
    "N_ARRAY_VALUE_OR_END", "N_OBJECT_VALUE_OR_END",
    "N_ARRAY_MAYBE_CONTINUE", "N_OBJECT_MAYBE_CONTINUE",
    "N_WHITESPACE",
};
// One name per symbol: fails to compile if the enum and this table drift apart.
static_assert(sizeof(symbolNames) / sizeof(*symbolNames) == N_PAST_END);
// Compile-time character classification tables, indexed by byte value
// (0..255). An entry is true when the byte belongs to the class.
constexpr static struct Tables {
  constexpr Tables() {
    // Bytes skipped as whitespace: space, line feed, carriage return, tab.
    const char blanks[] = {' ', '\n', '\r', '\t'};
    for (const char c : blanks) {
      whitespace[static_cast<unsigned char>(c)] = true;
    }

    // Bytes accepted inside a number: decimal digits plus '.', '+' and '-'.
    for (char digit = '0'; digit <= '9'; ++digit) {
      number[static_cast<unsigned char>(digit)] = true;
    }
    const char punctuation[] = {'.', '+', '-'};
    for (const char c : punctuation) {
      number[static_cast<unsigned char>(c)] = true;
    }
  }
  alignas(16) bool whitespace[256]{};
  alignas(16) bool number[256]{};
} tables;
namespace {
// Straightforward recursive descent that doesn't handle string escaping and
@@ -453,395 +367,6 @@ private:
#define MUSTTAIL
#endif
// Table-based ll(1) parser that doesn't handle escaping and all numbers, with a
// streaming interface. Does not validate utf-8. Uses O(1) memory.
struct Parser2 {
Parser2(const Callbacks *callbacks, void *data)
: callbacks(callbacks), data(data) {
std::ignore = push({N_WHITESPACE, N_VALUE});
}
void prime(char *buf, int len) {
this->buf = buf;
this->bufEnd = buf + len;
}
enum Status {
// Accept input
S_OK,
// Consumed available input. Prime more and parse again
S_AGAIN,
// Invalid json
S_REJECT,
// json is too deeply nested
S_OVERFLOW,
};
[[nodiscard]] Status parse() { return keepGoing(this); }
Parser2(Parser2 const &) = delete;
Parser2 &operator=(Parser2 const &) = delete;
Parser2(Parser2 &&) = delete;
Parser2 &operator=(Parser2 &&) = delete;
static constexpr int kMaxStackSize = 1 << 10;
private:
// Helpers
void maybeSkipWs() {
while (buf != bufEnd && tables.whitespace[*buf]) {
++buf;
}
}
Status parse_number() {
char *const bufBefore = buf;
while (len() > 0) {
if (tables.number[*buf]) {
++buf;
} else {
break;
}
}
if (buf != bufBefore) {
callbacks->on_number_data(data, bufBefore, buf - bufBefore);
}
if (len() == 0) {
return S_AGAIN;
}
callbacks->on_end_number(data);
return S_OK;
}
Status parse_string(bool fromEscape) {
auto *result = buf;
if (fromEscape) {
if (*result == '\"') {
++result;
}
pop();
if (Status s = push({N_STRING})) {
return s;
}
}
for (;;) {
result = (char *)memchr(result, '"', bufEnd - result);
if (result == nullptr) {
callbacks->on_string_data(data, buf, len());
if (bufEnd[-1] == '\\') {
pop();
if (Status s = push({N_STRING_FROM_ESCAPE})) {
return s;
}
}
return S_AGAIN;
}
if (result != buf && result[-1] == '\\') {
++result;
if (result == bufEnd) {
callbacks->on_string_data(data, buf, len());
return S_AGAIN;
}
continue;
}
break;
}
int stringLen = result - buf;
if (stringLen > 0) {
callbacks->on_string_data(data, buf, stringLen);
}
buf += stringLen + 1;
callbacks->on_end_string(data);
return S_OK;
}
typedef Status (*continuation)(Parser2 *);
[[maybe_unused]] void debugPrint() {
for (int i = 0; i < stackPtr - stack; ++i) {
printf("%s ", symbolNames[stack[i]]);
}
printf("\n");
}
static Status keepGoing(Parser2 *self) {
if (self->len() == 0) {
return S_AGAIN;
}
// self->debugPrint();
MUSTTAIL return table[*(self->stackPtr - 1)](self);
}
static Status string(Parser2 *self) {
if (Status s = self->parse_string(false)) {
return s;
}
self->pop();
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
static Status stringFromEscape(Parser2 *self) {
if (Status s = self->parse_string(true)) {
return s;
}
self->pop();
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
static Status number(Parser2 *self) {
if (Status s = self->parse_number()) {
return s;
}
self->pop();
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
static Status value(Parser2 *self) {
switch (*self->buf) {
case '{':
++self->buf;
self->callbacks->on_begin_object(self->data);
self->pop();
if (Status s = self->push({N_WHITESPACE, N_OBJECT_VALUE_OR_END})) {
return s;
}
break;
case '[':
++self->buf;
self->callbacks->on_begin_array(self->data);
self->pop();
if (Status s = self->push({N_WHITESPACE, N_ARRAY_VALUE_OR_END})) {
return s;
}
break;
case '"':
++self->buf;
self->pop();
self->callbacks->on_begin_string(self->data);
if (Status s = self->push({N_STRING})) {
return s;
}
break;
case 't':
++self->buf;
self->pop();
if (Status s = self->push({T_R, T_U, T_TRUE})) {
return s;
}
break;
case 'f':
++self->buf;
self->pop();
if (Status s = self->push({T_A, T_L, T_S, T_FALSE})) {
return s;
}
break;
case 'n':
++self->buf;
self->pop();
if (Status s = self->push({T_U, T_L, T_NULL})) {
return s;
}
break;
default:
self->pop();
self->callbacks->on_begin_number(self->data);
if (Status s = self->push({N_NUMBER})) {
return s;
}
break;
}
MUSTTAIL return keepGoing(self);
}
static Status arrayOrEnd(Parser2 *self) {
if (*self->buf == ']') {
++self->buf;
self->pop();
self->callbacks->on_end_array(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
} else {
self->pop();
if (Status s =
self->push({N_VALUE, N_WHITESPACE, N_ARRAY_MAYBE_CONTINUE})) {
return s;
}
MUSTTAIL return keepGoing(self);
}
}
static Status objectOrEnd(Parser2 *self) {
if (*self->buf == '}') {
++self->buf;
self->pop();
self->callbacks->on_end_object(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
} else if (*self->buf == '"') {
self->callbacks->on_begin_string(self->data);
++self->buf;
self->pop();
if (Status s =
self->push({N_STRING, N_WHITESPACE, T_COLON, N_WHITESPACE,
N_VALUE, N_WHITESPACE, N_OBJECT_MAYBE_CONTINUE})) {
return s;
}
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status arrayContinue(Parser2 *self) {
if (*self->buf == ',') {
++self->buf;
self->pop();
if (Status s = self->push(
{N_WHITESPACE, N_VALUE, N_WHITESPACE, N_ARRAY_MAYBE_CONTINUE})) {
return s;
}
MUSTTAIL return keepGoing(self);
} else if (*self->buf == ']') {
++self->buf;
self->pop();
self->callbacks->on_end_array(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status objectContinue(Parser2 *self) {
if (*self->buf == ',') {
++self->buf;
self->pop();
if (Status s = self->push({N_WHITESPACE, T_DUBQUOTE, N_STRING,
N_WHITESPACE, T_COLON, N_WHITESPACE, N_VALUE,
N_WHITESPACE, N_OBJECT_MAYBE_CONTINUE})) {
return s;
}
MUSTTAIL return keepGoing(self);
} else if (*self->buf == '}') {
++self->buf;
self->pop();
self->callbacks->on_end_object(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status finishTrue(Parser2 *self) {
if (*self->buf++ == 'e') {
self->pop();
self->callbacks->on_true_literal(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status finishFalse(Parser2 *self) {
if (*self->buf++ == 'e') {
self->pop();
self->callbacks->on_false_literal(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status finishNull(Parser2 *self) {
if (*self->buf++ == 'l') {
self->pop();
self->callbacks->on_null_literal(self->data);
if (self->empty()) {
return S_OK;
}
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
template <char kChar> static Status singleChar(Parser2 *self) {
if (*self->buf++ == kChar) {
self->pop();
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status dubquote(Parser2 *self) {
if (*self->buf++ == '"') {
self->callbacks->on_begin_string(self->data);
self->pop();
MUSTTAIL return keepGoing(self);
}
return S_REJECT;
}
static Status whitespace(Parser2 *self) {
self->maybeSkipWs();
if (self->len() == 0) {
return S_AGAIN;
}
self->pop();
MUSTTAIL return keepGoing(self);
}
static constexpr continuation table[] = {
/*T_COLON*/ singleChar<':'>,
/*T_TRUE*/ finishTrue,
/*T_FALSE*/ finishFalse,
/*T_NULL*/ finishNull,
/*T_R*/ singleChar<'r'>,
/*T_U*/ singleChar<'u'>,
/*T_A*/ singleChar<'a'>,
/*T_L*/ singleChar<'l'>,
/*T_S*/ singleChar<'s'>,
/*T_DUBQUOTE*/ dubquote,
/*N_STRING*/ string,
/*N_STRING_FROM_ESCAPE*/ stringFromEscape,
/*N_NUMBER*/ number,
/*N_VALUE*/ value,
/*N_ARRAY_VALUE_OR_END*/ arrayOrEnd,
/*N_OBJECT_VALUE_OR_END*/ objectOrEnd,
/*N_ARRAY_MAYBE_CONTINUE*/ arrayContinue,
/*N_OBJECT_MAYBE_CONTINUE*/ objectContinue,
/*N_WHITESPACE*/ whitespace,
};
static_assert(sizeof(table) / sizeof(table[0]) == N_PAST_END);
char *buf = nullptr;
char *bufEnd = nullptr;
int len() const { return bufEnd - buf; }
const Callbacks *const callbacks;
void *const data;
Symbol stack[kMaxStackSize];
Symbol *stackPtr = stack;
bool empty() const { return stackPtr == stack; }
void pop() {
assert(!empty());
--stackPtr;
}
[[nodiscard]] Status push(std::initializer_list<Symbol> symbols) {
if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] {
return S_OVERFLOW;
}
for (int i = symbols.size() - 1; i >= 0; --i) {
*stackPtr++ = *(symbols.begin() + i);
}
return S_OK;
}
};
const std::string json = R"({
"a number": 12345,
"true": true,
@@ -972,6 +497,24 @@ Callbacks minifyCallbacks() {
return result;
}
// Builds a Callbacks table in which every slot is explicitly set to its own
// captureless lambda that does nothing.
Callbacks noopCallbacks() {
  Callbacks cb;

  // Containers
  cb.on_begin_object = +[](void *) {};
  cb.on_end_object = +[](void *) {};
  cb.on_begin_array = +[](void *) {};
  cb.on_end_array = +[](void *) {};

  // Strings
  cb.on_begin_string = +[](void *) {};
  cb.on_string_data = +[](void *, const char *, int) {};
  cb.on_end_string = +[](void *) {};

  // Numbers
  cb.on_begin_number = +[](void *) {};
  cb.on_number_data = +[](void *, const char *, int) {};
  cb.on_end_number = +[](void *) {};

  // Literals
  cb.on_true_literal = +[](void *) {};
  cb.on_false_literal = +[](void *) {};
  cb.on_null_literal = +[](void *) {};

  return cb;
}
} // namespace
TEST_CASE("parser1") {
@@ -1016,7 +559,7 @@ TEST_CASE("parser2") {
}
TEST_CASE("bench1") {
auto c = Callbacks{};
auto c = noopCallbacks();
ankerl::nanobench::Bench bench;
bench.batch(json.size());
bench.unit("byte");
@@ -1028,7 +571,7 @@ TEST_CASE("bench1") {
}
TEST_CASE("bench2") {
auto c = Callbacks{};
auto c = noopCallbacks();
ankerl::nanobench::Bench bench;
bench.batch(json.size());
bench.unit("byte");