Files
weaseljson/src/test.cpp
2025-05-13 11:14:51 -04:00

968 lines
22 KiB
C++

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <initializer_list>
#include <string>
#include <utility>
#include <doctest.h>
#include <nanobench.h>
// This is the JSON grammar in McKeeman Form. A rule name starts in the first
// column; each indented line below it is one alternative for that rule.
//
// json
//     element
//
// value
//     object
//     array
//     string
//     number
//     "true"
//     "false"
//     "null"
//
// object
//     '{' ws '}'
//     '{' members '}'
//
// members
//     member
//     member ',' members
//
// member
//     ws string ws ':' element
//
// array
//     '[' ws ']'
//     '[' elements ']'
//
// elements
//     element
//     element ',' elements
//
// element
//     ws value ws
//
// string
//     '"' characters '"'
//
// characters
//     ""
//     character characters
//
// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape
//
// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex
//
// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'
//
// number
//     integer fraction exponent
//
// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits
//
// digits
//     digit
//     digit digits
//
// digit
//     '0'
//     onenine
//
// onenine
//     '1' . '9'
//
// fraction
//     ""
//     '.' digits
//
// exponent
//     ""
//     'E' sign digits
//     'e' sign digits
//
// sign
//     ""
//     '+'
//     '-'
//
// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws
// Table of event handlers invoked by the parsers while they walk a document.
// Every handler receives the opaque `data` pointer that was handed to the
// parser; the *_data handlers additionally receive a (pointer, length) view of
// the bytes that make up the string or number. Each slot starts out pointing
// at a do-nothing function, so a default-constructed table can be used as is.
struct Callbacks {
  // Handler that only receives the user pointer.
  using Event = void (*)(void *);
  // Handler that also receives a byte range.
  using DataEvent = void (*)(void *, const char *, int);

  Event on_begin_object = ignoreEvent;
  Event on_end_object = ignoreEvent;
  Event on_begin_string = ignoreEvent;
  DataEvent on_string_data = ignoreBytes;
  Event on_end_string = ignoreEvent;
  Event on_begin_array = ignoreEvent;
  Event on_end_array = ignoreEvent;
  Event on_begin_number = ignoreEvent;
  DataEvent on_number_data = ignoreBytes;
  Event on_end_number = ignoreEvent;
  Event on_true_literal = ignoreEvent;
  Event on_false_literal = ignoreEvent;
  Event on_null_literal = ignoreEvent;

private:
  // Default targets for the slots above: accept the arguments, do nothing.
  static void ignoreEvent(void *) {}
  static void ignoreBytes(void *, const char *, int) {}
};
// Grammar symbols: terminals (T_*) and nonterminals (N_*). These are the
// values stored on the stack of the pushdown automaton, and the terminals
// double as the token kinds produced by the tokenizer.
enum Symbol : int8_t {
  // ---- Terminals ----
  T_INVALID = 0, // input that does not form a token
  T_EOF,         // end of input
  T_LBRACE,      // '{'
  T_RBRACE,      // '}'
  T_COMMA,       // ','
  T_ATOM,        // Multibyte!
  T_STRING,      // Multibyte!
  T_LBRACKET,    // '['
  T_RBRACKET,    // ']'
  T_COLON,       // ':'
  T_PAST_END,    // Must be last terminal

  // ---- Nonterminals ----
  // Numbering continues where the terminals stop, so a single integer range
  // covers both kinds of symbol.
  N_VALUE = T_PAST_END,
  N_ARRAY_MAYBE_CONTINUE,
  N_OBJECT,
  N_OBJECT_MAYBE_CONTINUE,
  N_PAST_END, // Must be last nonterminal
};
// Printable name for each grammar symbol, indexed by the symbol's numeric
// value. Slot 10 is shared by the terminal sentinel and the first nonterminal,
// and is labelled with the nonterminal's name.
const char *symbolNames[] = {
    /*  0 */ "T_INVALID",
    /*  1 */ "T_EOF",
    /*  2 */ "T_LBRACE",
    /*  3 */ "T_RBRACE",
    /*  4 */ "T_COMMA",
    /*  5 */ "T_ATOM",
    /*  6 */ "T_STRING",
    /*  7 */ "T_LBRACKET",
    /*  8 */ "T_RBRACKET",
    /*  9 */ "T_COLON",
    /* 10 */ "N_VALUE",
    /* 11 */ "N_ARRAY_MAYBE_CONTINUE",
    /* 12 */ "N_OBJECT",
    /* 13 */ "N_OBJECT_MAYBE_CONTINUE",
    /* 14 */ "N_PAST_END",
};
namespace {
// True when `x` is one of the four characters the `ws` grammar rule allows:
// space, line feed, carriage return or horizontal tab.
bool whitespace(char x) {
  switch (x) {
  case ' ':  // 0x20
  case '\n': // 0x0A
  case '\r': // 0x0D
  case '\t': // 0x09
    return true;
  default:
    return false;
  }
}
// Straightforward recursive descent that doesn't handle string escaping and
// treats numbers as [0-9]+
//
// The parser walks `len` bytes starting at `buf` and reports what it finds
// through `callbacks`, passing `data` through to every handler untouched.
// Nested arrays/objects are handled by C++ recursion, so the call depth grows
// with the nesting depth of the input.
struct Parser1 {
  Parser1(char *buf, int len, const Callbacks *callbacks, void *data)
      : buf(buf), len(len), callbacks(callbacks), data(data) {}

  // Returns false to reject.
  //
  // The input is accepted only when it consists of exactly one element
  // (optionally surrounded by whitespace); anything left over after that
  // element causes a rejection. Handlers may already have been invoked for the
  // part of the input that was read before a rejection was detected.
  [[nodiscard]] bool parse() { return parse_element() && len == 0; }

  Parser1(Parser1 const &) = delete;
  Parser1 &operator=(Parser1 const &) = delete;
  Parser1(Parser1 &&) = delete;
  Parser1 &operator=(Parser1 &&) = delete;

private:
  char *buf; // next unread byte
  int len;   // number of unread bytes starting at buf
  const Callbacks *const callbacks;
  void *const data;

  // Helpers

  // Advances past any run of whitespace at the current position.
  void maybeSkipWs() {
    while (len > 0 && whitespace(*buf)) {
      ++buf;
      --len;
    }
  }

  // Consumes `literal` if the unread input starts with it. Returns false, and
  // leaves the position untouched, when it does not.
  bool parseLiteral(const char *literal) {
    const int litLen = static_cast<int>(strlen(literal));
    if (len < litLen || memcmp(buf, literal, litLen) != 0) {
      return false;
    }
    buf += litLen;
    len -= litLen;
    return true;
  }

  // functions corresponding to productions

  // element: ws value ws. The kind of value is chosen from its first byte.
  bool parse_element() {
    maybeSkipWs();
    if (len == 0) {
      return false;
    }
    bool ok = false;
    switch (*buf) {
    case '{':
      ok = parse_object();
      break;
    case '[':
      ok = parse_array();
      break;
    case '"':
      ok = parse_string();
      break;
    case 't':
      ok = parse_true();
      break;
    case 'f':
      ok = parse_false();
      break;
    case 'n':
      ok = parse_null();
      break;
    default:
      ok = parse_number();
      break;
    }
    if (!ok) {
      return false;
    }
    maybeSkipWs();
    return true;
  }

  // object: '{' ws '}' | '{' members '}'
  bool parse_object() {
    if (!parseLiteral("{")) {
      return false;
    }
    callbacks->on_begin_object(data);
    maybeSkipWs();
    if (len == 0) {
      return false;
    }
    // A malformed member list must reject the whole object; otherwise input
    // such as `{x}` would slip through to the closing-brace check below.
    if (*buf != '}' && !parse_members()) {
      return false;
    }
    if (!parseLiteral("}")) {
      return false;
    }
    callbacks->on_end_object(data);
    return true;
  }

  // members: member | member ',' members
  bool parse_members() {
    for (;;) {
      if (!parse_member()) {
        return false;
      }
      if (len == 0) {
        return false;
      }
      if (*buf != ',') {
        return true;
      }
      // Step over the ',' and parse the next member.
      ++buf;
      --len;
    }
  }

  // member: ws string ws ':' element
  bool parse_member() {
    maybeSkipWs();
    if (!parse_string()) {
      return false;
    }
    maybeSkipWs();
    if (!parseLiteral(":")) {
      return false;
    }
    return parse_element();
  }

  // array: '[' ws ']' | '[' elements ']'
  bool parse_array() {
    if (!parseLiteral("[")) {
      return false;
    }
    callbacks->on_begin_array(data);
    maybeSkipWs();
    if (len == 0) {
      return false;
    }
    if (*buf != ']' && !parse_elements()) {
      return false;
    }
    if (!parseLiteral("]")) {
      return false;
    }
    callbacks->on_end_array(data);
    return true;
  }

  // elements: element | element ',' elements
  bool parse_elements() {
    for (;;) {
      if (!parse_element()) {
        return false;
      }
      if (len == 0) {
        return false;
      }
      if (*buf != ',') {
        return true;
      }
      // Step over the ',' and parse the next element.
      ++buf;
      --len;
    }
  }

  // string: everything between a '"' and the next '"'. Escapes are not
  // interpreted, so a backslash-escaped quote ends the string early.
  bool parse_string() {
    if (!parseLiteral("\"")) {
      return false;
    }
    // Only announce a string once its opening quote has really been seen.
    callbacks->on_begin_string(data);
    const char *const closingQuote = static_cast<const char *>(
        memchr(buf, '"', static_cast<std::size_t>(len)));
    if (closingQuote == nullptr) {
      return false;
    }
    const int stringLen = static_cast<int>(closingQuote - buf);
    callbacks->on_string_data(data, buf, stringLen);
    // Skip the contents and the closing quote.
    buf += stringLen + 1;
    len -= stringLen + 1;
    callbacks->on_end_string(data);
    return true;
  }

  // number: one or more ASCII digits. Running into the end of the input
  // terminates the number just like any other non-digit does.
  bool parse_number() {
    char *const firstDigit = buf;
    while (len > 0 && '0' <= *buf && *buf <= '9') {
      ++buf;
      --len;
    }
    if (buf == firstDigit) {
      return false;
    }
    callbacks->on_begin_number(data);
    callbacks->on_number_data(data, firstDigit,
                              static_cast<int>(buf - firstDigit));
    callbacks->on_end_number(data);
    return true;
  }

  bool parse_true() {
    if (!parseLiteral("true")) {
      return false;
    }
    callbacks->on_true_literal(data);
    return true;
  }

  bool parse_false() {
    if (!parseLiteral("false")) {
      return false;
    }
    callbacks->on_false_literal(data);
    return true;
  }

  bool parse_null() {
    if (!parseLiteral("null")) {
      return false;
    }
    callbacks->on_null_literal(data);
    return true;
  }
};
// Portability shims for two optional function attributes.
//
// Compilers that do not provide __has_attribute get a fallback that reports
// every attribute as unsupported, so the checks below still preprocess.
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
// MUSTTAIL is placed in front of a `return f(...)` statement. When the
// attribute is unavailable it expands to nothing and the call is an ordinary
// call that the optimizer may or may not turn into a tail call.
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif
// PRESERVE_NONE is placed on function declarations and function-pointer
// types. It expands to nothing when the attribute is unavailable.
#if __has_attribute(preserve_none)
#define PRESERVE_NONE __attribute__((preserve_none))
#else
#define PRESERVE_NONE
#endif
struct Parser2 {
Parser2(char *buf, int len, const Callbacks *callbacks, void *data)
: buf(buf), len(len), callbacks(callbacks), data(data) {}
// Returns false to reject
[[nodiscard]] bool parse() {
if (!push({N_VALUE})) {
return false;
}
nextToken();
return keepGoing(this);
}
Parser2(Parser2 const &) = delete;
Parser2 &operator=(Parser2 const &) = delete;
Parser2(Parser2 &&) = delete;
Parser2 &operator=(Parser2 &&) = delete;
static constexpr int kMaxStackSize = 1 << 10;
private:
// Helpers
void maybeSkipWs() {
while (len > 0 && whitespace(*buf)) {
++buf;
--len;
}
}
bool parseLiteral(const char *literal) {
const int litLen = strlen(literal);
if (len < litLen) {
return false;
}
len -= litLen;
return memcmp(std::exchange(buf, buf + litLen), literal, litLen) == 0;
}
bool parse_number() {
callbacks->on_begin_number(data);
char *const bufBefore = buf;
for (;;) {
if (len == 0) {
return false;
}
if ('0' <= *buf && *buf <= '9') {
++buf;
--len;
} else {
break;
}
}
if (buf == bufBefore) {
return false;
}
callbacks->on_number_data(data, bufBefore, buf - bufBefore);
callbacks->on_end_number(data);
return true;
}
bool parse_string() {
callbacks->on_begin_string(data);
if (!parseLiteral("\"")) {
return false;
}
auto *result = (char *)memchr(buf, '"', len);
if (result == nullptr) {
return false;
}
int stringLen = result - buf;
callbacks->on_string_data(data, buf, stringLen);
buf += stringLen;
len -= stringLen;
if (!parseLiteral("\"")) {
return false;
}
callbacks->on_end_string(data);
return true;
}
typedef PRESERVE_NONE bool (*continuation)(Parser2 *);
void printStack() {
printf("token: %s\n", symbolNames[currentToken]);
for (int i = 0; i < stackPtr - stack; ++i) {
printf("%s ", symbolNames[stack[i]]);
}
printf("\n");
}
PRESERVE_NONE static bool tokenMatch(Parser2 *self) {
self->pop();
self->nextToken();
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool keepGoing(Parser2 *self) {
// self->printStack();
if (self->empty()) {
assert(self->currentToken == T_EOF);
return true;
}
auto top = *(self->stackPtr - 1);
MUSTTAIL return table[top][self->currentToken](self);
}
PRESERVE_NONE static bool reject(Parser2 *self) {
self->pop();
return false;
}
PRESERVE_NONE static bool object(Parser2 *self) {
self->pop();
assert(self->currentToken == T_LBRACE);
self->callbacks->on_begin_object(self->data);
self->nextToken();
if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool atom(Parser2 *self) {
self->pop();
if (*self->bufBefore == 't') {
self->callbacks->on_true_literal(self->data);
} else if (*self->bufBefore == 'f') {
self->callbacks->on_false_literal(self->data);
} else if (*self->bufBefore == 'n') {
self->callbacks->on_null_literal(self->data);
} else {
self->callbacks->on_begin_number(self->data);
self->callbacks->on_number_data(self->data, self->bufBefore + 1,
self->buf - self->bufBefore - 2);
self->callbacks->on_end_number(self->data);
}
self->nextToken();
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool string(Parser2 *self) {
self->pop();
assert(self->currentToken == T_STRING);
self->nextToken();
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool array(Parser2 *self) {
self->pop();
assert(self->currentToken == T_LBRACKET);
self->callbacks->on_begin_array(self->data);
self->nextToken();
if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool continueArray(Parser2 *self) {
self->pop();
assert(self->currentToken == T_COMMA);
self->nextToken();
if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool continueObject(Parser2 *self) {
self->pop();
assert(self->currentToken == T_COMMA);
self->nextToken();
if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool finishArray(Parser2 *self) {
self->pop();
assert(self->currentToken == T_RBRACKET);
self->callbacks->on_end_array(self->data);
self->nextToken();
MUSTTAIL return keepGoing(self);
}
PRESERVE_NONE static bool finishObject(Parser2 *self) {
self->pop();
assert(self->currentToken == T_RBRACE);
self->callbacks->on_end_object(self->data);
self->nextToken();
MUSTTAIL return keepGoing(self);
}
// table[nonterminal][terminal]
static constexpr continuation table[N_PAST_END][T_PAST_END] = {
/*T_INVALID*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_EOF*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ tokenMatch,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_LBRACE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ tokenMatch,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_RBRACE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ tokenMatch,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_COMMA*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ tokenMatch,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_ATOM*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ tokenMatch,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_STRING*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ tokenMatch,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_LBRACKET*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ tokenMatch,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*T_RBRACKET*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ tokenMatch,
/*T_COLON*/ reject,
},
/*T_COLON*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ tokenMatch,
},
/*N_VALUE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ object,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ atom,
/*T_STRING*/ string,
/*T_LBRACKET*/ array,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*N_ARRAY_MAYBE_CONTINUE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ reject,
/*T_COMMA*/ continueArray,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ finishArray,
/*T_COLON*/ reject,
},
/*N_OBJECT*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ object,
/*T_RBRACE*/ reject,
/*T_COMMA*/ reject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
/*N_OBJECT_MAYBE_CONTINUE*/
{
/*T_INVALID*/ reject,
/*T_EOF*/ reject,
/*T_LBRACE*/ reject,
/*T_RBRACE*/ finishObject,
/*T_COMMA*/ continueObject,
/*T_ATOM*/ reject,
/*T_STRING*/ reject,
/*T_LBRACKET*/ reject,
/*T_RBRACKET*/ reject,
/*T_COLON*/ reject,
},
};
Symbol currentToken;
const char *bufBefore;
void nextToken() {
maybeSkipWs();
bufBefore = buf;
if (len == 0) {
currentToken = T_EOF;
return;
}
if (*buf == '{') {
parseLiteral("{");
currentToken = T_LBRACE;
return;
} else if (*buf == '[') {
parseLiteral("[");
currentToken = T_LBRACKET;
return;
} else if (*buf == '}') {
parseLiteral("}");
currentToken = T_RBRACE;
return;
} else if (*buf == ']') {
parseLiteral("]");
currentToken = T_RBRACKET;
return;
} else if (*buf == ':') {
parseLiteral(":");
currentToken = T_COLON;
return;
} else if (*buf == ',') {
parseLiteral(",");
currentToken = T_COMMA;
return;
} else if (*buf == '"') {
if (!parse_string()) {
currentToken = T_INVALID;
return;
}
currentToken = T_STRING;
return;
} else if (*buf == 't') {
if (!parseLiteral("true")) {
currentToken = T_INVALID;
return;
}
currentToken = T_ATOM;
return;
} else if (*buf == 'f') {
if (!parseLiteral("false")) {
currentToken = T_INVALID;
return;
}
} else if (*buf == 'n') {
if (!parseLiteral("null")) {
currentToken = T_INVALID;
return;
}
} else {
if (!parse_number()) {
currentToken = T_INVALID;
return;
}
}
currentToken = T_ATOM;
return;
}
char *buf;
int len;
const Callbacks *const callbacks;
void *const data;
Symbol stack[kMaxStackSize];
Symbol *stackPtr = stack;
bool empty() { return stackPtr == stack; }
void pop() {
assert(!empty());
--stackPtr;
}
[[nodiscard]] bool push(std::initializer_list<Symbol> symbols) {
if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] {
return false;
}
for (int i = symbols.size() - 1; i >= 0; --i) {
*stackPtr++ = *(symbols.begin() + i);
}
return true;
}
};
// Sample document fed to the parser tests and benchmarks in this file. Its
// values are nested objects, strings and a single array of strings; it
// contains no numbers, no true/false/null, no empty containers and no
// backslash escapes, so those paths are not exercised by it.
const std::string json = R"({
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
})";
// Builds a callback table in which every handler writes its own name to
// stdout; the two *_data handlers also print the bytes they were given,
// wrapped in backticks. The user-data pointer is ignored by all of them.
Callbacks printCallbacks() {
  Callbacks cb;

  // Containers
  cb.on_begin_object = [](void *) { puts("on_begin_object"); };
  cb.on_end_object = [](void *) { puts("on_end_object"); };
  cb.on_begin_array = [](void *) { puts("on_begin_array"); };
  cb.on_end_array = [](void *) { puts("on_end_array"); };

  // Strings
  cb.on_begin_string = [](void *) { puts("on_begin_string"); };
  cb.on_string_data = [](void *, const char *bytes, int count) {
    printf("on_string_data `%.*s`\n", count, bytes);
  };
  cb.on_end_string = [](void *) { puts("on_end_string"); };

  // Numbers
  cb.on_begin_number = [](void *) { puts("on_begin_number"); };
  cb.on_number_data = [](void *, const char *bytes, int count) {
    printf("on_number_data `%.*s`\n", count, bytes);
  };
  cb.on_end_number = [](void *) { puts("on_end_number"); };

  // Literals
  cb.on_true_literal = [](void *) { puts("on_true_literal"); };
  cb.on_false_literal = [](void *) { puts("on_false_literal"); };
  cb.on_null_literal = [](void *) { puts("on_null_literal"); };

  return cb;
}
} // namespace
// Parser1 must accept the sample document; every event is echoed to stdout.
TEST_CASE("parser1") {
  const Callbacks callbacks = printCallbacks();
  // The parser takes a mutable buffer, so work on a private copy.
  std::string input(json);
  Parser1 parser(input.data(), static_cast<int>(input.length()), &callbacks,
                 nullptr);
  CHECK(parser.parse());
}
// Parser2 must accept the sample document; every event is echoed to stdout.
TEST_CASE("parser2") {
  const Callbacks callbacks = printCallbacks();
  // The parser takes a mutable buffer, so work on a private copy.
  std::string input(json);
  Parser2 parser(input.data(), static_cast<int>(input.length()), &callbacks,
                 nullptr);
  CHECK(parser.parse());
}
// Measures Parser1 on the sample document with do-nothing callbacks and
// reports the result per input byte. Each timed iteration includes making a
// fresh copy of the document.
TEST_CASE("bench1") {
  const Callbacks callbacks{};
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte");
  bench.run("parser1", [&] {
    std::string input(json);
    Parser1 parser(input.data(), static_cast<int>(input.length()), &callbacks,
                   nullptr);
    bench.doNotOptimizeAway(parser.parse());
  });
}
// Measures Parser2 on the sample document with do-nothing callbacks and
// reports the result per input byte. Each timed iteration includes making a
// fresh copy of the document.
TEST_CASE("bench2") {
  const Callbacks callbacks{};
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte");
  bench.run("parser2", [&] {
    std::string input(json);
    Parser2 parser(input.data(), static_cast<int>(input.length()), &callbacks,
                   nullptr);
    bench.doNotOptimizeAway(parser.parse());
  });
}