Files
weaseljson/src/test.cpp
2025-05-14 20:04:03 -04:00

777 lines
18 KiB
C++

#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <initializer_list>
#include <string>
#include <utility>
#include <doctest.h>
#include <nanobench.h>
#include <simdjson.h>
// This is the JSON grammar in McKeeman Form. A rule name starts in the first
// column; each indented line below it is one alternative for that rule.
//
// json
//     element
//
// value
//     object
//     array
//     string
//     number
//     "true"
//     "false"
//     "null"
//
// object
//     '{' ws '}'
//     '{' members '}'
//
// members
//     member
//     member ',' members
//
// member
//     ws string ws ':' element
//
// array
//     '[' ws ']'
//     '[' elements ']'
//
// elements
//     element
//     element ',' elements
//
// element
//     ws value ws
//
// string
//     '"' characters '"'
//
// characters
//     ""
//     character characters
//
// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape
//
// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex
//
// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'
//
// number
//     integer fraction exponent
//
// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits
//
// digits
//     digit
//     digit digits
//
// digit
//     '0'
//     onenine
//
// onenine
//     '1' . '9'
//
// fraction
//     ""
//     '.' digits
//
// exponent
//     ""
//     'E' sign digits
//     'e' sign digits
//
// sign
//     ""
//     '+'
//     '-'
//
// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws
// Event sinks invoked by the parsers while they walk a document. Every
// pointer starts out aimed at a do-nothing function, so a client only has to
// assign the events it is interested in. The `data` argument is the opaque
// pointer that was handed to the parser.
struct Callbacks {
  // Objects
  void (*on_begin_object)(void *data){ignoreEvent};
  void (*on_end_object)(void *data){ignoreEvent};
  // Strings: begin, one chunk of text (buf, len), end
  void (*on_begin_string)(void *data){ignoreEvent};
  void (*on_string_data)(void *data, const char *buf, int len){ignoreChunk};
  void (*on_end_string)(void *data){ignoreEvent};
  // Arrays
  void (*on_begin_array)(void *data){ignoreEvent};
  void (*on_end_array)(void *data){ignoreEvent};
  // Numbers: begin, one chunk of text (buf, len), end
  void (*on_begin_number)(void *data){ignoreEvent};
  void (*on_number_data)(void *data, const char *buf, int len){ignoreChunk};
  void (*on_end_number)(void *data){ignoreEvent};
  // Literals
  void (*on_true_literal)(void *data){ignoreEvent};
  void (*on_false_literal)(void *data){ignoreEvent};
  void (*on_null_literal)(void *data){ignoreEvent};

private:
  // Default targets: accept the arguments and do nothing.
  static void ignoreEvent(void *) {}
  static void ignoreChunk(void *, const char *, int) {}
};
// Terminals and nonterminals. These appear on the stack of the pushdown
// automaton.
enum Symbol : int8_t {
  // Terminals
  T_STRING, // Multibyte!
  T_COLON,
  // Nonterminals
  N_VALUE,
  N_ARRAY_VALUE_OR_END,
  N_OBJECT_VALUE_OR_END,
  N_ARRAY_MAYBE_CONTINUE,
  N_OBJECT_MAYBE_CONTINUE,
  N_PAST_END, // Must be last nonterminal; equals the number of real symbols
};

// Printable name of each symbol, indexed by its Symbol value. N_PAST_END is a
// sentinel and has no entry.
const char *symbolNames[] = {
    "T_STRING",
    "T_COLON",
    "N_VALUE",
    "N_ARRAY_VALUE_OR_END",
    "N_OBJECT_VALUE_OR_END",
    "N_ARRAY_MAYBE_CONTINUE",
    "N_OBJECT_MAYBE_CONTINUE",
};

// Adding a symbol without naming it would make lookups by Symbol value read
// past the end of the table; fail the build instead.
static_assert(sizeof(symbolNames) / sizeof(symbolNames[0]) == N_PAST_END,
              "symbolNames must have exactly one entry per Symbol");
// Byte-indexed lookup tables, filled in at compile time.
constexpr static struct Tables {
  constexpr Tables() {
    // The bytes accepted by the grammar's `ws` rule: space, line feed,
    // carriage return and horizontal tab.
    constexpr char kWhitespaceBytes[] = {' ', '\n', '\r', '\t'};
    for (const char byte : kWhitespaceBytes) {
      whitespace[static_cast<unsigned char>(byte)] = true;
    }
  }

  // whitespace[b] is true iff byte value b is JSON whitespace.
  alignas(16) bool whitespace[256]{};
} tables;
namespace {
// Straightforward recursive descent parser. Known limitations: string escapes
// are not interpreted (a string ends at the first '"' after the opening one)
// and numbers are treated as [0-9.]+. May overflow the call stack on deeply
// nested json documents.
struct Parser1 {
  // buf/len:   the document to parse.
  // callbacks: event sinks invoked as tokens are recognised.
  // data:      opaque pointer forwarded to every callback.
  Parser1(char *buf, int len, const Callbacks *callbacks, void *data)
      : buf(buf), bufEnd(buf + len), callbacks(callbacks), data(data) {}

  // Returns false to reject. The buffer must hold exactly one element
  // (optionally surrounded by whitespace); anything left over after it is an
  // error. Callbacks that already fired before a rejection are not undone.
  [[nodiscard]] bool parse() { return parse_element() && buf == bufEnd; }

  Parser1(Parser1 const &) = delete;
  Parser1 &operator=(Parser1 const &) = delete;
  Parser1(Parser1 &&) = delete;
  Parser1 &operator=(Parser1 &&) = delete;

private:
  char *buf;    // next unread byte
  char *bufEnd; // one past the last byte
  const Callbacks *const callbacks;
  void *const data;

  // Number of unread bytes.
  int len() const { return static_cast<int>(bufEnd - buf); }

  // Helpers

  // Advances past any run of whitespace; does nothing if there is none.
  void maybeSkipWs() {
    // Index with an unsigned value: plain char may be signed, in which case
    // bytes >= 0x80 would index in front of the table.
    while (buf != bufEnd && tables.whitespace[static_cast<uint8_t>(*buf)]) {
      ++buf;
    }
  }

  // Consumes `literal` if the unread input starts with it. On a mismatch the
  // position is left untouched and false is returned.
  bool parseLiteral(const char *literal) {
    const int litLen = static_cast<int>(strlen(literal));
    if (len() < litLen || memcmp(buf, literal, litLen) != 0) {
      return false;
    }
    buf += litLen;
    return true;
  }

  // Functions corresponding to productions

  // element: ws value ws. The first byte of the value selects the production.
  bool parse_element() {
    maybeSkipWs();
    if (len() == 0) {
      return false;
    }
    bool ok = false;
    switch (*buf) {
    case '{':
      ok = parse_object();
      break;
    case '[':
      ok = parse_array();
      break;
    case '"':
      ok = parse_string();
      break;
    case 't':
      ok = parse_true();
      break;
    case 'f':
      ok = parse_false();
      break;
    case 'n':
      ok = parse_null();
      break;
    default:
      ok = parse_number();
      break;
    }
    if (!ok) {
      return false;
    }
    maybeSkipWs();
    return true;
  }

  // object: '{' ws '}' | '{' members '}'
  bool parse_object() {
    if (!parseLiteral("{")) {
      return false;
    }
    callbacks->on_begin_object(data);
    maybeSkipWs();
    if (len() == 0) {
      return false;
    }
    // A malformed member list must reject the whole object. Carrying on to
    // look for '}' would accept inputs such as {"a"x}.
    if (*buf != '}' && !parse_members()) {
      return false;
    }
    if (!parseLiteral("}")) {
      return false;
    }
    callbacks->on_end_object(data);
    return true;
  }

  // members: member | member ',' members
  bool parse_members() {
    for (;;) {
      if (!parse_member()) {
        return false;
      }
      // The caller still needs a closing '}', so running out of input here
      // can never lead to an accepted document.
      if (len() == 0) {
        return false;
      }
      if (*buf != ',') {
        return true;
      }
      ++buf; // consume ',' and parse the next member
    }
  }

  // member: ws string ws ':' element
  bool parse_member() {
    maybeSkipWs();
    if (!parse_string()) {
      return false;
    }
    maybeSkipWs();
    if (!parseLiteral(":")) {
      return false;
    }
    return parse_element();
  }

  // array: '[' ws ']' | '[' elements ']'
  bool parse_array() {
    if (!parseLiteral("[")) {
      return false;
    }
    callbacks->on_begin_array(data);
    maybeSkipWs();
    if (len() == 0) {
      return false;
    }
    if (*buf != ']' && !parse_elements()) {
      return false;
    }
    if (!parseLiteral("]")) {
      return false;
    }
    callbacks->on_end_array(data);
    return true;
  }

  // elements: element | element ',' elements
  bool parse_elements() {
    for (;;) {
      if (!parse_element()) {
        return false;
      }
      // The caller still needs a closing ']'.
      if (len() == 0) {
        return false;
      }
      if (*buf != ',') {
        return true;
      }
      ++buf; // consume ',' and parse the next element
    }
  }

  // string: '"' characters '"', without escape handling. The text between
  // the quotes is reported in a single on_string_data call.
  bool parse_string() {
    // Only announce a string once its opening quote has actually been seen.
    if (!parseLiteral("\"")) {
      return false;
    }
    callbacks->on_begin_string(data);
    auto *closingQuote = static_cast<char *>(memchr(buf, '"', len()));
    if (closingQuote == nullptr) {
      return false;
    }
    callbacks->on_string_data(data, buf, static_cast<int>(closingQuote - buf));
    buf = closingQuote + 1;
    callbacks->on_end_string(data);
    return true;
  }

  // number: a non-empty run of [0-9.], reported in a single on_number_data
  // call. The run may extend to the very end of the input.
  bool parse_number() {
    char *const start = buf;
    while (buf != bufEnd && (('0' <= *buf && *buf <= '9') || *buf == '.')) {
      ++buf;
    }
    if (buf == start) {
      return false;
    }
    callbacks->on_begin_number(data);
    callbacks->on_number_data(data, start, static_cast<int>(buf - start));
    callbacks->on_end_number(data);
    return true;
  }

  bool parse_true() {
    if (!parseLiteral("true")) {
      return false;
    }
    callbacks->on_true_literal(data);
    return true;
  }

  bool parse_false() {
    if (!parseLiteral("false")) {
      return false;
    }
    callbacks->on_false_literal(data);
    return true;
  }

  bool parse_null() {
    if (!parseLiteral("null")) {
      return false;
    }
    callbacks->on_null_literal(data);
    return true;
  }
};
// MUSTTAIL is placed in front of a `return f(...)` statement. When the
// compiler reports support for the `musttail` attribute it expands to that
// attribute; otherwise it expands to nothing and the call is an ordinary one.
//
// Compilers that do not provide __has_attribute get a fallback definition
// that reports every attribute as unsupported.
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif
// Table-based ll(1) parser that doesn't handle escaping and treats numbers as
// [0-9.]+. Could be adapted to have a streaming interface. Uses O(1) memory.
struct Parser2 {
Parser2(char *buf, int len, const Callbacks *callbacks, void *data)
: buf(buf), bufEnd(buf + len), callbacks(callbacks), data(data) {}
// Returns false to reject
[[nodiscard]] bool parse() {
if (!push({N_VALUE})) {
return false;
}
return keepGoing(this);
}
Parser2(Parser2 const &) = delete;
Parser2 &operator=(Parser2 const &) = delete;
Parser2(Parser2 &&) = delete;
Parser2 &operator=(Parser2 &&) = delete;
static constexpr int kMaxStackSize = 1 << 10;
private:
// Helpers
void maybeSkipWs() {
while (buf != bufEnd && tables.whitespace[*buf]) {
++buf;
}
}
bool parseLiteral(const char *literal) {
const int litLen = strlen(literal);
if (len() < litLen) {
return false;
}
if (memcmp(buf, literal, litLen) == 0) {
buf += litLen;
return true;
}
return false;
}
bool parse_number() {
char *const bufBefore = buf;
if (len() == 0 || !('0' <= *buf && *buf <= '9' || (*buf == '.'))) {
return false;
}
callbacks->on_begin_number(data);
++buf;
for (;;) {
if ('0' <= *buf && *buf <= '9' || (*buf == '.')) {
++buf;
} else {
break;
}
}
callbacks->on_number_data(data, bufBefore, buf - bufBefore);
callbacks->on_end_number(data);
return true;
}
bool parse_string() {
if (!parseLiteral("\"")) {
return false;
}
callbacks->on_begin_string(data);
auto *result = (char *)memchr(buf, '"', len());
if (result == nullptr) {
return false;
}
int stringLen = result - buf;
callbacks->on_string_data(data, buf, stringLen);
buf += stringLen;
if (!parseLiteral("\"")) {
return false;
}
callbacks->on_end_string(data);
return true;
}
typedef bool (*continuation)(Parser2 *);
[[maybe_unused]] void debugPrint() {
for (int i = 0; i < stackPtr - stack; ++i) {
printf("%s ", symbolNames[stack[i]]);
}
printf("\n");
}
static bool keepGoing(Parser2 *self) {
// self->debugPrint();
if (self->empty()) {
return true;
}
auto top = *(self->stackPtr - 1);
self->maybeSkipWs();
MUSTTAIL return table[top](self);
}
static bool string(Parser2 *self) {
if (!self->parse_string()) {
return false;
}
self->pop();
MUSTTAIL return keepGoing(self);
}
static bool colon(Parser2 *self) {
if (!self->parseLiteral(":")) {
return false;
}
self->pop();
MUSTTAIL return keepGoing(self);
}
static bool value(Parser2 *self) {
if (self->parse_string()) {
self->pop();
MUSTTAIL return keepGoing(self);
} else if (self->parse_number()) {
self->pop();
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("{")) {
self->pop();
self->callbacks->on_begin_object(self->data);
if (!self->push({N_OBJECT_VALUE_OR_END})) {
return false;
}
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("[")) {
self->pop();
self->callbacks->on_begin_array(self->data);
if (!self->push({N_ARRAY_VALUE_OR_END})) {
return false;
}
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("true")) {
self->pop();
self->callbacks->on_true_literal(self->data);
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("false")) {
self->pop();
self->callbacks->on_false_literal(self->data);
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("null")) {
self->pop();
self->callbacks->on_null_literal(self->data);
MUSTTAIL return keepGoing(self);
}
return false;
}
static bool arrayOrEnd(Parser2 *self) {
if (self->parseLiteral("]")) {
self->pop();
self->callbacks->on_end_array(self->data);
MUSTTAIL return keepGoing(self);
} else {
self->pop();
if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
}
}
static bool objectOrEnd(Parser2 *self) {
if (self->parseLiteral("}")) {
self->pop();
self->callbacks->on_end_object(self->data);
MUSTTAIL return keepGoing(self);
} else {
self->pop();
if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
}
return false;
}
static bool arrayContinue(Parser2 *self) {
if (self->parseLiteral(",")) {
self->pop();
if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("]")) {
self->pop();
self->callbacks->on_end_array(self->data);
MUSTTAIL return keepGoing(self);
}
return false;
}
static bool objectContinue(Parser2 *self) {
if (self->parseLiteral(",")) {
self->pop();
if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) {
return false;
}
MUSTTAIL return keepGoing(self);
} else if (self->parseLiteral("}")) {
self->pop();
self->callbacks->on_end_object(self->data);
MUSTTAIL return keepGoing(self);
}
return false;
}
static constexpr continuation table[N_PAST_END] = {
/*T_STRING*/ string,
/*T_COLON*/ colon,
/*N_VALUE*/ value,
/*N_ARRAY_VALUE_OR_END*/ arrayOrEnd,
/*N_OBJECT_VALUE_OR_END*/ objectOrEnd,
/*N_ARRAY_MAYBE_CONTINUE*/ arrayContinue,
/*N_OBJECT_MAYBE_CONTINUE*/ objectContinue,
};
char *buf;
char *bufEnd;
int len() const { return bufEnd - buf; }
const Callbacks *const callbacks;
void *const data;
Symbol stack[kMaxStackSize];
Symbol *stackPtr = stack;
bool empty() const { return stackPtr == stack; }
void pop() {
assert(!empty());
--stackPtr;
}
[[nodiscard]] bool push(std::initializer_list<Symbol> symbols) {
if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] {
return false;
}
for (int i = symbols.size() - 1; i >= 0; --i) {
*stackPtr++ = *(symbols.begin() + i);
}
return true;
}
};
// Sample document used by the tests and benchmarks in this file. It contains
// a number, the literals true/false/null, strings, nested objects and an
// array.
const std::string json = R"({
"a number": 12345,
"true": true,
"false": false,
"null": null,
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
})";
// Builds a Callbacks whose every event writes a line to stdout: the event's
// name, plus the text between backticks for the two *_data events.
Callbacks printCallbacks() {
  // One static function per event; none of them looks at the user data.
  struct Echo {
    static void beginObject(void *) { puts("on_begin_object"); }
    static void endObject(void *) { puts("on_end_object"); }
    static void beginString(void *) { puts("on_begin_string"); }
    static void stringData(void *, const char *buf, int len) {
      printf("on_string_data `%.*s`\n", len, buf);
    }
    static void endString(void *) { puts("on_end_string"); }
    static void beginArray(void *) { puts("on_begin_array"); }
    static void endArray(void *) { puts("on_end_array"); }
    static void beginNumber(void *) { puts("on_begin_number"); }
    static void numberData(void *, const char *buf, int len) {
      printf("on_number_data `%.*s`\n", len, buf);
    }
    static void endNumber(void *) { puts("on_end_number"); }
    static void trueLiteral(void *) { puts("on_true_literal"); }
    static void falseLiteral(void *) { puts("on_false_literal"); }
    static void nullLiteral(void *) { puts("on_null_literal"); }
  };

  Callbacks printer;
  printer.on_begin_object = Echo::beginObject;
  printer.on_end_object = Echo::endObject;
  printer.on_begin_string = Echo::beginString;
  printer.on_string_data = Echo::stringData;
  printer.on_end_string = Echo::endString;
  printer.on_begin_array = Echo::beginArray;
  printer.on_end_array = Echo::endArray;
  printer.on_begin_number = Echo::beginNumber;
  printer.on_number_data = Echo::numberData;
  printer.on_end_number = Echo::endNumber;
  printer.on_true_literal = Echo::trueLiteral;
  printer.on_false_literal = Echo::falseLiteral;
  printer.on_null_literal = Echo::nullLiteral;
  return printer;
}
} // namespace
// Parser1 must accept the sample document as well as a small object holding
// an empty array and an empty object. The events are echoed to stdout.
TEST_CASE("parser1") {
  Callbacks printer = printCallbacks();
  // Each input is parsed from its own mutable copy.
  for (std::string input : {json, std::string("{\"x\": [], \"y\": {}}")}) {
    Parser1 parser(input.data(), input.length(), &printer, nullptr);
    CHECK(parser.parse());
  }
}
// Parser2 must accept the sample document as well as a small object holding
// an empty array and an empty object. The events are echoed to stdout.
TEST_CASE("parser2") {
  Callbacks printer = printCallbacks();
  // Each input is parsed from its own mutable copy.
  for (std::string input : {json, std::string("{\"x\": [], \"y\": {}}")}) {
    Parser2 parser(input.data(), input.length(), &printer, nullptr);
    CHECK(parser.parse());
  }
}
// Benchmarks Parser1 on the sample document with do-nothing callbacks. The
// result is reported per byte of input; copying the document is part of the
// measured work.
TEST_CASE("bench1") {
  Callbacks silent = Callbacks{};
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte").run("parser1", [&] {
    std::string input = json;
    Parser1 parser(input.data(), input.length(), &silent, nullptr);
    bench.doNotOptimizeAway(parser.parse());
  });
}
// Benchmarks Parser2 on the sample document with do-nothing callbacks. The
// result is reported per byte of input; copying the document is part of the
// measured work.
TEST_CASE("bench2") {
  Callbacks silent = Callbacks{};
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte").run("parser2", [&] {
    std::string input = json;
    Parser2 parser(input.data(), input.length(), &silent, nullptr);
    bench.doNotOptimizeAway(parser.parse());
  });
}
// Benchmarks simdjson's DOM parser on the sample document, reported per byte
// of input. Building the padded copy and the parser object is part of the
// measured work.
TEST_CASE("bench3") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte").run("parser3", [&] {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::dom::parser parser;
    auto doc = parser.parse(padded);
    bench.doNotOptimizeAway(doc);
  });
}
// Benchmarks simdjson's On Demand front end on the sample document, reported
// per byte of input. Building the padded copy and the parser object is part
// of the measured work.
// NOTE(review): only iterate() is called and the document is never walked;
// presumably that leaves part of the parsing work undone -- confirm this
// measures what is intended.
TEST_CASE("bench4") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte").run("parser4", [&] {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::ondemand::parser parser;
    auto doc = parser.iterate(padded);
    bench.doNotOptimizeAway(doc);
  });
}