// Files: weaseljson/src/test.cpp (503 lines, 15 KiB, C++)

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
#include <doctest.h>
#include <nanobench.h>
#include <simdjson.h>
#include "callbacks.h"
#include "parser3.h"
#include "weaseljson.h"
// This is the JSON grammar in McKeeman Form. Rule names start in the first
// column; the alternatives of each rule are indented beneath it.
//
// json
//     element
//
// value
//     object
//     array
//     string
//     number
//     "true"
//     "false"
//     "null"
//
// object
//     '{' ws '}'
//     '{' members '}'
//
// members
//     member
//     member ',' members
//
// member
//     ws string ws ':' element
//
// array
//     '[' ws ']'
//     '[' elements ']'
//
// elements
//     element
//     element ',' elements
//
// element
//     ws value ws
//
// string
//     '"' characters '"'
//
// characters
//     ""
//     character characters
//
// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape
//
// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex
//
// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'
//
// number
//     integer fraction exponent
//
// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits
//
// digits
//     digit
//     digit digits
//
// digit
//     '0'
//     onenine
//
// onenine
//     '1' . '9'
//
// fraction
//     ""
//     '.' digits
//
// exponent
//     ""
//     'E' sign digits
//     'e' sign digits
//
// sign
//     ""
//     '+'
//     '-'
//
// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws
namespace {
const std::string json = []() {
std::ifstream infile{"test.json"};
return std::string{std::istreambuf_iterator<char>(infile),
std::istreambuf_iterator<char>()};
}();
void testStreaming(std::string const &json) {
SerializeState streaming;
SerializeState batch;
auto c = serializeCallbacks();
{
auto copy = json;
auto *parser = WeaselJsonParser_create(1024, &c, &streaming, 0);
for (size_t i = 0; i < copy.size(); ++i) {
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
WeaselJson_AGAIN);
}
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
WeaselJsonParser_destroy(parser);
}
{
auto copy = json;
auto *parser = WeaselJsonParser_create(1024, &c, &batch, 0);
REQUIRE(WeaselJsonParser_parse(parser, copy.data(), copy.size()) ==
WeaselJson_AGAIN);
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
WeaselJsonParser_destroy(parser);
}
CHECK(streaming.result == batch.result);
}
} // namespace
// Feeds several inputs to the parser one byte per call and checks the status
// reported once the end of input is signalled: OK for complete documents,
// REJECT for truncated ones.
TEST_CASE("parser3") {
  WeaselJsonCallbacks c = serializeCallbacks();
  SerializeState state;

  // Creates a fresh parser, feeds it `input` byte by byte (every call must
  // answer AGAIN), signals end of input and requires `expectedFinal`.
  // `input` is taken by value so the parser works on a private, mutable
  // buffer. The unique_ptr destroys the parser on every exit path, including
  // the one taken when a failed REQUIRE leaves the test case.
  // `label` only names the case in failure output.
  const auto parseByteWise = [&state](const char *label, auto &callbacks,
                                      std::string input, auto expectedFinal) {
    CAPTURE(label);
    auto *raw = WeaselJsonParser_create(1024, &callbacks, &state, 0);
    std::unique_ptr<std::remove_pointer_t<decltype(raw)>,
                    decltype(&WeaselJsonParser_destroy)>
        parser{raw, &WeaselJsonParser_destroy};
    for (size_t i = 0; i < input.size(); ++i) {
      REQUIRE(WeaselJsonParser_parse(parser.get(), input.data() + i, 1) ==
              WeaselJson_AGAIN);
    }
    REQUIRE(WeaselJsonParser_parse(parser.get(), nullptr, 0) == expectedFinal);
  };

  // Complete documents, run through the serializing callbacks.
  parseByteWise("test.json", c, json, WeaselJson_OK);
  parseByteWise("empty containers", c, "{\"x\": [], \"y\": {}}",
                WeaselJson_OK);
  puts("");

  // Truncated documents must be rejected once the input ends.
  {
    auto noop = noopCallbacks();
    parseByteWise("unterminated string", noop, "{\"a\":\"a",
                  WeaselJson_REJECT);
  }
  {
    auto noop = noopCallbacks();
    parseByteWise("unterminated array", noop, "[", WeaselJson_REJECT);
  }
}
// Runs the byte-wise versus whole-buffer comparison on the shared JSON input.
TEST_CASE("streaming") {
  testStreaming(json);
}
void doTestUnescapingUtf8(std::string const &escaped,
std::string const &expected, int stride) {
CAPTURE(escaped);
CAPTURE(expected);
CAPTURE(stride);
auto c = noopCallbacks();
std::string result;
c.on_string_data = +[](void *p, const char *buf, int len, int /*done*/) {
auto &s = *(std::string *)p;
s.append(buf, len);
};
auto *parser = WeaselJsonParser_create(1024, &c, &result, 0);
auto copy = escaped;
for (size_t i = 0; i < copy.size(); i += stride) {
CAPTURE(i);
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i,
std::min<int>(stride, copy.size() - i)) ==
WeaselJson_AGAIN);
}
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
WeaselJsonParser_destroy(parser);
CHECK(result.size() == expected.size());
CHECK(result == expected);
}
// Runs doTestUnescapingUtf8 for the given pair with ten different strides:
// first the largest possible int (so the input goes in as a single chunk),
// then every stride from 1 through 9.
void testUnescapingUtf8(std::string const &escaped,
                        std::string const &expected) {
  doTestUnescapingUtf8(escaped, expected, std::numeric_limits<int>::max());
  for (int chunk = 1; chunk < 10; ++chunk) {
    doTestUnescapingUtf8(escaped, expected, chunk);
  }
}
// Checks that JSON escape sequences are delivered to the string callback in
// unescaped form, for every stride exercised by testUnescapingUtf8.
TEST_CASE("unescaping utf-8") {
  struct Sample {
    const char *escaped;  // JSON text handed to the parser
    const char *expected; // string data the parser should deliver
  };
  const Sample samples[] = {
      // 4 byte encoding (utf-16 surrogate pair)
      {"\"\\ud801\\udc37\"", "𐐷"},
      // Basic
      {"\"\\\"\"", "\""},
      {"\"\\\\\"", "\\"},
      {"\"\\/\"", "/"},
      {"\"\\b\"", "\b"},
      {"\"\\f\"", "\f"},
      {"\"\\n\"", "\n"},
      {"\"\\r\"", "\r"},
      {"\"\\t\"", "\t"},
      // 2 byte encoding
      {"\"\\u07aB 1234\"", "\u07aB 1234"},
      // 3 byte encoding
      {"\"\\uaB34 5678\"", "\uaB34 5678"},
  };
  for (const Sample &sample : samples) {
    testUnescapingUtf8(sample.escaped, sample.expected);
  }
}
// Benchmarks the parser with no-op callbacks on the shared JSON input, feeding
// it in chunks of 128, 256, 512, ... bytes for as long as the chunk size does
// not exceed the input size. One parser is reused (reset) across all runs.
TEST_CASE("bench3") {
  auto callbacks = noopCallbacks();
  ankerl::nanobench::Bench bench;
  bench.batch(json.size());
  bench.unit("byte");
  auto *parser = WeaselJsonParser_create(1024, &callbacks, nullptr, 0);
  size_t stride = 128;
  while (stride <= json.size()) {
    const std::string title =
        "parser3 (stride: " + std::to_string(stride) + ")";
    bench.run(title, [&]() {
      // Every iteration parses its own fresh copy of the input.
      auto copy = json;
      WeaselJsonParser_reset(parser);
      size_t offset = 0;
      while (offset < copy.size()) {
        const auto status =
            WeaselJsonParser_parse(parser, copy.data() + offset,
                                   std::min<int>(copy.size() - offset, stride));
        if (status != WeaselJson_AGAIN) {
          abort();
        }
        offset += stride;
      }
      // A null buffer of length 0 finishes the parse.
      const auto finished = WeaselJsonParser_parse(parser, nullptr, 0);
      if (finished != WeaselJson_OK) {
        abort();
      }
    });
    stride *= 2;
  }
  WeaselJsonParser_destroy(parser);
}
// Benchmarks simdjson's DOM parser on the shared JSON input. Building the
// padded copy of the input and constructing the parser are part of every
// measured iteration.
TEST_CASE("bench4") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size());
  bench.unit("byte");
  const auto parseOnce = [&]() {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::dom::parser domParser;
    auto parsed = domParser.parse(padded);
    bench.doNotOptimizeAway(parsed);
  };
  bench.run("simdjson dom", parseOnce);
}
// Benchmarks simdjson's on-demand parser on the shared JSON input. Only
// `iterate` is called on the document; building the padded copy and
// constructing the parser are part of every measured iteration.
TEST_CASE("bench5") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size());
  bench.unit("byte");
  const auto iterateOnce = [&]() {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::ondemand::parser onDemandParser;
    auto document = onDemandParser.iterate(padded);
    bench.doNotOptimizeAway(document);
  };
  bench.run("simdjson on demand", iterateOnce);
}
// Checks that NumDfa scans a long run of the digit '1' to its end and accepts
// it, then benchmarks scanning that same input.
TEST_CASE("num dfa") {
  std::string match =
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "111111111111111111111111111111111111111111111111111111111111111111111111"
      "11111111";
  auto *first = match.data();
  auto *last = first + match.size();

  NumDfa dfa;
  // scan() is expected to return the end of the range: everything consumed.
  auto *stoppedAt = dfa.scan(first, last);
  CHECK(stoppedAt == last);
  CHECK(dfa.accept());

  ankerl::nanobench::Bench bench;
  bench.batch(match.size());
  bench.unit("byte");
  bench.run("number dfa", [&]() {
    dfa.reset();
    auto *result = dfa.scan(first, last);
    bench.doNotOptimizeAway(result);
  });
}
// Sample text consisting solely of repeated U+1F4A9 characters (each one is a
// four-byte sequence when encoded as UTF-8). Used as input by the "utf8 dfa"
// test and by the "utf-8" case of "bench input types".
const char *utf8str =
"💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
"💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
"💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩";
// Checks that Utf8Dfa scans `utf8str` to its end and accepts it, then
// benchmarks that scan next to simdjson's UTF-8 validation of the same bytes.
TEST_CASE("utf8 dfa") {
  std::string match = utf8str;
  auto *first = match.data();
  auto *last = first + match.size();

  Utf8Dfa dfa;
  // scan() is expected to return the end of the range: everything consumed.
  auto *stoppedAt = dfa.scan(first, last);
  CHECK(stoppedAt == last);
  CHECK(dfa.accept());

  ankerl::nanobench::Bench bench;
  bench.batch(match.size());
  bench.unit("byte");
  bench.run("utf8 dfa", [&]() {
    dfa.reset();
    auto *result = dfa.scan(first, last);
    bench.doNotOptimizeAway(result);
  });
  bench.run("simdjson utf8", [&]() {
    auto valid = simdjson::validate_utf8(match.data(), match.size());
    bench.doNotOptimizeAway(valid);
  });
}
// Benchmarks input shapes that the implementation treats with special care
// performance-wise. Each shape is measured with simdjson's DOM parser first
// (the baseline for the relative numbers) and then with this parser.
TEST_CASE("bench input types") {
  // Runs both parsers on `input`; `label` is appended to the benchmark names.
  const auto compare = [](std::string label, std::string input) {
    auto callbacks = noopCallbacks();
    ankerl::nanobench::Bench bench;
    bench.batch(input.size());
    bench.unit("byte");
    bench.relative(true);

    const std::string simdjsonTitle = "simdjson dom " + label;
    bench.run(simdjsonTitle, [&]() {
      simdjson::padded_string padded(input.data(), input.size());
      simdjson::dom::parser domParser;
      auto parsed = domParser.parse(padded);
      bench.doNotOptimizeAway(parsed);
    });

    auto *parser = WeaselJsonParser_create(1024, &callbacks, nullptr, 0);
    const std::string parser3Title = "parser3 " + label;
    bench.run(parser3Title, [&]() {
      // Every iteration parses its own fresh copy of the input.
      auto copy = input;
      WeaselJsonParser_reset(parser);
      const auto fed =
          WeaselJsonParser_parse(parser, copy.data(), copy.size());
      if (fed != WeaselJson_AGAIN) {
        abort();
      }
      // A null buffer of length 0 finishes the parse.
      const auto finished = WeaselJsonParser_parse(parser, nullptr, 0);
      if (finished != WeaselJson_OK) {
        abort();
      }
    });
    WeaselJsonParser_destroy(parser);
  };

  compare("numbers", "[-123456789.000000000000000123456789e+12, "
                     "-123456789.000000000000000123456789E+12, "
                     "-123456789.000000000000000123456789e-12, "
                     "-123456789.000000000000000123456789E-12, "
                     "-123456789.000000000000000123456789e+12, "
                     "-123456789.000000000000000123456789E+12, "
                     "-123456789.000000000000000123456789e-12, "
                     "-123456789.000000000000000123456789E-12, "
                     "-123456789.000000000000000123456789e+12, "
                     "-123456789.000000000000000123456789E+12, "
                     "-123456789.000000000000000123456789e-12, "
                     "-123456789.000000000000000123456789E-12, "
                     "-123456789.000000000000000123456789e+12, "
                     "-123456789.000000000000000123456789E+12, "
                     "-123456789.000000000000000123456789e-12, "
                     "-123456789.000000000000000123456789E-12, "
                     "-123456789.000000000000000123456789e+12]");
  compare("ascii",
          "\"Donec lobortis eleifend condimentum. Cras dictum dolor lacinia "
          "lectus vehicula rutrum. Maecenas quis nisi nunc. Nam tristique "
          "feugiat est vitae mollis. Maecenas quis nisi nunc.\"");
  compare("utf-8", std::string("\"") + utf8str + "\"");
  // The raw string literals below are whitespace-sensitive test data; their
  // continuation lines are deliberately not indented with the code.
  compare("normal escapes",
          R"(
["\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/",
"\n\r\t\"\b\f\\\/"]
)");
  compare("unicode escapes",
          R"(
["\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37",
"\uabcd\u1234\ud801\udc37"]
)");
  compare("structural",
          R"(
[
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}
]
)");
  compare("whitespace", R"(
[
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
]
)");
  compare("literals", R"([
true, false, null,
true, false, null,
true, false, null,
true, false, null,
true, false, null,
true, false, null,
true, false, null
]
)");
}