503 lines
15 KiB
C++
503 lines
15 KiB
C++
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>

#include <doctest.h>
#include <nanobench.h>
#include <simdjson.h>

#include "callbacks.h"
#include "parser3.h"
#include "weaseljson.h"
|
|
|
|
// This is the JSON grammar in McKeeman Form.

// json
//     element

// value
//     object
//     array
//     string
//     number
//     "true"
//     "false"
//     "null"

// object
//     '{' ws '}'
//     '{' members '}'

// members
//     member
//     member ',' members

// member
//     ws string ws ':' element

// array
//     '[' ws ']'
//     '[' elements ']'

// elements
//     element
//     element ',' elements

// element
//     ws value ws

// string
//     '"' characters '"'

// characters
//     ""
//     character characters

// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape

// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex

// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'

// number
//     integer fraction exponent

// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits

// digits
//     digit
//     digit digits

// digit
//     '0'
//     onenine

// onenine
//     '1' . '9'

// fraction
//     ""
//     '.' digits

// exponent
//     ""
//     'E' sign digits
//     'e' sign digits

// sign
//     ""
//     '+'
//     '-'

// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws
|
|
namespace {
|
|
|
|
// Contents of "test.json" (path relative to the working directory), read once
// during static initialization and used by the tests and benchmarks below.
//
// When the file cannot be opened a diagnostic is written to stderr and the
// string is left empty, so a missing fixture is reported directly instead of
// only showing up later as unrelated-looking assertion failures.
const std::string json = []() -> std::string {
  std::ifstream infile{"test.json"};
  if (!infile.is_open()) {
    std::fprintf(stderr,
                 "warning: could not open test.json; using an empty document\n");
    return std::string{};
  }
  return std::string{std::istreambuf_iterator<char>(infile),
                     std::istreambuf_iterator<char>()};
}();
|
|
|
|
void testStreaming(std::string const &json) {
|
|
SerializeState streaming;
|
|
SerializeState batch;
|
|
auto c = serializeCallbacks();
|
|
{
|
|
auto copy = json;
|
|
auto *parser = WeaselJsonParser_create(1024, &c, &streaming, 0);
|
|
for (size_t i = 0; i < copy.size(); ++i) {
|
|
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
|
|
WeaselJson_AGAIN);
|
|
}
|
|
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
|
|
WeaselJsonParser_destroy(parser);
|
|
}
|
|
{
|
|
auto copy = json;
|
|
auto *parser = WeaselJsonParser_create(1024, &c, &batch, 0);
|
|
REQUIRE(WeaselJsonParser_parse(parser, copy.data(), copy.size()) ==
|
|
WeaselJson_AGAIN);
|
|
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
|
|
WeaselJsonParser_destroy(parser);
|
|
}
|
|
CHECK(streaming.result == batch.result);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Feeds several inputs to the parser one byte per call: the contents of
// test.json, a small object literal, and two truncated documents that must be
// rejected once the end of input is signalled.
//
// Each parser is owned by a unique_ptr so that it is destroyed even when a
// REQUIRE fails: doctest ends the test case by throwing in that situation,
// which would skip a trailing WeaselJsonParser_destroy call.
TEST_CASE("parser3") {
  WeaselJsonCallbacks c = serializeCallbacks();
  SerializeState state;

  // Owning handle; the pointee type is taken from the create function so the
  // parser type does not have to be spelled out here.
  using ParserPtr = std::unique_ptr<
      std::remove_pointer<decltype(WeaselJsonParser_create(
          1024, &c, &state, 0))>::type,
      decltype(&WeaselJsonParser_destroy)>;

  {
    // Complete document loaded from test.json.
    auto copy = json;
    ParserPtr parser{WeaselJsonParser_create(1024, &c, &state, 0),
                     &WeaselJsonParser_destroy};
    for (size_t i = 0; i < copy.size(); ++i) {
      REQUIRE(WeaselJsonParser_parse(parser.get(), copy.data() + i, 1) ==
              WeaselJson_AGAIN);
    }
    REQUIRE(WeaselJsonParser_parse(parser.get(), nullptr, 0) == WeaselJson_OK);
  }
  {
    // Empty array and empty object as member values.
    std::string copy = "{\"x\": [], \"y\": {}}";
    ParserPtr parser{WeaselJsonParser_create(1024, &c, &state, 0),
                     &WeaselJsonParser_destroy};
    for (size_t i = 0; i < copy.size(); ++i) {
      REQUIRE(WeaselJsonParser_parse(parser.get(), copy.data() + i, 1) ==
              WeaselJson_AGAIN);
    }
    REQUIRE(WeaselJsonParser_parse(parser.get(), nullptr, 0) == WeaselJson_OK);
    // Destroy before printing, matching the original order of operations.
    parser.reset();
    puts("");
  }
  {
    // Input ends inside a string value: rejected at end of input.
    auto c = noopCallbacks();
    std::string copy = "{\"a\":\"a";
    ParserPtr parser{WeaselJsonParser_create(1024, &c, &state, 0),
                     &WeaselJsonParser_destroy};
    for (size_t i = 0; i < copy.size(); ++i) {
      REQUIRE(WeaselJsonParser_parse(parser.get(), copy.data() + i, 1) ==
              WeaselJson_AGAIN);
    }
    REQUIRE(WeaselJsonParser_parse(parser.get(), nullptr, 0) ==
            WeaselJson_REJECT);
  }
  {
    // Input ends right after an opening bracket: rejected at end of input.
    auto c = noopCallbacks();
    std::string copy = "[";
    ParserPtr parser{WeaselJsonParser_create(1024, &c, &state, 0),
                     &WeaselJsonParser_destroy};
    for (size_t i = 0; i < copy.size(); ++i) {
      REQUIRE(WeaselJsonParser_parse(parser.get(), copy.data() + i, 1) ==
              WeaselJson_AGAIN);
    }
    REQUIRE(WeaselJsonParser_parse(parser.get(), nullptr, 0) ==
            WeaselJson_REJECT);
  }
}
|
|
|
|
// Byte-at-a-time and whole-buffer parsing of test.json must agree.
TEST_CASE("streaming") {
  testStreaming(json);
}
|
|
|
|
void doTestUnescapingUtf8(std::string const &escaped,
|
|
std::string const &expected, int stride) {
|
|
CAPTURE(escaped);
|
|
CAPTURE(expected);
|
|
CAPTURE(stride);
|
|
auto c = noopCallbacks();
|
|
std::string result;
|
|
c.on_string_data = +[](void *p, const char *buf, int len, int /*done*/) {
|
|
auto &s = *(std::string *)p;
|
|
s.append(buf, len);
|
|
};
|
|
auto *parser = WeaselJsonParser_create(1024, &c, &result, 0);
|
|
auto copy = escaped;
|
|
for (size_t i = 0; i < copy.size(); i += stride) {
|
|
CAPTURE(i);
|
|
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i,
|
|
std::min<int>(stride, copy.size() - i)) ==
|
|
WeaselJson_AGAIN);
|
|
}
|
|
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
|
|
WeaselJsonParser_destroy(parser);
|
|
CHECK(result.size() == expected.size());
|
|
CHECK(result == expected);
|
|
}
|
|
|
|
void testUnescapingUtf8(std::string const &escaped,
|
|
std::string const &expected) {
|
|
for (int stride = 0; stride < 10; ++stride) {
|
|
doTestUnescapingUtf8(escaped, expected,
|
|
stride == 0 ? std::numeric_limits<int>::max()
|
|
: stride);
|
|
}
|
|
}
|
|
|
|
// Table of JSON string documents and the string data each one must produce,
// run in order through testUnescapingUtf8.
TEST_CASE("unescaping utf-8") {
  struct Case {
    const char *escaped;
    const char *expected;
  };
  const Case cases[] = {
      // 4 byte encoding (utf-16 surrogate pair)
      {"\"\\ud801\\udc37\"", "𐐷"},
      // Basic
      {"\"\\\"\"", "\""},
      {"\"\\\\\"", "\\"},
      {"\"\\/\"", "/"},
      {"\"\\b\"", "\b"},
      {"\"\\f\"", "\f"},
      {"\"\\n\"", "\n"},
      {"\"\\r\"", "\r"},
      {"\"\\t\"", "\t"},
      // 2 byte encoding
      {"\"\\u07aB 1234\"", "\u07aB 1234"},
      // 3 byte encoding
      {"\"\\uaB34 5678\"", "\uaB34 5678"},
  };
  for (const Case &entry : cases) {
    testUnescapingUtf8(entry.escaped, entry.expected);
  }
}
|
|
|
|
// Benchmarks the parser on test.json with no-op callbacks, once per chunk size:
// 128 bytes, doubling while the chunk size does not exceed the document size.
// One parser is reused (reset) for every iteration; any unexpected status
// aborts the process.
TEST_CASE("bench3") {
  auto c = noopCallbacks();
  ankerl::nanobench::Bench bench;
  bench.batch(json.size());
  bench.unit("byte");
  auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0);
  for (size_t stride = 128; stride <= json.size(); stride *= 2) {
    const std::string title =
        "parser3 (stride: " + std::to_string(stride) + ")";
    bench.run(title, [&]() {
      // Every iteration parses a fresh copy of the document.
      auto copy = json;
      WeaselJsonParser_reset(parser);
      size_t offset = 0;
      while (offset < copy.size()) {
        const auto status =
            WeaselJsonParser_parse(parser, copy.data() + offset,
                                   std::min<int>(copy.size() - offset, stride));
        if (status != WeaselJson_AGAIN) {
          abort();
        }
        offset += stride;
      }
      const auto final_status = WeaselJsonParser_parse(parser, nullptr, 0);
      if (final_status != WeaselJson_OK) {
        abort();
      }
    });
  }
  WeaselJsonParser_destroy(parser);
}
|
|
|
|
// Benchmarks simdjson's DOM parser on test.json. Building the padded copy of
// the input and constructing the parser are part of every timed iteration.
TEST_CASE("bench4") {
  using namespace simdjson;
  ankerl::nanobench::Bench bench;
  bench.batch(json.size());
  bench.unit("byte");
  bench.run("simdjson dom", [&]() {
    padded_string padded(json.data(), json.size());
    dom::parser dom_parser;
    auto parsed = dom_parser.parse(padded);
    bench.doNotOptimizeAway(parsed);
  });
}
|
|
|
|
// Benchmarks simdjson's On Demand front end on test.json. Building the padded
// copy of the input and constructing the parser are part of every timed
// iteration.
// NOTE(review): only iterate() is called and the resulting document is never
// traversed -- confirm that this is the intended amount of work to compare.
TEST_CASE("bench5") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size());
  bench.unit("byte");
  bench.run("simdjson on demand", [&]() {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::ondemand::parser ondemand_parser;
    auto document = ondemand_parser.iterate(padded);
    bench.doNotOptimizeAway(document);
  });
}
|
|
|
|
// Checks that NumDfa consumes and accepts a long run of digits, then
// benchmarks scanning that same input.
TEST_CASE("num dfa") {
  // 512 '1' characters (seven 72-character pieces plus a final 8).
  std::string match(512, '1');
  auto *const first = match.data();
  auto *const last = first + match.size();

  NumDfa dfa;
  auto *stop = dfa.scan(first, last);
  CHECK(stop == last);
  CHECK(dfa.accept());

  ankerl::nanobench::Bench bench;
  bench.batch(match.size());
  bench.unit("byte");
  bench.run("number dfa", [&]() {
    dfa.reset();
    bench.doNotOptimizeAway(dfa.scan(first, last));
  });
}
|
|
|
|
// Sample text made up solely of the 4-byte UTF-8 sequence for U+1F4A9; used as
// input for the UTF-8 tests and benchmarks below.
const char *utf8str =
    "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
    "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
    "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩";
|
|
|
|
// Checks that Utf8Dfa consumes and accepts utf8str, then benchmarks it against
// simdjson::validate_utf8 on the same bytes.
TEST_CASE("utf8 dfa") {
  std::string match = utf8str;
  auto *const first = match.data();
  auto *const last = first + match.size();

  Utf8Dfa dfa;
  auto *stop = dfa.scan(first, last);
  CHECK(stop == last);
  CHECK(dfa.accept());

  ankerl::nanobench::Bench bench;
  bench.batch(match.size());
  bench.unit("byte");
  bench.run("utf8 dfa", [&]() {
    dfa.reset();
    bench.doNotOptimizeAway(dfa.scan(first, last));
  });
  bench.run("simdjson utf8", [&]() {
    bench.doNotOptimizeAway(simdjson::validate_utf8(first, match.size()));
  });
}
|
|
|
|
// Different input structures with special care in the implementation
|
|
// performance wise
|
|
TEST_CASE("bench input types") {
|
|
auto bench = [](std::string name, std::string json) {
|
|
auto c = noopCallbacks();
|
|
ankerl::nanobench::Bench bench;
|
|
bench.batch(json.size());
|
|
bench.unit("byte");
|
|
bench.relative(true);
|
|
|
|
bench.run("simdjson dom " + name, [&]() {
|
|
simdjson::padded_string my_padded_data(json.data(), json.size());
|
|
simdjson::dom::parser parser;
|
|
auto doc = parser.parse(my_padded_data);
|
|
bench.doNotOptimizeAway(doc);
|
|
});
|
|
|
|
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0);
|
|
bench.run("parser3 " + name, [&]() {
|
|
auto copy = json;
|
|
WeaselJsonParser_reset(parser);
|
|
if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) !=
|
|
WeaselJson_AGAIN) {
|
|
abort();
|
|
}
|
|
if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) {
|
|
abort();
|
|
}
|
|
});
|
|
WeaselJsonParser_destroy(parser);
|
|
};
|
|
|
|
bench("numbers", "[-123456789.000000000000000123456789e+12, "
|
|
"-123456789.000000000000000123456789E+12, "
|
|
"-123456789.000000000000000123456789e-12, "
|
|
"-123456789.000000000000000123456789E-12, "
|
|
"-123456789.000000000000000123456789e+12, "
|
|
"-123456789.000000000000000123456789E+12, "
|
|
"-123456789.000000000000000123456789e-12, "
|
|
"-123456789.000000000000000123456789E-12, "
|
|
"-123456789.000000000000000123456789e+12, "
|
|
"-123456789.000000000000000123456789E+12, "
|
|
"-123456789.000000000000000123456789e-12, "
|
|
"-123456789.000000000000000123456789E-12, "
|
|
"-123456789.000000000000000123456789e+12, "
|
|
"-123456789.000000000000000123456789E+12, "
|
|
"-123456789.000000000000000123456789e-12, "
|
|
"-123456789.000000000000000123456789E-12, "
|
|
"-123456789.000000000000000123456789e+12]");
|
|
bench("ascii",
|
|
"\"Donec lobortis eleifend condimentum. Cras dictum dolor lacinia "
|
|
"lectus vehicula rutrum. Maecenas quis nisi nunc. Nam tristique "
|
|
"feugiat est vitae mollis. Maecenas quis nisi nunc.\"");
|
|
bench("utf-8", std::string("\"") + utf8str + "\"");
|
|
bench("normal escapes",
|
|
R"(
|
|
["\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/",
|
|
"\n\r\t\"\b\f\\\/"]
|
|
)");
|
|
bench("unicode escapes",
|
|
R"(
|
|
["\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37",
|
|
"\uabcd\u1234\ud801\udc37"]
|
|
)");
|
|
bench("structural",
|
|
R"(
|
|
[
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]},
|
|
{"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}
|
|
]
|
|
)");
|
|
bench("whitespace", R"(
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0
|
|
]
|
|
)");
|
|
bench("literals", R"([
|
|
true, false, null,
|
|
true, false, null,
|
|
true, false, null,
|
|
true, false, null,
|
|
true, false, null,
|
|
true, false, null,
|
|
true, false, null
|
|
]
|
|
)");
|
|
}
|