#include #include #include #include #include #include #include #include #include #include #include "callbacks.h" #include "parser3.h" #include "weaseljson.h" // This is the JSON grammar in McKeeman Form. // json // element // value // object // array // string // number // "true" // "false" // "null" // object // '{' ws '}' // '{' members '}' // members // member // member ',' members // member // ws string ws ':' element // array // '[' ws ']' // '[' elements ']' // elements // element // element ',' elements // element // ws value ws // string // '"' characters '"' // characters // "" // character characters // character // '0020' . '10FFFF' - '"' - '\' // '\' escape // escape // '"' // '\' // '/' // 'b' // 'f' // 'n' // 'r' // 't' // 'u' hex hex hex hex // hex // digit // 'A' . 'F' // 'a' . 'f' // number // integer fraction exponent // integer // digit // onenine digits // '-' digit // '-' onenine digits // digits // digit // digit digits // digit // '0' // onenine // onenine // '1' . '9' // fraction // "" // '.' digits // exponent // "" // 'E' sign digits // 'e' sign digits // sign // "" // '+' // '-' // ws // "" // '0020' ws // '000A' ws // '000D' ws // '0009' ws namespace { const std::string json = []() { std::ifstream infile{"test.json"}; return std::string{std::istreambuf_iterator(infile), std::istreambuf_iterator()}; }(); void testStreaming(std::string const &json) { SerializeState streaming; SerializeState batch; auto c = serializeCallbacks(); { auto copy = json; auto *parser = WeaselJsonParser_create(1024, &c, &streaming, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); } REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); WeaselJsonParser_destroy(parser); } { auto copy = json; auto *parser = WeaselJsonParser_create(1024, &c, &batch, 0); REQUIRE(WeaselJsonParser_parse(parser, copy.data(), copy.size()) == WeaselJson_AGAIN); REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); WeaselJsonParser_destroy(parser); } CHECK(streaming.result == batch.result); } } // namespace TEST_CASE("parser3") { WeaselJsonCallbacks c = serializeCallbacks(); SerializeState state; { auto copy = json; auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); } REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); WeaselJsonParser_destroy(parser); } { std::string copy = "{\"x\": [], \"y\": {}}"; auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); } REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); WeaselJsonParser_destroy(parser); puts(""); } { auto c = noopCallbacks(); std::string copy = "{\"a\":\"a"; auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); } REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_REJECT); WeaselJsonParser_destroy(parser); } { auto c = noopCallbacks(); std::string copy = "["; auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); } REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_REJECT); WeaselJsonParser_destroy(parser); } } TEST_CASE("streaming") { testStreaming(json); } void doTestUnescapingUtf8(std::string const &escaped, std::string const &expected, int stride) { CAPTURE(escaped); CAPTURE(expected); CAPTURE(stride); auto c = noopCallbacks(); std::string result; c.on_string_data = +[](void *p, const char *buf, int len, int /*done*/) { auto &s = *(std::string *)p; s.append(buf, len); }; auto *parser = WeaselJsonParser_create(1024, &c, &result, 0); auto copy = escaped; for (size_t i = 0; i < copy.size(); i += stride) { CAPTURE(i); REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, std::min(stride, copy.size() - i)) == WeaselJson_AGAIN); } REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); WeaselJsonParser_destroy(parser); CHECK(result.size() == expected.size()); CHECK(result == expected); } void testUnescapingUtf8(std::string const &escaped, std::string const &expected) { for (int stride = 0; stride < 10; ++stride) { doTestUnescapingUtf8(escaped, expected, stride == 0 ? std::numeric_limits::max() : stride); } } TEST_CASE("unescaping utf-8") { // 4 byte encoding (utf-16 surrogate pair) testUnescapingUtf8("\"\\ud801\\udc37\"", "𐐷"); // Basic testUnescapingUtf8("\"\\\"\"", "\""); testUnescapingUtf8("\"\\\\\"", "\\"); testUnescapingUtf8("\"\\/\"", "/"); testUnescapingUtf8("\"\\b\"", "\b"); testUnescapingUtf8("\"\\f\"", "\f"); testUnescapingUtf8("\"\\n\"", "\n"); testUnescapingUtf8("\"\\r\"", "\r"); testUnescapingUtf8("\"\\t\"", "\t"); // 2 byte encoding testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234"); // 3 byte encoding testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678"); } TEST_CASE("bench3") { auto c = noopCallbacks(); ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); for (size_t stride = 128; stride <= json.size(); stride *= 2) { bench.run("parser3 (stride: " + std::to_string(stride) + ")", [&]() { auto copy = json; WeaselJsonParser_reset(parser); for (size_t i = 0; i < copy.size(); i += stride) { if (WeaselJsonParser_parse(parser, copy.data() + i, std::min(copy.size() - i, stride)) != WeaselJson_AGAIN) { abort(); } } if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) { abort(); } }); } WeaselJsonParser_destroy(parser); } TEST_CASE("bench4") { using namespace simdjson; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("simdjson dom", [&]() { simdjson::padded_string my_padded_data(json.data(), json.size()); simdjson::dom::parser parser; auto doc = parser.parse(my_padded_data); bench.doNotOptimizeAway(doc); }); } TEST_CASE("bench5") { using namespace simdjson; ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.run("simdjson on demand", [&]() { padded_string my_padded_data(json.data(), json.size()); ondemand::parser parser; auto doc = parser.iterate(my_padded_data); bench.doNotOptimizeAway(doc); }); } TEST_CASE("num dfa") { NumDfa dfa; std::string match = "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" "11111111"; auto *buf = dfa.scan(match.data(), match.data() + match.size()); CHECK(buf == match.data() + match.size()); CHECK(dfa.accept()); ankerl::nanobench::Bench bench; bench.batch(match.size()); bench.unit("byte"); bench.run("number dfa", [&]() { dfa.reset(); bench.doNotOptimizeAway( dfa.scan(match.data(), match.data() + match.size())); }); } const char *utf8str = "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩" "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩" "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"; TEST_CASE("utf8 dfa") { Utf8Dfa dfa; std::string match = utf8str; auto *buf = dfa.scan(match.data(), match.data() + match.size()); CHECK(buf == match.data() + match.size()); CHECK(dfa.accept()); ankerl::nanobench::Bench bench; bench.batch(match.size()); bench.unit("byte"); bench.run("utf8 dfa", [&]() { dfa.reset(); bench.doNotOptimizeAway( dfa.scan(match.data(), match.data() + match.size())); }); bench.run("simdjson utf8", [&]() { bench.doNotOptimizeAway( simdjson::validate_utf8(match.data(), match.size())); }); } // Different input structures with special care in the implementation // performance wise TEST_CASE("bench input types") { auto bench = [](std::string name, std::string json) { auto c = noopCallbacks(); ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); bench.relative(true); bench.run("simdjson dom " + name, [&]() { simdjson::padded_string my_padded_data(json.data(), json.size()); simdjson::dom::parser parser; auto doc = parser.parse(my_padded_data); bench.doNotOptimizeAway(doc); }); auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); bench.run("parser3 " + name, [&]() { auto copy = json; WeaselJsonParser_reset(parser); if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) != WeaselJson_AGAIN) { abort(); } if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) { abort(); } }); WeaselJsonParser_destroy(parser); }; bench("numbers", "[-123456789.000000000000000123456789e+12, " "-123456789.000000000000000123456789E+12, " "-123456789.000000000000000123456789e-12, " "-123456789.000000000000000123456789E-12, " "-123456789.000000000000000123456789e+12, " "-123456789.000000000000000123456789E+12, " "-123456789.000000000000000123456789e-12, " "-123456789.000000000000000123456789E-12, " "-123456789.000000000000000123456789e+12, " "-123456789.000000000000000123456789E+12, " "-123456789.000000000000000123456789e-12, " "-123456789.000000000000000123456789E-12, " "-123456789.000000000000000123456789e+12, " "-123456789.000000000000000123456789E+12, " "-123456789.000000000000000123456789e-12, " "-123456789.000000000000000123456789E-12, " "-123456789.000000000000000123456789e+12]"); bench("ascii", "\"Donec lobortis eleifend condimentum. Cras dictum dolor lacinia " "lectus vehicula rutrum. Maecenas quis nisi nunc. Nam tristique " "feugiat est vitae mollis. Maecenas quis nisi nunc.\""); bench("utf-8", std::string("\"") + utf8str + "\""); bench("normal escapes", R"( ["\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/", "\n\r\t\"\b\f\\\/"] )"); bench("unicode escapes", R"( ["\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37", "\uabcd\u1234\ud801\udc37"] )"); bench("structural", R"( [ {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]}, {"": [{"": [[], [], [], [], [], [[[[[[[]]]]]]], {"": ""}]}]} ] )"); bench("whitespace", R"( [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] )"); bench("literals", R"([ true, false, null, true, false, null, true, false, null, true, false, null, true, false, null, true, false, null, true, false, null ] )"); }