302 lines
6.6 KiB
C++
302 lines
6.6 KiB
C++
#include <cassert>
|
|
#include <cctype>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
|
|
#include <limits>
|
|
#include <string>
|
|
|
|
#include <doctest.h>
|
|
#include <nanobench.h>
|
|
#include <simdjson.h>
|
|
|
|
#include "callbacks.h"
|
|
#include "parser3.h"
|
|
|
|
// This is the JSON grammar in McKeeman Form.
|
|
|
|
// json
|
|
// element
|
|
|
|
// value
|
|
// object
|
|
// array
|
|
// string
|
|
// number
|
|
// "true"
|
|
// "false"
|
|
// "null"
|
|
|
|
// object
|
|
// '{' ws '}'
|
|
// '{' members '}'
|
|
|
|
// members
|
|
// member
|
|
// member ',' members
|
|
|
|
// member
|
|
// ws string ws ':' element
|
|
|
|
// array
|
|
// '[' ws ']'
|
|
// '[' elements ']'
|
|
|
|
// elements
|
|
// element
|
|
// element ',' elements
|
|
|
|
// element
|
|
// ws value ws
|
|
|
|
// string
|
|
// '"' characters '"'
|
|
|
|
// characters
|
|
// ""
|
|
// character characters
|
|
|
|
// character
|
|
// '0020' . '10FFFF' - '"' - '\'
|
|
// '\' escape
|
|
|
|
// escape
|
|
// '"'
|
|
// '\'
|
|
// '/'
|
|
// 'b'
|
|
// 'f'
|
|
// 'n'
|
|
// 'r'
|
|
// 't'
|
|
// 'u' hex hex hex hex
|
|
|
|
// hex
|
|
// digit
|
|
// 'A' . 'F'
|
|
// 'a' . 'f'
|
|
|
|
// number
|
|
// integer fraction exponent
|
|
|
|
// integer
|
|
// digit
|
|
// onenine digits
|
|
// '-' digit
|
|
// '-' onenine digits
|
|
|
|
// digits
|
|
// digit
|
|
// digit digits
|
|
|
|
// digit
|
|
// '0'
|
|
// onenine
|
|
|
|
// onenine
|
|
// '1' . '9'
|
|
|
|
// fraction
|
|
// ""
|
|
// '.' digits
|
|
|
|
// exponent
|
|
// ""
|
|
// 'E' sign digits
|
|
// 'e' sign digits
|
|
|
|
// sign
|
|
// ""
|
|
// '+'
|
|
// '-'
|
|
|
|
// ws
|
|
// ""
|
|
// '0020' ws
|
|
// '000A' ws
|
|
// '000D' ws
|
|
// '0009' ws
|
|
|
|
namespace {
|
|
|
|
// Sample JSON document (a few scalar members plus a nested "glossary" object)
// used as the input for the test cases and benchmarks in this file.
const std::string json = R"({
|
|
"a number": 12345,
|
|
"true": true,
|
|
"false": false,
|
|
"null": null,
|
|
"glossary": {
|
|
"title": "example glossary",
|
|
"GlossDiv": {
|
|
"title": "S",
|
|
"GlossList": {
|
|
"GlossEntry": {
|
|
"ID": "SGML",
|
|
"SortAs": "SGML",
|
|
"GlossTerm": "Standard Generalized Markup Language",
|
|
"Acronym": "SGML",
|
|
"Abbrev": "ISO 8879:1986",
|
|
"GlossDef": {
|
|
"para": "A meta-markup language, used to create markup languages such as DocBook.",
|
|
"GlossSeeAlso": ["GML", "XML"]
|
|
},
|
|
"GlossSee": "markup"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
})";
|
|
|
|
void testStreaming(std::string const &json) {
|
|
SerializeState streaming;
|
|
SerializeState batch;
|
|
auto c = serializeCallbacks();
|
|
{
|
|
auto copy = json;
|
|
parser3::Parser3 parser(&c, &streaming);
|
|
for (int i = 0; i < copy.size(); ++i) {
|
|
REQUIRE(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
|
|
}
|
|
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
|
|
}
|
|
{
|
|
auto copy = json;
|
|
parser3::Parser3 parser(&c, &batch);
|
|
REQUIRE(parser.parse(copy.data(), copy.size()) == parser3::S_AGAIN);
|
|
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
|
|
}
|
|
CHECK(streaming.result == batch.result);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Smoke tests for Parser3: two well-formed documents must be accepted and two
// truncated documents must be rejected once end of input is reached.
TEST_CASE("parser3") {
  Callbacks c = serializeCallbacks();
  SerializeState state;
  {
    // Sample document, one byte per parse() call. All bytes but the last are
    // REQUIREd (abort on first failure); the last byte is only CHECKed.
    auto copy = json;
    parser3::Parser3 parser(&c, &state);
    // Unsigned index with `i + 1 < length()` avoids the signed/unsigned
    // comparison and the wrap-around of `length() - 1` on an empty string.
    std::size_t i = 0;
    for (; i + 1 < copy.length(); ++i) {
      REQUIRE(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
    }
    CHECK(parser.parse(copy.data() + i, 1) == parser3::S_AGAIN);
    CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
    // NOTE(review): presumably terminates the line written by the serialize
    // callbacks -- confirm against callbacks.h.
    puts("");
  }
  {
    // Empty array and empty object as member values, parsed in one call.
    std::string copy = "{\"x\": [], \"y\": {}}";
    parser3::Parser3 parser(&c, &state);
    CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
    CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
    puts("");
  }
  {
    // Input ends inside a string value: rejected at end of input.
    auto noop = noopCallbacks();
    std::string copy = "{\"a\":\"a";
    parser3::Parser3 parser(&noop, &state);
    CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
    CHECK(parser.parse(nullptr, 0) == parser3::S_REJECT);
  }
  {
    // Input ends right after an opening bracket: rejected at end of input.
    auto noop = noopCallbacks();
    std::string copy = "[";
    parser3::Parser3 parser(&noop, &state);
    CHECK(parser.parse(copy.data(), copy.length()) == parser3::S_AGAIN);
    CHECK(parser.parse(nullptr, 0) == parser3::S_REJECT);
  }
}
|
|
|
|
// Runs the byte-at-a-time vs. single-buffer comparison on the sample document.
TEST_CASE("streaming") {
  testStreaming(json);
}
|
|
|
|
void doTestUnescapingUtf8(std::string const &escaped,
|
|
std::string const &expected, int stride) {
|
|
CAPTURE(escaped);
|
|
CAPTURE(expected);
|
|
CAPTURE(stride);
|
|
auto c = noopCallbacks();
|
|
std::string result;
|
|
c.on_string_data = +[](void *p, const char *buf, int len) {
|
|
auto &s = *(std::string *)p;
|
|
s.append(buf, len);
|
|
};
|
|
parser3::Parser3 parser(&c, &result);
|
|
auto copy = escaped;
|
|
for (int i = 0; i < copy.size(); i += stride) {
|
|
CAPTURE(i);
|
|
CHECK(
|
|
parser.parse(copy.data() + i, std::min<int>(stride, copy.size() - i)) ==
|
|
parser3::S_AGAIN);
|
|
}
|
|
CHECK(parser.parse(nullptr, 0) == parser3::S_OK);
|
|
CHECK(result.size() == expected.size());
|
|
CHECK(result == expected);
|
|
}
|
|
|
|
void testUnescapingUtf8(std::string const &escaped,
|
|
std::string const &expected) {
|
|
for (int stride = 0; stride < 10; ++stride) {
|
|
doTestUnescapingUtf8(escaped, expected,
|
|
stride == 0 ? std::numeric_limits<int>::max()
|
|
: stride);
|
|
}
|
|
}
|
|
|
|
// Checks the bytes delivered for JSON string escapes: the two-character
// escapes and \uXXXX escapes whose UTF-8 encodings are 2, 3 and 4 bytes long.
//
// A stray `return;` after the first assertion used to make every later case
// unreachable; it has been removed so all cases run.
TEST_CASE("unescaping utf-8") {
  // 4 byte encoding (utf-16 surrogate pair)
  testUnescapingUtf8("\"\\ud801\\udc37\"", "𐐷");

  // Basic
  testUnescapingUtf8("\"\\\"\"", "\"");
  testUnescapingUtf8("\"\\\\\"", "\\");
  testUnescapingUtf8("\"\\/\"", "/");
  testUnescapingUtf8("\"\\b\"", "\b");
  testUnescapingUtf8("\"\\f\"", "\f");
  testUnescapingUtf8("\"\\n\"", "\n");
  testUnescapingUtf8("\"\\r\"", "\r");
  testUnescapingUtf8("\"\\t\"", "\t");
  // 2 byte encoding
  testUnescapingUtf8("\"\\u07aB 1234\"", "\u07aB 1234");
  // 3 byte encoding
  testUnescapingUtf8("\"\\uaB34 5678\"", "\uaB34 5678");
}
|
|
|
|
// Benchmarks Parser3 with no-op callbacks on the sample document.
TEST_CASE("bench3") {
  auto callbacks = noopCallbacks();
  ankerl::nanobench::Bench bench;
  // One batch is the whole document, so results are reported per byte.
  bench.batch(json.size()).unit("byte");
  bench.run("parser3", [&]() {
    // The input is copied on every iteration, so the copy is part of what is
    // measured.
    auto input = json;
    parser3::Parser3 parser(&callbacks, nullptr);
    bench.doNotOptimizeAway(parser.parse(input.data(), input.length()));
    bench.doNotOptimizeAway(parser.parse(nullptr, 0) == parser3::S_OK);
  });
}
|
|
|
|
// Benchmarks simdjson's dom::parser on the sample document.
TEST_CASE("bench4") {
  ankerl::nanobench::Bench bench;
  // One batch is the whole document, so results are reported per byte.
  bench.batch(json.size()).unit("byte");
  bench.run("parser4", [&]() {
    // Building the padded copy and the parser happens inside the measured
    // region, on every iteration.
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::dom::parser domParser;
    auto doc = domParser.parse(padded);
    bench.doNotOptimizeAway(doc);
  });
}
|
|
|
|
// Benchmarks simdjson's ondemand::parser::iterate on the sample document.
// NOTE(review): simdjson documents the On Demand API as lazy, so iterate()
// alone may not do work comparable to the other benchmarks -- confirm this is
// the intended comparison.
TEST_CASE("bench5") {
  ankerl::nanobench::Bench bench;
  // One batch is the whole document, so results are reported per byte.
  bench.batch(json.size()).unit("byte");
  bench.run("parser5", [&]() {
    // Building the padded copy and the parser happens inside the measured
    // region, on every iteration.
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::ondemand::parser lazyParser;
    auto doc = lazyParser.iterate(padded);
    bench.doNotOptimizeAway(doc);
  });
}
|