diff --git a/include/weaseljson.h b/include/weaseljson.h index 7dbe201..aedbd7d 100644 --- a/include/weaseljson.h +++ b/include/weaseljson.h @@ -1,7 +1,7 @@ #ifndef WEASELJSON_H #define WEASELJSON_H -struct Callbacks { +struct WeaselJsonCallbacks { void (*on_begin_object)(void *data); void (*on_end_object)(void *data); void (*on_begin_string)(void *data); diff --git a/src/callbacks.h b/src/callbacks.h index aab4bc5..396ed0b 100644 --- a/src/callbacks.h +++ b/src/callbacks.h @@ -7,8 +7,8 @@ #include "weaseljson.h" -inline Callbacks printCallbacks() { - Callbacks result; +inline WeaselJsonCallbacks printCallbacks() { + WeaselJsonCallbacks result; result.on_begin_object = +[](void *) { puts("on_begin_object"); }; result.on_end_object = +[](void *) { puts("on_end_object"); }; result.on_begin_string = +[](void *) { puts("on_begin_string"); }; @@ -29,8 +29,8 @@ inline Callbacks printCallbacks() { return result; } -inline Callbacks noopCallbacks() { - Callbacks result; +inline WeaselJsonCallbacks noopCallbacks() { + WeaselJsonCallbacks result; result.on_begin_object = +[](void *) {}; result.on_end_object = +[](void *) {}; result.on_begin_string = +[](void *) {}; @@ -72,8 +72,8 @@ struct SerializeState { std::vector stack; }; -inline Callbacks serializeCallbacks() { - Callbacks result; +inline WeaselJsonCallbacks serializeCallbacks() { + WeaselJsonCallbacks result; result.on_begin_object = +[](void *p) { auto *state = (SerializeState *)p; state->on_begin_value(); diff --git a/src/fuzz.cpp b/src/fuzz.cpp index 5dc38d9..35bdd71 100644 --- a/src/fuzz.cpp +++ b/src/fuzz.cpp @@ -1,14 +1,17 @@ #include "callbacks.h" +#include "json_value.h" #include "parser3.h" #include -std::pair runStreaming(std::string copy) { +std::pair runStreaming(std::string copy, + int stride) { SerializeState state; auto c = serializeCallbacks(); parser3::Parser3 parser(&c, &state); - for (int i = 0; i < copy.size(); ++i) { - auto s = parser.parse(copy.data() + i, 1); + for (int i = 0; i < copy.size(); i += stride) { + auto s = + parser.parse(copy.data() + i, std::min(stride, copy.size() - i)); if (s != parser3::S_AGAIN) { return {state.result, s}; } @@ -36,20 +39,23 @@ std::pair runBatch(std::string copy) { } void testStreaming(std::string const &json) { - auto streaming = runStreaming(json); auto batch = runBatch(json); - if (streaming != batch) { - if (streaming.second == batch.second && streaming.second != parser3::S_OK) { - // It's ok if the processed data doesn't match if parsing failed - return; + for (int stride = 1; stride < 16; ++stride) { + auto streaming = runStreaming(json, stride); + if (streaming != batch) { + if (streaming.second == batch.second && + streaming.second != parser3::S_OK) { + // It's ok if the processed data doesn't match if parsing failed + return; + } + printf("streaming: %s, %s\n", + streaming.second == parser3::S_OK ? "accept" : "reject", + streaming.first.c_str()); + printf("batch: %s, %s\n", + streaming.second == parser3::S_OK ? "accept" : "reject", + batch.first.c_str()); + abort(); } - printf("streaming: %s, %s\n", - streaming.second == parser3::S_OK ? "accept" : "reject", - streaming.first.c_str()); - printf("batch: %s, %s\n", - streaming.second == parser3::S_OK ? "accept" : "reject", - batch.first.c_str()); - abort(); } } @@ -92,9 +98,27 @@ void compareWithSimdjson(std::string const &json) { } } +void testStringRoundTrip(std::string_view s) { + if (!simdjson::validate_utf8(s.data(), s.size())) { + // You can't encode non utf-8 data in a json string, even with escaping + return; + } + for (int stride = 0; stride < 16; ++stride) { + auto escaped = "\"" + escapeAsJsonString(s) + "\""; + auto parsed = toValue(std::move(escaped)); + if (!parsed.has_value()) { + abort(); + } + if (std::get(*parsed) != s) { + abort(); + } + } +} + extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { auto s = std::string((const char *)data, size); testStreaming(s); compareWithSimdjson(s); + testStringRoundTrip(s); return 0; } diff --git a/src/json_value.h b/src/json_value.h new file mode 100644 index 0000000..5677ade --- /dev/null +++ b/src/json_value.h @@ -0,0 +1,215 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "parser3.h" +#include "weaseljson.h" + +struct JsonNumber : std::string {}; +using JsonValue = std::variant, + std::unique_ptr>; +struct JsonArray : std::vector {}; +struct JsonObject : std::map {}; + +struct ReadValueState { + JsonValue result; + std::vector valueStack; + std::vector keyStack; + std::vector isKeyStack; + void on_end_value() { + auto object = std::move(valueStack.back()); + valueStack.pop_back(); + if (valueStack.empty()) { + result = std::move(object); + return; + } + auto i = valueStack.back().index(); + switch (i) { + case 0: // null + case 1: // bool + case 2: // string + case 3: // number + __builtin_unreachable(); + case 4: // array + std::get>(valueStack.back()) + ->push_back(std::move(object)); + return; + case 5: // object + if (std::exchange(isKeyStack.back(), !isKeyStack.back())) { + keyStack.push_back(std::move(std::get(object))); + } else { + std::get>(valueStack.back()) + ->emplace(std::move(keyStack.back()), std::move(object)); + keyStack.pop_back(); + } + return; + } + } +}; + +inline WeaselJsonCallbacks readValueCallbacks() { + WeaselJsonCallbacks result; + result.on_begin_object = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(std::make_unique()); + state->isKeyStack.push_back(true); + }; + result.on_end_object = +[](void *p) { + auto *state = (ReadValueState *)p; + state->isKeyStack.pop_back(); + state->on_end_value(); + }; + result.on_begin_string = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(std::string()); + }; + result.on_string_data = +[](void *p, const char *buf, int len) { + auto *state = (ReadValueState *)p; + std::get(state->valueStack.back()).append(buf, len); + }; + result.on_end_string = +[](void *p) { + auto *state = (ReadValueState *)p; + state->on_end_value(); + }; + result.on_begin_array = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(std::make_unique()); + }; + result.on_end_array = +[](void *p) { + auto *state = (ReadValueState *)p; + state->on_end_value(); + }; + result.on_begin_number = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(JsonNumber()); + }; + result.on_number_data = +[](void *p, const char *buf, int len) { + auto *state = (ReadValueState *)p; + std::get(state->valueStack.back()).append(buf, len); + }; + result.on_end_number = +[](void *p) { + auto *state = (ReadValueState *)p; + state->on_end_value(); + }; + result.on_true_literal = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(true); + state->on_end_value(); + }; + result.on_false_literal = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(false); + state->on_end_value(); + }; + result.on_null_literal = +[](void *p) { + auto *state = (ReadValueState *)p; + state->valueStack.emplace_back(nullptr); + state->on_end_value(); + }; + return result; +} + +inline std::string escapeAsJsonString(std::string_view s) { + std::string result; + for (uint8_t c : s) { + switch (c) { + case '\"': + result.append(R"(\")"); + break; + case '\\': + result.append(R"(\\)"); + break; + case '\b': + result.append(R"(\b)"); + break; + case '\f': + result.append(R"(\f)"); + break; + case '\n': + result.append(R"(\n)"); + break; + case '\r': + result.append(R"(\r)"); + break; + case '\t': + result.append(R"(\t)"); + break; + default: + if (c < 0x20) { + const char *hex = "0123456789abcdef"; + result.append(R"(\u00)"); + result.push_back(hex[c >> 4]); + result.push_back(hex[c & 15]); + } else { + // TODO check if valid utf-8 + result.push_back(c); + } + } + } + return result; +} + +inline std::string toString(JsonValue const &jsonValue) { + switch (jsonValue.index()) { + case 0: // null + return "null"; + case 1: // bool + return std::get(jsonValue) ? "true" : "false"; + case 2: // string + return "\"" + escapeAsJsonString(std::get(jsonValue)) + "\""; + case 3: // number + return std::get(jsonValue); + case 4: // array + { + std::string result = "["; + std::string delimiter = ""; + for (auto const &v : *std::get>(jsonValue)) { + result += delimiter + toString(v); + delimiter = ", "; + } + return result + "]"; + } + case 5: // object + { + std::string result = "{"; + std::string delimiter = ""; + for (auto const &[k, v] : + *std::get>(jsonValue)) { + result += delimiter + "\"" + escapeAsJsonString(k) + "\": " + toString(v); + delimiter = ", "; + } + return result + "}"; + } + } + __builtin_unreachable(); +} + +inline std::optional toValue(std::string copy, int stride = 0) { + ReadValueState state; + auto c = readValueCallbacks(); + parser3::Parser3 parser(&c, &state); + if (stride == 0) { + if (parser.parse(copy.data(), copy.size()) != parser3::S_AGAIN) { + return std::nullopt; + } + } else { + for (int i = 0; i < copy.size(); i += stride) { + if (parser.parse(copy.data(), std::min(stride, copy.size() - i)) != + parser3::S_AGAIN) { + return std::nullopt; + } + } + } + if (parser.parse(nullptr, 0) != parser3::S_OK) { + return std::nullopt; + } + return std::move(state.result); +} diff --git a/src/parser3.h b/src/parser3.h index 3553960..b53b4e7 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -74,7 +74,7 @@ enum Symbol : uint8_t { N_SYMBOL_COUNT, // Must be last }; struct Parser3 { - Parser3(const Callbacks *callbacks, void *data) + Parser3(const WeaselJsonCallbacks *callbacks, void *data) : callbacks(callbacks), data(data) { std::ignore = push({N_WHITESPACE, N_VALUE, N_WHITESPACE, T_EOF}); } @@ -138,7 +138,7 @@ struct Parser3 { char *dataBegin; // Used for unescaping string data in place char *writeBuf; - const Callbacks *const callbacks; + const WeaselJsonCallbacks *const callbacks; void *const data; Symbol stack[kMaxStackSize]; Symbol *stackPtr = stack; diff --git a/src/test.cpp b/src/test.cpp index a6680d3..b30b3f9 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -170,7 +170,7 @@ void testStreaming(std::string const &json) { } // namespace TEST_CASE("parser3") { - Callbacks c = serializeCallbacks(); + WeaselJsonCallbacks c = serializeCallbacks(); SerializeState state; { auto copy = json;