Validate correct string data in fuzz test

This commit is contained in:
2025-05-21 15:45:24 -04:00
parent c261b64620
commit 611d1a07de
6 changed files with 264 additions and 25 deletions

View File

@@ -7,8 +7,8 @@
#include "weaseljson.h"
inline Callbacks printCallbacks() {
Callbacks result;
inline WeaselJsonCallbacks printCallbacks() {
WeaselJsonCallbacks result;
result.on_begin_object = +[](void *) { puts("on_begin_object"); };
result.on_end_object = +[](void *) { puts("on_end_object"); };
result.on_begin_string = +[](void *) { puts("on_begin_string"); };
@@ -29,8 +29,8 @@ inline Callbacks printCallbacks() {
return result;
}
inline Callbacks noopCallbacks() {
Callbacks result;
inline WeaselJsonCallbacks noopCallbacks() {
WeaselJsonCallbacks result;
result.on_begin_object = +[](void *) {};
result.on_end_object = +[](void *) {};
result.on_begin_string = +[](void *) {};
@@ -72,8 +72,8 @@ struct SerializeState {
std::vector<Cursor> stack;
};
inline Callbacks serializeCallbacks() {
Callbacks result;
inline WeaselJsonCallbacks serializeCallbacks() {
WeaselJsonCallbacks result;
result.on_begin_object = +[](void *p) {
auto *state = (SerializeState *)p;
state->on_begin_value();

View File

@@ -1,14 +1,17 @@
#include "callbacks.h"
#include "json_value.h"
#include "parser3.h"
#include <simdjson.h>
std::pair<std::string, parser3::Status> runStreaming(std::string copy) {
std::pair<std::string, parser3::Status> runStreaming(std::string copy,
int stride) {
SerializeState state;
auto c = serializeCallbacks();
parser3::Parser3 parser(&c, &state);
for (int i = 0; i < copy.size(); ++i) {
auto s = parser.parse(copy.data() + i, 1);
for (int i = 0; i < copy.size(); i += stride) {
auto s =
parser.parse(copy.data() + i, std::min<int>(stride, copy.size() - i));
if (s != parser3::S_AGAIN) {
return {state.result, s};
}
@@ -36,20 +39,23 @@ std::pair<std::string, parser3::Status> runBatch(std::string copy) {
}
void testStreaming(std::string const &json) {
auto streaming = runStreaming(json);
auto batch = runBatch(json);
if (streaming != batch) {
if (streaming.second == batch.second && streaming.second != parser3::S_OK) {
// It's ok if the processed data doesn't match if parsing failed
return;
for (int stride = 1; stride < 16; ++stride) {
auto streaming = runStreaming(json, stride);
if (streaming != batch) {
if (streaming.second == batch.second &&
streaming.second != parser3::S_OK) {
// It's ok if the processed data doesn't match if parsing failed
return;
}
printf("streaming: %s, %s\n",
streaming.second == parser3::S_OK ? "accept" : "reject",
streaming.first.c_str());
printf("batch: %s, %s\n",
streaming.second == parser3::S_OK ? "accept" : "reject",
batch.first.c_str());
abort();
}
printf("streaming: %s, %s\n",
streaming.second == parser3::S_OK ? "accept" : "reject",
streaming.first.c_str());
printf("batch: %s, %s\n",
streaming.second == parser3::S_OK ? "accept" : "reject",
batch.first.c_str());
abort();
}
}
@@ -92,9 +98,27 @@ void compareWithSimdjson(std::string const &json) {
}
}
void testStringRoundTrip(std::string_view s) {
if (!simdjson::validate_utf8(s.data(), s.size())) {
// You can't encode non utf-8 data in a json string, even with escaping
return;
}
for (int stride = 0; stride < 16; ++stride) {
auto escaped = "\"" + escapeAsJsonString(s) + "\"";
auto parsed = toValue(std::move(escaped));
if (!parsed.has_value()) {
abort();
}
if (std::get<std::string>(*parsed) != s) {
abort();
}
}
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
auto s = std::string((const char *)data, size);
testStreaming(s);
compareWithSimdjson(s);
testStringRoundTrip(s);
return 0;
}

215
src/json_value.h Normal file
View File

@@ -0,0 +1,215 @@
#pragma once
#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <variant>
#include <vector>
#include "parser3.h"
#include "weaseljson.h"
struct JsonNumber : std::string {};
using JsonValue = std::variant<std::nullptr_t, bool, std::string, JsonNumber,
std::unique_ptr<struct JsonArray>,
std::unique_ptr<struct JsonObject>>;
struct JsonArray : std::vector<JsonValue> {};
struct JsonObject : std::map<std::string, JsonValue> {};
struct ReadValueState {
JsonValue result;
std::vector<JsonValue> valueStack;
std::vector<std::string> keyStack;
std::vector</*bool*/ int> isKeyStack;
void on_end_value() {
auto object = std::move(valueStack.back());
valueStack.pop_back();
if (valueStack.empty()) {
result = std::move(object);
return;
}
auto i = valueStack.back().index();
switch (i) {
case 0: // null
case 1: // bool
case 2: // string
case 3: // number
__builtin_unreachable();
case 4: // array
std::get<std::unique_ptr<JsonArray>>(valueStack.back())
->push_back(std::move(object));
return;
case 5: // object
if (std::exchange(isKeyStack.back(), !isKeyStack.back())) {
keyStack.push_back(std::move(std::get<std::string>(object)));
} else {
std::get<std::unique_ptr<JsonObject>>(valueStack.back())
->emplace(std::move(keyStack.back()), std::move(object));
keyStack.pop_back();
}
return;
}
}
};
inline WeaselJsonCallbacks readValueCallbacks() {
WeaselJsonCallbacks result;
result.on_begin_object = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(std::make_unique<JsonObject>());
state->isKeyStack.push_back(true);
};
result.on_end_object = +[](void *p) {
auto *state = (ReadValueState *)p;
state->isKeyStack.pop_back();
state->on_end_value();
};
result.on_begin_string = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(std::string());
};
result.on_string_data = +[](void *p, const char *buf, int len) {
auto *state = (ReadValueState *)p;
std::get<std::string>(state->valueStack.back()).append(buf, len);
};
result.on_end_string = +[](void *p) {
auto *state = (ReadValueState *)p;
state->on_end_value();
};
result.on_begin_array = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(std::make_unique<JsonArray>());
};
result.on_end_array = +[](void *p) {
auto *state = (ReadValueState *)p;
state->on_end_value();
};
result.on_begin_number = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(JsonNumber());
};
result.on_number_data = +[](void *p, const char *buf, int len) {
auto *state = (ReadValueState *)p;
std::get<JsonNumber>(state->valueStack.back()).append(buf, len);
};
result.on_end_number = +[](void *p) {
auto *state = (ReadValueState *)p;
state->on_end_value();
};
result.on_true_literal = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(true);
state->on_end_value();
};
result.on_false_literal = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(false);
state->on_end_value();
};
result.on_null_literal = +[](void *p) {
auto *state = (ReadValueState *)p;
state->valueStack.emplace_back(nullptr);
state->on_end_value();
};
return result;
}
inline std::string escapeAsJsonString(std::string_view s) {
std::string result;
for (uint8_t c : s) {
switch (c) {
case '\"':
result.append(R"(\")");
break;
case '\\':
result.append(R"(\\)");
break;
case '\b':
result.append(R"(\b)");
break;
case '\f':
result.append(R"(\f)");
break;
case '\n':
result.append(R"(\n)");
break;
case '\r':
result.append(R"(\r)");
break;
case '\t':
result.append(R"(\t)");
break;
default:
if (c < 0x20) {
const char *hex = "0123456789abcdef";
result.append(R"(\u00)");
result.push_back(hex[c >> 4]);
result.push_back(hex[c & 15]);
} else {
// TODO check if valid utf-8
result.push_back(c);
}
}
}
return result;
}
inline std::string toString(JsonValue const &jsonValue) {
switch (jsonValue.index()) {
case 0: // null
return "null";
case 1: // bool
return std::get<bool>(jsonValue) ? "true" : "false";
case 2: // string
return "\"" + escapeAsJsonString(std::get<std::string>(jsonValue)) + "\"";
case 3: // number
return std::get<JsonNumber>(jsonValue);
case 4: // array
{
std::string result = "[";
std::string delimiter = "";
for (auto const &v : *std::get<std::unique_ptr<JsonArray>>(jsonValue)) {
result += delimiter + toString(v);
delimiter = ", ";
}
return result + "]";
}
case 5: // object
{
std::string result = "{";
std::string delimiter = "";
for (auto const &[k, v] :
*std::get<std::unique_ptr<JsonObject>>(jsonValue)) {
result += delimiter + "\"" + escapeAsJsonString(k) + "\": " + toString(v);
delimiter = ", ";
}
return result + "}";
}
}
__builtin_unreachable();
}
inline std::optional<JsonValue> toValue(std::string copy, int stride = 0) {
ReadValueState state;
auto c = readValueCallbacks();
parser3::Parser3 parser(&c, &state);
if (stride == 0) {
if (parser.parse(copy.data(), copy.size()) != parser3::S_AGAIN) {
return std::nullopt;
}
} else {
for (int i = 0; i < copy.size(); i += stride) {
if (parser.parse(copy.data(), std::min<int>(stride, copy.size() - i)) !=
parser3::S_AGAIN) {
return std::nullopt;
}
}
}
if (parser.parse(nullptr, 0) != parser3::S_OK) {
return std::nullopt;
}
return std::move(state.result);
}

View File

@@ -74,7 +74,7 @@ enum Symbol : uint8_t {
N_SYMBOL_COUNT, // Must be last
};
struct Parser3 {
Parser3(const Callbacks *callbacks, void *data)
Parser3(const WeaselJsonCallbacks *callbacks, void *data)
: callbacks(callbacks), data(data) {
std::ignore = push({N_WHITESPACE, N_VALUE, N_WHITESPACE, T_EOF});
}
@@ -138,7 +138,7 @@ struct Parser3 {
char *dataBegin;
// Used for unescaping string data in place
char *writeBuf;
const Callbacks *const callbacks;
const WeaselJsonCallbacks *const callbacks;
void *const data;
Symbol stack[kMaxStackSize];
Symbol *stackPtr = stack;

View File

@@ -170,7 +170,7 @@ void testStreaming(std::string const &json) {
} // namespace
TEST_CASE("parser3") {
Callbacks c = serializeCallbacks();
WeaselJsonCallbacks c = serializeCallbacks();
SerializeState state;
{
auto copy = json;