From 1217ded8a7bdc19c3394815735bff6a1b2567b38 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 23 May 2025 11:59:50 -0400 Subject: [PATCH] Convert everything to c api --- CMakeLists.txt | 5 +-- include/weaseljson.h | 6 ++-- src/fuzz.cpp | 27 ++++++++------- src/json_value.h | 13 +++---- src/lib.cpp | 42 +++++++++++++---------- src/parser3.h | 38 +++++++++++++-------- src/test.cpp | 81 +++++++++++++++++++++++++++----------------- src/validate.cpp | 7 ++-- 8 files changed, 130 insertions(+), 89 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 38752f0..331c749 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ endif() add_executable(mytest src/test.cpp) target_include_directories(mytest PRIVATE include) -target_link_libraries(mytest PRIVATE doctest nanobench simdjson) +target_link_libraries(mytest PRIVATE ${PROJECT_NAME} doctest nanobench simdjson) doctest_discover_tests(mytest) include(CMakePushCheckState) @@ -106,7 +106,7 @@ check_cxx_compiler_flag(-fsanitize=fuzzer-no-link HAS_LIB_FUZZER) cmake_pop_check_state() if(HAS_LIB_FUZZER) - add_executable(fuzz src/fuzz.cpp) + add_executable(fuzz src/fuzz.cpp src/lib.cpp) target_include_directories(fuzz PRIVATE include) target_link_libraries(fuzz PRIVATE simdjson) target_compile_options(fuzz PRIVATE -fsanitize=fuzzer) @@ -114,4 +114,5 @@ if(HAS_LIB_FUZZER) endif() add_executable(validate src/validate.cpp) +target_link_libraries(validate ${PROJECT_NAME}-static) target_include_directories(validate PRIVATE include) diff --git a/include/weaseljson.h b/include/weaseljson.h index ce2384b..ea7b70c 100644 --- a/include/weaseljson.h +++ b/include/weaseljson.h @@ -41,7 +41,8 @@ typedef struct WeaselJsonParser WeaselJsonParser; /** Create a parser. Increasing stack size increases memory usage but also * increases the depth of nested json accepted. `callbacks` and `data` must - * outlive the returned parser. */ + * outlive the returned parser. Returns null if there's insufficient available + * memory */ WeaselJsonParser *WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks, void *data); @@ -53,7 +54,8 @@ void WeaselJsonParser_reset(WeaselJsonParser *parser); void WeaselJsonParser_destroy(WeaselJsonParser *parser); /** Incrementally parse `len` more bytes starting at `buf`. `buf` may be - * modified. Call with `len` 0 to indicate end of data */ + * modified. Call with `len` 0 to indicate end of data. `buf` may be null if + * `len` is 0 */ WeaselJsonStatus WeaselJsonParser_parse(WeaselJsonParser *parser, char *buf, int len); diff --git a/src/fuzz.cpp b/src/fuzz.cpp index 82da5e9..eacc770 100644 --- a/src/fuzz.cpp +++ b/src/fuzz.cpp @@ -1,6 +1,5 @@ #include "callbacks.h" #include "json_value.h" -#include "parser3.h" #include "weaseljson.h" #include @@ -9,34 +8,36 @@ std::pair runStreaming(std::string copy, int stride) { SerializeState state; auto c = serializeCallbacks(); - parser3::Parser3 parser(&c, &state); + std::unique_ptr parser{ + WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; if (stride == 0) { - auto s = parser.parse(copy.data(), copy.size()); + auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (s != WeaselJson_AGAIN) { return {state.result, s}; } } else { for (int i = 0; i < copy.size(); i += stride) { - auto s = - parser.parse(copy.data() + i, std::min(stride, copy.size() - i)); + auto s = WeaselJsonParser_parse(parser.get(), copy.data() + i, + std::min(stride, copy.size() - i)); if (s != WeaselJson_AGAIN) { return {state.result, s}; } } } - auto s = parser.parse(nullptr, 0); + auto s = WeaselJsonParser_parse(parser.get(), nullptr, 0); return {state.result, s}; } std::pair runBatch(std::string copy) { SerializeState state; auto c = serializeCallbacks(); - parser3::Parser3 parser(&c, &state); - auto s = parser.parse(copy.data(), copy.size()); + std::unique_ptr parser{ + WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; + auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (s != WeaselJson_AGAIN) { return {state.result, s}; } - s = parser.parse(nullptr, 0); + s = WeaselJsonParser_parse(parser.get(), nullptr, 0); return {state.result, s}; } @@ -73,10 +74,12 @@ void compareWithSimdjson(std::string const &json) { { auto copy = json; auto c = noopCallbacks(); - parser3::Parser3 parser3(&c, nullptr); - ours = parser3.parse(copy.data(), copy.size()); + std::unique_ptr + parser{WeaselJsonParser_create(1024, &c, nullptr), + WeaselJsonParser_destroy}; + ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (ours == WeaselJson_AGAIN) { - ours = parser3.parse(nullptr, 0); + ours = WeaselJsonParser_parse(parser.get(), nullptr, 0); } } diff --git a/src/json_value.h b/src/json_value.h index b8dfc54..c71dfa6 100644 --- a/src/json_value.h +++ b/src/json_value.h @@ -9,7 +9,6 @@ #include #include -#include "parser3.h" #include "weaseljson.h" struct JsonNumber : std::string {}; @@ -195,21 +194,23 @@ inline std::string toString(JsonValue const &jsonValue) { inline std::optional toValue(std::string copy, int stride) { ReadValueState state; auto c = readValueCallbacks(); - parser3::Parser3 parser(&c, &state); + std::unique_ptr parser{ + WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; if (stride == 0) { - if (parser.parse(copy.data(), copy.size()) != WeaselJson_AGAIN) { + if (WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()) != + WeaselJson_AGAIN) { return std::nullopt; } } else { for (int i = 0; i < copy.size(); i += stride) { - if (parser.parse(copy.data() + i, - std::min(stride, copy.size() - i)) != + if (WeaselJsonParser_parse(parser.get(), copy.data() + i, + std::min(stride, copy.size() - i)) != WeaselJson_AGAIN) { return std::nullopt; } } } - if (parser.parse(nullptr, 0) != WeaselJson_OK) { + if (WeaselJsonParser_parse(parser.get(), nullptr, 0) != WeaselJson_OK) { return std::nullopt; } return std::move(state.result); diff --git a/src/lib.cpp b/src/lib.cpp index 6672b2a..825472c 100644 --- a/src/lib.cpp +++ b/src/lib.cpp @@ -1,25 +1,33 @@ #include "parser3.h" #include "weaseljson.h" +using namespace parser3; + extern "C" { -/** Create a parser. Increasing stack size increases memory usage but also - * increases the depth of nested json accepted. `callbacks` and `data` must - * outlive the returned parser. */ __attribute__((visibility("default"))) WeaselJsonParser * WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks, - void *data) {} - -/** Restore the parser to its newly-created state */ -__attribute__((visibility("default"))) void -WeaselJsonParser_reset(WeaselJsonParser *parser) {} - -/** Destroy the parser */ -__attribute__((visibility("default"))) void -WeaselJsonParser_destroy(WeaselJsonParser *parser) {} - -/** Incrementally parse `len` more bytes starting at `buf`. `buf` may be - * modified. Call with `len` 0 to indicate end of data */ -__attribute__((visibility("default"))) WeaselJsonStatus -WeaselJsonParser_parse(WeaselJsonParser *parser, char *buf, int len) {} + void *data) { + auto *buf = malloc(sizeof(Parser3) + stackSize); + if (buf == nullptr) { + return nullptr; + } + return (WeaselJsonParser *)new (buf) Parser3{callbacks, data, stackSize}; +} + +__attribute__((visibility("default"))) void +WeaselJsonParser_reset(WeaselJsonParser *parser) { + ((Parser3 *)parser)->reset(); +} + +__attribute__((visibility("default"))) void +WeaselJsonParser_destroy(WeaselJsonParser *parser) { + ((Parser3 *)parser)->~Parser3(); + free(parser); +} + +__attribute__((visibility("default"))) WeaselJsonStatus +WeaselJsonParser_parse(WeaselJsonParser *parser, char *buf, int len) { + return ((Parser3 *)parser)->parse(buf, len); +} } diff --git a/src/parser3.h b/src/parser3.h index aa06ab4..5e564a2 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -69,9 +68,9 @@ enum Symbol : uint8_t { N_SYMBOL_COUNT, // Must be last }; struct Parser3 { - Parser3(const WeaselJsonCallbacks *callbacks, void *data) - : callbacks(callbacks), data(data) { - std::ignore = push({N_VALUE, N_WHITESPACE, T_EOF}); + Parser3(const WeaselJsonCallbacks *callbacks, void *data, int stackSize) + : callbacks(callbacks), data(data), stackSize(stackSize) { + reset(); } [[nodiscard]] WeaselJsonStatus parse(char *buf, int len) { @@ -96,13 +95,13 @@ struct Parser3 { dataBegin = writeBuf; } - [[nodiscard]] bool empty() const { return stackPtr == stack; } + [[nodiscard]] bool empty() const { return stackPtr == stack(); } void pop() { assert(!empty()); --stackPtr; } [[nodiscard]] WeaselJsonStatus push(std::initializer_list symbols) { - if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] { + if (stackPtr >= stack() + stackSize - symbols.size()) [[unlikely]] { return WeaselJson_OVERFLOW; } for (int i = symbols.size() - 1; i >= 0; --i) { @@ -124,23 +123,32 @@ struct Parser3 { constexpr static int kMaxStackSize = 1024; - [[maybe_unused]] void debugPrint(); + [[maybe_unused]] void debugPrint() const; + + Symbol *stack() const { return (Symbol *)(this + 1); } + + void reset() { + stackPtr = stack(); + complete = false; + std::ignore = push({N_VALUE, N_WHITESPACE, T_EOF}); + } + // Pointer to the next byte in the input to consume - char *buf = nullptr; + char *buf; // Pointer past the end of the last byte available to consume - char *bufEnd = nullptr; + char *bufEnd; // Used for flushing pending data with on_*_data callbacks char *dataBegin; // Used for unescaping string data in place char *writeBuf; const WeaselJsonCallbacks *const callbacks; void *const data; - Symbol stack[kMaxStackSize]; - Symbol *stackPtr = stack; - bool complete = false; + Symbol *stackPtr; + bool complete; uint32_t utf8Codepoint; uint32_t utf16Surrogate; uint32_t minCodepoint; + int stackSize; }; inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self) { @@ -1134,9 +1142,9 @@ inline PRESERVE_NONE WeaselJsonStatus Parser3::keepGoing(Parser3 *self) { MUSTTAIL return symbolTables.continuations[self->top()](self); } -inline void Parser3::debugPrint() { - for (int i = 0; i < stackPtr - stack; ++i) { - printf("%s ", symbolTables.symbolNames[stack[i]]); +inline void Parser3::debugPrint() const { + for (int i = 0; i < stackPtr - stack(); ++i) { + printf("%s ", symbolTables.symbolNames[stack()[i]]); } printf("\n"); for (int i = 0; i < len(); ++i) { diff --git a/src/test.cpp b/src/test.cpp index a56a29d..9073050 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -11,7 +11,6 @@ #include #include "callbacks.h" -#include "parser3.h" #include "weaseljson.h" // This is the JSON grammar in McKeeman Form. @@ -153,17 +152,21 @@ void testStreaming(std::string const &json) { auto c = serializeCallbacks(); { auto copy = json; - parser3::Parser3 parser(&c, &streaming); + auto *parser = WeaselJsonParser_create(1024, &c, &streaming); for (int i = 0; i < copy.size(); ++i) { - REQUIRE(parser.parse(copy.data() + i, 1) == WeaselJson_AGAIN); + REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == + WeaselJson_AGAIN); } - CHECK(parser.parse(nullptr, 0) == WeaselJson_OK); + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); + WeaselJsonParser_destroy(parser); } { auto copy = json; - parser3::Parser3 parser(&c, &batch); - REQUIRE(parser.parse(copy.data(), copy.size()) == WeaselJson_AGAIN); - CHECK(parser.parse(nullptr, 0) == WeaselJson_OK); + auto *parser = WeaselJsonParser_create(1024, &c, &batch); + REQUIRE(WeaselJsonParser_parse(parser, copy.data(), copy.size()) == + WeaselJson_AGAIN); + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); + WeaselJsonParser_destroy(parser); } CHECK(streaming.result == batch.result); } @@ -175,35 +178,46 @@ TEST_CASE("parser3") { SerializeState state; { auto copy = json; - parser3::Parser3 parser(&c, &state); - int i = 0; - for (; i < copy.length() - 1; ++i) { - REQUIRE(parser.parse(copy.data() + i, 1) == WeaselJson_AGAIN); + auto *parser = WeaselJsonParser_create(1024, &c, &state); + for (int i = 0; i < copy.size(); ++i) { + REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == + WeaselJson_AGAIN); } - CHECK(parser.parse(copy.data() + i, 1) == WeaselJson_AGAIN); - CHECK(parser.parse(nullptr, 0) == WeaselJson_OK); - puts(""); + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); + WeaselJsonParser_destroy(parser); } { std::string copy = "{\"x\": [], \"y\": {}}"; - parser3::Parser3 parser(&c, &state); - CHECK(parser.parse(copy.data(), copy.length()) == WeaselJson_AGAIN); - CHECK(parser.parse(nullptr, 0) == WeaselJson_OK); + auto *parser = WeaselJsonParser_create(1024, &c, &state); + for (int i = 0; i < copy.size(); ++i) { + REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == + WeaselJson_AGAIN); + } + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); + WeaselJsonParser_destroy(parser); puts(""); } { auto c = noopCallbacks(); std::string copy = "{\"a\":\"a"; - parser3::Parser3 parser(&c, &state); - CHECK(parser.parse(copy.data(), copy.length()) == WeaselJson_AGAIN); - CHECK(parser.parse(nullptr, 0) == WeaselJson_REJECT); + auto *parser = WeaselJsonParser_create(1024, &c, &state); + for (int i = 0; i < copy.size(); ++i) { + REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == + WeaselJson_AGAIN); + } + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_REJECT); + WeaselJsonParser_destroy(parser); } { auto c = noopCallbacks(); std::string copy = "["; - parser3::Parser3 parser(&c, &state); - CHECK(parser.parse(copy.data(), copy.length()) == WeaselJson_AGAIN); - CHECK(parser.parse(nullptr, 0) == WeaselJson_REJECT); + auto *parser = WeaselJsonParser_create(1024, &c, &state); + for (int i = 0; i < copy.size(); ++i) { + REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == + WeaselJson_AGAIN); + } + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_REJECT); + WeaselJsonParser_destroy(parser); } } @@ -220,15 +234,16 @@ void doTestUnescapingUtf8(std::string const &escaped, auto &s = *(std::string *)p; s.append(buf, len); }; - parser3::Parser3 parser(&c, &result); + auto *parser = WeaselJsonParser_create(1024, &c, &result); auto copy = escaped; for (int i = 0; i < copy.size(); i += stride) { CAPTURE(i); - CHECK( - parser.parse(copy.data() + i, std::min(stride, copy.size() - i)) == - WeaselJson_AGAIN); + REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, + std::min(stride, copy.size() - i)) == + WeaselJson_AGAIN); } - CHECK(parser.parse(nullptr, 0) == WeaselJson_OK); + REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); + WeaselJsonParser_destroy(parser); CHECK(result.size() == expected.size()); CHECK(result == expected); } @@ -266,22 +281,24 @@ TEST_CASE("bench3") { ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); + auto *parser = WeaselJsonParser_create(1024, &c, nullptr); for (int stride = 1; stride <= json.size(); stride *= 2) { bench.run("parser3 (stride: " + std::to_string(stride) + ")", [&]() { auto copy = json; - parser3::Parser3 parser(&c, nullptr); + WeaselJsonParser_reset(parser); for (int i = 0; i < copy.size(); i += stride) { - if (parser.parse(copy.data() + i, - std::min(copy.size() - i, stride)) != + if (WeaselJsonParser_parse(parser, copy.data() + i, + std::min(copy.size() - i, stride)) != WeaselJson_AGAIN) { abort(); } } - if (parser.parse(nullptr, 0) != WeaselJson_OK) { + if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) { abort(); } }); } + WeaselJsonParser_destroy(parser); } TEST_CASE("bench4") { diff --git a/src/validate.cpp b/src/validate.cpp index 1429ffb..42425da 100644 --- a/src/validate.cpp +++ b/src/validate.cpp @@ -1,8 +1,8 @@ #include +#include #include #include "callbacks.h" -#include "parser3.h" #include "weaseljson.h" int main(int argc, char **argv) { @@ -16,7 +16,8 @@ int main(int argc, char **argv) { return 1; } auto c = noopCallbacks(); - parser3::Parser3 parser(&c, nullptr); + std::unique_ptr parser{ + WeaselJsonParser_create(1024, &c, nullptr), WeaselJsonParser_destroy}; for (;;) { char buf[1024]; int l = read(fd, buf, sizeof(buf)); @@ -24,7 +25,7 @@ int main(int argc, char **argv) { perror("read"); return 1; } - switch (parser.parse(buf, l)) { + switch (WeaselJsonParser_parse(parser.get(), buf, l)) { case WeaselJson_OK: return 0; case WeaselJson_AGAIN: