From 628fe466ef85c9225bf3fd3a151b4abe274e7f18 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Sat, 16 Aug 2025 22:18:33 -0400
Subject: [PATCH] Compare with rapidjson

---
 CMakeLists.txt                         |  10 +-
 benchmarks/bench_parser_comparison.cpp | 599 +++++++++++++++++++++++++
 2 files changed, 608 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0212ebe..91ad8f0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,13 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(nlohmann_json)
 
+FetchContent_Declare(
+  RapidJSON
+  GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
+  GIT_TAG ab1842a2dae061284c0a62dca1cc6d5e7e37e346 # v1.1.0
+)
+FetchContent_MakeAvailable(RapidJSON)
+
 set(SIMDUTF_TESTS
     OFF
     CACHE BOOL "Disable simdutf tests" FORCE)
@@ -105,7 +112,8 @@ add_executable(
   src/commit_request.cpp src/arena_allocator.cpp)
 target_link_libraries(bench_parser_comparison nanobench weaseljson test_data
                       nlohmann_json::nlohmann_json simdutf::simdutf)
-target_include_directories(bench_parser_comparison PRIVATE src)
+target_include_directories(bench_parser_comparison
+                           PRIVATE src ${rapidjson_SOURCE_DIR}/include)
 
 # Debug tools
 add_executable(debug_arena tools/debug_arena.cpp src/commit_request.cpp
diff --git a/benchmarks/bench_parser_comparison.cpp b/benchmarks/bench_parser_comparison.cpp
index f16050e..adca2fd 100644
--- a/benchmarks/bench_parser_comparison.cpp
+++ b/benchmarks/bench_parser_comparison.cpp
@@ -4,10 +4,381 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 
 using namespace weaseldb::test_data;
 
+// Arena-backed RapidJSON allocator adapter (not used by the benchmarks below)
+class RapidJsonArenaAllocator {
+public:
+  explicit RapidJsonArenaAllocator(ArenaAllocator *arena) : arena_(arena) {}
+
+  static const bool kNeedFree = false;
+
+  void *Malloc(size_t size) { return arena_->allocate(size); }
+
+  void *Realloc(void *originalPtr, size_t originalSize, size_t newSize) {
+    // A block is never grown in place: always take a fresh block of newSize
+    // and copy over min(originalSize, newSize) bytes; the old block is kept.
+    void *newPtr = arena_->allocate(newSize);
+    if (originalPtr && originalSize > 0) {
+      std::memcpy(newPtr, originalPtr, std::min(originalSize, newSize));
+    }
+    return newPtr;
+  }
+
+  static void Free(void *) {
+    // Intentionally a no-op: single blocks are never handed back to the arena
+  }
+
+private:
+  ArenaAllocator *arena_;
+};
+
+// RapidJSON SAX handler for commit requests; strings are copied into `arena`
+class CommitRequestArenaHandler {
+public:
+  struct Precondition {
+    enum class Type { PointRead, RangeRead };
+    Type type;
+    uint64_t version = 0;
+    std::string_view key, begin, end;
+  };
+
+  struct Operation {
+    enum class Type { Write, Delete, RangeDelete };
+    Type type;
+    std::string_view key, value, begin, end;
+  };
+
+  ArenaAllocator arena; // declared before the vectors whose allocators use it
+  bool valid = true;
+  std::string_view request_id, leader_id;
+  uint64_t read_version = 0;
+  std::vector> preconditions;
+  std::vector> operations;
+
+private:
+  enum class State {
+    Root,
+    PreconditionsArray,
+    PreconditionObject,
+    OperationsArray,
+    OperationObject
+  } state = State::Root;
+
+  std::string current_key; // last Key() seen; selects where a value is stored
+  Precondition current_precondition;
+  Operation current_operation;
+
+  // Copy [str, str + length) into the arena and return a view of that copy
+  std::string_view store_string(const char *str, size_t length) {
+    char *stored = arena.allocate(length);
+    std::memcpy(stored, str, length);
+    return std::string_view(stored, length);
+  }
+
+public:
+  explicit CommitRequestArenaHandler()
+      : preconditions(ArenaStlAllocator(&arena)),
+        operations(ArenaStlAllocator(&arena)) {}
+
+  bool Null() { return true; }
+  bool Bool(bool) { return true; }
+  bool Int(int i) { return Int64(i); }
+  bool Uint(unsigned u) { return Uint64(u); }
+  bool Int64(int64_t i) { return Uint64(static_cast(i)); }
+  bool Uint64(uint64_t u) {
+    if (state == State::Root) {
+      if (current_key == "read_version") {
+        read_version = u;
+      }
+    } else if (state == State::PreconditionObject) {
+      if (current_key == "version") {
+        current_precondition.version = u;
+      }
+    }
+    return true;
+  }
+  bool Double(double) { return true; }
+  bool RawNumber(const char *str, rapidjson::SizeType, bool) {
+    return Uint64(std::strtoull(str, nullptr, 10)); // NOTE(review): length unused
+  }
+
+  bool String(const char *str, rapidjson::SizeType length, bool) {
+    std::string_view value = store_string(str, length);
+
+    if (state == State::Root) {
+      if (current_key == "request_id") {
+        request_id = value;
+      } else if (current_key == "leader_id") {
+        leader_id = value;
+      }
+    } else if (state == State::PreconditionObject) {
+      if (current_key == "type") {
+        if (value == "point_read") {
+          current_precondition.type = Precondition::Type::PointRead;
+        } else if (value == "range_read") {
+          current_precondition.type = Precondition::Type::RangeRead;
+        }
+      } else if (current_key == "key") {
+        current_precondition.key = value;
+      } else if (current_key == "begin") {
+        current_precondition.begin = value;
+      } else if (current_key == "end") {
+        current_precondition.end = value;
+      }
+    } else if (state == State::OperationObject) {
+      if (current_key == "type") {
+        if (value == "write") {
+          current_operation.type = Operation::Type::Write;
+        } else if (value == "delete") {
+          current_operation.type = Operation::Type::Delete;
+        } else if (value == "range_delete") {
+          current_operation.type = Operation::Type::RangeDelete;
+        }
+      } else if (current_key == "key") {
+        current_operation.key = value;
+      } else if (current_key == "value") {
+        current_operation.value = value;
+      } else if (current_key == "begin") {
+        current_operation.begin = value;
+      } else if (current_key == "end") {
+        current_operation.end = value;
+      }
+    }
+    return true;
+  }
+
+  bool StartObject() {
+    if (state == State::PreconditionsArray) {
+      state = State::PreconditionObject;
+      current_precondition = {};
+    } else if (state == State::OperationsArray) {
+      state = State::OperationObject;
+      current_operation = {};
+    }
+    return true;
+  }
+
+  bool Key(const char *str, rapidjson::SizeType length, bool) {
+    current_key.assign(str, length);
+    return true;
+  }
+
+  bool EndObject(rapidjson::SizeType) {
+    if (state == State::PreconditionObject) {
+      preconditions.push_back(current_precondition);
+      state = State::PreconditionsArray;
+    } else if (state == State::OperationObject) {
+      operations.push_back(current_operation);
+      state = State::OperationsArray;
+    }
+    return true;
+  }
+
+  bool StartArray() {
+    if (current_key == "preconditions") {
+      state = State::PreconditionsArray;
+    } else if (current_key == "operations") {
+      state = State::OperationsArray;
+    }
+    return true;
+  }
+
+  bool EndArray(rapidjson::SizeType) {
+    if (state == State::PreconditionsArray || state == State::OperationsArray) {
+      state = State::Root;
+    }
+    return true;
+  }
+
+  bool validate() const { return !leader_id.empty() && read_version > 0; }
+
+  void reset() { // clear() keeps capacity, so swap the vectors with empty ones
+    valid = true;
+    request_id = {};
+    leader_id = {};
+    read_version = 0;
+    decltype(preconditions)(preconditions.get_allocator()).swap(preconditions);
+    decltype(operations)(operations.get_allocator()).swap(operations);
+    arena.reset(); // last: presumably invalidates every block handed out so far
+    state = State::Root;
+    current_key.clear();
+    current_precondition = {};
+    current_operation = {};
+  }
+
+  size_t total_allocated() const { return arena.total_allocated(); }
+  size_t used_bytes() const { return arena.used_bytes(); }
+};
+
+// RapidJSON SAX handler for commit requests; every string field is an owning copy
+class CommitRequestSaxHandler {
+public:
+  struct Precondition {
+    enum class Type { PointRead, RangeRead };
+    Type type;
+    uint64_t version = 0;
+    std::string key, begin, end;
+  };
+
+  struct Operation {
+    enum class Type { Write, Delete, RangeDelete };
+    Type type;
+    std::string key, value, begin, end;
+  };
+
+  bool valid = true;
+  std::string request_id, leader_id;
+  uint64_t read_version = 0;
+  std::vector preconditions;
+  std::vector operations;
+
+private:
+  enum class State {
+    Root,
+    PreconditionsArray,
+    PreconditionObject,
+    OperationsArray,
+    OperationObject
+  } state = State::Root;
+
+  std::string current_key; // last Key() seen; selects where a value is stored
+  Precondition current_precondition;
+  Operation current_operation;
+
+public:
+  bool Null() { return true; }
+  bool Bool(bool) { return true; }
+  bool Int(int i) { return Int64(i); }
+  bool Uint(unsigned u) { return Uint64(u); }
+  bool Int64(int64_t i) { return Uint64(static_cast(i)); }
+  bool Uint64(uint64_t u) {
+    if (state == State::Root) {
+      if (current_key == "read_version") {
+        read_version = u;
+      }
+    } else if (state == State::PreconditionObject) {
+      if (current_key == "version") {
+        current_precondition.version = u;
+      }
+    }
+    return true;
+  }
+  bool Double(double) { return true; }
+  bool RawNumber(const char *str, rapidjson::SizeType, bool) {
+    return Uint64(std::strtoull(str, nullptr, 10)); // NOTE(review): length unused
+  }
+
+  bool String(const char *str, rapidjson::SizeType length, bool) {
+    std::string value(str, length);
+
+    if (state == State::Root) {
+      if (current_key == "request_id") {
+        request_id = value;
+      } else if (current_key == "leader_id") {
+        leader_id = value;
+      }
+    } else if (state == State::PreconditionObject) {
+      if (current_key == "type") {
+        if (value == "point_read") {
+          current_precondition.type = Precondition::Type::PointRead;
+        } else if (value == "range_read") {
+          current_precondition.type = Precondition::Type::RangeRead;
+        }
+      } else if (current_key == "key") {
+        current_precondition.key = value;
+      } else if (current_key == "begin") {
+        current_precondition.begin = value;
+      } else if (current_key == "end") {
+        current_precondition.end = value;
+      }
+    } else if (state == State::OperationObject) {
+      if (current_key == "type") {
+        if (value == "write") {
+          current_operation.type = Operation::Type::Write;
+        } else if (value == "delete") {
+          current_operation.type = Operation::Type::Delete;
+        } else if (value == "range_delete") {
+          current_operation.type = Operation::Type::RangeDelete;
+        }
+      } else if (current_key == "key") {
+        current_operation.key = value;
+      } else if (current_key == "value") {
+        current_operation.value = value;
+      } else if (current_key == "begin") {
+        current_operation.begin = value;
+      } else if (current_key == "end") {
+        current_operation.end = value;
+      }
+    }
+    return true;
+  }
+
+  bool StartObject() {
+    if (state == State::PreconditionsArray) {
+      state = State::PreconditionObject;
+      current_precondition = {};
+    } else if (state == State::OperationsArray) {
+      state = State::OperationObject;
+      current_operation = {};
+    }
+    return true;
+  }
+
+  bool Key(const char *str, rapidjson::SizeType length, bool) {
+    current_key.assign(str, length);
+    return true;
+  }
+
+  bool EndObject(rapidjson::SizeType) {
+    if (state == State::PreconditionObject) {
+      preconditions.push_back(current_precondition);
+      state = State::PreconditionsArray;
+    } else if (state == State::OperationObject) {
+      operations.push_back(current_operation);
+      state = State::OperationsArray;
+    }
+    return true;
+  }
+
+  bool StartArray() {
+    if (current_key == "preconditions") {
+      state = State::PreconditionsArray;
+    } else if (current_key == "operations") {
+      state = State::OperationsArray;
+    }
+    return true;
+  }
+
+  bool EndArray(rapidjson::SizeType) {
+    if (state == State::PreconditionsArray || state == State::OperationsArray) {
+      state = State::Root;
+    }
+    return true;
+  }
+
+  bool validate() const { return !leader_id.empty() && read_version > 0; }
+
+  void reset() {
+    valid = true;
+    request_id.clear();
+    leader_id.clear();
+    read_version = 0;
+    preconditions.clear();
+    operations.clear();
+    state = State::Root;
+    current_key.clear();
+    current_precondition = {};
+    current_operation = {};
+  }
+};
+
 // JSON test data is now provided by test_data.hpp
 
 // Helper function to simulate validation work on nlohmann json object
@@ -123,6 +494,48 @@ int main() {
     }
   });
 
+  simple_bench.run("RapidJSON SAX + validation", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(SIMPLE_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  simple_bench.run("RapidJSON SAX (parse only)", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(SIMPLE_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  simple_bench.run("RapidJSON SAX Arena + validation", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(SIMPLE_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  simple_bench.run("RapidJSON SAX Arena (parse only)", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(SIMPLE_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
   // Medium complexity JSON comparison
   auto medium_bench = ankerl::nanobench::Bench()
                           .title("Medium JSON Parsing Comparison")
@@ -158,6 +571,48 @@ int main() {
     }
   });
 
+  medium_bench.run("RapidJSON SAX + validation", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(MEDIUM_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  medium_bench.run("RapidJSON SAX (parse only)", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(MEDIUM_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  medium_bench.run("RapidJSON SAX Arena + validation", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(MEDIUM_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  medium_bench.run("RapidJSON SAX Arena (parse only)", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(MEDIUM_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
   // Complex JSON comparison
   auto complex_bench = ankerl::nanobench::Bench()
                            .title("Complex JSON Parsing Comparison")
@@ -193,6 +648,48 @@ int main() {
     }
   });
 
+  complex_bench.run("RapidJSON SAX + validation", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(COMPLEX_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  complex_bench.run("RapidJSON SAX (parse only)", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(COMPLEX_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  complex_bench.run("RapidJSON SAX Arena + validation", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(COMPLEX_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  complex_bench.run("RapidJSON SAX Arena (parse only)", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(COMPLEX_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
   // Large batch operations comparison
   auto large_bench = ankerl::nanobench::Bench()
                          .title("Large JSON Parsing Comparison")
@@ -232,6 +729,50 @@ int main() {
         ankerl::nanobench::doNotOptimizeAway(false);
       }
     });
+
+    large_bench.run("RapidJSON SAX + validation (" + bench_name + ")", [&] {
+      CommitRequestSaxHandler handler;
+      rapidjson::Reader reader;
+      rapidjson::StringStream ss(large_json.c_str());
+      bool result = reader.Parse(ss, handler);
+      result = result && handler.validate();
+      ankerl::nanobench::doNotOptimizeAway(result);
+      ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+      ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+    });
+
+    large_bench.run("RapidJSON SAX (parse only) (" + bench_name + ")", [&] {
+      CommitRequestSaxHandler handler;
+      rapidjson::Reader reader;
+      rapidjson::StringStream ss(large_json.c_str());
+      bool result = reader.Parse(ss, handler);
+      ankerl::nanobench::doNotOptimizeAway(result);
+      ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+      ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+    });
+
+    large_bench.run(
+        "RapidJSON SAX Arena + validation (" + bench_name + ")", [&] {
+          CommitRequestArenaHandler handler;
+          rapidjson::Reader reader;
+          rapidjson::StringStream ss(large_json.c_str());
+          bool result = reader.Parse(ss, handler);
+          result = result && handler.validate();
+          ankerl::nanobench::doNotOptimizeAway(result);
+          ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+          ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+        });
+
+    large_bench.run(
+        "RapidJSON SAX Arena (parse only) (" + bench_name + ")", [&] {
+          CommitRequestArenaHandler handler;
+          rapidjson::Reader reader;
+          rapidjson::StringStream ss(large_json.c_str());
+          bool result = reader.Parse(ss, handler);
+          ankerl::nanobench::doNotOptimizeAway(result);
+          ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+          ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+        });
   }
 
   // Memory efficiency comparison
@@ -261,6 +802,28 @@ int main() {
     }
   });
 
+  memory_bench.run("RapidJSON SAX (standard allocation)", [&] {
+    CommitRequestSaxHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(COMPLEX_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.preconditions.size());
+    ankerl::nanobench::doNotOptimizeAway(handler.operations.size());
+  });
+
+  memory_bench.run("RapidJSON SAX Arena (arena allocation)", [&] {
+    CommitRequestArenaHandler handler;
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss(COMPLEX_JSON.c_str());
+    bool result = reader.Parse(ss, handler);
+    result = result && handler.validate();
+    ankerl::nanobench::doNotOptimizeAway(result);
+    ankerl::nanobench::doNotOptimizeAway(handler.total_allocated());
+    ankerl::nanobench::doNotOptimizeAway(handler.used_bytes());
+  });
+
   // Reset and reuse comparison
   auto reuse_bench = ankerl::nanobench::Bench()
                          .title("Reset and Reuse Comparison")
@@ -300,6 +863,42 @@ int main() {
     }
   });
 
+  reuse_bench.run("RapidJSON SAX (reset)", [&] {
+    static CommitRequestSaxHandler handler;
+
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss1(SIMPLE_JSON.c_str());
+    bool result1 = reader.Parse(ss1, handler);
+    result1 = result1 && handler.validate();
+
+    handler.reset();
+
+    rapidjson::StringStream ss2(MEDIUM_JSON.c_str());
+    bool result2 = reader.Parse(ss2, handler);
+    result2 = result2 && handler.validate();
+
+    ankerl::nanobench::doNotOptimizeAway(result1);
+    ankerl::nanobench::doNotOptimizeAway(result2);
+  });
+
+  reuse_bench.run("RapidJSON SAX Arena (reset)", [&] {
+    static CommitRequestArenaHandler handler;
+
+    rapidjson::Reader reader;
+    rapidjson::StringStream ss1(SIMPLE_JSON.c_str());
+    bool result1 = reader.Parse(ss1, handler);
+    result1 = result1 && handler.validate();
+
+    handler.reset();
+
+    rapidjson::StringStream ss2(MEDIUM_JSON.c_str());
+    bool result2 = reader.Parse(ss2, handler);
+    result2 = result2 && handler.validate();
+
+    ankerl::nanobench::doNotOptimizeAway(result1);
+    ankerl::nanobench::doNotOptimizeAway(result2);
+  });
+
   std::cout << "\nBenchmark completed. The WeaselDB parser is optimized for:\n";
   std::cout << "- Arena-based memory allocation for reduced fragmentation\n";
   std::cout << "- Streaming parsing for network protocols\n";