// Resumable DFA matching the bytes that may appear unescaped inside a JSON
// string: well-formed UTF-8 (no overlong encodings, no surrogates, nothing
// above U+10FFFF) in which every byte is >= 0x20 and is neither '"' nor '\\'.
//
// Encoding: a state is a bit offset (a multiple of 6) into a 64-bit table row.
// num_dfa_table[byte] holds, at bit offset `s`, the 6-bit state entered from
// state `s` on `byte`, so one step is `(row >> s) & 63`. A result of 0 means
// the byte is not allowed in that state. State 48 is both the start state and
// the only accepting state; the other states record what is still owed:
//    6  after 0xF1..0xF3  (next: 0x80..0xBF, then two more continuations)
//   12  after 0xF0        (next: 0x90..0xBF, then two more continuations)
//   18  after 0xF4        (next: 0x80..0x8F, then two more continuations)
//   24  one continuation byte owed
//   30  two continuation bytes owed
//   36  after 0xED        (next: 0x80..0x9F, then one more continuation)
//   42  after 0xE0        (next: 0xA0..0xBF, then one more continuation)
struct Utf8Dfa {
private:
  // The twelve distinct table rows. The trailing comment lists the non-zero
  // transitions packed into each value as from->to.
  constexpr static uint64_t kReject = 0x0ull; // byte never allowed
  constexpr static uint64_t kAscii = 0x30000000000000ull;  // 48->48
  constexpr static uint64_t kLead2 = 0x18000000000000ull;  // 48->24
  constexpr static uint64_t kLeadE0 = 0x2a000000000000ull; // 48->42
  constexpr static uint64_t kLead3 = 0x1e000000000000ull;  // 48->30
  constexpr static uint64_t kLeadED = 0x24000000000000ull; // 48->36
  constexpr static uint64_t kLeadF0 = 0xc000000000000ull;  // 48->12
  constexpr static uint64_t kLead4 = 0x6000000000000ull;   // 48->6
  constexpr static uint64_t kLeadF4 = 0x12000000000000ull; // 48->18
  // 0x80..0x8F: 6->30 18->30 24->48 30->24 36->24
  constexpr static uint64_t kCont80 = 0x18630780780ull;
  // 0x90..0x9F: 6->30 12->30 24->48 30->24 36->24
  constexpr static uint64_t kCont90 = 0x1863001e780ull;
  // 0xA0..0xBF: 6->30 12->30 24->48 30->24 42->24
  constexpr static uint64_t kContA0 = 0x60063001e780ull;

public:
  // Transition rows indexed by input byte, eight bytes per source line.
  constexpr static uint64_t num_dfa_table[256] = {
      // 0x00..0x1F: control characters
      kReject, kReject, kReject, kReject, kReject, kReject, kReject, kReject,
      kReject, kReject, kReject, kReject, kReject, kReject, kReject, kReject,
      kReject, kReject, kReject, kReject, kReject, kReject, kReject, kReject,
      kReject, kReject, kReject, kReject, kReject, kReject, kReject, kReject,
      // 0x20..0x7F: ASCII, except '"' (0x22) and '\\' (0x5C)
      kAscii, kAscii, kReject, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kReject, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      // 0x80..0x8F
      kCont80, kCont80, kCont80, kCont80, kCont80, kCont80, kCont80, kCont80,
      kCont80, kCont80, kCont80, kCont80, kCont80, kCont80, kCont80, kCont80,
      // 0x90..0x9F
      kCont90, kCont90, kCont90, kCont90, kCont90, kCont90, kCont90, kCont90,
      kCont90, kCont90, kCont90, kCont90, kCont90, kCont90, kCont90, kCont90,
      // 0xA0..0xBF
      kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0,
      kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0,
      kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0,
      kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0, kContA0,
      // 0xC0..0xDF: 0xC0/0xC1 could only start overlong forms
      kReject, kReject, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      // 0xE0..0xEF: three-byte leads
      kLeadE0, kLead3, kLead3, kLead3, kLead3, kLead3, kLead3, kLead3,
      kLead3, kLead3, kLead3, kLead3, kLead3, kLeadED, kLead3, kLead3,
      // 0xF0..0xFF: four-byte leads, then bytes that never occur in UTF-8
      kLeadF0, kLead4, kLead4, kLead4, kLeadF4, kReject, kReject, kReject,
      kReject, kReject, kReject, kReject, kReject, kReject, kReject, kReject,
  };

  // Restore this dfa to its start state.
  void reset() { state = kStart; }

  // Return true if this dfa is in an accept state, i.e. not in the middle of
  // a multi-byte sequence. You probably want to call scan until the match
  // ends first.
  bool accept() const { return (state & kStateMask) == kStart; }

  // Feeds [buf, bufEnd) to the dfa, continuing from the state left by the
  // previous call (or from the start state after construction / reset()).
  // Returns a pointer to the first byte which does not match, or bufEnd if
  // every byte matched. Leaves the dfa in the last state of the match, so a
  // sequence split across buffers can be continued by a later call.
#ifdef __x86_64__
  __attribute__((target_clones("default", "bmi2")))
#endif
  const char *
  scan(const char *buf, const char *bufEnd) {
    constexpr int kStride = 16;
    // Invariant: `cur` (and `state`) is always one of the valid bit offsets.
    uint64_t cur = state;
    for (;;) {
      if (bufEnd - buf < kStride) [[unlikely]] {
        // Fewer than kStride bytes left: step one byte at a time.
        for (; buf != bufEnd; ++buf) {
          const uint64_t next =
              (num_dfa_table[uint8_t(*buf)] >> cur) & kStateMask;
          if (next == kNoTransition) {
            break;
          }
          cur = next;
        }
        state = cur;
        return buf;
      }
      // Full chunk. Each slot keeps the low 8 bits of the shifted row: bits
      // 0..5 are the state, bits 6..7 are stray bits of the neighbouring
      // field and are masked off wherever the slot is read.
      // NOTE(review): presumably this keeps the serial dependency down to
      // load + shift (the bmi2 clone suggests shrx) -- confirm with the
      // benchmark before restructuring.
      uint8_t trail[kStride + 1];
      trail[0] = uint8_t(cur);
      for (int i = 0; i < kStride; ++i, ++buf) {
        const uint64_t row = num_dfa_table[uint8_t(*buf)];
        trail[i + 1] = uint8_t(row >> (trail[i] & kStateMask));
        if ((trail[i + 1] & kStateMask) == kNoTransition) {
          // Store a normalised state, matching what the tail path stores.
          state = trail[i] & kStateMask;
          return buf;
        }
      }
      cur = trail[kStride] & kStateMask;
    }
  }

private:
  constexpr static uint64_t kStart = 48;       // start and accept state
  constexpr static uint64_t kStateMask = 63;   // low six bits hold a state
  constexpr static uint64_t kNoTransition = 0; // "byte not allowed here"

  // Current state; always one of 6, 12, ..., 48.
  uint64_t state = kStart;
};
"111111111111111111111111111111111111111111111111111111111111111111111111" + "111111111111111111111111111111111111111111111111111111111111111111111111" + "111111111111111111111111111111111111111111111111111111111111111111111111" + "111111111111111111111111111111111111111111111111111111111111111111111111" + "111111111111111111111111111111111111111111111111111111111111111111111111" + "11111111"; auto *buf = dfa.scan(match.data(), match.data() + match.size()); CHECK(buf == match.data() + match.size()); CHECK(dfa.accept()); @@ -325,3 +332,23 @@ TEST_CASE("num dfa") { dfa.scan(match.data(), match.data() + match.size())); }); } + +TEST_CASE("utf8 dfa") { + parser3::Utf8Dfa dfa; + std::string match = + "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩" + "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩" + "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"; + auto *buf = dfa.scan(match.data(), match.data() + match.size()); + CHECK(buf == match.data() + match.size()); + CHECK(dfa.accept()); + + ankerl::nanobench::Bench bench; + bench.batch(match.size()); + bench.unit("byte"); + bench.run("utf8 dfa", [&]() { + dfa.reset(); + bench.doNotOptimizeAway( + dfa.scan(match.data(), match.data() + match.size())); + }); +}