diff --git a/src/fuzz.cpp b/src/fuzz.cpp index 8149164..af3afdf 100644 --- a/src/fuzz.cpp +++ b/src/fuzz.cpp @@ -179,7 +179,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { json_utf8 = json_utf8 && c >= 0x20 && c != '"' && c != '\\'; } if (json_utf8) { - parser3::Utf8Dfa dfa; + Utf8Dfa dfa; auto result = dfa.scan((const char *)data, (const char *)data + size); bool ok = result == (const char *)data + size && dfa.accept(); bool valid = simdjson::validate_utf8(s.data(), s.size()); diff --git a/src/parser3.h b/src/parser3.h index 412b4ce..5ba294a 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -16,632 +16,6 @@ namespace parser3 { -// See https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725 for -// an explanation of this cycle/byte dfa implementation. -// -// Recognizes json number syntax. As a regex: -// -?([0-9]|[1-9][0-9]*)(\.[0-9]+)?((e|E)(-|\+)?[0-9]+)? -struct NumDfa { - constexpr static uint64_t table[256] = { - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x36000ull, - 0x0ull, - 0x36600ull, - 0x12480000000000ull, - 0x0ull, - 0x780aa47b091ec00ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x780aa47aa91ea80ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0xc30c000000000ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 
0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0xc30c000000000ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - }; - // Restore this dfa to its start state - void reset() { state = 6; } - // Return true if this dfa is in an accept state. You probably want to call - // scan until the match ends first. 
- bool accept() const { - return (state & 63) == 30 || (state & 63) == 36 || (state & 63) == 48 || - (state & 63) == 42; - } - // clang-format off -#ifdef __x86_64__ - __attribute__((target_clones("default", "bmi2"))) -#endif - // Return value either points to the first byte which does not match, or bufEnd. - // Leaves the dfa in the last state of the match. - const char *scan(const char *buf, const char *bufEnd) { - // clang-format on - auto state_ = state; - for (;;) { - constexpr int kStride = 16; - if (bufEnd - buf < kStride) [[unlikely]] { - while (buf != bufEnd) { - uint64_t row = table[uint8_t(*buf)]; - auto prev = state_; - state_ = (row >> (state_ & 63)) & 63; - if (state_ == 0) { - state_ = prev; - break; - } - ++buf; - } - state = state_; - return buf; - } - uint8_t prev[kStride + 1]; - prev[0] = state_; - for (int i = 0; i < kStride; ++i) { - uint64_t row = table[uint8_t(*buf)]; - prev[i + 1] = row >> (prev[i] & 63); - if ((prev[i + 1] & 63) == 0) { - state = prev[i]; - return buf; - } - ++buf; - } - state_ = prev[kStride]; - } - } - -private: - uint64_t state = 6; -}; - -// Recognizes sequences of valid utf8 characters except 0-0x20, double quote, -// and backslash -struct Utf8Dfa { - constexpr static uint64_t table[256] = { - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x0ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 
0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x0ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x30000000000000ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, 
- 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x18630780780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x1863001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x60063001e780ull, - 0x0ull, - 0x0ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x18000000000000ull, - 0x2a000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 
0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0x24000000000000ull, - 0x1e000000000000ull, - 0x1e000000000000ull, - 0xc000000000000ull, - 0x6000000000000ull, - 0x6000000000000ull, - 0x6000000000000ull, - 0x12000000000000ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - 0x0ull, - }; - // Restore this dfa to its start state - void reset() { state = 48; } - // Return true if this dfa is in an accept state. You probably want to call - // scan until the match ends first. - bool accept() const { return (state & 63) == 48; } - // clang-format off -#ifdef __x86_64__ - __attribute__((target_clones("default", "bmi2"))) -#endif - // Return value either points to the first byte which does not match, or bufEnd. - // Leaves the dfa in the last state of the match. - const char *scan(const char *buf, const char *bufEnd) { - // clang-format on - auto state_ = state; - for (;;) { - constexpr int kStride = 16; - if (bufEnd - buf < kStride) [[unlikely]] { - while (buf != bufEnd) { - uint64_t row = table[uint8_t(*buf)]; - auto prev = state_; - state_ = (row >> (state_ & 63)) & 63; - if (state_ == 0) { - state_ = prev; - break; - } - ++buf; - } - state = state_; - return buf; - } - uint8_t prev[kStride + 1]; - prev[0] = state_; - for (int i = 0; i < kStride; ++i) { - uint64_t row = table[uint8_t(*buf)]; - prev[i + 1] = row >> (prev[i] & 63); - if ((prev[i + 1] & 63) == 0) { - state = prev[i]; - return buf; - } - ++buf; - } - state_ = prev[kStride]; - } - } - -private: - uint64_t state = 48; -}; - typedef PRESERVE_NONE WeaselJsonStatus (*Continuation)(struct Parser3 *, char *buf, char *bufEnd); diff --git a/src/tables.h b/src/tables.h index a5a996a..68b3e16 100644 --- a/src/tables.h +++ b/src/tables.h @@ -1,5 +1,7 @@ #pragma once +#include <cstdint> + constexpr inline struct Tables { constexpr Tables() { @@ -19,3 +21,629 @@ constexpr inline struct Tables { bool
whitespace[256]{}; char unescape[256]{}; } tables; + +// See https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725 for +// an explanation of this cycle/byte dfa implementation. +// +// Recognizes json number syntax. As a regex: +// -?([0-9]|[1-9][0-9]*)(\.[0-9]+)?((e|E)(-|\+)?[0-9]+)? +struct NumDfa { + constexpr static uint64_t table[256] = { + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x36000ull, + 0x0ull, + 0x36600ull, + 0x12480000000000ull, + 0x0ull, + 0x780aa47b091ec00ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x780aa47aa91ea80ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0xc30c000000000ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0xc30c000000000ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 
0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + }; + // Restore this dfa to its start state + void reset() { state = 6; } + // Return true if this dfa is in an accept state. You probably want to call + // scan until the match ends first. + bool accept() const { + return (state & 63) == 30 || (state & 63) == 36 || (state & 63) == 48 || + (state & 63) == 42; + } + // clang-format off +#ifdef __x86_64__ + __attribute__((target_clones("default", "bmi2"))) +#endif + // Return value either points to the first byte which does not match, or bufEnd. + // Leaves the dfa in the last state of the match. 
+ const char *scan(const char *buf, const char *bufEnd) { + // clang-format on + auto state_ = state; + for (;;) { + constexpr int kStride = 16; + if (bufEnd - buf < kStride) [[unlikely]] { + while (buf != bufEnd) { + uint64_t row = table[uint8_t(*buf)]; + auto prev = state_; + state_ = (row >> (state_ & 63)) & 63; + if (state_ == 0) { + state_ = prev; + break; + } + ++buf; + } + state = state_; + return buf; + } + uint8_t prev[kStride + 1]; + prev[0] = state_; + for (int i = 0; i < kStride; ++i) { + uint64_t row = table[uint8_t(*buf)]; + prev[i + 1] = row >> (prev[i] & 63); + if ((prev[i + 1] & 63) == 0) { + state = prev[i]; + return buf; + } + ++buf; + } + state_ = prev[kStride]; + } + } + +private: + uint64_t state = 6; +}; + +// Recognizes sequences of valid utf8 characters except 0-0x20, double quote, +// and backslash +struct Utf8Dfa { + constexpr static uint64_t table[256] = { + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x0ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 
0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x0ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x30000000000000ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x18630780780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 
0x1863001e780ull, + 0x1863001e780ull, + 0x1863001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x60063001e780ull, + 0x0ull, + 0x0ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x18000000000000ull, + 0x2a000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0x24000000000000ull, + 0x1e000000000000ull, + 0x1e000000000000ull, + 0xc000000000000ull, + 0x6000000000000ull, + 0x6000000000000ull, + 0x6000000000000ull, + 0x12000000000000ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, + 0x0ull, 
+ 0x0ull, + 0x0ull, + 0x0ull, + }; + // Restore this dfa to its start state + void reset() { state = 48; } + // Return true if this dfa is in an accept state. You probably want to call + // scan until the match ends first. + bool accept() const { return (state & 63) == 48; } + // clang-format off +#ifdef __x86_64__ + __attribute__((target_clones("default", "bmi2"))) +#endif + // Return value either points to the first byte which does not match, or bufEnd. + // Leaves the dfa in the last state of the match. + const char *scan(const char *buf, const char *bufEnd) { + // clang-format on + auto state_ = state; + for (;;) { + constexpr int kStride = 16; + if (bufEnd - buf < kStride) [[unlikely]] { + while (buf != bufEnd) { + uint64_t row = table[uint8_t(*buf)]; + auto prev = state_; + state_ = (row >> (state_ & 63)) & 63; + if (state_ == 0) { + state_ = prev; + break; + } + ++buf; + } + state = state_; + return buf; + } + uint8_t prev[kStride + 1]; + prev[0] = state_; + for (int i = 0; i < kStride; ++i) { + uint64_t row = table[uint8_t(*buf)]; + prev[i + 1] = row >> (prev[i] & 63); + if ((prev[i + 1] & 63) == 0) { + state = prev[i]; + return buf; + } + ++buf; + } + state_ = prev[kStride]; + } + } + +private: + uint64_t state = 48; +}; diff --git a/src/test.cpp b/src/test.cpp index de24667..9fb36cc 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -309,7 +309,7 @@ TEST_CASE("bench5") { } TEST_CASE("num dfa") { - parser3::NumDfa dfa; + NumDfa dfa; std::string match = "111111111111111111111111111111111111111111111111111111111111111111111111" "111111111111111111111111111111111111111111111111111111111111111111111111" @@ -339,7 +339,7 @@ const char *utf8str = "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"; TEST_CASE("utf8 dfa") { - parser3::Utf8Dfa dfa; + Utf8Dfa dfa; std::string match = utf8str; auto *buf = dfa.scan(match.data(), match.data() + match.size()); CHECK(buf == match.data() + match.size());