Add an automaton to recognize UTF-8 in strings

This commit is contained in:
2025-06-21 15:56:15 -04:00
parent d1523acf94
commit 5613303d52
3 changed files with 351 additions and 2 deletions

View File

@@ -1,5 +1,6 @@
#include "callbacks.h" #include "callbacks.h"
#include "json_value.h" #include "json_value.h"
#include "parser3.h"
#include "weaseljson.h" #include "weaseljson.h"
#include <simdjson.h> #include <simdjson.h>
@@ -132,5 +133,19 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
testStreaming(s); testStreaming(s);
compareWithSimdjson(s); compareWithSimdjson(s);
testStringRoundTrip(s); testStringRoundTrip(s);
bool json_utf8 = true;
for (int i = 0; i < int(size); ++i) {
uint8_t c = data[i];
json_utf8 = json_utf8 && c >= 0x20 && c != '"' && c != '\\';
}
if (json_utf8) {
parser3::Utf8Dfa dfa;
auto result = dfa.scan((const char *)data, (const char *)data + size);
bool ok = result == (const char *)data + size && dfa.accept();
bool valid = simdjson::validate_utf8(s.data(), s.size());
if (ok != valid) {
abort();
}
}
return 0; return 0;
} }

View File

@@ -326,6 +326,313 @@ private:
uint64_t state = 6; uint64_t state = 6;
}; };
// Shift-based DFA that accepts well-formed UTF-8 which contains no byte below
// 0x20, no '"' and no '\\'.
//
// A state is the bit offset (a multiple of 6) of its slot inside a 64-bit
// table row: for input byte b, bits [s, s + 6) of num_dfa_table[b] hold the
// state reached from state s. One step is therefore `(row >> state) & 63`.
// State 0 has no outgoing transitions and means "this byte does not match".
struct Utf8Dfa {
private:
  constexpr static uint64_t kMask = 63;

  // States (bit offsets into a row).
  constexpr static uint64_t kDead = 0;   // byte rejected
  constexpr static uint64_t kF1F3 = 6;   // consumed F1..F3; next is 80..BF
  constexpr static uint64_t kF0 = 12;    // consumed F0; next must be 90..BF
  constexpr static uint64_t kF4 = 18;    // consumed F4; next must be 80..8F
  constexpr static uint64_t kNeed1 = 24; // one continuation byte outstanding
  constexpr static uint64_t kNeed2 = 30; // two continuation bytes outstanding
  constexpr static uint64_t kED = 36;    // consumed ED; next must be 80..9F
  constexpr static uint64_t kE0 = 42;    // consumed E0; next must be A0..BF
  constexpr static uint64_t kStart = 48; // between code points; sole accept state

  // Distinct row values, written as `target << source` so each transition is
  // visible. Any slot not mentioned is 0, i.e. leads to kDead.
  constexpr static uint64_t kRej = kDead;
  constexpr static uint64_t kAscii = kStart << kStart;
  constexpr static uint64_t kLead2 = kNeed1 << kStart;
  constexpr static uint64_t kLeadE0 = kE0 << kStart;
  constexpr static uint64_t kLead3 = kNeed2 << kStart;
  constexpr static uint64_t kLeadED = kED << kStart;
  constexpr static uint64_t kLeadF0 = kF0 << kStart;
  constexpr static uint64_t kLead4 = kF1F3 << kStart;
  constexpr static uint64_t kLeadF4 = kF4 << kStart;
  // Transitions shared by every continuation byte 80..BF.
  constexpr static uint64_t kTailAny =
      (kNeed2 << kF1F3) | (kStart << kNeed1) | (kNeed1 << kNeed2);
  constexpr static uint64_t kTail8 =
      kTailAny | (kNeed2 << kF4) | (kNeed1 << kED); // 80..8F
  constexpr static uint64_t kTail9 =
      kTailAny | (kNeed2 << kF0) | (kNeed1 << kED); // 90..9F
  constexpr static uint64_t kTailAB =
      kTailAny | (kNeed2 << kF0) | (kNeed1 << kE0); // A0..BF

  // Pin the derived rows to the literal values the table was generated with.
  static_assert(kAscii == 0x30000000000000ull, "row mismatch");
  static_assert(kTail8 == 0x18630780780ull, "row mismatch");
  static_assert(kTail9 == 0x1863001e780ull, "row mismatch");
  static_assert(kTailAB == 0x60063001e780ull, "row mismatch");
  static_assert(kLead2 == 0x18000000000000ull, "row mismatch");
  static_assert(kLeadE0 == 0x2a000000000000ull, "row mismatch");
  static_assert(kLead3 == 0x1e000000000000ull, "row mismatch");
  static_assert(kLeadED == 0x24000000000000ull, "row mismatch");
  static_assert(kLeadF0 == 0xc000000000000ull, "row mismatch");
  static_assert(kLead4 == 0x6000000000000ull, "row mismatch");
  static_assert(kLeadF4 == 0x12000000000000ull, "row mismatch");

public:
  // One row per input byte, 16 bytes per group below. (The identifier is kept
  // as-is so existing references keep compiling.)
  constexpr static uint64_t num_dfa_table[256] = {
      // 0x00-0x0F
      kRej, kRej, kRej, kRej, kRej, kRej, kRej, kRej,
      kRej, kRej, kRej, kRej, kRej, kRej, kRej, kRej,
      // 0x10-0x1F
      kRej, kRej, kRej, kRej, kRej, kRej, kRej, kRej,
      kRej, kRej, kRej, kRej, kRej, kRej, kRej, kRej,
      // 0x20-0x2F (0x22 '"' is rejected)
      kAscii, kAscii, kRej, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      // 0x30-0x3F
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      // 0x40-0x4F
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      // 0x50-0x5F (0x5C '\\' is rejected)
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kRej, kAscii, kAscii, kAscii,
      // 0x60-0x6F
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      // 0x70-0x7F
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii, kAscii,
      // 0x80-0x8F
      kTail8, kTail8, kTail8, kTail8, kTail8, kTail8, kTail8, kTail8,
      kTail8, kTail8, kTail8, kTail8, kTail8, kTail8, kTail8, kTail8,
      // 0x90-0x9F
      kTail9, kTail9, kTail9, kTail9, kTail9, kTail9, kTail9, kTail9,
      kTail9, kTail9, kTail9, kTail9, kTail9, kTail9, kTail9, kTail9,
      // 0xA0-0xAF
      kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB,
      kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB,
      // 0xB0-0xBF
      kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB,
      kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB, kTailAB,
      // 0xC0-0xCF (C0 and C1 are rejected)
      kRej, kRej, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      // 0xD0-0xDF
      kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2, kLead2,
      // 0xE0-0xEF
      kLeadE0, kLead3, kLead3, kLead3, kLead3, kLead3, kLead3, kLead3,
      kLead3, kLead3, kLead3, kLead3, kLead3, kLeadED, kLead3, kLead3,
      // 0xF0-0xFF (F5 and above are rejected)
      kLeadF0, kLead4, kLead4, kLead4, kLeadF4, kRej, kRej, kRej,
      kRej, kRej, kRej, kRej, kRej, kRej, kRej, kRej,
  };

  // Spot checks that catch an entry being shifted to the wrong index.
  static_assert(num_dfa_table[0x22] == kRej, "table misaligned");
  static_assert(num_dfa_table[0x5C] == kRej, "table misaligned");
  static_assert(num_dfa_table[0x7F] == kAscii, "table misaligned");
  static_assert(num_dfa_table[0xC2] == kLead2, "table misaligned");
  static_assert(num_dfa_table[0xED] == kLeadED, "table misaligned");
  static_assert(num_dfa_table[0xF4] == kLeadF4, "table misaligned");

  // Put the dfa back into its start state.
  void reset() { state = kStart; }

  // True when the dfa sits in the accept state, i.e. every code point seen so
  // far is complete. Normally called once scan has consumed the whole match.
  bool accept() const { return (state & kMask) == kStart; }

  // Consumes bytes from [buf, bufEnd) for as long as they match, continuing
  // from the current state. Returns a pointer to the first byte that does not
  // match, or bufEnd when everything matched. The dfa is left in the state
  // reached just before the returned position.
  //
  // On x86-64 a "default" and a "bmi2" clone of this function are emitted.
#ifdef __x86_64__
  __attribute__((target_clones("default", "bmi2")))
#endif
  const char *
  scan(const char *buf, const char *bufEnd) {
    constexpr int kStride = 16;
    uint64_t cur = state;

    // Bulk phase: whole kStride-byte chunks. Each step keeps only the low
    // 8 bits of the shifted row, so bits 6-7 of an entry may carry bits of the
    // neighbouring slot; every consumer masks with kMask before use.
    for (;;) {
      if (bufEnd - buf < kStride) [[unlikely]] {
        break;
      }
      uint8_t trail[kStride + 1];
      trail[0] = uint8_t(cur);
      for (int k = 0; k != kStride; ++k, ++buf) {
        trail[k + 1] =
            uint8_t(num_dfa_table[uint8_t(*buf)] >> (trail[k] & kMask));
        if ((trail[k + 1] & kMask) == kDead) {
          // Keep the state from before the offending byte.
          state = trail[k];
          return buf;
        }
      }
      cur = trail[kStride];
    }

    // Tail phase: fewer than kStride bytes remain; step one byte at a time.
    for (; buf != bufEnd; ++buf) {
      const uint64_t next =
          (num_dfa_table[uint8_t(*buf)] >> (cur & kMask)) & kMask;
      if (next == kDead) {
        break;
      }
      cur = next;
    }
    state = cur;
    return buf;
  }

private:
  uint64_t state = kStart;
};
typedef PRESERVE_NONE WeaselJsonStatus (*Continuation)(struct Parser3 *, typedef PRESERVE_NONE WeaselJsonStatus (*Continuation)(struct Parser3 *,
char *buf, char *bufEnd); char *buf, char *bufEnd);

View File

@@ -310,8 +310,15 @@ TEST_CASE("bench5") {
TEST_CASE("num dfa") { TEST_CASE("num dfa") {
parser3::NumDfa dfa; parser3::NumDfa dfa;
std::string match = "-1231279127389127389127398127389712893791287389217327482" std::string match =
"374.0e69010101010101010101010101010101"; "111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"11111111";
auto *buf = dfa.scan(match.data(), match.data() + match.size()); auto *buf = dfa.scan(match.data(), match.data() + match.size());
CHECK(buf == match.data() + match.size()); CHECK(buf == match.data() + match.size());
CHECK(dfa.accept()); CHECK(dfa.accept());
@@ -325,3 +332,23 @@ TEST_CASE("num dfa") {
dfa.scan(match.data(), match.data() + match.size())); dfa.scan(match.data(), match.data() + match.size()));
}); });
} }
// Checks that Utf8Dfa consumes a long run of 4-byte code points completely and
// ends in an accept state, then benchmarks scanning that same input.
TEST_CASE("utf8 dfa") {
  std::string match =
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩";
  parser3::Utf8Dfa dfa;

  // Correctness: the whole buffer matches and no code point is left open.
  auto *buf = dfa.scan(match.data(), match.data() + match.size());
  CHECK(buf == match.data() + match.size());
  CHECK(dfa.accept());

  // Throughput, reported per input byte.
  ankerl::nanobench::Bench bench;
  bench.batch(match.size()).unit("byte");
  auto rescan = [&]() {
    // Start every iteration from the start state so each run does equal work.
    dfa.reset();
    bench.doNotOptimizeAway(
        dfa.scan(match.data(), match.data() + match.size()));
  };
  bench.run("utf8 dfa", rescan);
}