Add automata to recognize utf8 in strings
This commit is contained in:
15
src/fuzz.cpp
15
src/fuzz.cpp
@@ -1,5 +1,6 @@
|
||||
#include "callbacks.h"
|
||||
#include "json_value.h"
|
||||
#include "parser3.h"
|
||||
#include "weaseljson.h"
|
||||
|
||||
#include <simdjson.h>
|
||||
@@ -132,5 +133,19 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
testStreaming(s);
|
||||
compareWithSimdjson(s);
|
||||
testStringRoundTrip(s);
|
||||
bool json_utf8 = true;
|
||||
for (int i = 0; i < int(size); ++i) {
|
||||
uint8_t c = data[i];
|
||||
json_utf8 = json_utf8 && c >= 0x20 && c != '"' && c != '\\';
|
||||
}
|
||||
if (json_utf8) {
|
||||
parser3::Utf8Dfa dfa;
|
||||
auto result = dfa.scan((const char *)data, (const char *)data + size);
|
||||
bool ok = result == (const char *)data + size && dfa.accept();
|
||||
bool valid = simdjson::validate_utf8(s.data(), s.size());
|
||||
if (ok != valid) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
307
src/parser3.h
307
src/parser3.h
@@ -326,6 +326,313 @@ private:
|
||||
uint64_t state = 6;
|
||||
};
|
||||
|
||||
// Table-driven DFA that recognizes a run of bytes which is well-formed UTF-8
// and contains no byte below 0x20, no '"' and no '\\' (presumably the
// unescaped part of a JSON string body -- confirm against the caller).
//
// Encoding: a state is a bit offset (a multiple of 6) into a 64-bit table row.
// The 6-bit field found at that offset in the row for the current input byte
// is the next state, so one transition is `(row >> state) & 63`.
//
// States, as decoded from the table below:
//    0  reject (no transition leaves it; scan stops before entering it)
//    6  after F1..F3: expects 80..BF, then two more continuation bytes
//   12  after F0:     expects 90..BF, then two more continuation bytes
//   18  after F4:     expects 80..8F, then two more continuation bytes
//   24  expects one more continuation byte (80..BF)
//   30  expects two more continuation bytes (80..BF each)
//   36  after ED:     expects 80..9F, then one more continuation byte
//   42  after E0:     expects A0..BF, then one more continuation byte
//   48  start state, and the only accepting state (between code points)
struct Utf8Dfa {
  // One row per input byte; see the struct comment for the field layout.
  // NOTE(review): the name looks carried over from the numeric DFA; it is
  // kept unchanged because the member is public.
  constexpr static uint64_t num_dfa_table[256] = {
      // 0x00-0x1F: control characters, rejected in every state
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x00
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x04
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x08
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x0C
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x10
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x14
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x18
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0x1C
      // 0x20-0x7F: ASCII, start -> start; '"' (0x22) and '\\' (0x5C) rejected
      0x30000000000000ull, 0x30000000000000ull, 0x0ull, 0x30000000000000ull, // 0x20
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x24
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x28
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x2C
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x30
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x34
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x38
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x3C
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x40
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x44
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x48
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x4C
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x50
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x54
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x58
      0x0ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x5C
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x60
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x64
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x68
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x6C
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x70
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x74
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x78
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, // 0x7C
      // 0x80-0x8F: continuation; valid in states 6, 18, 24, 30, 36
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull, // 0x80
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull, // 0x84
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull, // 0x88
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull, // 0x8C
      // 0x90-0x9F: continuation; valid in states 6, 12, 24, 30, 36
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, // 0x90
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, // 0x94
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, // 0x98
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, // 0x9C
      // 0xA0-0xBF: continuation; valid in states 6, 12, 24, 30, 42
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xA0
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xA4
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xA8
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xAC
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xB0
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xB4
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xB8
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, // 0xBC
      // 0xC0, 0xC1: would start an overlong two-byte form, rejected.
      // 0xC2-0xDF: two-byte lead, start -> 24
      0x0ull, 0x0ull, 0x18000000000000ull, 0x18000000000000ull, // 0xC0
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xC4
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xC8
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xCC
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xD0
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xD4
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xD8
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, // 0xDC
      // 0xE0-0xEF: three-byte leads; E0 -> 42, ED -> 36, the rest -> 30
      0x2a000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, // 0xE0
      0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, // 0xE4
      0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, // 0xE8
      0x1e000000000000ull, 0x24000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, // 0xEC
      // 0xF0-0xF4: four-byte leads; F0 -> 12, F1..F3 -> 6, F4 -> 18
      0xc000000000000ull, 0x6000000000000ull, 0x6000000000000ull, 0x6000000000000ull, // 0xF0
      // 0xF5-0xFF: never valid in UTF-8, rejected
      0x12000000000000ull, 0x0ull, 0x0ull, 0x0ull, // 0xF4
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0xF8
      0x0ull, 0x0ull, 0x0ull, 0x0ull, // 0xFC
  };

  // Restore this dfa to its start state
  void reset() { state = kStartState; }

  // Return true if this dfa is in an accept state, i.e. the bytes consumed so
  // far end on a code point boundary. You probably want to call scan until the
  // match ends first.
  bool accept() const { return (state & kStateMask) == kStartState; }

  // Consume bytes from [buf, bufEnd) for as long as they keep the dfa out of
  // the reject state.
  //
  // The return value either points to the first byte which does not match, or
  // is bufEnd. The dfa is left in the last state of the match (the rejected
  // byte is not consumed), so a later call can continue where this one
  // stopped, including in the middle of a multi-byte sequence.
  //
  // NOTE(review): the bmi2 clone presumably exists so the variable shift can
  // be a flag-free shrx -- confirm with a benchmark before changing.
#ifdef __x86_64__
  __attribute__((target_clones("default", "bmi2")))
#endif
  const char *
  scan(const char *buf, const char *bufEnd) {
    auto state_ = state;
    for (;;) {
      constexpr int kStride = 16;
      if (bufEnd - buf < kStride) [[unlikely]] {
        // Fewer than kStride bytes left: plain byte-at-a-time loop.
        while (buf != bufEnd) {
          const uint64_t row = num_dfa_table[uint8_t(*buf)];
          const uint64_t next = (row >> (state_ & kStateMask)) & kStateMask;
          if (next == kRejectState) {
            break; // leave *buf unconsumed and keep the previous state
          }
          state_ = next;
          ++buf;
        }
        // state_ may still carry junk above bit 5 from the strided loop when
        // the tail is empty; always store the clean 6-bit state.
        state = state_ & kStateMask;
        return buf;
      }
      // Strided path: at least kStride bytes are available, so the inner loop
      // needs no bounds check. Each entry is truncated to a byte; only its low
      // 6 bits are the state, the top two bits spill over from the next field
      // and are masked off wherever the value is used.
      uint8_t prev[kStride + 1];
      prev[0] = static_cast<uint8_t>(state_);
      for (int i = 0; i < kStride; ++i) {
        const uint64_t row = num_dfa_table[uint8_t(*buf)];
        prev[i + 1] = static_cast<uint8_t>(row >> (prev[i] & kStateMask));
        if ((prev[i + 1] & kStateMask) == kRejectState) {
          state = prev[i] & kStateMask;
          return buf;
        }
        ++buf;
      }
      state_ = prev[kStride];
    }
  }

private:
  // A state occupies 6 bits, both as a table field and as a shift amount.
  static constexpr uint64_t kStateMask = 63;
  // Dead state: a zero field in a row means "no transition".
  static constexpr uint64_t kRejectState = 0;
  // Start state; also the only accepting state.
  static constexpr uint64_t kStartState = 48;

  uint64_t state = kStartState;
};
|
||||
|
||||
typedef PRESERVE_NONE WeaselJsonStatus (*Continuation)(struct Parser3 *,
|
||||
char *buf, char *bufEnd);
|
||||
|
||||
|
||||
31
src/test.cpp
31
src/test.cpp
@@ -310,8 +310,15 @@ TEST_CASE("bench5") {
|
||||
|
||||
TEST_CASE("num dfa") {
|
||||
parser3::NumDfa dfa;
|
||||
std::string match = "-1231279127389127389127398127389712893791287389217327482"
|
||||
"374.0e69010101010101010101010101010101";
|
||||
std::string match =
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"111111111111111111111111111111111111111111111111111111111111111111111111"
|
||||
"11111111";
|
||||
auto *buf = dfa.scan(match.data(), match.data() + match.size());
|
||||
CHECK(buf == match.data() + match.size());
|
||||
CHECK(dfa.accept());
|
||||
@@ -325,3 +332,23 @@ TEST_CASE("num dfa") {
|
||||
dfa.scan(match.data(), match.data() + match.size()));
|
||||
});
|
||||
}
|
||||
|
||||
// Feeds Utf8Dfa a long run of four-byte sequences, checks that the whole
// input is consumed and accepted, then benchmarks scan throughput per byte.
TEST_CASE("utf8 dfa") {
  std::string input =
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩";
  const char *const first = input.data();
  const char *const last = first + input.size();

  parser3::Utf8Dfa dfa;
  const char *stop = dfa.scan(first, last);
  CHECK(stop == last);
  CHECK(dfa.accept());

  ankerl::nanobench::Bench bench;
  bench.batch(input.size());
  bench.unit("byte");
  bench.run("utf8 dfa", [&]() {
    // Start every iteration from the start state so each run scans the same
    // input from scratch.
    dfa.reset();
    bench.doNotOptimizeAway(dfa.scan(first, last));
  });
}
|
||||
|
||||
Reference in New Issue
Block a user