Add automata to recognize UTF-8 in strings

This commit is contained in:
2025-06-21 15:56:15 -04:00
parent d1523acf94
commit 5613303d52
3 changed files with 351 additions and 2 deletions

View File

@@ -310,8 +310,15 @@ TEST_CASE("bench5") {
TEST_CASE("num dfa") {
parser3::NumDfa dfa;
std::string match = "-1231279127389127389127398127389712893791287389217327482"
"374.0e69010101010101010101010101010101";
std::string match =
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"11111111";
auto *buf = dfa.scan(match.data(), match.data() + match.size());
CHECK(buf == match.data() + match.size());
CHECK(dfa.accept());
@@ -325,3 +332,23 @@ TEST_CASE("num dfa") {
dfa.scan(match.data(), match.data() + match.size()));
});
}
// Sanity-checks and benchmarks parser3::Utf8Dfa on a string made up entirely
// of repeated multi-byte emoji characters.
TEST_CASE("utf8 dfa") {
  parser3::Utf8Dfa dfa;
  std::string match =
"💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
"💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
"💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩";

  // Single place that feeds the whole input through the automaton; used for
  // both the correctness check and the timed loop.
  auto scan_all = [&]() {
    return dfa.scan(match.data(), match.data() + match.size());
  };

  // The scan must stop exactly at the end of the input and leave the
  // automaton in an accepting state.
  auto *stop = scan_all();
  CHECK(stop == match.data() + match.size());
  CHECK(dfa.accept());

  // Report throughput per input byte.
  ankerl::nanobench::Bench bench;
  bench.batch(match.size()).unit("byte").run("utf8 dfa", [&]() {
    // Reset before every iteration so each run starts from the same state.
    dfa.reset();
    bench.doNotOptimizeAway(scan_all());
  });
}