Compare commits

...

10 Commits

Author SHA1 Message Date
fc1b3ac147 Fix string scan bug 2025-06-22 23:02:38 -04:00
95250d1668 Improve fuzz testing to find string scan bug 2025-06-22 23:00:34 -04:00
5a74124cae Fix bug 2025-06-22 19:47:59 -04:00
337d93bcea simd scan with dfa as fallback 2025-06-21 21:59:21 -04:00
fa0cc1a970 Scan string directly in n_value 2025-06-21 21:17:12 -04:00
229a68bfdd Switch to dfa for strings 2025-06-21 17:12:48 -04:00
6c48c40d67 Rename tables 2025-06-21 16:05:02 -04:00
1df18b3c57 Add comments to dfa's 2025-06-21 16:01:35 -04:00
7806f6420f Add to corpus 2025-06-21 15:56:42 -04:00
5613303d52 Add automata to recognize utf8 in strings 2025-06-21 15:56:15 -04:00
27 changed files with 706 additions and 194 deletions

View File

@@ -0,0 +1 @@
[5,[5K,<2C>ǀ<EFBFBD><C780><EFBFBD><EFBFBD><EFBFBD><EFBFBD>

View File

@@ -0,0 +1,85 @@
[8,[[[[ [8,[[[[ [[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,[[[
[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,[[[
[8,8,[[[[ [[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,[[[
[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,[[[
[8,8,[[[[ [[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,[[[
[8,[8,[8,[[[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,[[[
[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[
[[-418,[8,[[[[[-41.8,[8,8[,[[[[[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[641.8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[
[[-418,[8,[[[[[-41.8,[8,[8,[[[[[-41.8,[3,[8,[[[[ [[[
[
[[-41.8,[8,[8,6[[[
[8,[8,[8,[[[
[[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[
[

View File

@@ -0,0 +1 @@
[5,7.777E-5,[57E-5,77.777E-<2D>7.77E-ǀ

View File

@@ -0,0 +1 @@
"

View File

@@ -0,0 +1 @@
[-5,-1<>MMMM<4D><4D>MM<4D>

View File

@@ -0,0 +1,60 @@
[[],[[],[],[[],[[],[[],[[],[],[[],[3,[3,[],[],[3,[],[],[[],[[],[[],[],[3,[3,[],[],[3,[],[[],[],[[],[[],[[],[[[],[],[[],[ [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[[[[[[[[{},[[[[[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41,{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[7,[[[[41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418444444444444444444444444444,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41,{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8.8,[8,[[[{},{}, [[[{},{},[[false, [[false, [[[{},8, [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[
[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.9,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41,{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[7,[[[[41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-417.777E-8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41,{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-941.8,[8,[8,
[[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1 @@
[[[1,334e55,335,336e5,34e55,3352071e5,33117e5,334e55,352234e5,33117e5,334e55,3e5,334e55,3352234e5,334e55,33,334e5,352234e5,364e55,33e5,334e55,3e5,33455,334e5,34e55,3352234e5,33117e5,334e54,3e5,334e55,3352234e5,334e55,33e5,355,3e5,334e5,334e5,34e55,3352234e5,33117,33,334e5,35e55,33952234e5,334e55,334e5,34e55,33522344e54,37e5,e5,334e55e55

View File

@@ -0,0 +1 @@
[<5B><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>501

View File

@@ -0,0 +1 @@
[[33<33>3666p<36>89501

View File

@@ -0,0 +1 @@


View File

@@ -0,0 +1 @@
[[330.5026663,666666891.5026664333333,666666666666666000001891.50266643,66666000001843,6666666666666391.5026664333333,66666666666666603,666666666666666000001891.50266643,666666891.5026664333333,666666666666666000001891.50266643,666666666666633333363,666666666666666000001333333,666666666666666000001866000001891.50266643,5026664333333,666666666666666000001891.50266643,666666891.5026664333333,666666666666666000001891.50266643,6666<36>~0

View File

@@ -0,0 +1,13 @@
[[[{},{}, [8,[8,
[[-41.8,[-48,[8,[
[[-418,[8,[[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-3,[8,[[[[[[{},{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[-41.8,[8,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[],[[],[],[[],[[],[][[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-3,[8,[[[[ [[[

Binary file not shown.

View File

@@ -0,0 +1 @@
nul<EFBFBD>,<2C><>+<2B>lllllll

View File

@@ -0,0 +1 @@
[8,[8,[8,-.[<5B><>..

View File

@@ -0,0 +1 @@
[5,[5,ǀ<><C780><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>

View File

@@ -0,0 +1 @@
[8,[8,[8399,-<2D>8,

View File

@@ -0,0 +1,15 @@
[[[{},{}, [[[{},{},[[[[{},{}, [[[{},{},[[false,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41,{},[[false, [[false, [[[{},[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8.8,[8,[[[{},{}, [[[{},{},[[false, [[false, [[[{},8, [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8.8,[8,[[[{},{}, [[[{}, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41,[8,[8,[[[[ [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41,{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[ [[[
[[-418,[8,[[[
[[-41.8,[8,[[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8.8,[8,[[[{},{}, [[[{},{},[[false, [[false, [[[[false, [[false, [[8,[[[
[[-41,{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-41.8,[8,[8,[[[[ [[[ [[-41.8,[3,[8,[[[[[-41.8,[8,[
[[-418,[8,[[[[[[-41.8,[8,[8,[[[[ [[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8,[[[[[-20.8,[8,[8,[[1.8,[3,[[[false, [[[{},[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8.8,[8,[[[{},{}, [[[{},{},[[false, [[false, [[[{},8, [[[
[[-41,{},[[[[{},{}, [[[{},{},[[false, [[false, [[[{},8,[-41.8,[8,[8.8,[8,[[[{},{[[[
[8{},[-[

View File

@@ -0,0 +1 @@
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>-8<><38><EFBFBD><EFBFBD><EFBFBD>0

View File

@@ -0,0 +1 @@
2.6

1
corpus/example.bin Normal file
View File

@@ -0,0 +1 @@
"<22> <20> "

View File

@@ -1,5 +1,6 @@
#include "callbacks.h"
#include "json_value.h"
#include "parser3.h"
#include "weaseljson.h"
#include <simdjson.h>
@@ -41,6 +42,25 @@ std::pair<std::string, WeaselJsonStatus> runBatch(std::string copy) {
return {state.result, s};
}
std::pair<std::string, WeaselJsonStatus> runPrefix(std::string copy,
int prefix) {
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix);
if (s != WeaselJson_AGAIN) {
return {state.result, s};
}
s = WeaselJsonParser_parse(parser.get(), copy.data() + prefix,
copy.size() - prefix);
if (s != WeaselJson_AGAIN) {
return {state.result, s};
}
s = WeaselJsonParser_parse(parser.get(), nullptr, 0);
return {state.result, s};
}
void testStreaming(std::string const &json) {
auto batch = runBatch(json);
if (batch.second == WeaselJson_AGAIN) {
@@ -56,15 +76,36 @@ void testStreaming(std::string const &json) {
bool batchOk = batch.second == WeaselJson_OK;
if (streamingOk == batchOk && !batchOk) {
// It's ok if the processed data doesn't match if parsing failed
continue;
} else {
printf("streaming: %s, %s\n",
streaming.second == WeaselJson_OK ? "accept" : "reject",
streaming.first.c_str());
printf("batch: %s, %s\n",
batch.second == WeaselJson_OK ? "accept" : "reject",
batch.first.c_str());
abort();
}
}
if (int(json.size()) > stride) {
auto prefix = runPrefix(json, stride);
if (prefix != batch) {
if (prefix.second == WeaselJson_AGAIN) {
abort();
}
bool prefixOk = prefix.second == WeaselJson_OK;
bool batchOk = batch.second == WeaselJson_OK;
if (prefixOk == batchOk && !batchOk) {
// It's ok if the processed data doesn't match if parsing failed
} else {
printf("prefix: %s, %s\n",
prefix.second == WeaselJson_OK ? "accept" : "reject",
prefix.first.c_str());
printf("batch: %s, %s\n",
batch.second == WeaselJson_OK ? "accept" : "reject",
batch.first.c_str());
abort();
}
}
printf("streaming: %s, %s\n",
streaming.second == WeaselJson_OK ? "accept" : "reject",
streaming.first.c_str());
printf("batch: %s, %s\n",
batch.second == WeaselJson_OK ? "accept" : "reject",
batch.first.c_str());
abort();
}
}
}
@@ -132,5 +173,19 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
testStreaming(s);
compareWithSimdjson(s);
testStringRoundTrip(s);
bool json_utf8 = true;
for (int i = 0; i < int(size); ++i) {
uint8_t c = data[i];
json_utf8 = json_utf8 && c >= 0x20 && c != '"' && c != '\\';
}
if (json_utf8) {
parser3::Utf8Dfa dfa;
auto result = dfa.scan((const char *)data, (const char *)data + size);
bool ok = result == (const char *)data + size && dfa.accept();
bool valid = simdjson::validate_utf8(s.data(), s.size());
if (ok != valid) {
abort();
}
}
return 0;
}

View File

@@ -16,8 +16,13 @@
namespace parser3 {
// See https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725 for
// an explanation of this cycle/byte dfa implementation.
//
// Recognizes json number syntax. As a regex:
// -?([0-9]|[1-9][0-9]*)(\.[0-9]+)?((e|E)(-|\+)?[0-9]+)?
struct NumDfa {
constexpr static uint64_t num_dfa_table[256] = {
constexpr static uint64_t table[256] = {
0x0ull,
0x0ull,
0x0ull,
@@ -295,7 +300,7 @@ struct NumDfa {
constexpr int kStride = 16;
if (bufEnd - buf < kStride) [[unlikely]] {
while (buf != bufEnd) {
uint64_t row = num_dfa_table[uint8_t(*buf)];
uint64_t row = table[uint8_t(*buf)];
auto prev = state_;
state_ = (row >> (state_ & 63)) & 63;
if (state_ == 0) {
@@ -310,7 +315,7 @@ struct NumDfa {
uint8_t prev[kStride + 1];
prev[0] = state_;
for (int i = 0; i < kStride; ++i) {
uint64_t row = num_dfa_table[uint8_t(*buf)];
uint64_t row = table[uint8_t(*buf)];
prev[i + 1] = row >> (prev[i] & 63);
if ((prev[i + 1] & 63) == 0) {
state = prev[i];
@@ -326,6 +331,315 @@ private:
uint64_t state = 6;
};
// Recognizes sequences of valid utf8 characters except 0-0x20, double quote,
// and backslash
struct Utf8Dfa {
constexpr static uint64_t table[256] = {
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x30000000000000ull,
0x30000000000000ull,
0x0ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x0ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x30000000000000ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x18630780780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x1863001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x60063001e780ull,
0x0ull,
0x0ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x18000000000000ull,
0x2a000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0x24000000000000ull,
0x1e000000000000ull,
0x1e000000000000ull,
0xc000000000000ull,
0x6000000000000ull,
0x6000000000000ull,
0x6000000000000ull,
0x12000000000000ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
0x0ull,
};
// Restore this dfa to its start state
void reset() { state = 48; }
// Return true if this dfa is in an accept state. You probably want to call
// scan until the match ends first.
bool accept() const { return (state & 63) == 48; }
// return value either points to the first byte which does not match, or
// bufEnd. Leaves the dfa in the last state of the match.
#ifdef __x86_64__
__attribute__((target_clones("default", "bmi2")))
#endif
const char *
scan(const char *buf, const char *bufEnd) {
auto state_ = state;
for (;;) {
constexpr int kStride = 16;
if (bufEnd - buf < kStride) [[unlikely]] {
while (buf != bufEnd) {
uint64_t row = table[uint8_t(*buf)];
auto prev = state_;
state_ = (row >> (state_ & 63)) & 63;
if (state_ == 0) {
state_ = prev;
break;
}
++buf;
}
state = state_;
return buf;
}
uint8_t prev[kStride + 1];
prev[0] = state_;
for (int i = 0; i < kStride; ++i) {
uint64_t row = table[uint8_t(*buf)];
prev[i + 1] = row >> (prev[i] & 63);
if ((prev[i + 1] & 63) == 0) {
state = prev[i];
return buf;
}
++buf;
}
state_ = prev[kStride];
}
}
private:
uint64_t state = 48;
};
typedef PRESERVE_NONE WeaselJsonStatus (*Continuation)(struct Parser3 *,
char *buf, char *bufEnd);
@@ -353,8 +667,6 @@ enum Symbol : uint8_t {
T_L,
T_S,
T_COLON,
T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE,
T_HEX,
T_HEX2,
T_HEX3,
@@ -434,6 +746,7 @@ struct Parser3 {
int const stackSize;
bool complete;
NumDfa numDfa;
Utf8Dfa strDfa;
};
inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf,
@@ -467,6 +780,78 @@ inline PRESERVE_NONE WeaselJsonStatus n_number(Parser3 *self, char *buf,
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Moves `buf` forward to the first double quote, backslash, byte below 0x20,
// or invalid utf8 byte (or to bufEnd if none is found), and copies the
// scanned bytes to self->writeBuf.
//
// Returns WeaselJson_AGAIN (after flushString(false)) when the whole input
// was consumed, WeaselJson_REJECT when the scan stopped while the dfa was
// not in its accept state, and WeaselJson_OK otherwise. V is the simd vector
// type used for the fast path.
template <class V>
inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self,
                                                       char *&buf,
                                                       char *bufEnd) {
  char *const runBegin = buf;

  // Vector fast path. It is only entered when the dfa is in its accept
  // state, and it only skips bytes that map the accept state to itself. The
  // lanes are signed, so bytes >= 0x80 fail the `>= 0x20` comparison and are
  // left for the dfa below.
  if (self->strDfa.accept()) {
    for (;;) {
      if (bufEnd - buf < V::lanes) [[unlikely]] {
        break;
      }
      auto chunk = V{(int8_t *)buf};
      int plainRun = ((chunk != V::splat('"')) & (chunk != V::splat('\\')) &
                      (chunk >= V::splat(0x20)))
                         .count_leading_nonzero_lanes();
      buf += plainRun;
      if (plainRun < V::lanes) {
        break;
      }
    }
  }

  // The dfa handles whatever the vector loop did not consume.
  buf = (char *)self->strDfa.scan(buf, bufEnd);

  // Append the scanned bytes to the output unless they are already in place.
  int scanned = buf - runBegin;
  if (self->writeBuf != runBegin) {
    memmove(self->writeBuf, runBegin, scanned);
  }
  self->writeBuf += scanned;

  if (buf == bufEnd) {
    // Ran out of input; hand over what we have and ask for more.
    self->flushString(false);
    return WeaselJson_AGAIN;
  }
  if (!self->strDfa.accept()) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  return WeaselJson_OK;
}
// scan_string: entry point for string scanning. Forwards to scan_string_impl
// instantiated for the best available vector type. `buf` is taken by
// reference in every variant because callers inspect `*buf` after the call
// and rely on it having been advanced to the byte that stopped the scan.
#ifdef __x86_64__
constexpr int kLanes = 32;

template WeaselJsonStatus
scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_SSE>>(Parser3 *, char *&,
                                                          char *);

template __attribute__((target("avx2"))) WeaselJsonStatus
scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_AVX2>>(Parser3 *, char *&,
                                                           char *);

// Baseline x86-64 variant (SSE).
__attribute__((target("default"))) inline PRESERVE_NONE WeaselJsonStatus
scan_string(Parser3 *self, char *&buf, char *bufEnd) {
  MUSTTAIL return scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_SSE>>(
      self, buf, bufEnd);
}

// Variant selected when the cpu supports avx2.
__attribute__((target("avx2"))) inline PRESERVE_NONE WeaselJsonStatus
scan_string(Parser3 *self, char *&buf, char *bufEnd) {
  MUSTTAIL return scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_AVX2>>(
      self, buf, bufEnd);
}
#else
// Portable variant. `buf` must be `char *&` here as well: taking it by value
// would leave the caller's pointer un-advanced, and the signature would no
// longer match scan_string_impl for the tail call.
inline PRESERVE_NONE WeaselJsonStatus scan_string(Parser3 *self, char *&buf,
                                                  char *bufEnd) {
  MUSTTAIL return scan_string_impl<simd<int8_t, 32>>(self, buf, bufEnd);
}
#endif
inline PRESERVE_NONE WeaselJsonStatus n_value(Parser3 *self, char *buf,
char *bufEnd) {
assert(bufEnd - buf != 0);
@@ -497,10 +882,31 @@ inline PRESERVE_NONE WeaselJsonStatus n_value(Parser3 *self, char *buf,
++buf;
self->dataBegin = self->writeBuf = buf;
self->pop();
if (auto s = self->push({N_STRING2})) {
self->strDfa.reset();
if (auto s = scan_string(self, buf, bufEnd)) {
if (s == WeaselJson_AGAIN) {
if (auto s2 = self->push({N_STRING2})) {
return s2;
}
}
return s;
}
break;
{
switch (*buf) {
case '"':
self->flushString(true);
++buf;
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case '\\':
++buf;
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
default:
return WeaselJson_REJECT;
}
}
case '0':
case '1':
case '2':
@@ -601,6 +1007,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_object2(Parser3 *self, char *buf,
++buf;
self->dataBegin = self->writeBuf = buf;
self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) {
return s;
}
@@ -703,125 +1110,36 @@ inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
++buf;
self->dataBegin = self->writeBuf = buf;
self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
template <class V>
PRESERVE_NONE WeaselJsonStatus n_string2_impl(Parser3 *self, char *buf,
char *bufEnd) {
const auto before = buf;
// Advance buf to the first "non-normal" character
for (;;) {
if (bufEnd - buf < V::lanes) [[unlikely]] {
while (buf != bufEnd &&
tables.stringByteMeaning[uint8_t(*buf)] == Tables::NORMAL) {
++buf;
}
break;
}
auto v = V{(int8_t *)buf};
int normal =
(v != V::splat('"') & v != V::splat('\\') & v >= V::splat(0x20))
.count_leading_nonzero_lanes();
buf += normal;
if (normal < V::lanes) {
break;
}
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
char *bufEnd) {
if (auto s = scan_string(self, buf, bufEnd)) {
return s;
}
int len = buf - before;
memmove(self->writeBuf, before, len);
self->writeBuf += len;
if (buf == bufEnd) {
self->flushString(false);
return WeaselJson_AGAIN;
}
switch (tables.stringByteMeaning[uint8_t(*buf)]) {
case Tables::NORMAL:
__builtin_unreachable();
case Tables::DUBQUOTE:
switch (*buf) {
case '"':
self->flushString(true);
++buf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::BACKSLASH:
case '\\':
++buf;
self->pop();
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::TWO_BYTE_UTF8:
// two byte utf-8 encoding
self->utf8Codepoint = *buf & 0b00011111;
self->minCodepoint = 0x80;
*self->writeBuf++ = *buf++;
self->pop();
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::THREE_BYTE_UTF8:
// three byte utf-8 encoding
self->utf8Codepoint = *buf & 0b00001111;
self->minCodepoint = 0x800;
*self->writeBuf++ = *buf++;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::FOUR_BYTE_UTF8:
// four byte utf-8 encoding
self->utf8Codepoint = *buf & 0b00000111;
self->minCodepoint = 0x10000;
*self->writeBuf++ = *buf++;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case Tables::CONTINUATION_BYTE:
case Tables::INVALID:
[[unlikely]] return WeaselJson_REJECT;
default:
__builtin_unreachable();
[[unlikely]] return WeaselJson_REJECT;
}
}
#ifdef __x86_64__
template WeaselJsonStatus
n_string2_impl<simd<int8_t, 64, sse::Simd_x86_SSE>>(Parser3 *, char *, char *);
template __attribute__((target("avx2"))) WeaselJsonStatus
n_string2_impl<simd<int8_t, 64, sse::Simd_x86_AVX2>>(Parser3 *, char *, char *);
__attribute__((target("default"))) inline PRESERVE_NONE WeaselJsonStatus
n_string2(Parser3 *self, char *buf, char *bufEnd) {
MUSTTAIL return n_string2_impl<simd<int8_t, 64, sse::Simd_x86_SSE>>(self, buf,
bufEnd);
}
__attribute__((target("avx2"))) inline PRESERVE_NONE WeaselJsonStatus
n_string2(Parser3 *self, char *buf, char *bufEnd) {
MUSTTAIL return n_string2_impl<simd<int8_t, 64, sse::Simd_x86_AVX2>>(
self, buf, bufEnd);
}
#else
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
char *bufEnd) {
MUSTTAIL return n_string2_impl<simd<int8_t, 32>>(self, buf, bufEnd);
}
#endif
inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
char *buf,
char *bufEnd) {
@@ -836,6 +1154,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
case 't':
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)];
self->pop();
self->strDfa.reset();
if (auto s = self->push({N_STRING2})) {
return s;
}
@@ -844,6 +1163,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
++buf;
self->utf8Codepoint = 0;
self->pop();
self->strDfa.reset();
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
return s;
}
@@ -853,40 +1173,6 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
}
}
inline PRESERVE_NONE WeaselJsonStatus t_utf8_continuation_byte(Parser3 *self,
char *buf,
char *bufEnd) {
if (tables.stringByteMeaning[uint8_t(*buf)] != Tables::CONTINUATION_BYTE)
[[unlikely]] {
return WeaselJson_REJECT;
}
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *buf & 0b00111111;
*self->writeBuf++ = *buf++;
self->pop();
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
inline PRESERVE_NONE WeaselJsonStatus
t_utf8_last_continuation_byte(Parser3 *self, char *buf, char *bufEnd) {
if (tables.stringByteMeaning[uint8_t(*buf)] != Tables::CONTINUATION_BYTE)
[[unlikely]] {
return WeaselJson_REJECT;
}
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *buf & 0b00111111;
if (self->utf8Codepoint < self->minCodepoint ||
self->utf8Codepoint > 0x10ffff ||
(0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff))
[[unlikely]] {
return WeaselJson_REJECT;
}
// TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
*self->writeBuf++ = *buf++;
self->pop();
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf,
char *bufEnd) {
if ('0' <= *buf && *buf <= '9') {
@@ -1147,9 +1433,6 @@ constexpr inline struct ContinuationTable {
continuations[T_L] = singleChar<'l'>;
continuations[T_S] = singleChar<'s'>;
continuations[T_COLON] = singleChar<':', true>;
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
t_utf8_last_continuation_byte;
continuations[T_HEX] = t_hex;
continuations[T_HEX2] = t_hex2;
continuations[T_HEX3] = t_hex3;
@@ -1176,7 +1459,6 @@ constexpr inline struct ContinuationTable {
symbolNames[T_L] = "singleChar<'l'>";
symbolNames[T_S] = "singleChar<'s'>";
symbolNames[T_COLON] = "singleChar<':'>";
symbolNames[T_UTF8_CONTINUATION_BYTE] = "t_utf8_continuation_byte";
symbolNames[T_HEX] = "t_hex";
symbolNames[T_HEX2] = "t_hex2";
symbolNames[T_HEX3] = "t_hex3";
@@ -1201,8 +1483,6 @@ inline PRESERVE_NONE WeaselJsonStatus Parser3::keepGoing(Parser3 *self,
switch (self->top()) {
case N_STRING2:
case N_STRING_FOLLOWING_ESCAPE:
case T_UTF8_CONTINUATION_BYTE:
case T_UTF8_LAST_CONTINUATION_BYTE:
case T_HEX:
case T_HEX2:
case T_HEX3:

View File

@@ -1,50 +1,12 @@
#pragma once
constexpr inline struct Tables {
enum StringByteMeaning {
INVALID,
NORMAL,
DUBQUOTE,
BACKSLASH,
TWO_BYTE_UTF8,
THREE_BYTE_UTF8,
FOUR_BYTE_UTF8,
CONTINUATION_BYTE,
};
constexpr Tables() {
whitespace[' '] = true;
whitespace['\n'] = true;
whitespace['\r'] = true;
whitespace['\t'] = true;
for (int i = 0; i < 256; ++i) {
if ((i & 0b11000000) == 0b10000000) {
stringByteMeaning[i] = CONTINUATION_BYTE;
}
if ((i & 0b11100000) == 0b11000000) {
stringByteMeaning[i] = TWO_BYTE_UTF8;
}
if ((i & 0b11110000) == 0b11100000) {
stringByteMeaning[i] = THREE_BYTE_UTF8;
}
if ((i & 0b11111000) == 0b11110000) {
stringByteMeaning[i] = FOUR_BYTE_UTF8;
}
}
for (int i = 0x20; i < 128; ++i) {
stringByteMeaning[i] = NORMAL;
}
stringByteMeaning['"'] = DUBQUOTE;
stringByteMeaning['\\'] = BACKSLASH;
stringByteMeaning[0xc0] = INVALID;
stringByteMeaning[0xc1] = INVALID;
for (int i = 0xF5; i < 0x100; ++i) {
stringByteMeaning[i] = INVALID;
}
unescape['n'] = '\n';
unescape['r'] = '\r';
unescape['t'] = '\t';
@@ -55,6 +17,5 @@ constexpr inline struct Tables {
unescape['/'] = '/';
}
bool whitespace[256]{};
StringByteMeaning stringByteMeaning[256]{};
char unescape[256]{};
} tables;

View File

@@ -310,8 +310,15 @@ TEST_CASE("bench5") {
TEST_CASE("num dfa") {
parser3::NumDfa dfa;
std::string match = "-1231279127389127389127398127389712893791287389217327482"
"374.0e69010101010101010101010101010101";
std::string match =
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"111111111111111111111111111111111111111111111111111111111111111111111111"
"11111111";
auto *buf = dfa.scan(match.data(), match.data() + match.size());
CHECK(buf == match.data() + match.size());
CHECK(dfa.accept());
@@ -325,3 +332,23 @@ TEST_CASE("num dfa") {
dfa.scan(match.data(), match.data() + match.size()));
});
}
// Checks that Utf8Dfa consumes a long run of four-byte characters and ends in
// an accept state, then benchmarks scanning the same input (per byte).
TEST_CASE("utf8 dfa") {
  parser3::Utf8Dfa dfa;
  std::string input =
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩"
      "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩";

  // Correctness: the whole input matches and the dfa ends between characters.
  const char *stoppedAt = dfa.scan(input.data(), input.data() + input.size());
  CHECK(stoppedAt == input.data() + input.size());
  CHECK(dfa.accept());

  // Throughput: reset before every run so each scan starts from the start
  // state.
  ankerl::nanobench::Bench bench;
  bench.batch(input.size());
  bench.unit("byte");
  bench.run("utf8 dfa", [&]() {
    dfa.reset();
    bench.doNotOptimizeAway(
        dfa.scan(input.data(), input.data() + input.size()));
  });
}