Add WeaselJsonRaw flag

This commit is contained in:
2025-06-25 18:44:37 -04:00
parent 3e72181bee
commit 9e4f90f218
4 changed files with 162 additions and 107 deletions

View File

@@ -10,7 +10,9 @@ std::pair<std::string, WeaselJsonStatus> runStreaming(std::string copy,
SerializeState state; SerializeState state;
auto c = serializeCallbacks(); auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{ std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
if (stride == 0) { if (stride == 0) {
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (s != WeaselJson_AGAIN) { if (s != WeaselJson_AGAIN) {
@@ -33,7 +35,9 @@ std::pair<std::string, WeaselJsonStatus> runBatch(std::string copy) {
SerializeState state; SerializeState state;
auto c = serializeCallbacks(); auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{ std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (s != WeaselJson_AGAIN) { if (s != WeaselJson_AGAIN) {
return {state.result, s}; return {state.result, s};
@@ -47,7 +51,9 @@ std::pair<std::string, WeaselJsonStatus> runPrefix(std::string copy,
SerializeState state; SerializeState state;
auto c = serializeCallbacks(); auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{ std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix); auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix);
if (s != WeaselJson_AGAIN) { if (s != WeaselJson_AGAIN) {
return {state.result, s}; return {state.result, s};
@@ -116,7 +122,8 @@ void compareWithSimdjson(std::string const &json) {
auto copy = json; auto copy = json;
auto c = noopCallbacks(); auto c = noopCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)>
parser{WeaselJsonParser_create(1024, &c, nullptr, 0), parser{WeaselJsonParser_create(
1024, &c, nullptr, json.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy}; WeaselJsonParser_destroy};
ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (ours == WeaselJson_AGAIN) { if (ours == WeaselJson_AGAIN) {

View File

@@ -12,7 +12,8 @@ WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks,
if (buf == nullptr) { if (buf == nullptr) {
return nullptr; return nullptr;
} }
return (WeaselJsonParser *)new (buf) Parser3{callbacks, userdata, stackSize}; return (WeaselJsonParser *)new (buf)
Parser3{callbacks, userdata, stackSize, flags};
} }
__attribute__((visibility("default"))) void __attribute__((visibility("default"))) void

View File

@@ -64,9 +64,10 @@ enum Symbol : uint8_t {
N_SYMBOL_COUNT, // Must be last N_SYMBOL_COUNT, // Must be last
}; };
struct Parser3 { struct Parser3 {
Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize) Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize,
: callbacks(callbacks), userdata(userdata), int flags)
stackEnd(stack() + stackSize) { : callbacks(callbacks), userdata(userdata), stackEnd(stack() + stackSize),
flags(flags) {
reset(); reset();
} }
@@ -80,8 +81,13 @@ struct Parser3 {
} }
} }
void flushString(bool done) { void flushString(bool done, char *buf) {
int len = writeBuf - dataBegin; int len;
if (!(flags & WeaselJsonRaw)) {
len = writeBuf - dataBegin;
} else {
len = buf - dataBegin;
}
assert(len >= 0); assert(len >= 0);
if (done || len > 0) { if (done || len > 0) {
callbacks->on_string_data(userdata, dataBegin, len, done); callbacks->on_string_data(userdata, dataBegin, len, done);
@@ -129,6 +135,7 @@ struct Parser3 {
void *const userdata; void *const userdata;
Symbol *stackPtr; Symbol *stackPtr;
Symbol *const stackEnd; Symbol *const stackEnd;
int const flags;
uint32_t utf8Codepoint; uint32_t utf8Codepoint;
uint32_t utf16Surrogate; uint32_t utf16Surrogate;
uint32_t minCodepoint; uint32_t minCodepoint;
@@ -213,13 +220,15 @@ inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self,
buf = (char *)self->strDfa.scan(buf, bufEnd); buf = (char *)self->strDfa.scan(buf, bufEnd);
int len = buf - before; int len = buf - before;
if (!(self->flags & WeaselJsonRaw)) {
if (self->writeBuf != before) { if (self->writeBuf != before) {
memmove(self->writeBuf, before, len); memmove(self->writeBuf, before, len);
} }
self->writeBuf += len; self->writeBuf += len;
}
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
@@ -531,7 +540,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
} }
switch (*buf) { switch (*buf) {
case '"': case '"':
self->flushString(true); self->flushString(true, buf);
++buf; ++buf;
self->pop(); self->pop();
if (buf == bufEnd) { if (buf == bufEnd) {
@@ -545,7 +554,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
return s; return s;
} }
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -571,6 +580,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
return WeaselJson_REJECT; return WeaselJson_REJECT;
} }
buf += 6; buf += 6;
if (!(self->flags & WeaselJsonRaw)) {
assert(codepoint <= 0x10ffff); assert(codepoint <= 0x10ffff);
self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000; self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6; codepoint >>= 6;
@@ -580,7 +590,9 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
codepoint >>= 6; codepoint >>= 6;
self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000; self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000;
self->writeBuf += 4; self->writeBuf += 4;
}
} else { } else {
if (!(self->flags & WeaselJsonRaw)) {
if (codepoint < 0x80) { if (codepoint < 0x80) {
*self->writeBuf++ = codepoint; *self->writeBuf++ = codepoint;
} else if (codepoint < 0x800) { } else if (codepoint < 0x800) {
@@ -598,15 +610,19 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
self->writeBuf += 3; self->writeBuf += 3;
} }
} }
}
} else { } else {
auto unescaped = tables.unescape[uint8_t(*buf++)]; auto unescaped = tables.unescape[uint8_t(*buf)];
if (unescaped == 0) [[unlikely]] { if (unescaped == 0) [[unlikely]] {
return WeaselJson_REJECT; return WeaselJson_REJECT;
} }
if (!(self->flags & WeaselJsonRaw)) {
*self->writeBuf++ = unescaped; *self->writeBuf++ = unescaped;
} }
++buf;
}
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return n_string2(self, buf, bufEnd); MUSTTAIL return n_string2(self, buf, bufEnd);
@@ -632,7 +648,10 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
case 'n': case 'n':
case 'r': case 'r':
case 't': case 't':
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)]; if (!(self->flags & WeaselJsonRaw)) {
*self->writeBuf++ = tables.unescape[uint8_t(*buf)];
}
++buf;
self->pop(); self->pop();
break; break;
case 'u': case 'u':
@@ -647,7 +666,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
[[unlikely]] return WeaselJson_REJECT; [[unlikely]] return WeaselJson_REJECT;
} }
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -667,7 +686,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex(Parser3 *self, char *buf,
++buf; ++buf;
self->pop(); self->pop();
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -690,13 +709,16 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
// there's not room, flush, write into a temp buffer, and flush again. // there's not room, flush, write into a temp buffer, and flush again.
char tmp[3]; char tmp[3];
if (self->utf8Codepoint < 0x80) { if (self->utf8Codepoint < 0x80) {
if (!(self->flags & WeaselJsonRaw)) {
assert(buf - self->writeBuf >= 1); assert(buf - self->writeBuf >= 1);
*self->writeBuf++ = self->utf8Codepoint; *self->writeBuf++ = self->utf8Codepoint;
}
} else if (self->utf8Codepoint < 0x800) { } else if (self->utf8Codepoint < 0x800) {
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 2; bool useTmp = buf - self->writeBuf < 2;
char *p = tmp; char *p = tmp;
if (useTmp) [[unlikely]] { if (useTmp) [[unlikely]] {
self->flushString(false); self->flushString(false, buf);
} }
auto &w = useTmp ? p : self->writeBuf; auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -706,6 +728,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
if (useTmp) [[unlikely]] { if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 2, false); self->callbacks->on_string_data(self->userdata, tmp, 2, false);
} }
}
} else { } else {
assert(self->utf8Codepoint < 0x10000); assert(self->utf8Codepoint < 0x10000);
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) { if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
@@ -718,15 +741,16 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
return s; return s;
} }
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
} }
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 3; bool useTmp = buf - self->writeBuf < 3;
char *p = tmp; char *p = tmp;
if (useTmp) [[unlikely]] { if (useTmp) [[unlikely]] {
self->flushString(false); self->flushString(false, buf);
} }
auto &w = useTmp ? p : self->writeBuf; auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000; w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -739,10 +763,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
self->callbacks->on_string_data(self->userdata, tmp, 3, false); self->callbacks->on_string_data(self->userdata, tmp, 3, false);
} }
} }
}
self->pop(); self->pop();
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -777,10 +802,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] { if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] {
return WeaselJson_REJECT; return WeaselJson_REJECT;
} }
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 4; bool useTmp = buf - self->writeBuf < 4;
char *p = tmp; char *p = tmp;
if (useTmp) [[unlikely]] { if (useTmp) [[unlikely]] {
self->flushString(false); self->flushString(false, buf);
} }
auto &w = useTmp ? p : self->writeBuf; auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000; w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -794,10 +820,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
if (useTmp) [[unlikely]] { if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 4, false); self->callbacks->on_string_data(self->userdata, tmp, 4, false);
} }
}
self->pop(); self->pop();
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -814,7 +841,7 @@ inline PRESERVE_NONE WeaselJsonStatus singleCharInString(Parser3 *self,
++buf; ++buf;
self->pop(); self->pop();
if (buf == bufEnd) { if (buf == bufEnd) {
self->flushString(false); self->flushString(false, buf);
return WeaselJson_AGAIN; return WeaselJson_AGAIN;
} }
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);

View File

@@ -205,7 +205,7 @@ TEST_CASE("parser3") {
TEST_CASE("streaming") { testStreaming(json); } TEST_CASE("streaming") { testStreaming(json); }
void doTestUnescapingUtf8(std::string const &escaped, void doTestUnescapingUtf8(std::string const &escaped,
std::string const &expected, int stride) { std::string const &expected, int stride, int flags) {
CAPTURE(escaped); CAPTURE(escaped);
CAPTURE(expected); CAPTURE(expected);
CAPTURE(stride); CAPTURE(stride);
@@ -215,7 +215,7 @@ void doTestUnescapingUtf8(std::string const &escaped,
auto &s = *(std::string *)p; auto &s = *(std::string *)p;
s.append(buf, len); s.append(buf, len);
}; };
auto *parser = WeaselJsonParser_create(1024, &c, &result, 0); auto *parser = WeaselJsonParser_create(1024, &c, &result, flags);
auto copy = escaped; auto copy = escaped;
for (size_t i = 0; i < copy.size(); i += stride) { for (size_t i = 0; i < copy.size(); i += stride) {
CAPTURE(i); CAPTURE(i);
@@ -233,8 +233,11 @@ void testUnescapingUtf8(std::string const &escaped,
std::string const &expected) { std::string const &expected) {
for (int stride = 0; stride < 10; ++stride) { for (int stride = 0; stride < 10; ++stride) {
doTestUnescapingUtf8(escaped, expected, doTestUnescapingUtf8(escaped, expected,
stride == 0 ? std::numeric_limits<int>::max() stride == 0 ? std::numeric_limits<int>::max() : stride,
: stride); 0);
doTestUnescapingUtf8(
escaped, escaped.substr(1).substr(0, escaped.size() - 2),
stride == 0 ? std::numeric_limits<int>::max() : stride, WeaselJsonRaw);
} }
} }
@@ -376,6 +379,7 @@ TEST_CASE("bench input types") {
bench.doNotOptimizeAway(doc); bench.doNotOptimizeAway(doc);
}); });
{
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0);
bench.run("parser3 " + name, [&]() { bench.run("parser3 " + name, [&]() {
auto copy = json; auto copy = json;
@@ -389,6 +393,22 @@ TEST_CASE("bench input types") {
} }
}); });
WeaselJsonParser_destroy(parser); WeaselJsonParser_destroy(parser);
}
{
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, WeaselJsonRaw);
bench.run("parser3 (raw) " + name, [&]() {
auto copy = json;
WeaselJsonParser_reset(parser);
if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) !=
WeaselJson_AGAIN) {
abort();
}
if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) {
abort();
}
});
WeaselJsonParser_destroy(parser);
}
}; };
bench("numbers", "[-123456789.000000000000000123456789e+12, " bench("numbers", "[-123456789.000000000000000123456789e+12, "