Add WeaselJsonRaw flag

This commit is contained in:
2025-06-25 18:44:37 -04:00
parent 3e72181bee
commit 9e4f90f218
4 changed files with 162 additions and 107 deletions

View File

@@ -10,7 +10,9 @@ std::pair<std::string, WeaselJsonStatus> runStreaming(std::string copy,
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
if (stride == 0) {
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (s != WeaselJson_AGAIN) {
@@ -33,7 +35,9 @@ std::pair<std::string, WeaselJsonStatus> runBatch(std::string copy) {
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (s != WeaselJson_AGAIN) {
return {state.result, s};
@@ -47,7 +51,9 @@ std::pair<std::string, WeaselJsonStatus> runPrefix(std::string copy,
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix);
if (s != WeaselJson_AGAIN) {
return {state.result, s};
@@ -116,7 +122,8 @@ void compareWithSimdjson(std::string const &json) {
auto copy = json;
auto c = noopCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)>
parser{WeaselJsonParser_create(1024, &c, nullptr, 0),
parser{WeaselJsonParser_create(
1024, &c, nullptr, json.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (ours == WeaselJson_AGAIN) {

View File

@@ -12,7 +12,8 @@ WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks,
if (buf == nullptr) {
return nullptr;
}
return (WeaselJsonParser *)new (buf) Parser3{callbacks, userdata, stackSize};
return (WeaselJsonParser *)new (buf)
Parser3{callbacks, userdata, stackSize, flags};
}
__attribute__((visibility("default"))) void

View File

@@ -64,9 +64,10 @@ enum Symbol : uint8_t {
N_SYMBOL_COUNT, // Must be last
};
struct Parser3 {
Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize)
: callbacks(callbacks), userdata(userdata),
stackEnd(stack() + stackSize) {
// Constructs a parser: stores the callback table, the opaque userdata pointer
// and the flag bits, sets stackEnd to stack() + stackSize, then calls reset().
// The string-handling code in this file tests `flags` against WeaselJsonRaw to
// decide whether to write into writeBuf.
Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize,
int flags)
: callbacks(callbacks), userdata(userdata), stackEnd(stack() + stackSize),
flags(flags) {
reset();
}
@@ -80,8 +81,13 @@ struct Parser3 {
}
}
void flushString(bool done) {
int len = writeBuf - dataBegin;
void flushString(bool done, char *buf) {
int len;
if (!(flags & WeaselJsonRaw)) {
len = writeBuf - dataBegin;
} else {
len = buf - dataBegin;
}
assert(len >= 0);
if (done || len > 0) {
callbacks->on_string_data(userdata, dataBegin, len, done);
@@ -129,6 +135,7 @@ struct Parser3 {
void *const userdata;
Symbol *stackPtr;
Symbol *const stackEnd;
int const flags;
uint32_t utf8Codepoint;
uint32_t utf16Surrogate;
uint32_t minCodepoint;
@@ -213,13 +220,15 @@ inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self,
buf = (char *)self->strDfa.scan(buf, bufEnd);
int len = buf - before;
if (!(self->flags & WeaselJsonRaw)) {
if (self->writeBuf != before) {
memmove(self->writeBuf, before, len);
}
self->writeBuf += len;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
@@ -531,7 +540,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
}
switch (*buf) {
case '"':
self->flushString(true);
self->flushString(true, buf);
++buf;
self->pop();
if (buf == bufEnd) {
@@ -545,7 +554,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
return s;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -571,6 +580,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
return WeaselJson_REJECT;
}
buf += 6;
if (!(self->flags & WeaselJsonRaw)) {
assert(codepoint <= 0x10ffff);
self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
@@ -580,7 +590,9 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
codepoint >>= 6;
self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000;
self->writeBuf += 4;
}
} else {
if (!(self->flags & WeaselJsonRaw)) {
if (codepoint < 0x80) {
*self->writeBuf++ = codepoint;
} else if (codepoint < 0x800) {
@@ -598,15 +610,19 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
self->writeBuf += 3;
}
}
}
} else {
auto unescaped = tables.unescape[uint8_t(*buf++)];
auto unescaped = tables.unescape[uint8_t(*buf)];
if (unescaped == 0) [[unlikely]] {
return WeaselJson_REJECT;
}
if (!(self->flags & WeaselJsonRaw)) {
*self->writeBuf++ = unescaped;
}
++buf;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return n_string2(self, buf, bufEnd);
@@ -632,7 +648,10 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
case 'n':
case 'r':
case 't':
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)];
if (!(self->flags & WeaselJsonRaw)) {
*self->writeBuf++ = tables.unescape[uint8_t(*buf)];
}
++buf;
self->pop();
break;
case 'u':
@@ -647,7 +666,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
[[unlikely]] return WeaselJson_REJECT;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -667,7 +686,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex(Parser3 *self, char *buf,
++buf;
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -690,13 +709,16 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
// there's not room, flush, write into a temp buffer, and flush again.
char tmp[3];
if (self->utf8Codepoint < 0x80) {
if (!(self->flags & WeaselJsonRaw)) {
assert(buf - self->writeBuf >= 1);
*self->writeBuf++ = self->utf8Codepoint;
}
} else if (self->utf8Codepoint < 0x800) {
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 2;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
self->flushString(false, buf);
}
auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -706,6 +728,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 2, false);
}
}
} else {
assert(self->utf8Codepoint < 0x10000);
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
@@ -718,15 +741,16 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
return s;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 3;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
self->flushString(false, buf);
}
auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -739,10 +763,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
self->callbacks->on_string_data(self->userdata, tmp, 3, false);
}
}
}
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -777,10 +802,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] {
return WeaselJson_REJECT;
}
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 4;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
self->flushString(false, buf);
}
auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -794,10 +820,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 4, false);
}
}
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -814,7 +841,7 @@ inline PRESERVE_NONE WeaselJsonStatus singleCharInString(Parser3 *self,
++buf;
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);

View File

@@ -205,7 +205,7 @@ TEST_CASE("parser3") {
TEST_CASE("streaming") { testStreaming(json); }
void doTestUnescapingUtf8(std::string const &escaped,
std::string const &expected, int stride) {
std::string const &expected, int stride, int flags) {
CAPTURE(escaped);
CAPTURE(expected);
CAPTURE(stride);
@@ -215,7 +215,7 @@ void doTestUnescapingUtf8(std::string const &escaped,
auto &s = *(std::string *)p;
s.append(buf, len);
};
auto *parser = WeaselJsonParser_create(1024, &c, &result, 0);
auto *parser = WeaselJsonParser_create(1024, &c, &result, flags);
auto copy = escaped;
for (size_t i = 0; i < copy.size(); i += stride) {
CAPTURE(i);
@@ -233,8 +233,11 @@ void testUnescapingUtf8(std::string const &escaped,
std::string const &expected) {
for (int stride = 0; stride < 10; ++stride) {
doTestUnescapingUtf8(escaped, expected,
stride == 0 ? std::numeric_limits<int>::max()
: stride);
stride == 0 ? std::numeric_limits<int>::max() : stride,
0);
doTestUnescapingUtf8(
escaped, escaped.substr(1).substr(0, escaped.size() - 2),
stride == 0 ? std::numeric_limits<int>::max() : stride, WeaselJsonRaw);
}
}
@@ -376,6 +379,7 @@ TEST_CASE("bench input types") {
bench.doNotOptimizeAway(doc);
});
{
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0);
bench.run("parser3 " + name, [&]() {
auto copy = json;
@@ -389,6 +393,22 @@ TEST_CASE("bench input types") {
}
});
WeaselJsonParser_destroy(parser);
}
// Benchmark case for a parser created with the WeaselJsonRaw flag, reported
// under the name "parser3 (raw) " + name.
{
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, WeaselJsonRaw);
bench.run("parser3 (raw) " + name, [&]() {
// Each iteration parses a fresh copy of the input with a reset parser.
auto copy = json;
WeaselJsonParser_reset(parser);
// Feeding the whole buffer is expected to report AGAIN; any other status
// aborts the benchmark.
if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) !=
WeaselJson_AGAIN) {
abort();
}
// The follow-up call with a null buffer and length 0 must report OK.
if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) {
abort();
}
});
// The parser is created once outside the timed lambda and destroyed here.
WeaselJsonParser_destroy(parser);
}
};
bench("numbers", "[-123456789.000000000000000123456789e+12, "