Merge remote-tracking branch 'origin/flags'

This commit is contained in:
2025-08-04 12:36:20 -04:00
8 changed files with 180 additions and 118 deletions

View File

@@ -36,13 +36,18 @@ enum WeaselJsonStatus {
typedef struct WeaselJsonParser WeaselJsonParser;
/** Option bits for the `flags` argument of WeaselJsonParser_create. The
 * parser tests each option with a bitwise AND against `flags`; passing 0
 * selects none of them. */
enum WeaselJsonFlags {
/** Do not unescape strings or write to the supplied buffer at all. String
 * data is then reported as the bytes that appear in the input, escapes
 * included. */
WeaselJsonRaw = 1,
};
/** Create a parser. Increasing `stackSize` increases memory usage but also
* increases the depth of nested json accepted. `callbacks` and `userdata` must
* outlive the returned parser. `flags` is 0 or a combination of
* WeaselJsonFlags values (e.g. WeaselJsonRaw). Returns null if there is
* insufficient available memory. */
WeaselJsonParser *WeaselJsonParser_create(int stackSize,
const WeaselJsonCallbacks *callbacks,
void *userdata);
void *userdata, int flags);
/** Restore the parser to its newly-created state */
void WeaselJsonParser_reset(WeaselJsonParser *parser);

View File

@@ -10,7 +10,9 @@ std::pair<std::string, WeaselJsonStatus> runStreaming(std::string copy,
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
if (stride == 0) {
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (s != WeaselJson_AGAIN) {
@@ -33,7 +35,9 @@ std::pair<std::string, WeaselJsonStatus> runBatch(std::string copy) {
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (s != WeaselJson_AGAIN) {
return {state.result, s};
@@ -47,7 +51,9 @@ std::pair<std::string, WeaselJsonStatus> runPrefix(std::string copy,
SerializeState state;
auto c = serializeCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state,
copy.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix);
if (s != WeaselJson_AGAIN) {
return {state.result, s};
@@ -116,7 +122,8 @@ void compareWithSimdjson(std::string const &json) {
auto copy = json;
auto c = noopCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)>
parser{WeaselJsonParser_create(1024, &c, nullptr),
parser{WeaselJsonParser_create(
1024, &c, nullptr, json.size() % 2 == 0 ? WeaselJsonRaw : 0),
WeaselJsonParser_destroy};
ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size());
if (ours == WeaselJson_AGAIN) {

View File

@@ -196,7 +196,7 @@ inline std::optional<JsonValue> toValue(std::string copy, int stride) {
ReadValueState state;
auto c = readValueCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy};
if (stride == 0) {
if (WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()) !=
WeaselJson_AGAIN) {

View File

@@ -7,12 +7,13 @@ extern "C" {
// Exported factory for the parser. Allocates one malloc'd buffer sized for a
// Parser3 plus `stackSize` elements of the type Parser3::stackPtr points to,
// then constructs the Parser3 in that buffer with placement new. Returns
// nullptr when malloc fails.
// NOTE(review): `stackSize` goes into the size computation unchecked; a
// negative or very large value is presumably a caller error -- confirm whether
// it should be validated here.
__attribute__((visibility("default"))) WeaselJsonParser *
WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks,
void *userdata) {
void *userdata, int flags) {
auto *buf = malloc(sizeof(Parser3) + stackSize * sizeof(*Parser3::stackPtr));
if (buf == nullptr) {
return nullptr;
}
return (WeaselJsonParser *)new (buf) Parser3{callbacks, userdata, stackSize};
return (WeaselJsonParser *)new (buf)
Parser3{callbacks, userdata, stackSize, flags};
}
__attribute__((visibility("default"))) void

View File

@@ -64,9 +64,10 @@ enum Symbol : uint8_t {
N_SYMBOL_COUNT, // Must be last
};
struct Parser3 {
Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize)
: callbacks(callbacks), userdata(userdata),
stackEnd(stack() + stackSize) {
Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize,
int flags)
: callbacks(callbacks), userdata(userdata), stackEnd(stack() + stackSize),
flags(flags) {
reset();
}
@@ -80,8 +81,13 @@ struct Parser3 {
}
}
void flushString(bool done) {
int len = writeBuf - dataBegin;
void flushString(bool done, char *buf) {
int len;
if (!(flags & WeaselJsonRaw)) {
len = writeBuf - dataBegin;
} else {
len = buf - dataBegin;
}
assert(len >= 0);
if (done || len > 0) {
callbacks->on_string_data(userdata, dataBegin, len, done);
@@ -129,6 +135,7 @@ struct Parser3 {
void *const userdata;
Symbol *stackPtr;
Symbol *const stackEnd;
int const flags;
uint32_t utf8Codepoint;
uint32_t utf16Surrogate;
uint32_t minCodepoint;
@@ -213,13 +220,15 @@ inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self,
buf = (char *)self->strDfa.scan(buf, bufEnd);
int len = buf - before;
if (!(self->flags & WeaselJsonRaw)) {
if (self->writeBuf != before) {
memmove(self->writeBuf, before, len);
}
self->writeBuf += len;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
@@ -531,7 +540,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
}
switch (*buf) {
case '"':
self->flushString(true);
self->flushString(true, buf);
++buf;
self->pop();
if (buf == bufEnd) {
@@ -545,7 +554,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
return s;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -571,6 +580,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
return WeaselJson_REJECT;
}
buf += 6;
if (!(self->flags & WeaselJsonRaw)) {
assert(codepoint <= 0x10ffff);
self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000;
codepoint >>= 6;
@@ -580,7 +590,9 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
codepoint >>= 6;
self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000;
self->writeBuf += 4;
}
} else {
if (!(self->flags & WeaselJsonRaw)) {
if (codepoint < 0x80) {
*self->writeBuf++ = codepoint;
} else if (codepoint < 0x800) {
@@ -598,15 +610,19 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
self->writeBuf += 3;
}
}
}
} else {
auto unescaped = tables.unescape[uint8_t(*buf++)];
auto unescaped = tables.unescape[uint8_t(*buf)];
if (unescaped == 0) [[unlikely]] {
return WeaselJson_REJECT;
}
if (!(self->flags & WeaselJsonRaw)) {
*self->writeBuf++ = unescaped;
}
++buf;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return n_string2(self, buf, bufEnd);
@@ -632,7 +648,10 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
case 'n':
case 'r':
case 't':
*self->writeBuf++ = tables.unescape[uint8_t(*buf++)];
if (!(self->flags & WeaselJsonRaw)) {
*self->writeBuf++ = tables.unescape[uint8_t(*buf)];
}
++buf;
self->pop();
break;
case 'u':
@@ -647,7 +666,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
[[unlikely]] return WeaselJson_REJECT;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -667,7 +686,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex(Parser3 *self, char *buf,
++buf;
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -690,13 +709,16 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
// there's not room, flush, write into a temp buffer, and flush again.
char tmp[3];
if (self->utf8Codepoint < 0x80) {
if (!(self->flags & WeaselJsonRaw)) {
assert(buf - self->writeBuf >= 1);
*self->writeBuf++ = self->utf8Codepoint;
}
} else if (self->utf8Codepoint < 0x800) {
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 2;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
self->flushString(false, buf);
}
auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -706,6 +728,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 2, false);
}
}
} else {
assert(self->utf8Codepoint < 0x10000);
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
@@ -718,15 +741,16 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
return s;
}
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 3;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
self->flushString(false, buf);
}
auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -739,10 +763,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
self->callbacks->on_string_data(self->userdata, tmp, 3, false);
}
}
}
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -777,10 +802,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] {
return WeaselJson_REJECT;
}
if (!(self->flags & WeaselJsonRaw)) {
bool useTmp = buf - self->writeBuf < 4;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
self->flushString(false, buf);
}
auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
@@ -794,10 +820,11 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 4, false);
}
}
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
@@ -814,7 +841,7 @@ inline PRESERVE_NONE WeaselJsonStatus singleCharInString(Parser3 *self,
++buf;
self->pop();
if (buf == bufEnd) {
self->flushString(false);
self->flushString(false, buf);
return WeaselJson_AGAIN;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);

View File

@@ -133,7 +133,7 @@ void testStreaming(std::string const &json) {
auto c = serializeCallbacks();
{
auto copy = json;
auto *parser = WeaselJsonParser_create(1024, &c, &streaming);
auto *parser = WeaselJsonParser_create(1024, &c, &streaming, 0);
for (size_t i = 0; i < copy.size(); ++i) {
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
WeaselJson_AGAIN);
@@ -143,7 +143,7 @@ void testStreaming(std::string const &json) {
}
{
auto copy = json;
auto *parser = WeaselJsonParser_create(1024, &c, &batch);
auto *parser = WeaselJsonParser_create(1024, &c, &batch, 0);
REQUIRE(WeaselJsonParser_parse(parser, copy.data(), copy.size()) ==
WeaselJson_AGAIN);
REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK);
@@ -159,7 +159,7 @@ TEST_CASE("parser3") {
SerializeState state;
{
auto copy = json;
auto *parser = WeaselJsonParser_create(1024, &c, &state);
auto *parser = WeaselJsonParser_create(1024, &c, &state, 0);
for (size_t i = 0; i < copy.size(); ++i) {
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
WeaselJson_AGAIN);
@@ -169,7 +169,7 @@ TEST_CASE("parser3") {
}
{
std::string copy = "{\"x\": [], \"y\": {}}";
auto *parser = WeaselJsonParser_create(1024, &c, &state);
auto *parser = WeaselJsonParser_create(1024, &c, &state, 0);
for (size_t i = 0; i < copy.size(); ++i) {
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
WeaselJson_AGAIN);
@@ -181,7 +181,7 @@ TEST_CASE("parser3") {
{
auto c = noopCallbacks();
std::string copy = "{\"a\":\"a";
auto *parser = WeaselJsonParser_create(1024, &c, &state);
auto *parser = WeaselJsonParser_create(1024, &c, &state, 0);
for (size_t i = 0; i < copy.size(); ++i) {
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
WeaselJson_AGAIN);
@@ -192,7 +192,7 @@ TEST_CASE("parser3") {
{
auto c = noopCallbacks();
std::string copy = "[";
auto *parser = WeaselJsonParser_create(1024, &c, &state);
auto *parser = WeaselJsonParser_create(1024, &c, &state, 0);
for (size_t i = 0; i < copy.size(); ++i) {
REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) ==
WeaselJson_AGAIN);
@@ -205,7 +205,7 @@ TEST_CASE("parser3") {
TEST_CASE("streaming") { testStreaming(json); }
void doTestUnescapingUtf8(std::string const &escaped,
std::string const &expected, int stride) {
std::string const &expected, int stride, int flags) {
CAPTURE(escaped);
CAPTURE(expected);
CAPTURE(stride);
@@ -215,7 +215,7 @@ void doTestUnescapingUtf8(std::string const &escaped,
auto &s = *(std::string *)p;
s.append(buf, len);
};
auto *parser = WeaselJsonParser_create(1024, &c, &result);
auto *parser = WeaselJsonParser_create(1024, &c, &result, flags);
auto copy = escaped;
for (size_t i = 0; i < copy.size(); i += stride) {
CAPTURE(i);
@@ -233,8 +233,11 @@ void testUnescapingUtf8(std::string const &escaped,
std::string const &expected) {
for (int stride = 0; stride < 10; ++stride) {
doTestUnescapingUtf8(escaped, expected,
stride == 0 ? std::numeric_limits<int>::max()
: stride);
stride == 0 ? std::numeric_limits<int>::max() : stride,
0);
doTestUnescapingUtf8(
escaped, escaped.substr(1).substr(0, escaped.size() - 2),
stride == 0 ? std::numeric_limits<int>::max() : stride, WeaselJsonRaw);
}
}
@@ -262,7 +265,7 @@ TEST_CASE("bench3") {
ankerl::nanobench::Bench bench;
bench.batch(json.size());
bench.unit("byte");
auto *parser = WeaselJsonParser_create(1024, &c, nullptr);
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0);
for (size_t stride = 128; stride <= json.size(); stride *= 2) {
bench.run("parser3 (stride: " + std::to_string(stride) + ")", [&]() {
auto copy = json;
@@ -376,7 +379,8 @@ TEST_CASE("bench input types") {
bench.doNotOptimizeAway(doc);
});
auto *parser = WeaselJsonParser_create(1024, &c, nullptr);
{
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0);
bench.run("parser3 " + name, [&]() {
auto copy = json;
WeaselJsonParser_reset(parser);
@@ -389,6 +393,22 @@ TEST_CASE("bench input types") {
}
});
WeaselJsonParser_destroy(parser);
}
{
auto *parser = WeaselJsonParser_create(1024, &c, nullptr, WeaselJsonRaw);
bench.run("parser3 (raw) " + name, [&]() {
auto copy = json;
WeaselJsonParser_reset(parser);
if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) !=
WeaselJson_AGAIN) {
abort();
}
if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) {
abort();
}
});
WeaselJsonParser_destroy(parser);
}
};
bench("numbers", "[-123456789.000000000000000123456789e+12, "

View File

@@ -17,7 +17,7 @@ int main(int argc, char **argv) {
}
auto c = noopCallbacks();
std::unique_ptr<WeaselJsonParser, decltype(&WeaselJsonParser_destroy)> parser{
WeaselJsonParser_create(1024, &c, nullptr), WeaselJsonParser_destroy};
WeaselJsonParser_create(1024, &c, nullptr, 0), WeaselJsonParser_destroy};
for (;;) {
char buf[1024];
int l = read(fd, buf, sizeof(buf));

View File

@@ -92,6 +92,7 @@ class WeaselJsonParser:
ctypes.c_int,
ctypes.POINTER(WeaselJsonCallbacks),
ctypes.c_void_p,
ctypes.c_int,
)
self._lib.WeaselJsonParser_create.restype = ctypes.c_void_p
self._lib.WeaselJsonParser_reset.argtypes = (ctypes.c_void_p,)
@@ -110,6 +111,7 @@ class WeaselJsonParser:
stackSize,
c_callbacks,
self.voidp_callbacks,
0,
)
def parse(self, data: bytes) -> WeaselJsonStatus: