From 3e72181beef559ff9ce0d54c935f12b79c270583 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 25 Jun 2025 16:44:31 -0400 Subject: [PATCH 1/2] Add flags argument to WeaselJsonParser_create --- include/weaseljson.h | 7 ++++++- src/fuzz.cpp | 8 ++++---- src/json_value.h | 2 +- src/lib.cpp | 2 +- src/test.cpp | 18 +++++++++--------- src/validate.cpp | 2 +- weaseljson.py | 2 ++ 7 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/weaseljson.h b/include/weaseljson.h index 1442376..93d734b 100644 --- a/include/weaseljson.h +++ b/include/weaseljson.h @@ -36,13 +36,18 @@ enum WeaselJsonStatus { typedef struct WeaselJsonParser WeaselJsonParser; +enum WeaselJsonFlags { + /** Do not unescape strings or write to the supplied buffer at all. */ + WeaselJsonRaw = 1, +}; + /** Create a parser. Increasing stack size increases memory usage but also * increases the depth of nested json accepted. `callbacks` and `userdata` must * outlive the returned parser. Returns null if there's insufficient available * memory */ WeaselJsonParser *WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks, - void *userdata); + void *userdata, int flags); /** Restore the parser to its newly-created state */ void WeaselJsonParser_reset(WeaselJsonParser *parser); diff --git a/src/fuzz.cpp b/src/fuzz.cpp index af3afdf..713c9f1 100644 --- a/src/fuzz.cpp +++ b/src/fuzz.cpp @@ -10,7 +10,7 @@ std::pair runStreaming(std::string copy, SerializeState state; auto c = serializeCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; if (stride == 0) { auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (s != WeaselJson_AGAIN) { @@ -33,7 +33,7 @@ std::pair runBatch(std::string copy) { SerializeState state; auto c = serializeCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (s != WeaselJson_AGAIN) { return {state.result, s}; @@ -47,7 +47,7 @@ std::pair runPrefix(std::string copy, SerializeState state; auto c = serializeCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix); if (s != WeaselJson_AGAIN) { return {state.result, s}; @@ -116,7 +116,7 @@ void compareWithSimdjson(std::string const &json) { auto copy = json; auto c = noopCallbacks(); std::unique_ptr - parser{WeaselJsonParser_create(1024, &c, nullptr), + parser{WeaselJsonParser_create(1024, &c, nullptr, 0), WeaselJsonParser_destroy}; ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (ours == WeaselJson_AGAIN) { diff --git a/src/json_value.h b/src/json_value.h index 496ed35..76b79f9 100644 --- a/src/json_value.h +++ b/src/json_value.h @@ -196,7 +196,7 @@ inline std::optional toValue(std::string copy, int stride) { ReadValueState state; auto c = readValueCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; if (stride == 0) { if (WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()) != WeaselJson_AGAIN) { diff --git a/src/lib.cpp b/src/lib.cpp index 24040f5..8b8b277 100644 --- a/src/lib.cpp +++ b/src/lib.cpp @@ -7,7 +7,7 @@ extern "C" { __attribute__((visibility("default"))) WeaselJsonParser * WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks, - void *userdata) { + void *userdata, int flags) { auto *buf = malloc(sizeof(Parser3) + stackSize * sizeof(*Parser3::stackPtr)); if (buf == nullptr) { return nullptr; diff --git a/src/test.cpp b/src/test.cpp index ee8f1d7..a263009 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -133,7 +133,7 @@ void testStreaming(std::string const &json) { auto c = serializeCallbacks(); { auto copy = json; - auto *parser = WeaselJsonParser_create(1024, &c, &streaming); + auto *parser = WeaselJsonParser_create(1024, &c, &streaming, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); @@ -143,7 +143,7 @@ void testStreaming(std::string const &json) { } { auto copy = json; - auto *parser = WeaselJsonParser_create(1024, &c, &batch); + auto *parser = WeaselJsonParser_create(1024, &c, &batch, 0); REQUIRE(WeaselJsonParser_parse(parser, copy.data(), copy.size()) == WeaselJson_AGAIN); REQUIRE(WeaselJsonParser_parse(parser, nullptr, 0) == WeaselJson_OK); @@ -159,7 +159,7 @@ TEST_CASE("parser3") { SerializeState state; { auto copy = json; - auto *parser = WeaselJsonParser_create(1024, &c, &state); + auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); @@ -169,7 +169,7 @@ TEST_CASE("parser3") { } { std::string copy = "{\"x\": [], \"y\": {}}"; - auto *parser = WeaselJsonParser_create(1024, &c, &state); + auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); @@ -181,7 +181,7 @@ TEST_CASE("parser3") { { auto c = noopCallbacks(); std::string copy = "{\"a\":\"a"; - auto *parser = WeaselJsonParser_create(1024, &c, &state); + auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); @@ -192,7 +192,7 @@ TEST_CASE("parser3") { { auto c = noopCallbacks(); std::string copy = "["; - auto *parser = WeaselJsonParser_create(1024, &c, &state); + auto *parser = WeaselJsonParser_create(1024, &c, &state, 0); for (size_t i = 0; i < copy.size(); ++i) { REQUIRE(WeaselJsonParser_parse(parser, copy.data() + i, 1) == WeaselJson_AGAIN); @@ -215,7 +215,7 @@ void doTestUnescapingUtf8(std::string const &escaped, auto &s = *(std::string *)p; s.append(buf, len); }; - auto *parser = WeaselJsonParser_create(1024, &c, &result); + auto *parser = WeaselJsonParser_create(1024, &c, &result, 0); auto copy = escaped; for (size_t i = 0; i < copy.size(); i += stride) { CAPTURE(i); @@ -262,7 +262,7 @@ TEST_CASE("bench3") { ankerl::nanobench::Bench bench; bench.batch(json.size()); bench.unit("byte"); - auto *parser = WeaselJsonParser_create(1024, &c, nullptr); + auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); for (size_t stride = 128; stride <= json.size(); stride *= 2) { bench.run("parser3 (stride: " + std::to_string(stride) + ")", [&]() { auto copy = json; @@ -376,7 +376,7 @@ TEST_CASE("bench input types") { bench.doNotOptimizeAway(doc); }); - auto *parser = WeaselJsonParser_create(1024, &c, nullptr); + auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); bench.run("parser3 " + name, [&]() { auto copy = json; WeaselJsonParser_reset(parser); diff --git a/src/validate.cpp b/src/validate.cpp index 42425da..abbd2fa 100644 --- a/src/validate.cpp +++ b/src/validate.cpp @@ -17,7 +17,7 @@ int main(int argc, char **argv) { } auto c = noopCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, nullptr), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, nullptr, 0), WeaselJsonParser_destroy}; for (;;) { char buf[1024]; int l = read(fd, buf, sizeof(buf)); diff --git a/weaseljson.py b/weaseljson.py index 405e10c..a39d953 100644 --- a/weaseljson.py +++ b/weaseljson.py @@ -92,6 +92,7 @@ class WeaselJsonParser: ctypes.c_int, ctypes.POINTER(WeaselJsonCallbacks), ctypes.c_void_p, + ctypes.c_int, ) self._lib.WeaselJsonParser_create.restype = ctypes.c_void_p self._lib.WeaselJsonParser_reset.argtypes = (ctypes.c_void_p,) @@ -110,6 +111,7 @@ class WeaselJsonParser: stackSize, c_callbacks, self.voidp_callbacks, + 0, ) def parse(self, data: bytes) -> WeaselJsonStatus: From 9e4f90f21807695f0d05daff6b0800ee4cc5b6e8 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 25 Jun 2025 18:44:37 -0400 Subject: [PATCH 2/2] Add WeaselJsonRaw flag --- src/fuzz.cpp | 15 +++- src/lib.cpp | 3 +- src/parser3.h | 197 ++++++++++++++++++++++++++++---------------------- src/test.cpp | 54 +++++++++----- 4 files changed, 162 insertions(+), 107 deletions(-) diff --git a/src/fuzz.cpp b/src/fuzz.cpp index 713c9f1..872879c 100644 --- a/src/fuzz.cpp +++ b/src/fuzz.cpp @@ -10,7 +10,9 @@ std::pair runStreaming(std::string copy, SerializeState state; auto c = serializeCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, + copy.size() % 2 == 0 ? WeaselJsonRaw : 0), + WeaselJsonParser_destroy}; if (stride == 0) { auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (s != WeaselJson_AGAIN) { @@ -33,7 +35,9 @@ std::pair runBatch(std::string copy) { SerializeState state; auto c = serializeCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, + copy.size() % 2 == 0 ? WeaselJsonRaw : 0), + WeaselJsonParser_destroy}; auto s = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (s != WeaselJson_AGAIN) { return {state.result, s}; @@ -47,7 +51,9 @@ std::pair runPrefix(std::string copy, SerializeState state; auto c = serializeCallbacks(); std::unique_ptr parser{ - WeaselJsonParser_create(1024, &c, &state, 0), WeaselJsonParser_destroy}; + WeaselJsonParser_create(1024, &c, &state, + copy.size() % 2 == 0 ? WeaselJsonRaw : 0), + WeaselJsonParser_destroy}; auto s = WeaselJsonParser_parse(parser.get(), copy.data(), prefix); if (s != WeaselJson_AGAIN) { return {state.result, s}; @@ -116,7 +122,8 @@ void compareWithSimdjson(std::string const &json) { auto copy = json; auto c = noopCallbacks(); std::unique_ptr - parser{WeaselJsonParser_create(1024, &c, nullptr, 0), + parser{WeaselJsonParser_create( + 1024, &c, nullptr, json.size() % 2 == 0 ? WeaselJsonRaw : 0), WeaselJsonParser_destroy}; ours = WeaselJsonParser_parse(parser.get(), copy.data(), copy.size()); if (ours == WeaselJson_AGAIN) { diff --git a/src/lib.cpp b/src/lib.cpp index 8b8b277..c2fb753 100644 --- a/src/lib.cpp +++ b/src/lib.cpp @@ -12,7 +12,8 @@ WeaselJsonParser_create(int stackSize, const WeaselJsonCallbacks *callbacks, if (buf == nullptr) { return nullptr; } - return (WeaselJsonParser *)new (buf) Parser3{callbacks, userdata, stackSize}; + return (WeaselJsonParser *)new (buf) + Parser3{callbacks, userdata, stackSize, flags}; } __attribute__((visibility("default"))) void diff --git a/src/parser3.h b/src/parser3.h index 63053d2..4d4f10c 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -64,9 +64,10 @@ enum Symbol : uint8_t { N_SYMBOL_COUNT, // Must be last }; struct Parser3 { - Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize) - : callbacks(callbacks), userdata(userdata), - stackEnd(stack() + stackSize) { + Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize, + int flags) + : callbacks(callbacks), userdata(userdata), stackEnd(stack() + stackSize), + flags(flags) { reset(); } @@ -80,8 +81,13 @@ struct Parser3 { } } - void flushString(bool done) { - int len = writeBuf - dataBegin; + void flushString(bool done, char *buf) { + int len; + if (!(flags & WeaselJsonRaw)) { + len = writeBuf - dataBegin; + } else { + len = buf - dataBegin; + } assert(len >= 0); if (done || len > 0) { callbacks->on_string_data(userdata, dataBegin, len, done); @@ -129,6 +135,7 @@ struct Parser3 { void *const userdata; Symbol *stackPtr; Symbol *const stackEnd; + int const flags; uint32_t utf8Codepoint; uint32_t utf16Surrogate; uint32_t minCodepoint; @@ -213,13 +220,15 @@ inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self, buf = (char *)self->strDfa.scan(buf, bufEnd); int len = buf - before; - if (self->writeBuf != before) { - memmove(self->writeBuf, before, len); + if (!(self->flags & WeaselJsonRaw)) { + if (self->writeBuf != before) { + memmove(self->writeBuf, before, len); + } + self->writeBuf += len; } - self->writeBuf += len; if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } @@ -531,7 +540,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf, } switch (*buf) { case '"': - self->flushString(true); + self->flushString(true, buf); ++buf; self->pop(); if (buf == bufEnd) { @@ -545,7 +554,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf, return s; } if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); @@ -571,42 +580,49 @@ inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf, return WeaselJson_REJECT; } buf += 6; - assert(codepoint <= 0x10ffff); - self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000; - codepoint >>= 6; - self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000; - codepoint >>= 6; - self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000; - codepoint >>= 6; - self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000; - self->writeBuf += 4; - } else { - if (codepoint < 0x80) { - *self->writeBuf++ = codepoint; - } else if (codepoint < 0x800) { - self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000; + if (!(self->flags & WeaselJsonRaw)) { + assert(codepoint <= 0x10ffff); + self->writeBuf[3] = (0b00111111 & codepoint) | 0b10000000; codepoint >>= 6; - self->writeBuf[0] = (0b00011111 & codepoint) | 0b11000000; - self->writeBuf += 2; - } else { - assert(codepoint < 0x10000); self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000; codepoint >>= 6; self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000; codepoint >>= 6; - self->writeBuf[0] = (0b00001111 & codepoint) | 0b11100000; - self->writeBuf += 3; + self->writeBuf[0] = (0b00000111 & codepoint) | 0b11110000; + self->writeBuf += 4; + } + } else { + if (!(self->flags & WeaselJsonRaw)) { + if (codepoint < 0x80) { + *self->writeBuf++ = codepoint; + } else if (codepoint < 0x800) { + self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000; + codepoint >>= 6; + self->writeBuf[0] = (0b00011111 & codepoint) | 0b11000000; + self->writeBuf += 2; + } else { + assert(codepoint < 0x10000); + self->writeBuf[2] = (0b00111111 & codepoint) | 0b10000000; + codepoint >>= 6; + self->writeBuf[1] = (0b00111111 & codepoint) | 0b10000000; + codepoint >>= 6; + self->writeBuf[0] = (0b00001111 & codepoint) | 0b11100000; + self->writeBuf += 3; + } } } } else { - auto unescaped = tables.unescape[uint8_t(*buf++)]; + auto unescaped = tables.unescape[uint8_t(*buf)]; if (unescaped == 0) [[unlikely]] { return WeaselJson_REJECT; } - *self->writeBuf++ = unescaped; + if (!(self->flags & WeaselJsonRaw)) { + *self->writeBuf++ = unescaped; + } + ++buf; } if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return n_string2(self, buf, bufEnd); @@ -632,7 +648,10 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, case 'n': case 'r': case 't': - *self->writeBuf++ = tables.unescape[uint8_t(*buf++)]; + if (!(self->flags & WeaselJsonRaw)) { + *self->writeBuf++ = tables.unescape[uint8_t(*buf)]; + } + ++buf; self->pop(); break; case 'u': @@ -647,7 +666,7 @@ inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self, [[unlikely]] return WeaselJson_REJECT; } if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); @@ -667,7 +686,7 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex(Parser3 *self, char *buf, ++buf; self->pop(); if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); @@ -690,21 +709,25 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf, // there's not room, flush, write into a temp buffer, and flush again. char tmp[3]; if (self->utf8Codepoint < 0x80) { - assert(buf - self->writeBuf >= 1); - *self->writeBuf++ = self->utf8Codepoint; - } else if (self->utf8Codepoint < 0x800) { - bool useTmp = buf - self->writeBuf < 2; - char *p = tmp; - if (useTmp) [[unlikely]] { - self->flushString(false); + if (!(self->flags & WeaselJsonRaw)) { + assert(buf - self->writeBuf >= 1); + *self->writeBuf++ = self->utf8Codepoint; } - auto &w = useTmp ? p : self->writeBuf; - w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; - self->utf8Codepoint >>= 6; - w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000; - w += 2; - if (useTmp) [[unlikely]] { - self->callbacks->on_string_data(self->userdata, tmp, 2, false); + } else if (self->utf8Codepoint < 0x800) { + if (!(self->flags & WeaselJsonRaw)) { + bool useTmp = buf - self->writeBuf < 2; + char *p = tmp; + if (useTmp) [[unlikely]] { + self->flushString(false, buf); + } + auto &w = useTmp ? p : self->writeBuf; + w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; + self->utf8Codepoint >>= 6; + w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000; + w += 2; + if (useTmp) [[unlikely]] { + self->callbacks->on_string_data(self->userdata, tmp, 2, false); + } } } else { assert(self->utf8Codepoint < 0x10000); @@ -718,31 +741,33 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf, return s; } if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); } - bool useTmp = buf - self->writeBuf < 3; - char *p = tmp; - if (useTmp) [[unlikely]] { - self->flushString(false); - } - auto &w = useTmp ? p : self->writeBuf; - w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000; - self->utf8Codepoint >>= 6; - w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; - self->utf8Codepoint >>= 6; - w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000; - w += 3; - if (useTmp) [[unlikely]] { - self->callbacks->on_string_data(self->userdata, tmp, 3, false); + if (!(self->flags & WeaselJsonRaw)) { + bool useTmp = buf - self->writeBuf < 3; + char *p = tmp; + if (useTmp) [[unlikely]] { + self->flushString(false, buf); + } + auto &w = useTmp ? p : self->writeBuf; + w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000; + self->utf8Codepoint >>= 6; + w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; + self->utf8Codepoint >>= 6; + w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000; + w += 3; + if (useTmp) [[unlikely]] { + self->callbacks->on_string_data(self->userdata, tmp, 3, false); + } } } self->pop(); if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); @@ -777,27 +802,29 @@ inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf, if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] { return WeaselJson_REJECT; } - bool useTmp = buf - self->writeBuf < 4; - char *p = tmp; - if (useTmp) [[unlikely]] { - self->flushString(false); - } - auto &w = useTmp ? p : self->writeBuf; - w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000; - self->utf8Codepoint >>= 6; - w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000; - self->utf8Codepoint >>= 6; - w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; - self->utf8Codepoint >>= 6; - w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000; - w += 4; - if (useTmp) [[unlikely]] { - self->callbacks->on_string_data(self->userdata, tmp, 4, false); + if (!(self->flags & WeaselJsonRaw)) { + bool useTmp = buf - self->writeBuf < 4; + char *p = tmp; + if (useTmp) [[unlikely]] { + self->flushString(false, buf); + } + auto &w = useTmp ? p : self->writeBuf; + w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000; + self->utf8Codepoint >>= 6; + w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000; + self->utf8Codepoint >>= 6; + w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; + self->utf8Codepoint >>= 6; + w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000; + w += 4; + if (useTmp) [[unlikely]] { + self->callbacks->on_string_data(self->userdata, tmp, 4, false); + } } self->pop(); if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); @@ -814,7 +841,7 @@ inline PRESERVE_NONE WeaselJsonStatus singleCharInString(Parser3 *self, ++buf; self->pop(); if (buf == bufEnd) { - self->flushString(false); + self->flushString(false, buf); return WeaselJson_AGAIN; } MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd); diff --git a/src/test.cpp b/src/test.cpp index a263009..eb70738 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -205,7 +205,7 @@ TEST_CASE("parser3") { TEST_CASE("streaming") { testStreaming(json); } void doTestUnescapingUtf8(std::string const &escaped, - std::string const &expected, int stride) { + std::string const &expected, int stride, int flags) { CAPTURE(escaped); CAPTURE(expected); CAPTURE(stride); @@ -215,7 +215,7 @@ void doTestUnescapingUtf8(std::string const &escaped, auto &s = *(std::string *)p; s.append(buf, len); }; - auto *parser = WeaselJsonParser_create(1024, &c, &result, 0); + auto *parser = WeaselJsonParser_create(1024, &c, &result, flags); auto copy = escaped; for (size_t i = 0; i < copy.size(); i += stride) { CAPTURE(i); @@ -233,8 +233,11 @@ void testUnescapingUtf8(std::string const &escaped, std::string const &expected) { for (int stride = 0; stride < 10; ++stride) { doTestUnescapingUtf8(escaped, expected, - stride == 0 ? std::numeric_limits::max() - : stride); + stride == 0 ? std::numeric_limits::max() : stride, + 0); + doTestUnescapingUtf8( + escaped, escaped.substr(1).substr(0, escaped.size() - 2), + stride == 0 ? std::numeric_limits::max() : stride, WeaselJsonRaw); } } @@ -376,19 +379,36 @@ TEST_CASE("bench input types") { bench.doNotOptimizeAway(doc); }); - auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); - bench.run("parser3 " + name, [&]() { - auto copy = json; - WeaselJsonParser_reset(parser); - if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) != - WeaselJson_AGAIN) { - abort(); - } - if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) { - abort(); - } - }); - WeaselJsonParser_destroy(parser); + { + auto *parser = WeaselJsonParser_create(1024, &c, nullptr, 0); + bench.run("parser3 " + name, [&]() { + auto copy = json; + WeaselJsonParser_reset(parser); + if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) != + WeaselJson_AGAIN) { + abort(); + } + if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) { + abort(); + } + }); + WeaselJsonParser_destroy(parser); + } + { + auto *parser = WeaselJsonParser_create(1024, &c, nullptr, WeaselJsonRaw); + bench.run("parser3 (raw) " + name, [&]() { + auto copy = json; + WeaselJsonParser_reset(parser); + if (WeaselJsonParser_parse(parser, copy.data(), copy.size()) != + WeaselJson_AGAIN) { + abort(); + } + if (WeaselJsonParser_parse(parser, nullptr, 0) != WeaselJson_OK) { + abort(); + } + }); + WeaselJsonParser_destroy(parser); + } }; bench("numbers", "[-123456789.000000000000000123456789e+12, "