From bf30eabdfc2639f5238bce4e133e9f173cccf354 Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Mon, 19 May 2025 15:11:30 -0400
Subject: [PATCH] Match simdjson behavior for surrogate pairs

As far as I can tell, this matches what simdjson does with \u escapes
that form a UTF-16 surrogate pair:

- When combining a surrogate pair, reject the input unless the second
  code unit is a low surrogate (0xdc00-0xdfff).
- A combined pair is always written as a 4-byte UTF-8 sequence, so the
  1-, 2- and 3-byte encoding branches on this path are removed.
- The fuzzer no longer skips inputs for which simdjson reports
  STRING_ERROR; those results are now compared as well.
---
Review note: the assert in front of the 4-byte UTF-8 write now checks
`>= 0x10000` instead of `< 0x10000`. The code point is computed as
0x10000 + (utf16Surrogate - 0xd800) * 0x400 + (utf8Codepoint - 0xdc00),
so, assuming utf16Surrogate holds a high surrogate (0xd800-0xdbff), the
previous condition failed for every accepted pair whenever assertions
are enabled. Leading whitespace in the hunks below was reconstructed
(2-space indent); apply with `git am --ignore-whitespace` if needed.

 src/fuzz.cpp  |  4 ---
 src/parser3.h | 81 +++++++++++++++------------------------------------
 2 files changed, 24 insertions(+), 61 deletions(-)

diff --git a/src/fuzz.cpp b/src/fuzz.cpp
index bd43121..8430c93 100644
--- a/src/fuzz.cpp
+++ b/src/fuzz.cpp
@@ -83,10 +83,6 @@ void compareWithSimdjson(std::string const &json) {
     // This gets returned for precision errors sometimes?
     return;
   }
-  if (theirs == simdjson::STRING_ERROR) {
-    // why god why god do I gotta suffer
-    return;
-  }
   if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
     // We don't validate the precision of numbers
     return;
diff --git a/src/parser3.h b/src/parser3.h
index 3475c4e..368efa5 100644
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
   }
   ++self->buf;
 
+  if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
+    return S_REJECT;
+  }
+
   // Decode utf16 surrogate pair
   self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
                         (self->utf8Codepoint - 0xdc00);
@@ -640,63 +644,26 @@ inline Status t_hex3(Parser3 *self) {
   // Write codepoint in utf-8 if there's room in the user provided buffer. If
   // there's not room, flush, write into a temp buffer, and flush again.
   char tmp[4];
-  if (self->utf8Codepoint < 0x80) {
-    assert(self->bufEnd - self->writeBuf >= 1);
-    *self->writeBuf++ = self->utf8Codepoint;
-  } else if (self->utf8Codepoint < 0x800) {
-    bool useTmp = self->bufEnd - self->writeBuf < 2;
-    char *p = tmp;
-    if (useTmp) {
-      self->flushString();
-    }
-    auto &w = useTmp ? p : self->writeBuf;
-    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
-    w += 2;
-    if (useTmp) {
-      self->callbacks->on_string_data(self->data, tmp, 2);
-    }
-  } else if (self->utf8Codepoint < 0x10000) {
-    if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
-      return S_REJECT;
-    }
-    bool useTmp = self->bufEnd - self->writeBuf < 3;
-    char *p = tmp;
-    if (useTmp) {
-      self->flushString();
-    }
-    auto &w = useTmp ? p : self->writeBuf;
-    w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
-    w += 3;
-    if (useTmp) {
-      self->callbacks->on_string_data(self->data, tmp, 3);
-    }
-  } else {
-    if (self->utf8Codepoint > 0x10FFFF) {
-      return S_REJECT;
-    }
-    bool useTmp = self->bufEnd - self->writeBuf < 4;
-    char *p = tmp;
-    if (useTmp) {
-      self->flushString();
-    }
-    auto &w = useTmp ? p : self->writeBuf;
-    w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
-    w += 4;
-    if (useTmp) {
-      self->callbacks->on_string_data(self->data, tmp, 4);
-    }
+  assert(self->utf8Codepoint >= 0x10000);
+  if (self->utf8Codepoint > 0x10FFFF) {
+    return S_REJECT;
+  }
+  bool useTmp = self->bufEnd - self->writeBuf < 4;
+  char *p = tmp;
+  if (useTmp) {
+    self->flushString();
+  }
+  auto &w = useTmp ? p : self->writeBuf;
+  w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
+  w += 4;
+  if (useTmp) {
+    self->callbacks->on_string_data(self->data, tmp, 4);
   }
 
   self->pop();