Match simdjson behavior for surrogate pairs

As far as I can tell, this matches simdjson's handling of surrogate pairs.
This commit is contained in:
2025-05-19 15:11:30 -04:00
parent 292154100f
commit bf30eabdfc
2 changed files with 24 additions and 61 deletions

View File

@@ -83,10 +83,6 @@ void compareWithSimdjson(std::string const &json) {
// This gets returned for precision errors sometimes? // This gets returned for precision errors sometimes?
return; return;
} }
if (theirs == simdjson::STRING_ERROR) {
// why god why god do I gotta suffer
return;
}
if (theirs == simdjson::NUMBER_OUT_OF_RANGE) { if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
// We don't validate the precision of numbers // We don't validate the precision of numbers
return; return;

View File

@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
} }
++self->buf; ++self->buf;
if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
return S_REJECT;
}
// Decode utf16 surrogate pair // Decode utf16 surrogate pair
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 + self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
(self->utf8Codepoint - 0xdc00); (self->utf8Codepoint - 0xdc00);
@@ -640,63 +644,26 @@ inline Status t_hex3(Parser3 *self) {
// Write codepoint in utf-8 if there's room in the user provided buffer. If // Write codepoint in utf-8 if there's room in the user provided buffer. If
// there's not room, flush, write into a temp buffer, and flush again. // there's not room, flush, write into a temp buffer, and flush again.
char tmp[4]; char tmp[4];
if (self->utf8Codepoint < 0x80) { assert(self->utf8Codepoint < 0x10000);
assert(self->bufEnd - self->writeBuf >= 1); if (self->utf8Codepoint > 0x10FFFF) {
*self->writeBuf++ = self->utf8Codepoint; return S_REJECT;
} else if (self->utf8Codepoint < 0x800) { }
bool useTmp = self->bufEnd - self->writeBuf < 2; bool useTmp = self->bufEnd - self->writeBuf < 4;
char *p = tmp; char *p = tmp;
if (useTmp) { if (useTmp) {
self->flushString(); self->flushString();
} }
auto &w = useTmp ? p : self->writeBuf; auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000; w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6; self->utf8Codepoint >>= 6;
w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000; w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
w += 2; self->utf8Codepoint >>= 6;
if (useTmp) { w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->callbacks->on_string_data(self->data, tmp, 2); self->utf8Codepoint >>= 6;
} w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
} else if (self->utf8Codepoint < 0x10000) { w += 4;
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) { if (useTmp) {
return S_REJECT; self->callbacks->on_string_data(self->data, tmp, 4);
}
bool useTmp = self->bufEnd - self->writeBuf < 3;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
w += 3;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 3);
}
} else {
if (self->utf8Codepoint > 0x10FFFF) {
return S_REJECT;
}
bool useTmp = self->bufEnd - self->writeBuf < 4;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
w += 4;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 4);
}
} }
self->pop(); self->pop();