Match simdjson behavior for surrogate pairs

As far as I can tell, this matches simdjson's behavior for surrogate pairs.
This commit is contained in:
2025-05-19 15:11:30 -04:00
parent 292154100f
commit bf30eabdfc
2 changed files with 24 additions and 61 deletions

View File

@@ -83,10 +83,6 @@ void compareWithSimdjson(std::string const &json) {
// This gets returned for precision errors sometimes? // This gets returned for precision errors sometimes?
return; return;
} }
if (theirs == simdjson::STRING_ERROR) {
// why god why god do I gotta suffer
return;
}
if (theirs == simdjson::NUMBER_OUT_OF_RANGE) { if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
// We don't validate the precision of numbers // We don't validate the precision of numbers
return; return;

View File

@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
} }
++self->buf; ++self->buf;
if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
return S_REJECT;
}
// Decode utf16 surrogate pair // Decode utf16 surrogate pair
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 + self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
(self->utf8Codepoint - 0xdc00); (self->utf8Codepoint - 0xdc00);
@@ -640,43 +644,7 @@ inline Status t_hex3(Parser3 *self) {
// Write codepoint in utf-8 if there's room in the user provided buffer. If // Write codepoint in utf-8 if there's room in the user provided buffer. If
// there's not room, flush, write into a temp buffer, and flush again. // there's not room, flush, write into a temp buffer, and flush again.
char tmp[4]; char tmp[4];
if (self->utf8Codepoint < 0x80) { assert(self->utf8Codepoint < 0x10000);
assert(self->bufEnd - self->writeBuf >= 1);
*self->writeBuf++ = self->utf8Codepoint;
} else if (self->utf8Codepoint < 0x800) {
bool useTmp = self->bufEnd - self->writeBuf < 2;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
w += 2;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 2);
}
} else if (self->utf8Codepoint < 0x10000) {
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
return S_REJECT;
}
bool useTmp = self->bufEnd - self->writeBuf < 3;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
w += 3;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 3);
}
} else {
if (self->utf8Codepoint > 0x10FFFF) { if (self->utf8Codepoint > 0x10FFFF) {
return S_REJECT; return S_REJECT;
} }
@@ -697,7 +665,6 @@ inline Status t_hex3(Parser3 *self) {
if (useTmp) { if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 4); self->callbacks->on_string_data(self->data, tmp, 4);
} }
}
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);