Match simdjson behavior for surrogate pairs

As far as I can tell, this matches how simdjson handles surrogate pairs.
This commit is contained in:
2025-05-19 15:11:30 -04:00
parent 292154100f
commit bf30eabdfc
2 changed files with 24 additions and 61 deletions

View File

@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
}
++self->buf;
if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
return S_REJECT;
}
// Decode the UTF-16 surrogate pair into a single code point
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
(self->utf8Codepoint - 0xdc00);
@@ -640,63 +644,26 @@ inline Status t_hex3(Parser3 *self) {
// Write the codepoint as UTF-8 if there's room in the user-provided buffer. If
// there's not enough room, flush, write into a temp buffer, and flush again.
char tmp[4];
if (self->utf8Codepoint < 0x80) {
assert(self->bufEnd - self->writeBuf >= 1);
*self->writeBuf++ = self->utf8Codepoint;
} else if (self->utf8Codepoint < 0x800) {
bool useTmp = self->bufEnd - self->writeBuf < 2;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
w += 2;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 2);
}
} else if (self->utf8Codepoint < 0x10000) {
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
return S_REJECT;
}
bool useTmp = self->bufEnd - self->writeBuf < 3;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
w += 3;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 3);
}
} else {
if (self->utf8Codepoint > 0x10FFFF) {
return S_REJECT;
}
bool useTmp = self->bufEnd - self->writeBuf < 4;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
w += 4;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 4);
}
assert(self->utf8Codepoint < 0x10000);
if (self->utf8Codepoint > 0x10FFFF) {
return S_REJECT;
}
bool useTmp = self->bufEnd - self->writeBuf < 4;
char *p = tmp;
if (useTmp) {
self->flushString();
}
auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
w += 4;
if (useTmp) {
self->callbacks->on_string_data(self->data, tmp, 4);
}
self->pop();