Match simdjson behavior for surrogate pairs
As far as I can tell, this matches how simdjson handles surrogate pairs.
This commit is contained in:
@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
|
||||
}
|
||||
++self->buf;
|
||||
|
||||
if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
|
||||
return S_REJECT;
|
||||
}
|
||||
|
||||
// Decode utf16 surrogate pair
|
||||
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
|
||||
(self->utf8Codepoint - 0xdc00);
|
||||
@@ -640,63 +644,26 @@ inline Status t_hex3(Parser3 *self) {
|
||||
// Write codepoint in utf-8 if there's room in the user provided buffer. If
|
||||
// there's not room, flush, write into a temp buffer, and flush again.
|
||||
char tmp[4];
|
||||
if (self->utf8Codepoint < 0x80) {
|
||||
assert(self->bufEnd - self->writeBuf >= 1);
|
||||
*self->writeBuf++ = self->utf8Codepoint;
|
||||
} else if (self->utf8Codepoint < 0x800) {
|
||||
bool useTmp = self->bufEnd - self->writeBuf < 2;
|
||||
char *p = tmp;
|
||||
if (useTmp) {
|
||||
self->flushString();
|
||||
}
|
||||
auto &w = useTmp ? p : self->writeBuf;
|
||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
|
||||
w += 2;
|
||||
if (useTmp) {
|
||||
self->callbacks->on_string_data(self->data, tmp, 2);
|
||||
}
|
||||
} else if (self->utf8Codepoint < 0x10000) {
|
||||
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
|
||||
return S_REJECT;
|
||||
}
|
||||
bool useTmp = self->bufEnd - self->writeBuf < 3;
|
||||
char *p = tmp;
|
||||
if (useTmp) {
|
||||
self->flushString();
|
||||
}
|
||||
auto &w = useTmp ? p : self->writeBuf;
|
||||
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
|
||||
w += 3;
|
||||
if (useTmp) {
|
||||
self->callbacks->on_string_data(self->data, tmp, 3);
|
||||
}
|
||||
} else {
|
||||
if (self->utf8Codepoint > 0x10FFFF) {
|
||||
return S_REJECT;
|
||||
}
|
||||
bool useTmp = self->bufEnd - self->writeBuf < 4;
|
||||
char *p = tmp;
|
||||
if (useTmp) {
|
||||
self->flushString();
|
||||
}
|
||||
auto &w = useTmp ? p : self->writeBuf;
|
||||
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
|
||||
w += 4;
|
||||
if (useTmp) {
|
||||
self->callbacks->on_string_data(self->data, tmp, 4);
|
||||
}
|
||||
assert(self->utf8Codepoint < 0x10000);
|
||||
if (self->utf8Codepoint > 0x10FFFF) {
|
||||
return S_REJECT;
|
||||
}
|
||||
bool useTmp = self->bufEnd - self->writeBuf < 4;
|
||||
char *p = tmp;
|
||||
if (useTmp) {
|
||||
self->flushString();
|
||||
}
|
||||
auto &w = useTmp ? p : self->writeBuf;
|
||||
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||
self->utf8Codepoint >>= 6;
|
||||
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
|
||||
w += 4;
|
||||
if (useTmp) {
|
||||
self->callbacks->on_string_data(self->data, tmp, 4);
|
||||
}
|
||||
|
||||
self->pop();
|
||||
|
||||
Reference in New Issue
Block a user