Match simdjson behavior for surrogate pairs
This matches simdjson's handling of surrogate pairs as far as I can tell.
This commit is contained in:
@@ -83,10 +83,6 @@ void compareWithSimdjson(std::string const &json) {
|
|||||||
// This gets returned for precision errors sometimes?
|
// This gets returned for precision errors sometimes?
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (theirs == simdjson::STRING_ERROR) {
|
|
||||||
// why god why god do I gotta suffer
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
|
if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
|
||||||
// We don't validate the precision of numbers
|
// We don't validate the precision of numbers
|
||||||
return;
|
return;
|
||||||
|
|||||||
@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
|
|||||||
}
|
}
|
||||||
++self->buf;
|
++self->buf;
|
||||||
|
|
||||||
|
if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
|
||||||
|
return S_REJECT;
|
||||||
|
}
|
||||||
|
|
||||||
// Decode utf16 surrogate pair
|
// Decode utf16 surrogate pair
|
||||||
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
|
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
|
||||||
(self->utf8Codepoint - 0xdc00);
|
(self->utf8Codepoint - 0xdc00);
|
||||||
@@ -640,63 +644,26 @@ inline Status t_hex3(Parser3 *self) {
|
|||||||
// Write codepoint in utf-8 if there's room in the user provided buffer. If
|
// Write codepoint in utf-8 if there's room in the user provided buffer. If
|
||||||
// there's not room, flush, write into a temp buffer, and flush again.
|
// there's not room, flush, write into a temp buffer, and flush again.
|
||||||
char tmp[4];
|
char tmp[4];
|
||||||
if (self->utf8Codepoint < 0x80) {
|
assert(self->utf8Codepoint < 0x10000);
|
||||||
assert(self->bufEnd - self->writeBuf >= 1);
|
if (self->utf8Codepoint > 0x10FFFF) {
|
||||||
*self->writeBuf++ = self->utf8Codepoint;
|
return S_REJECT;
|
||||||
} else if (self->utf8Codepoint < 0x800) {
|
}
|
||||||
bool useTmp = self->bufEnd - self->writeBuf < 2;
|
bool useTmp = self->bufEnd - self->writeBuf < 4;
|
||||||
char *p = tmp;
|
char *p = tmp;
|
||||||
if (useTmp) {
|
if (useTmp) {
|
||||||
self->flushString();
|
self->flushString();
|
||||||
}
|
}
|
||||||
auto &w = useTmp ? p : self->writeBuf;
|
auto &w = useTmp ? p : self->writeBuf;
|
||||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||||
self->utf8Codepoint >>= 6;
|
self->utf8Codepoint >>= 6;
|
||||||
w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
|
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||||
w += 2;
|
self->utf8Codepoint >>= 6;
|
||||||
if (useTmp) {
|
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
||||||
self->callbacks->on_string_data(self->data, tmp, 2);
|
self->utf8Codepoint >>= 6;
|
||||||
}
|
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
|
||||||
} else if (self->utf8Codepoint < 0x10000) {
|
w += 4;
|
||||||
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
|
if (useTmp) {
|
||||||
return S_REJECT;
|
self->callbacks->on_string_data(self->data, tmp, 4);
|
||||||
}
|
|
||||||
bool useTmp = self->bufEnd - self->writeBuf < 3;
|
|
||||||
char *p = tmp;
|
|
||||||
if (useTmp) {
|
|
||||||
self->flushString();
|
|
||||||
}
|
|
||||||
auto &w = useTmp ? p : self->writeBuf;
|
|
||||||
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
|
||||||
self->utf8Codepoint >>= 6;
|
|
||||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
|
||||||
self->utf8Codepoint >>= 6;
|
|
||||||
w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
|
|
||||||
w += 3;
|
|
||||||
if (useTmp) {
|
|
||||||
self->callbacks->on_string_data(self->data, tmp, 3);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (self->utf8Codepoint > 0x10FFFF) {
|
|
||||||
return S_REJECT;
|
|
||||||
}
|
|
||||||
bool useTmp = self->bufEnd - self->writeBuf < 4;
|
|
||||||
char *p = tmp;
|
|
||||||
if (useTmp) {
|
|
||||||
self->flushString();
|
|
||||||
}
|
|
||||||
auto &w = useTmp ? p : self->writeBuf;
|
|
||||||
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
|
||||||
self->utf8Codepoint >>= 6;
|
|
||||||
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
|
||||||
self->utf8Codepoint >>= 6;
|
|
||||||
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
|
|
||||||
self->utf8Codepoint >>= 6;
|
|
||||||
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
|
|
||||||
w += 4;
|
|
||||||
if (useTmp) {
|
|
||||||
self->callbacks->on_string_data(self->data, tmp, 4);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self->pop();
|
self->pop();
|
||||||
|
|||||||
Reference in New Issue
Block a user