From bf30eabdfc2639f5238bce4e133e9f173cccf354 Mon Sep 17 00:00:00 2001
From: Andrew Noyes <andrew@weaselab.dev>
Date: Mon, 19 May 2025 15:11:30 -0400
Subject: [PATCH] Match simdjson behavior for surrogate pairs

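As far as I can tell, simdjson rejects a \u escape for a UTF-16 high
surrogate unless the \u escape that follows it is a low surrogate
(0xDC00-0xDFFF). Reject that case in t_hex3 as well, keep only the
4-byte UTF-8 encoding path there, and stop ignoring
simdjson::STRING_ERROR when comparing results in the fuzzer.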
---
 src/fuzz.cpp  |  4 ---
 src/parser3.h | 81 +++++++++++++++------------------------------------
 2 files changed, 24 insertions(+), 61 deletions(-)

diff --git a/src/fuzz.cpp b/src/fuzz.cpp
index bd43121..8430c93 100644
--- a/src/fuzz.cpp
+++ b/src/fuzz.cpp
@@ -83,10 +83,6 @@ void compareWithSimdjson(std::string const &json) {
       // This gets returned for precision errors sometimes?
       return;
     }
-    if (theirs == simdjson::STRING_ERROR) {
-      // why god why god do I gotta suffer
-      return;
-    }
     if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
       // We don't validate the precision of numbers
       return;
diff --git a/src/parser3.h b/src/parser3.h
index 3475c4e..368efa5 100644
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -633,6 +633,10 @@ inline Status t_hex3(Parser3 *self) {
   }
   ++self->buf;
 
+  if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
+    return S_REJECT;
+  }
+
   // Decode utf16 surrogate pair
   self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
                         (self->utf8Codepoint - 0xdc00);
@@ -640,63 +644,26 @@ inline Status t_hex3(Parser3 *self) {
   // Write codepoint in utf-8 if there's room in the user provided buffer. If
   // there's not room, flush, write into a temp buffer, and flush again.
   char tmp[4];
-  if (self->utf8Codepoint < 0x80) {
-    assert(self->bufEnd - self->writeBuf >= 1);
-    *self->writeBuf++ = self->utf8Codepoint;
-  } else if (self->utf8Codepoint < 0x800) {
-    bool useTmp = self->bufEnd - self->writeBuf < 2;
-    char *p = tmp;
-    if (useTmp) {
-      self->flushString();
-    }
-    auto &w = useTmp ? p : self->writeBuf;
-    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
-    w += 2;
-    if (useTmp) {
-      self->callbacks->on_string_data(self->data, tmp, 2);
-    }
-  } else if (self->utf8Codepoint < 0x10000) {
-    if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
-      return S_REJECT;
-    }
-    bool useTmp = self->bufEnd - self->writeBuf < 3;
-    char *p = tmp;
-    if (useTmp) {
-      self->flushString();
-    }
-    auto &w = useTmp ? p : self->writeBuf;
-    w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
-    w += 3;
-    if (useTmp) {
-      self->callbacks->on_string_data(self->data, tmp, 3);
-    }
-  } else {
-    if (self->utf8Codepoint > 0x10FFFF) {
-      return S_REJECT;
-    }
-    bool useTmp = self->bufEnd - self->writeBuf < 4;
-    char *p = tmp;
-    if (useTmp) {
-      self->flushString();
-    }
-    auto &w = useTmp ? p : self->writeBuf;
-    w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
-    self->utf8Codepoint >>= 6;
-    w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
-    w += 4;
-    if (useTmp) {
-      self->callbacks->on_string_data(self->data, tmp, 4);
-    }
+  assert(self->utf8Codepoint < 0x10000);
+  if (self->utf8Codepoint > 0x10FFFF) {
+    return S_REJECT;
+  }
+  bool useTmp = self->bufEnd - self->writeBuf < 4;
+  char *p = tmp;
+  if (useTmp) {
+    self->flushString();
+  }
+  auto &w = useTmp ? p : self->writeBuf;
+  w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
+  self->utf8Codepoint >>= 6;
+  w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
+  w += 4;
+  if (useTmp) {
+    self->callbacks->on_string_data(self->data, tmp, 4);
   }
 
   self->pop();