Basic unescaping

2025-05-19 12:47:14 -04:00
parent d5bd9fc018
commit 553a273a1b
1 changed files with 15 additions and 4 deletions
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -79,7 +79,7 @@ struct Parser3 {

  [[nodiscard]] Status parse(char *buf, int len) {
    complete = len == 0;
-    this->buf = this->dataBegin = buf;
+    this->buf = this->dataBegin = this->writeBuf = buf;
    this->bufEnd = buf + len;
    return keepGoing(this);
  }
@@ -92,7 +92,7 @@ struct Parser3 {
  }

  void flushString() {
-    int len = buf - dataBegin;
+    int len = writeBuf - dataBegin;
    if (len > 0) {
      callbacks->on_string_data(data, dataBegin, len);
    }
@@ -127,10 +127,14 @@ struct Parser3 {
  constexpr static int kMaxStackSize = 1024;

  [[maybe_unused]] void debugPrint();
+  // Pointer to the next byte in the input to consume
  char *buf = nullptr;
+  // Pointer one past the last byte available to consume
  char *bufEnd = nullptr;
  // Used for flushing pending data with on_*_data callbacks
  char *dataBegin;
+  // Used for unescaping string data in place
+  char *writeBuf;
  const Callbacks *const callbacks;
  void *const data;
  Symbol stack[kMaxStackSize];
@@ -368,9 +372,9 @@ inline Status n_string2(Parser3 *self) {
    // one byte utf-8 encoding
    switch (*self->buf) {
    case '"':
-      self->flushString();
      ++self->buf;
      self->pop();
+      self->flushString();
      self->callbacks->on_end_string(self->data);
      MUSTTAIL return Parser3::keepGoing(self);
    case '\\':
@@ -389,6 +393,7 @@ inline Status n_string2(Parser3 *self) {
    self->utf8Codepoint = *self->buf & 0b00011111;
    self->minCodepoint = 0x80;
    ++self->buf;
+    ++self->writeBuf;
    self->pop();
    if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
      return s;
@@ -400,6 +405,7 @@ inline Status n_string2(Parser3 *self) {
    self->utf8Codepoint = *self->buf & 0b00001111;
    self->minCodepoint = 0x800;
    ++self->buf;
+    ++self->writeBuf;
    self->pop();
    if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
                             T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
@@ -411,6 +417,7 @@ inline Status n_string2(Parser3 *self) {
    self->utf8Codepoint = *self->buf & 0b00000111;
    self->minCodepoint = 0x10000;
    ++self->buf;
+    ++self->writeBuf;
    self->pop();
    if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
                             T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
@@ -434,7 +441,7 @@ inline Status n_string_following_escape(Parser3 *self) {
  case 'n':
  case 'r':
  case 't':
-    ++self->buf;
+    *self->writeBuf++ = tables.unescape[*self->buf++];
    self->pop();
    if (auto s = self->push({N_STRING2})) {
      return s;
@@ -442,6 +449,7 @@ inline Status n_string_following_escape(Parser3 *self) {
    MUSTTAIL return Parser3::keepGoing(self);
  case 'u':
    ++self->buf;
+    // TODO unescape
    self->pop();
    if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
      return s;
@@ -463,6 +471,7 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
    self->utf8Codepoint <<= 6;
    self->utf8Codepoint |= *self->buf & 0b00111111;
    ++self->buf;
+    ++self->writeBuf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }
@@ -486,6 +495,7 @@ inline Status t_utf8_last_continuation_byte(Parser3 *self) {
    }
    // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
    ++self->buf;
+    ++self->writeBuf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }
@@ -524,6 +534,7 @@ inline Status t_hex(Parser3 *self) {
      ('a' <= *self->buf && *self->buf <= 'f') ||
      ('A' <= *self->buf && *self->buf <= 'F')) {
    ++self->buf;
+    ++self->writeBuf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }