Loop in string2 in normal case

2025-05-19 17:50:48 -04:00
parent 918950d7f8
commit a4d7d1f91e
2 changed files with 80 additions and 56 deletions
--- a/src/parser3.h
+++ b/src/parser3.h
@@ -327,30 +327,28 @@ inline Status n_string(Parser3 *self) {
 }
 inline Status n_string2(Parser3 *self) {
-  if (tables.invalidStringByte[uint8_t(*self->buf)]) {
+begin:
-    return S_REJECT;
+  switch (tables.stringByteMeaning[uint8_t(*self->buf)]) {
-  }
+  case Tables::NORMAL:
-  if (int8_t(*self->buf) > 0) {
+    *self->writeBuf++ = *self->buf++;
-    // one byte utf-8 encoding
+    if (self->buf == self->bufEnd) {
    switch (*self->buf) {
    case '"':
      self->flushString();
      self->callbacks->on_end_string(self->data);
      ++self->buf;
      self->pop();
      MUSTTAIL return Parser3::keepGoing(self);
    case '\\':
      ++self->buf;
      self->pop();
      if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
        return s;
      }
      MUSTTAIL return Parser3::keepGoing(self);
    default:
      *self->writeBuf++ = *self->buf++;
      MUSTTAIL return Parser3::keepGoing(self);
    }
-  } else if ((*self->buf & 0b11100000) == 0b11000000) {
+    goto begin;
  case Tables::DUBQUOTE:
    self->flushString();
    self->callbacks->on_end_string(self->data);
    ++self->buf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  case Tables::BACKSLASH:
    ++self->buf;
    self->pop();
    if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
      return s;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  case Tables::TWO_BYTE_UTF8:
    // two byte utf-8 encoding
    self->utf8Codepoint = *self->buf & 0b00011111;
    self->minCodepoint = 0x80;
@@ -360,8 +358,7 @@ inline Status n_string2(Parser3 *self) {
      return s;
    }
    MUSTTAIL return Parser3::keepGoing(self);
-  }
+  case Tables::THREE_BYTE_UTF8:
  if ((*self->buf & 0b11110000) == 0b11100000) {
    // three byte utf-8 encoding
    self->utf8Codepoint = *self->buf & 0b00001111;
    self->minCodepoint = 0x800;
@@ -372,7 +369,7 @@ inline Status n_string2(Parser3 *self) {
      return s;
    }
    MUSTTAIL return Parser3::keepGoing(self);
-  } else if ((*self->buf & 0b11111000) == 0b11110000) {
+  case Tables::FOUR_BYTE_UTF8:
    // four byte utf-8 encoding
    self->utf8Codepoint = *self->buf & 0b00000111;
    self->minCodepoint = 0x10000;
@@ -383,8 +380,10 @@ inline Status n_string2(Parser3 *self) {
      return s;
    }
    MUSTTAIL return Parser3::keepGoing(self);
  case Tables::CONTINUATION_BYTE:
  case Tables::INVALID:
    return S_REJECT;
  }
  return S_REJECT;
 }
 inline Status n_string_following_escape(Parser3 *self) {
@@ -417,37 +416,33 @@ inline Status n_string_following_escape(Parser3 *self) {
 }
 // NOTE(review): this listing is a flattened diff. Lines prefixed '-' come from
 // the removed version and lines prefixed '+' from the added version; unprefixed
 // lines appear to mix context with lines from either side -- confirm against
 // the real file before relying on exact structure.
 //
 // Parser step that consumes one (non-final) UTF-8 continuation byte.
 // In the '+' version: returns S_REJECT unless tables.stringByteMeaning
 // classifies the current input byte as Tables::CONTINUATION_BYTE; otherwise
 // shifts self->utf8Codepoint left by 6, ORs in the byte's low six bits, copies
 // the byte to writeBuf (advancing both buf and writeBuf), calls self->pop(),
 // and returns Parser3::keepGoing(self) via MUSTTAIL.
 inline Status t_utf8_continuation_byte(Parser3 *self) {
-  if (tables.invalidStringByte[uint8_t(*self->buf)]) {
+  if (tables.stringByteMeaning[uint8_t(*self->buf)] !=
      Tables::CONTINUATION_BYTE) {
    return S_REJECT;
  }
 // The removed version tested the 10xxxxxx bit pattern here; the added version
 // drops this second test and relies solely on the table lookup above.
-  if ((*self->buf & 0b11000000) == 0b10000000) {
+  self->utf8Codepoint <<= 6;
-    self->utf8Codepoint <<= 6;
+  self->utf8Codepoint |= *self->buf & 0b00111111;
-    self->utf8Codepoint |= *self->buf & 0b00111111;
+  *self->writeBuf++ = *self->buf++;
-    *self->writeBuf++ = *self->buf++;
+  self->pop();
-    self->pop();
+  MUSTTAIL return Parser3::keepGoing(self);
    MUSTTAIL return Parser3::keepGoing(self);
  }
  return S_REJECT;
 }
 // NOTE(review): this listing is a flattened diff. Lines prefixed '-' come from
 // the removed version and lines prefixed '+' from the added version; unprefixed
 // lines appear to mix context with lines from either side -- confirm against
 // the real file before relying on exact structure.
 //
 // Parser step that consumes the final continuation byte of a multi-byte UTF-8
 // sequence and validates the assembled code point.
 // In the '+' version: returns S_REJECT unless tables.stringByteMeaning
 // classifies the current input byte as Tables::CONTINUATION_BYTE; otherwise
 // folds the byte's low six bits into self->utf8Codepoint, range-checks the
 // result, copies the byte to writeBuf (advancing both buf and writeBuf), calls
 // self->pop(), and returns Parser3::keepGoing(self) via MUSTTAIL.
 inline Status t_utf8_last_continuation_byte(Parser3 *self) {
-  if (tables.invalidStringByte[uint8_t(*self->buf)]) {
+  if (tables.stringByteMeaning[uint8_t(*self->buf)] !=
      Tables::CONTINUATION_BYTE) {
    return S_REJECT;
  }
 // The removed version tested the 10xxxxxx bit pattern here; the added version
 // drops this second test and relies solely on the table lookup above.
-  if ((*self->buf & 0b11000000) == 0b10000000) {
+  self->utf8Codepoint <<= 6;
-    self->utf8Codepoint <<= 6;
+  self->utf8Codepoint |= *self->buf & 0b00111111;
-    self->utf8Codepoint |= *self->buf & 0b00111111;
 // Reject when the code point is below self->minCodepoint (presumably an
 // overlong encoding -- minCodepoint is set by the caller that saw the lead
 // byte), above 0x10ffff, or inside 0xd800..0xdfff (the UTF-16 surrogate range).
+  if (self->utf8Codepoint < self->minCodepoint ||
-    if (self->utf8Codepoint < self->minCodepoint ||
+      self->utf8Codepoint > 0x10ffff ||
-        self->utf8Codepoint > 0x10ffff ||
+      (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
-        (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
+    return S_REJECT;
      return S_REJECT;
    }
    // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
    *self->writeBuf++ = *self->buf++;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self);
  }
-  return S_REJECT;
+  // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
  *self->writeBuf++ = *self->buf++;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self);
 }
 inline Status t_digit(Parser3 *self) {
--- a/src/tables.h
+++ b/src/tables.h
@@ -1,19 +1,48 @@
 #pragma once
 constexpr inline struct Tables {
  enum StringByteMeaning {
    INVALID,
    NORMAL,
    DUBQUOTE,
    BACKSLASH,
    TWO_BYTE_UTF8,
    THREE_BYTE_UTF8,
    FOUR_BYTE_UTF8,
    CONTINUATION_BYTE,
  };
  constexpr Tables() {
    whitespace[' '] = true;
    whitespace['\n'] = true;
    whitespace['\r'] = true;
    whitespace['\t'] = true;
-    for (int i = 0; i < 0x20; ++i) {
+    for (int i = 0; i < 256; ++i) {
-      invalidStringByte[i] = true;
+      if ((i & 0b11000000) == 0b10000000) {
        stringByteMeaning[i] = CONTINUATION_BYTE;
      }
      if ((i & 0b11100000) == 0b11000000) {
        stringByteMeaning[i] = TWO_BYTE_UTF8;
      }
      if ((i & 0b11110000) == 0b11100000) {
        stringByteMeaning[i] = THREE_BYTE_UTF8;
      }
      if ((i & 0b11111000) == 0b11110000) {
        stringByteMeaning[i] = FOUR_BYTE_UTF8;
      }
    }
-    invalidStringByte[0xc0] = true;
+
-    invalidStringByte[0xc1] = true;
+    for (int i = 0x20; i < 128; ++i) {
-    for (int i = 0xf5; i <= 0xff; ++i) {
+      stringByteMeaning[i] = NORMAL;
-      invalidStringByte[i] = true;
+    }
    stringByteMeaning['"'] = DUBQUOTE;
    stringByteMeaning['\\'] = BACKSLASH;
    stringByteMeaning[0xc0] = INVALID;
    stringByteMeaning[0xc1] = INVALID;
    for (int i = 0xF5; i < 0x100; ++i) {
      stringByteMeaning[i] = INVALID;
    }
    unescape['n'] = '\n';
@@ -26,6 +55,6 @@ constexpr inline struct Tables {
    unescape['/'] = '/';
  }
  alignas(16) bool whitespace[256]{};
-  alignas(16) bool invalidStringByte[256]{};
+  alignas(16) StringByteMeaning stringByteMeaning[256]{};
  alignas(16) char unescape[256]{};
 } tables;