Basic unescaping

This commit is contained in:
2025-05-19 12:47:14 -04:00
parent d5bd9fc018
commit 553a273a1b

View File

@@ -79,7 +79,7 @@ struct Parser3 {
[[nodiscard]] Status parse(char *buf, int len) { [[nodiscard]] Status parse(char *buf, int len) {
complete = len == 0; complete = len == 0;
this->buf = this->dataBegin = buf; this->buf = this->dataBegin = this->writeBuf = buf;
this->bufEnd = buf + len; this->bufEnd = buf + len;
return keepGoing(this); return keepGoing(this);
} }
@@ -92,7 +92,7 @@ struct Parser3 {
} }
void flushString() { void flushString() {
int len = buf - dataBegin; int len = writeBuf - dataBegin;
if (len > 0) { if (len > 0) {
callbacks->on_string_data(data, dataBegin, len); callbacks->on_string_data(data, dataBegin, len);
} }
@@ -127,10 +127,14 @@ struct Parser3 {
constexpr static int kMaxStackSize = 1024; constexpr static int kMaxStackSize = 1024;
[[maybe_unused]] void debugPrint(); [[maybe_unused]] void debugPrint();
// Pointer to the next byte in the input to consume
char *buf = nullptr; char *buf = nullptr;
// Pointer one past the last byte available to consume
char *bufEnd = nullptr; char *bufEnd = nullptr;
// Used for flushing pending data with on_*_data callbacks // Used for flushing pending data with on_*_data callbacks
char *dataBegin; char *dataBegin;
// Used for unescaping string data in place
char *writeBuf;
const Callbacks *const callbacks; const Callbacks *const callbacks;
void *const data; void *const data;
Symbol stack[kMaxStackSize]; Symbol stack[kMaxStackSize];
@@ -368,9 +372,9 @@ inline Status n_string2(Parser3 *self) {
// one byte utf-8 encoding // one byte utf-8 encoding
switch (*self->buf) { switch (*self->buf) {
case '"': case '"':
self->flushString();
++self->buf; ++self->buf;
self->pop(); self->pop();
self->flushString();
self->callbacks->on_end_string(self->data); self->callbacks->on_end_string(self->data);
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
case '\\': case '\\':
@@ -389,6 +393,7 @@ inline Status n_string2(Parser3 *self) {
self->utf8Codepoint = *self->buf & 0b00011111; self->utf8Codepoint = *self->buf & 0b00011111;
self->minCodepoint = 0x80; self->minCodepoint = 0x80;
++self->buf; ++self->buf;
++self->writeBuf;
self->pop(); self->pop();
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s; return s;
@@ -400,6 +405,7 @@ inline Status n_string2(Parser3 *self) {
self->utf8Codepoint = *self->buf & 0b00001111; self->utf8Codepoint = *self->buf & 0b00001111;
self->minCodepoint = 0x800; self->minCodepoint = 0x800;
++self->buf; ++self->buf;
++self->writeBuf;
self->pop(); self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
@@ -411,6 +417,7 @@ inline Status n_string2(Parser3 *self) {
self->utf8Codepoint = *self->buf & 0b00000111; self->utf8Codepoint = *self->buf & 0b00000111;
self->minCodepoint = 0x10000; self->minCodepoint = 0x10000;
++self->buf; ++self->buf;
++self->writeBuf;
self->pop(); self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
@@ -434,7 +441,7 @@ inline Status n_string_following_escape(Parser3 *self) {
case 'n': case 'n':
case 'r': case 'r':
case 't': case 't':
++self->buf; *self->writeBuf++ = tables.unescape[*self->buf++];
self->pop(); self->pop();
if (auto s = self->push({N_STRING2})) { if (auto s = self->push({N_STRING2})) {
return s; return s;
@@ -442,6 +449,7 @@ inline Status n_string_following_escape(Parser3 *self) {
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
case 'u': case 'u':
++self->buf; ++self->buf;
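// TODO: unescape the 'u' escape: decode the four hex digits that follow and write the code point as UTF-8 at writeBuf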
self->pop(); self->pop();
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) { if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
return s; return s;
@@ -463,6 +471,7 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
self->utf8Codepoint <<= 6; self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111; self->utf8Codepoint |= *self->buf & 0b00111111;
++self->buf; ++self->buf;
++self->writeBuf;
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} }
@@ -486,6 +495,7 @@ inline Status t_utf8_last_continuation_byte(Parser3 *self) {
} }
// TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
++self->buf; ++self->buf;
++self->writeBuf;
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} }
@@ -524,6 +534,7 @@ inline Status t_hex(Parser3 *self) {
('a' <= *self->buf && *self->buf <= 'f') || ('a' <= *self->buf && *self->buf <= 'f') ||
('A' <= *self->buf && *self->buf <= 'F')) { ('A' <= *self->buf && *self->buf <= 'F')) {
++self->buf; ++self->buf;
++self->writeBuf;
self->pop(); self->pop();
MUSTTAIL return Parser3::keepGoing(self); MUSTTAIL return Parser3::keepGoing(self);
} }