Basic unescaping

This commit is contained in:
2025-05-19 12:47:14 -04:00
parent d5bd9fc018
commit 553a273a1b

View File

@@ -79,7 +79,7 @@ struct Parser3 {
// Feeds the next chunk of input, `len` bytes starting at `buf`, to the parser
// and runs it via keepGoing().
//
// buf: start of the chunk. It is taken as non-const `char *`, and the write
//      cursor (`writeBuf`) is pointed into it, so the parser can modify the
//      bytes it is given.
// len: number of bytes available at `buf`. `complete` is set to true exactly
//      when `len == 0` (presumably a zero-length call signals end of input --
//      TODO confirm against the callers).
//
// Returns whatever keepGoing(this) returns; the result is [[nodiscard]].
[[nodiscard]] Status parse(char *buf, int len) {
complete = len == 0;
// NOTE(review): the next two statements look like the before/after lines of
// a diff whose +/- markers were lost. The second one assigns everything the
// first does and additionally resets `writeBuf`, so it supersedes the first.
this->buf = this->dataBegin = buf;
// All three cursors (read, pending-data start, write) start at the
// beginning of the new chunk.
this->buf = this->dataBegin = this->writeBuf = buf;
// One past the last byte of this chunk.
this->bufEnd = buf + len;
return keepGoing(this);
}
@@ -92,7 +92,7 @@ struct Parser3 {
}
void flushString() {
int len = buf - dataBegin;
int len = writeBuf - dataBegin;
if (len > 0) {
callbacks->on_string_data(data, dataBegin, len);
}
@@ -127,10 +127,14 @@ struct Parser3 {
// Number of Symbol slots in `stack` (see the array declaration below).
constexpr static int kMaxStackSize = 1024;
// Debugging helper; marked [[maybe_unused]] so builds that never call it do
// not warn.
[[maybe_unused]] void debugPrint();
// Pointer to the next byte in the input to consume
char *buf = nullptr;
// Pointer past the end of the last byte available to consume
char *bufEnd = nullptr;
// Used for flushing pending data with on_*_data callbacks
// NOTE(review): unlike `buf`/`bufEnd`, this has no default initializer; it is
// assigned in parse(). Confirm nothing reads it before the first parse().
char *dataBegin;
// Used for unescaping string data in place
// NOTE(review): also has no default initializer; it is assigned in parse().
char *writeBuf;
// Table of user callbacks (e.g. on_string_data, on_end_string). Both the
// pointer and the pointee are const.
const Callbacks *const callbacks;
// Opaque pointer handed back to the callbacks as their first argument.
void *const data;
// Fixed-capacity storage for parser symbols, kMaxStackSize entries
// (presumably the pending parse stack driven by push()/pop() -- TODO confirm).
Symbol stack[kMaxStackSize];
@@ -368,9 +372,9 @@ inline Status n_string2(Parser3 *self) {
// one byte utf-8 encoding
switch (*self->buf) {
case '"':
self->flushString();
++self->buf;
self->pop();
self->flushString();
self->callbacks->on_end_string(self->data);
MUSTTAIL return Parser3::keepGoing(self);
case '\\':
@@ -389,6 +393,7 @@ inline Status n_string2(Parser3 *self) {
self->utf8Codepoint = *self->buf & 0b00011111;
self->minCodepoint = 0x80;
++self->buf;
++self->writeBuf;
self->pop();
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
@@ -400,6 +405,7 @@ inline Status n_string2(Parser3 *self) {
self->utf8Codepoint = *self->buf & 0b00001111;
self->minCodepoint = 0x800;
++self->buf;
++self->writeBuf;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
@@ -411,6 +417,7 @@ inline Status n_string2(Parser3 *self) {
self->utf8Codepoint = *self->buf & 0b00000111;
self->minCodepoint = 0x10000;
++self->buf;
++self->writeBuf;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
@@ -434,7 +441,7 @@ inline Status n_string_following_escape(Parser3 *self) {
case 'n':
case 'r':
case 't':
++self->buf;
*self->writeBuf++ = tables.unescape[*self->buf++];
self->pop();
if (auto s = self->push({N_STRING2})) {
return s;
@@ -442,6 +449,7 @@ inline Status n_string_following_escape(Parser3 *self) {
MUSTTAIL return Parser3::keepGoing(self);
case 'u':
++self->buf;
// TODO unescape
self->pop();
if (auto s = self->push({T_HEX, T_HEX, T_HEX, T_HEX, N_STRING2})) {
return s;
@@ -463,6 +471,7 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111;
++self->buf;
++self->writeBuf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
}
@@ -486,6 +495,7 @@ inline Status t_utf8_last_continuation_byte(Parser3 *self) {
}
// TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
++self->buf;
++self->writeBuf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
}
@@ -524,6 +534,7 @@ inline Status t_hex(Parser3 *self) {
('a' <= *self->buf && *self->buf <= 'f') ||
('A' <= *self->buf && *self->buf <= 'F')) {
++self->buf;
++self->writeBuf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
}