Files
weaseljson/src/parser3.h

1525 lines
38 KiB
C++

#pragma once
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <initializer_list>
#include <tuple>
#include "musttail.h"
#include "preserve_none.h"
#include "simd.h"
#include "tables.h"
#include "weaseljson.h"
namespace parser3 {
// Table-driven DFA that recognizes json number syntax. As a regex:
// -?([0-9]|[1-9][0-9]*)(\.[0-9]+)?((e|E)(-|\+)?[0-9]+)?
//
// Every table row is selected by an input byte and packs one 6-bit
// "next state" field per current state. A state is identified by the bit
// offset of its field, and a field value of 0 means "no transition". See
// https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725 for an
// explanation of this cycle/byte dfa implementation.
struct NumDfa {
  // Only '+', '-', '.', '0'-'9', 'E' and 'e' have non-zero rows. The entries
  // after 'e' are not spelled out and are therefore zero-initialized.
  constexpr static uint64_t table[256] = {
      // 0x00 - 0x0f
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x10 - 0x1f
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x20 - 0x2a (' ' through '*')
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // '+'
      0x36000ull,
      // ','
      0,
      // '-'
      0x36600ull,
      // '.'
      0x12480000000000ull,
      // '/'
      0,
      // '0'
      0x780aa47b091ec00ull,
      // '1' - '9'
      0x780aa47aa91ea80ull, 0x780aa47aa91ea80ull, 0x780aa47aa91ea80ull,
      0x780aa47aa91ea80ull, 0x780aa47aa91ea80ull, 0x780aa47aa91ea80ull,
      0x780aa47aa91ea80ull, 0x780aa47aa91ea80ull, 0x780aa47aa91ea80ull,
      // 0x3a - 0x3f (':' through '?')
      0, 0, 0, 0, 0, 0,
      // 0x40 - 0x44 ('@' through 'D')
      0, 0, 0, 0, 0,
      // 'E'
      0xc30c000000000ull,
      // 0x46 - 0x4f ('F' through 'O')
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x50 - 0x5f
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x60 - 0x64 ('`' through 'd')
      0, 0, 0, 0, 0,
      // 'e'
      0xc30c000000000ull,
  };

  // Put the dfa back into its start state.
  void reset() { state = 6; }

  // True when the dfa currently sits in one of its accept states. Usually
  // only meaningful after scan has run to the end of the match.
  bool accept() const {
    switch (state & 63) {
    case 30:
    case 36:
    case 42:
    case 48:
      return true;
    default:
      return false;
    }
  }

  // Feeds [buf, bufEnd) to the dfa. Returns a pointer to the first byte that
  // does not continue the match, or bufEnd if every byte matched. The dfa is
  // left in the last state of the match, so scanning can be resumed with more
  // input later.
#ifdef __x86_64__
  __attribute__((target_clones("default", "bmi2")))
#endif
  const char *
  scan(const char *buf, const char *bufEnd) {
    constexpr int kStride = 16;
    auto current = state;
    // Bulk path: consume kStride bytes per round, remembering every
    // intermediate state so the last good one can be restored on a mismatch.
    for (;;) {
      if (bufEnd - buf < kStride) [[unlikely]] {
        break;
      }
      uint8_t trail[kStride + 1];
      trail[0] = current;
      for (int i = 0; i < kStride; ++i, ++buf) {
        trail[i + 1] = table[uint8_t(*buf)] >> (trail[i] & 63);
        if ((trail[i + 1] & 63) == 0) {
          state = trail[i];
          return buf;
        }
      }
      current = trail[kStride];
    }
    // Tail path: fewer than kStride bytes remain, step one byte at a time.
    for (; buf != bufEnd; ++buf) {
      const uint64_t next = (table[uint8_t(*buf)] >> (current & 63)) & 63;
      if (next == 0) {
        break;
      }
      current = next;
    }
    state = current;
    return buf;
  }

private:
  uint64_t state = 6;
};
// Table-driven DFA that recognizes sequences of valid utf8 characters,
// excluding bytes below 0x20, double quote, and backslash. Uses the same
// packed row layout as NumDfa: one 6-bit next-state field per current state,
// with 0 meaning "no transition".
struct Utf8Dfa {
  // Entries after 0xf4 are not spelled out and are therefore zero-initialized.
  constexpr static uint64_t table[256] = {
      // 0x00 - 0x0f: rejected
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x10 - 0x1f: rejected
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x20 - 0x2f: single-byte characters, except '"' (0x22)
      0x30000000000000ull, 0x30000000000000ull, 0, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      // 0x30 - 0x3f
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      // 0x40 - 0x4f
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      // 0x50 - 0x5f: single-byte characters, except '\\' (0x5c)
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      // 0x60 - 0x6f
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      // 0x70 - 0x7f
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull, 0x30000000000000ull,
      // 0x80 - 0x8f
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull,
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull,
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull,
      0x18630780780ull, 0x18630780780ull, 0x18630780780ull, 0x18630780780ull,
      // 0x90 - 0x9f
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull,
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull,
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull,
      0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull, 0x1863001e780ull,
      // 0xa0 - 0xaf
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      // 0xb0 - 0xbf
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull, 0x60063001e780ull,
      // 0xc0 - 0xcf: 0xc0 and 0xc1 are rejected
      0, 0, 0x18000000000000ull, 0x18000000000000ull,
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      // 0xd0 - 0xdf
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull, 0x18000000000000ull,
      // 0xe0 - 0xef: 0xe0 and 0xed have their own rows
      0x2a000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull,
      0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull,
      0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull,
      0x1e000000000000ull, 0x24000000000000ull, 0x1e000000000000ull, 0x1e000000000000ull,
      // 0xf0 - 0xf4
      0xc000000000000ull, 0x6000000000000ull, 0x6000000000000ull, 0x6000000000000ull,
      0x12000000000000ull,
  };

  // Put the dfa back into its start state.
  void reset() { state = 48; }

  // True when the dfa currently sits in its (single) accept state. Usually
  // only meaningful after scan has run to the end of the match.
  bool accept() const {
    const auto current = state & 63;
    return current == 48;
  }

  // Feeds [buf, bufEnd) to the dfa. Returns a pointer to the first byte that
  // does not continue the match, or bufEnd if every byte matched. The dfa is
  // left in the last state of the match, so scanning can be resumed with more
  // input later.
#ifdef __x86_64__
  __attribute__((target_clones("default", "bmi2")))
#endif
  const char *
  scan(const char *buf, const char *bufEnd) {
    constexpr int kStride = 16;
    auto current = state;
    // Bulk path: consume kStride bytes per round, remembering every
    // intermediate state so the last good one can be restored on a mismatch.
    for (;;) {
      if (bufEnd - buf < kStride) [[unlikely]] {
        break;
      }
      uint8_t trail[kStride + 1];
      trail[0] = current;
      for (int i = 0; i < kStride; ++i, ++buf) {
        trail[i + 1] = table[uint8_t(*buf)] >> (trail[i] & 63);
        if ((trail[i + 1] & 63) == 0) {
          state = trail[i];
          return buf;
        }
      }
      current = trail[kStride];
    }
    // Tail path: fewer than kStride bytes remain, step one byte at a time.
    for (; buf != bufEnd; ++buf) {
      const uint64_t next = (table[uint8_t(*buf)] >> (current & 63)) & 63;
      if (next == 0) {
        break;
      }
      current = next;
    }
    state = current;
    return buf;
  }

private:
  uint64_t state = 48;
};
// Signature shared by every parser step function: it receives the parser and
// the unconsumed input range [buf, bufEnd) and returns a parse status.
typedef PRESERVE_NONE WeaselJsonStatus (*Continuation)(struct Parser3 *,
char *buf, char *bufEnd);
// Symbols that appear on the stack of the pushdown automaton. The symbol on
// top of the stack selects which continuation runs next (see the
// continuation table). N_* symbols are handled by dedicated n_* functions;
// most T_* symbols match a single input character.
enum Symbol : uint8_t {
N_VALUE,
N_OBJECT2,
N_OBJECT3,
N_ARRAY2,
N_ARRAY3,
N_STRING,
N_STRING2,
N_STRING_FOLLOWING_ESCAPE,
N_WHITESPACE,
N_NUMBER,
N_TRUE,
N_FALSE,
N_NULL,
T_R,
T_U,
// u inside of a string
T_U2,
T_A,
T_L,
T_S,
T_COLON,
T_HEX,
T_HEX2,
T_HEX3,
T_EOF,
T_BACKSLASH,
N_SYMBOL_COUNT, // Must be last
};
// State of the streaming parser: the symbol stack of the pushdown automaton
// plus the bookkeeping needed to hand string/number data to the callbacks.
//
// The symbol stack is not a member: stack() returns the address directly
// after this object, so whoever allocates a Parser3 presumably has to reserve
// room for stackSize Symbols behind it (the allocation is not visible here).
struct Parser3 {
  Parser3(const WeaselJsonCallbacks *callbacks, void *userdata, int stackSize)
      : callbacks(callbacks), userdata(userdata), stackSize(stackSize) {
    reset();
  }

  // Parses the next chunk of input, [buf, buf + len). A chunk of length 0
  // marks the input as complete. String data may be unescaped in place, which
  // is why buf is not const.
  [[nodiscard]] WeaselJsonStatus parse(char *buf, int len) {
    complete = len == 0;
    this->dataBegin = this->writeBuf = buf;
    return keepGoing(this, buf, buf + len);
  }

  // Reports the pending number bytes [dataBegin, buf) through on_number_data.
  // Nothing is reported for an empty range unless done is set.
  void flushNumber(bool done, char *buf) {
    int len = buf - dataBegin;
    assert(len >= 0);
    if (done || len > 0) {
      callbacks->on_number_data(userdata, dataBegin, len, done);
    }
  }

  // Reports the pending string bytes [dataBegin, writeBuf) through
  // on_string_data and marks them as consumed. Nothing is reported for an
  // empty range unless done is set.
  void flushString(bool done) {
    int len = writeBuf - dataBegin;
    assert(len >= 0);
    if (done || len > 0) {
      callbacks->on_string_data(userdata, dataBegin, len, done);
    }
    dataBegin = writeBuf;
  }

  [[nodiscard]] bool empty() const { return stackPtr == stack(); }

  void pop() {
    assert(!empty());
    --stackPtr;
  }

  // Pushes symbols so that the first listed symbol ends up on top of the
  // stack. Returns WeaselJson_OVERFLOW, without pushing anything, if the
  // symbols do not fit.
  [[nodiscard]] WeaselJsonStatus push(std::initializer_list<Symbol> symbols) {
    // Compare element counts rather than pointers: computing
    // stack() + stackSize - symbols.size() would form an out-of-range pointer
    // (undefined behavior) whenever stackSize is smaller than the number of
    // symbols. The accept/reject boundary is the same as before.
    const auto used = stackPtr - stack();
    const auto wanted = static_cast<int>(symbols.size());
    if (used + wanted >= stackSize) [[unlikely]] {
      return WeaselJson_OVERFLOW;
    }
    for (auto it = symbols.end(); it != symbols.begin();) {
      *stackPtr++ = *--it;
    }
    return WeaselJson_OK;
  }

  [[nodiscard]] Symbol top() const {
    assert(!empty());
    return *(stackPtr - 1);
  }

  // Runs the continuation for the symbol on top of the stack.
  static PRESERVE_NONE WeaselJsonStatus keepGoing(Parser3 *self, char *buf,
                                                  char *bufEnd);

  // Start of the symbol stack, located directly behind this object.
  Symbol *stack() const { return (Symbol *)(this + 1); }

  // Returns the parser to its initial state: expect one value, optional
  // trailing whitespace, then end of input.
  // NOTE(review): an overflow reported by push is ignored here, so a stack
  // that cannot hold these three symbols leaves the stack empty.
  void reset() {
    stackPtr = stack();
    complete = false;
    std::ignore = push({N_VALUE, N_WHITESPACE, T_EOF});
  }

  // Used for flushing pending data with on_*_data callbacks
  char *dataBegin = nullptr;
  // Used for unescaping string data in place
  char *writeBuf = nullptr;
  WeaselJsonCallbacks const *const callbacks;
  void *const userdata;
  // One past the symbol currently on top of the stack
  Symbol *stackPtr = nullptr;
  // Accumulates the value of the \uXXXX escape currently being decoded
  uint32_t utf8Codepoint = 0;
  // First half of a utf-16 surrogate pair, kept until the second half arrives
  uint32_t utf16Surrogate = 0;
  uint32_t minCodepoint = 0;
  int const stackSize;
  // Set once parse has been called with an empty chunk
  bool complete = false;
  NumDfa numDfa;
  Utf8Dfa strDfa;
};
// Skips a (possibly empty) run of whitespace and then continues with the next
// symbol. If the input runs out while still inside whitespace, the symbol
// stays on the stack and WeaselJson_AGAIN is returned.
inline PRESERVE_NONE WeaselJsonStatus n_whitespace(Parser3 *self, char *buf,
                                                   char *bufEnd) {
  if (buf != bufEnd) {
    for (; tables.whitespace[uint8_t(*buf)];) {
      if (++buf == bufEnd) {
        return WeaselJson_AGAIN;
      }
    }
  }
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Continues a number that was started in an earlier chunk. Reports the bytes
// matched so far and asks for more input if the chunk ends before the number
// does; otherwise validates the number, reports its end, and moves on.
inline PRESERVE_NONE WeaselJsonStatus n_number(Parser3 *self, char *buf,
                                               char *bufEnd) {
  char *matchEnd = (char *)self->numDfa.scan(buf, bufEnd);
  const bool ranOutOfInput = matchEnd == bufEnd;
  if (ranOutOfInput && !self->complete) {
    // The number may continue in the next chunk.
    self->flushNumber(false, matchEnd);
    return WeaselJson_AGAIN;
  }
  if (!self->numDfa.accept()) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->flushNumber(true, matchEnd);
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, matchEnd, bufEnd);
}
// Advances buf until a double quote, a backslash, invalid utf8, or a byte
// below 0x20 is found, compacting the scanned bytes down to self->writeBuf.
//
// Returns WeaselJson_AGAIN (after flushing the pending string data) when the
// whole input was consumed, WeaselJson_REJECT when scanning stopped in the
// middle of a utf8 sequence, and WeaselJson_OK otherwise, with buf left on the
// byte that stopped the scan.
template <class V>
inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self,
                                                       char *&buf,
                                                       char *bufEnd) {
  char *const scanBegin = buf;
  // Vectorized pass: skip whole groups of ordinary characters. Anything the
  // vector comparison does not classify as ordinary is left for the dfa.
  while (bufEnd - buf >= V::lanes) {
    auto chunk = V{(int8_t *)buf};
    int ordinary = (chunk != V::splat('"') & chunk != V::splat('\\') &
                    chunk >= V::splat(0x20))
                       .count_leading_nonzero_lanes();
    buf += ordinary;
    if (ordinary < V::lanes) {
      break;
    }
  }
  // Byte-wise pass: the dfa validates utf8 and finds the exact stop position.
  buf = (char *)self->strDfa.scan(buf, bufEnd);
  int scanned = buf - scanBegin;
  // Earlier escapes may have left a gap between the write and read positions.
  if (self->writeBuf != scanBegin) {
    memmove(self->writeBuf, scanBegin, scanned);
  }
  self->writeBuf += scanned;
  if (buf == bufEnd) {
    self->flushString(false);
    return WeaselJson_AGAIN;
  }
  if (!self->strDfa.accept()) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  return WeaselJson_OK;
}
// scan_string: entry point used by the parser steps to scan string contents.
// buf is taken by reference and is advanced to the byte that stopped the
// scan; callers inspect *buf afterwards.
#ifdef __x86_64__
// Lane count used for both x86 instantiations below.
constexpr int kLanes = 32;
// Explicit instantiations of the scanner for the SSE and AVX2 simd backends.
template WeaselJsonStatus
scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_SSE>>(Parser3 *, char *&,
                                                          char *);
template __attribute__((target("avx2"))) WeaselJsonStatus
scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_AVX2>>(Parser3 *, char *&,
                                                           char *);
// Version built for the default target; forwards to the SSE instantiation.
__attribute__((target("default"))) inline PRESERVE_NONE WeaselJsonStatus
scan_string(Parser3 *self, char *&buf, char *bufEnd) {
  MUSTTAIL return scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_SSE>>(
      self, buf, bufEnd);
}
// Version built for avx2 targets; forwards to the AVX2 instantiation.
__attribute__((target("avx2"))) inline PRESERVE_NONE WeaselJsonStatus
scan_string(Parser3 *self, char *&buf, char *bufEnd) {
  MUSTTAIL return scan_string_impl<simd<int8_t, kLanes, sse::Simd_x86_AVX2>>(
      self, buf, bufEnd);
}
#else
// Portable version. buf must be a reference here as well: taking it by value
// would leave the caller's pointer at the start of the string even though the
// callers go on to read *buf as the byte that ended the scan, and it would
// not match the signature of the tail-called implementation.
inline PRESERVE_NONE WeaselJsonStatus scan_string(Parser3 *self, char *&buf,
                                                  char *bufEnd) {
  MUSTTAIL return scan_string_impl<simd<int8_t, 32>>(self, buf, bufEnd);
}
#endif
// Parses the start of a json value. Skips leading whitespace, then dispatches
// on the first character: objects and arrays replace N_VALUE with the symbol
// for their contents, strings and numbers are scanned as far as the current
// chunk allows, and the literals true/false/null are either matched in one
// go or, when the chunk is too short, expanded into per-character symbols.
// Requires at least one byte of input.
inline PRESERVE_NONE WeaselJsonStatus n_value(Parser3 *self, char *buf,
char *bufEnd) {
assert(bufEnd - buf != 0);
while (tables.whitespace[uint8_t(*buf)]) {
++buf;
if (buf == bufEnd) {
return WeaselJson_AGAIN;
}
}
switch (*buf) {
case '{':
self->callbacks->on_begin_object(self->userdata);
++buf;
self->pop();
if (auto s = self->push({N_OBJECT2})) {
return s;
}
break;
case '[':
self->callbacks->on_begin_array(self->userdata);
++buf;
self->pop();
if (auto s = self->push({N_ARRAY2})) {
return s;
}
break;
case '"':
++buf;
// String data starts right after the opening quote.
self->dataBegin = self->writeBuf = buf;
self->pop();
self->strDfa.reset();
if (auto s = scan_string(self, buf, bufEnd)) {
if (s == WeaselJson_AGAIN) {
// The string continues in the next chunk; resume in n_string2.
if (auto s2 = self->push({N_STRING2})) {
return s2;
}
}
return s;
}
{
// The scan stopped on a byte inside the chunk; see what ended it.
switch (*buf) {
case '"':
self->flushString(true);
++buf;
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case '\\':
++buf;
if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
default:
return WeaselJson_REJECT;
}
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
self->dataBegin = buf;
self->pop();
self->numDfa.reset();
buf = (char *)self->numDfa.scan(buf, bufEnd);
if (buf == bufEnd) {
// The number may continue in the next chunk; resume in n_number.
self->flushNumber(false, buf);
if (auto s = self->push({N_NUMBER})) {
return s;
}
return WeaselJson_AGAIN;
}
if (!self->numDfa.accept()) [[unlikely]] {
return WeaselJson_REJECT;
}
self->flushNumber(true, buf);
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
case 't':
++buf;
self->pop();
if (bufEnd - buf >= 3) {
// Enough input to compare the rest of the literal at once.
if (memcmp(buf, "rue", 3) == 0) {
self->callbacks->on_true_literal(self->userdata);
buf += 3;
} else [[unlikely]] {
return WeaselJson_REJECT;
}
} else {
// Too little input left: match the remaining characters one by one.
if (auto s = self->push({T_R, T_U, N_TRUE})) {
return s;
}
}
break;
case 'f':
++buf;
self->pop();
if (bufEnd - buf >= 4) {
if (memcmp(buf, "alse", 4) == 0) {
self->callbacks->on_false_literal(self->userdata);
buf += 4;
} else [[unlikely]] {
return WeaselJson_REJECT;
}
} else {
if (auto s = self->push({T_A, T_L, T_S, N_FALSE})) {
return s;
}
}
break;
case 'n':
++buf;
self->pop();
if (bufEnd - buf >= 3) {
if (memcmp(buf, "ull", 3) == 0) {
self->callbacks->on_null_literal(self->userdata);
buf += 3;
} else [[unlikely]] {
return WeaselJson_REJECT;
}
} else {
if (auto s = self->push({T_U, T_L, N_NULL})) {
return s;
}
}
break;
default:
[[unlikely]] return WeaselJson_REJECT;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Runs right after '{': expects either the closing '}' of an empty object or
// the opening quote of the first key. Requires at least one byte of input.
inline PRESERVE_NONE WeaselJsonStatus n_object2(Parser3 *self, char *buf,
                                                char *bufEnd) {
  assert(bufEnd - buf != 0);
  for (; tables.whitespace[uint8_t(*buf)];) {
    if (++buf == bufEnd) {
      return WeaselJson_AGAIN;
    }
  }
  if (*buf == '}') {
    // Empty object.
    ++buf;
    self->pop();
    self->callbacks->on_end_object(self->userdata);
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  }
  if (*buf != '"') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  // First key: its data starts right after the opening quote.
  ++buf;
  self->dataBegin = self->writeBuf = buf;
  self->pop();
  self->strDfa.reset();
  if (auto pushed = self->push({N_STRING2, T_COLON, N_VALUE, N_OBJECT3})) {
    return pushed;
  }
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Runs after a member of an object: expects either '}' to close the object or
// ',' followed by another key/value pair. Requires at least one byte of input.
inline PRESERVE_NONE WeaselJsonStatus n_object3(Parser3 *self, char *buf,
                                                char *bufEnd) {
  assert(bufEnd - buf != 0);
  for (; tables.whitespace[uint8_t(*buf)];) {
    if (++buf == bufEnd) {
      return WeaselJson_AGAIN;
    }
  }
  if (*buf == '}') {
    ++buf;
    self->pop();
    self->callbacks->on_end_object(self->userdata);
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  }
  if (*buf != ',') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  // Another member follows: key, colon, value, then back here.
  ++buf;
  self->pop();
  if (auto pushed = self->push({N_STRING, T_COLON, N_VALUE, N_OBJECT3})) {
    return pushed;
  }
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Runs right after '[': expects either the closing ']' of an empty array or
// the first element. Requires at least one byte of input.
inline PRESERVE_NONE WeaselJsonStatus n_array2(Parser3 *self, char *buf,
                                               char *bufEnd) {
  assert(bufEnd - buf != 0);
  for (; tables.whitespace[uint8_t(*buf)];) {
    if (++buf == bufEnd) {
      return WeaselJson_AGAIN;
    }
  }
  if (*buf == ']') {
    // Empty array.
    ++buf;
    self->pop();
    self->callbacks->on_end_array(self->userdata);
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  }
  // Anything else is left for n_value to parse as the first element.
  self->pop();
  if (auto pushed = self->push({N_VALUE, N_ARRAY3})) {
    return pushed;
  }
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Runs after an element of an array: expects either ']' to close the array or
// ',' followed by another element. Requires at least one byte of input.
inline PRESERVE_NONE WeaselJsonStatus n_array3(Parser3 *self, char *buf,
                                               char *bufEnd) {
  assert(bufEnd - buf != 0);
  for (; tables.whitespace[uint8_t(*buf)];) {
    if (++buf == bufEnd) {
      return WeaselJson_AGAIN;
    }
  }
  if (*buf == ']') {
    ++buf;
    self->pop();
    self->callbacks->on_end_array(self->userdata);
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  }
  if (*buf != ',') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  // Another element follows, then back here.
  ++buf;
  self->pop();
  if (auto pushed = self->push({N_VALUE, N_ARRAY3})) {
    return pushed;
  }
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Expects the opening quote of a string (after optional whitespace) and hands
// the contents over to n_string2. Requires at least one byte of input.
inline PRESERVE_NONE WeaselJsonStatus n_string(Parser3 *self, char *buf,
                                               char *bufEnd) {
  assert(bufEnd - buf != 0);
  for (; tables.whitespace[uint8_t(*buf)];) {
    if (++buf == bufEnd) {
      return WeaselJson_AGAIN;
    }
  }
  const bool opensString = *buf == '"';
  if (!opensString) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  // String data starts right after the opening quote.
  ++buf;
  self->dataBegin = self->writeBuf = buf;
  self->pop();
  self->strDfa.reset();
  if (auto pushed = self->push({N_STRING2})) {
    return pushed;
  }
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Scans the contents of a string (everything after the opening quote, or
// after a completed escape sequence). Ends the string on '"', switches to
// escape handling on '\\', and rejects any other byte that stops the scan.
inline PRESERVE_NONE WeaselJsonStatus n_string2(Parser3 *self, char *buf,
                                                char *bufEnd) {
  // Any non-OK status (more input needed, or invalid utf8) is passed through.
  if (auto s = scan_string(self, buf, bufEnd)) {
    return s;
  }
  switch (*buf) {
  case '"':
    // Closing quote: report the end of the string.
    self->flushString(true);
    ++buf;
    self->pop();
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  case '\\':
    ++buf;
    self->pop();
    if (auto s = self->push({N_STRING_FOLLOWING_ESCAPE})) {
      return s;
    }
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  default:
    // The scan also stops, with the dfa in its accept state, on raw control
    // characters (< 0x20) and on bytes that cannot start a utf8 sequence.
    // Malformed input therefore does reach this branch, so it must reject
    // rather than be declared unreachable. n_value handles the same
    // situation the same way.
    [[unlikely]] return WeaselJson_REJECT;
  }
}
// Handles the character directly after a backslash inside a string. Simple
// escapes are translated through tables.unescape and written in place; 'u'
// starts a four-digit hex escape. Either way scanning resumes in n_string2
// afterwards. Any other character is rejected.
inline PRESERVE_NONE WeaselJsonStatus n_string_following_escape(Parser3 *self,
                                                                char *buf,
                                                                char *bufEnd) {
  const char escaped = *buf;
  if (escaped == 'u') {
    // \uXXXX: collect four hex digits, the last one emits the codepoint.
    ++buf;
    self->utf8Codepoint = 0;
    self->pop();
    self->strDfa.reset();
    if (auto pushed =
            self->push({T_HEX, T_HEX, T_HEX, T_HEX2, N_STRING2})) {
      return pushed;
    }
    MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
  }
  switch (escaped) {
  case '"':
  case '\\':
  case '/':
  case 'b':
  case 'f':
  case 'n':
  case 'r':
  case 't':
    break;
  default:
    [[unlikely]] return WeaselJson_REJECT;
  }
  // Single-character escape: store the unescaped byte and keep scanning.
  *self->writeBuf++ = tables.unescape[uint8_t(escaped)];
  ++buf;
  self->pop();
  self->strDfa.reset();
  if (auto pushed = self->push({N_STRING2})) {
    return pushed;
  }
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Matches exactly one decimal digit ('0'-'9') and continues with the next
// symbol; anything else is rejected.
inline PRESERVE_NONE WeaselJsonStatus t_digit(Parser3 *self, char *buf,
                                              char *bufEnd) {
  if (*buf < '0' || *buf > '9') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Matches exactly one non-zero decimal digit ('1'-'9') and continues with the
// next symbol; anything else is rejected.
inline PRESERVE_NONE WeaselJsonStatus t_onenine(Parser3 *self, char *buf,
                                                char *bufEnd) {
  if (*buf < '1' || *buf > '9') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Consumes one hex digit of a \uXXXX escape and shifts it into
// self->utf8Codepoint. Rejects anything that is not a hex digit.
inline PRESERVE_NONE WeaselJsonStatus t_hex(Parser3 *self, char *buf,
                                            char *bufEnd) {
  // Make room for the new nibble before looking at the input.
  self->utf8Codepoint <<= 4;
  const char digit = *buf;
  int nibble;
  if (digit >= '0' && digit <= '9') {
    nibble = digit - '0';
  } else if (digit >= 'a' && digit <= 'f') {
    nibble = 10 + digit - 'a';
  } else if (digit >= 'A' && digit <= 'F') {
    nibble = 10 + digit - 'A';
  } else [[unlikely]] {
    return WeaselJson_REJECT;
  }
  self->utf8Codepoint |= nibble;
  ++buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Consumes the last hex digit of a \uXXXX escape and emits the resulting
// codepoint as utf-8. A value in the utf-16 surrogate range is not emitted;
// it is remembered and a second \uXXXX escape is required instead (finished
// by t_hex3). Rejects anything that is not a hex digit.
inline PRESERVE_NONE WeaselJsonStatus t_hex2(Parser3 *self, char *buf,
char *bufEnd) {
self->utf8Codepoint <<= 4;
if (('0' <= *buf && *buf <= '9')) {
self->utf8Codepoint |= *buf - '0';
} else if ('a' <= *buf && *buf <= 'f') {
self->utf8Codepoint |= 10 + *buf - 'a';
} else if ('A' <= *buf && *buf <= 'F') {
self->utf8Codepoint |= 10 + *buf - 'A';
} else [[unlikely]] {
return WeaselJson_REJECT;
}
++buf;
// Write codepoint in utf-8 if there's room in the user provided buffer. If
// there's not room, flush, write into a temp buffer, and flush again.
char tmp[3];
if (self->utf8Codepoint < 0x80) {
// One byte: at least the hex digit just consumed is available to overwrite.
assert(buf - self->writeBuf >= 1);
*self->writeBuf++ = self->utf8Codepoint;
} else if (self->utf8Codepoint < 0x800) {
// Two bytes. Room is the gap between the write position and the read
// position.
bool useTmp = buf - self->writeBuf < 2;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
}
// w aliases whichever pointer is being written through, so the += below
// advances self->writeBuf only when writing in place.
auto &w = useTmp ? p : self->writeBuf;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00011111 & self->utf8Codepoint) | 0b11000000;
w += 2;
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 2, false);
}
} else {
assert(self->utf8Codepoint < 0x10000);
if (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff) {
// utf-16 surrogate
self->utf16Surrogate = self->utf8Codepoint;
self->utf8Codepoint = 0;
self->pop();
// Require a following \uXXXX escape; its last digit is handled by t_hex3.
if (auto s =
self->push({T_BACKSLASH, T_U2, T_HEX, T_HEX, T_HEX, T_HEX3})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Three bytes, same in-place / temp buffer choice as above.
bool useTmp = buf - self->writeBuf < 3;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
}
auto &w = useTmp ? p : self->writeBuf;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00001111 & self->utf8Codepoint) | 0b11100000;
w += 3;
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 3, false);
}
}
self->pop();
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Consumes the last hex digit of the second \uXXXX escape of a utf-16
// surrogate pair, combines it with the surrogate stored by t_hex2, and emits
// the resulting codepoint as four bytes of utf-8. Rejects non-hex input, a
// second half outside 0xdc00-0xdfff, and combined values above 0x10FFFF.
inline PRESERVE_NONE WeaselJsonStatus t_hex3(Parser3 *self, char *buf,
char *bufEnd) {
self->utf8Codepoint <<= 4;
if (('0' <= *buf && *buf <= '9')) {
self->utf8Codepoint |= *buf - '0';
} else if ('a' <= *buf && *buf <= 'f') {
self->utf8Codepoint |= 10 + *buf - 'a';
} else if ('A' <= *buf && *buf <= 'F') {
self->utf8Codepoint |= 10 + *buf - 'A';
} else [[unlikely]] {
return WeaselJson_REJECT;
}
++buf;
if (!(0xdc00 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff))
[[unlikely]] {
return WeaselJson_REJECT;
}
// Decode utf16 surrogate pair
self->utf8Codepoint = 0x10000 + (self->utf16Surrogate - 0xd800) * 0x400 +
(self->utf8Codepoint - 0xdc00);
// Write codepoint in utf-8 if there's room in the user provided buffer. If
// there's not room, flush, write into a temp buffer, and flush again.
char tmp[4];
assert(self->utf8Codepoint >= 0x10000);
// The stored first half may be any value in 0xd800-0xdfff; one in
// 0xdc00-0xdfff makes the sum exceed 0x10FFFF and is rejected here.
if (self->utf8Codepoint > 0x10FFFF) [[unlikely]] {
return WeaselJson_REJECT;
}
bool useTmp = buf - self->writeBuf < 4;
char *p = tmp;
if (useTmp) [[unlikely]] {
self->flushString(false);
}
// w aliases whichever pointer is being written through, so the += below
// advances self->writeBuf only when writing in place.
auto &w = useTmp ? p : self->writeBuf;
w[3] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[2] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[1] = (0b00111111 & self->utf8Codepoint) | 0b10000000;
self->utf8Codepoint >>= 6;
w[0] = (0b00000111 & self->utf8Codepoint) | 0b11110000;
w += 4;
if (useTmp) [[unlikely]] {
self->callbacks->on_string_data(self->userdata, tmp, 4, false);
}
self->pop();
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Matches the final 'e' of the literal true and reports the literal through
// on_true_literal; anything else is rejected.
inline PRESERVE_NONE WeaselJsonStatus n_true(Parser3 *self, char *buf,
                                             char *bufEnd) {
  if (*buf != 'e') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++buf;
  self->pop();
  self->callbacks->on_true_literal(self->userdata);
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Matches the final 'e' of the literal false and reports the literal through
// on_false_literal; anything else is rejected.
inline PRESERVE_NONE WeaselJsonStatus n_false(Parser3 *self, char *buf,
                                              char *bufEnd) {
  if (*buf != 'e') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++buf;
  self->pop();
  self->callbacks->on_false_literal(self->userdata);
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Matches the final 'l' of the literal null and reports the literal through
// on_null_literal; anything else is rejected.
inline PRESERVE_NONE WeaselJsonStatus n_null(Parser3 *self, char *buf,
                                             char *bufEnd) {
  if (*buf != 'l') [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++buf;
  self->pop();
  self->callbacks->on_null_literal(self->userdata);
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Matches exactly the character kChar and continues with the next symbol;
// anything else is rejected. With kSkipWhitespace set, leading whitespace is
// skipped first (returning WeaselJson_AGAIN if the input ends inside it).
template <char kChar, bool kSkipWhitespace = false>
inline PRESERVE_NONE WeaselJsonStatus singleChar(Parser3 *self, char *buf,
                                                 char *bufEnd) {
  if constexpr (kSkipWhitespace) {
    assert(bufEnd - buf != 0);
    for (; tables.whitespace[uint8_t(*buf)];) {
      if (++buf == bufEnd) {
        return WeaselJson_AGAIN;
      }
    }
  }
  if (*buf != kChar) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  ++buf;
  self->pop();
  MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Bottom of the stack: the document is finished. Any remaining input is an
// error. Otherwise the result is WeaselJson_OK once the input has been marked
// complete and WeaselJson_AGAIN until then. The symbol is never popped.
inline PRESERVE_NONE WeaselJsonStatus t_eof(Parser3 *self, char *buf,
                                            char *bufEnd) {
  const bool trailingInput = bufEnd - buf > 0;
  if (trailingInput) [[unlikely]] {
    return WeaselJson_REJECT;
  }
  if (self->complete) {
    return WeaselJson_OK;
  }
  return WeaselJson_AGAIN;
}
// Pops the current symbol, reports the end of the pending number data
// (everything from dataBegin up to buf), and continues with the next symbol.
// Consumes no input. Not registered in the continuation table in this file.
inline PRESERVE_NONE WeaselJsonStatus t_end_number(Parser3 *self, char *buf,
char *bufEnd) {
self->pop();
self->flushNumber(true, buf);
MUSTTAIL return Parser3::keepGoing(self, buf, bufEnd);
}
// Per-symbol lookup tables, built at compile time and indexed by Symbol:
// the continuation that handles the symbol, a printable name for it, and
// whether its continuation may be invoked with no input left.
constexpr inline struct ContinuationTable {
constexpr ContinuationTable() {
// Defaults
// Every slot starts out with a handler that prints "unimplemented" and
// rejects, so a symbol without an explicit entry below fails loudly.
for (int i = 0; i < N_SYMBOL_COUNT; ++i) {
continuations[i] = +[](struct Parser3 *, char *, char *) PRESERVE_NONE {
printf("unimplemented\n");
return WeaselJson_REJECT;
};
}
continuations[N_VALUE] = n_value;
continuations[N_OBJECT2] = n_object2;
continuations[N_OBJECT3] = n_object3;
continuations[N_ARRAY2] = n_array2;
continuations[N_ARRAY3] = n_array3;
continuations[N_STRING] = n_string;
continuations[N_STRING2] = n_string2;
continuations[N_STRING_FOLLOWING_ESCAPE] = n_string_following_escape;
continuations[N_WHITESPACE] = n_whitespace;
continuations[N_NUMBER] = n_number;
continuations[N_TRUE] = n_true;
continuations[N_FALSE] = n_false;
continuations[N_NULL] = n_null;
continuations[T_R] = singleChar<'r'>;
continuations[T_U] = singleChar<'u'>;
continuations[T_U2] = singleChar<'u'>;
continuations[T_A] = singleChar<'a'>;
continuations[T_L] = singleChar<'l'>;
continuations[T_S] = singleChar<'s'>;
// The colon is the only single character that may be preceded by whitespace.
continuations[T_COLON] = singleChar<':', true>;
continuations[T_HEX] = t_hex;
continuations[T_HEX2] = t_hex2;
continuations[T_HEX3] = t_hex3;
continuations[T_EOF] = t_eof;
continuations[T_BACKSLASH] = singleChar<'\\'>;
// Printable names, one per symbol.
symbolNames[N_VALUE] = "n_value";
symbolNames[N_OBJECT2] = "n_object2";
symbolNames[N_OBJECT3] = "n_object3";
symbolNames[N_ARRAY2] = "n_array2";
symbolNames[N_ARRAY3] = "n_array3";
symbolNames[N_STRING] = "n_string";
symbolNames[N_STRING2] = "n_string2";
symbolNames[N_STRING_FOLLOWING_ESCAPE] = "n_string_following_escape";
symbolNames[N_WHITESPACE] = "n_whitespace";
symbolNames[N_NUMBER] = "n_number";
symbolNames[N_TRUE] = "n_true";
symbolNames[N_FALSE] = "n_false";
symbolNames[N_NULL] = "n_null";
symbolNames[T_R] = "singleChar<'r'>";
symbolNames[T_U] = "singleChar<'u'>";
symbolNames[T_U2] = "singleChar<'u'> (in string)";
symbolNames[T_A] = "singleChar<'a'>";
symbolNames[T_L] = "singleChar<'l'>";
symbolNames[T_S] = "singleChar<'s'>";
symbolNames[T_COLON] = "singleChar<':'>";
symbolNames[T_HEX] = "t_hex";
symbolNames[T_HEX2] = "t_hex2";
symbolNames[T_HEX3] = "t_hex3";
symbolNames[T_EOF] = "t_eof";
symbolNames[T_BACKSLASH] = "singleChar<'\\'>";
// All others can assume that there's at least one byte when they're called
acceptsEmptyString[N_NUMBER] = true;
acceptsEmptyString[N_WHITESPACE] = true;
acceptsEmptyString[T_EOF] = true;
}
// Handler to run when the symbol is on top of the stack.
Continuation continuations[N_SYMBOL_COUNT]{};
// Printable name of the symbol.
const char *symbolNames[N_SYMBOL_COUNT]{};
// True if the handler may be called with an empty input range.
bool acceptsEmptyString[N_SYMBOL_COUNT]{};
} symbolTables;
// Runs the continuation of the symbol on top of the stack.
//
// When the current chunk is used up there are two cases. If more input may
// follow, pending string data is flushed (for the symbols that occur inside a
// string) and WeaselJson_AGAIN is returned. If the input is complete, only
// symbols that accept an empty input may still run; all others reject.
inline PRESERVE_NONE WeaselJsonStatus Parser3::keepGoing(Parser3 *self,
                                                         char *buf,
                                                         char *bufEnd) {
  const bool exhausted = bufEnd - buf == 0;
  if (exhausted && !self->complete) {
    switch (self->top()) {
    // Inside a string: hand over what has been unescaped so far.
    case N_STRING2:
    case N_STRING_FOLLOWING_ESCAPE:
    case T_HEX:
    case T_HEX2:
    case T_HEX3:
    case T_BACKSLASH:
    case T_U2:
      self->flushString(false);
      break;
    // N_STRING: the beginning of the string is in the future in this state.
    // There's no data to flush yet. The remaining symbols never have pending
    // string data.
    case N_STRING:
    case N_VALUE:
    case N_OBJECT2:
    case N_OBJECT3:
    case N_ARRAY2:
    case N_ARRAY3:
    case N_WHITESPACE:
    case N_NUMBER:
    case N_TRUE:
    case N_FALSE:
    case N_NULL:
    case T_R:
    case T_U:
    case T_A:
    case T_L:
    case T_S:
    case T_COLON:
    case T_EOF:
    case N_SYMBOL_COUNT:
      break;
    default:
      __builtin_unreachable();
    }
    return WeaselJson_AGAIN;
  }
  if (exhausted && !symbolTables.acceptsEmptyString[self->top()])
      [[unlikely]] {
    return WeaselJson_REJECT;
  }
  MUSTTAIL return symbolTables.continuations[self->top()](self, buf, bufEnd);
}
} // namespace parser3