// weaseljson/src/test.cpp

#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstring>

#include <initializer_list>
#include <string>
#include <utility>

#include <doctest.h>
#include <nanobench.h>
#include <simdjson.h>

// This is the JSON grammar in McKeeman Form.

// json
//    element

// value
//    object
//    array
//    string
//    number
//    "true"
//    "false"
//    "null"

// object
//     '{' ws '}'
//     '{' members '}'

// members
//     member
//     member ',' members

// member
//     ws string ws ':' element

// array
//     '[' ws ']'
//     '[' elements ']'

// elements
//     element
//     element ',' elements

// element
//     ws value ws

// string
//     '"' characters '"'

// characters
//     ""
//     character characters

// character
//     '0020' . '10FFFF' - '"' - '\'
//     '\' escape

// escape
//     '"'
//     '\'
//     '/'
//     'b'
//     'f'
//     'n'
//     'r'
//     't'
//     'u' hex hex hex hex

// hex
//     digit
//     'A' . 'F'
//     'a' . 'f'

// number
//     integer fraction exponent

// integer
//     digit
//     onenine digits
//     '-' digit
//     '-' onenine digits

// digits
//     digit
//     digit digits

// digit
//     '0'
//     onenine

// onenine
//     '1' . '9'

// fraction
//     ""
//     '.' digits

// exponent
//     ""
//     'E' sign digits
//     'e' sign digits

// sign
//     ""
//     '+'
//     '-'

// ws
//     ""
//     '0020' ws
//     '000A' ws
//     '000D' ws
//     '0009' ws

// Event sinks invoked by the parsers while they walk a document. Every
// callback receives the opaque `data` pointer that was handed to the parser.
// All members default to a function that does nothing, so a user only needs
// to assign the events they care about.
struct Callbacks {
private:
  // Events that carry no payload.
  using Event = void (*)(void *data);
  // Events that deliver `len` bytes of raw text starting at `buf`. The length
  // is passed explicitly; do not rely on a terminating NUL.
  using DataEvent = void (*)(void *data, const char *buf, int len);

  // Default targets: ignore the event.
  static void noop(void *) {}
  static void noop(void *, const char *, int) {}

public:
  // Objects
  Event on_begin_object = noop;
  Event on_end_object = noop;
  // Strings: begin, then the raw text, then end
  Event on_begin_string = noop;
  DataEvent on_string_data = noop;
  Event on_end_string = noop;
  // Arrays
  Event on_begin_array = noop;
  Event on_end_array = noop;
  // Numbers: begin, then the raw text, then end
  Event on_begin_number = noop;
  DataEvent on_number_data = noop;
  Event on_end_number = noop;
  // Literals
  Event on_true_literal = noop;
  Event on_false_literal = noop;
  Event on_null_literal = noop;
};

// Terminals and nonterminals. These are the entries on the stack of the
// pushdown automaton. The enumerator values double as indices into
// per-symbol tables, so they must stay dense and start at zero.
enum Symbol : int8_t {
  // Terminals
  T_STRING, // Multibyte!
  T_COLON,
  // Nonterminals
  N_VALUE,
  N_ARRAY_VALUE_OR_END,
  N_OBJECT_VALUE_OR_END,
  N_ARRAY_MAYBE_CONTINUE,
  N_OBJECT_MAYBE_CONTINUE,
  N_PAST_END, // Sentinel: number of real symbols. Must stay last.
};

// Printable name of each symbol, indexed by Symbol value. Used for debug
// output only. N_PAST_END is a count, not a symbol, so it has no entry.
const char *symbolNames[] = {
    "T_STRING",
    "T_COLON",
    "N_VALUE",
    "N_ARRAY_VALUE_OR_END",
    "N_OBJECT_VALUE_OR_END",
    "N_ARRAY_MAYBE_CONTINUE",
    "N_OBJECT_MAYBE_CONTINUE",
};

// Adding a symbol without naming it would make debug printing read past the
// end of the table; fail the build instead.
static_assert(sizeof(symbolNames) / sizeof(symbolNames[0]) ==
                  static_cast<unsigned>(N_PAST_END),
              "symbolNames needs exactly one entry per Symbol before N_PAST_END");

// Lookup tables built at compile time. `whitespace` is indexed by byte value
// and is true exactly for the four characters the JSON grammar allows as
// whitespace: space, line feed, carriage return and horizontal tab.
constexpr static struct Tables {
  constexpr Tables() {
    constexpr char kJsonWhitespace[] = {' ', '\n', '\r', '\t'};
    for (const char c : kJsonWhitespace) {
      whitespace[static_cast<unsigned char>(c)] = true;
    }
  }
  // Value-initialized to all false before the constructor body marks entries.
  alignas(16) bool whitespace[256]{};
} tables;

namespace {

// Straightforward recursive descent parser. Simplifications: string escapes
// are not interpreted (a string ends at the next '"'), and a number is any
// non-empty run of [0-9.]. Recursion depth follows the nesting depth of the
// document, so deeply nested json documents may overflow the call stack.
struct Parser1 {
  // buf/len: the document to parse. Pointers into this buffer are handed to
  //          the data callbacks, so it must outlive the call to parse().
  // callbacks: event sinks to invoke.
  // data: opaque pointer forwarded as the first argument of every callback.
  Parser1(char *buf, int len, const Callbacks *callbacks, void *data)
      : buf(buf), bufEnd(buf + len), callbacks(callbacks), data(data) {}

  // Returns false to reject. The input is accepted only if it consists of
  // exactly one element (optionally surrounded by whitespace); anything left
  // over after that element is an error. Callbacks that fired before a
  // rejection are not undone.
  [[nodiscard]] bool parse() { return parse_element() && buf == bufEnd; }

  Parser1(Parser1 const &) = delete;
  Parser1 &operator=(Parser1 const &) = delete;
  Parser1(Parser1 &&) = delete;
  Parser1 &operator=(Parser1 &&) = delete;

private:
  char *buf;    // Next unread byte
  char *bufEnd; // One past the last byte of the input
  const Callbacks *const callbacks;
  void *const data;

  // Number of unread bytes.
  int len() const { return static_cast<int>(bufEnd - buf); }

  // Helpers

  // True for the characters this simplified parser treats as part of a number.
  static bool isNumberChar(char c) {
    return ('0' <= c && c <= '9') || c == '.';
  }

  void maybeSkipWs() {
    // Index through uint8_t: plain char may be signed, and a byte >= 0x80
    // would otherwise turn into a negative (out-of-bounds) index.
    while (buf != bufEnd && tables.whitespace[static_cast<uint8_t>(*buf)]) {
      ++buf;
    }
  }

  // Consumes `literal` if the unread input starts with it. On a mismatch the
  // input position is left untouched and false is returned.
  bool parseLiteral(const char *literal) {
    const int litLen = static_cast<int>(strlen(literal));
    if (len() < litLen || memcmp(buf, literal, litLen) != 0) {
      return false;
    }
    buf += litLen;
    return true;
  }

  // functions corresponding to productions

  // element: ws value ws. The kind of value is decided by its first byte.
  bool parse_element() {
    maybeSkipWs();
    if (len() == 0) {
      return false;
    }
    bool ok = false;
    switch (*buf) {
    case '{':
      ok = parse_object();
      break;
    case '[':
      ok = parse_array();
      break;
    case '"':
      ok = parse_string();
      break;
    case 't':
      ok = parse_true();
      break;
    case 'f':
      ok = parse_false();
      break;
    case 'n':
      ok = parse_null();
      break;
    default:
      ok = parse_number();
      break;
    }
    if (!ok) {
      return false;
    }
    maybeSkipWs();
    return true;
  }

  // object: '{' ws '}' | '{' members '}'
  bool parse_object() {
    if (!parseLiteral("{")) {
      return false;
    }
    callbacks->on_begin_object(data);
    maybeSkipWs();
    if (len() == 0) {
      return false;
    }
    // A malformed member list must reject the whole object.
    if (*buf != '}' && !parse_members()) {
      return false;
    }
    if (!parseLiteral("}")) {
      return false;
    }
    callbacks->on_end_object(data);
    return true;
  }

  // members: member | member ',' members (the recursion is a loop here)
  bool parse_members() {
    for (;;) {
      if (!parse_member()) {
        return false;
      }
      // A member list is always followed by '}', so running out of input
      // here means the object was never closed.
      if (len() == 0) {
        return false;
      }
      if (!parseLiteral(",")) {
        return true;
      }
    }
  }

  // member: ws string ws ':' element
  bool parse_member() {
    maybeSkipWs();
    if (!parse_string()) {
      return false;
    }
    maybeSkipWs();
    if (!parseLiteral(":")) {
      return false;
    }
    return parse_element();
  }

  // array: '[' ws ']' | '[' elements ']'
  bool parse_array() {
    if (!parseLiteral("[")) {
      return false;
    }
    callbacks->on_begin_array(data);
    maybeSkipWs();
    if (len() == 0) {
      return false;
    }
    if (*buf != ']' && !parse_elements()) {
      return false;
    }
    if (!parseLiteral("]")) {
      return false;
    }
    callbacks->on_end_array(data);
    return true;
  }

  // elements: element | element ',' elements (the recursion is a loop here)
  bool parse_elements() {
    for (;;) {
      if (!parse_element()) {
        return false;
      }
      // An element list is always followed by ']', so running out of input
      // here means the array was never closed.
      if (len() == 0) {
        return false;
      }
      if (!parseLiteral(",")) {
        return true;
      }
    }
  }

  // string: '"' characters '"'. Escapes are not interpreted: the string ends
  // at the first '"' after the opening one.
  bool parse_string() {
    // Check for the opening quote first so that on_begin_string only fires
    // when a string actually starts here.
    if (!parseLiteral("\"")) {
      return false;
    }
    callbacks->on_begin_string(data);
    auto *closing = static_cast<char *>(memchr(buf, '"', len()));
    if (closing == nullptr) {
      return false;
    }
    callbacks->on_string_data(data, buf, static_cast<int>(closing - buf));
    buf = closing + 1; // Skip the contents and the closing quote
    callbacks->on_end_string(data);
    return true;
  }

  // number: a non-empty run of [0-9.]. The run may extend to the very end of
  // the input (e.g. a document that is just "123").
  bool parse_number() {
    char *const start = buf;
    while (buf != bufEnd && isNumberChar(*buf)) {
      ++buf;
    }
    if (buf == start) {
      return false;
    }
    callbacks->on_begin_number(data);
    callbacks->on_number_data(data, start, static_cast<int>(buf - start));
    callbacks->on_end_number(data);
    return true;
  }

  bool parse_true() {
    if (!parseLiteral("true")) {
      return false;
    }
    callbacks->on_true_literal(data);
    return true;
  }

  bool parse_false() {
    if (!parseLiteral("false")) {
      return false;
    }
    callbacks->on_false_literal(data);
    return true;
  }

  bool parse_null() {
    if (!parseLiteral("null")) {
      return false;
    }
    callbacks->on_null_literal(data);
    return true;
  }
};

// Compilers that do not provide __has_attribute get a fallback that reports
// every attribute as unsupported, so the check below is always well-formed.
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif

// MUSTTAIL is placed in front of `return f(...)` statements to request that
// the call be emitted as a tail call. Where the attribute is unavailable it
// expands to nothing and the statement is an ordinary return; whether the
// call is then still optimized into a tail call is up to the compiler.
#if __has_attribute(musttail)
#define MUSTTAIL __attribute__((musttail))
#else
#define MUSTTAIL
#endif

// Table-based ll(1) parser that doesn't handle escaping and treats numbers as
// [0-9.]+. Could be adapted to have a streaming interface. Uses O(1) memory.
struct Parser2 {
  Parser2(char *buf, int len, const Callbacks *callbacks, void *data)
      : buf(buf), bufEnd(buf + len), callbacks(callbacks), data(data) {}

  // Returns false to reject
  [[nodiscard]] bool parse() {
    if (!push({N_VALUE})) {
      return false;
    }
    return keepGoing(this);
  }

  Parser2(Parser2 const &) = delete;
  Parser2 &operator=(Parser2 const &) = delete;
  Parser2(Parser2 &&) = delete;
  Parser2 &operator=(Parser2 &&) = delete;

  static constexpr int kMaxStackSize = 1 << 10;

private:
  // Helpers
  void maybeSkipWs() {
    while (buf != bufEnd && tables.whitespace[*buf]) {
      ++buf;
    }
  }
  bool parseLiteral(const char *literal) {
    const int litLen = strlen(literal);
    if (len() < litLen) {
      return false;
    }
    if (memcmp(buf, literal, litLen) == 0) {
      buf += litLen;
      return true;
    }
    return false;
  }
  bool parse_number() {
    char *const bufBefore = buf;
    if (len() == 0 || !('0' <= *buf && *buf <= '9' || (*buf == '.'))) {
      return false;
    }
    callbacks->on_begin_number(data);
    ++buf;
    for (;;) {
      if ('0' <= *buf && *buf <= '9' || (*buf == '.')) {
        ++buf;
      } else {
        break;
      }
    }
    callbacks->on_number_data(data, bufBefore, buf - bufBefore);
    callbacks->on_end_number(data);
    return true;
  }
  bool parse_string() {
    if (!parseLiteral("\"")) {
      return false;
    }
    callbacks->on_begin_string(data);
    auto *result = (char *)memchr(buf, '"', len());
    if (result == nullptr) {
      return false;
    }
    int stringLen = result - buf;
    callbacks->on_string_data(data, buf, stringLen);
    buf += stringLen;
    if (!parseLiteral("\"")) {
      return false;
    }
    callbacks->on_end_string(data);
    return true;
  }

  typedef bool (*continuation)(Parser2 *);

  [[maybe_unused]] void debugPrint() {
    for (int i = 0; i < stackPtr - stack; ++i) {
      printf("%s ", symbolNames[stack[i]]);
    }
    printf("\n");
  }

  static bool keepGoing(Parser2 *self) {
    // self->debugPrint();
    if (self->empty()) {
      return true;
    }
    auto top = *(self->stackPtr - 1);
    self->maybeSkipWs();
    MUSTTAIL return table[top](self);
  }

  static bool string(Parser2 *self) {
    if (!self->parse_string()) {
      return false;
    }
    self->pop();
    MUSTTAIL return keepGoing(self);
  }
  static bool colon(Parser2 *self) {
    if (!self->parseLiteral(":")) {
      return false;
    }
    self->pop();
    MUSTTAIL return keepGoing(self);
  }
  static bool value(Parser2 *self) {
    if (self->parse_string()) {
      self->pop();
      MUSTTAIL return keepGoing(self);
    } else if (self->parse_number()) {
      self->pop();
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("{")) {
      self->pop();
      self->callbacks->on_begin_object(self->data);
      if (!self->push({N_OBJECT_VALUE_OR_END})) {
        return false;
      }
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("[")) {
      self->pop();
      self->callbacks->on_begin_array(self->data);
      if (!self->push({N_ARRAY_VALUE_OR_END})) {
        return false;
      }
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("true")) {
      self->pop();
      self->callbacks->on_true_literal(self->data);
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("false")) {
      self->pop();
      self->callbacks->on_false_literal(self->data);
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("null")) {
      self->pop();
      self->callbacks->on_null_literal(self->data);
      MUSTTAIL return keepGoing(self);
    }
    return false;
  }
  static bool arrayOrEnd(Parser2 *self) {
    if (self->parseLiteral("]")) {
      self->pop();
      self->callbacks->on_end_array(self->data);
      MUSTTAIL return keepGoing(self);
    } else {
      self->pop();
      if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) {
        return false;
      }
      MUSTTAIL return keepGoing(self);
    }
  }
  static bool objectOrEnd(Parser2 *self) {
    if (self->parseLiteral("}")) {
      self->pop();
      self->callbacks->on_end_object(self->data);
      MUSTTAIL return keepGoing(self);
    } else {
      self->pop();
      if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) {
        return false;
      }
      MUSTTAIL return keepGoing(self);
    }
    return false;
  }
  static bool arrayContinue(Parser2 *self) {
    if (self->parseLiteral(",")) {
      self->pop();
      if (!self->push({N_VALUE, N_ARRAY_MAYBE_CONTINUE})) {
        return false;
      }
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("]")) {
      self->pop();
      self->callbacks->on_end_array(self->data);
      MUSTTAIL return keepGoing(self);
    }
    return false;
  }
  static bool objectContinue(Parser2 *self) {
    if (self->parseLiteral(",")) {
      self->pop();
      if (!self->push({T_STRING, T_COLON, N_VALUE, N_OBJECT_MAYBE_CONTINUE})) {
        return false;
      }
      MUSTTAIL return keepGoing(self);
    } else if (self->parseLiteral("}")) {
      self->pop();
      self->callbacks->on_end_object(self->data);
      MUSTTAIL return keepGoing(self);
    }
    return false;
  }

  static constexpr continuation table[N_PAST_END] = {
      /*T_STRING*/ string,
      /*T_COLON*/ colon,
      /*N_VALUE*/ value,
      /*N_ARRAY_VALUE_OR_END*/ arrayOrEnd,
      /*N_OBJECT_VALUE_OR_END*/ objectOrEnd,
      /*N_ARRAY_MAYBE_CONTINUE*/ arrayContinue,
      /*N_OBJECT_MAYBE_CONTINUE*/ objectContinue,
  };

  char *buf;
  char *bufEnd;
  int len() const { return bufEnd - buf; }
  const Callbacks *const callbacks;
  void *const data;
  Symbol stack[kMaxStackSize];
  Symbol *stackPtr = stack;
  bool empty() const { return stackPtr == stack; }
  void pop() {
    assert(!empty());
    --stackPtr;
  }
  [[nodiscard]] bool push(std::initializer_list<Symbol> symbols) {
    if (stackPtr >= std::end(stack) - symbols.size()) [[unlikely]] {
      return false;
    }
    for (int i = symbols.size() - 1; i >= 0; --i) {
      *stackPtr++ = *(symbols.begin() + i);
    }
    return true;
  }
};

// Sample document shared by the parser tests and the benchmarks. It contains
// nested objects, an array, strings, a number and each of the three literals.
const std::string json = R"({
    "a number": 12345,
    "true": true,
    "false": false,
    "null": null,
    "glossary": {
        "title": "example glossary",
		"GlossDiv": {
            "title": "S",
			"GlossList": {
                "GlossEntry": {
                    "ID": "SGML",
					"SortAs": "SGML",
					"GlossTerm": "Standard Generalized Markup Language",
					"Acronym": "SGML",
					"Abbrev": "ISO 8879:1986",
					"GlossDef": {
                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
						"GlossSeeAlso": ["GML", "XML"]
                    },
					"GlossSee": "markup"
                }
            }
        }
    }
})";

// Builds a Callbacks whose every event writes its own name to stdout. The two
// data events additionally print the text they received, wrapped in
// backticks. The opaque data pointer is ignored.
Callbacks printCallbacks() {
  struct Echo {
    static void beginObject(void *) { puts("on_begin_object"); }
    static void endObject(void *) { puts("on_end_object"); }
    static void beginString(void *) { puts("on_begin_string"); }
    static void stringData(void *, const char *buf, int len) {
      printf("on_string_data `%.*s`\n", len, buf);
    }
    static void endString(void *) { puts("on_end_string"); }
    static void beginArray(void *) { puts("on_begin_array"); }
    static void endArray(void *) { puts("on_end_array"); }
    static void beginNumber(void *) { puts("on_begin_number"); }
    static void numberData(void *, const char *buf, int len) {
      printf("on_number_data `%.*s`\n", len, buf);
    }
    static void endNumber(void *) { puts("on_end_number"); }
    static void trueLiteral(void *) { puts("on_true_literal"); }
    static void falseLiteral(void *) { puts("on_false_literal"); }
    static void nullLiteral(void *) { puts("on_null_literal"); }
  };

  Callbacks echo;
  echo.on_begin_object = &Echo::beginObject;
  echo.on_end_object = &Echo::endObject;
  echo.on_begin_string = &Echo::beginString;
  echo.on_string_data = &Echo::stringData;
  echo.on_end_string = &Echo::endString;
  echo.on_begin_array = &Echo::beginArray;
  echo.on_end_array = &Echo::endArray;
  echo.on_begin_number = &Echo::beginNumber;
  echo.on_number_data = &Echo::numberData;
  echo.on_end_number = &Echo::endNumber;
  echo.on_true_literal = &Echo::trueLiteral;
  echo.on_false_literal = &Echo::falseLiteral;
  echo.on_null_literal = &Echo::nullLiteral;
  return echo;
}

} // namespace

// Parser1 must accept the sample document and a small object holding an empty
// array and an empty object. Every callback is echoed to stdout.
TEST_CASE("parser1") {
  const Callbacks echo = printCallbacks();
  for (const std::string &document :
       {json, std::string("{\"x\": [], \"y\": {}}")}) {
    // The parser takes a mutable buffer, so give it a private copy.
    std::string scratch = document;
    Parser1 parser(scratch.data(), scratch.length(), &echo, nullptr);
    CHECK(parser.parse());
  }
}

// Parser2 must accept the sample document and a small object holding an empty
// array and an empty object. Every callback is echoed to stdout.
TEST_CASE("parser2") {
  const Callbacks echo = printCallbacks();
  for (const std::string &document :
       {json, std::string("{\"x\": [], \"y\": {}}")}) {
    // The parser takes a mutable buffer, so give it a private copy.
    std::string scratch = document;
    Parser2 parser(scratch.data(), scratch.length(), &echo, nullptr);
    CHECK(parser.parse());
  }
}

// Throughput of Parser1 on the sample document, reported per input byte.
// Callbacks are left at their no-op defaults. The per-iteration copy of the
// document is inside the measured region.
TEST_CASE("bench1") {
  const Callbacks silent{};
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte");
  bench.run("parser1", [&bench, &silent] {
    std::string scratch = json;
    Parser1 parser(scratch.data(), scratch.length(), &silent, nullptr);
    bench.doNotOptimizeAway(parser.parse());
  });
}

// Throughput of Parser2 on the sample document, reported per input byte.
// Callbacks are left at their no-op defaults. The per-iteration copy of the
// document is inside the measured region.
TEST_CASE("bench2") {
  const Callbacks silent{};
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte");
  bench.run("parser2", [&bench, &silent] {
    std::string scratch = json;
    Parser2 parser(scratch.data(), scratch.length(), &silent, nullptr);
    bench.doNotOptimizeAway(parser.parse());
  });
}

// Reference point: simdjson's DOM parser on the same document. Building the
// padded copy and constructing the parser both happen inside the measured
// region.
TEST_CASE("bench3") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte");
  bench.run("parser3", [&bench] {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::dom::parser domParser;
    auto doc = domParser.parse(padded);
    bench.doNotOptimizeAway(doc);
  });
}

// Reference point: simdjson's on-demand parser on the same document. Building
// the padded copy and constructing the parser both happen inside the measured
// region.
// NOTE(review): only iterate() is called and the document is never traversed;
// presumably this measures less work than the other benchmarks. Confirm
// against the simdjson on-demand documentation.
TEST_CASE("bench4") {
  ankerl::nanobench::Bench bench;
  bench.batch(json.size()).unit("byte");
  bench.run("parser4", [&bench] {
    simdjson::padded_string padded(json.data(), json.size());
    simdjson::ondemand::parser lazyParser;
    auto doc = lazyParser.iterate(padded);
    bench.doNotOptimizeAway(doc);
  });
}