Validate utf8

This commit is contained in:
2025-05-18 17:23:22 -04:00
parent 452bbd3d9c
commit d279173482
6 changed files with 105 additions and 95 deletions

View File

@@ -1,9 +1,7 @@
#pragma once
#include "weaseljson.h"
#include <cstdint>
#include <cstdio>
#include <vector>
inline Callbacks printCallbacks() {
Callbacks result;
@@ -27,87 +25,6 @@ inline Callbacks printCallbacks() {
return result;
}
// Bookkeeping shared by the minifying callbacks. For every container that is
// currently open it records how many values have begun inside it, which is
// all that is needed to decide whether ',' or ':' precedes the next value.
struct MinifyState {
  // Not read or written by any code in this struct.
  bool isKey = false;

  // One entry per open container.
  struct Cursor {
    // Count of values that have already begun directly inside the container.
    int64_t index;
    // true when the container is an object, false when it is an array.
    bool isObject;
  };

  // Emits the separator (if any) that belongs in front of the value about to
  // start, then counts that value against the innermost open container.
  // Does nothing when no container is open.
  void on_begin_value() {
    if (stack.empty()) {
      return;
    }
    Cursor &top = stack.back();
    if (top.index > 0) {
      // Inside an object the odd positions take ':'; every other value that
      // is not the first one in its container takes ','.
      const bool needsColon = top.isObject && top.index % 2 == 1;
      if (needsColon) {
        printf(":");
      } else {
        printf(",");
      }
    }
    ++top.index;
  }

  // Open containers, innermost at the back.
  std::vector<Cursor> stack;
};
// Builds the callback table that re-emits a document on stdout without any
// insignificant whitespace. Every callback that needs state casts its
// user-data pointer to MinifyState*, so the caller must pass one.
inline Callbacks minifyCallbacks() {
  Callbacks cb;

  // Containers: emit the separator owed to the enclosing container, open a
  // fresh cursor, and print the bracket. Closing pops the cursor again.
  cb.on_begin_object = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
    minify->stack.push_back({0, true});
    printf("{");
  };
  cb.on_end_object = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->stack.pop_back();
    printf("}");
  };
  cb.on_begin_array = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
    minify->stack.push_back({0, false});
    printf("[");
  };
  cb.on_end_array = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->stack.pop_back();
    printf("]");
  };

  // Strings: the quotes are printed here; the payload is echoed as received.
  cb.on_begin_string = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
    printf("\"");
  };
  cb.on_string_data = +[](void *, const char *bytes, int count) {
    printf("%.*s", count, bytes);
  };
  cb.on_end_string = +[](void *) { printf("\""); };

  // Numbers: nothing is printed at the boundaries, only the digits themselves.
  cb.on_begin_number = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
  };
  cb.on_number_data = +[](void *, const char *bytes, int count) {
    printf("%.*s", count, bytes);
  };
  cb.on_end_number = +[](void *) {};

  // Literals.
  cb.on_true_literal = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
    printf("true");
  };
  cb.on_false_literal = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
    printf("false");
  };
  cb.on_null_literal = +[](void *userdata) {
    auto *minify = static_cast<MinifyState *>(userdata);
    minify->on_begin_value();
    printf("null");
  };

  return cb;
}
inline Callbacks noopCallbacks() {
Callbacks result;
result.on_begin_object = +[](void *) {};

View File

@@ -1,6 +1,9 @@
#include "callbacks.h"
#include "minify.h"
#include "parser3.h"
#include <simdjson.h>
std::pair<std::string, parser3::Status> runStreaming(std::string copy) {
MinifyState state;
auto c = minifyCallbacks();
@@ -41,7 +44,52 @@ void testStreaming(std::string const &json) {
}
}
void compareWithSimdjson(std::string const &json) {
parser3::Status ours;
{
auto copy = json;
auto c = noopCallbacks();
parser3::Parser3 parser3(&c, nullptr);
ours = parser3.parse(copy.data(), copy.size());
if (ours == parser3::S_AGAIN) {
ours = parser3.parse(nullptr, 0);
}
}
using namespace simdjson;
simdjson::padded_string my_padded_data(json.data(), json.size());
simdjson::dom::parser parser;
auto doc = parser.parse(my_padded_data);
auto theirs = doc.error();
if (ours == parser3::S_OVERFLOW || theirs == simdjson::DEPTH_ERROR) {
return;
}
if ((ours == parser3::S_OK) != (theirs == simdjson::SUCCESS)) {
if (json.starts_with("\xef\xbb\xbf")) {
// What to do with byte order mark?
return;
}
if (theirs == simdjson::NUMBER_ERROR || theirs == simdjson::BIGINT_ERROR) {
// This gets returned for precision errors sometimes?
return;
}
if (theirs == simdjson::STRING_ERROR) {
// why god why god do I gotta suffer
return;
}
if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
// We don't validate the precision of numbers
return;
}
printf("ours: %d\n", ours);
printf("theirs: %d\n", theirs);
abort();
}
}
// Fuzzer entry point. Wraps the raw input bytes in a std::string once and
// runs both checks on it: the streaming self-consistency test and the
// comparison against simdjson. Always returns 0.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  // Build the string a single time and share it between the two checks,
  // rather than running testStreaming twice on separately built copies.
  const std::string input(reinterpret_cast<const char *>(data), size);
  testStreaming(input);
  compareWithSimdjson(input);
  return 0;
}

View File

@@ -4,6 +4,7 @@
#include <cstdint>
#include <string>
#include <vector>
struct MinifyState {
bool isKey = false;
struct Cursor {
@@ -29,7 +30,7 @@ struct MinifyState {
std::vector<Cursor> stack;
};
Callbacks minifyCallbacks() {
inline Callbacks minifyCallbacks() {
Callbacks result;
result.on_begin_object = +[](void *p) {
auto *state = (MinifyState *)p;

View File

@@ -63,6 +63,7 @@ enum Symbol : uint8_t {
T_S,
T_COLON,
T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE,
T_HEX,
T_DIGIT,
T_ONENINE,
@@ -118,6 +119,8 @@ struct Parser3 {
Symbol stack[kMaxStackSize];
Symbol *stackPtr = stack;
bool complete = false;
uint32_t utf8Codepoint;
uint32_t minCodepoint;
};
inline Status n_json(Parser3 *self) {
@@ -347,6 +350,9 @@ inline Status n_string2(Parser3 *self) {
if (*self->buf != '"') {
self->callbacks->on_string_data(self->data, self->buf, 1);
}
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
return S_REJECT;
}
if (int8_t(*self->buf) > 0) {
// one byte utf-8 encoding
switch (*self->buf) {
@@ -368,28 +374,34 @@ inline Status n_string2(Parser3 *self) {
}
} else if ((*self->buf & 0b11100000) == 0b11000000) {
// two byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00011111;
self->minCodepoint = 0x80;
++self->buf;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
}
if ((*self->buf & 0b11110000) == 0b11100000) {
// three byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00001111;
self->minCodepoint = 0x800;
++self->buf;
self->pop();
if (auto s = self->push(
{T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
} else if ((*self->buf & 0b11111000) == 0b11110000) {
// four byte utf-8 encoding
self->utf8Codepoint = *self->buf & 0b00000111;
self->minCodepoint = 0x10000;
++self->buf;
self->pop();
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
return s;
}
MUSTTAIL return Parser3::keepGoing(self);
@@ -433,7 +445,36 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
if (self->len() == 0) {
return S_REJECT;
}
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
return S_REJECT;
}
if ((*self->buf & 0b11000000) == 0b10000000) {
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111;
self->callbacks->on_string_data(self->data, self->buf, 1);
++self->buf;
self->pop();
MUSTTAIL return Parser3::keepGoing(self);
}
return S_REJECT;
}
inline Status t_utf8_last_continuation_byte(Parser3 *self) {
if (self->len() == 0) {
return S_REJECT;
}
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
return S_REJECT;
}
if ((*self->buf & 0b11000000) == 0b10000000) {
self->utf8Codepoint <<= 6;
self->utf8Codepoint |= *self->buf & 0b00111111;
if (self->utf8Codepoint < self->minCodepoint ||
self->utf8Codepoint > 0x10ffff ||
(0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
return S_REJECT;
}
// TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
self->callbacks->on_string_data(self->data, self->buf, 1);
++self->buf;
self->pop();
@@ -782,6 +823,8 @@ constexpr inline struct ContinuationTable {
continuations[T_S] = singleChar<'s'>;
continuations[T_COLON] = singleChar<':'>;
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
t_utf8_last_continuation_byte;
continuations[T_HEX] = t_hex;
continuations[T_DIGIT] = t_digit;
continuations[T_ONENINE] = t_onenine;

View File

@@ -6,13 +6,13 @@ constexpr inline struct Tables {
whitespace['\n'] = true;
whitespace['\r'] = true;
whitespace['\t'] = true;
for (int i = 0; i < 10; ++i) {
number['0' + i] = true;
invalidUtf8[0xc0] = true;
invalidUtf8[0xc1] = true;
for (int i = 0xf5; i <= 0xff; ++i) {
invalidUtf8[i] = true;
}
number['.'] = true;
number['+'] = true;
number['-'] = true;
}
alignas(16) bool whitespace[256]{};
alignas(16) bool number[256]{};
alignas(16) bool invalidUtf8[256]{};
} tables;