Validate utf8
This commit is contained in:
@@ -1,9 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "weaseljson.h"
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
inline Callbacks printCallbacks() {
|
||||
Callbacks result;
|
||||
@@ -27,87 +25,6 @@ inline Callbacks printCallbacks() {
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Bookkeeping for minified output: one Cursor per currently open container,
/// used to decide which separator to print before the next key or value.
struct MinifyState {
  // Not read or written by anything inside this struct.
  bool isKey = false;

  /// Progress through a single open container.
  struct Cursor {
    // Number of on_begin_value() calls seen while this cursor was on top.
    int64_t index;
    // When true, separators alternate ':' and ','; when false, always ','.
    bool isObject;
  };

  /// Prints the separator that must precede the item about to start (nothing
  /// for the first item of a container), then counts that item. Does nothing
  /// when no container is open.
  void on_begin_value() {
    if (stack.empty()) {
      return;
    }
    Cursor &top = stack.back();
    if (top.index > 0) {
      // In an object an odd count means a key was just emitted, so the next
      // item is its value; every other non-first item follows a comma.
      const bool followsKey = top.isObject && top.index % 2 != 0;
      if (followsKey) {
        printf(":");
      } else {
        printf(",");
      }
    }
    ++top.index;
  }

  std::vector<Cursor> stack;
};
|
||||
|
||||
/// Builds a Callbacks table that re-emits the parsed document on stdout
/// without any whitespace between tokens.
///
/// The callbacks that use their `void *` argument cast it to `MinifyState *`,
/// so the caller must pass a MinifyState as the user-data pointer.
///
/// @return the populated callback table.
inline Callbacks minifyCallbacks() {
  Callbacks result;

  result.on_begin_object = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
    state->stack.push_back({0, true});
    printf("{");
  };
  result.on_end_object = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->stack.pop_back();
    printf("}");
  };

  result.on_begin_string = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
    printf("\"");
  };
  result.on_string_data = +[](void *, const char *buf, int len) {
    // fwrite emits exactly `len` bytes. "%.*s" would stop at an embedded NUL
    // and treats a negative precision as "no limit", reading past the chunk.
    if (len > 0) {
      fwrite(buf, 1, static_cast<size_t>(len), stdout);
    }
  };
  result.on_end_string = +[](void *) { printf("\""); };

  result.on_begin_array = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
    state->stack.push_back({0, false});
    printf("[");
  };
  result.on_end_array = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->stack.pop_back();
    printf("]");
  };

  // Numbers have no delimiters of their own: only the separator is printed at
  // the start, and nothing at the end.
  result.on_begin_number = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
  };
  result.on_number_data = +[](void *, const char *buf, int len) {
    // Same length-exact write as for string data.
    if (len > 0) {
      fwrite(buf, 1, static_cast<size_t>(len), stdout);
    }
  };
  result.on_end_number = +[](void *) {};

  result.on_true_literal = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
    printf("true");
  };
  result.on_false_literal = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
    printf("false");
  };
  result.on_null_literal = +[](void *p) {
    auto *state = static_cast<MinifyState *>(p);
    state->on_begin_value();
    printf("null");
  };

  return result;
}
|
||||
|
||||
inline Callbacks noopCallbacks() {
|
||||
Callbacks result;
|
||||
result.on_begin_object = +[](void *) {};
|
||||
|
||||
50
src/fuzz.cpp
50
src/fuzz.cpp
@@ -1,6 +1,9 @@
|
||||
#include "callbacks.h"
|
||||
#include "minify.h"
|
||||
#include "parser3.h"
|
||||
|
||||
#include <simdjson.h>
|
||||
|
||||
std::pair<std::string, parser3::Status> runStreaming(std::string copy) {
|
||||
MinifyState state;
|
||||
auto c = minifyCallbacks();
|
||||
@@ -41,7 +44,52 @@ void testStreaming(std::string const &json) {
|
||||
}
|
||||
}
|
||||
|
||||
void compareWithSimdjson(std::string const &json) {
|
||||
parser3::Status ours;
|
||||
{
|
||||
auto copy = json;
|
||||
auto c = noopCallbacks();
|
||||
parser3::Parser3 parser3(&c, nullptr);
|
||||
ours = parser3.parse(copy.data(), copy.size());
|
||||
if (ours == parser3::S_AGAIN) {
|
||||
ours = parser3.parse(nullptr, 0);
|
||||
}
|
||||
}
|
||||
|
||||
using namespace simdjson;
|
||||
simdjson::padded_string my_padded_data(json.data(), json.size());
|
||||
simdjson::dom::parser parser;
|
||||
auto doc = parser.parse(my_padded_data);
|
||||
auto theirs = doc.error();
|
||||
if (ours == parser3::S_OVERFLOW || theirs == simdjson::DEPTH_ERROR) {
|
||||
return;
|
||||
}
|
||||
if ((ours == parser3::S_OK) != (theirs == simdjson::SUCCESS)) {
|
||||
if (json.starts_with("\xef\xbb\xbf")) {
|
||||
// What to do with byte order mark?
|
||||
return;
|
||||
}
|
||||
if (theirs == simdjson::NUMBER_ERROR || theirs == simdjson::BIGINT_ERROR) {
|
||||
// This gets returned for precision errors sometimes?
|
||||
return;
|
||||
}
|
||||
if (theirs == simdjson::STRING_ERROR) {
|
||||
// why god why god do I gotta suffer
|
||||
return;
|
||||
}
|
||||
if (theirs == simdjson::NUMBER_OUT_OF_RANGE) {
|
||||
// We don't validate the precision of numbers
|
||||
return;
|
||||
}
|
||||
printf("ours: %d\n", ours);
|
||||
printf("theirs: %d\n", theirs);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
/// libFuzzer entry point. Copies the raw input into a std::string once, then
/// runs the streaming test and the simdjson comparison on that same string.
///
/// @param data pointer to the fuzzer-provided bytes.
/// @param size number of bytes at `data`.
/// @return always 0.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  // Build the input a single time; each check runs exactly once per input.
  const std::string input(reinterpret_cast<const char *>(data), size);
  testStreaming(input);
  compareWithSimdjson(input);
  return 0;
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct MinifyState {
|
||||
bool isKey = false;
|
||||
struct Cursor {
|
||||
@@ -29,7 +30,7 @@ struct MinifyState {
|
||||
std::vector<Cursor> stack;
|
||||
};
|
||||
|
||||
Callbacks minifyCallbacks() {
|
||||
inline Callbacks minifyCallbacks() {
|
||||
Callbacks result;
|
||||
result.on_begin_object = +[](void *p) {
|
||||
auto *state = (MinifyState *)p;
|
||||
|
||||
@@ -63,6 +63,7 @@ enum Symbol : uint8_t {
|
||||
T_S,
|
||||
T_COLON,
|
||||
T_UTF8_CONTINUATION_BYTE,
|
||||
T_UTF8_LAST_CONTINUATION_BYTE,
|
||||
T_HEX,
|
||||
T_DIGIT,
|
||||
T_ONENINE,
|
||||
@@ -118,6 +119,8 @@ struct Parser3 {
|
||||
Symbol stack[kMaxStackSize];
|
||||
Symbol *stackPtr = stack;
|
||||
bool complete = false;
|
||||
uint32_t utf8Codepoint;
|
||||
uint32_t minCodepoint;
|
||||
};
|
||||
|
||||
inline Status n_json(Parser3 *self) {
|
||||
@@ -347,6 +350,9 @@ inline Status n_string2(Parser3 *self) {
|
||||
if (*self->buf != '"') {
|
||||
self->callbacks->on_string_data(self->data, self->buf, 1);
|
||||
}
|
||||
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if (int8_t(*self->buf) > 0) {
|
||||
// one byte utf-8 encoding
|
||||
switch (*self->buf) {
|
||||
@@ -368,28 +374,34 @@ inline Status n_string2(Parser3 *self) {
|
||||
}
|
||||
} else if ((*self->buf & 0b11100000) == 0b11000000) {
|
||||
// two byte utf-8 encoding
|
||||
self->utf8Codepoint = *self->buf & 0b00011111;
|
||||
self->minCodepoint = 0x80;
|
||||
++self->buf;
|
||||
self->pop();
|
||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
|
||||
if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
||||
return s;
|
||||
}
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
}
|
||||
if ((*self->buf & 0b11110000) == 0b11100000) {
|
||||
// three byte utf-8 encoding
|
||||
self->utf8Codepoint = *self->buf & 0b00001111;
|
||||
self->minCodepoint = 0x800;
|
||||
++self->buf;
|
||||
self->pop();
|
||||
if (auto s = self->push(
|
||||
{T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
|
||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE,
|
||||
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
||||
return s;
|
||||
}
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
} else if ((*self->buf & 0b11111000) == 0b11110000) {
|
||||
// four byte utf-8 encoding
|
||||
self->utf8Codepoint = *self->buf & 0b00000111;
|
||||
self->minCodepoint = 0x10000;
|
||||
++self->buf;
|
||||
self->pop();
|
||||
if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE,
|
||||
T_UTF8_CONTINUATION_BYTE, N_STRING2})) {
|
||||
T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) {
|
||||
return s;
|
||||
}
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
@@ -433,7 +445,36 @@ inline Status t_utf8_continuation_byte(Parser3 *self) {
|
||||
if (self->len() == 0) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if ((*self->buf & 0b11000000) == 0b10000000) {
|
||||
self->utf8Codepoint <<= 6;
|
||||
self->utf8Codepoint |= *self->buf & 0b00111111;
|
||||
self->callbacks->on_string_data(self->data, self->buf, 1);
|
||||
++self->buf;
|
||||
self->pop();
|
||||
MUSTTAIL return Parser3::keepGoing(self);
|
||||
}
|
||||
return S_REJECT;
|
||||
}
|
||||
|
||||
inline Status t_utf8_last_continuation_byte(Parser3 *self) {
|
||||
if (self->len() == 0) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if (tables.invalidUtf8[uint8_t(*self->buf)]) {
|
||||
return S_REJECT;
|
||||
}
|
||||
if ((*self->buf & 0b11000000) == 0b10000000) {
|
||||
self->utf8Codepoint <<= 6;
|
||||
self->utf8Codepoint |= *self->buf & 0b00111111;
|
||||
if (self->utf8Codepoint < self->minCodepoint ||
|
||||
self->utf8Codepoint > 0x10ffff ||
|
||||
(0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) {
|
||||
return S_REJECT;
|
||||
}
|
||||
// TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized
|
||||
self->callbacks->on_string_data(self->data, self->buf, 1);
|
||||
++self->buf;
|
||||
self->pop();
|
||||
@@ -782,6 +823,8 @@ constexpr inline struct ContinuationTable {
|
||||
continuations[T_S] = singleChar<'s'>;
|
||||
continuations[T_COLON] = singleChar<':'>;
|
||||
continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte;
|
||||
continuations[T_UTF8_LAST_CONTINUATION_BYTE] =
|
||||
t_utf8_last_continuation_byte;
|
||||
continuations[T_HEX] = t_hex;
|
||||
continuations[T_DIGIT] = t_digit;
|
||||
continuations[T_ONENINE] = t_onenine;
|
||||
|
||||
12
src/tables.h
12
src/tables.h
@@ -6,13 +6,13 @@ constexpr inline struct Tables {
|
||||
whitespace['\n'] = true;
|
||||
whitespace['\r'] = true;
|
||||
whitespace['\t'] = true;
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
number['0' + i] = true;
|
||||
|
||||
invalidUtf8[0xc0] = true;
|
||||
invalidUtf8[0xc1] = true;
|
||||
for (int i = 0xf5; i <= 0xff; ++i) {
|
||||
invalidUtf8[i] = true;
|
||||
}
|
||||
number['.'] = true;
|
||||
number['+'] = true;
|
||||
number['-'] = true;
|
||||
}
|
||||
alignas(16) bool whitespace[256]{};
|
||||
alignas(16) bool number[256]{};
|
||||
alignas(16) bool invalidUtf8[256]{};
|
||||
} tables;
|
||||
|
||||
Reference in New Issue
Block a user