From d27917348295d5c4c8f57bdd35b996b03cf37335 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Sun, 18 May 2025 17:23:22 -0400 Subject: [PATCH] Validate utf8 --- CMakeLists.txt | 1 + src/callbacks.h | 83 ------------------------------------------------- src/fuzz.cpp | 50 ++++++++++++++++++++++++++++- src/minify.h | 3 +- src/parser3.h | 51 +++++++++++++++++++++++++++--- src/tables.h | 12 +++---- 6 files changed, 105 insertions(+), 95 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b646fa..fb7e7a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,7 @@ doctest_discover_tests(mytest) add_executable(fuzz src/fuzz.cpp) target_include_directories(fuzz PRIVATE include) +target_link_libraries(fuzz PRIVATE simdjson) target_compile_options(fuzz PRIVATE -fsanitize=fuzzer) target_link_options(fuzz PRIVATE -fsanitize=fuzzer) diff --git a/src/callbacks.h b/src/callbacks.h index 156bb52..9b8b648 100644 --- a/src/callbacks.h +++ b/src/callbacks.h @@ -1,9 +1,7 @@ #pragma once #include "weaseljson.h" -#include #include -#include inline Callbacks printCallbacks() { Callbacks result; @@ -27,87 +25,6 @@ inline Callbacks printCallbacks() { return result; } -struct MinifyState { - bool isKey = false; - struct Cursor { - int64_t index; - bool isObject; - }; - void on_begin_value() { - if (!stack.empty()) { - auto &back = stack.back(); - if (back.isObject && back.index % 2 == 0 && back.index > 0) { - printf(","); - } - if (back.isObject && back.index % 2 == 1 && back.index > 0) { - printf(":"); - } - if (!back.isObject && back.index > 0) { - printf(","); - } - ++back.index; - } - } - std::vector stack; -}; - -inline Callbacks minifyCallbacks() { - Callbacks result; - result.on_begin_object = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - state->stack.push_back({0, true}); - printf("{"); - }; - result.on_end_object = +[](void *p) { - auto *state = (MinifyState *)p; - state->stack.pop_back(); - printf("}"); - }; - result.on_begin_string = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - printf("\""); - }; - result.on_string_data = - +[](void *, const char *buf, int len) { printf("%.*s", len, buf); }; - result.on_end_string = +[](void *p) { printf("\""); }; - result.on_begin_array = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - state->stack.push_back({0, false}); - printf("["); - }; - result.on_end_array = +[](void *p) { - auto *state = (MinifyState *)p; - state->stack.pop_back(); - printf("]"); - }; - result.on_begin_number = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - }; - result.on_number_data = - +[](void *, const char *buf, int len) { printf("%.*s", len, buf); }; - result.on_end_number = +[](void *) {}; - result.on_true_literal = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - printf("true"); - }; - result.on_false_literal = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - printf("false"); - }; - result.on_null_literal = +[](void *p) { - auto *state = (MinifyState *)p; - state->on_begin_value(); - printf("null"); - }; - return result; -} - inline Callbacks noopCallbacks() { Callbacks result; result.on_begin_object = +[](void *) {}; diff --git a/src/fuzz.cpp b/src/fuzz.cpp index 762bb66..01e9696 100644 --- a/src/fuzz.cpp +++ b/src/fuzz.cpp @@ -1,6 +1,9 @@ +#include "callbacks.h" #include "minify.h" #include "parser3.h" +#include + std::pair runStreaming(std::string copy) { MinifyState state; auto c = minifyCallbacks(); @@ -41,7 +44,52 @@ void testStreaming(std::string const &json) { } } +void compareWithSimdjson(std::string const &json) { + parser3::Status ours; + { + auto copy = json; + auto c = noopCallbacks(); + parser3::Parser3 parser3(&c, nullptr); + ours = parser3.parse(copy.data(), copy.size()); + if (ours == parser3::S_AGAIN) { + ours = parser3.parse(nullptr, 0); + } + } + + using namespace simdjson; + simdjson::padded_string my_padded_data(json.data(), json.size()); + simdjson::dom::parser parser; + auto doc = parser.parse(my_padded_data); + auto theirs = doc.error(); + if (ours == parser3::S_OVERFLOW || theirs == simdjson::DEPTH_ERROR) { + return; + } + if ((ours == parser3::S_OK) != (theirs == simdjson::SUCCESS)) { + if (json.starts_with("\xef\xbb\xbf")) { + // What to do with byte order mark? + return; + } + if (theirs == simdjson::NUMBER_ERROR || theirs == simdjson::BIGINT_ERROR) { + // This gets returned for precision errors sometimes? + return; + } + if (theirs == simdjson::STRING_ERROR) { + // why god why god do I gotta suffer + return; + } + if (theirs == simdjson::NUMBER_OUT_OF_RANGE) { + // We don't validate the precision of numbers + return; + } + printf("ours: %d\n", ours); + printf("theirs: %d\n", theirs); + abort(); + } +} + extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - testStreaming(std::string((const char *)data, size)); + auto s = std::string((const char *)data, size); + testStreaming(s); + compareWithSimdjson(s); return 0; } diff --git a/src/minify.h b/src/minify.h index 9c3fe5a..b094cd4 100644 --- a/src/minify.h +++ b/src/minify.h @@ -4,6 +4,7 @@ #include #include #include + struct MinifyState { bool isKey = false; struct Cursor { @@ -29,7 +30,7 @@ struct MinifyState { std::vector stack; }; -Callbacks minifyCallbacks() { +inline Callbacks minifyCallbacks() { Callbacks result; result.on_begin_object = +[](void *p) { auto *state = (MinifyState *)p; diff --git a/src/parser3.h b/src/parser3.h index 9cc0794..93657e1 100644 --- a/src/parser3.h +++ b/src/parser3.h @@ -63,6 +63,7 @@ enum Symbol : uint8_t { T_S, T_COLON, T_UTF8_CONTINUATION_BYTE, + T_UTF8_LAST_CONTINUATION_BYTE, T_HEX, T_DIGIT, T_ONENINE, @@ -118,6 +119,8 @@ struct Parser3 { Symbol stack[kMaxStackSize]; Symbol *stackPtr = stack; bool complete = false; + uint32_t utf8Codepoint; + uint32_t minCodepoint; }; inline Status n_json(Parser3 *self) { @@ -347,6 +350,9 @@ inline Status n_string2(Parser3 *self) { if (*self->buf != '"') { self->callbacks->on_string_data(self->data, self->buf, 1); } + if (tables.invalidUtf8[uint8_t(*self->buf)]) { + return S_REJECT; + } if (int8_t(*self->buf) > 0) { // one byte utf-8 encoding switch (*self->buf) { @@ -368,28 +374,34 @@ inline Status n_string2(Parser3 *self) { } } else if ((*self->buf & 0b11100000) == 0b11000000) { // two byte utf-8 encoding + self->utf8Codepoint = *self->buf & 0b00011111; + self->minCodepoint = 0x80; ++self->buf; self->pop(); - if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, N_STRING2})) { + if (auto s = self->push({T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { return s; } MUSTTAIL return Parser3::keepGoing(self); } if ((*self->buf & 0b11110000) == 0b11100000) { // three byte utf-8 encoding + self->utf8Codepoint = *self->buf & 0b00001111; + self->minCodepoint = 0x800; ++self->buf; self->pop(); - if (auto s = self->push( - {T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, N_STRING2})) { + if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, + T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { return s; } MUSTTAIL return Parser3::keepGoing(self); } else if ((*self->buf & 0b11111000) == 0b11110000) { // four byte utf-8 encoding + self->utf8Codepoint = *self->buf & 0b00000111; + self->minCodepoint = 0x10000; ++self->buf; self->pop(); if (auto s = self->push({T_UTF8_CONTINUATION_BYTE, T_UTF8_CONTINUATION_BYTE, - T_UTF8_CONTINUATION_BYTE, N_STRING2})) { + T_UTF8_LAST_CONTINUATION_BYTE, N_STRING2})) { return s; } MUSTTAIL return Parser3::keepGoing(self); @@ -433,7 +445,36 @@ inline Status t_utf8_continuation_byte(Parser3 *self) { if (self->len() == 0) { return S_REJECT; } + if (tables.invalidUtf8[uint8_t(*self->buf)]) { + return S_REJECT; + } if ((*self->buf & 0b11000000) == 0b10000000) { + self->utf8Codepoint <<= 6; + self->utf8Codepoint |= *self->buf & 0b00111111; + self->callbacks->on_string_data(self->data, self->buf, 1); + ++self->buf; + self->pop(); + MUSTTAIL return Parser3::keepGoing(self); + } + return S_REJECT; +} + +inline Status t_utf8_last_continuation_byte(Parser3 *self) { + if (self->len() == 0) { + return S_REJECT; + } + if (tables.invalidUtf8[uint8_t(*self->buf)]) { + return S_REJECT; + } + if ((*self->buf & 0b11000000) == 0b10000000) { + self->utf8Codepoint <<= 6; + self->utf8Codepoint |= *self->buf & 0b00111111; + if (self->utf8Codepoint < self->minCodepoint || + self->utf8Codepoint > 0x10ffff || + (0xd800 <= self->utf8Codepoint && self->utf8Codepoint <= 0xdfff)) { + return S_REJECT; + } + // TODO tell valgrind utf8Codepoint and minCodepoint are uninitialized self->callbacks->on_string_data(self->data, self->buf, 1); ++self->buf; self->pop(); @@ -782,6 +823,8 @@ constexpr inline struct ContinuationTable { continuations[T_S] = singleChar<'s'>; continuations[T_COLON] = singleChar<':'>; continuations[T_UTF8_CONTINUATION_BYTE] = t_utf8_continuation_byte; + continuations[T_UTF8_LAST_CONTINUATION_BYTE] = + t_utf8_last_continuation_byte; continuations[T_HEX] = t_hex; continuations[T_DIGIT] = t_digit; continuations[T_ONENINE] = t_onenine; diff --git a/src/tables.h b/src/tables.h index 0a725a8..a7abd36 100644 --- a/src/tables.h +++ b/src/tables.h @@ -6,13 +6,13 @@ constexpr inline struct Tables { whitespace['\n'] = true; whitespace['\r'] = true; whitespace['\t'] = true; - for (int i = 0; i < 10; ++i) { - number['0' + i] = true; + + invalidUtf8[0xc0] = true; + invalidUtf8[0xc1] = true; + for (int i = 0xf5; i <= 0xff; ++i) { + invalidUtf8[i] = true; } - number['.'] = true; - number['+'] = true; - number['-'] = true; } alignas(16) bool whitespace[256]{}; - alignas(16) bool number[256]{}; + alignas(16) bool invalidUtf8[256]{}; } tables;