From 37c75f747b5105351c54c38bc29be226395a38ba Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 19 Apr 2024 14:26:47 -0700 Subject: [PATCH] Draft Testing section --- paper/bibliography.bib | 79 ++++++++++++++++++++++++++++++++++++++++++ paper/paper.tex | 37 ++++++++++++++++++-- 2 files changed, 113 insertions(+), 3 deletions(-) diff --git a/paper/bibliography.bib b/paper/bibliography.bib index c48c3cd..4261d4a 100644 --- a/paper/bibliography.bib +++ b/paper/bibliography.bib @@ -168,3 +168,82 @@ keywords = {data structures, searching, trees} author={Lemire, Daniel and Kaser, Owen and Kurz, Nathan and Deri, Luca and O’Hara, Chris and Saint‐Jacques, François and Ssi‐Yan‐Kai, Gregory}, year={2018}, month=jan, pages={867–895} } + +@misc{libfuzzer, + title = {libFuzzer – a library for coverage-guided fuzz testing}, + howpublished = {\url{https://llvm.org/docs/LibFuzzer.html}}, + note = {Accessed: 2024-04-19} +} + +@misc{ubsan, + title = {Undefined Behavior Sanitizer}, + howpublished = {\url{https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html}}, + note = {Accessed: 2024-04-19} +} + + +@inproceedings{10.5555/2342821.2342849, +author = {Serebryany, Konstantin and Bruening, Derek and Potapenko, Alexander and Vyukov, Dmitry}, +title = {AddressSanitizer: a fast address sanity checker}, +year = {2012}, +publisher = {USENIX Association}, +address = {USA}, +abstract = {Memory access bugs, including buffer overflows and uses of freed heap memory, remain a serious problem for programming languages like C and C++. Many memory error detectors exist, but most of them are either slow or detect a limited set of bugs, or both.This paper presents AddressSanitizer, a new memory error detector. Our tool finds out-of-bounds accesses to heap, stack, and global objects, as well as use-after-free bugs. 
It employs a specialized memory allocator and code instrumentation that is simple enough to be implemented in any compiler, binary translation system, or even in hardware.AddressSanitizer achieves efficiency without sacrificing comprehensiveness. Its average slowdown is just 73\% yet it accurately detects bugs at the point of occurrence. It has found over 300 previously unknown bugs in the Chromium browser and many bugs in other software.}, +booktitle = {Proceedings of the 2012 USENIX Conference on Annual Technical Conference}, +pages = {28}, +numpages = {1}, +location = {Boston, MA}, +series = {USENIX ATC'12} +} +@inproceedings{10.1145/1791194.1791203, +author = {Serebryany, Konstantin and Iskhodzhanov, Timur}, +title = {ThreadSanitizer: data race detection in practice}, +year = {2009}, +isbn = {9781605587936}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/1791194.1791203}, +doi = {10.1145/1791194.1791203}, +abstract = {Data races are a particularly unpleasant kind of threading bugs. They are hard to find and reproduce -- you may not observe a bug during the entire testing cycle and will only see it in production as rare unexplainable failures. This paper presents ThreadSanitizer -- a dynamic detector of data races. We describe the hybrid algorithm (based on happens-before and locksets) used in the detector. We introduce what we call dynamic annotations -- a sort of race detection API that allows a user to inform the detector about any tricky synchronization in the user program. 
Various practical aspects of using ThreadSanitizer for testing multithreaded C++ code at Google are also discussed.}, +booktitle = {Proceedings of the Workshop on Binary Instrumentation and Applications}, +pages = {62--71}, +numpages = {10}, +keywords = {Valgrind, concurrency bugs, dynamic data race detection, testing}, +location = {New York, New York, USA}, +series = {WBIA '09} +} + +@article{10.1145/3591257, +author = {Isemann, Raphael and Giuffrida, Cristiano and Bos, Herbert and van der Kouwe, Erik and von Gleissenthall, Klaus}, +title = {Don’t Look UB: Exposing Sanitizer-Eliding Compiler Optimizations}, +year = {2023}, +issue_date = {June 2023}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {7}, +number = {PLDI}, +url = {https://doi.org/10.1145/3591257}, +doi = {10.1145/3591257}, +abstract = {Sanitizers are widely used compiler features that detect undefined behavior and resulting vulnerabilities by injecting runtime checks into programs. For better performance, sanitizers are often used in conjunction with optimization passes. But doing so combines two compiler features with conflicting objectives. While sanitizers want to expose undefined behavior, optimizers often exploit these same properties for performance. In this paper, we show that this clash can have serious consequences: optimizations can remove sanitizer failures, thereby hiding the presence of bugs or even introducing new ones. + +We present LookUB, a differential-testing based framework for finding optimizer transformations that elide sanitizer failures. We used our method to find 17 such sanitizer-eliding optimizations in Clang. Next, we used static analysis and fuzzing to search for bugs in open-source projects that were previously hidden due to sanitizer-eliding optimizations. This led us to discover 20 new bugs in Linux Containers, libmpeg2, NTFS-3G, and WINE. 
Finally, we present an effective mitigation strategy based on a customization of the Clang optimizer with an overhead increase of 4\%.}, +journal = {Proc. ACM Program. Lang.}, +month = jun, +articleno = {143}, +numpages = {21}, +keywords = {Sanitizers, Optimizations, Fuzzing} +} + +@inproceedings{10.5555/1247360.1247362, +author = {Seward, Julian and Nethercote, Nicholas}, +title = {Using Valgrind to detect undefined value errors with bit-precision}, +year = {2005}, +publisher = {USENIX Association}, +address = {USA}, +abstract = {We present Memcheck, a tool that has been implemented with the dynamic binary instrumentation framework Valgrind. Memcheck detects a wide range of memory errors in programs as they run. This paper focuses on one kind of error that Memcheck detects: undefined value errors. Such errors are common, and often cause bugs that are hard to find in programs written in languages such as C, C++ and Fortran. Memcheck's definedness checking improves on that of previous tools by being accurate to the level of individual bits. This accuracy gives Memcheck a low false positive and false negative rate.The definedness checking involves shadowing every bit of data in registers and memory with a second bit that indicates if the bit has a defined value. Every value-creating operation is instrumented with a shadow operation that propagates shadow bits appropriately. Memcheck uses these shadow bits to detect uses of undefined values that could adversely affect a program's behaviour.Under Memcheck, programs typically run 20-30 times slower than normal. This is fast enough to use with large programs. 
Memcheck finds many errors in real programs, and has been used during the past two years by thousands of programmers on a wide range of systems, including OpenOffice, Mozilla, Opera, KDE, GNOME, MySQL, Perl, Samba, The GIMP, and Unreal Tournament.}, +booktitle = {Proceedings of the Annual Conference on USENIX Annual Technical Conference}, +pages = {2}, +numpages = {1}, +location = {Anaheim, CA}, +series = {ATEC '05} +} \ No newline at end of file diff --git a/paper/paper.tex b/paper/paper.tex index 9e81ac2..2a6d336 100644 --- a/paper/paper.tex +++ b/paper/paper.tex @@ -111,7 +111,6 @@ or equivalently [a_{0}\dots a_{k}, a_{0}\dots a_{k} 0) \] and continues with a sequence of prefix ranges ending in each digit up until $a_{k+1}$. -Recall that the range $[a_{0}\dots a_{k} 0, a_{0}\dots a_{k} 1)$ is equivalent to the set of keys starting with $a_{0}\dots a_{k} 0$. \begin{align*} \dots \quad \cup \quad & [a_{0}\dots a_{k} 0, a_{0}\dots a_{k} 1) \quad \cup \\ @@ -120,7 +119,9 @@ Recall that the range $[a_{0}\dots a_{k} 0, a_{0}\dots a_{k} 1)$ is equivalent t & [a_{0}\dots a_{k} (a_{k+1}-1), a_{0}\dots a_{k+1}) \end{align*} +Recall that the range $[a_{0}\dots a_{k} 0, a_{0}\dots a_{k} 1)$ is the set of keys starting with $a_{0}\dots a_{k} 0$. The remainder of the partition begins with the singleton set + \[ \dots \quad \cup \quad [a_{0}\dots a_{k + 1}, a_{0}\dots a_{k + 1} 0) \quad \cup\ \quad \dots \] @@ -169,7 +170,7 @@ A few notes on implementation: \begin{itemize} \item{For clarity, the above algorithm decouples the logical partitioning from the physical structure of the tree. An optimized implementation would merge adjacent prefix ranges that don't correspond to nodes in the tree as it scans, so that it only calculates the version of such merged ranges once. 
Additionally, our implementation stores an index of which child pointers are valid as a bitset for Node48 and Node256 to speed up this scan using techniques inspired by \cite{Lemire_2018}.} \item{In order to avoid many costly pointer indirections, we can store the max version not in each node itself but next to each node's parent pointer. Without this, the range read performance is not competetive with the skip list.} - \item{An optimized implementation would construct the partition of $[a_{i}\dots a_{m}, a_{i} + 1)$ in reverse order, as it descends along the search path to $a_{i}\dots a_{m}$} + \item{An optimized implementation would visit the partition of $[a_{i}\dots a_{m}, a_{i} + 1)$ in reverse order, as it descends along the search path to $a_{i}\dots a_{m}$} \item{An optimized implementation would search for the common prefix first, and return early if any prefix of the common prefix has a $max \leq r$.} \end{itemize} @@ -180,7 +181,7 @@ We track the rate of insertions of new nodes and make sure that our incremental \subsection{Adding point writes} -A point write of $k$ at version $v$ simply sets $max \gets v$ \footnote{Recall that write versions are non-decreasing.} for every node along $k$'s search path, and sets $range$ for $k$'s node to the $range$ of the first node greater than $k$, or the \emph{oldest version} if none exists. +A point write of $k$ at version $v$ simply sets $max \gets v$ \footnote{Recall that write versions are non-decreasing.} for every node along $k$'s search path, and sets $range$ for $k$'s node to the $range$ of the first node greater than $k$, or \emph{oldest version} if none exists. \subsection{Adding range writes} @@ -191,6 +192,36 @@ Nodes along the search path to $e$ that are a strict prefix of $e$ get $max$ set \section{Testing} +The correctness of \emph{lastCommit} is critically important, as a bug would likely result in data corruption, and so we use a variety of testing techniques. 
+The main technique is to let libFuzzer \cite{libfuzzer} generate sequences of arbitrary operations, and apply each sequence to both the optimized radix tree and a naive implementation based on an unaugmented ordered map that serves as the specification of the intended behavior. +After libFuzzer generates inputs with broad code coverage, we use libFuzzer's ``corpus minimization'' feature to pare down the test inputs without losing coverage (as measured by libFuzzer) into a fixed set of tests short enough that it's feasible to run interactively during development. +In order to keep these test inputs short, we constrain the size of keys at the loss of some generality. +We believe there isn't anything in the implementation particularly sensitive to the exact length of keys \footnote{\texttt{longestCommonPrefix} is a possible exception, but its length sensitivity is well encapsulated}. +LibFuzzer's minimized corpus achieves 98\% line coverage on its own. +We regenerate the corpus on an ad hoc basis by running libFuzzer for a few CPU-hours, during which it tests millions of unique inputs. + +In addition to asserting correct externally-visible behavior, in each of these tests we assert that internal invariants hold between operations. +We also use AddressSanitizer \cite{10.5555/2342821.2342849} to detect memory errors, UndefinedBehaviorSanitizer \cite{ubsan} to detect invocations of undefined behavior, and ThreadSanitizer \cite{10.1145/1791194.1791203} (while exercising concurrent access as allowed by the documented contract) to detect data-race-related undefined behavior. + +Each of these sanitizers is implemented using compiler instrumentation, which means that they are not testing the final binary artifact that will be run in production. +Therefore we also run the test inputs linking directly to the final release artifact, both standalone and under Valgrind \cite{10.5555/1247360.1247362}. 
+When testing the final artifacts, we do not assert internal invariants as we lack convenient access to the internals. +As a defense against possible bugs in compilers' sanitizer and optimizer passes \cite{10.1145/3591257}, we also test with sanitizers enabled and optimizations disabled, and test with both Clang and GCC. + +We audited the 2\% of lines that were not covered by libFuzzer \footnote{In order to see the uncovered lines for yourself, exclude all tests containing the word ``script'' with \texttt{ctest -E script}. Look in \texttt{Jenkinsfile} for an example of how to measure coverage.} and found the following: +\begin{itemize} + \item Three occurrences which can be reached from an input that libFuzzer could theoretically generate. In each case the uncovered code is straightforward, and is exercised from an entry point by a manually written test. + \item One occurrence which requires a large number of operations, and cannot be reached from an input satisfying the size constraints we impose on libFuzzer. This code is also straightforward, and is exercised from an entry point by a manually written test. The purpose of this code is to keep memory usage in check, and so it's expected that it cannot be reached without a large number of operations. + \item One occurrence which is not reachable from any entry point, but is exercised when asserting internal invariants. This line is now suppressed with an explanatory comment. +\end{itemize} + +We assert 100\% line coverage in continuous integration, which is achieved with a few caveats. +2\% of the code is only covered by a few manually written tests. +We suppress lines manually checked to be unreachable from an entry point. +There is also a significant amount of test-only code which is suppressed from coverage measurements. +There's a small difference in the behavior between debug and release builds: the code which scans for old entries gets run more frequently when assertions are enabled. 
+This code is not straightforward, so exercising it from only a manually written test seems insufficient. + \section{Conclusion} \printbibliography