From e956c526b2840b4c9b174456b54beb1918dbfa75 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 16 Feb 2024 16:27:55 -0800 Subject: [PATCH] Draft paper introduction --- paper/.gitignore | 5 ++- paper/bibliography.bib | 83 ++++++++++++++++++++++++++++++++++++++++-- paper/paper.tex | 40 ++++++++++++++++++-- 3 files changed, 120 insertions(+), 8 deletions(-) diff --git a/paper/.gitignore b/paper/.gitignore index 24cfcb3..945e00e 100644 --- a/paper/.gitignore +++ b/paper/.gitignore @@ -3,7 +3,10 @@ *.bcf *.blg *.dvi +*.fdb_latexmk +*.fls *.log *.out *.pdf -*.run.xml \ No newline at end of file +*.run.xml +*.synctex.gz diff --git a/paper/bibliography.bib b/paper/bibliography.bib index 9c893e1..3bb3857 100644 --- a/paper/bibliography.bib +++ b/paper/bibliography.bib @@ -51,8 +51,8 @@ url = {https://doi.org/10.1145/78973.78977}, doi = {10.1145/78973.78977}, abstract = {Skip lists are data structures that use probabilistic balancing rather than strictly enforced balancing. As a result, the algorithms for insertion and deletion in skip lists are much simpler and significantly faster than equivalent algorithms for balanced trees.}, journal = {Commun. 
ACM}, -month = {jun}, -pages = {668–676}, +month = {6}, +pages = {668--676}, numpages = {9}, keywords = {data structures, searching, trees} } @@ -75,4 +75,81 @@ keywords = {data structures, searching, trees} timestamp = {Fri, 24 Mar 2023 00:00:01 +0100}, biburl = {https://dblp.org/rec/conf/icde/LeisK013.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} -} \ No newline at end of file +} + +@book{10.5555/17299, +author = {Bernstein, Philip A. and Hadzilacos, Vassos and Goodman, Nathan}, +title = {Concurrency Control and Recovery in Database Systems}, +year = {1986}, +isbn = {0201107155}, +publisher = {Addison-Wesley Longman Publishing Co., Inc.}, +address = {USA} +} + +@book{cormen2022introduction, + title={Introduction to Algorithms}, + author={Cormen, Thomas H. and Leiserson, Charles E. and Rivest, Ronald L. and Stein, Clifford}, + year={2022}, + publisher={MIT Press}, + chapter={17 Augmenting Data Structures} +} + +@article{bentley1979decomposable, + title={Decomposable searching problems}, + author={Bentley, Jon Louis}, + journal={Inf. Process. 
Lett.}, + volume={8}, + number={5}, + pages={244--251}, + year={1979} +} + +@article{adelson1962algorithm, + title={An algorithm for the organization of information}, + author={Adelson-Velskii, Georgii Maksimovich and Landis, Evgenii Mikhailovich}, + journal={Doklady Akademii Nauk SSSR}, + volume={146}, + number={2}, + pages={263--266}, + year={1962}, + publisher={Russian Academy of Sciences} +} + +@inproceedings{guibas1978dichromatic, + title={A dichromatic framework for balanced trees}, + author={Guibas, Leo J. and Sedgewick, Robert}, + booktitle={19th Annual Symposium on Foundations of Computer Science (SFCS 1978)}, + pages={8--21}, + year={1978}, + organization={IEEE} +} + +@article{seidel1996randomized, + title={Randomized search trees}, + author={Seidel, Raimund and Aragon, Cecilia R.}, + journal={Algorithmica}, + volume={16}, + number={4--5}, + pages={464--497}, + year={1996}, + publisher={Springer} +} + +@article{comer1979ubiquitous, + title={The Ubiquitous {B-Tree}}, + author={Comer, Douglas}, + journal={ACM Computing Surveys (CSUR)}, + volume={11}, + number={2}, + pages={121--137}, + year={1979}, + publisher={ACM New York, NY, USA} +} + +@inproceedings{binna2018hot, + title={{HOT}: A height optimized trie index for main-memory database systems}, + author={Binna, Robert and Zangerle, Eva and Pichl, Martin and Specht, G{\"u}nther and Leis, Viktor}, + booktitle={Proceedings of the 2018 International Conference on Management of Data}, + pages={521--534}, + year={2018} +} diff --git a/paper/paper.tex b/paper/paper.tex index b2c2bf5..f0da3e4 100644 --- a/paper/paper.tex +++ b/paper/paper.tex @@ -1,9 +1,10 @@ \documentclass[twocolumn]{article} \usepackage{hyperref} +\usepackage[utf8]{inputenc} \title{ARTful Conflict Checking for FoundationDB} -\author{Andrew Noyes \\ \href{mailto:andrew@weaselab.dev}{andrew@weaselab.dev}} +\author{Andrew Noyes\thanks{\href{mailto:andrew@weaselab.dev}{andrew@weaselab.dev}}} \date{} \usepackage{biblatex} @@ -16,9 +17,40 @@ 
\section{Abstract} FoundationDB \cite{DBLP:conf/sigmod/ZhouXSNMTABSLRD21} provides serializability using a specialized data structure called \textit{lastCommit} \footnote{See Algorithm 1 referenced in \cite{DBLP:conf/sigmod/ZhouXSNMTABSLRD21}}. -This data structure maps key ranges (sets of keys denoted by either a singleton key or a half-open interval) to a ``commit version'' represented as a 64-bit integer. -FoundationDB implements \textit{lastCommit} as a version-augmented probabilistic SkipList \cite{10.1145/78973.78977}. -In this paper, we propose an alternative implementation of \textit{lastCommit} as a version-augmented Adaptive Radix Tree \cite{DBLP:conf/icde/LeisK013}, and evaluate its performance. +This data structure maps key ranges (sets of bitwise-lexicographically-ordered keys denoted by either a singleton key or a half-open interval) to a version represented as a 64-bit integer. +FoundationDB implements \textit{lastCommit} as a version-augmented probabilistic skip list \cite{10.1145/78973.78977}. +In this paper, we propose an alternative implementation of \textit{lastCommit} as a version-augmented Adaptive Radix Tree (ART) \cite{DBLP:conf/icde/LeisK013}, and evaluate its performance. + +\section{Introduction} + +Let's begin by considering design options for \textit{lastCommit}. +In order to manage half-open intervals, we need an ordered data structure, so hash tables are out of consideration. +For any ordered data structure, we can implement \textit{lastCommit} using a representation where a logical key is mapped to the value of the last physical key less than or equal to the logical key. +This is a standard technique used throughout FoundationDB. + +The problem with applying this to an off-the-shelf ordered data structure is that checking a read range is linear in the number of intersecting physical keys. +Under a sufficiently high write load, there can be arbitrarily many unexpired point writes in the MVCC \cite{10.5555/17299} window. 
+Scanning through every point write intersecting a large range read would make conflict checking unacceptably slow. + +This suggests we consider augmenting \cite{cormen2022introduction} an ordered data structure to make checking the max version of a range sublinear. +Since finding the maximum of a set of elements is a decomposable searching problem \cite{bentley1979decomposable}, we could apply the general technique using \texttt{std::max} as our binary operation, and \texttt{INT64\_MIN} as our identity. +Algorithmically, this describes the implementation of FoundationDB's skip list. +We can also consider any other ordered data structure to augment, such as any variant of a balanced binary search tree \cite{adelson1962algorithm,guibas1978dichromatic,seidel1996randomized}, a B-tree \cite{comer1979ubiquitous}, or a radix tree \cite{DBLP:conf/icde/LeisK013,binna2018hot}. + +Let's compare the relevant properties of our candidate data structures for insertion/update and read operations. +After insertion, the max version along the search path must reflect the update. +For comparison-based trees, updating the max version along the search path cannot be done during the top-down search, because \emph{insertion will change the search path}, and we do not know whether this is an insert or an update until we complete the top-down search. +We have no choice but to do a second, bottom-up pass to propagate max version changes. +Furthermore, the usual way of propagating the change will always propagate all the way to the root, since inserts always use the highest-yet version. +For a radix tree, the max version can be updated on the top-down pass, and there's minimal overhead compared to the un-augmented radix tree. + +For ``last less than or equal to'' queries, skip lists have the convenient property that no backtracking is necessary, since the bottommost level is a sorted linked list. +Binary search trees and radix trees both require backtracking up the search path. 
+It's possible to trade off the backtracking for the increased overhead of maintaining the elements in an auxiliary sorted linked list during insertion. + +Our options also have various tradeoffs inherited from their un-augmented versions, such as different worst-case and expected bounds on the length of search paths and the number of rotations performed upon insert. +ART has been shown \cite{DBLP:conf/icde/LeisK013} to offer superior performance to comparison-based data structures on modern hardware, which is on its own a compelling reason to consider it. +The Height Optimized Trie (HOT) \cite{binna2018hot} outperforms ART, but has a few practical disadvantages\footnote{HOT has more implementation complexity than the already-daunting ART. Additionally, it requires AVX2 instructions and involves rebalancing operations during insertion. Even so, it's likely that a HOT-based \textit{lastCommit} implementation would be superior.} and will not be considered in this paper. \printbibliography