@comment{{This file has been generated by bib2bib 1.99}}
@comment{{Command line: bib2bib -ob bibliography.bib references.bib research.bib}}
@comment{{Prefer citations from https://dl.acm.org/, if available. DOI as citekey, if available.}}
@inproceedings{10.1145/318898.318923, author = {Copeland, George P. and Khoshafian, Setrag N.}, title = {A Decomposition Storage Model}, year = {1985}, isbn = {0897911601}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/318898.318923}, doi = {10.1145/318898.318923}, booktitle = {Proceedings of the 1985 ACM SIGMOD International Conference on Management of Data}, pages = {268–279}, numpages = {12}, location = {Austin, Texas, USA}, series = {SIGMOD '85} }
@inproceedings{10.1145/800083.802685, author = {Copeland, George}, title = {What If Mass Storage Were Free?}, year = {1980}, isbn = {9781450373951}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/800083.802685}, doi = {10.1145/800083.802685}, abstract = {This paper investigates how database systems would be designed and used under the limiting-case assumption that mass storage is free. It is argued that free mass storage would free database systems from the limitations and problems caused by conventional deletion techniques. A non-deletion strategy would significantly simplify database systems and their operation, as well as increase their functionality and availability. Consideration of this limiting case helps shed light on a more realistic argument: if the cost of mass storage were low enough, then deletion would become undesirable. It is also argued that the often labor-intensive costs and time delays involved in archival and retrieval of older data can be minimized if a single technology were available with low-cost on-line storage and a low-cost archival media with long shelf life. Optical discs promise to come one to two orders of magnitude closer to the limiting case of free mass storage than ever before. Other features of optical discs include improved reliability and a single technology for both on-line and archival storage with a long shelf life. Because of these features and because of (not in spite of) their non-deletion limitation, it is argued that optical discs fit the requirements of database systems better than magnetic discs and tapes.}, booktitle = {Proceedings of the Fifth Workshop on Computer Architecture for Non-Numeric Processing}, pages = {1–7}, numpages = {7}, location = {Pacific Grove, California, USA}, series = {CAW '80} }
@article{10.1145/971697.602300, author = {Copeland, George and Maier, David}, title = {Making {Smalltalk} a Database System}, year = {1984}, issue_date = {June 1984}, volume = {14}, number = {2}, issn = {0163-5808}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/971697.602300}, doi = {10.1145/971697.602300}, abstract = {To overcome limitations in the modeling power of existing database systems and provide a better tool for database application programming, Servio Logic Corporation is developing a computer system to support a set-theoretic data model in an object-oriented programming environment. We recount the problems with existing models and database systems. We then show how features of Smalltalk, such as operational semantics, its type hierarchy, entity identity and the merging of programming and data language, solve many of those problems. Next we consider what Smalltalk lacks as a database system: secondary storage management, a declarative semantics, concurrency, past states. To address these shortcomings, we needed a formal data model. We introduce the GemStone data model, and show how it helps to define path expressions, a declarative semantics and object history in the OPAL language. We summarize similar approaches, and give a brief overview of the GemStone system implementation.}, journal = {SIGMOD Rec.}, month = jun, pages = {316–325}, numpages = {10} }
@article{10.1109/69.755613, author = {Jensen, Christian S. and Snodgrass, Richard Thomas}, title = {Temporal Data Management}, year = {1999}, issue_date = {January 1999}, publisher = {IEEE Educational Activities Department}, address = {USA}, volume = {11}, number = {1}, issn = {1041-4347}, url = {http://www2.cs.arizona.edu/~rts/pubs/TKDEJan99.pdf}, doi = {10.1109/69.755613}, abstract = {A wide range of database applications manage time-varying information. Existing database technology currently provides little support for managing such data. The research area of temporal databases has made important contributions in characterizing the semantics of such information and in providing expressive and efficient means to model, store, and query temporal data. This paper introduces the reader to temporal data management, surveys state-of-the-art solutions to challenging aspects of temporal data management, and points to research directions.}, journal = {IEEE Trans. on Knowl. and Data Eng.}, month = jan, pages = {36–44}, numpages = {9}, keywords = {temporal database, SQL, time-constrained database, query language, TSQL2, transaction time, valid time, temporal data model, user-defined time} }
@book{10.5555/320037, author = {Snodgrass, Richard Thomas}, title = {Developing Time-Oriented Database Applications in {SQL}}, year = {1999}, isbn = {1558604367}, doi = {10.5555/320037}, url = {http://www2.cs.arizona.edu/~rts/tdbbook.pdf}, publisher = {Morgan Kaufmann Publishers Inc.}, address = {San Francisco, CA, USA} }
@article{10.1145/3180143, author = {Ngo, Hung Q. and Porat, Ely and R\'{e}, Christopher and Rudra, Atri}, title = {Worst-Case Optimal Join Algorithms}, year = {2018}, issue_date = {June 2018}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {65}, number = {3}, issn = {0004-5411}, url = {https://www.cs.stanford.edu/people/chrismre/papers/paper49.Ngo.pdf}, doi = {10.1145/3180143}, abstract = {Efficient join processing is one of the most fundamental and well-studied tasks in database research. In this work, we examine algorithms for natural join queries over many relations and describe a new algorithm to process these queries optimally in terms of worst-case data complexity. Our result builds on recent work by Atserias, Grohe, and Marx, who gave bounds on the size of a natural join query in terms of the sizes of the individual relations in the body of the query. These bounds, however, are not constructive: they rely on Shearer’s entropy inequality, which is information-theoretic. Thus, the previous results leave open the question of whether there exist algorithms whose runtimes achieve these optimal bounds. An answer to this question may be interesting to database practice, as we show in this article that any project-join style plans, such as ones typically employed in a relational database management system, are asymptotically slower than the optimal for some queries. We present an algorithm whose runtime is worst-case optimal for all natural join queries. Our result may be of independent interest, as our algorithm also yields a constructive proof of the general fractional cover bound by Atserias, Grohe, and Marx without using Shearer’s inequality. This bound implies two famous inequalities in geometry: the Loomis-Whitney inequality and its generalization, the Bollob\'{a}s-Thomason inequality. Hence, our results algorithmically prove these inequalities as well. Finally, we discuss how our algorithm can be used to evaluate full conjunctive queries optimally, to compute a relaxed notion of joins and to optimally (in the worst-case) enumerate all induced copies of a fixed subgraph inside of a given large graph.}, journal = {J. ACM}, month = mar, articleno = {16}, numpages = {40}, keywords = {Join Algorithms, fractional cover bound, Bollob\'{a}s-Thomason inequality, Loomis-Whitney inequality} }
@article{10.14778/3436905.3436913, author = {Li, Tianyu and Butrovich, Matthew and Ngom, Amadou and Lim, Wan Shen and McKinney, Wes and Pavlo, Andrew}, title = {Mainlining Databases: Supporting Fast Transactional Workloads on Universal Columnar Data File Formats}, year = {2021}, issue_date = {December 2020}, publisher = {VLDB Endowment}, volume = {14}, number = {4}, issn = {2150-8097}, url = {https://db.cs.cmu.edu/papers/2020/p534-li.pdf}, doi = {10.14778/3436905.3436913}, abstract = {The proliferation of modern data processing tools has given rise to open-source columnar data formats. These formats help organizations avoid repeated conversion of data to a new format for each application. However, these formats are read-only, and organizations must use a heavy-weight transformation process to load data from on-line transactional processing (OLTP) systems. As a result, DBMSs often fail to take advantage of full network bandwidth when transferring data. We aim to reduce or even eliminate this overhead by developing a storage architecture for in-memory database management systems (DBMSs) that is aware of the eventual usage of its data and emits columnar storage blocks in a universal open-source format. We introduce relaxations to common analytical data formats to efficiently update records and rely on a lightweight transformation process to convert blocks to a read-optimized layout when they are cold. We also describe how to access data from third-party analytical tools with minimal serialization overhead. We implemented our storage engine based on the Apache Arrow format and integrated it into the NoisePage DBMS to evaluate our work. Our experiments show that our approach achieves comparable performance with dedicated OLTP DBMSs while enabling orders-of-magnitude faster data exports to external data science and machine learning tools than existing methods.}, journal = {Proceedings of the VLDB Endowment}, month = feb, pages = {534–546}, numpages = {13} }
@inproceedings{CIDR-BonczMonetDBX100HyperPipeliningQueryExecution, title = {{MonetDB/X100}: Hyper-Pipelining Query Execution}, author = {Boncz, Peter A. and Zukowski, Marcin and Nes, Niels}, booktitle = {Second Biennial Conference on Innovative Data Systems Research, {CIDR} 2005, Asilomar, CA, USA, January 4-7, 2005, Online Proceedings}, pages = {225–237}, publisher = {www.cidrdb.org}, year = {2005}, url = {http://cidrdb.org/cidr2005/papers/P19.pdf}, timestamp = {Mon, 18 Jul 2022 17:13:00 +0200}, biburl = {https://dblp.org/rec/conf/cidr/BonczZN05.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }
@article{10.1007/s00778-002-0074-9, author = {Ailamaki, Anastassia and DeWitt, David J. and Hill, Mark D.}, title = {Data Page Layouts for Relational Databases on Deep Memory Hierarchies}, year = {2002}, issue_date = {November 2002}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, volume = {11}, number = {3}, issn = {1066-8888}, url = {https://research.cs.wisc.edu/multifacet/papers/vldbj02_pax.pdf}, doi = {10.1007/s00778-002-0074-9}, abstract = {Relational database systems have traditionally optimized for I/O performance and organized records sequentially on disk pages using the N-ary Storage Model (NSM) (a.k.a., slotted pages). Recent research, however, indicates that cache utilization and performance are becoming increasingly important on modern platforms. In this paper, we first demonstrate that in-page data placement is the key to high cache performance and that NSM exhibits low cache utilization on modern platforms. Next, we propose a new data organization model called PAX (Partition Attributes Across), that significantly improves cache performance by grouping together all values of each attribute within each page. Because PAX only affects layout inside the pages, it incurs no storage penalty and does not affect I/O behavior. According to our experimental results (which were obtained without using any indices on the participating relations), when compared to NSM: (a) PAX exhibits superior cache and memory bandwidth utilization, saving at least 75\% of NSM's stall time due to data cache accesses; (b) range selection queries and updates on memory-resident relations execute 17-25\% faster; and (c) TPC-H queries involving I/O execute 11-48\% faster. Finally, we show that PAX performs well across different memory system designs.}, journal = {The VLDB Journal}, month = nov, pages = {198–215}, numpages = {18}, keywords = {Disk page layout, Relational data placement, Cache-conscious database systems} }
@techreport{ISO/IEC-9075-2:2016, author = {ISO/IEC 9075-2:2016}, title = {Information technology — Database languages — {SQL} — Part 2: Foundation {(SQL/Foundation)}}, note = {https://www.iso.org/standard/63556.html}, url = {https://www.iso.org/standard/63556.html}, year = {2016}, month = dec, type = {Standard}, institution = {ISO/IEC} }
@techreport{ISO/IEC-19075-2:2021, author = {ISO/IEC 19075-2:2021}, title = {Information technology — Guidance for the use of database language {SQL} — Part 2: Time-related information}, note = {https://www.iso.org/standard/78933.html}, url = {https://www.iso.org/standard/78933.html}, year = {2021}, month = aug, type = {Standard}, institution = {ISO/IEC} }
@techreport{ISO/IEC-19075-6:2021, author = {ISO/IEC 19075-6:2021}, title = {Information technology — Guidance for the use of database language {SQL} — Part 6: Support for {JSON}}, note = {https://www.iso.org/standard/78937.html}, url = {https://www.iso.org/standard/78937.html}, year = {2021}, month = aug, type = {Standard}, institution = {ISO/IEC} }
@inproceedings{10.1145/3318464.3380579, author = {Nathan, Vikram and Ding, Jialin and Alizadeh, Mohammad and Kraska, Tim}, title = {Learning Multi-Dimensional Indexes}, year = {2020}, isbn = {9781450367356}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://arxiv.org/pdf/1912.01668.pdf}, doi = {10.1145/3318464.3380579}, abstract = {Scanning and filtering over multi-dimensional tables are key operations in modern analytical database engines. To optimize the performance of these operations, databases often create clustered indexes over a single dimension or multi-dimensional indexes such as R-Trees, or use complex sort orders (e.g., Z-ordering). However, these schemes are often hard to tune and their performance is inconsistent across different datasets and queries. In this paper, we introduce Flood, a multi-dimensional in-memory read-optimized index that automatically adapts itself to a particular dataset and workload by jointly optimizing the index structure and data storage layout. Flood achieves up to three orders of magnitude faster performance for range scans with predicates than state-of-the-art multi-dimensional indexes or sort orders on real-world datasets and workloads. Our work serves as a building block towards an end-to-end learned database system.}, booktitle = {Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data}, pages = {985–1000}, numpages = {16}, keywords = {in-memory, multi-dimensional, indexing, databases, primary index}, location = {Portland, OR, USA}, series = {SIGMOD '20} }
@misc{YouTube-Raberg-YjAVsvYGbuU, author = {Råberg, Håkan}, title = {The Design and Implementation of a Bitemporal {DBMS}}, url = {https://www.youtube.com/watch?v=YjAVsvYGbuU}, year = {2019}, month = sep, keywords = {temporal, bitemporal, z-curves}, location = {Helsinki, Finland}, series = {ClojuTRE 2019} }
@misc{YouTube-Raberg-Px-7TlceM5A, author = {Råberg, Håkan}, title = {Light and Adaptive Indexing for Immutable Databases}, url = {https://www.youtube.com/watch?v=Px-7TlceM5A}, year = {2022}, month = sep, keywords = {machine learning, adaptive indexes, databases, indexing, separation of storage from compute}, location = {St. Louis, MO, USA}, series = {Strange Loop 2022} }
@inproceedings{CIDR-IdreosDatabaseCracking, author = {Idreos, Stratos and Kersten, Martin and Manegold, Stefan}, title = {Database Cracking}, booktitle = {Conference on Innovative Data Systems Research}, year = {2007}, month = jan, url = {https://stratos.seas.harvard.edu/files/IKM_CIDR07.pdf} }
@inproceedings{10.1145/800296.811515, author = {Chamberlin, Donald D. and Boyce, Raymond F.}, title = {{SEQUEL}: A Structured {English} Query Language}, year = {1974}, isbn = {9781450374156}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/800296.811515}, doi = {10.1145/800296.811515}, abstract = {In this paper we present the data manipulation facility for a structured English query language (SEQUEL) which can be used for accessing data in an integrated relational data base. Without resorting to the concepts of bound variables and quantifiers SEQUEL identifies a set of simple operations on tabular structures, which can be shown to be of equivalent power to the first order predicate calculus. A SEQUEL user is presented with a consistent set of keyword English templates which reflect how people use tables to obtain information. Moreover, the SEQUEL user is able to compose these basic templates in a structured manner in order to form more complex queries. SEQUEL is intended as a data base sublanguage for both the professional programmer and the more infrequent data base user.}, booktitle = {Proceedings of the 1974 ACM SIGFIDET (Now SIGMOD) Workshop on Data Description, Access and Control}, pages = {249–264}, numpages = {16}, keywords = {Information Retrieval, Data Base Management Systems, Query Languages, Data Manipulation Languages}, location = {Ann Arbor, Michigan}, series = {SIGFIDET '74} }
@article{10.1145/362384.362685, author = {Codd, E. F.}, title = {A Relational Model of Data for Large Shared Data Banks}, year = {1970}, issue_date = {June 1970}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {13}, number = {6}, issn = {0001-0782}, url = {https://dl.acm.org/doi/pdf/10.1145/362384.362685}, doi = {10.1145/362384.362685}, abstract = {Future users of large data banks must be protected from having to know how the data is organized in the machine (the internal representation). A prompting service which supplies such information is not a satisfactory solution. Activities of users at terminals and most application programs should remain unaffected when the internal representation of data is changed and even when some aspects of the external representation are changed. Changes in data representation will often be needed as a result of changes in query, update, and report traffic and natural growth in the types of stored information. Existing noninferential, formatted data systems provide users with tree-structured files or slightly more general network models of the data. In Section 1, inadequacies of these models are discussed. A model based on n-ary relations, a normal form for data base relations, and the concept of a universal data sublanguage are introduced. In Section 2, certain operations on relations (other than logical inference) are discussed and applied to the problems of redundancy and consistency in the user's model.}, journal = {Commun. ACM}, month = jun, pages = {377–387}, numpages = {11}, keywords = {composition, data base, redundancy, data structure, data bank, predicate calculus, retrieval language, relations, hierarchies of data, data organization, data integrity, consistency, networks of data, security, derivability, join} }
@article{10.1145/262762.262770, author = {McHugh, Jason and Abiteboul, Serge and Goldman, Roy and Quass, Dallas and Widom, Jennifer}, title = {Lore: A Database Management System for Semistructured Data}, year = {1997}, issue_date = {Sept. 1997}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {26}, number = {3}, issn = {0163-5808}, url = {https://dl.acm.org/doi/pdf/10.1145/262762.262770}, doi = {10.1145/262762.262770}, abstract = {Lore (for Lightweight Object Repository) is a DBMS designed specifically for managing semistructured information. Implementing Lore has required rethinking all aspects of a DBMS, including storage management, indexing, query processing and optimization, and user interfaces. This paper provides an overview of these aspects of the Lore system, as well as other novel features such as dynamic structural summaries and seamless access to data from external sources.}, journal = {SIGMOD Rec.}, month = sep, pages = {54–66}, numpages = {13} }
@inproceedings{10.1145/1376616.1376645, author = {Brantner, Matthias and Florescu, Daniela and Graf, David and Kossmann, Donald and Kraska, Tim}, title = {Building a Database on {S3}}, year = {2008}, isbn = {9781605581026}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://people.csail.mit.edu/kraska/pub/sigmod08-s3.pdf}, doi = {10.1145/1376616.1376645}, abstract = {There has been a great deal of hype about Amazon's simple storage service (S3). S3 provides infinite scalability and high availability at low cost. Currently, S3 is used mostly to store multi-media documents (videos, photos, audio) which are shared by a community of people and rarely updated. The purpose of this paper is to demonstrate the opportunities and limitations of using S3 as a storage system for general-purpose database applications which involve small objects and frequent updates. Read, write, and commit protocols are presented. Furthermore, the cost (\$), performance, and consistency properties of such a storage system are studied.}, booktitle = {Proceedings of the 2008 ACM SIGMOD International Conference on Management of Data}, pages = {251–264}, numpages = {14}, keywords = {ec2, database, simpledb, cost trade-off, aws, storage system, s3, concurrency, sqs, performance, cloud computing, eventual consistency}, location = {Vancouver, Canada}, series = {SIGMOD '08} }
@article{10.14778/3415478.3415534, author = {Liu, Zhen Hua and Hammerschmidt, Beda and McMahon, Doug and Chang, Hui and Lu, Ying and Spiegel, Josh and Sosa, Alfonso Colunga and Suresh, Srikrishnan and Arora, Geeta and Arora, Vikas}, title = {Native {JSON} Datatype Support: Maturing {SQL} and {NoSQL} Convergence in {Oracle} {Database}}, year = {2020}, issue_date = {August 2020}, publisher = {VLDB Endowment}, volume = {13}, number = {12}, issn = {2150-8097}, url = {http://www.vldb.org/pvldb/vol13/p3059-liu.pdf}, doi = {10.14778/3415478.3415534}, abstract = {Both RDBMS and NoSQL database vendors have added varying degrees of support for storing and processing JSON data. Some vendors store JSON directly as text while others add new JSON type systems backed by binary encoding formats. The latter option is increasingly popular as it enables richer type systems and efficient query processing. In this paper, we present our new native JSON datatype and how it is fully integrated with the Oracle Database ecosystem to transform Oracle Database into a mature platform for serving both SQL and NoSQL style access paradigms. We show how our uniquely designed Oracle Binary JSON format (OSON) is able to speed up both OLAP and OLTP workloads over JSON documents.}, journal = {Proceedings of the VLDB Endowment}, month = sep, pages = {3059–3071}, numpages = {13} }
@inproceedings{10.1145/2588555.2595628, author = {Liu, Zhen Hua and Hammerschmidt, Beda and McMahon, Doug}, title = {{JSON} Data Management: Supporting Schema-Less Development in {RDBMS}}, year = {2014}, isbn = {9781450323765}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://www.db.ics.keio.ac.jp/seminar/2015/20151106_moe/20151106_moe_SIGMOD.pdf}, doi = {10.1145/2588555.2595628}, abstract = {Relational Database Management Systems (RDBMS) have been very successful at managing structured data with well-defined schemas. Despite this, relational systems are generally not the first choice for management of data where schemas are not pre-defined or must be flexible in the face of variations and changes. Instead, No-SQL database systems supporting JSON are often selected to provide persistence to such applications. JSON is a light-weight and flexible semi-structured data format supporting constructs common in most programming languages. In this paper, we analyze the way in which requirements differ between management of relational data and management of JSON data. We present three architectural principles that facilitate a schema-less development style within an RDBMS so that RDBMS users can store, query, and index JSON data without requiring schemas. We show how these three principles can be applied to industry-leading RDBMS platforms, such as the Oracle RDBMS Server, with relatively little effort. Consequently, an RDBMS can unify the management of both relational data and JSON data in one platform and use SQL with an embedded JSON path language as a single declarative language to query both relational data and JSON data. This SQL/JSON approach offers significant benefits to application developers as they can use one product to manage both relational data and semi-structured flexible schema data.}, booktitle = {Proceedings of the 2014 ACM SIGMOD International Conference on Management of Data}, pages = {1247–1258}, numpages = {12}, keywords = {JSON, no-SQL, schema-less, XML, SQL/XML, SQL/JSON}, location = {Snowbird, Utah, USA}, series = {SIGMOD '14} }
@article{10.1007/s13222-017-0267-4, title = {{SQL/JSON} Standard: Properties and Deficiencies}, author = {Petkovi{\'c}, Du{\v s}an}, journal = {Datenbank-Spektrum}, year = {2017}, month = oct, volume = {17}, pages = {277–287}, doi = {10.1007/s13222-017-0267-4}, url = {https://www.researchgate.net/profile/Dusan-Petkovic-2/publication/320594498_SQLJSON_Standard_Properties_and_Deficiencies/links/625455194f88c3119cf27dc3/SQL-JSON-Standard-Properties-and-Deficiencies.pdf} }
@article{Couchbase-Chamberlin2019ComparingSQLPlusPlusSQL2016, title = {Comparing Two {SQL}-Based Approaches for Querying {JSON}: {SQL++} and {SQL:2016}}, author = {Chamberlin, Don}, journal = {Couchbase Resources}, year = {2019}, month = aug, url = {https://info.couchbase.com/rs/302-GJY-034/images/Comparing_Two_SQL_Based_Approaches_WP.pdf} }
@techreport{W3C-RobieXQuery31, author = {Robie, Jonathan and Dyck, Michael and Spiegel, Josh}, title = {{XQuery} 3.1: An {XML} Query Language}, url = {https://www.w3.org/TR/xquery-31/}, note = {Accessed: {2023-01-01}}, year = {2017}, month = mar, type = {Recommendation}, institution = {W3C} }
@misc{Chamberlin2003XQuery, author = {Chamberlin, Don}, title = {{XQuery}: A Query Language for {XML} (slides)}, month = jun, year = {2003}, url = {https://cseweb.ucsd.edu/classes/wi19/cse232B-a/papers/sigmod03_xquery.pdf}, note = {Accessed: {2023-01-01}} }
@misc{CHM-McJonesOralHistoryOfDonaldChamberlin, title = {Oral History of {Donald} {Chamberlin}}, author = {McJones, Paul}, publisher = {Computer History Museum}, url = {https://archive.computerhistory.org/resources/access/text/2015/06/102702111-05-01-acc.pdf}, month = jul, year = {2009} }
@inproceedings{CIDR-LiuManagementOfFlexibleSchemaDataInRDBMS, title = {Management of Flexible Schema Data in {RDBMSs} - Opportunities and Limitations for {NoSQL} -}, author = {Liu, Zhen Hua and Gawlick, Dieter}, booktitle = {Conference on Innovative Data Systems Research}, year = {2015}, url = {https://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper5.pdf} }
@article{10.14778/1687553.1687618, author = {Manegold, Stefan and Kersten, Martin L. and Boncz, Peter}, title = {Database Architecture Evolution: Mammals Flourished Long before Dinosaurs Became Extinct}, year = {2009}, issue_date = {August 2009}, publisher = {VLDB Endowment}, volume = {2}, number = {2}, issn = {2150-8097}, url = {https://www.researchgate.net/profile/Martin-Kersten-2/publication/220538804_Database_Architecture_Evolution_Mammals_Flourished_long_before_Dinosaurs_became_Extinct/links/09e4150adcb360bdcd000000/Database-Architecture-Evolution-Mammals-Flourished-long-before-Dinosaurs-became-Extinct.pdf}, doi = {10.14778/1687553.1687618}, abstract = {The holy grail for database architecture research is to find a solution that is Scalable & Speedy, to run on anything from small ARM processors up to globally distributed compute clusters, Stable & Secure, to service a broad user community, Small & Simple, to be comprehensible to a small team of programmers, Self-managing, to let it run out-of-the-box without hassle. In this paper, we provide a trip report on this quest, covering both past experiences, ongoing research on hardware-conscious algorithms, and novel ways towards self-management specifically focused on column store solutions.}, journal = {Proceedings of the VLDB Endowment}, month = aug, pages = {1648–1653}, numpages = {6} }
@inproceedings{10.1145/3373376.3378504, author = {Bindschaedler, Laurent and Goel, Ashvin and Zwaenepoel, Willy}, title = {Hailstorm: Disaggregated Compute and Storage for Distributed {LSM}-Based Databases}, year = {2020}, isbn = {9781450371025}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://www.eecg.utoronto.ca/~ashvin/publications/hailstorm.pdf}, doi = {10.1145/3373376.3378504}, abstract = {Distributed LSM-based databases face throughput and latency issues due to load imbalance across instances and interference from background tasks such as flushing, compaction, and data migration. Hailstorm addresses these problems by deploying the database storage engines over a distributed filesystem that disaggregates storage from processing, enabling storage pooling and compaction offloading. Hailstorm pools storage devices within a rack, allowing each storage engine to fully utilize the aggregate rack storage capacity and bandwidth. Storage pooling successfully handles load imbalance without the need for resharding. Hailstorm offloads compaction tasks to remote nodes, distributing their impact, and improving overall system throughput and response time. We show that Hailstorm achieves load balance in many MongoDB deployments with skewed workloads, improving the average throughput by 60\%, while decreasing tail latency by as much as 5X. In workloads with range queries, Hailstorm provides up to 22X throughput improvements. Hailstorm also enables cost savings of 47-56\% in OLTP workloads.}, booktitle = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems}, pages = {301–316}, numpages = {16}, keywords = {rocksdb, storage, ycsb, key-value store, disaggregation, mongodb, compaction offloading, tpc-e, hailstorm, tikv, database, skew, tidb, tpc-c, compute, distributed}, location = {Lausanne, Switzerland}, series = {ASPLOS '20} }
@article{10.1016/S0306-4379(03)00047-4, author = {Torp, Kristian and Jensen, Christian S. and Snodgrass, Richard T.}, title = {Modification Semantics in Now-Relative Databases}, year = {2004}, issue_date = {December 2004}, publisher = {Elsevier Science Ltd.}, address = {GBR}, volume = {29}, number = {8}, issn = {0306-4379}, url = {https://www2.cs.arizona.edu/~rts/pubs/ISDec04.pdf}, doi = {10.1016/S0306-4379(03)00047-4}, abstract = {Most real-world databases record time-varying information. In such databases, the notion of "the current time," or NOW, occurs naturally and prominently. For example, when capturing the past states of a relation using begin and end time columns, tuples that are part of the current state have some past time as their begin time and NOW as their end time. While the semantics of such variable databases has been described in detail and is well understood, the modification of variable databases remains unexplored. This paper defines the semantics of modifications involving the variable NOW. More specifically, the problems with modifications in the presence of NOW are explored, illustrating that the main problems are with modifications of tuples that reach into the future. The paper defines the semantics of modifications--including insertions, deletions, and updates--of databases without NOW, with NOW, and with values of the type NOW + Δ, where Δ is a non-variable time duration. To accommodate these semantics, three new timestamp values are introduced. Finally, implementation is explored. We show how to represent the variable NOW with columns of standard SQL data types and give a mapping from SQL on NOW-relative data to standard SQL on these columns. The paper thereby completes the semantics, the querying, and the modification of now-relative databases.}, journal = {Inf. Syst.}, month = dec, pages = {653–683}, numpages = {31}, keywords = {updates, temporal query language, temporal data, SQL, Now-relative information, Now} }
@article{10.1145/22952.22956, author = {Snodgrass, Richard}, title = {The Temporal Query Language {TQuel}}, year = {1987}, issue_date = {June 1987}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {12}, number = {2}, issn = {0362-5915}, url = {https://dl.acm.org/doi/pdf/10.1145/22952.22956}, doi = {10.1145/22952.22956}, abstract = {Recently, attention has been focused on temporal databases, representing an enterprise over time. We have developed a new language, Tquel, to query a temporal database. TQuel was designed to be a minimal extension, both syntactically and semantically, of Quel, the query language in the Ingres relational database management system. This paper discusses the language informally, then provides a tuple relational calculus semantics for the TQuel statements that differ from their Quel counterparts, including the modification statements. The three additional temporal constructs defined in Tquel are shown to be direct semantic analogues of Quel's where clause and target list. We also discuss reducibility of the semantics to Quel's semantics when applied to a static database. TQuel is compared with ten other query languages supporting time.}, journal = {ACM Trans. Database Syst.}, month = jun, pages = {247–298}, numpages = {52} }
@inproceedings{DBLP-NeumannUnnestingArbitraryQueries, author = {Neumann, Thomas and Kemper, Alfons}, title = {Unnesting Arbitrary Queries}, booktitle = {Datenbanksysteme f{\"{u}}r Business, Technologie und Web (BTW), 16. Fachtagung des GI-Fachbereichs "Datenbanken und Informationssysteme" (DBIS), 4.-6.3.2015 in Hamburg, Germany. Proceedings}, series = {{LNI}}, volume = {{P-241}}, pages = {383–402}, publisher = {{GI}}, year = {2015}, url = {https://cs.emis.de/LNI/Proceedings/Proceedings241/383.pdf}, timestamp = {Thu, 14 Nov 2019 16:35:26 +0100}, biburl = {https://dblp.org/rec/conf/btw/0001K15.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }
@inproceedings{10.1007/978-3-642-03784-9_3, author = {Brisaboa, Nieves and Ladra, Susana and Navarro, Gonzalo}, title = {$k^2$-Trees for Compact Web Graph Representation}, booktitle = {String Processing and Information Retrieval}, year = {2009}, month = aug, pages = {18–30}, isbn = {978-3-642-03783-2}, series = {Lecture Notes in Computer Science}, doi = {10.1007/978-3-642-03784-9_3}, url = {https://lbd.udc.es/Repository/Publications/Drafts/K2treforCom.pdf} }
@article{10.1145/351827.351829, author = {Eppstein, David}, title = {Fast Hierarchical Clustering and Other Applications of Dynamic Closest Pairs}, year = {2001}, issue_date = {2000}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {5}, issn = {1084-6654}, url = {https://dl.acm.org/doi/pdf/10.1145/351827.351829}, doi = {10.1145/351827.351829}, abstract = {We develop data structures for dynamic closest pair problems with arbitrary distance functions, that do not necessarily come from any geometric structure on the objects. Based on a technique previously used by the author for Euclidean closest pairs, we show how to insert and delete objects from an n-object set, maintaining the closest pair, in O(n log2 n) time per update and O(n) space. With quadratic space, we can instead use a quadtree-like structure to achieve an optimal time bound, O(n) per update. We apply these data structures to hierarchical clustering, greedy matching, and TSP heuristics, and discuss other potential applications in machine learning, Gr\"{o}bner bases, and local improvement algorithms for partition and placement problems. Experiments show our new methods to be faster in practice than previously used heuristics.}, journal = {ACM J. Exp. Algorithmics}, month = dec, pages = {1–es}, numpages = {23}, keywords = {nearest-neighbor heuristic, matching, quadtree, conga line data structure, TSP} }
@inbook{10.1137/1.9781611976014.6, author = {Chan, Timothy M.}, title = {Dynamic Generalized Closest Pair: Revisiting {Eppstein's} Technique}, booktitle = {Symposium on Simplicity in Algorithms (SOSA)}, year = {2020}, publisher = {Society for Industrial and Applied Mathematics}, pages = {33–37}, doi = {10.1137/1.9781611976014.6}, url = {https://tmc.web.engr.illinois.edu/dyn_cp.pdf}, eprint = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611976014.6} }
@article{10.1145/202660.202667, author = {Darwen, Hugh and Date, C. J.}, title = {The Third Manifesto}, year = {1995}, issue_date = {March 1995}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {24}, number = {1}, issn = {0163-5808}, url = {https://dl.acm.org/doi/pdf/10.1145/202660.202667}, doi = {10.1145/202660.202667}, abstract = {We present a manifesto for the future direction of data and database management systems. The manifesto consists of a series of prescriptions, proscriptions, and "very strong suggestions."}, journal = {SIGMOD Rec.}, month = mar, pages = {39–49}, numpages = {11} }
@book{978-1-4493-7332-0, author = {Kleppmann, Martin}, title = {Designing Data-Intensive Applications}, year = {2017}, isbn = {978-1-4493-7332-0}, publisher = {O'Reilly}, url = {https://www.safaribooksonline.com/library/view/designing-data-intensive-applications/9781491903063/}, abstract = {Data is at the center of many challenges in system design today. Difficult issues need to be figured out, such as scalability, consistency, reliability, efficiency, and maintainability. In addition, we have an overwhelming variety of tools, including relational databases, NoSQL datastores, stream or batch processors, and message brokers. What are the right choices for your application? How do you make sense of all these buzzwords? In this practical and comprehensive guide, author Martin Kleppmann helps you navigate this diverse landscape by examining the pros and cons of various technologies for processing and storing data. Software keeps changing, but the fundamental principles remain the same.} }
@misc{CMU-PavloFall2020, author = {Pavlo, Andy}, title = {Introduction to Database Systems}, url = {https://15445.courses.cs.cmu.edu/fall2020/schedule.html}, publisher = {Carnegie Mellon University School of Computer Science}, year = {2020} }
@misc{CMU-PavloSpring2020, author = {Pavlo, Andy}, title = {Advanced Database Systems}, url = {https://15721.courses.cs.cmu.edu/spring2020/schedule.html}, publisher = {Carnegie Mellon University School of Computer Science}, year = {2020} }
@article{10.14778/3415478.3415560, author = {Armbrust, Michael and Das, Tathagata and Sun, Liwen and Yavuz, Burak and Zhu, Shixiong and Murthy, Mukul and Torres, Joseph and van Hovell, Herman and Ionescu, Adrian and \L{}uszczak, Alicja and {\'{S}}witakowski, Micha\l{} and Szafra\'{n}ski, Micha\l{} and Li, Xiao and Ueshin, Takuya and Mokhtar, Mostafa and Boncz, Peter and Ghodsi, Ali and Paranjpye, Sameer and Senster, Pieter and Xin, Reynold and Zaharia, Matei}, title = {Delta Lake: High-Performance {ACID} Table Storage over Cloud Object Stores}, year = {2020}, issue_date = {August 2020}, publisher = {VLDB Endowment}, volume = {13}, number = {12}, issn = {2150-8097}, url = {https://www.databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf}, doi = {10.14778/3415478.3415560}, abstract = {Cloud object stores such as Amazon S3 are some of the largest and most cost-effective storage systems on the planet, making them an attractive target to store large data warehouses and data lakes. Unfortunately, their implementation as key-value stores makes it difficult to achieve ACID transactions and high performance: metadata operations such as listing objects are expensive, and consistency guarantees are limited. In this paper, we present Delta Lake, an open source ACID table storage layer over cloud object stores initially developed at Databricks. Delta Lake uses a transaction log that is compacted into Apache Parquet format to provide ACID properties, time travel, and significantly faster metadata operations for large tabular datasets (e.g., the ability to quickly search billions of table partitions for those relevant to a query). It also leverages this design to provide high-level features such as automatic data layout optimization, upserts, caching, and audit logs. Delta Lake tables can be accessed from Apache Spark, Hive, Presto, Redshift and other systems. Delta Lake is deployed at thousands of Databricks customers that process exabytes of data per day, with the largest instances managing exabyte-scale datasets and billions of objects.}, journal = {Proceedings of the VLDB Endowment}, month = sep, pages = {3411–3424}, numpages = {14} }
@article{10.5555/1756006.1756034, author = {Ben-Haim, Yael and Yom-Tov, Elad}, title = {A Streaming Parallel Decision Tree Algorithm}, year = {2010}, issue_date = {3/1/2010}, publisher = {JMLR.org}, volume = {11}, issn = {1532-4435}, abstract = {We propose a new algorithm for building decision tree classifiers. The algorithm is executed in a distributed environment and is especially designed for classifying large data sets and streaming data. It is empirically shown to be as accurate as a standard decision tree classifier, while being scalable for processing of streaming data on multiple processors. These findings are supported by a rigorous analysis of the algorithm's accuracy. The essence of the algorithm is to quickly construct histograms at the processors, which compress the data to a fixed amount of memory. A master processor uses this information to find near-optimal split points to terminal tree nodes. Our analysis shows that guarantees on the local accuracy of split points imply guarantees on the overall tree accuracy.}, journal = {J. Mach. Learn. Res.}, month = mar, pages = {849–872}, numpages = {24}, url = {https://www.jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf}, doi = {10.5555/1756006.1756034} }
@comment{{10.1016/0196-6774(80)90015-2 does not appear to have a public PDF?}}
@article{10.1016/0196-6774(80)90015-2, title = {Decomposable searching problems I. Static-to-dynamic transformation}, author = {Bentley, Jon Louis and Saxe, James B.}, journal = {Journal of Algorithms}, volume = {1}, number = {4}, pages = {301–358}, year = {1980}, issn = {0196-6774}, doi = {10.1016/0196-6774(80)90015-2}, url = {https://www.sciencedirect.com/science/article/pii/0196677480900152}, abstract = {Transformations that serve as tools in the design of new data structures are investigated. Specifically, general methods for converting static structures (in which all elements are known before any searches are performed) to dynamic structures (in which insertions of new elements can be mixed with searches) are studied. Three classes of such transformations are exhibited, each based on a different counting scheme for representing the integers, and a combinatorial model is used to show the optimality of many of the transformations. Issues such as online data structures and deletion of elements are also examined. To demonstrate the applicability of these tools, several new data structures that have been developed by applying the transformations are studied.} }
@inproceedings{10.5555/645925.671364, author = {Boncz, Peter A. and Manegold, Stefan and Kersten, Martin L.}, title = {Database Architecture Optimized for the New Bottleneck: Memory Access}, year = {1999}, isbn = {1558606157}, publisher = {Morgan Kaufmann Publishers Inc.}, address = {San Francisco, CA, USA}, booktitle = {Proceedings of the 25th International Conference on Very Large Data Bases}, pages = {54–65}, numpages = {12}, series = {VLDB '99}, url = {https://www.vldb.org/conf/1999/P5.pdf}, doi = {10.5555/645925.671364} }
@article{10.1145/3534963, author = {Chen, Lu and Gao, Yunjun and Song, Xuan and Li, Zheng and Zhu, Yifan and Miao, Xiaoye and Jensen, Christian S.}, title = {Indexing Metric Spaces for Exact Similarity Search}, year = {2022}, issue_date = {July 2023}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {55}, number = {6}, issn = {0360-0300}, url = {https://arxiv.org/pdf/2005.03468.pdf}, doi = {10.1145/3534963}, abstract = {With the continued digitization of societal processes, we are seeing an explosion in available data. This is referred to as big data. In a research setting, three aspects of the data are often viewed as the main sources of challenges when attempting to enable value creation from big data: volume, velocity, and variety. Many studies address volume or velocity, while fewer studies concern the variety. Metric spaces are ideal for addressing variety because they can accommodate any data as long as it can be equipped with a distance notion that satisfies the triangle inequality. To accelerate search in metric spaces, a collection of indexing techniques for metric data has been proposed. However, existing surveys offer limited coverage, and a comprehensive empirical study has yet to be reported. We offer a comprehensive survey of existing metric indexes that support exact similarity search: we summarize existing partitioning, pruning, and validation techniques used by metric indexes to support exact similarity search; we provide the time and space complexity analyses of index construction; and we offer an empirical comparison of their query processing performance. Empirical studies are important when evaluating metric indexing performance, because performance can depend highly on the effectiveness of available pruning and validation as well as on the data distribution, which means that complexity analyses often offer limited insights. This article aims at revealing strengths and weaknesses of different indexing techniques to offer guidance on selecting an appropriate indexing technique for a given setting, and to provide directions for future research on metric indexing.}, journal = {ACM Comput. Surv.}, month = dec, articleno = {128}, numpages = {39}, keywords = {metric similarity search, Metric spaces, indexing and querying} }
@article{10.1145/3080008, author = {Cormode, Graham}, title = {Data Sketching}, year = {2017}, issue_date = {September 2017}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {60}, number = {9}, issn = {0001-0782}, url = {http://dimacs.rutgers.edu/~graham/pubs/papers/cacm-sketch.pdf}, doi = {10.1145/3080008}, abstract = {The approximate approach is often faster and more efficient.}, journal = {Commun. ACM}, month = aug, pages = {48–55}, numpages = {8} }
@inproceedings{CIDR-CrottyAreYouSureYouWantToUseMMAP, author = {Crotty, Andrew and Leis, Viktor and Pavlo, Andrew}, title = {Are You Sure You Want to Use {MMAP} in Your Database Management System?}, booktitle = {12th Conference on Innovative Data Systems Research, {CIDR} 2022, Chaminade, CA, USA, January 9-12, 2022}, publisher = {www.cidrdb.org}, year = {2022}, url = {https://www.cidrdb.org/cidr2022/papers/p13-crotty.pdf}, timestamp = {Mon, 18 Jul 2022 17:13:00 +0200}, biburl = {https://dblp.org/rec/conf/cidr/CrottyLP22.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }
@article{10.14778/3389133.3389135, author = {Ferragina, Paolo and Vinciguerra, Giorgio}, title = {The PGM-Index: A Fully-Dynamic Compressed Learned Index with Provable Worst-Case Bounds}, year = {2020}, issue_date = {April 2020}, publisher = {VLDB Endowment}, volume = {13}, number = {8}, issn = {2150-8097}, url = {http://www.vldb.org/pvldb/vol13/p1162-ferragina.pdf}, doi = {10.14778/3389133.3389135}, abstract = {We present the first learned index that supports predecessor, range queries and updates within provably efficient time and space bounds in the worst case. In the (static) context of just predecessor and range queries these bounds turn out to be optimal. We call this learned index the Piecewise Geometric Model index (PGM-index). Its flexible design allows us to introduce three variants which are novel in the context of learned data structures. The first variant of the PGM-index is able to adapt itself to the distribution of the query operations, thus resulting in the first known distribution-aware learned index to date. The second variant exploits the repetitiveness possibly present at the level of the learned models that compose the PGM-index to further compress its succinct space footprint. The third one is a multicriteria variant of the PGM-index that efficiently auto-tunes itself in a few seconds over hundreds of millions of keys to satisfy space-time constraints which evolve over time across users, devices and applications. These theoretical achievements are supported by a large set of experimental results on known datasets which show that the fully-dynamic PGM-index improves the space occupancy of existing traditional and learned indexes by up to three orders of magnitude, while still achieving their same or even better query and update time efficiency. As an example, in the static setting of predecessor and range queries, the PGM-index matches the query performance of a cache-optimised static B+-tree within two orders of magnitude (83\texttimes{}) less space; whereas in the fully-dynamic setting, where insertions and deletions are allowed, the PGM-index improves the query and update time performance of a B+-tree by up to 71\% within three orders of magnitude (1140\texttimes{}) less space.}, journal = {Proceedings of the VLDB Endowment}, month = may, pages = {1162–1175}, numpages = {14} }
@article{IEEE-TCDE-TheSapHanadatabaseAnArchitectureOverview, title = {The {SAP} {HANA} database - An architecture overview}, author = {F{\"a}rber, Franz and May, Norman and Lehner, Wolfgang and Grosse, Philipp and M{\"u}ller, Ingo and Rauhe, Hannes and Dees, Jonathan}, year = {2012}, month = mar, pages = {28–33}, volume = {35}, journal = {Bulletin of the IEEE Computer Society Technical Committee on Data Engineering}, url = {http://sites.computer.org/debull/A12mar/hana.pdf} }
@inproceedings{10.1145/2723372.2742795, author = {Gupta, Anurag and Agarwal, Deepak and Tan, Derek and Kulesza, Jakub and Pathak, Rahul and Stefani, Stefano and Srinivasan, Vidhya}, title = {{Amazon} {Redshift} and the Case for Simpler Data Warehouses}, year = {2015}, isbn = {9781450327589}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/2723372.2742795}, doi = {10.1145/2723372.2742795}, abstract = {Amazon Redshift is a fast, fully managed, petabyte-scale data warehouse solution that makes it simple and cost-effective to efficiently analyze large volumes of data using existing business intelligence tools. Since launching in February 2013, it has been Amazon Web Service's (AWS) fastest growing service, with many thousands of customers and many petabytes of data under management. Amazon Redshift's pace of adoption has been a surprise to many participants in the data warehousing community. While Amazon Redshift was priced disruptively at launch, available for as little as \$1000/TB/year, there are many open-source data warehousing technologies and many commercial data warehousing engines that provide free editions for development or under some usage limit. While Amazon Redshift provides a modern MPP, columnar, scale-out architecture, so too do many other data warehousing engines. And, while Amazon Redshift is available in the AWS cloud, one can build data warehouses using EC2 instances and the database engine of one's choice with either local or network-attached storage. In this paper, we discuss an oft-overlooked differentiating characteristic of Amazon Redshift -- simplicity. Our goal with Amazon Redshift was not to compete with other data warehousing engines, but to compete with non-consumption. We believe the vast majority of data is collected but not analyzed. We believe, while most database vendors target larger enterprises, there is little correlation in today's economy between data set size and company size. And, we believe the models used to procure and consume analytics technology need to support experimentation and evaluation. Amazon Redshift was designed to bring data warehousing to a mass market by making it easy to buy, easy to tune and easy to manage while also being fast and cost-effective.}, booktitle = {Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data}, pages = {1917–1923}, numpages = {7}, keywords = {amazon redshift, columnar database, data warehousing, mpp}, location = {Melbourne, Victoria, Australia}, series = {SIGMOD '15} }
@article{10.14778/2168651.2168652, author = {Halim, Felix and Idreos, Stratos and Karras, Panagiotis and Yap, Roland H. C.}, title = {Stochastic Database Cracking: Towards Robust Adaptive Indexing in Main-Memory Column-Stores}, year = {2012}, issue_date = {February 2012}, publisher = {VLDB Endowment}, volume = {5}, number = {6}, issn = {2150-8097}, url = {https://arxiv.org/pdf/1203.0055.pdf}, doi = {10.14778/2168651.2168652}, abstract = {Modern business applications and scientific databases call for inherently dynamic data storage environments. Such environments are characterized by two challenging features: (a) they have little idle system time to devote on physical design; and (b) there is little, if any, a priori workload knowledge, while the query and data workload keeps changing dynamically. In such environments, traditional approaches to index building and maintenance cannot apply. Database cracking has been proposed as a solution that allows on-the-fly physical data reorganization, as a collateral effect of query processing. Cracking aims to continuously and automatically adapt indexes to the workload at hand, without human intervention. Indexes are built incrementally, adaptively, and on demand. Nevertheless, as we show, existing adaptive indexing methods fail to deliver workload-robustness; they perform much better with random workloads than with others. This frailty derives from the inelasticity with which these approaches interpret each query as a hint on how data should be stored. Current cracking schemes blindly reorganize the data within each query's range, even if that results into successive expensive operations with minimal indexing benefit. In this paper, we introduce stochastic cracking, a significantly more resilient approach to adaptive indexing. Stochastic cracking also uses each query as a hint on how to reorganize data, but not blindly so; it gains resilience and avoids performance bottlenecks by deliberately applying certain arbitrary choices in its decision-making. Thereby, we bring adaptive indexing forward to a mature formulation that confers the workload-robustness previous approaches lacked. Our extensive experimental study verifies that stochastic cracking maintains the desired properties of original database cracking while at the same time it performs well with diverse realistic workloads.}, journal = {Proceedings of the VLDB Endowment}, month = feb, pages = {502–513}, numpages = {12} }
@inproceedings{10.1145/509383.509385, author = {Hammer, Michael and Chan, Arvola}, title = {Index Selection in a Self-Adaptive Data Base Management System}, year = {1976}, isbn = {9781450347297}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/509383.509385}, doi = {10.1145/509383.509385}, abstract = {We address the problem of automatically adjusting the physical organization of a data base to optimize its performance as its access requirements change. We describe the principles of the automatic index selection facility of a prototype self-adaptive data base management system that is currently under development. The importance of accurate usage model acquisition and data characteristics estimation is stressed. The statistics gathering mechanisms that are being incorporated into our prototype system are discussed. Exponential smoothing techniques are used for averaging statistics observed over different periods of time in order to predict future characteristics. An heuristic algorithm for selecting indices to match projected access requirements is presented. The cost model on which the decision procedure is based is flexible enough to incorporate the overhead costs of index creation, index storage and application program recompilation.}, booktitle = {Proceedings of the 1976 ACM SIGMOD International Conference on Management of Data}, pages = {1–8}, numpages = {8}, location = {Washington, D.C.}, series = {SIGMOD '76} }
@article{10.1145/2857274.2884038, author = {Helland, Pat}, title = {Immutability Changes Everything: We Need It, We Can Afford It, and the Time is Now}, year = {2015}, issue_date = {November-December 2015}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {13}, number = {9}, issn = {1542-7730}, url = {https://dl.acm.org/doi/pdf/10.1145/2857274.2884038}, doi = {10.1145/2857274.2884038}, journal = {Queue}, month = nov, pages = {101–125}, numpages = {25} }
@article{10.14778/3358701.3358705, author = {Holanda, Pedro and Raasveldt, Mark and Manegold, Stefan and M\"{u}hleisen, Hannes}, title = {Progressive Indexes: Indexing for Interactive Data Analysis}, year = {2019}, issue_date = {September 2019}, publisher = {VLDB Endowment}, volume = {12}, number = {13}, issn = {2150-8097}, url = {http://www.vldb.org/pvldb/vol12/p2366-holanda.pdf}, doi = {10.14778/3358701.3358705}, abstract = {Interactive exploration of large volumes of data is increasingly common, as data scientists attempt to extract interesting information from large opaque data sets. This scenario presents a difficult challenge for traditional database systems, as (1) nothing is known about the query workload in advance, (2) the query workload is constantly changing, and (3) the system must provide interactive responses to the issued queries. This environment is challenging for index creation, as traditional database indexes require upfront creation, hence a priori workload knowledge, to be efficient. In this paper, we introduce Progressive Indexing, a novel performance-driven indexing technique that focuses on automatic index creation while providing interactive response times to incoming queries. Its design allows queries to have a limited budget to spend on index creation. The indexing budget is automatically tuned to each query before query processing. This allows for systems to provide interactive answers to queries during index creation while being robust against various workload patterns and data distributions.}, journal = {Proc. VLDB Endow.}, month = sep, pages = {2366–2378}, numpages = {13} }
@article{10.14778/3415478.3415535, author = {Huang, Dongxu and Liu, Qi and Cui, Qiu and Fang, Zhuhe and Ma, Xiaoyu and Xu, Fei and Shen, Li and Tang, Liu and Zhou, Yuxing and Huang, Menglong and Wei, Wan and Liu, Cong and Zhang, Jian and Li, Jianjun and Wu, Xuelian and Song, Lingyu and Sun, Ruoxi and Yu, Shuaipeng and Zhao, Lei and Cameron, Nicholas and Pei, Liquan and Tang, Xin}, title = {{TiDB:} A Raft-Based {HTAP} Database}, year = {2020}, issue_date = {August 2020}, publisher = {VLDB Endowment}, volume = {13}, number = {12}, issn = {2150-8097}, url = {https://www.vldb.org/pvldb/vol13/p3072-huang.pdf}, doi = {10.14778/3415478.3415535}, abstract = {Hybrid Transactional and Analytical Processing (HTAP) databases require processing transactional and analytical queries in isolation to remove the interference between them. To achieve this, it is necessary to maintain different replicas of data specified for the two types of queries. However, it is challenging to provide a consistent view for distributed replicas within a storage system, where analytical requests can efficiently read consistent and fresh data from transactional workloads at scale and with high availability. To meet this challenge, we propose extending replicated state machine-based consensus algorithms to provide consistent replicas for HTAP workloads. Based on this novel idea, we present a Raft-based HTAP database: TiDB. In the database, we design a multi-Raft storage system which consists of a row store and a column store. The row store is built based on the Raft algorithm. It is scalable to materialize updates from transactional requests with high availability. In particular, it asynchronously replicates Raft logs to learners which transform row format to column format for tuples, forming a real-time updatable column store. This column store allows analytical queries to efficiently read fresh and consistent data with strong isolation from transactions on the row store. Based on this storage system, we build an SQL engine to process large-scale distributed transactions and expensive analytical queries. The SQL engine optimally accesses row-format and column-format replicas of data. We also include a powerful analysis engine, TiSpark, to help TiDB connect to the Hadoop ecosystem. Comprehensive experiments show that TiDB achieves isolated high performance under CH-benCHmark, a benchmark focusing on HTAP workloads.}, journal = {Proc. VLDB Endow.}, month = aug, pages = {3072–3084}, numpages = {13} }
@misc{10.48550/arxiv.1706.05714, author = {Idreos, Stratos and Maas, Lukas M. and Kester, Mike S.}, title = {Evolutionary Data Systems}, doi = {10.48550/ARXIV.1706.05714}, url = {https://arxiv.org/pdf/1706.05714.pdf}, keywords = {Databases (cs.DB), FOS: Computer and information sciences}, publisher = {arXiv}, year = {2017}, copyright = {arXiv.org perpetual, non-exclusive license} }
@article{10.14778/2002938.2002944, title = {Merging What's Cracked, Cracking What's Merged: Adaptive Indexing in Main-Memory Column-Stores}, author = {Idreos, Stratos and Manegold, Stefan and Kuno, Harumi and Graefe, Goetz}, journal = {Proc. VLDB Endow.}, publisher = {VLDB Endowment}, volume = {4}, number = {9}, issn = {2150-8097}, year = {2011}, month = jun, pages = {586–597}, numpages = {12}, doi = {10.14778/2002938.2002944}, url = {https://www.vldb.org/pvldb/vol4/p586-idreos.pdf} }
@article{IEEE-debu-IdreosPeriodicTableOfDataStructures, title = {The Periodic Table of Data Structures}, author = {Idreos, Stratos and Zoumpatianos, Kostas and Athanassoulis, Manos and Dayan, Niv and Hentschel, Brian and Kester, Michael S. and Guo, Demi and Maas, Lukas M. and Qin, Wilson and Wasay, Abdul and Sun, Yiyou}, journal = {IEEE Data Eng. Bull.}, year = {2018}, volume = {41}, number = {3}, pages = {64–75}, ee = {http://sites.computer.org/debull/A18sept/p64.pdf}, url = {https://stratos.seas.harvard.edu/files/stratos/files/periodictabledatastructures.pdf} }
@inproceedings{10.1145/3183713.3199671, author = {Idreos, Stratos and Zoumpatianos, Kostas and Hentschel, Brian and Kester, Michael S. and Guo, Demi}, title = {The Data Calculator: Data Structure Design and Cost Synthesis from First Principles and Learned Cost Models}, year = {2018}, isbn = {9781450347037}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://doi.org/10.1145/3183713.3199671}, doi = {10.1145/3183713.3199671}, abstract = {Data structures are critical in any data-driven scenario, but they are notoriously hard to design due to a massive design space and the dependence of performance on workload and hardware which evolve continuously. We present a design engine, the Data Calculator, which enables interactive and semi-automated design of data structures. It brings two innovations. First, it offers a set of fine-grained design primitives that capture the first principles of data layout design: how data structure nodes lay data out, and how they are positioned relative to each other. This allows for a structured description of the universe of possible data structure designs that can be synthesized as combinations of those primitives. The second innovation is computation of performance using learned cost models. These models are trained on diverse hardware and data profiles and capture the cost properties of fundamental data access primitives (e.g., random access). With these models, we synthesize the performance cost of complex operations on arbitrary data structure designs without having to: 1) implement the data structure, 2) run the workload, or even 3) access the target hardware. We demonstrate that the Data Calculator can assist data structure designers and researchers by accurately answering rich what-if design questions on the order of a few seconds or minutes, i.e., computing how the performance (response time) of a given data structure design is impacted by variations in the: 1) design, 2) hardware, 3) data, and 4) query workloads. This makes it effortless to test numerous designs and ideas before embarking on lengthy implementation, deployment, and hardware acquisition steps. We also demonstrate that the Data Calculator can synthesize entirely new designs, auto-complete partial designs, and detect suboptimal design choices.}, booktitle = {Proceedings of the 2018 International Conference on Management of Data}, pages = {535–550}, numpages = {16}, keywords = {learned cost models, data structure synthesis}, location = {Houston, TX, USA}, series = {SIGMOD '18} }
@inproceedings{10.1109/ICDE.2011.5767867, author = {Kemper, Alfons and Neumann, Thomas}, title = {{HyPer}: A Hybrid {OLTP} \& {OLAP} Main Memory Database System Based on Virtual Memory Snapshots}, booktitle = {2011 IEEE 27th International Conference on Data Engineering}, year = {2011}, pages = {195–206}, doi = {10.1109/ICDE.2011.5767867}, url = {https://cs.brown.edu/courses/cs227/archives/2012/papers/olap/hyper.pdf} }
@article{10.1145/1273445.1273458, author = {Keshav, S.}, title = {How to Read a Paper}, year = {2007}, issue_date = {July 2007}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {37}, number = {3}, issn = {0146-4833}, url = {http://ccr.sigcomm.org/online/files/p83-keshavA.pdf}, doi = {10.1145/1273445.1273458}, abstract = {Researchers spend a great deal of time reading research papers. However, this skill is rarely taught, leading to much wasted effort. This article outlines a practical and efficient three-pass method for reading research papers. I also describe how to use this method to do a literature survey.}, journal = {SIGCOMM Comput. Commun. Rev.}, month = jul, pages = {83–84}, numpages = {2}, keywords = {hints, reading, paper} }
@inproceedings{10.1145/3533702.3534912, author = {Kipf, Andreas and Horn, Dominik and Pfeil, Pascal and Marcus, Ryan and Kraska, Tim}, title = {{LSI:} A Learned Secondary Index Structure}, year = {2022}, isbn = {9781450393775}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://arxiv.org/pdf/2205.05769.pdf}, doi = {10.1145/3533702.3534912}, abstract = {Learned index structures have been shown to achieve favorable lookup performance and space consumption compared to their traditional counterparts such as B-trees. However, most learned index studies have focused on the primary indexing setting, where the base data is sorted. In this work, we investigate whether learned indexes sustain their advantage in the secondary indexing setting. We introduce Learned Secondary Index (LSI), a first attempt to use learned indexes for indexing unsorted data. LSI works by building a learned index over a permutation vector, which allows binary search to be performed on the unsorted base data using random access. We additionally augment LSI with a fingerprint vector to accelerate equality lookups. We show that LSI achieves comparable lookup performance to state-of-the-art secondary indexes while being up to 6x more space efficient.}, booktitle = {Proceedings of the Fifth International Workshop on Exploiting Artificial Intelligence Techniques for Data Management}, articleno = {4}, numpages = {5}, location = {Philadelphia, Pennsylvania}, series = {aiDM '22} }
@inproceedings{10.1145/3401071.3401659, author = {Kipf, Andreas and Marcus, Ryan and van Renen, Alexander and Stoian, Mihail and Kemper, Alfons and Kraska, Tim and Neumann, Thomas}, title = {{RadixSpline:} A Single-Pass Learned Index}, year = {2020}, isbn = {9781450380294}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://arxiv.org/pdf/2004.14541.pdf}, doi = {10.1145/3401071.3401659}, abstract = {Recent research has shown that learned models can outperform state-of-the-art index structures in size and lookup performance. While this is a very promising result, existing learned structures are often cumbersome to implement and are slow to build. In fact, most approaches that we are aware of require multiple training passes over the data. We introduce RadixSpline (RS), a learned index that can be built in a single pass over the data and is competitive with state-of-the-art learned index models, like RMI, in size and lookup performance. We evaluate RS using the SOSD benchmark and show that it achieves competitive results on all datasets, despite the fact that it only has two parameters.}, booktitle = {Proceedings of the Third International Workshop on Exploiting Artificial Intelligence Techniques for Data Management}, articleno = {5}, numpages = {5}, location = {Portland, Oregon}, series = {aiDM '20} }
@article{10.14778/3476311.3476392, author = {Kraska, Tim}, title = {Towards Instance-Optimized Data Systems}, year = {2021}, issue_date = {July 2021}, publisher = {VLDB Endowment}, volume = {14}, number = {12}, issn = {2150-8097}, url = {http://vldb.org/pvldb/vol14/p3222-kraska.pdf}, doi = {10.14778/3476311.3476392}, abstract = {In recent years, we have seen increased interest in applying machine learning to system problems. For example, there has been work on applying machine learning to improve query optimization, indexing, storage layouts, scheduling, log-structured merge trees, sorting, compression, and sketches, among many other data management tasks. Arguably, the ideas behind these techniques are similar: machine learning is used to model the data and/or workload in order to derive a more efficient algorithm or data structure. Ultimately, these techniques will allow us to build "instance-optimized" systems: that is, systems that self-adjust to a given workload and data distribution to provide unprecedented performance without the need for tuning by an administrator. While many of these techniques promise orders-of-magnitude better performance in lab settings, there is still general skepticism about how practical the current techniques really are. The following is intended as a progress report on ML for Systems and its readiness for real-world deployments, with a focus on our projects done as part of the Data Systems and AI Lab (DSAIL) at MIT. By no means is it a comprehensive overview of all existing work, which has been steadily growing over the past several years not only in the database community but also in the systems, networking, theory, PL, and many other adjacent communities.}, journal = {Proc. VLDB Endow.}, month = jul, pages = {3222–3232}, numpages = {11} }
@inproceedings{10.1145/3183713.3196909, author = {Kraska, Tim and Beutel, Alex and Chi, Ed H. and Dean, Jeffrey and Polyzotis, Neoklis}, title = {The Case for Learned Index Structures}, year = {2018}, isbn = {9781450347037}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://arxiv.org/pdf/1712.01208.pdf}, doi = {10.1145/3183713.3196909}, abstract = {Indexes are models: a B-Tree-Index can be seen as a model to map a key to the position of a record within a sorted array, a Hash-Index as a model to map a key to a position of a record within an unsorted array, and a BitMap-Index as a model to indicate if a data record exists or not. In this exploratory research paper, we start from this premise and posit that all existing index structures can be replaced with other types of models, including deep-learning models, which we term {\em learned indexes}. We theoretically analyze under which conditions learned indexes outperform traditional index structures and describe the main challenges in designing learned index structures. Our initial results show that our learned indexes can have significant advantages over traditional indexes. More importantly, we believe that the idea of replacing core components of a data management system through learned models has far reaching implications for future systems designs and that this work provides just a glimpse of what might be possible.}, booktitle = {Proceedings of the 2018 International Conference on Management of Data}, pages = {489–504}, numpages = {16}, keywords = {linear regression, hash-map, neural net, learned index structure, mixture of experts, learned index, index structures, learned data structures, b-tree, cdf, bloom-filter}, location = {Houston, TX, USA}, series = {SIGMOD '18} }
@inproceedings{10.5555/645924.671173, author = {Moerkotte, Guido}, title = {Small Materialized Aggregates: A Light Weight Index Structure for Data Warehousing}, year = {1998}, isbn = {1558605665}, publisher = {Morgan Kaufmann Publishers Inc.}, address = {San Francisco, CA, USA}, booktitle = {Proceedings of the 24th International Conference on Very Large Data Bases}, pages = {476–487}, numpages = {12}, series = {VLDB '98}, url = {https://www.vldb.org/conf/1998/p476.pdf}, doi = {10.5555/645924.671173} }
@inproceedings{CIDR-MaddenSelfOrganizingDC, author = {Madden, Samuel and Ding, Jialin and Kraska, Tim and Sudhir, Sivaprasad and Cohen, David and Mattson, Timothy G. and Tatbul, Nesime}, title = {Self-Organizing Data Containers}, booktitle = {12th Conference on Innovative Data Systems Research, {CIDR} 2022, Chaminade, CA, USA, January 9-12, 2022}, publisher = {www.cidrdb.org}, year = {2022}, url = {https://www.cidrdb.org/cidr2022/papers/p44-madden.pdf} }
@inproceedings{10.1007/978-3-540-70504-8_12, author = {Neumann, Thomas and Michel, Sebastian}, title = {Smooth Interpolating Histograms with Error Guarantees}, year = {2008}, isbn = {9783540705031}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, url = {https://domino.mpi-inf.mpg.de/intranet/ag5/ag5publ.nsf/082b47ae0d6defd7c12565dd006fb8dd/0f1acb6745b57561c12574f7004ccb54/\$FILE/splinehistograms.pdf}, doi = {10.1007/978-3-540-70504-8_12}, abstract = {Accurate selectivity estimations are essential for query optimization decisions where they are typically derived from various kinds of histograms which condense value distributions into compact representations. The estimation accuracy of existing approaches typically varies across the domain, with some estimations being very accurate and some quite inaccurate. This is in particular unfortunate when performing a parametric search using these estimations, as the estimation artifacts can dominate the search results. We propose the usage of linear splines to construct histograms with known error guarantees across the whole continuous domain. These histograms are particularly well suited for using the estimates in parameter optimization. We show by a comprehensive performance evaluation using both synthetic and real world data that our approach clearly outperforms existing techniques.}, booktitle = {Proceedings of the 25th British National Conference on Databases: Sharing Data, Information and Knowledge}, pages = {126–138}, numpages = {13}, location = {Cardiff, Wales, UK}, series = {BNCOD '08} }
@article{10.1145/348.318586, author = {Nievergelt, J. and Hinterberger, Hans and Sevcik, Kenneth C.}, title = {The Grid File: An Adaptable, Symmetric Multikey File Structure}, year = {1984}, issue_date = {March 1984}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {9}, number = {1}, issn = {0362-5915}, url = {https://dl.acm.org/doi/pdf/10.1145/348.318586}, doi = {10.1145/348.318586}, abstract = {Traditional file structures that provide multikey access to records, for example, inverted files, are extensions of file structures originally designed for single-key access. They manifest various deficiencies in particular for multikey access to highly dynamic files. We study the dynamic aspects of file structures that treat all keys symmetrically, that is, file structures which avoid the distinction between primary and secondary keys. We start from a bitmap approach and treat the problem of file design as one of data compression of a large sparse matrix. This leads to the notions of a grid partition of the search space and of a grid directory, which are the keys to a dynamic file structure called the grid file. This file system adapts gracefully to its contents under insertions and deletions, and thus achieves an upper bound of two disk accesses for single record retrieval; it also handles range queries and partially specified queries efficiently. We discuss in detail the design decisions that led to the grid file, present simulation results of its behavior, and compare it to other multikey access file structures.}, journal = {ACM Trans. Database Syst.}, month = mar, pages = {38–71}, numpages = {34} }
@article{10.14778/3115404.3115415, author = {Olma, Matthaios and Karpathiotakis, Manos and Alagiannis, Ioannis and Athanassoulis, Manos and Ailamaki, Anastasia}, title = {Slalom: Coasting through Raw Data via Adaptive Partitioning and Indexing}, year = {2017}, issue_date = {June 2017}, publisher = {VLDB Endowment}, volume = {10}, number = {10}, issn = {2150-8097}, url = {http://www.vldb.org/pvldb/vol10/p1106-olma.pdf}, doi = {10.14778/3115404.3115415}, abstract = {The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. Hence, recent in-situ query processing systems operate directly over raw data, alleviating the loading cost. At the same time, analytical workloads have an increasing number of queries. Typically, each query focuses on a constantly shifting -- yet small -- range. Minimizing the workload latency, now, requires the benefits of indexing in in-situ query processing. In this paper, we present Slalom, an in-situ query engine that accommodates workload shifts by monitoring user access patterns. Slalom makes on-the-fly partitioning and indexing decisions, based on information collected by lightweight monitoring. Slalom has two key components: (i) an online partitioning and indexing scheme, and (ii) a partitioning and indexing tuner tailored for in-situ query engines. When compared to the state of the art, Slalom offers performance benefits by taking into account user query patterns to (a) logically partition raw data files and (b) build for each partition lightweight partition-specific indexes. Due to its lightweight and adaptive nature, Slalom achieves efficient accesses to raw data with minimal memory consumption. Our experimentation with both micro-benchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in-situ engines (3 -- 10\texttimes{}), and achieves comparable query response times with fully indexed DBMS, offering much lower (∼ 3\texttimes{}) cumulative query execution times for query workloads with increasing size and unpredictable access patterns.}, journal = {Proc. VLDB Endow.}, month = jun, pages = {1106–1117}, numpages = {12} }
@article{10.1007/s002360050048, author = {O'Neil, Patrick and Cheng, Edward and Gawlick, Dieter and O'Neil, Elizabeth}, title = {The Log-Structured Merge-Tree ({LSM-Tree})}, year = {1996}, issue_date = {1996}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, volume = {33}, number = {4}, issn = {0001-5903}, url = {https://www.cs.umb.edu/~poneil/lsmtree.pdf}, doi = {10.1007/s002360050048}, journal = {Acta Informatica}, month = jun, pages = {351–385}, numpages = {35} }
@inproceedings{CIDR-Pavlo2017SelfDrivingDM, title = {Self-Driving Database Management Systems}, author = {Pavlo, Andrew and Angulo, Gustavo and Arulraj, Joy and Lin, Haibin and Lin, Jiexi and Ma, Lin and Menon, Prashanth and Mowry, Todd C. and Perron, Matthew and Quah, Ian and Santurkar, Siddharth and Tomasic, Anthony and Toor, Skye and Van Aken, Dana and Wang, Ziqi and Wu, Yingjun and Xian, Ran and Zhang, Tieying}, booktitle = {Conference on Innovative Data Systems Research}, year = {2017}, url = {https://db.cs.cmu.edu/papers/2017/p42-pavlo-cidr17.pdf} }
@inproceedings{10.1145/2723372.2723719, author = {Petraki, Eleni and Idreos, Stratos and Manegold, Stefan}, title = {Holistic Indexing in Main-Memory Column-Stores}, year = {2015}, isbn = {9781450327589}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://stratos.seas.harvard.edu/files/stratos/files/holisticindexing.pdf}, doi = {10.1145/2723372.2723719}, abstract = {Great database systems performance relies heavily on index tuning, i.e., creating and utilizing the best indices depending on the workload. However, the complexity of the index tuning process has dramatically increased in recent years due to ad-hoc workloads and shortage of time and system resources to invest in tuning. This paper introduces holistic indexing, a new approach to automated index tuning in dynamic environments. Holistic indexing requires zero set-up and tuning effort, relying on adaptive index creation as a side-effect of query processing. Indices are created incrementally and partially; they are continuously refined as we process more and more queries. Holistic indexing takes the state-of-the-art adaptive indexing ideas a big step further by introducing the notion of a system which never stops refining the index space, taking educated decisions about which index we should incrementally refine next based on continuous knowledge acquisition about the running workload and resource utilization. When the system detects idle CPU cycles, it utilizes those extra cycles by refining the adaptive indices which are most likely to bring a benefit for future queries. Such idle CPU cycles occur when the system cannot exploit all available cores up to 100\%, i.e., either because the workload is not enough to saturate the CPUs or because the current tasks performed for query processing are not easy to parallelize to the point where all available CPU power is exploited. In this paper, we present the design of holistic indexing for column-oriented database architectures and we discuss a detailed analysis against parallel versions of state-of-the-art indexing and adaptive indexing approaches. Holistic indexing is implemented in an open-source column-store DBMS. Our detailed experiments on both synthetic and standard benchmarks (TPC-H) and workloads (SkyServer) demonstrate that holistic indexing brings significant performance gains by being able to continuously refine the physical design in parallel to query processing, exploiting any idle CPU resources.}, booktitle = {Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data}, pages = {1153–1166}, numpages = {14}, keywords = {holistic indexing, self-organization}, location = {Melbourne, Victoria, Australia}, series = {SIGMOD '15} }
@inproceedings{10.1145/3514221.3526055, author = {Prout, Adam and Wang, Szu-Po and Victor, Joseph and Sun, Zhou and Li, Yongzhu and Chen, Jack and Bergeron, Evan and Hanson, Eric and Walzer, Robert and Gomes, Rodrigo and Shamgunov, Nikita}, title = {Cloud-Native Transactions and Analytics in {SingleStore}}, year = {2022}, isbn = {9781450392495}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://dl.acm.org/doi/pdf/10.1145/3514221.3526055}, doi = {10.1145/3514221.3526055}, abstract = {The last decade has seen a remarkable rise in specialized database systems. Systems for transaction processing, data warehousing, time series analysis, full-text search, data lakes, in-memory caching, document storage, queuing, graph processing, and geo-replicated operational workloads are now available to developers. A belief has taken hold that a single general-purpose database is not capable of running varied workloads at a reasonable cost with strong performance, at the level of scale and concurrency people demand today. There is value in specialization, but the complexity and cost of using multiple specialized systems in a single application environment is becoming apparent. This realization is driving developers and IT decision makers to seek databases capable of powering a broader set of use cases when looking to adopt a new database. Hybrid transaction and analytical (HTAP) databases have been developed to try to tame some of this chaos. In this paper we introduce SinglestoreDB (S2DB), formerly called MemSQL, a distributed general-purpose SQL database designed to have the versatility to run both operational and analytical workloads with good performance. It was one of the earliest distributed HTAP databases on the market. It can scale out to efficiently utilize 100s of hosts, 1000s of cores and 10s of TBs of RAM while still providing a user experience similar to a single-host SQL database such as Oracle or SQL Server. S2DB's unified table storage runs both transactional and analytical workloads efficiently with operations like fast scans, seeks, filters, aggregations, and updates. This is accomplished through a combination of rowstore, columnstore and vectorization techniques, ability to seek efficiently into a columnstore using secondary indexes, and using in-memory rowstore buffers for recently modified data. It avoids design simplifications (i.e., only supporting batch loading, or limiting the query surface area to particular patterns of queries) that sacrifice the ability to run a broad set of workloads. Today, after 10 years of development, S2DB runs demanding production workloads for some of the world's largest financial, telecom, high-tech, and energy companies. These customers drove the product towards a database capable of running a breadth of workloads across their organizations, often replacing two or three different databases with S2DB. The design of S2DB's storage, transaction processing, and query processing were developed to maintain this versatility.}, booktitle = {Proceedings of the 2022 International Conference on Management of Data}, pages = {2340–2352}, numpages = {13}, keywords = {databases, separation of storage and compute, distributed systems, transactions and analytics}, location = {Philadelphia, PA, USA}, series = {SIGMOD '22} }
@inproceedings{10.1109/MSST.2010.5496972, author = {Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert}, title = {The {Hadoop} Distributed File System}, booktitle = {2010 IEEE 26th Symposium on Mass Storage Systems and Technologies (MSST)}, year = {2010}, pages = {1–10}, abstract = {The Hadoop Distributed File System ({HDFS}) is designed to store very large data sets reliably, and to stream those data sets at high bandwidth to user applications. In a large cluster, thousands of servers both host directly attached storage and execute user application tasks. By distributing storage and computation across many servers, the resource can grow with demand while remaining economical at every size. We describe the architecture of HDFS and report on experience using HDFS to manage 25 petabytes of enterprise data at Yahoo!.}, doi = {10.1109/MSST.2010.5496972}, issn = {2160-1968}, month = may, url = {https://pages.cs.wisc.edu/~akella/CS838/F16/838-CloudPapers/hdfs.pdf} }
@misc{10.48550/arxiv.2111.14905, author = {Spector, Benjamin and Kipf, Andreas and Vaidya, Kapil and Wang, Chi and Minhas, Umar Farooq and Kraska, Tim}, title = {Bounding the Last Mile: Efficient Learned String Indexing}, doi = {10.48550/ARXIV.2111.14905}, url = {https://arxiv.org/pdf/2111.14905.pdf}, keywords = {Databases (cs.DB), Machine Learning (cs.LG), FOS: Computer and information sciences}, publisher = {arXiv}, year = {2021}, copyright = {arXiv.org perpetual, non-exclusive license} }
@inproceedings{10.5555/1325851.1325981, author = {Stonebraker, Michael and Madden, Samuel and Abadi, Daniel J. and Harizopoulos, Stavros and Hachem, Nabil and Helland, Pat}, title = {The End of an Architectural Era: (It's Time for a Complete Rewrite)}, year = {2007}, isbn = {9781595936493}, publisher = {VLDB Endowment}, abstract = {In previous papers [SC05, SBC+07], some of us predicted the end of "one size fits all" as a commercial relational DBMS paradigm. These papers presented reasons and experimental evidence that showed that the major RDBMS vendors can be outperformed by 1--2 orders of magnitude by specialized engines in the data warehouse, stream processing, text, and scientific database markets. Assuming that specialized engines dominate these markets over time, the current relational DBMS code lines will be left with the business data processing (OLTP) market and hybrid markets where more than one kind of capability is required. In this paper we show that current RDBMSs can be beaten by nearly two orders of magnitude in the OLTP market as well. The experimental evidence comes from comparing a new OLTP prototype, H-Store, which we have built at M.I.T. to a popular RDBMS on the standard transactional benchmark, TPC-C. We conclude that the current RDBMS code lines, while attempting to be a "one size fits all" solution, in fact, excel at nothing. Hence, they are 25 year old legacy code lines that should be retired in favor of a collection of "from scratch" specialized engines. The DBMS vendors (and the research community) should start with a clean sheet of paper and design systems for tomorrow's requirements, not continue to push code lines and architectures designed for yesterday's needs.}, booktitle = {Proceedings of the 33rd International Conference on Very Large Data Bases}, pages = {1150–1160}, numpages = {11}, location = {Vienna, Austria}, series = {VLDB '07}, doi = {10.5555/1325851.1325981}, url = {https://www.vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf} }
@inproceedings{10.1145/3183713.3196938, author = {Vandiver, Ben and Prasad, Shreya and Rana, Pratibha and Zik, Eden and Saeidi, Amin and Parimal, Pratyush and Pantela, Styliani and Dave, Jaimin}, title = {{Eon Mode:} Bringing the {Vertica} Columnar Database to the Cloud}, year = {2018}, isbn = {9781450347037}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://www.vertica.com/wp-content/uploads/2018/05/Vertica_EON_SIGMOD_Paper.pdf}, doi = {10.1145/3183713.3196938}, abstract = {The Vertica Analytic Database is a powerful tool for high performance, large scale SQL analytics. Historically, Vertica has managed direct-attached disk for performance and reliability, at a cost of product complexity and scalability. Eon mode is a new architecture for Vertica that places the data on a reliable shared storage, matching the original architecture's performance on existing workloads and supporting new workloads. While the design reuses Vertica's optimizer and execution engine, the metadata, storage, and fault tolerance mechanisms are re-architected to enable and take advantage of shared storage. A sharding mechanism distributes load over the nodes while retaining the capability of running node-local table joins. Running on Amazon EC2 compute and S3 storage, Eon mode demonstrates good performance, superior scalability, and robust operational behavior. With these improvements, Vertica delivers on the promise of cloud economics, consuming only the compute and storage resources needed, while supporting efficient elasticity.}, booktitle = {Proceedings of the 2018 International Conference on Management of Data}, pages = {797–809}, numpages = {13}, keywords = {databases, cloud storage, column stores, shared storage}, location = {Houston, TX, USA}, series = {SIGMOD '18} }
@inproceedings{10.5555/3388242.3388275, author = {Vuppalapati, Midhul and Miron, Justin and Agarwal, Rachit and Truong, Dan and Motivala, Ashish and Cruanes, Thierry}, title = {Building an Elastic Query Engine on Disaggregated Storage}, year = {2020}, isbn = {9781939133137}, publisher = {USENIX Association}, address = {USA}, abstract = {We present operational experience running Snowflake, a cloud-based data warehousing system with SQL support similar to state-of-the-art databases. Snowflake design is motivated by three goals: (1) compute and storage elasticity; (2) support for multi-tenancy; and, (3) high performance. Over the last few years, Snowflake has grown to serve thousands of customers executing millions of queries on petabytes of data every day. This paper presents Snowflake design and implementation, along with a discussion on how recent changes in cloud infrastructure (emerging hardware, fine-grained billing, etc.) have altered the many assumptions that guided the design and optimization of Snowflake system. Using data collected from various components of our system during execution of 70 million queries over a 14 day period, our study both deepens the understanding of existing problems and highlights new research challenges along a multitude of dimensions including design of storage systems and high-performance query execution engines.}, booktitle = {Proceedings of the 17th Usenix Conference on Networked Systems Design and Implementation}, pages = {449–462}, numpages = {14}, location = {Santa Clara, CA, USA}, series = {NSDI'20}, doi = {10.5555/3388242.3388275}, url = {https://www.usenix.org/system/files/nsdi20-paper-vuppalapati.pdf} }
@inproceedings{10.1145/2588555.2595631, author = {Yang, Fangjin and Tschetter, Eric and L\'{e}aut\'{e}, Xavier and Ray, Nelson and Merlino, Gian and Ganguli, Deep}, title = {Druid: A Real-Time Analytical Data Store}, year = {2014}, isbn = {9781450323765}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {http://static.druid.io/docs/druid.pdf}, doi = {10.1145/2588555.2595631}, abstract = {Druid is an open source data store designed for real-time exploratory analytics on large data sets. The system combines a column-oriented storage layout, a distributed, shared-nothing architecture, and an advanced indexing structure to allow for the arbitrary exploration of billion-row tables with sub-second latencies. In this paper, we describe Druid's architecture, and detail how it supports fast aggregations, flexible filters, and low latency data ingestion.}, booktitle = {Proceedings of the 2014 ACM SIGMOD International Conference on Management of Data}, pages = {157–168}, numpages = {12}, keywords = {open source, column-oriented, fault-tolerant, real-time, highly available, analytics, distributed, OLAP}, location = {Snowbird, Utah, USA}, series = {SIGMOD '14} }
@inproceedings{10.5555/2228298.2228301, author = {Zaharia, Matei and Chowdhury, Mosharaf and Das, Tathagata and Dave, Ankur and Ma, Justin and McCauley, Murphy and Franklin, Michael J. and Shenker, Scott and Stoica, Ion}, title = {Resilient Distributed Datasets: A Fault-Tolerant Abstraction for in-Memory Cluster Computing}, year = {2012}, publisher = {USENIX Association}, address = {USA}, abstract = {We present Resilient Distributed Datasets (RDDs), a distributed memory abstraction that lets programmers perform in-memory computations on large clusters in a fault-tolerant manner. RDDs are motivated by two types of applications that current computing frameworks handle inefficiently: iterative algorithms and interactive data mining tools. In both cases, keeping data in memory can improve performance by an order of magnitude. To achieve fault tolerance efficiently, RDDs provide a restricted form of shared memory, based on coarse-grained transformations rather than fine-grained updates to shared state. However, we show that RDDs are expressive enough to capture a wide class of computations, including recent specialized programming models for iterative jobs, such as Pregel, and new applications that these models do not capture. We have implemented RDDs in a system called Spark, which we evaluate through a variety of user applications and benchmarks.}, booktitle = {Proceedings of the 9th USENIX Conference on Networked Systems Design and Implementation}, pages = {2}, numpages = {1}, location = {San Jose, CA}, series = {NSDI'12}, doi = {10.5555/2228298.2228301}, url = {https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf} }
@techreport{Apache-ArrowColumnarFormat, author = {Apache Software Foundation}, institution = {Apache Software Foundation}, title = {Arrow Columnar Format, Version 1.0}, year = {2020}, url = {https://arrow.apache.org/docs/format/Columnar.html}, note = {Accessed: {2023-01-01}} }
@techreport{Apache-IcebergTableSpec, author = {Apache Software Foundation}, institution = {Apache Software Foundation}, title = {Iceberg Table Spec, Version 2.0}, year = {2020}, url = {https://iceberg.apache.org/spec/index.html}, note = {Accessed: {2023-01-01}} }
@misc{Rockset-ConceptsDesignArchitecture, title = {Rockset Concepts, Design and Architecture}, author = {Desai, Purvi and Leong, Kevin}, year = {2022}, month = jul, url = {https://rockset.com/Rockset_Concepts_Design_Architecture.pdf}, note = {Accessed: {2023-01-01}} }
@misc{Storm-SeparatingComputeAndStorage, title = {Separating compute and storage: What it means, and why it's important for databases}, author = {Storm, Adam}, year = {2019}, month = jan, url = {https://ajstorm.medium.com/separating-compute-and-storage-59def4f27d64}, note = {Accessed: {2023-01-01}} }
@article{10.1145/2590989.2590991, author = {Ngo, Hung Q and R\'{e}, Christopher and Rudra, Atri}, title = {Skew Strikes Back: New Developments in the Theory of Join Algorithms}, year = {2014}, issue_date = {December 2013}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {42}, number = {4}, issn = {0163-5808}, url = {https://arxiv.org/pdf/1310.3314.pdf}, doi = {10.1145/2590989.2590991}, journal = {SIGMOD Rec.}, month = feb, pages = {5–16}, numpages = {12} }
@inproceedings{10.1145/2213836.2213946, author = {Sikka, Vishal and F\"{a}rber, Franz and Lehner, Wolfgang and Cha, Sang Kyun and Peh, Thomas and Bornh\"{o}vd, Christof}, title = {Efficient Transaction Processing in {SAP} {HANA} Database: The End of a Column Store Myth}, year = {2012}, isbn = {9781450312479}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://15799.courses.cs.cmu.edu/fall2013/static/papers/p731-sikka.pdf}, doi = {10.1145/2213836.2213946}, abstract = {The SAP HANA database is the core of SAP's new data management platform. The overall goal of the SAP HANA database is to provide a generic but powerful system for different query scenarios, both transactional and analytical, on the same data representation within a highly scalable execution environment. Within this paper, we highlight the main features that differentiate the SAP HANA database from classical relational database engines. Therefore, we outline the general architecture and design criteria of the SAP HANA in a first step. In a second step, we challenge the common belief that column store data structures are only superior in analytical workloads and not well suited for transactional workloads. We outline the concept of record life cycle management to use different storage formats for the different stages of a record. We not only discuss the general concept but also dive into some of the details of how to efficiently propagate records through their life cycle and moving database entries from write-optimized to read-optimized storage formats. In summary, the paper aims at illustrating how the SAP HANA database is able to efficiently work in analytical as well as transactional workload environments.}, booktitle = {Proceedings of the 2012 ACM SIGMOD International Conference on Management of Data}, pages = {731–742}, numpages = {12}, keywords = {column store, transaction processing, sap hana}, location = {Scottsdale, Arizona, USA}, series = {SIGMOD '12} }
@inproceedings{10.1145/3035918.3056101, author = {Verbitski, Alexandre and Gupta, Anurag and Saha, Debanjan and Brahmadesam, Murali and Gupta, Kamal and Mittal, Raman and Krishnamurthy, Sailesh and Maurice, Sandor and Kharatishvili, Tengiz and Bao, Xiaofeng}, title = {{Amazon} {Aurora}: Design Considerations for High Throughput Cloud-Native Relational Databases}, year = {2017}, isbn = {9781450341974}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://pdos.csail.mit.edu/6.824/papers/aurora.pdf}, doi = {10.1145/3035918.3056101}, abstract = {Amazon Aurora is a relational database service for OLTP workloads offered as part of Amazon Web Services (AWS). In this paper, we describe the architecture of Aurora and the design considerations leading to that architecture. We believe the central constraint in high throughput data processing has moved from compute and storage to the network. Aurora brings a novel architecture to the relational database to address this constraint, most notably by pushing redo processing to a multi-tenant scale-out storage service, purpose-built for Aurora. We describe how doing so not only reduces network traffic, but also allows for fast crash recovery, failovers to replicas without loss of data, and fault-tolerant, self-healing storage. We then describe how Aurora achieves consensus on durable state across numerous storage nodes using an efficient asynchronous scheme, avoiding expensive and chatty recovery protocols. Finally, having operated Aurora as a production service for over 18 months, we share the lessons we have learnt from our customers on what modern cloud applications expect from databases.}, booktitle = {Proceedings of the 2017 ACM International Conference on Management of Data}, pages = {1041–1052}, numpages = {12}, keywords = {recovery, quorum models, databases, log processing, performance, replication, distributed systems, oltp}, location = {Chicago, Illinois, USA}, series = {SIGMOD '17} }
@inproceedings{10.1145/3183713.3196931, author = {Zhang, Huanchen and Lim, Hyeontaek and Leis, Viktor and Andersen, David G. and Kaminsky, Michael and Keeton, Kimberly and Pavlo, Andrew}, title = {{SuRF:} Practical Range Query Filtering with Fast Succinct Tries}, year = {2018}, isbn = {9781450347037}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://www.cs.cmu.edu/~huanche1/publications/surf_paper.pdf}, doi = {10.1145/3183713.3196931}, abstract = {We present the Succinct Range Filter (SuRF), a fast and compact data structure for approximate membership tests. Unlike traditional Bloom filters, SuRF supports both single-key lookups and common range queries: open-range queries, closed-range queries, and range counts. SuRF is based on a new data structure called the Fast Succinct Trie (FST) that matches the point and range query performance of state-of-the-art order-preserving indexes, while consuming only 10 bits per trie node. The false positive rates in SuRF for both point and range queries are tunable to satisfy different application needs. We evaluate SuRF in RocksDB as a replacement for its Bloom filters to reduce I/O by filtering requests before they access on-disk data structures. Our experiments on a 100 GB dataset show that replacing RocksDB's Bloom filters with SuRFs speeds up open-seek (without upper-bound) and closed-seek (with upper-bound) queries by up to 1.5\texttimes{} and 5\texttimes{} with a modest cost on the worst-case (all-missing) point query throughput due to slightly higher false positive rate.}, booktitle = {Proceedings of the 2018 International Conference on Management of Data}, pages = {323–336}, numpages = {14}, keywords = {fast succinct tries, lsm-trees, range filter, succinct data structures, surf}, location = {Houston, TX, USA}, series = {SIGMOD '18} }
@inproceedings{10.5555/1076260-StonebrakerHellerstein-WhatGoesAroundComesAround, title = {What Goes Around Comes Around}, author = {Stonebraker, Michael and Hellerstein, Joseph M.}, year = {2005}, url = {https://dl.acm.org/doi/book/10.5555/1076260}, doi = {10.5555/1076260}, booktitle = {Readings in Database Systems, Fourth Edition} }