diff --git a/paper/paper.bib b/paper/paper.bib
index 63a52dce..61736be4 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -75,3 +75,49 @@ @inproceedings{10.1145/3529190.3529222
 location = {Corfu, Greece},
 series = {PETRA '22}
 }
+
+@inproceedings{10.1145/2723372.2742797,
+author = {Armbrust, Michael and Xin, Reynold S. and Lian, Cheng and Huai, Yin and Liu, Davies and Bradley, Joseph K. and Meng, Xiangrui and Kaftan, Tomer and Franklin, Michael J. and Ghodsi, Ali and Zaharia, Matei},
+title = {Spark SQL: Relational Data Processing in Spark},
+year = {2015},
+isbn = {9781450327589},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/2723372.2742797},
+doi = {10.1145/2723372.2742797},
+abstract = {Spark SQL is a new module in Apache Spark that integrates relational processing with Spark's functional programming API. Built on our experience with Shark, Spark SQL lets Spark programmers leverage the benefits of relational processing (e.g. declarative queries and optimized storage), and lets SQL users call complex analytics libraries in Spark (e.g. machine learning). Compared to previous systems, Spark SQL makes two main additions. First, it offers much tighter integration between relational and procedural processing, through a declarative DataFrame API that integrates with procedural Spark code. Second, it includes a highly extensible optimizer, Catalyst, built using features of the Scala programming language, that makes it easy to add composable rules, control code generation, and define extension points. Using Catalyst, we have built a variety of features (e.g. schema inference for JSON, machine learning types, and query federation to external databases) tailored for the complex needs of modern data analysis. We see Spark SQL as an evolution of both SQL-on-Spark and of Spark itself, offering richer APIs and optimizations while keeping the benefits of the Spark programming model.},
+booktitle = {Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data},
+pages = {1383–1394},
+numpages = {12},
+keywords = {data warehouse, databases, hadoop, machine learning, spark},
+location = {Melbourne, Victoria, Australia},
+series = {SIGMOD '15}
+}
+
+@inproceedings{10.1145/2872427.2883029,
+author = {Pezoa, Felipe and Reutter, Juan and Suarez, Fernando and Ugarte, Martin and Vrgoč, Domagoj},
+title = {Foundations of JSON Schema},
+year = {2016},
+booktitle = {Proceedings of the 25th International Conference on World Wide Web},
+pages = {263–273},
+doi = {10.1145/2872427.2883029}
+}
+
+@article{10.3389/fdata.2020.564115,
+  title={Toward Data Lakes as Central Building Blocks for Data Management and Analysis},
+  author={Dumke, André R. and Parchmann, Andreas and Schmid, Stefan and Hauswirth, Manfred},
+  journal={Frontiers in Big Data},
+  volume={3},
+  pages={564115},
+  year={2020},
+  publisher={Frontiers},
+  doi={10.3389/fdata.2020.564115}
+}
+
+@misc{oreilly2023technology,
+  title = {Technology Trends for 2023},
+  author = {{O'Reilly Media}},
+  year = {2023},
+  url = {https://www.oreilly.com/radar/technology-trends-for-2023/},
+  note = {Accessed: 2024-05-18}
+}
diff --git a/paper/paper.md b/paper/paper.md
index f66a9deb..67737282 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -48,6 +48,28 @@ An additional argument is related to the rapid evolution of the data ecosystem [
 One last argument in favor of using a quality tool such as `cuallee` is the need to integrate quality procedures into the early stages of data product development.
 Whether in industry or academia, there is often a tendency to prioritize functional aspects over quality, leading to less time being dedicated to quality activities. By providing a clear, easy-to-use, and adaptable programming interface for data quality, teams can incorporate quality into their development process, promoting a proactive approach of building quality in rather than relying solely on testing to ensure quality.
+# Data Quality Frameworks
+Data platforms have diversified from file systems and relational databases to full ecosystems built around the concept of the data lake [@10.3389/fdata.2020.564115]. Modern platforms host a variety of data formats beyond traditional tabular data, including semi-structured formats such as `JSON` [@10.1145/2872427.2883029] and unstructured data such as audio or images.
+
+Operating modern data platforms requires a sophisticated data processing framework capable of handling multiple formats at scale. Apache Spark [@10.1145/2723372.2742797] has revolutionized the data flow paradigm by bringing computation to the data, reversing the prevailing pattern of moving data to the computation; it has commoditized large-scale data processing and has grown steadily in adoption.
+
+Apache Spark's growth can be attributed to its ease of use, versatility, and performance. It supports multiple programming languages, including Python, Scala, Java, and R, making it accessible to a wide range of developers. Moreover, Spark's ability to handle various data processing tasks (batch processing, real-time streaming, machine learning, and graph processing) within a unified framework has been a key factor in its widespread adoption [@oreilly2023technology].
+
+`cuallee` is powered by native data engines such as Apache Spark. Compared to other data quality frameworks, it offers reduced complexity, lower computation resource requirements, and the shortest time per validation.
+
+The following table (\autoref{tab:performance}) summarizes the performance comparison:
+
+Framework | Checks Definition | Time
+ ------- | ----------- | ----
+`great_expectations` | `python` | `▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 66s`
+`soda` | `yaml` | `▇▇▇▇▇▇▇▇▇▇▇▇▇ 43s`
+`pydeequ` | `python` | `▇▇▇ 11s`
+`cuallee` | `python` | `▇▇ 7s`
+
+Table: Performance comparison of popular data quality frameworks []{label="tab:performance"} \label{tab:performance}
+
+
+
 # Methods
 `cuallee` employs a heuristic-based approach to define quality rules for each dataset. This prevents the inadvertent duplication of quality predicates, thus reducing the likelihood of human error in defining rules with identical predicates. Several studies have been conducted on the efficiency of these rules, including auto-validation and auto-definition using profilers [@10.1145/3580305.3599776].
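
The "Checks Definition" column of the table above and the predicate deduplication described in the Methods paragraph can be illustrated with a short sketch. This is an editorial illustration, not part of the paper or its benchmark: the sample data and column names are hypothetical, and it assumes `cuallee`'s published `Check` API (`Check`, `CheckLevel`, `is_complete`, `is_unique`, `is_greater_than`, `is_contained_in`, `validate`).

```python
# Illustrative sketch only: checks are declared in plain Python on a Check
# object and evaluated by the engine holding the data (pandas here for
# brevity; Spark and other supported engines use the same interface).
import pandas as pd
from cuallee import Check, CheckLevel

df = pd.DataFrame(
    {
        "order_id": [1, 2, 3],                  # hypothetical columns
        "amount": [10.0, 25.5, 7.9],
        "status": ["OPEN", "SHIPPED", "CLOSED"],
    }
)

check = (
    Check(CheckLevel.ERROR, "orders")
    .is_complete("order_id")                    # no null values
    .is_unique("order_id")                      # no duplicate identifiers
    .is_greater_than("amount", 0)               # strictly positive amounts
    .is_contained_in("status", ("OPEN", "SHIPPED", "CLOSED"))
)

# Declaring an identical predicate again does not create a second rule;
# this is the deduplication behaviour described in the Methods section.
check.is_complete("order_id")

print(check.validate(df))  # one result row per distinct rule
```

Defining checks as Python expressions, rather than in a separate `yaml` specification, keeps the quality rules next to the code that produces the data, which is one of the arguments the paper makes for building quality in early.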
@@ -118,6 +140,6 @@ Check | Description | DataType
 `Control.percentage_fill` | `% rows` not empty | _agnostic_
 `Control.percentage_empty` | `% rows` empty | _agnostic_
 
-Table: List and description of the currently available []{label="tab:checks"} \label{tab:checks}
+Table: List and description of the currently available checks []{label="tab:checks"} \label{tab:checks}
 
 # References
\ No newline at end of file
diff --git a/paper/paper.pdf b/paper/paper.pdf
index 95d5695f..6bdbfc59 100644
Binary files a/paper/paper.pdf and b/paper/paper.pdf differ
diff --git a/test/performance/cuallee/test_performance_cuallee.py b/test/performance/cuallee/test_performance_cuallee.py
index 751809c4..c4033e7f 100755
--- a/test/performance/cuallee/test_performance_cuallee.py
+++ b/test/performance/cuallee/test_performance_cuallee.py
@@ -12,7 +12,7 @@
 [check.is_complete(name) for name in df.columns]
 
-[check.is_unique(name) for name in df.columns]
+[check.is_unique(name, approximate=True) for name in df.columns]
 
 start = datetime.now()
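
For readers who want to reproduce the timing reported in \autoref{tab:performance}, the fragment below expands the pattern touched by this change into a self-contained sketch: one completeness check and one approximate-uniqueness check per column, timed around a single `validate` call. Only the two list comprehensions come from the test file; the Spark session setup, the input path, and the check name are assumptions for illustration.

```python
# Sketch of the benchmark pattern under assumed setup; dataset path and
# session configuration are placeholders, not taken from the repository.
from datetime import datetime

from pyspark.sql import SparkSession
from cuallee import Check, CheckLevel

spark = SparkSession.builder.appName("cuallee-benchmark").getOrCreate()
df = spark.read.parquet("data/benchmark.parquet")  # hypothetical input dataset

check = Check(CheckLevel.WARNING, "performance")
[check.is_complete(name) for name in df.columns]                  # null checks per column
[check.is_unique(name, approximate=True) for name in df.columns]  # approximate uniqueness, as changed in this diff

start = datetime.now()
check.validate(df).show(truncate=False)  # one result row per rule
print(f"elapsed: {datetime.now() - start}")
```

Switching `is_unique` to `approximate=True` trades exact distinct counting for an approximate count, which is the kind of adjustment one would expect when benchmarking wall-clock time per validation on large Spark DataFrames.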