% Journal article: Golov and Ronnback (2015), SQL query optimization for
% highly normalized Big Data. The citation key is unchanged.
%
% Maintenance notes:
%   * abstract holds only the paper's abstract; the author affiliation and
%     contact blurb that the exporter had prepended to it now lives in
%     annote (which previously was a verbatim duplicate of abstract).
%   * journal was exported empty. The name below is inferred from the URL
%     host (bijournal.hse.ru) -- TODO confirm against the publisher's page.
%   * url is kept exactly as exported, including the embedded space before
%     the final path segment -- TODO verify that it resolves.
%   * The empty publisher field was dropped (empty is equivalent to absent).
@article{26583204_162625191_2015,
  author   = {Golov, Nikolay and Ronnback, Lars},
  title    = {{SQL} query optimization for highly normalized {Big Data}},
  journal  = {Business Informatics},
  year     = {2015},
  number   = {3(33)},
  pages    = {7--14},
  url      = {https://bijournal.hse.ru/en/2015--3(33) /162625191.html},
  keywords = {Big Data, massively parallel processing (MPP), database, normalization, analytics, ad-hoc, querying, modeling, performance},
  abstract = {This paper describes an approach for fast ad-hoc analysis of
    Big Data inside a relational data model. The approach strives to achieve
    maximal utilization of highly normalized temporary tables through the
    merge join algorithm. It is designed for the Anchor modeling technique,
    which requires a very high level of table normalization. Anchor modeling
    is a novel data warehouse modeling technique, designed for classical
    databases and adapted by the authors of the article for Big Data
    environment and a massively parallel processing (MPP) database. Anchor
    modeling provides flexibility and high speed of data loading, where the
    presented approach adds support for fast ad-hoc analysis of Big Data sets
    (tens of terabytes). Different approaches to query plan optimization are
    described and estimated, for row-based and column-based databases.
    Theoretical estimations and results of real data experiments carried out
    in a column-based MPP environment (HP Vertica) are presented and
    compared. The results show that the approach is particularly favorable
    when the available RAM resources are scarce, so that a switch is made
    from pure in-memory processing to spilling over from hard disk, while
    executing ad-hoc queries. Scaling is also investigated by running the
    same analysis on different numbers of nodes in the MPP cluster.
    Configurations of five, ten and twelve nodes were tested, using click
    stream data of Avito, the biggest classified site in Russia.},
  annote   = {Nikolay I. Golov - Lecturer, Department of Business Analytics,
    School of Business Informatics, Faculty of Business and Management,
    National Research University Higher School of Economics.
    Address: 20, Myasnitskaya Street, Moscow, 101000, Russian Federation.
    E-mail: ngolov@hse.ru.
    Lars Ronnback - Lecturer, Department of Computer Science,
    Stockholm University.
    Address: SE-106 91 Stockholm, Sweden.
    E-mail: lars.ronnback@anchormodeling.com},
}