Papers | Parallel Computing
2018
Claudia Misale, Maurizio Drocco, Guy Tremblay, Marco Aldinucci
PiCo: a Novel Approach to Stream Data Analytics Proceedings Article
In: Proc. of Euro-Par Workshops: 1st Intl. Workshop on Autonomic Solutions for Parallel and Distributed Data Stream Processing (Auto-DaSP 2017), Springer, Santiago de Compostela, Spain, 2018.
Abstract | Links | BibTeX | Tags: rephrase, toreador
@inproceedings{pico:autodasp:17,
title = {{PiCo}: a Novel Approach to Stream Data Analytics},
author = {Claudia Misale and Maurizio Drocco and Guy Tremblay and Marco Aldinucci},
url = {https://iris.unito.it/retrieve/handle/2318/1659344/409520/autodasp.pdf},
doi = {10.1007/978-3-319-75178-8_10},
year = {2018},
date = {2018-08-01},
booktitle = {Proc. of Euro-Par Workshops: 1st Intl. Workshop on Autonomic Solutions for Parallel and Distributed Data Stream Processing (Auto-DaSP 2017)},
volume = {10659},
publisher = {Springer},
address = {Santiago de Compostela, Spain},
series = {LNCS},
abstract = {In this paper, we present a new C++ API with a fluent interface called PiCo (Pipeline Composition). PiCo's programming model aims at making easier the programming of data analytics applications while preserving or enhancing their performance. This is attained through three key design choices: 1) unifying batch and stream data access models, 2) decoupling processing from data layout, and 3) exploiting a stream-oriented, scalable, efficient C++11 runtime system. PiCo proposes a programming model based on pipelines and operators that are polymorphic with respect to data types in the sense that it is possible to re-use the same algorithms and pipelines on different data models (e.g., streams, lists, sets, etc.). Preliminary results show that PiCo can attain better performances in terms of execution times and hugely improve memory utilization when compared to Spark and Flink in both batch and stream processing.},
keywords = {rephrase, toreador},
pubstate = {published},
tppubtype = {inproceedings}
}
Marco Aldinucci, Sergio Rabellino, Marco Pironti, Filippo Spiga, Paolo Viviani, Maurizio Drocco, Marco Guerzoni, Guido Boella, Marco Mellia, Paolo Margara, Idillio Drago, Roberto Marturano, Guido Marchetto, Elio Piccolo, Stefano Bagnasco, Stefano Lusso, Sara Vallero, Giuseppe Attardi, Alex Barchiesi, Alberto Colla, Fulvio Galeazzi
HPC4AI, an AI-on-demand federated platform endeavour Proceedings Article
In: ACM Computing Frontiers, Ischia, Italy, 2018.
Abstract | Links | BibTeX | Tags: hpc4ai, rephrase, toreador
@inproceedings{18:hpc4ai_acm_CF,
title = {{HPC4AI}, an {AI}-on-demand federated platform endeavour},
author = {Marco Aldinucci and Sergio Rabellino and Marco Pironti and Filippo Spiga and Paolo Viviani and Maurizio Drocco and Marco Guerzoni and Guido Boella and Marco Mellia and Paolo Margara and Idillio Drago and Roberto Marturano and Guido Marchetto and Elio Piccolo and Stefano Bagnasco and Stefano Lusso and Sara Vallero and Giuseppe Attardi and Alex Barchiesi and Alberto Colla and Fulvio Galeazzi},
url = {https://iris.unito.it/retrieve/handle/2318/1765596/689772/2018_hpc4ai_ACM_CF.pdf},
doi = {10.1145/3203217.3205340},
year = {2018},
date = {2018-05-01},
booktitle = {ACM Computing Frontiers},
address = {Ischia, Italy},
abstract = {In April 2018, under the auspices of the POR-FESR 2014-2020 program of Italian Piedmont Region, the Turin's Centre on High-Performance Computing for Artificial Intelligence (HPC4AI) was funded with a capital investment of 4.5M€ and it began its deployment. HPC4AI aims to facilitate scientific research and engineering in the areas of Artificial Intelligence and Big Data Analytics. HPC4AI will specifically focus on methods for the on-demand provisioning of AI and BDA Cloud services to the regional and national industrial community, which includes the large regional ecosystem of Small-Medium Enterprises (SMEs) active in many different sectors such as automotive, aerospace, mechatronics, manufacturing, health and agrifood.},
keywords = {hpc4ai, rephrase, toreador},
pubstate = {published},
tppubtype = {inproceedings}
}
Claudia Misale, Maurizio Drocco, Guy Tremblay, Alberto R. Martinelli, Marco Aldinucci
PiCo: High-performance data analytics pipelines in modern C++ Journal Article
In: Future Generation Computer Systems, vol. 87, pp. 392–403, 2018.
Abstract | Links | BibTeX | Tags: fastflow, HPC, toreador
@article{18:fgcs:pico,
title = {{PiCo}: High-performance data analytics pipelines in modern {C++}},
author = {Claudia Misale and Maurizio Drocco and Guy Tremblay and Alberto R. Martinelli and Marco Aldinucci},
url = {https://iris.unito.it/retrieve/handle/2318/1668444/414280/fgcs_pico.pdf},
doi = {10.1016/j.future.2018.05.030},
year = {2018},
date = {2018-01-01},
journal = {Future Generation Computer Systems},
volume = {87},
pages = {392--403},
abstract = {In this paper, we present a new C++ API with a fluent interface called PiCo (Pipeline Composition). PiCo's programming model aims at making easier the programming of data analytics applications while preserving or enhancing their performance. This is attained through three key design choices: (1) unifying batch and stream data access models, (2) decoupling processing from data layout, and (3) exploiting a stream-oriented, scalable, efficient C++11 runtime system. PiCo proposes a programming model based on pipelines and operators that are polymorphic with respect to data types in the sense that it is possible to reuse the same algorithms and pipelines on different data models (e.g., streams, lists, sets, etc.). Preliminary results show that PiCo, when compared to Spark and Flink, can attain better performances in terms of execution times and can hugely improve memory utilization, both for batch and stream processing.},
keywords = {fastflow, HPC, toreador},
pubstate = {published},
tppubtype = {article}
}
2017
Maurizio Drocco
Parallel Programming with Global Asynchronous Memory: Models, C++ APIs and Implementations PhD Thesis
Computer Science Department, University of Torino, 2017.
Abstract | Links | BibTeX | Tags: fastflow, paraphrase, repara, rephrase, toreador
@phdthesis{17:gam:drocco:thesis,
title = {Parallel Programming with Global Asynchronous Memory: Models, {C++} {APIs} and Implementations},
author = {Maurizio Drocco},
url = {https://zenodo.org/record/1037585/files/Drocco_phd_thesis.pdf},
doi = {10.5281/zenodo.1037585},
year = {2017},
date = {2017-10-01},
school = {Computer Science Department, University of Torino},
abstract = {In the realm of High Performance Computing (HPC), message passing has been the programming paradigm of choice for over twenty years. The durable MPI (Message Passing Interface) standard, with send/receive communication, broadcast, gather/scatter, and reduction collectives is still used to construct parallel programs where each communication is orchestrated by the developer-based precise knowledge of data distribution and overheads; collective communications simplify the orchestration but might induce excessive synchronization. Early attempts to bring shared-memory programming model—with its programming advantages—to distributed computing, referred as the Distributed Shared Memory (DSM) model, faded away; one of the main issue was to combine performance and programmability with the memory consistency model. The recently proposed Partitioned Global Address Space (PGAS) model is a modern revamp of DSM that exposes data placement to enable optimizations based on locality, but it still addresses (simple) data-parallelism only and it relies on expensive sharing protocols. We advocate an alternative programming model for distributed computing based on a Global Asynchronous Memory (GAM), aiming to avoid coherency and consistency problems rather than solving them. We materialize GAM by designing and implementing a distributed smart pointers library, inspired by C++ smart pointers. In this model, public and private pointers (resembling C++ shared and unique pointers, respectively) are moved around instead of messages (i.e., data), thus alleviating the user from the burden of minimizing transfers. On top of smart pointers, we propose a high-level C++ template library for writing applications in terms of dataflow-like networks, namely GAM nets, consisting of stateful processors exchanging pointers in fully asynchronous fashion.
We demonstrate the validity of the proposed approach, from the expressiveness perspective, by showing how GAM nets can be exploited to implement higher-level parallel programming models, such as data and task parallelism. As for the performance perspective, the execution of two non-toy benchmarks on a number of different small-scale HPC clusters exhibits both close-to-ideal scalability and negligible overhead with respect to state-of-the-art benchmark implementations. For instance, the GAM implementation of a high-quality video restoration filter sustains a 100 fps throughput over 70%-noisy high-quality video streams on a 4-node cluster of Graphics Processing Units (GPUs), with minimal programming effort.},
keywords = {fastflow, paraphrase, repara, rephrase, toreador},
pubstate = {published},
tppubtype = {phdthesis}
}
Maurizio Drocco, Claudia Misale, Guy Tremblay, Marco Aldinucci
A Formal Semantics for Data Analytics Pipelines Technical Report
Computer Science Department, University of Torino 2017, (https://arxiv.org/abs/1705.01629).
Links | BibTeX | Tags: rephrase, toreador
@techreport{17:drocco:techreport,
title = {A Formal Semantics for Data Analytics Pipelines},
author = {Maurizio Drocco and Claudia Misale and Guy Tremblay and Marco Aldinucci},
url = {https://doi.org/10.5281/zenodo.571802},
doi = {10.5281/zenodo.571802},
eprint = {1705.01629},
archiveprefix = {arXiv},
year = {2017},
date = {2017-05-01},
institution = {Computer Science Department, University of Torino},
note = {https://arxiv.org/abs/1705.01629},
keywords = {rephrase, toreador},
pubstate = {published},
tppubtype = {techreport}
}
Claudia Misale
PiCo: A Domain-Specific Language for Data Analytics Pipelines PhD Thesis
Computer Science Department, University of Torino, 2017.
Abstract | Links | BibTeX | Tags: fastflow, paraphrase, repara, rephrase, toreador
@phdthesis{17:pico:misale:thesis,
title = {{PiCo}: A Domain-Specific Language for Data Analytics Pipelines},
author = {Claudia Misale},
url = {https://iris.unito.it/retrieve/handle/2318/1633743/320170/Misale_thesis.pdf},
doi = {10.5281/zenodo.579753},
year = {2017},
date = {2017-05-01},
school = {Computer Science Department, University of Torino},
abstract = {In the world of Big Data analytics, there is a series of tools aiming at simplifying programming applications to be executed on clusters. Although each tool claims to provide better programming, data and execution models—for which only informal (and often confusing) semantics is generally provided—all share a common underlying model, namely, the Dataflow model. Using this model as a starting point, it is possible to categorize and analyze almost all aspects about Big Data analytics tools from a high level perspective. This analysis can be considered as a first step toward a formal model to be exploited in the design of a (new) framework for Big Data analytics. By putting clear separations between all levels of abstraction (i.e., from the runtime to the user API), it is easier for a programmer or software designer to avoid mixing low level with high level aspects, as we are often used to see in state-of-the-art Big Data analytics frameworks.
From the user-level perspective, we think that a clearer and simple semantics is preferable, together with a strong separation of concerns. For this reason, we use the Dataflow model as a starting point to build a programming environment with a simplified programming model implemented as a Domain-Specific Language, that is on top of a stack of layers that build a prototypical framework for Big Data analytics.
The contribution of this thesis is twofold: first, we show that the proposed model is (at least) as general as existing batch and streaming frameworks (e.g., Spark, Flink, Storm, Google Dataflow), thus making it easier to understand high-level data-processing applications written in such frameworks. As result of this analysis, we provide a layered model that can represent tools and applications following the Dataflow paradigm and we show how the analyzed tools fit in each level.
Second, we propose a programming environment based on such layered model in the form of a Domain-Specific Language (DSL) for processing data collections, called PiCo (Pipeline Composition). The main entity of this programming model is the Pipeline, basically a DAG-composition of processing elements. This model is intended to give the user a unique interface for both stream and batch processing, hiding completely data management and focusing only on operations, which are represented by Pipeline stages. Our DSL will be built on top of the FastFlow library, exploiting both shared and distributed parallelism, and implemented in C++11/14 with the aim of porting C++ into the Big Data world.},
keywords = {fastflow, paraphrase, repara, rephrase, toreador},
pubstate = {published},
tppubtype = {phdthesis}
}
From the user-level perspective, we think that a clearer and simple semantics is preferable, together with a strong separation of concerns. For this reason, we use the Dataflow model as a starting point to build a programming environment with a simplified programming model implemented as a Domain-Specific Language, that is on top of a stack of layers that build a prototypical framework for Big Data analytics.
The contribution of this thesis is twofold: first, we show that the proposed model is (at least) as general as existing batch and streaming frameworks (e.g., Spark, Flink, Storm, Google Dataflow), thus making it easier to understand high-level data-processing applications written in such frameworks. As result of this analysis, we provide a layered model that can represent tools and applications following the Dataflow paradigm and we show how the analyzed tools fit in each level.
Second, we propose a programming environment based on such layered model in the form of a Domain-Specific Language (DSL) for processing data collections, called PiCo (Pipeline Composition). The main entity of this programming model is the Pipeline, basically a DAG-composition of processing elements. This model is intended to give the user a unique interface for both stream and batch processing, hiding completely data management and focusing only on operations, which are represented by Pipeline stages. Our DSL will be built on top of the FastFlow library, exploiting both shared and distributed parallelism, and implemented in C++11/14 with the aim of porting C++ into the Big Data world.
Claudia Misale, Maurizio Drocco, Marco Aldinucci, Guy Tremblay
A Comparison of Big Data Frameworks on a Layered Dataflow Model Journal Article
In: Parallel Processing Letters, vol. 27, no. 01, pp. 1–20, 2017.
Abstract | Links | BibTeX | Tags: rephrase, toreador
@article{17:bigdatasurvey:PPL,
title = {A Comparison of {Big Data} Frameworks on a Layered {Dataflow} Model},
author = {Claudia Misale and Maurizio Drocco and Marco Aldinucci and Guy Tremblay},
url = {https://iris.unito.it/retrieve/handle/2318/1626287/303421/preprintPPL_4aperto.pdf},
doi = {10.1142/S0129626417400035},
year = {2017},
date = {2017-01-01},
journal = {Parallel Processing Letters},
volume = {27},
number = {01},
pages = {1--20},
abstract = {In the world of Big Data analytics, there is a series of tools aiming at simplifying programming applications to be executed on clusters. Although each tool claims to provide better programming, data and execution models, for which only informal (and often confusing) semantics is generally provided, all share a common underlying model, namely, the Dataflow model. The Dataflow model we propose shows how various tools share the same expressiveness at different levels of abstraction. The contribution of this work is twofold: first, we show that the proposed model is (at least) as general as existing batch and streaming frameworks (e.g., Spark, Flink, Storm), thus making it easier to understand high-level data-processing applications written in such frameworks. Second, we provide a layered model that can represent tools and applications following the Dataflow paradigm and we show how the analyzed tools fit in each level.},
keywords = {rephrase, toreador},
pubstate = {published},
tppubtype = {article}
}
2016
Claudia Misale, Maurizio Drocco, Marco Aldinucci, Guy Tremblay
A Comparison of Big Data Frameworks on a Layered Dataflow Model Proceedings Article
In: Proc. of Intl. Workshop on High-Level Parallel Programming (HLPP), pp. 1–19, arXiv.org, Muenster, Germany, 2016.
Abstract | Links | BibTeX | Tags: rephrase, toreador
@inproceedings{16:bigdatasurvey:hlpp,
title = {A Comparison of {Big Data} Frameworks on a Layered {Dataflow} Model},
author = {Claudia Misale and Maurizio Drocco and Marco Aldinucci and Guy Tremblay},
url = {http://arxiv.org/pdf/1606.05293v1.pdf},
doi = {10.5281/zenodo.321866},
year = {2016},
date = {2016-07-01},
booktitle = {Proc. of Intl. Workshop on High-Level Parallel Programming (HLPP)},
pages = {1--19},
publisher = {arXiv.org},
address = {M{\"u}nster, Germany},
abstract = {In the world of Big Data analytics, there is a series of tools aiming at simplifying programming applications to be executed on clusters. Although each tool claims to provide better programming, data and execution models, for which only informal (and often confusing) semantics is generally provided, all share a common underlying model, namely, the Dataflow model. The Dataflow model we propose shows how various tools share the same expressiveness at different levels of abstraction. The contribution of this work is twofold: first, we show that the proposed model is (at least) as general as existing batch and streaming frameworks (e.g., Spark, Flink, Storm), thus making it easier to understand high-level data-processing applications written in such frameworks. Second, we provide a layered model that can represent tools and applications following the Dataflow paradigm and we show how the analyzed tools fit in each level.},
keywords = {rephrase, toreador},
pubstate = {published},
tppubtype = {inproceedings}
}