Papers | Parallel Computing
2024
Giulio Malenza, Valentina Cesare, Marco Aldinucci, Ugo Becciani, Alberto Vecchiato
Toward HPC application portability via C++ PSTL: the Gaia AVU-GSR code assessment Journal Article
In: The Journal of Supercomputing, 2024, ISSN: 0920-8542.
Abstract | Links | BibTeX | Tags: eupex, HPC, icsc
@article{24:jsupe:Gaia,
title = {Toward {HPC} application portability via {C++} {PSTL}: the {Gaia} {AVU-GSR} code assessment},
author = {Giulio Malenza and Valentina Cesare and Marco Aldinucci and Ugo Becciani and Alberto Vecchiato},
doi = {10.1007/s11227-024-06011-1},
issn = {0920-8542},
year = {2024},
date = {2024-03-01},
journal = {The Journal of Supercomputing},
publisher = {Springer},
abstract = {The computing capacity needed to process the data generated in modern scientific experiments is approaching ExaFLOPs. Currently, achieving such performances is only feasible through GPU-accelerated supercomputers. Different languages were developed to program GPUs at different levels of abstraction. Typically, the more abstract the languages, the more portable they are across different GPUs. However, the less abstract and co-designed with the hardware, the more room for code optimization and, eventually, the more performance. In the HPC context, portability and performance are a fairly traditional dichotomy. The current C++ Parallel Standard Template Library (PSTL) has the potential to go beyond this dichotomy. In this work, we analyze the main performance benefits and limitations of PSTL using as a use-case the Gaia Astrometric Verification Unit-Global Sphere Reconstruction parallel solver developed by the European Space Agency Gaia mission. The code aims to find the astrometric parameters of $\sim 10^8$ stars in the Milky Way by iteratively solving a linear system of equations with the LSQR algorithm, originally GPU-ported with the CUDA language. We show that the performance obtained with the PSTL version, which is intrinsically more portable than CUDA, is comparable to the CUDA one on NVIDIA GPU architecture.},
keywords = {eupex, HPC, icsc},
pubstate = {published},
tppubtype = {article}
}
Marco Edoardo Santimaria, Samuele Fonio, Giulio Malenza, Iacopo Colonnelli, Marco Aldinucci
Benchmarking Parallelization Models through Karmarkar Interior-point method Proceedings Article
In: González-Vélez, Horacio, Chis, Adriana E. (Ed.): Proc. of 32nd Euromicro intl. Conference on Parallel, Distributed and Network-based Processing (PDP), pp. 1-8, IEEE, Dublin, Ireland, 2024, ISSN: 2377-5750.
Abstract | Links | BibTeX | Tags: HPC, icsc
@inproceedings{24:pdp:karmarkar,
title = {Benchmarking Parallelization Models through {Karmarkar} Interior-point Method},
author = {Marco Edoardo Santimaria and Samuele Fonio and Giulio Malenza and Iacopo Colonnelli and Marco Aldinucci},
editor = {González-Vélez, Horacio and Chis, Adriana E.},
url = {https://hdl.handle.net/2318/1964571},
doi = {10.1109/PDP62718.2024.00010},
issn = {2377-5750},
year = {2024},
date = {2024-03-01},
booktitle = {Proc. of 32nd Euromicro intl. Conference on Parallel, Distributed and Network-based Processing (PDP)},
pages = {1--8},
publisher = {IEEE},
address = {Dublin, Ireland},
abstract = {Optimization problems are one of the main focus of scientific research. Their computational-intensive nature makes them prone to be parallelized with consistent improvements in performance. This paper sheds light on different parallel models for accelerating Karmarkar's Interior-point method. To do so, we assess parallelization strategies for individual operations within the aforementioned Karmarkar's algorithm using OpenMP, GPU acceleration with CUDA, and the recent Parallel Standard C++ Linear Algebra library (PSTL) executing both on GPU and CPU. Our different implementations yield interesting benchmark results that show the optimal approach for parallelizing interior point algorithms for general Linear Programming (LP) problems. In addition, we propose a more theoretical perspective of the parallelization of this algorithm, with a detailed study of our OpenMP implementation, showing the limits of optimizing the single operations},
keywords = {HPC, icsc},
pubstate = {published},
tppubtype = {inproceedings}
}
2023
Gianluca Mittone, Samuele Fonio
Benchmarking Federated Learning Scalability Proceedings Article
In: Proceedings of the 2nd Italian Conference on Big Data and Data Science, ITADATA 2023, September 11-13, 2023, CEUR, Naples, Italy, 2023.
Abstract | Links | BibTeX | Tags: eupilot, HPC, icsc
@inproceedings{23:itadata:extabstract:mittone:fonio,
title = {Benchmarking {Federated Learning} Scalability},
author = {Gianluca Mittone and Samuele Fonio},
url = {https://hdl.handle.net/2318/1933852},
year = {2023},
date = {2023-09-01},
booktitle = {Proceedings of the 2nd Italian Conference on Big Data and Data Science, ITADATA 2023, September 11--13, 2023},
publisher = {CEUR},
address = {Naples, Italy},
abstract = {Federated Learning (FL) is a widespread Machine Learning paradigm handling distributed Big Data. In this work, we demonstrate that different FL frameworks expose different scaling performances despite adopting the same technologies, highlighting the need for a more comprehensive study on the topic.},
keywords = {eupilot, HPC, icsc},
pubstate = {published},
tppubtype = {inproceedings}
}
Valentina Cesare, Ugo Becciani, Alberto Vecchiato, Mario Gilberto Lattanzi, Fabio Pitari, Marco Aldinucci, Beatrice Bucciarelli
The MPI + CUDA Gaia AVU–GSR Parallel Solver Toward Next-generation Exascale Infrastructures Journal Article
In: Publications of the Astronomical Society of the Pacific, vol. 135, no. 1049, 2023.
Abstract | Links | BibTeX | Tags: HPC
@article{23:GAIAMPI_PASP,
title = {The {MPI} + {CUDA} {Gaia} {AVU--GSR} Parallel Solver Toward Next-generation Exascale Infrastructures},
author = {Valentina Cesare and Ugo Becciani and Alberto Vecchiato and Mario Gilberto Lattanzi and Fabio Pitari and Marco Aldinucci and Beatrice Bucciarelli},
url = {https://iopscience.iop.org/article/10.1088/1538-3873/acdf1e/pdf},
doi = {10.1088/1538-3873/acdf1e},
year = {2023},
date = {2023-08-01},
journal = {Publications of the Astronomical Society of the Pacific},
volume = {135},
number = {1049},
abstract = {We ported to the GPU with CUDA the Astrometric Verification Unit–Global Sphere Reconstruction (AVU–GSR) Parallel Solver developed for the ESA Gaia mission, by optimizing a previous OpenACC porting of this application. The code aims to find, with a [10, 100] μarcsec precision, the astrometric parameters of about 10^8 stars, the attitude and instrumental settings of the Gaia satellite, and the global parameter γ of the parametrized Post-Newtonian formalism, by solving a system of linear equations, A × x = b, with the LSQR iterative algorithm. The coefficient matrix A of the final Gaia data set is large, with ∼1011 × 108 elements, and sparse, reaching a size of ∼10–100 TB, typical for the Big Data analysis, which requires an efficient parallelization to obtain scientific results in reasonable timescales. The speedup of the CUDA code over the original AVU–GSR solver, parallelized on the CPU with MPI + OpenMP, increases with the system size and the number of resources, reaching a maximum of ∼14×, >9× over the OpenACC application. This result is obtained by comparing the two codes on the CINECA cluster Marconi100, with 4 V100 GPUs per node. After verifying the agreement between the solutions of a set of systems with different sizes computed with the CUDA and the OpenMP codes and that the solutions showed the required precision, the CUDA code was put in production on Marconi100, essential for an optimal AVU–GSR pipeline and the successive Gaia Data Releases. This analysis represents a first step to understand the (pre-)Exascale behavior of a class of applications that follow the same structure of this code. In the next months, we plan to run this code on the pre-Exascale platform Leonardo of CINECA, with 4 next-generation A200 GPUs per node, toward a porting on this infrastructure, where we expect to obtain even higher performances.},
keywords = {HPC, icsc, eupex},
pubstate = {published},
tppubtype = {article}
}
Gianluca Mittone, Nicolò Tonci, Robert Birke, Iacopo Colonnelli, Doriana Medić, Andrea Bartolini, Roberto Esposito, Emanuele Parisi, Francesco Beneventi, Mirko Polato, Massimo Torquati, Luca Benini, Marco Aldinucci
Experimenting with Emerging RISC-V Systems for Decentralised Machine Learning Proceedings Article
In: 20th ACM International Conference on Computing Frontiers (CF '23), ACM, Bologna, Italy, 2023, ISBN: 979-8-4007-0140-5/23/05, (https://arxiv.org/abs/2302.07946).
Abstract | Links | BibTeX | Tags: ai, confidential, eupilot, HPC, icsc, riscv
@inproceedings{23:mittone:fl-riscv,
title = {Experimenting with Emerging {RISC-V} Systems for Decentralised Machine Learning},
author = {Gianluca Mittone and Nicolò Tonci and Robert Birke and Iacopo Colonnelli and Doriana Medić and Andrea Bartolini and Roberto Esposito and Emanuele Parisi and Francesco Beneventi and Mirko Polato and Massimo Torquati and Luca Benini and Marco Aldinucci},
url = {https://dl.acm.org/doi/pdf/10.1145/3587135.3592211},
doi = {10.1145/3587135.3592211},
isbn = {979-8-4007-0140-5/23/05},
year = {2023},
date = {2023-05-01},
booktitle = {20th ACM International Conference on Computing Frontiers (CF '23)},
publisher = {ACM},
address = {Bologna, Italy},
institution = {Computer Science Department, University of Torino},
abstract = {Decentralised Machine Learning (DML) enables collaborative machine learning without centralised input data. Federated Learning (FL) and Edge Inference are examples of DML. While tools for DML (especially FL) are starting to flourish, many are not flexible and portable enough to experiment with novel systems (e.g., RISC-V), non-fully connected topologies, and asynchronous collaboration schemes. We overcome these limitations via a domain-specific language allowing to map DML schemes to an underlying middleware, i.e. the FastFlow parallel programming library. We experiment with it by generating different working DML schemes on two emerging architectures (ARM-v8, RISC-V) and the x86-64 platform. We characterise the performance and energy efficiency of the presented schemes and systems. As a byproduct, we introduce a RISC-V porting of the PyTorch framework, the first publicly available to our knowledge.},
note = {https://arxiv.org/abs/2302.07946},
keywords = {ai, confidential, eupilot, HPC, icsc, riscv},
pubstate = {published},
tppubtype = {inproceedings}
}
Giorgio Audrito, Alberto Riccardo Martinelli, Gianluca Torta
Parallelising an Aggregate Programming Framework with Message-Passing Interface Proceedings Article
In: 2023 IEEE International Conference on Autonomic Computing and Self-Organizing Systems Companion (ACSOS-C), pp. 140–145, 2023.
@inproceedings{23:acsos:fcppmpi,
title = {Parallelising an Aggregate Programming Framework with {Message-Passing Interface}},
author = {Giorgio Audrito and Alberto Riccardo Martinelli and Gianluca Torta},
doi = {10.1109/ACSOS-C58168.2023.00054},
year = {2023},
date = {2023-01-01},
booktitle = {2023 IEEE International Conference on Autonomic Computing and Self-Organizing Systems Companion (ACSOS-C)},
pages = {140--145},
keywords = {HPC},
pubstate = {published},
tppubtype = {inproceedings}
}
Javier Garcia-Blas, Genaro Sanchez-Gallegos, Cosmin Petre, Alberto Riccardo Martinelli, Marco Aldinucci, Jesus Carretero
Hercules: Scalable and Network Portable In-Memory Ad-Hoc File System for Data-Centric and High-Performance Applications Proceedings Article
In: Cano, José, Dikaiakos, Marios D., Papadopoulos, George A., Pericàs, Miquel, Sakellariou, Rizos (Ed.): Euro-Par 2023: Parallel Processing, pp. 679–693, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-39698-4.
Abstract | BibTeX | Tags: admire, HPC
@inproceedings{10.1007/978-3-031-39698-4_46,
title = {{Hercules}: Scalable and Network Portable In-Memory Ad-Hoc File System for Data-Centric and High-Performance Applications},
author = {Javier Garcia-Blas and Genaro Sanchez-Gallegos and Cosmin Petre and Alberto Riccardo Martinelli and Marco Aldinucci and Jesus Carretero},
editor = {José Cano and Marios D. Dikaiakos and George A. Papadopoulos and Miquel Pericàs and Rizos Sakellariou},
isbn = {978-3-031-39698-4},
year = {2023},
date = {2023-01-01},
booktitle = {Euro-Par 2023: Parallel Processing},
pages = {679--693},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {The growing demands for data processing by new data-intensive applications are putting pressure on the performance and capacity of HPC storage systems. The advancement in storage technologies, such as NVMe and persistent memory, are aimed at meeting these demands. However, relying solely on ultra-fast storage devices is not cost-effective, leading to the need for multi-tier storage hierarchies to move data based on its usage. To address this issue, ad-hoc file systems have been proposed as a solution. They utilise the available storage of compute nodes, such as memory and persistent storage, to create a temporary file system that adapts to the application behaviour in the HPC environment. This work presents the design, implementation, and evaluation of a distributed ad-hoc in-memory storage system (Hercules), highlighting the new communication model included in Hercules. This communication model takes advantage of the Unified Communication X framework (UCX). This solution leverages the capabilities of RDMA protocols, including Infiniband, Onmipath, shared memory, and zero-copy transfers. The preliminary evaluation results show excellent network utilisation compared with other existing technologies.},
keywords = {admire, HPC},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Marco Aldinucci, Valentina Cesare, Iacopo Colonnelli, Alberto Riccardo Martinelli, Gianluca Mittone, Barbara Cantalupo, Carlo Cavazzoni, Maurizio Drocco
Practical Parallelization of Scientific Applications with OpenMP, OpenACC and MPI Journal Article
In: Journal of Parallel and Distributed Computing, vol. 157, pp. 13–29, 2021.
Abstract | Links | BibTeX | Tags: HPC
@article{21:jpdc:loop,
title = {Practical Parallelization of Scientific Applications with {OpenMP}, {OpenACC} and {MPI}},
author = {Marco Aldinucci and Valentina Cesare and Iacopo Colonnelli and Alberto Riccardo Martinelli and Gianluca Mittone and Barbara Cantalupo and Carlo Cavazzoni and Maurizio Drocco},
url = {https://iris.unito.it/retrieve/handle/2318/1792557/770851/Practical_Parallelization_JPDC_preprint.pdf},
doi = {10.1016/j.jpdc.2021.05.017},
year = {2021},
date = {2021-01-01},
journal = {Journal of Parallel and Distributed Computing},
volume = {157},
pages = {13--29},
abstract = {This work aims at distilling a systematic methodology to modernize existing sequential scientific codes with a little re-designing effort, turning an old codebase into \emph{modern} code, i.e., parallel and robust code. We propose a semi-automatic methodology to parallelize scientific applications designed with a purely sequential programming mindset, possibly using global variables, aliasing, random number generators, and stateful functions. We demonstrate that the same methodology works for the parallelization in the shared memory model (via OpenMP), message passing model (via MPI), and General Purpose Computing on GPU model (via OpenACC). The method is demonstrated parallelizing four real-world sequential codes in the domain of physics and material science. The methodology itself has been distilled in collaboration with MSc students of the Parallel Computing course at the University of Torino, that applied it for the first time to the project works that they presented for the final exam of the course. Every year the course hosts some special lectures from industry representatives, who present how they use parallel computing and offer codes to be parallelizeda.},
keywords = {HPC},
pubstate = {published},
tppubtype = {article}
}
Daniele D'Agostino, Ivan Merelli, Marco Aldinucci, Daniele Cesini
Hardware and Software Solutions for Energy-Efficient Computing in Scientific Programming Journal Article
In: Scientific Programming, vol. 2021, pp. 5514284, 2021, ISBN: 1058-9244.
Abstract | Links | BibTeX | Tags: HPC
@article{21:dagostino:lowpower,
title = {Hardware and Software Solutions for Energy-Efficient Computing in Scientific Programming},
author = {Daniele D'Agostino and Ivan Merelli and Marco Aldinucci and Daniele Cesini},
url = {https://downloads.hindawi.com/journals/sp/2021/5514284.pdf},
doi = {10.1155/2021/5514284},
issn = {1058-9244},
year = {2021},
date = {2021-01-01},
journal = {Scientific Programming},
volume = {2021},
pages = {5514284},
publisher = {Hindawi},
abstract = {Energy consumption is one of the major issues in today’s computer science, and an increasing number of scientific communities are interested in evaluating the tradeoff between time-to-solution and energy-to-solution. Despite, in the last two decades, computing which revolved around centralized computing infrastructures, such as supercomputing and data centers, the wide adoption of the Internet of Things (IoT) paradigm is currently inverting this trend due to the huge amount of data it generates, pushing computing power back to places where the data are generated—the so-called fog/edge computing. This shift towards a decentralized model requires an equivalent change in the software engineering paradigms, development environments, hardware tools, languages, and computation models for scientific programming because the local computational capabilities are typically limited and require a careful evaluation of power consumption. This paper aims to present how these concepts can be actually implemented in scientific software by presenting the state of the art of powerful, less power-hungry processors from one side and energy-aware tools and techniques from the other one.},
keywords = {HPC},
pubstate = {published},
tppubtype = {article}
}
2020
Vasco Amaral, Beatriz Norberto, Miguel Goulão, Marco Aldinucci, Siegfried Benkner, Andrea Bracciali, Paulo Carreira, Edgars Celms, Luís Correia, Clemens Grelck, Helen Karatza, Christoph Kessler, Peter Kilpatrick, Hugo Martiniano, Ilias Mavridis, Sabri Pllana, Ana Respício, José Simão, Luís Veiga, Ari Visa
Programming languages for data-Intensive HPC applications: A systematic mapping study Journal Article
In: Parallel Computing, pp. 102584, 2020, ISSN: 0167-8191.
Abstract | Links | BibTeX | Tags: HPC
@article{20:sms:chipset,
title = {Programming languages for data-Intensive {HPC} applications: A systematic mapping study},
author = {Vasco Amaral and Beatriz Norberto and Miguel Goulão and Marco Aldinucci and Siegfried Benkner and Andrea Bracciali and Paulo Carreira and Edgars Celms and Luís Correia and Clemens Grelck and Helen Karatza and Christoph Kessler and Peter Kilpatrick and Hugo Martiniano and Ilias Mavridis and Sabri Pllana and Ana Respício and José Simão and Luís Veiga and Ari Visa},
url = {https://iris.unito.it/retrieve/689605/1-s2.0-S0167819119301759-main.pdf},
doi = {10.1016/j.parco.2019.102584},
issn = {0167-8191},
year = {2020},
date = {2020-01-01},
journal = {Parallel Computing},
pages = {102584},
abstract = {A major challenge in modelling and simulation is the need to combine expertise in both software technologies and a given scientific domain. When High-Performance Computing (HPC) is required to solve a scientific problem, software development becomes a problematic issue. Considering the complexity of the software for HPC, it is useful to identify programming languages that can be used to alleviate this issue. Because the existing literature on the topic of HPC is very dispersed, we performed a Systematic Mapping Study (SMS) in the context of the European COST Action cHiPSet. This literature study maps characteristics of various programming languages for data-intensive HPC applications, including category, typical user profiles, effectiveness, and type of articles. We organised the SMS in two phases. In the first phase, relevant articles are identified employing an automated keyword-based search in eight digital libraries. This lead to an initial sample of 420 papers, which was then narrowed down in a second phase by human inspection of article abstracts, titles and projects to 152 relevant articles published in the period 2006–2018. The analysis of these articles enabled us to identify 26 programming languages referred to in 33 of relevant articles. We compared the outcome of the mapping study with results of our questionnaire-based survey that involved 57 HPC experts. The mapping study and the survey revealed that the desired features of programming languages for data-intensive HPC applications are portability, performance and usability. Furthermore, we observed that the majority of the programming languages used in the context of data-intensive HPC applications are text-based general-purpose programming languages. Typically these have a steep learning curve, which makes them difficult to adopt. We believe that the outcome of this study will inspire future research and development in programming languages for data-intensive HPC applications.},
keywords = {HPC},
pubstate = {published},
tppubtype = {article}
}
2019
Clemens Grelck, Ewa Niewiadomska-Szynkiewicz, Marco Aldinucci, Andrea Bracciali, Elisabeth Larsson
Why High-Performance Modelling and Simulation for Big Data Applications Matters Book Chapter
In: Kołodziej, Joanna, González-Vélez, Horacio (Ed.): High-Performance Modelling and Simulation for Big Data Applications: Selected Results of the COST Action IC1406 cHiPSet, no. 11400, pp. 1–35, Springer International Publishing, Cham, 2019, ISBN: 978-3-030-16272-6.
Abstract | Links | BibTeX | Tags: HPC
@inbook{Grelck2019,
title = {Why High-Performance Modelling and Simulation for Big Data Applications Matters},
author = {Clemens Grelck and Ewa Niewiadomska-Szynkiewicz and Marco Aldinucci and Andrea Bracciali and Elisabeth Larsson},
editor = {Joanna Kołodziej and Horacio González-Vélez},
url = {https://link.springer.com/content/pdf/10.1007%2F978-3-030-16272-6_1.pdf},
doi = {10.1007/978-3-030-16272-6_1},
isbn = {978-3-030-16272-6},
year = {2019},
date = {2019-01-01},
booktitle = {High-Performance Modelling and Simulation for Big Data Applications: Selected Results of the COST Action IC1406 cHiPSet},
volume = {11400},
pages = {1--35},
publisher = {Springer International Publishing},
address = {Cham},
series = {LNCS},
abstract = {Modelling and Simulation (M&S) offer adequate abstractions to manage the complexity of analysing big data in scientific and engineering domains. Unfortunately, big data problems are often not easily amenable to efficient and effective use of High Performance Computing (HPC) facilities and technologies. Furthermore, M&S communities typically lack the detailed expertise required to exploit the full potential of HPC solutions while HPC specialists may not be fully aware of specific modelling and simulation requirements and applications.},
keywords = {HPC},
pubstate = {published},
tppubtype = {inbook}
}
2018
Claudia Misale, Maurizio Drocco, Guy Tremblay, Alberto R. Martinelli, Marco Aldinucci
PiCo: High-performance data analytics pipelines in modern C++ Journal Article
In: Future Generation Computer Systems, vol. 87, pp. 392–403, 2018.
Abstract | Links | BibTeX | Tags: fastflow, HPC, toreador
@article{18:fgcs:pico,
title = {{PiCo}: High-performance data analytics pipelines in modern {C++}},
author = {Claudia Misale and Maurizio Drocco and Guy Tremblay and Alberto R. Martinelli and Marco Aldinucci},
url = {https://iris.unito.it/retrieve/handle/2318/1668444/414280/fgcs_pico.pdf},
doi = {10.1016/j.future.2018.05.030},
year = {2018},
date = {2018-01-01},
journal = {Future Generation Computer Systems},
volume = {87},
pages = {392--403},
abstract = {In this paper, we present a new C++ API with a fluent interface called PiCo (Pipeline Composition). PiCo's programming model aims at making easier the programming of data analytics applications while preserving or enhancing their performance. This is attained through three key design choices: (1) unifying batch and stream data access models, (2) decoupling processing from data layout, and (3) exploiting a stream-oriented, scalable, efficient C++11 runtime system. PiCo proposes a programming model based on pipelines and operators that are polymorphic with respect to data types in the sense that it is possible to reuse the same algorithms and pipelines on different data models (e.g., streams, lists, sets, etc.). Preliminary results show that PiCo, when compared to Spark and Flink, can attain better performances in terms of execution times and can hugely improve memory utilization, both for batch and stream processing.},
keywords = {fastflow, HPC, toreador},
pubstate = {published},
tppubtype = {article}
}
Marco Aldinucci, Marco Danelutto, Maurizio Drocco, Peter Kilpatrick, Claudia Misale, Guilherme Peretti Pezzi, Massimo Torquati
A Parallel Pattern for Iterative Stencil + Reduce Journal Article
In: Journal of Supercomputing, vol. 74, no. 11, pp. 5690–5705, 2018.
Abstract | Links | BibTeX | Tags: HPC, repara, rephrase
@article{16:stencilreduce:jsupe,
title = {A Parallel Pattern for Iterative Stencil + Reduce},
author = {Marco Aldinucci and Marco Danelutto and Maurizio Drocco and Peter Kilpatrick and Claudia Misale and Guilherme Peretti Pezzi and Massimo Torquati},
url = {https://iris.unito.it/retrieve/0716fc42-53d7-48c0-9469-697aabfe7759/jspaper.pdf},
doi = {10.1007/s11227-016-1871-z},
year = {2018},
date = {2018-01-01},
journal = {Journal of Supercomputing},
volume = {74},
number = {11},
pages = {5690--5705},
abstract = {We advocate the Loop-of-stencil-reduce pattern as a means of simplifying the implementation of data-parallel programs on heterogeneous multi-core platforms. Loop-of-stencil-reduce is general enough to subsume map, reduce, map-reduce, stencil, stencil-reduce, and, crucially, their usage in a loop in both data-parallel and streaming applications, or a combination of both. The pattern makes it possible to deploy a single stencil computation kernel on different GPUs. We discuss the implementation of Loop-of-stencil-reduce in FastFlow, a framework for the implementation of applications based on the parallel patterns. Experiments are presented to illustrate the use of Loop-of-stencil-reduce in developing data-parallel kernels running on heterogeneous systems.},
keywords = {HPC, repara, rephrase},
pubstate = {published},
tppubtype = {article}
}
2017
Salvatore Cuomo, Marco Aldinucci, Massimo Torquati
Guest Editorial for Programming Models and Algorithms for Data Analysis in HPC Systems Journal Article
In: International Journal of Parallel Programming, pp. 1–3, 2017, ISSN: 0885-7458, (Editorial).
Abstract | Links | BibTeX | Tags: HPC
@article{17:ijpp:cuomo:editorial,
title = {Guest Editorial for Programming Models and Algorithms for Data Analysis in {HPC} Systems},
author = {Salvatore Cuomo and Marco Aldinucci and Massimo Torquati},
url = {https://doi.org/10.1007/s10766-017-0531-0},
doi = {10.1007/s10766-017-0531-0},
issn = {0885-7458},
year = {2017},
date = {2017-10-01},
journal = {International Journal of Parallel Programming},
pages = {1--3},
abstract = {Performance is still the hottest keyword in parallel and distributed systems: performance evaluation, design for performance, performance portability and scalability are just a few of the many possible declinations that nowadays are of paramount scientific importance. To tackle these challenges, system architects, applications programmers and data center managers need methodological tools to fit at best the overall workload and the available architecture, maximizing the overall performances and minimizing overheads, energy consumption or idle time while application developers mainly aim at algorithmic and software oriented performances. Proper methodologies for modeling and analysis are the way to turn complexity into opportunities.
This Special Issue of the International Journal of Parallel Programming welcomes papers that present practical and methodological approaches to analytical and simulative performance evaluation for architecturally complex systems and high-performance parallel and computing algorithm. Successful contributions have been done on specific technologies, applications and innovative solutions to system specifications and algorithmic schemes both.},
note = {Editorial},
keywords = {HPC},
pubstate = {published},
tppubtype = {article}
}
This Special Issue of the International Journal of Parallel Programming welcomes papers that present practical and methodological approaches to analytical and simulative performance evaluation for architecturally complex systems and high-performance parallel and computing algorithm. Successful contributions have been done on specific technologies, applications and innovative solutions to system specifications and algorithmic schemes both.
Paolo Viviani, Massimo Torquati, Marco Aldinucci, Roberto d'Ippolito
Multiple back-end support for the Armadillo linear algebra interface Proceedings Article
In: In proc. of the 32nd ACM Symposium on Applied Computing (SAC), pp. 1566–1573, Marrakesh, Morocco, 2017.
Abstract | Links | BibTeX | Tags: HPC, repara, rephrase
@inproceedings{17:sac:armadillo,
title = {Multiple back-end support for the {Armadillo} linear algebra interface},
author = {Paolo Viviani and Massimo Torquati and Marco Aldinucci and Roberto d'Ippolito},
url = {https://iris.unito.it/retrieve/handle/2318/1626229/299089/armadillo_4aperto.pdf},
year = {2017},
date = {2017-04-01},
booktitle = {Proc. of the 32nd {ACM} Symposium on Applied Computing ({SAC})},
pages = {1566--1573},
address = {Marrakesh, Morocco},
abstract = {The Armadillo C++ library provides programmers with a high-level Matlab-like syntax for linear algebra. Its design aims at providing a good balance between speed and ease of use. It can be linked with different back-ends, i.e. different LAPACK-compliant libraries. In this work we present a novel run-time support of Armadillo, which gracefully extends mainstream implementation to enable back-end switching without recompilation and multiple back-end support. The extension is specifically designed to not affect Armadillo class template prototypes, thus to be easily interoperable with future evolutions of the Armadillo library itself. The proposed software stack is then tested for functionality and performance against a kernel code extracted from an industrial application.},
keywords = {HPC, repara, rephrase},
pubstate = {published},
tppubtype = {inproceedings}
}
Marco Aldinucci, Stefano Bagnasco, Stefano Lusso, Paolo Pasteris, Sergio Rabellino
OCCAM: a flexible, multi-purpose and extendable HPC cluster Proceedings Article
In: Journal of Physics: Conf. Series (CHEP 2016), pp. 082039, San Francisco, USA, 2017.
Abstract | Links | BibTeX | Tags: c3s, HPC
@inproceedings{16:occam:chep,
title = {{OCCAM}: a flexible, multi-purpose and extendable {HPC} cluster},
author = {Marco Aldinucci and Stefano Bagnasco and Stefano Lusso and Paolo Pasteris and Sergio Rabellino},
url = {http://iopscience.iop.org/article/10.1088/1742-6596/898/8/082039/meta},
doi = {10.1088/1742-6596/898/8/082039},
year = {2017},
date = {2017-01-01},
booktitle = {Journal of Physics: Conf. Series (CHEP 2016)},
volume = {898},
number = {8},
pages = {082039},
address = {San Francisco, USA},
abstract = {Obtaining CPU cycles on an HPC cluster is nowadays relatively simple and sometimes even cheap for academic institutions. However, in most of the cases providers of HPC services would not allow changes on the configuration, implementation of special features or a lower-level control on the computing infrastructure and networks, for example for testing new computing patterns or conducting research on HPC itself. The variety of use cases proposed by several departments of the University of Torino, including ones from solid-state chemistry, high-energy physics, computer science, big data analytics, computational biology, genomics and many others, called for different and sometimes conflicting configurations; furthermore, several R&D activities in the field of scientific computing, with topics ranging from GPU acceleration to Cloud Computing technologies, needed a platform to be carried out on. The Open Computing Cluster for Advanced data Manipulation (OCCAM) is a multi-purpose flexible HPC cluster designed and operated by a collaboration between the University of Torino and the Torino branch of the Istituto Nazionale di Fisica Nucleare. It is aimed at providing a flexible, reconfigurable and extendable infrastructure to cater to a wide range of different scientific computing needs, as well as a platform for R&D activities on computational technologies themselves. Extending it with novel architecture CPU, accelerator or hybrid microarchitecture (such as forthcoming Intel Xeon Phi Knights Landing) should be as a simple as plugging a node in a rack. 
The initial system counts slightly more than 1100 cpu cores and includes different types of computing nodes (standard dual-socket nodes, large quad-sockets nodes with 768 GB RAM, and multi-GPU nodes) and two separate disk storage subsystems: a smaller high-performance scratch area, based on the Lustre file system, intended for direct computational I/O and a larger one, of the order of 1PB, to archive near-line data for archival purposes. All the components of the system are interconnected through a 10Gb/s Ethernet layer with one-level topology and an InfiniBand FDR 56Gbps layer in fat-tree topology. A system of this kind, heterogeneous and reconfigurable by design, poses a number of challenges related to the frequency at which heterogeneous hardware resources might change their availability and shareability status, which in turn affect methods and means to allocate, manage, optimize, bill, monitor VMs, virtual farms, jobs, interactive bare-metal sessions, etc. This poster describes some of the use cases that prompted the design ad construction of the HPC cluster, its architecture and a first characterization of its performance by some synthetic benchmark tools and a few realistic use-case tests.},
keywords = {c3s, HPC},
pubstate = {published},
tppubtype = {inproceedings}
}
Concetto Spampinato, Simone Palazzo, Daniela Giordano, Marco Aldinucci, Rosalia Leonardi
Deep learning for automated skeletal bone age assessment in X-ray images Journal Article
In: Medical Image Analysis, vol. 36, pp. 41–51, 2017.
Abstract | Links | BibTeX | Tags: HPC
@article{17:deepx:conce,
  author    = {Concetto Spampinato and Simone Palazzo and Daniela Giordano and Marco Aldinucci and Rosalia Leonardi},
  title     = {Deep learning for automated skeletal bone age assessment in X-ray images},
  journal   = {Medical Image Analysis},
  volume    = {36},
  pages     = {41–51},
  year      = {2017},
  date      = {2017-01-01},
  url       = {https://iris.unito.it/retrieve/e27ce42b-5743-2581-e053-d805fe0acbaa/main.pdf},
  doi       = {10.1016/j.media.2016.10.010},
  abstract  = {Skeletal bone age assessment is a common clinical practice to investigate endocrinology, genetic and growth disorders in children. It is generally performed by radiological examination of the left hand by using either the Greulich and Pyle (G&P) method or the Tanner–Whitehouse (TW) one. However, both clinical procedures show several limitations, from the examination effort of radiologists to (most importantly) significant intra- and inter-operator variability. To address these problems, several automated approaches (especially relying on the TW method) have been proposed; nevertheless, none of them has been proved able to generalize to different races, age ranges and genders. In this paper, we propose and test several deep learning approaches to assess skeletal bone age automatically; the results showed an average discrepancy between manual and automatic evaluation of about 0.8 years, which is state-of-the-art performance. Furthermore, this is the first automated skeletal bone age assessment work tested on a public dataset and for all age ranges, races and genders, for which the source code is available, thus representing an exhaustive baseline for future research in the field. Beside the specific application scenario, this paper aims at providing answers to more general questions about deep learning on medical images: from the comparison between deep-learned features and manually-crafted ones, to the usage of deep-learning methods trained on general imagery for medical problems, to how to train a CNN with few images.},
  keywords  = {HPC},
  pubstate  = {published},
  tppubtype = {article},
}
Wissam Abu Ahmad, Andrea Bartolini, Francesco Beneventi, Luca Benini, Andrea Borghesi, Marco Cicala, Privato Forestieri, Cosimo Gianfreda, Daniele Gregori, Antonio Libri, Filippo Spiga, Simone Tinti
Design of an Energy Aware Petaflops Class High Performance Cluster Based on Power Architecture Proceedings Article
In: 2017 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPS Workshops 2017, Orlando / Buena Vista, FL, USA, May 29 - June 2, 2017, pp. 964–973, 2017.
@inproceedings{DBLP:conf/ipps/AhmadBBBBCFGGLS17,
title = {Design of an Energy Aware Petaflops Class High Performance Cluster Based on Power Architecture},
author = {Wissam Abu Ahmad and Andrea Bartolini and Francesco Beneventi and Luca Benini and Andrea Borghesi and Marco Cicala and Privato Forestieri and Cosimo Gianfreda and Daniele Gregori and Antonio Libri and Filippo Spiga and Simone Tinti},
doi = {10.1109/IPDPSW.2017.22},
year = {2017},
date = {2017-01-01},
booktitle = {2017 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPS Workshops 2017, Orlando / Buena Vista, FL, USA, May 29 - June 2, 2017},
pages = {964–973},
keywords = {HPC},
pubstate = {published},
tppubtype = {inproceedings}
}
2016
Paolo Viviani, Marco Aldinucci, Roberto d'Ippolito
An hybrid linear algebra framework for engineering Proceedings Article
In: Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES) – Poster Abstracts, Fiuggi, Italy, 2016.
Abstract | Links | BibTeX | Tags: HPC, repara
@inproceedings{16:acaces:armadillo,
title = {An hybrid linear algebra framework for engineering},
author = {Paolo Viviani and Marco Aldinucci and Roberto d'Ippolito},
url = {https://iris.unito.it/retrieve/handle/2318/1622382/300198/armadillo.pdf},
year = {2016},
date = {2016-07-01},
booktitle = {Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES) – Poster Abstracts},
address = {Fiuggi, Italy},
abstract = {The aim of this work is to provide developers and domain experts with simple (Matlab-like) interface for performing linear algebra tasks while retaining state-of-the-art computational speed. To achieve this goal we extend Armadillo C++ library is extended in order to support with multiple LAPACK-compliant back-ends targeting different architectures including CUDA GPUs; moreover our approach involves the possibility of dynamically switching between such back-ends in order to select the one which is most convenient based on the specific problem and hardware configuration. This approach is eventually validated within an industrial environment.},
keywords = {HPC, repara},
pubstate = {published},
tppubtype = {inproceedings}
}
Bogdan Nicolae, Carlos H. A. Costa, Claudia Misale, Kostas Katrinis, Yoonho Park
Towards Memory-Optimized Data Shuffling Patterns for Big Data Analytics Proceedings Article
In: IEEE/ACM 16th Intl. Symposium on Cluster, Cloud and Grid Computing, CCGrid 2016, IEEE, Cartagena, Colombia, 2016.
Abstract | Links | BibTeX | Tags: HPC
@inproceedings{16:ccgrid:misale,
  author    = {Bogdan Nicolae and Carlos H. A. Costa and Claudia Misale and Kostas Katrinis and Yoonho Park},
  title     = {Towards Memory-Optimized Data Shuffling Patterns for Big Data Analytics},
  booktitle = {IEEE/ACM 16th Intl. Symposium on Cluster, Cloud and Grid Computing, CCGrid 2016},
  publisher = {IEEE},
  address   = {Cartagena, Colombia},
  year      = {2016},
  date      = {2016-01-01},
  url       = {http://ieeexplore.ieee.org/document/7515716/},
  doi       = {10.1109/CCGrid.2016.85},
  abstract  = {Big data analytics is an indispensable tool in transforming science, engineering, medicine, healthcare, finance and ultimately business itself. With the explosion of data sizes and need for shorter time-to-solution, in-memory platforms such as Apache Spark gain increasing popularity. However, this introduces important challenges, among which data shuffling is particularly difficult: on one hand it is a key part of the computation that has a major impact on the overall performance and scalability so its efficiency is paramount, while on the other hand it needs to operate with scarce memory in order to leave as much memory available for data caching. In this context, efficient scheduling of data transfers such that it addresses both dimensions of the problem simultaneously is non-trivial. State-of-the-art solutions often rely on simple approaches that yield sub optimal performance and resource usage. This paper contributes a novel shuffle data transfer strategy that dynamically adapts to the computation with minimal memory utilization, which we briefly underline as a series of design principles.},
  keywords  = {HPC},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
Paolo Viviani, Marco Aldinucci, Roberto d'Ippolito, Jean Lemeire, Dean Vucinic
A flexible numerical framework for engineering - a Response Surface Modelling application Unpublished
2016.
Abstract | BibTeX | Tags: HPC, repara, rephrase
@unpublished{16:acex:armadillo,
title = {A flexible numerical framework for engineering - a Response Surface Modelling application},
author = {Paolo Viviani and Marco Aldinucci and Roberto d'Ippolito and Jean Lemeire and Dean Vucinic},
year = {2016},
date = {2016-01-01},
booktitle = {10th Intl. Conference on Advanced Computational Engineering and Experimenting (ACE-X)},
note = {Presented at the 10th Intl. Conference on Advanced Computational Engineering and Experimenting (ACE-X)},
abstract = {This work presents the innovative approach adopted for the development of a new numerical software framework for accelerating Dense Linear Algebra calculations and its application within an engineering context. In particular, Response Surface Models (RSM) are a key tool to reduce the computational effort involved in engineering design processes like design optimization. However, RSMs may prove to be too expensive to be computed when the dimensionality of the system and/or the size of the dataset to be synthesized is significantly high or when a large number of different Response Surfaces has to be calculated in order to improve the overall accuracy (e.g. like when using Ensemble Modelling techniques). On the other hand, it is a known challenge that the potential of modern hybrid hardware (e.g. multicore, GPUs) is not exploited by current engineering tools, while they can lead to a significant performance improvement. To fill this gap, a software framework is being developed that enables the hybrid and scalable acceleration of the linear algebra core for engineering applications and especially of RSMs calculations with a user-friendly syntax that allows good portability between different hardware architectures, with no need of specific expertise in parallel programming and accelerator technology. The effectiveness of this framework is shown by comparing an accelerated code to a single-core calculation of a Radial Basis Function RSM on some benchmark datasets. This approach is then validated within a real-life engineering application and the achievements are presented and discussed.},
keywords = {HPC, repara, rephrase},
pubstate = {published},
tppubtype = {unpublished}
}
2015
Marco Aldinucci, Marco Danelutto, Maurizio Drocco, Peter Kilpatrick, Guilherme Peretti Pezzi, Massimo Torquati
The Loop-of-Stencil-Reduce paradigm Proceedings Article
In: Proc. of Intl. Workshop on Reengineering for Parallelism in Heterogeneous Parallel Platforms (RePara), pp. 172–177, IEEE, Helsinki, Finland, 2015.
Abstract | Links | BibTeX | Tags: fastflow, HPC, repara
@inproceedings{opencl:ff:ispa:15,
  author    = {Marco Aldinucci and Marco Danelutto and Maurizio Drocco and Peter Kilpatrick and Guilherme Peretti Pezzi and Massimo Torquati},
  title     = {The Loop-of-Stencil-Reduce paradigm},
  booktitle = {Proc. of Intl. Workshop on Reengineering for Parallelism in Heterogeneous Parallel Platforms (RePara)},
  pages     = {172–177},
  publisher = {IEEE},
  address   = {Helsinki, Finland},
  year      = {2015},
  date      = {2015-08-01},
  url       = {https://iris.unito.it/retrieve/handle/2318/1523738/52857/15_RePara_ISPA.pdf},
  doi       = {10.1109/Trustcom.2015.628},
  abstract  = {In this paper we advocate the Loop-of-stencil-reduce pattern as a way to simplify the parallel programming of heterogeneous platforms (multicore+GPUs). Loop-of-Stencil-reduce is general enough to subsume map, reduce, map-reduce, stencil, stencil-reduce, and, crucially, their usage in a loop. It transparently targets (by using OpenCL) combinations of CPU cores and GPUs, and it makes it possible to simplify the deployment of a single stencil computation kernel on different GPUs. The paper discusses the implementation of Loop-of-stencil-reduce within the FastFlow parallel framework, considering a simple iterative data-parallel application as running example (Game of Life) and a highly effective parallel filter for visual data restoration to assess performance. Thanks to the high-level design of the Loop-of-stencil-reduce, it was possible to run the filter seamlessly on a multicore machine, on multi-GPUs, and on both.},
  keywords  = {fastflow, HPC, repara},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
Marco Aldinucci, Guilherme Peretti Pezzi, Maurizio Drocco, Concetto Spampinato, Massimo Torquati
Parallel Visual Data Restoration on Multi-GPGPUs using Stencil-Reduce Pattern Journal Article
In: International Journal of High Performance Computing Applications, vol. 29, no. 4, pp. 461–472, 2015.
Abstract | Links | BibTeX | Tags: fastflow, HPC, impact, paraphrase
@article{ff:denoiser:ijhpca:15,
  author    = {Marco Aldinucci and Guilherme Peretti Pezzi and Maurizio Drocco and Concetto Spampinato and Massimo Torquati},
  title     = {Parallel Visual Data Restoration on Multi-GPGPUs using Stencil-Reduce Pattern},
  journal   = {International Journal of High Performance Computing Applications},
  volume    = {29},
  number    = {4},
  pages     = {461–472},
  year      = {2015},
  date      = {2015-01-01},
  url       = {https://iris.unito.it/retrieve/handle/2318/1522073/299200/ijhpca_4aperto.pdf},
  doi       = {10.1177/1094342014567907},
  abstract  = {In this paper, a highly effective parallel filter for visual data restoration is presented. The filter is designed following a skeletal approach, using a newly proposed stencil-reduce, and has been implemented by way of the FastFlow parallel programming library. As a result of its high-level design, it is possible to run the filter seamlessly on a multicore machine, on multi-GPGPUs, or on both. The design and implementation of the filter are discussed, and an experimental evaluation is presented.},
  keywords  = {fastflow, HPC, impact, paraphrase},
  pubstate  = {published},
  tppubtype = {article},
}
2014
Marco Aldinucci, Massimo Torquati, Maurizio Drocco, Guilherme Peretti Pezzi, Concetto Spampinato
FastFlow: Combining Pattern-Level Abstraction and Efficiency in GPGPUs Proceedings Article
In: GPU Technology Conference (GTC), San Jose, CA, USA, 2014.
Abstract | Links | BibTeX | Tags: fastflow, HPC, impact, paraphrase
@inproceedings{ff:gtc:2014,
  author    = {Marco Aldinucci and Massimo Torquati and Maurizio Drocco and Guilherme Peretti Pezzi and Concetto Spampinato},
  title     = {FastFlow: Combining Pattern-Level Abstraction and Efficiency in GPGPUs},
  booktitle = {GPU Technology Conference (GTC)},
  address   = {San Jose, CA, USA},
  year      = {2014},
  date      = {2014-03-01},
  url       = {http://calvados.di.unipi.it/storage/talks/2014_S4729-Marco-Aldinucci.pdf},
  abstract  = {Learn how FastFlow's parallel patterns can be used to design parallel applications for execution on both CPUs and GPGPUs while avoiding most of the complex low-level detail needed to make them efficient, portable and rapid to prototype. As use case, we will show the design and effectiveness of a novel universal image filtering template based on the variational approach.},
  keywords  = {fastflow, HPC, impact, paraphrase},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
Marco Aldinucci, Massimo Torquati, Maurizio Drocco, Guilherme Peretti Pezzi, Concetto Spampinato
An Overview of FastFlow: Combining Pattern-Level Abstraction and Efficiency in GPGPUs Proceedings Article
In: GPU Technology Conference (GTC), San Jose, CA, USA, 2014.
Abstract | Links | BibTeX | Tags: fastflow, HPC, impact, paraphrase
@inproceedings{ff:gtc:2014:short,
  author    = {Marco Aldinucci and Massimo Torquati and Maurizio Drocco and Guilherme Peretti Pezzi and Concetto Spampinato},
  title     = {An Overview of FastFlow: Combining Pattern-Level Abstraction and Efficiency in GPGPUs},
  booktitle = {GPU Technology Conference (GTC)},
  address   = {San Jose, CA, USA},
  year      = {2014},
  date      = {2014-03-01},
  url       = {http://calvados.di.unipi.it/storage/talks/2014_S4585-Marco-Aldinucci.pdf},
  abstract  = {Get an overview of FastFlow's parallel patterns can be used to design parallel applications for execution on both CPUs and GPGPUs while avoiding most of the complex low-level detail needed to make them efficient, portable and rapid to prototype. For a more detailed and technical review of FastFlow's parallel patterns as well as a use case where we will show the design and effectiveness of a novel universal image filtering template based on the variational approach.},
  keywords  = {fastflow, HPC, impact, paraphrase},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
Maurizio Drocco, Marco Aldinucci, Massimo Torquati
A Dynamic Memory Allocator for heterogeneous platforms Proceedings Article
In: Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES) – Poster Abstracts, HiPEAC, Fiuggi, Italy, 2014.
Abstract | Links | BibTeX | Tags: fastflow, HPC
@inproceedings{ff:acaces:14,
  author    = {Maurizio Drocco and Marco Aldinucci and Massimo Torquati},
  title     = {A Dynamic Memory Allocator for heterogeneous platforms},
  booktitle = {Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES) – Poster Abstracts},
  publisher = {HiPEAC},
  address   = {Fiuggi, Italy},
  year      = {2014},
  date      = {2014-01-01},
  url       = {http://calvados.di.unipi.it/storage/paper_files/2014_ACACES_ex-abstract.pdf},
  abstract  = {Modern computers are built upon heterogeneous multi-core/many cores architectures (e.g. GPGPU connected to multi-core CPU). Achieving peak performance on these architectures is hard and may require a substantial programming effort. High-level programming patterns, coupled with efficient low-level runtime supports, have been proposed to relieve the programmer from worrying about low-level details such as synchronisation of racing processes as well as those fine tunings needed to improve the overall performance. Among them are (parallel) dynamic memory allocation and effective exploitation of the memory hierarchy. The memory allocator is often a bottleneck that severely limits program scalability, robustness and portability on parallel systems. In this work we introduce a novel memory allocator, based on the FastFlow's allocator and the recently proposed CUDA Unified Memory, which aims to efficiently integrate host and device memories into a unique dynamic-allocable memory space, accessible transparently by both host and device code.},
  keywords  = {fastflow, HPC},
  pubstate  = {published},
  tppubtype = {inproceedings},
}