We present a new cache oblivious scheme for iterative stencil computations that performs beyond system bandwidth limitations as though gigabytes of data could reside in an enormous on-chip cache. We compare execution times for 2D and 3D spatial domains with up to 128 million double precision elements for constant and variable stencils against hand-optimized naive code and the automatic polyhedral parallelizer and locality optimizer PluTo and demonstrate the clear superiority of our results. The performance benefits stem from a tiling structure that caters for data locality, parallelism and vectorization simultaneously. Rather than tiling the iteration space from inside, we take an exterior approach with a predefined hierarchy, simple regular parallelogram tiles and a locality preserving parallelization. These advantages come at the cost of an irregular work-load distribution but a tightly integrated load-balancer ensures a high utilization of all resources.
@inproceedings{StShPa_10CORALS,
author = {Strzodka, Robert and Shaheen, Mohammed and Pajak, Dawid and Seidel, Hans-Peter},
title = {Cache oblivious parallelograms in iterative stencil computations},
booktitle = {ICS '10: Proceedings of the 24th ACM International Conference on Supercomputing},
month = jun,
year = {2010},
isbn = {978-1-4503-0018-6},
pages = {49--59},
location = {Tsukuba, Ibaraki, Japan},
doi = {http://doi.acm.org/10.1145/1810085.1810096},
publisher = {ACM},
}