@online{corr/abs-1811-03254,
TITLE = {(Near) Optimal Parallelism Bound for Fully Asynchronous Coordinate Descent with Linear Speedup},
AUTHOR = {Cheung, Yun Kuen and Cole, Richard and Tao, Yixin},
LANGUAGE = {eng},
URL = {http://arxiv.org/abs/1811.03254},
EPRINT = {1811.03254},
EPRINTTYPE = {arXiv},
YEAR = {2018},
ABSTRACT = {When solving massive optimization problems in areas such as machine learning,<br>it is a common practice to seek speedup via massive parallelism. However,<br>especially in an asynchronous environment, there are limits on the possible<br>parallelism. Accordingly, we seek tight bounds on the viable parallelism in<br>asynchronous implementations of coordinate descent.<br> We focus on asynchronous coordinate descent (ACD) algorithms on convex<br>functions $F:\mathbb{R}^n \rightarrow \mathbb{R}$ of the form $$F(x) = f(x) ~+~<br>\sum_{k=1}^n \Psi_k(x_k),$$ where $f:\mathbb{R}^n \rightarrow \mathbb{R}$ is a<br>smooth convex function, and each $\Psi_k:\mathbb{R} \rightarrow \mathbb{R}$ is<br>a univariate and possibly non-smooth convex function.<br> Our approach is to quantify the shortfall in progress compared to the<br>standard sequential stochastic gradient descent. This leads to a truly simple<br>yet optimal analysis of the standard stochastic ACD in a partially asynchronous<br>environment, which already generalizes and improves on the bounds in prior<br>work. We also give a considerably more involved analysis for general<br>asynchronous environments in which the only constraint is that each update can<br>overlap with at most $q$ others, where $q$ is at most the number of processors<br>times the ratio in the lengths of the longest and shortest updates. The main<br>technical challenge is to demonstrate linear speedup in the latter environment.<br>This stems from the subtle interplay of asynchrony and randomization. This<br>improves Liu and Wright's (SIOPT'15) lower bound on the maximum degree of<br>parallelism almost quadratically, and we show that our new bound is almost<br>optimal.<br>},
}
