@online{Carmel_2504.17776,
TITLE = {Fitting Tree Metrics and Ultrametrics in Data Streams},
AUTHOR = {Carmel, Amir and Das, Debarati and Kipouridis, Evangelos and Pipis, Evangelos},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2504.17776},
EPRINT = {2504.17776},
EPRINTTYPE = {arXiv},
YEAR = {2025},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Fitting distances to tree metrics and ultrametrics are two widely used<br>methods in hierarchical clustering, primarily explored within the context of<br>numerical taxonomy. Given a positive distance function<br>$D:\binom{V}{2}\rightarrow\mathbb{R}_{>0}$, the goal is to find a tree (or<br>ultrametric) $T$ including all elements of set $V$ such that the difference<br>between the distances among vertices in $T$ and those specified by $D$ is<br>minimized. In this paper, we initiate the study of ultrametric and tree metric<br>fitting problems in the semi-streaming model, where the distances between pairs<br>of elements from $V$ (with $|V|=n$), defined by the function $D$, can arrive in<br>an arbitrary order. We study these problems under various distance norms:<br> For the $\ell_0$ objective, we provide a single-pass polynomial-time<br>$\tilde{O}(n)$-space $O(1)$ approximation algorithm for ultrametrics and prove<br>that no single-pass exact algorithm exists, even with exponential time.<br> Next, we show that the algorithm for $\ell_0$ implies an $O(\Delta/\delta)$<br>approximation for the $\ell_1$ objective, where $\Delta$ is the maximum and<br>$\delta$ is the minimum absolute difference between distances in the input.<br>This bound matches the best-known approximation for the RAM model using a<br>combinatorial algorithm when $\Delta/\delta=O(n)$.<br> For the $\ell_\infty$ objective, we provide a complete characterization of<br>the ultrametric fitting problem. We present a single-pass polynomial-time<br>$\tilde{O}(n)$-space 2-approximation algorithm and show that no better than<br>2-approximation is possible, even with exponential time. We also show that,<br>with an additional pass, it is possible to achieve a polynomial-time exact<br>algorithm for ultrametrics.<br> Finally, we extend the results for all these objectives to tree metrics by<br>using only one additional pass through the stream and without asymptotically<br>increasing the approximation factor.<br>},
}
