@online{Arkami2412.01968,
TITLE = {On the Theoretical Foundations of Data Exchange Economies},
AUTHOR = {Akrami, Hannaneh and Ray Chaudhury, Bhaskar and Garg, Jugal and Murhekar, Aniket},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2412.01968},
EPRINT = {2412.01968},
EPRINTTYPE = {arXiv},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
ABSTRACT = {The immense success of ML systems relies heavily on large-scale, high-quality<br>data. The high demand for data has led to many paradigms that involve selling,<br>exchanging, and sharing data, motivating the study of economic processes with<br>data as an asset. However, data differs from classical economic assets in terms<br>of free duplication: there is no concept of limited supply since it can be<br>replicated at zero marginal cost. This distinction introduces fundamental<br>differences between economic processes involving data and those concerning<br>other assets.<br> We study a parallel to exchange (Arrow-Debreu) markets where data is the<br>asset. Here, agents with datasets exchange data fairly and voluntarily, aiming<br>for mutual benefit without monetary compensation. This framework is<br>particularly relevant for non-profit organizations that seek to improve their<br>ML models through data exchange, yet are restricted from selling their data for<br>profit.<br> We propose a general framework for data exchange, built on two core<br>principles: (i) fairness, ensuring that each agent receives utility<br>proportional to their contribution to others; contributions are quantifiable<br>using standard credit-sharing functions like the Shapley value, and (ii)<br>stability, ensuring that no coalition of agents can identify an exchange among<br>themselves which they unanimously prefer to the current exchange. We show that<br>fair and stable exchanges exist for all monotone continuous utility functions.<br>Next, we investigate the computational complexity of finding approximate fair<br>and stable exchanges. We present a local search algorithm for instances with<br>monotone submodular utility functions, where each agent contributions are<br>measured using the Shapley value. We prove that this problem lies in CLS under<br>mild assumptions. Our framework opens up several intriguing theoretical<br>directions for research in data economics.<br>},
}
