D5
Databases and Information Systems
2022
[1]
H. Arnaout, T.-K. Tran, D. Stepanova, M. H. Gad-Elrab, S. Razniewski, and G. Weikum, “Utilizing Language Model Probes for Knowledge Graph Repair,” in Wiki Workshop 2022, Virtual Event, 2022.
Export
BibTeX
% Workshop paper at Wiki Workshop 2022; no DOI, the URL is the workshop site.
@inproceedings{Arnaout_Wiki2022,
  AUTHOR       = {Arnaout, Hiba and Tran, Trung-Kien and Stepanova, Daria and Gad-Elrab, Mohamed Hassan and Razniewski, Simon and Weikum, Gerhard},
  TITLE        = {Utilizing Language Model Probes for Knowledge Graph Repair},
  BOOKTITLE    = {Wiki Workshop 2022},
  ADDRESS      = {Virtual Event},
  YEAR         = {2022},
  URL          = {https://wikiworkshop.org/2022/},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}
Endnote
%0 Conference Proceedings %A Arnaout, Hiba %A Tran, Trung-Kien %A Stepanova, Daria %A Gad-Elrab, Mohamed Hassan %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Utilizing Language Model Probes for Knowledge Graph Repair : %G eng %U http://hdl.handle.net/21.11116/0000-000A-63F4-3 %U https://wikiworkshop.org/2022/ %D 2022 %B Wiki Workshop 2022 %Z date of event: 2022-04-25 - 2022-04-25 %C Virtual Event %B Wiki Workshop 2022
[2]
H. Arnaout, S. Razniewski, G. Weikum, and J. Z. Pan, “UnCommonSense: Informative Negative Knowledge about Everyday Concepts,” in CIKM ’22, 31st ACM International Conference on Information and Knowledge Management, Atlanta GA USA, 2022.
Abstract
Commonsense knowledge about everyday concepts is an important asset for AI applications, such as question answering and chatbots. Recently, we have seen an increasing interest in the construction of structured commonsense knowledge bases (CSKBs). An important part of human commonsense is about properties that do not apply to concepts, yet existing CSKBs only store positive statements. Moreover, since CSKBs operate under the open-world assumption, absent statements are considered to have unknown truth rather than being invalid. This paper presents the UNCOMMONSENSE framework for materializing informative negative commonsense statements. Given a target concept, comparable concepts are identified in the CSKB, for which a local closed-world assumption is postulated. This way, positive statements about comparable concepts that are absent for the target concept become seeds for negative statement candidates. The large set of candidates is then scrutinized, pruned and ranked by informativeness. Intrinsic and extrinsic evaluations show that our method significantly outperforms the state-of-the-art. A large dataset of informative negations is released as a resource for future research.
Export
BibTeX
% Conference paper in the CIKM '22 proceedings (ACM).
% The abstract is plain running text; BibTeX collapses the line wrapping below.
@inproceedings{ArnaoutCIKM2022,
  TITLE        = {{UnCommonSense}: Informative Negative Knowledge about Everyday Concepts},
  AUTHOR       = {Arnaout, Hiba and Razniewski, Simon and Weikum, Gerhard and Pan, Jeff Z.},
  LANGUAGE     = {eng},
  ISBN         = {978-1-4503-9236-5},
  DOI          = {10.1145/3511808.3557484},
  PUBLISHER    = {ACM},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Commonsense knowledge about everyday concepts is an important asset for AI
                  applications, such as question answering and chatbots. Recently, we have seen
                  an increasing interest in the construction of structured commonsense knowledge
                  bases (CSKBs). An important part of human commonsense is about properties that
                  do not apply to concepts, yet existing CSKBs only store positive statements.
                  Moreover, since CSKBs operate under the open-world assumption, absent
                  statements are considered to have unknown truth rather than being invalid. This
                  paper presents the UNCOMMONSENSE framework for materializing informative
                  negative commonsense statements. Given a target concept, comparable concepts
                  are identified in the CSKB, for which a local closed-world assumption is
                  postulated. This way, positive statements about comparable concepts that are
                  absent for the target concept become seeds for negative statement candidates.
                  The large set of candidates is then scrutinized, pruned and ranked by
                  informativeness. Intrinsic and extrinsic evaluations show that our method
                  significantly outperforms the state-of-the-art. A large dataset of informative
                  negations is released as a resource for future research.},
  BOOKTITLE    = {CIKM '22, 31st ACM International Conference on Information and Knowledge Management},
  EDITOR       = {Al Hasan, Mohammad and Xiong, Li},
  PAGES        = {37--46},
  ADDRESS      = {Atlanta GA USA},
}
Endnote
%0 Conference Proceedings %A Arnaout, Hiba %A Razniewski, Simon %A Weikum, Gerhard %A Pan, Jeff Z. %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T UnCommonSense: Informative Negative Knowledge about Everyday Concepts : %G eng %U http://hdl.handle.net/21.11116/0000-000A-F224-C %R 10.1145/3511808.3557484 %D 2022 %B 31st ACM International Conference on Information and Knowledge Management %Z date of event: 2022-10-17 - 2022-10-21 %C Atlanta GA USA %X Commonsense knowledge about everyday concepts is an important asset for AI applications, such as question answering and chatbots. Recently, we have seen an increasing interest in the construction of structured commonsense knowledge bases (CSKBs). An important part of human commonsense is about properties that do not apply to concepts, yet existing CSKBs only store positive statements. Moreover, since CSKBs operate under the open-world assumption, absent statements are considered to have unknown truth rather than being invalid. This paper presents the UNCOMMONSENSE framework for materializing informative negative commonsense statements. Given a target concept, comparable concepts are identified in the CSKB, for which a local closed-world assumption is postulated. This way, positive statements about comparable concepts that are absent for the target concept become seeds for negative statement candidates. The large set of candidates is then scrutinized, pruned and ranked by informativeness. Intrinsic and extrinsic evaluations show that our method significantly outperforms the state-of-the-art. 
A large dataset of informative negations is released as a resource for future research. %K Computer Science, Artificial Intelligence, cs.AI,Computer Science, Databases, cs.DB,Computer Science, Information Retrieval, cs.IR %B CIKM '22 %E Al Hasan, Mohammad; Xiong, Li %P 37 - 46 %I ACM %@ 978-1-4503-9236-5
[3]
P. Christmann, R. Saha Roy, and G. Weikum, “Conversational Question Answering on Heterogeneous Sources,” in SIGIR ’22, 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, Madrid, Spain, 2022.
Export
BibTeX
% Conference paper in the SIGIR '22 proceedings (ACM).
@inproceedings{Christmann_SIGIR2022,
  TITLE        = {Conversational Question Answering on Heterogeneous Sources},
  AUTHOR       = {Christmann, Philipp and Saha Roy, Rishiraj and Weikum, Gerhard},
  LANGUAGE     = {eng},
  ISBN         = {978-1-4503-8732-3},
  DOI          = {10.1145/3477495.3531815},
  PUBLISHER    = {ACM},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {SIGIR '22, 45th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  EDITOR       = {Amigo, Enrique and Castells, Pablo and Gonzalo, Julio and Carterette, Ben and Culpepper, J. Shane and Kazai, Gabriella},
  PAGES        = {144--154},
  ADDRESS      = {Madrid, Spain},
}
Endnote
%0 Conference Proceedings %A Christmann, Philipp %A Saha Roy, Rishiraj %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Conversational Question Answering on Heterogeneous Sources : %G eng %U http://hdl.handle.net/21.11116/0000-000A-6148-8 %R 10.1145/3477495.3531815 %D 2022 %B 45th International ACM SIGIR Conference on Research and Development in Information Retrieval %Z date of event: 2022-07-11 - 2022-07-15 %C Madrid, Spain %B SIGIR '22 %E Amigo, Enrique; Castells, Pablo; Gonzalo, Julio; Carterette, Ben; Culpepper, J. Shane; Kazai, Gabriella %P 144 - 154 %I ACM %@ 978-1-4503-8732-3
[4]
P. Christmann, R. Saha Roy, and G. Weikum, “Beyond NED: Fast and Effective Search Space Reduction for Complex Question Answering over Knowledge Bases,” in WSDM ’22, Fifteenth ACM International Conference on Web Search and Data Mining, Tempe, AZ, USA (Virtual Event), 2022.
Export
BibTeX
% Conference paper in the WSDM '22 proceedings (ACM); the event was held virtually.
@inproceedings{Christmann_WSDM22,
  TITLE        = {Beyond {NED}: {Fast} and Effective Search Space Reduction for Complex Question Answering over Knowledge Bases},
  AUTHOR       = {Christmann, Philipp and Saha Roy, Rishiraj and Weikum, Gerhard},
  LANGUAGE     = {eng},
  ISBN         = {978-1-4503-9132-0},
  DOI          = {10.1145/3488560.3498488},
  PUBLISHER    = {ACM},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {WSDM '22, Fifteenth ACM International Conference on Web Search and Data Mining},
  PAGES        = {172--180},
  ADDRESS      = {Tempe, AZ, USA (Virtual Event)},
}
Endnote
%0 Conference Proceedings %A Christmann, Philipp %A Saha Roy, Rishiraj %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Beyond NED: Fast and Effective Search Space Reduction for Complex Question Answering over Knowledge Bases : %G eng %U http://hdl.handle.net/21.11116/0000-000A-27C6-B %R 10.1145/3488560.3498488 %D 2022 %B Fifteenth ACM International Conference on Web Search and Data Mining %Z date of event: 2022-02-21 - 2022-02-25 %C Tempe, AZ, USA (Virtual Event) %B WSDM '22 %P 172 - 180 %I ACM %@ 978-1-4503-9132-0
[5]
C. X. Chu, “Knowledge Extraction from Fictional Texts,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
Knowledge extraction from text is a key task in natural language processing, which involves many sub-tasks, such as taxonomy induction, named entity recognition and typing, relation extraction, knowledge canonicalization and so on. By constructing structured knowledge from natural language text, knowledge extraction becomes a key asset for search engines, question answering and other downstream applications. However, current knowledge extraction methods mostly focus on prominent real-world entities with Wikipedia and mainstream news articles as sources. The constructed knowledge bases, therefore, lack information about long-tail domains, with fiction and fantasy as archetypes. Fiction and fantasy are core parts of our human culture, spanning from literature to movies, TV series, comics and video games. With thousands of fictional universes which have been created, knowledge from fictional domains are subject of search-engine queries - by fans as well as cultural analysts. Unlike the real-world domain, knowledge extraction on such specific domains like fiction and fantasy has to tackle several key challenges: - Training data: Sources for fictional domains mostly come from books and fan-built content, which is sparse and noisy, and contains difficult structures of texts, such as dialogues and quotes. Training data for key tasks such as taxonomy induction, named entity typing or relation extraction are also not available. - Domain characteristics and diversity: Fictional universes can be highly sophisticated, containing entities, social structures and sometimes languages that are completely different from the real world. State-of-the-art methods for knowledge extraction make assumptions on entity-class, subclass and entity-entity relations that are often invalid for fictional domains. With different genres of fictional domains, another requirement is to transfer models across domains. 
- Long fictional texts: While state-of-the-art models have limitations on the input sequence length, it is essential to develop methods that are able to deal with very long texts (e.g. entire books), to capture multiple contexts and leverage widely spread cues. This dissertation addresses the above challenges, by developing new methodologies that advance the state of the art on knowledge extraction in fictional domains. - The first contribution is a method, called TiFi, for constructing type systems (taxonomy induction) for fictional domains. By tapping noisy fan-built content from online communities such as Wikia, TiFi induces taxonomies through three main steps: category cleaning, edge cleaning and top-level construction. Exploiting a variety of features from the original input, TiFi is able to construct taxonomies for a diverse range of fictional domains with high precision. - The second contribution is a comprehensive approach, called ENTYFI, for named entity recognition and typing in long fictional texts. Built on 205 automatically induced high-quality type systems for popular fictional domains, ENTYFI exploits the overlap and reuse of these fictional domains on unseen texts. By combining different typing modules with a consolidation stage, ENTYFI is able to do fine-grained entity typing in long fictional texts with high precision and recall. - The third contribution is an end-to-end system, called KnowFi, for extracting relations between entities in very long texts such as entire books. KnowFi leverages background knowledge from 142 popular fictional domains to identify interesting relations and to collect distant training samples. KnowFi devises a similarity-based ranking technique to reduce false positives in training samples and to select potential text passages that contain seed pairs of entities. 
By training a hierarchical neural network for all relations, KnowFi is able to infer relations between entity pairs across long fictional texts, and achieves gains over the best prior methods for relation extraction.
Export
BibTeX
% PhD thesis (Universitaet des Saarlandes, 2022).
% URL is the resolver form of the thesis URN urn:nbn:de:bsz:291--ds-361070, so it is a usable link.
% Only YEAR is given; the duplicate DATE field carried the same value.
@phdthesis{Chuphd2022,
  TITLE        = {Knowledge Extraction from Fictional Texts},
  AUTHOR       = {Chu, Cuong Xuan},
  LANGUAGE     = {eng},
  URL          = {https://nbn-resolving.org/urn:nbn:de:bsz:291--ds-361070},
  DOI          = {10.22028/D291-36107},
  SCHOOL       = {Universit{\"a}t des Saarlandes},
  ADDRESS      = {Saarbr{\"u}cken},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Knowledge extraction from text is a key task in natural language processing, which involves many sub-tasks, such as taxonomy induction, named entity recognition and typing, relation extraction, knowledge canonicalization and so on. By constructing structured knowledge from natural language text, knowledge extraction becomes a key asset for search engines, question answering and other downstream applications. However, current knowledge extraction methods mostly focus on prominent real-world entities with Wikipedia and mainstream news articles as sources. The constructed knowledge bases, therefore, lack information about long-tail domains, with fiction and fantasy as archetypes. Fiction and fantasy are core parts of our human culture, spanning from literature to movies, TV series, comics and video games. With thousands of fictional universes which have been created, knowledge from fictional domains are subject of search-engine queries -- by fans as well as cultural analysts. Unlike the real-world domain, knowledge extraction on such specific domains like fiction and fantasy has to tackle several key challenges: -- Training data: Sources for fictional domains mostly come from books and fan-built content, which is sparse and noisy, and contains difficult structures of texts, such as dialogues and quotes. Training data for key tasks such as taxonomy induction, named entity typing or relation extraction are also not available. -- Domain characteristics and diversity: Fictional universes can be highly sophisticated, containing entities, social structures and sometimes languages that are completely different from the real world.
                  State-of-the-art methods for knowledge extraction make assumptions on entity-class, subclass and entity-entity relations that are often invalid for fictional domains. With different genres of fictional domains, another requirement is to transfer models across domains. -- Long fictional texts: While state-of-the-art models have limitations on the input sequence length, it is essential to develop methods that are able to deal with very long texts (e.g. entire books), to capture multiple contexts and leverage widely spread cues. This dissertation addresses the above challenges, by developing new methodologies that advance the state of the art on knowledge extraction in fictional domains. -- The first contribution is a method, called TiFi, for constructing type systems (taxonomy induction) for fictional domains. By tapping noisy fan-built content from online communities such as Wikia, TiFi induces taxonomies through three main steps: category cleaning, edge cleaning and top-level construction. Exploiting a variety of features from the original input, TiFi is able to construct taxonomies for a diverse range of fictional domains with high precision. -- The second contribution is a comprehensive approach, called ENTYFI, for named entity recognition and typing in long fictional texts. Built on 205 automatically induced high-quality type systems for popular fictional domains, ENTYFI exploits the overlap and reuse of these fictional domains on unseen texts. By combining different typing modules with a consolidation stage, ENTYFI is able to do fine-grained entity typing in long fictional texts with high precision and recall. -- The third contribution is an end-to-end system, called KnowFi, for extracting relations between entities in very long texts such as entire books. KnowFi leverages background knowledge from 142 popular fictional domains to identify interesting relations and to collect distant training samples.
                  KnowFi devises a similarity-based ranking technique to reduce false positives in training samples and to select potential text passages that contain seed pairs of entities. By training a hierarchical neural network for all relations, KnowFi is able to infer relations between entity pairs across long fictional texts, and achieves gains over the best prior methods for relation extraction.},
}
Endnote
%0 Thesis %A Chu, Cuong Xuan %Y Weikum, Gerhard %A referee: Theobald, Martin %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Knowledge Extraction from Fictional Texts : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9598-2 %R 10.22028/D291-36107 %U nbn:de:bsz:291--ds-361070 %I Universit&#228;t des Saarlandes %C Saarbr&#252;cken %D 2022 %P 129 p. %V phd %9 phd %X Knowledge extraction from text is a key task in natural language processing, which involves many sub-tasks, such as taxonomy induction, named entity recognition and typing, relation extraction, knowledge canonicalization and so on. By constructing structured knowledge from natural language text, knowledge extraction becomes a key asset for search engines, question answering and other downstream applications. However, current knowledge extraction methods mostly focus on prominent real-world entities with Wikipedia and mainstream news articles as sources. The constructed knowledge bases, therefore, lack information about long-tail domains, with fiction and fantasy as archetypes. Fiction and fantasy are core parts of our human culture, spanning from literature to movies, TV series, comics and video games. With thousands of fictional universes which have been created, knowledge from fictional domains are subject of search-engine queries - by fans as well as cultural analysts. Unlike the real-world domain, knowledge extraction on such specific domains like fiction and fantasy has to tackle several key challenges: - Training data: Sources for fictional domains mostly come from books and fan-built content, which is sparse and noisy, and contains difficult structures of texts, such as dialogues and quotes. 
Training data for key tasks such as taxonomy induction, named entity typing or relation extraction are also not available. - Domain characteristics and diversity: Fictional universes can be highly sophisticated, containing entities, social structures and sometimes languages that are completely different from the real world. State-of-the-art methods for knowledge extraction make assumptions on entity-class, subclass and entity-entity relations that are often invalid for fictional domains. With different genres of fictional domains, another requirement is to transfer models across domains. - Long fictional texts: While state-of-the-art models have limitations on the input sequence length, it is essential to develop methods that are able to deal with very long texts (e.g. entire books), to capture multiple contexts and leverage widely spread cues. This dissertation addresses the above challenges, by developing new methodologies that advance the state of the art on knowledge extraction in fictional domains. - The first contribution is a method, called TiFi, for constructing type systems (taxonomy induction) for fictional domains. By tapping noisy fan-built content from online communities such as Wikia, TiFi induces taxonomies through three main steps: category cleaning, edge cleaning and top-level construction. Exploiting a variety of features from the original input, TiFi is able to construct taxonomies for a diverse range of fictional domains with high precision. - The second contribution is a comprehensive approach, called ENTYFI, for named entity recognition and typing in long fictional texts. Built on 205 automatically induced high-quality type systems for popular fictional domains, ENTYFI exploits the overlap and reuse of these fictional domains on unseen texts. By combining different typing modules with a consolidation stage, ENTYFI is able to do fine-grained entity typing in long fictional texts with high precision and recall. 
- The third contribution is an end-to-end system, called KnowFi, for extracting relations between entities in very long texts such as entire books. KnowFi leverages background knowledge from 142 popular fictional domains to identify interesting relations and to collect distant training samples. KnowFi devises a similarity-based ranking technique to reduce false positives in training samples and to select potential text passages that contain seed pairs of entities. By training a hierarchical neural network for all relations, KnowFi is able to infer relations between entity pairs across long fictional texts, and achieves gains over the best prior methods for relation extraction. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32914
[6]
D. Dave, A. Celestino, A. S. Varde, and V. Anu, “Management of Implicit Requirements Data in Large SRS Documents: Taxonomy and Techniques,” SIGMOD Record, vol. 51, no. 2, 2022.
Export
BibTeX
% Journal article in SIGMOD Record, vol. 51, no. 2.
% Acronyms and the post-colon word are brace-protected against style recasing.
@article{dave2022,
  TITLE        = {Management of Implicit Requirements Data in Large {SRS} Documents: {Taxonomy} and Techniques},
  AUTHOR       = {Dave, Dev and Celestino, Angelica and Varde, Aparna S. and Anu, Vaibhav},
  LANGUAGE     = {eng},
  ISSN         = {0163-5808},
  PUBLISHER    = {Special Interest Group on the Management of Data},
  ADDRESS      = {New York, NY},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {{SIGMOD} Record},
  VOLUME       = {51},
  NUMBER       = {2},
  PAGES        = {18--29},
}
Endnote
%0 Journal Article %A Dave, Dev %A Celestino, Angelica %A Varde, Aparna S. %A Anu, Vaibhav %+ External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Management of Implicit Requirements Data in Large SRS Documents: Taxonomy and Techniques : %G eng %U http://hdl.handle.net/21.11116/0000-000A-F1AD-3 %7 2022 %D 2022 %J SIGMOD Record %V 51 %N 2 %& 18 %P 18 - 29 %I Special Interest Group on the Management of Data %C New York, NY %@ 0163-5808
[7]
J. Fischer, “More than the sum of its parts,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
In this thesis we explore pattern mining and deep learning. Often seen as orthogonal, we show that these fields complement each other and propose to combine them to gain from each other’s strengths. We, first, show how to efficiently discover succinct and non-redundant sets of patterns that provide insight into data beyond conjunctive statements. We leverage the interpretability of such patterns to unveil how and which information flows through neural networks, as well as what characterizes their decisions. Conversely, we show how to combine continuous optimization with pattern discovery, proposing a neural network that directly encodes discrete patterns, which allows us to apply pattern mining at a scale orders of magnitude larger than previously possible. Large neural networks are, however, exceedingly expensive to train for which ‘lottery tickets’ – small, well-trainable sub-networks in randomly initialized neural networks – offer a remedy. We identify theoretical limitations of strong tickets and overcome them by equipping these tickets with the property of universal approximation. To analyze whether limitations in ticket sparsity are algorithmic or fundamental, we propose a framework to plant and hide lottery tickets. With novel ticket benchmarks we then conclude that the limitation is likely algorithmic, encouraging further developments for which our framework offers means to measure progress.
Export
BibTeX
% PhD thesis (Universitaet des Saarlandes, 2022).
% URL is the resolver form of the thesis URN urn:nbn:de:bsz:291--ds-370240, so it is a usable link.
% Only YEAR is given; the duplicate DATE field carried the same value.
@phdthesis{Fischerphd2022,
  TITLE        = {More than the sum of its parts},
  AUTHOR       = {Fischer, Jonas},
  LANGUAGE     = {eng},
  URL          = {https://nbn-resolving.org/urn:nbn:de:bsz:291--ds-370240},
  DOI          = {10.22028/D291-37024},
  SCHOOL       = {Universit{\"a}t des Saarlandes},
  ADDRESS      = {Saarbr{\"u}cken},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {In this thesis we explore pattern mining and deep learning. Often seen as orthogonal, we show that these fields complement each other and propose to combine them to gain from each other{\textquoteright}s strengths. We, first, show how to efficiently discover succinct and non-redundant sets of patterns that provide insight into data beyond conjunctive statements. We leverage the interpretability of such patterns to unveil how and which information flows through neural networks, as well as what characterizes their decisions. Conversely, we show how to combine continuous optimization with pattern discovery, proposing a neural network that directly encodes discrete patterns, which allows us to apply pattern mining at a scale orders of magnitude larger than previously possible. Large neural networks are, however, exceedingly expensive to train for which {\textquoteleft}lottery tickets{\textquoteright} -- small, well-trainable sub-networks in randomly initialized neural networks -- offer a remedy. We identify theoretical limitations of strong tickets and overcome them by equipping these tickets with the property of universal approximation. To analyze whether limitations in ticket sparsity are algorithmic or fundamental, we propose a framework to plant and hide lottery tickets. With novel ticket benchmarks we then conclude that the limitation is likely algorithmic, encouraging further developments for which our framework offers means to measure progress.},
}
Endnote
%0 Thesis %A Fischer, Jonas %Y Vreeken, Jilles %A referee: Weikum, Gerhard %A referee: Parthasarathy, Srinivasan %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T More than the sum of its parts : pattern mining neural networks, and how they complement each other %G eng %U http://hdl.handle.net/21.11116/0000-000B-38BF-0 %R 10.22028/D291-37024 %U nbn:de:bsz:291--ds-370240 %I Universit&#228;t des Saarlandes %C Saarbr&#252;cken %D 2022 %P 250 p. %V phd %9 phd %X In this thesis we explore pattern mining and deep learning. Often seen as orthogonal, we show that these fields complement each other and propose to combine them to gain from each other&#8217;s strengths. We, first, show how to efficiently discover succinct and non-redundant sets of patterns that provide insight into data beyond conjunctive statements. We leverage the interpretability of such patterns to unveil how and which information flows through neural networks, as well as what characterizes their decisions. Conversely, we show how to combine continuous optimization with pattern discovery, proposing a neural network that directly encodes discrete patterns, which allows us to apply pattern mining at a scale orders of magnitude larger than previously possible. Large neural networks are, however, exceedingly expensive to train for which &#8216;lottery tickets&#8217; &#8211; small, well-trainable sub-networks in randomly initialized neural networks &#8211; offer a remedy. We identify theoretical limitations of strong tickets and overcome them by equipping these tickets with the property of universal approximation. 
To analyze whether limitations in ticket sparsity are algorithmic or fundamental, we propose a framework to plant and hide lottery tickets. With novel ticket benchmarks we then conclude that the limitation is likely algorithmic, encouraging further developments for which our framework offers means to measure progress. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/33893
[8]
S. Ghosh, S. Razniewski, and G. Weikum, “Answering Count Queries with Explanatory Evidence,” in SIGIR ’22, 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, Madrid, Spain, 2022. (arXiv: 2204.05039)
Export
BibTeX
% Conference paper in the SIGIR '22 proceedings (ACM); an arXiv preprint is recorded via EPRINT.
@inproceedings{Ghosh_SIGIR22,
  TITLE        = {Answering Count Queries with Explanatory Evidence},
  AUTHOR       = {Ghosh, Shrestha and Razniewski, Simon and Weikum, Gerhard},
  LANGUAGE     = {eng},
  ISBN         = {978-1-4503-8732-3},
  DOI          = {10.1145/3477495.3531870},
  EPRINT       = {2204.05039},
  EPRINTTYPE   = {arXiv},
  PUBLISHER    = {ACM},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {SIGIR '22, 45th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  EDITOR       = {Amigo, Enrique and Castells, Pablo and Gonzalo, Julio and Carterette, Ben and Culpepper, J. Shane and Kazai, Gabriella},
  PAGES        = {2415--2419},
  ADDRESS      = {Madrid, Spain},
}
Endnote
%0 Conference Proceedings %A Ghosh, Shrestha %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Answering Count Queries with Explanatory Evidence : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9E36-8 %R 10.1145/3477495.3531870 %D 2022 %B 45th International ACM SIGIR Conference on Research and Development in Information Retrieval %Z date of event: 2022-07-11 - 2022-07-15 %C Madrid, Spain %B SIGIR '22 %E Amigo, Enrique; Castells, Pablo; Gonzalo, Julio; Carterette, Ben; Culpepper, J. Shane; Kazai, Gabriella %P 2415 - 2419 %I ACM %@ 978-1-4503-8732-3
[9]
S. Ghosh, S. Razniewski, and G. Weikum, “Answering Count Questions with Structured Answers from Text,” 2022. (arXiv: 2209.07250)
Abstract
In this work we address the challenging case of answering count queries in web search, such as ``number of songs by John Lennon''. Prior methods merely answer these with a single, and sometimes puzzling number or return a ranked list of text snippets with different numbers. This paper proposes a methodology for answering count queries with inference, contextualization and explanatory evidence. Unlike previous systems, our method infers final answers from multiple observations, supports semantic qualifiers for the counts, and provides evidence by enumerating representative instances. Experiments with a wide variety of queries, including existing benchmark show the benefits of our method, and the influence of specific parameter settings. Our code, data and an interactive system demonstration are publicly available at https://github.com/ghoshs/CoQEx and https://nlcounqer.mpi-inf.mpg.de/.
Export
BibTeX
@online{Ghosh_2209.07250,
  TITLE        = {Answering Count Questions with Structured Answers from Text},
  AUTHOR       = {Ghosh, Shrestha and Razniewski, Simon and Weikum, Gerhard},
  LANGUAGE     = {eng},
  DOI          = {10.48550/arXiv.2209.07250},
  EPRINT       = {2209.07250},
  EPRINTTYPE   = {arXiv},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {In this work we address the challenging case of answering count queries in web search, such as ``number of songs by John Lennon''. Prior methods merely answer these with a single, and sometimes puzzling number or return a ranked list of text snippets with different numbers. This paper proposes a methodology for answering count queries with inference, contextualization and explanatory evidence. Unlike previous systems, our method infers final answers from multiple observations, supports semantic qualifiers for the counts, and provides evidence by enumerating representative instances. Experiments with a wide variety of queries, including existing benchmark show the benefits of our method, and the influence of specific parameter settings. Our code, data and an interactive system demonstration are publicly available at https://github.com/ghoshs/CoQEx and https://nlcounqer.mpi-inf.mpg.de/.},
}
Endnote
%0 Report %A Ghosh, Shrestha %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Answering Count Questions with Structured Answers from Text : %G eng %U http://hdl.handle.net/21.11116/0000-000B-1D84-0 %R 10.48550/arXiv.2209.07250 %D 2022 %X In this work we address the challenging case of answering count queries in web search, such as ``number of songs by John Lennon''. Prior methods merely answer these with a single, and sometimes puzzling number or return a ranked list of text snippets with different numbers. This paper proposes a methodology for answering count queries with inference, contextualization and explanatory evidence. Unlike previous systems, our method infers final answers from multiple observations, supports semantic qualifiers for the counts, and provides evidence by enumerating representative instances. Experiments with a wide variety of queries, including existing benchmark show the benefits of our method, and the influence of specific parameter settings. Our code, data and an interactive system demonstration are publicly available at https://github.com/ghoshs/CoQEx and https://nlcounqer.mpi-inf.mpg.de/. %K Computer Science, Information Retrieval, cs.IR
[10]
A. Guimarães, “Data Science Methods for the Analysis of Controversial Social Media Discussions,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
Social media communities like Reddit and Twitter allow users to express their views on topics of their interest, and to engage with other users who may share or oppose these views. This can lead to productive discussions towards a consensus, or to contended debates, where disagreements frequently arise. Prior work on such settings has primarily focused on identifying notable instances of antisocial behavior such as hate-speech and “trolling”, which represent possible threats to the health of a community. These, however, are exceptionally severe phenomena, and do not encompass controversies stemming from user debates, differences of opinions, and off-topic content, all of which can naturally come up in a discussion without going so far as to compromise its development. This dissertation proposes a framework for the systematic analysis of social media discussions that take place in the presence of controversial themes, disagreements, and mixed opinions from participating users. For this, we develop a feature-based model to describe key elements of a discussion, such as its salient topics, the level of activity from users, the sentiments it expresses, and the user feedback it receives. Initially, we build our feature model to characterize adversarial discussions surrounding political campaigns on Twitter, with a focus on the factual and sentimental nature of their topics and the role played by different users involved. We then extend our approach to Reddit discussions, leveraging community feedback signals to define a new notion of controversy and to highlight conversational archetypes that arise from frequent and interesting interaction patterns. We use our feature model to build logistic regression classifiers that can predict future instances of controversy in Reddit communities centered on politics, world news, sports, and personal relationships.
Finally, our model also provides the basis for a comparison of different communities in the health domain, where topics and activity vary considerably despite their shared overall focus. In each of these cases, our framework provides insight into how user behavior can shape a community’s individual definition of controversy and its overall identity.
Export
BibTeX
@phdthesis{Decarvalhophd2021,
  TITLE        = {Data Science Methods for the Analysis of Controversial Social Media Discussions},
  AUTHOR       = {Guimar{\~a}es, Anna},
  LANGUAGE     = {eng},
  URL          = {nbn:de:bsz:291--ds-365021},
  DOI          = {10.22028/D291-36502},
  SCHOOL       = {Universit{\"a}t des Saarlandes},
  ADDRESS      = {Saarbr{\"u}cken},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  DATE         = {2022},
  ABSTRACT     = {Social media communities like Reddit and Twitter allow users to express their views on topics of their interest, and to engage with other users who may share or oppose these views. This can lead to productive discussions towards a consensus, or to contended debates, where disagreements frequently arise. Prior work on such settings has primarily focused on identifying notable instances of antisocial behavior such as hate-speech and {\textquotedblleft}trolling{\textquotedblright}, which represent possible threats to the health of a community. These, however, are exceptionally severe phenomena, and do not encompass controversies stemming from user debates, differences of opinions, and off-topic content, all of which can naturally come up in a discussion without going so far as to compromise its development. This dissertation proposes a framework for the systematic analysis of social media discussions that take place in the presence of controversial themes, disagreements, and mixed opinions from participating users. For this, we develop a feature-based model to describe key elements of a discussion, such as its salient topics, the level of activity from users, the sentiments it expresses, and the user feedback it receives. Initially, we build our feature model to characterize adversarial discussions surrounding political campaigns on Twitter, with a focus on the factual and sentimental nature of their topics and the role played by different users involved. We then extend our approach to Reddit discussions, leveraging community feedback signals to define a new notion of controversy and to highlight conversational archetypes that arise from frequent and interesting interaction patterns. We use our feature model to build logistic regression classifiers that can predict future instances of controversy in Reddit communities centered on politics, world news, sports, and personal relationships. Finally, our model also provides the basis for a comparison of different communities in the health domain, where topics and activity vary considerably despite their shared overall focus. In each of these cases, our framework provides insight into how user behavior can shape a community{\textquoteright}s individual definition of controversy and its overall identity.},
}
Endnote
%0 Thesis %A Guimarães, Anna %Y Weikum, Gerhard %A referee: de Melo, Gerard %A referee: Yates, Andrew %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Data Science Methods for the Analysis of Controversial Social Media Discussions : %G eng %U http://hdl.handle.net/21.11116/0000-000A-CDF7-9 %R 10.22028/D291-36502 %U nbn:de:bsz:291--ds-365021 %I Universität des Saarlandes %C Saarbrücken %D 2022 %P 94 p. %V phd %9 phd %X Social media communities like Reddit and Twitter allow users to express their views on topics of their interest, and to engage with other users who may share or oppose these views. This can lead to productive discussions towards a consensus, or to contended debates, where disagreements frequently arise. Prior work on such settings has primarily focused on identifying notable instances of antisocial behavior such as hate-speech and “trolling”, which represent possible threats to the health of a community. These, however, are exceptionally severe phenomena, and do not encompass controversies stemming from user debates, differences of opinions, and off-topic content, all of which can naturally come up in a discussion without going so far as to compromise its development. This dissertation proposes a framework for the systematic analysis of social media discussions that take place in the presence of controversial themes, disagreements, and mixed opinions from participating users.
For this, we develop a feature-based model to describe key elements of a discussion, such as its salient topics, the level of activity from users, the sentiments it expresses, and the user feedback it receives. Initially, we build our feature model to characterize adversarial discussions surrounding political campaigns on Twitter, with a focus on the factual and sentimental nature of their topics and the role played by different users involved. We then extend our approach to Reddit discussions, leveraging community feedback signals to define a new notion of controversy and to highlight conversational archetypes that arise from frequent and interesting interaction patterns. We use our feature model to build logistic regression classifiers that can predict future instances of controversy in Reddit communities centered on politics, world news, sports, and personal relationships. Finally, our model also provides the basis for a comparison of different communities in the health domain, where topics and activity vary considerably despite their shared overall focus. In each of these cases, our framework provides insight into how user behavior can shape a community’s individual definition of controversy and its overall identity. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/33161
[11]
V. T. Ho, D. Stepanova, D. Milchevski, J. Strötgen, and G. Weikum, “Enhancing Knowledge Bases with Quantity Facts,” in WWW ’22, ACM Web Conference, Virtual Event, Lyon, France, 2022.
Export
BibTeX
@inproceedings{Ho_WWW22,
  AUTHOR       = {Ho, Vinh Thinh and Stepanova, Daria and Milchevski, Dragan and Str{\"o}tgen, Jannik and Weikum, Gerhard},
  TITLE        = {Enhancing Knowledge Bases with Quantity Facts},
  BOOKTITLE    = {WWW '22, ACM Web Conference},
  EDITOR       = {Laforest, Fr{\'e}d{\'e}rique and Troncy, Rapha{\"e}l and Simperl, Elena and Agarwal, Deepak and Gionis, Aristides and Herman, Ivan and M{\'e}dini, Lionel},
  PAGES        = {893--901},
  PUBLISHER    = {ACM},
  ADDRESS      = {Virtual Event, Lyon, France},
  YEAR         = {2022},
  ISBN         = {978-1-4503-9096-5},
  DOI          = {10.1145/3485447.3511932},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}
Endnote
%0 Conference Proceedings %A Ho, Vinh Thinh %A Stepanova, Daria %A Milchevski, Dragan %A Strötgen, Jannik %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Enhancing Knowledge Bases with Quantity Facts : %G eng %U http://hdl.handle.net/21.11116/0000-000A-614E-2 %R 10.1145/3485447.3511932 %D 2022 %B ACM Web Conference %Z date of event: 2022-04-25 - 2022-04-29 %C Virtual Event, Lyon, France %B WWW '22 %E Laforest, Frédérique; Troncy, Raphaël; Simperl, Elena; Agarwal, Deepak; Gionis, Aristides; Herman, Ivan; Médini, Lionel %P 893 - 901 %I ACM %@ 978-1-4503-9096-5
[12]
P. Lahoti, K. Gummadi, and G. Weikum, “Responsible Model Deployment via Model-agnostic Uncertainty Learning,” Machine Learning, 2022.
Export
BibTeX
@article{Lahoti2022,
  AUTHOR       = {Lahoti, Preethi and Gummadi, Krishna and Weikum, Gerhard},
  TITLE        = {Responsible Model Deployment via Model-agnostic Uncertainty Learning},
  JOURNAL      = {Machine Learning},
  PUBLISHER    = {Springer},
  ADDRESS      = {Dordrecht},
  YEAR         = {2022},
  ISSN         = {0885-6125},
  DOI          = {10.1007/s10994-022-06248-y},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}
Endnote
%0 Journal Article %A Lahoti, Preethi %A Gummadi, Krishna %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Responsible Model Deployment via Model-agnostic Uncertainty Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000B-58F0-3 %R 10.1007/s10994-022-06248-y %7 2022 %D 2022 %J Machine Learning %I Springer %C Dordrecht %@ 0885-6125
[13]
P. Lahoti, K. Gummadi, and G. Weikum, “Detecting and Mitigating Test-time Failure Risks via Model-agnostic Uncertainty Learning,” in 21st IEEE International Conference on Data Mining (ICDM 2021), Auckland, New Zealand (Virtual Conference), 2022.
Export
BibTeX
@inproceedings{Gummadi_ICDM21,
  AUTHOR       = {Lahoti, Preethi and Gummadi, Krishna and Weikum, Gerhard},
  TITLE        = {Detecting and Mitigating Test-time Failure Risks via Model-agnostic Uncertainty Learning},
  BOOKTITLE    = {21st IEEE International Conference on Data Mining (ICDM 2021)},
  EDITOR       = {Bailey, James and Miettinen, Pauli and Koh, Yun Sing and Tao, Dacheng and Wu, Xindong},
  PAGES        = {1174--1179},
  PUBLISHER    = {IEEE},
  ADDRESS      = {Auckland, New Zealand (Virtual Conference)},
  YEAR         = {2021},
  DATE         = {2022},
  ISBN         = {978-1-6654-2398-4},
  DOI          = {10.1109/ICDM51629.2021.00141},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}
Endnote
%0 Conference Proceedings %A Lahoti, Preethi %A Gummadi, Krishna %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Detecting and Mitigating Test-time Failure Risks via Model-agnostic Uncertainty Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000A-5E15-6 %R 10.1109/ICDM51629.2021.00141 %D 2022 %B 21st IEEE International Conference on Data Mining %Z date of event: 2021-12-07 - 2021-12-10 %C Auckland, New Zealand (Virtual Conference) %B 21st IEEE International Conference on Data Mining %E Bailey, James; Miettinen, Pauli; Koh, Yun Sing; Tao, Dacheng; Wu, Xindong %P 1174 - 1179 %I IEEE %@ 978-1-6654-2398-4
[14]
P. Lahoti, “Operationalizing Fairness for Responsible Machine Learning,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
As machine learning (ML) is increasingly used for decision making in scenarios that impact humans, there is a growing awareness of its potential for unfairness. A large body of recent work has focused on proposing formal notions of fairness in ML, as well as approaches to mitigate unfairness. However, there is a growing disconnect between the ML fairness literature and the needs to operationalize fairness in practice. This thesis addresses the need for responsible ML by developing new models and methods to address challenges in operationalizing fairness in practice. Specifically, it makes the following contributions. First, we tackle a key assumption in the group fairness literature that sensitive demographic attributes such as race and gender are known upfront, and can be readily used in model training to mitigate unfairness. In practice, factors like privacy and regulation often prohibit ML models from collecting or using protected attributes in decision making. To address this challenge we introduce the novel notion of computationally-identifiable errors and propose Adversarially Reweighted Learning (ARL), an optimization method that seeks to improve the worst-case performance over unobserved groups, without requiring access to the protected attributes in the dataset. Second, we argue that while group fairness notions are a desirable fairness criterion, they are fundamentally limited as they reduce fairness to an average statistic over pre-identified protected groups. In practice, automated decisions are made at an individual level, and can adversely impact individual people irrespective of the group statistic. We advance the paradigm of individual fairness by proposing iFair (individually fair representations), an optimization approach for learning a low dimensional latent representation of the data with two goals: to encode the data as well as possible, while removing any information about protected attributes in the transformed representation. 
Third, we advance the individual fairness paradigm, which requires that similar individuals receive similar outcomes. However, similarity metrics computed over observed feature space can be brittle, and inherently limited in their ability to accurately capture similarity between individuals. To address this, we introduce a novel notion of fairness graphs, wherein pairs of individuals can be identified as deemed similar with respect to the ML objective. We cast the problem of individual fairness into graph embedding, and propose PFR (pairwise fair representations), a method to learn a unified pairwise fair representation of the data. Fourth, we tackle the challenge that production data after model deployment is constantly evolving. As a consequence, in spite of the best efforts in training a fair model, ML systems can be prone to failure risks due to a variety of unforeseen reasons. To ensure responsible model deployment, potential failure risks need to be predicted, and mitigation actions need to be devised, for example, deferring to a human expert when uncertain or collecting additional data to address model’s blind-spots. We propose Risk Advisor, a model-agnostic meta-learner to predict potential failure risks and to give guidance on the sources of uncertainty inducing the risks, by leveraging information theoretic notions of aleatoric and epistemic uncertainty. This dissertation brings ML fairness closer to real-world applications by developing methods that address key practical challenges. Extensive experiments on a variety of real-world and synthetic datasets show that our proposed methods are viable in practice.
Export
BibTeX
@phdthesis{Lahotophd2022,
  AUTHOR       = {Lahoti, Preethi},
  TITLE        = {Operationalizing Fairness for Responsible Machine Learning},
  SCHOOL       = {Universit{\"a}t des Saarlandes},
  ADDRESS      = {Saarbr{\"u}cken},
  YEAR         = {2022},
  DATE         = {2022},
  URL          = {nbn:de:bsz:291--ds-365860},
  DOI          = {10.22028/D291-36586},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {As machine learning (ML) is increasingly used for decision making in scenarios that impact humans, there is a growing awareness of its potential for unfairness. A large body of recent work has focused on proposing formal notions of fairness in ML, as well as approaches to mitigate unfairness. However, there is a growing disconnect between the ML fairness literature and the needs to operationalize fairness in practice. This thesis addresses the need for responsible ML by developing new models and methods to address challenges in operationalizing fairness in practice. Specifically, it makes the following contributions. First, we tackle a key assumption in the group fairness literature that sensitive demographic attributes such as race and gender are known upfront, and can be readily used in model training to mitigate unfairness. In practice, factors like privacy and regulation often prohibit ML models from collecting or using protected attributes in decision making. To address this challenge we introduce the novel notion of computationally-identifiable errors and propose Adversarially Reweighted Learning (ARL), an optimization method that seeks to improve the worst-case performance over unobserved groups, without requiring access to the protected attributes in the dataset. Second, we argue that while group fairness notions are a desirable fairness criterion, they are fundamentally limited as they reduce fairness to an average statistic over pre-identified protected groups. In practice, automated decisions are made at an individual level, and can adversely impact individual people irrespective of the group statistic. We advance the paradigm of individual fairness by proposing iFair (individually fair representations), an optimization approach for learning a low dimensional latent representation of the data with two goals: to encode the data as well as possible, while removing any information about protected attributes in the transformed representation. Third, we advance the individual fairness paradigm, which requires that similar individuals receive similar outcomes. However, similarity metrics computed over observed feature space can be brittle, and inherently limited in their ability to accurately capture similarity between individuals. To address this, we introduce a novel notion of fairness graphs, wherein pairs of individuals can be identified as deemed similar with respect to the ML objective. We cast the problem of individual fairness into graph embedding, and propose PFR (pairwise fair representations), a method to learn a unified pairwise fair representation of the data. Fourth, we tackle the challenge that production data after model deployment is constantly evolving. As a consequence, in spite of the best efforts in training a fair model, ML systems can be prone to failure risks due to a variety of unforeseen reasons. To ensure responsible model deployment, potential failure risks need to be predicted, and mitigation actions need to be devised, for example, deferring to a human expert when uncertain or collecting additional data to address model{\textquoteright}s blind-spots. We propose Risk Advisor, a model-agnostic meta-learner to predict potential failure risks and to give guidance on the sources of uncertainty inducing the risks, by leveraging information theoretic notions of aleatoric and epistemic uncertainty. This dissertation brings ML fairness closer to real-world applications by developing methods that address key practical challenges. Extensive experiments on a variety of real-world and synthetic datasets show that our proposed methods are viable in practice.},
}
Endnote
%0 Thesis %A Lahoti, Preethi %Y Weikum, Gerhard %A referee: Gummadi, Krishna %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Group K. Gummadi, Max Planck Institute for Software Systems, Max Planck Society %T Operationalizing Fairness for Responsible Machine Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000A-CEC6-F %R 10.22028/D291-36586 %U nbn:de:bsz:291--ds-365860 %I Universität des Saarlandes %C Saarbrücken %D 2022 %P 129 p. %V phd %9 phd %X As machine learning (ML) is increasingly used for decision making in scenarios that impact humans, there is a growing awareness of its potential for unfairness. A large body of recent work has focused on proposing formal notions of fairness in ML, as well as approaches to mitigate unfairness. However, there is a growing disconnect between the ML fairness literature and the needs to operationalize fairness in practice. This thesis addresses the need for responsible ML by developing new models and methods to address challenges in operationalizing fairness in practice. Specifically, it makes the following contributions. First, we tackle a key assumption in the group fairness literature that sensitive demographic attributes such as race and gender are known upfront, and can be readily used in model training to mitigate unfairness. In practice, factors like privacy and regulation often prohibit ML models from collecting or using protected attributes in decision making. To address this challenge we introduce the novel notion of computationally-identifiable errors and propose Adversarially Reweighted Learning (ARL), an optimization method that seeks to improve the worst-case performance over unobserved groups, without requiring access to the protected attributes in the dataset.
Second, we argue that while group fairness notions are a desirable fairness criterion, they are fundamentally limited as they reduce fairness to an average statistic over pre-identified protected groups. In practice, automated decisions are made at an individual level, and can adversely impact individual people irrespective of the group statistic. We advance the paradigm of individual fairness by proposing iFair (individually fair representations), an optimization approach for learning a low dimensional latent representation of the data with two goals: to encode the data as well as possible, while removing any information about protected attributes in the transformed representation. Third, we advance the individual fairness paradigm, which requires that similar individuals receive similar outcomes. However, similarity metrics computed over observed feature space can be brittle, and inherently limited in their ability to accurately capture similarity between individuals. To address this, we introduce a novel notion of fairness graphs, wherein pairs of individuals can be identified as deemed similar with respect to the ML objective. We cast the problem of individual fairness into graph embedding, and propose PFR (pairwise fair representations), a method to learn a unified pairwise fair representation of the data. Fourth, we tackle the challenge that production data after model deployment is constantly evolving. As a consequence, in spite of the best efforts in training a fair model, ML systems can be prone to failure risks due to a variety of unforeseen reasons. To ensure responsible model deployment, potential failure risks need to be predicted, and mitigation actions need to be devised, for example, deferring to a human expert when uncertain or collecting additional data to address model’s blind-spots.
We propose Risk Advisor, a model-agnostic meta-learner to predict potential failure risks and to give guidance on the sources of uncertainty inducing the risks, by leveraging information theoretic notions of aleatoric and epistemic uncertainty. This dissertation brings ML fairness closer to real-world applications by developing methods that address key practical challenges. Extensive experiments on a variety of real-world and synthetic datasets show that our proposed methods are viable in practice. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/33465
[15]
A. Marx and J. Fischer, “Estimating Mutual Information via Geodesic kNN,” in Proceedings of the SIAM International Conference on Data Mining (SDM 2022), Alexandria, VA, USA. (Accepted/in press)
Export
BibTeX
@inproceedings{Marx_SDM2022,
  TITLE        = {Estimating Mutual Information via Geodesic {$k$NN}},
  AUTHOR       = {Marx, Alexander and Fischer, Jonas},
  LANGUAGE     = {eng},
  DOI          = {10.1137/1.9781611976700.44},
  PUBLISHER    = {SIAM},
  YEAR         = {2022},
  PUBLREMARK   = {Accepted},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {Proceedings of the SIAM International Conference on Data Mining (SDM 2022)},
  ADDRESS      = {Alexandria, VA, USA},
}
Endnote
%0 Conference Proceedings %A Marx, Alexander %A Fischer, Jonas %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Estimating Mutual Information via Geodesic kNN : %G eng %U http://hdl.handle.net/21.11116/0000-0009-B19D-E %R 10.1137/1.9781611976700.44 %D 2021 %B SIAM International Conference on Data Mining %Z date of event: 2022-04-28 - 2022-04-30 %C Alexandria, VA, USA %B Proceedings of the SIAM International Conference on Data Mining %I SIAM
[16]
T. Nguyen, A. Yates, A. Zirikly, B. Desmet, and A. Cohan, “Improving the Generalizability of Depression Detection by Leveraging Clinical Questionnaires,” in The 60th Annual Meeting of the Association for Computational Linguistics (ACL 2022), Dublin, Ireland, 2022.
Export
BibTeX
@inproceedings{Nguyen_ACL22,
  AUTHOR       = {Nguyen, Thong and Yates, Andrew and Zirikly, Ayah and Desmet, Bart and Cohan, Arman},
  TITLE        = {Improving the Generalizability of Depression Detection by Leveraging Clinical Questionnaires},
  BOOKTITLE    = {The 60th Annual Meeting of the Association for Computational Linguistics (ACL 2022)},
  EDITOR       = {Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline},
  PAGES        = {8446--8459},
  PUBLISHER    = {ACL},
  ADDRESS      = {Dublin, Ireland},
  YEAR         = {2022},
  ISBN         = {978-1-955917-21-6},
  DOI          = {10.18653/v1/2022.acl-long.578},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}
Endnote
%0 Conference Proceedings %A Nguyen, Thong %A Yates, Andrew %A Zirikly, Ayah %A Desmet, Bart %A Cohan, Arman %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations %T Improving the Generalizability of Depression Detection by Leveraging Clinical Questionnaires : %G eng %U http://hdl.handle.net/21.11116/0000-000B-1DAA-6 %R 10.18653/v1/2022.acl-long.578 %D 2022 %B 60th Annual Meeting of the Association for Computational Linguistics %Z date of event: 2022-05-22 - 2022-05-27 %C Dublin, Ireland %B The 60th Annual Meeting of the Association for Computational Linguistics %E Muresan, Smaranda; Nakov, Preslav; Villavicencio, Aline %P 8446 - 8459 %I ACL %@ 978-1-955917-21-6
[17]
T.-P. Nguyen, S. Razniewski, A. Varde, and G. Weikum, “Extracting Cultural Commonsense Knowledge at Scale,” 2022. [Online]. Available: https://arxiv.org/abs/2210.07763. (arXiv: 2210.07763)
Abstract
Structured knowledge is important for many AI applications. Commonsense knowledge, which is crucial for robust human-centric AI, is covered by a small number of structured knowledge projects. However, they lack knowledge about human traits and behaviors conditioned on socio-cultural contexts, which is crucial for situative AI. This paper presents CANDLE, an end-to-end methodology for extracting high-quality cultural commonsense knowledge (CCSK) at scale. CANDLE extracts CCSK assertions from a huge web corpus and organizes them into coherent clusters, for 3 domains of subjects (geography, religion, occupation) and several cultural facets (food, drinks, clothing, traditions, rituals, behaviors). CANDLE includes judicious techniques for classification-based filtering and scoring of interestingness. Experimental evaluations show the superiority of the CANDLE CCSK collection over prior works, and an extrinsic use case demonstrates the benefits of CCSK for the GPT-3 language model. Code and data can be accessed at https://cultural-csk.herokuapp.com/.
Export
BibTeX
% arXiv preprint 2210.07763 (CANDLE: cultural commonsense knowledge extraction).
@online{Nguyen2210.07763,
  TITLE        = {Extracting Cultural Commonsense Knowledge at Scale},
  AUTHOR       = {Nguyen, Tuan-Phong and Razniewski, Simon and Varde, Aparna and Weikum, Gerhard},
  LANGUAGE     = {eng},
  URL          = {https://arxiv.org/abs/2210.07763},
  DOI          = {10.48550/arXiv.2210.07763},
  EPRINT       = {2210.07763},
  EPRINTTYPE   = {arXiv},
  EPRINTCLASS  = {cs.CL},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Structured knowledge is important for many AI applications. Commonsense knowledge, which is crucial for robust human-centric AI, is covered by a small number of structured knowledge projects. However, they lack knowledge about human traits and behaviors conditioned on socio-cultural contexts, which is crucial for situative AI. This paper presents CANDLE, an end-to-end methodology for extracting high-quality cultural commonsense knowledge (CCSK) at scale. CANDLE extracts CCSK assertions from a huge web corpus and organizes them into coherent clusters, for 3 domains of subjects (geography, religion, occupation) and several cultural facets (food, drinks, clothing, traditions, rituals, behaviors). CANDLE includes judicious techniques for classification-based filtering and scoring of interestingness. Experimental evaluations show the superiority of the CANDLE CCSK collection over prior works, and an extrinsic use case demonstrates the benefits of CCSK for the GPT-3 language model. Code and data can be accessed at https://cultural-csk.herokuapp.com/.},
}
Endnote
%0 Report %A Nguyen, Tuan-Phong %A Razniewski, Simon %A Varde, Aparna %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Extracting Cultural Commonsense Knowledge at Scale : %G eng %U http://hdl.handle.net/21.11116/0000-000B-58B3-8 %U https://arxiv.org/abs/2210.07763 %R 10.48550/arXiv.2210.07763 %D 2022 %X Structured knowledge is important for many AI applications. Commonsense knowledge, which is crucial for robust human-centric AI, is covered by a small number of structured knowledge projects. However, they lack knowledge about human traits and behaviors conditioned on socio-cultural contexts, which is crucial for situative AI. This paper presents CANDLE, an end-to-end methodology for extracting high-quality cultural commonsense knowledge (CCSK) at scale. CANDLE extracts CCSK assertions from a huge web corpus and organizes them into coherent clusters, for 3 domains of subjects (geography, religion, occupation) and several cultural facets (food, drinks, clothing, traditions, rituals, behaviors). CANDLE includes judicious techniques for classification-based filtering and scoring of interestingness. Experimental evaluations show the superiority of the CANDLE CCSK collection over prior works, and an extrinsic use case demonstrates the benefits of CCSK for the GPT-3 language model. Code and data can be accessed at https://cultural-csk.herokuapp.com/. %K Computer Science, Computation and Language, cs.CL,Computer Science, Artificial Intelligence, cs.AI
[18]
T.-P. Nguyen and S. Razniewski, “Materialized Knowledge Bases from Commonsense Transformers,” in Proceedings of the First Workshop on Commonsense Representation and Reasoning (CSRR 2022), Dublin, Ireland, 2022.
Export
BibTeX
% Workshop paper, CSRR 2022 (ADDRESS holds the event location, as elsewhere in this list).
@inproceedings{Nguyen_CSRR22,
  TITLE        = {Materialized Knowledge Bases from Commonsense Transformers},
  AUTHOR       = {Nguyen, Tuan-Phong and Razniewski, Simon},
  LANGUAGE     = {eng},
  ISBN         = {978-1-955917-28-5},
  URL          = {https://openreview.net/forum?id=HI5M4MYedZ5},
  PUBLISHER    = {ACL},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {Proceedings of the First Workshop on Commonsense Representation and Reasoning (CSRR 2022)},
  EDITOR       = {Bosselut, Antoine and Li, Xiang and Lin, Bill Yuchen and Shwartz, Vered and Majumder, Bodhisattwa Prasad and Lal, Yash Kumar and Rudinger, Rachel and Ren, Xiang and Tandon, Niket and Zouhar, Vil{\'e}m},
  PAGES        = {36--42},
  ADDRESS      = {Dublin, Ireland},
}
Endnote
%0 Conference Proceedings %A Nguyen, Tuan-Phong %A Razniewski, Simon %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Materialized Knowledge Bases from Commonsense Transformers : %G eng %U http://hdl.handle.net/21.11116/0000-000B-1D87-D %U https://openreview.net/forum?id=HI5M4MYedZ5 %D 2022 %B 1st Workshop on Commonsense Representation and Reasoning %Z date of event: 2022-05-27 - 2022-05-27 %C Dublin, Ireland %B Proceedings of the First Workshop on Commonsense Representation and Reasoning %E Bosselut, Antoine; Li, Xiang; Lin, Bill Yuchen; Shwartz, Vered; Majumder, Bodhisattwa Prasad; Lal, Yash Kumar; Rudinger, Rachel; Ren, Xiang; Tandon, Niket; Zouhar, Vilém %P 36 - 42 %I ACL %@ 978-1-955917-28-5
[19]
R. Pradeep, Y. Liu, X. Zhang, Y. Li, A. Yates, and J. Lin, “Squeezing Water from a Stone: A Bag of Tricks for Further Improving Cross-Encoder Effectiveness for Reranking,” in Advances in Information Retrieval (ECIR 2022), Stavanger, Norway, 2022.
Export
BibTeX
% Conference paper, ECIR 2022 (LNCS 13185); ADDRESS holds the event location.
@inproceedings{Pradeep_ECIR2022,
  TITLE        = {Squeezing Water from a Stone: {A} Bag of Tricks for Further Improving Cross-Encoder Effectiveness for Reranking},
  AUTHOR       = {Pradeep, Ronak and Liu, Yuqi and Zhang, Xinyu and Li, Yilin and Yates, Andrew and Lin, Jimmy},
  LANGUAGE     = {eng},
  ISBN         = {978-3-030-99736-6},
  DOI          = {10.1007/978-3-030-99736-6_44},
  PUBLISHER    = {Springer},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {Advances in Information Retrieval (ECIR 2022)},
  EDITOR       = {Hagen, Matthias and Verberne, Suzan and Macdonald, Craig and Seifert, Christin and Balog, Krisztian and N{\o}rv{\aa}g, Kjetil and Setty, Vinay},
  PAGES        = {655--670},
  SERIES       = {Lecture Notes in Computer Science},
  VOLUME       = {13185},
  ADDRESS      = {Stavanger, Norway},
}
Endnote
%0 Conference Proceedings %A Pradeep, Ronak %A Liu, Yuqi %A Zhang, Xinyu %A Li, Yilin %A Yates, Andrew %A Lin, Jimmy %+ External Organizations External Organizations External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Squeezing Water from a Stone: A Bag of Tricks for Further Improving Cross-Encoder Effectiveness for Reranking : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9E28-8 %R 10.1007/978-3-030-99736-6_44 %D 2022 %B 44th European Conference on IR Research %Z date of event: 2022-04-10 - 2022-04-14 %C Stavanger, Norway %B Advances in Information Retrieval %E Hagen, Matthias; Verberne, Suzan; Macdonald, Craig; Seifert, Christin; Balog, Krisztian; Nørvåg, Kjetil; Setty, Vinay %P 655 - 670 %I Springer %@ 978-3-030-99736-6 %B Lecture Notes in Computer Science %N 13185
[20]
M. Puri, A. S. Varde, and G. de Melo, “Commonsense Based Text Mining on Urban Policy,” Language Resources and Evaluation, 2022.
Export
BibTeX
% Journal article (Language Resources and Evaluation); no volume/pages recorded in this list.
@article{Puri2022,
  TITLE        = {Commonsense Based Text Mining on Urban Policy},
  AUTHOR       = {Puri, Manish and Varde, Aparna S. and de Melo, Gerard},
  LANGUAGE     = {eng},
  ISSN         = {1574-020X},
  EISSN        = {1574-0218},
  DOI          = {10.1007/s10579-022-09584-6},
  PUBLISHER    = {Springer},
  ADDRESS      = {New York, NY},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {Language Resources and Evaluation},
}
Endnote
%0 Journal Article %A Puri, Manish %A Varde, Aparna S. %A de Melo, Gerard %+ External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Commonsense Based Text Mining on Urban Policy : %G eng %U http://hdl.handle.net/21.11116/0000-000A-20AC-0 %R 10.1007/s10579-022-09584-6 %7 2022 %D 2022 %J Language Resources and Evaluation %O Computers and the Humanities Lang Resources & Evaluation %I Springer %C New York, NY %@ 1574-020X %U https://rdcu.be/cJwGl
[21]
J. Romero and S. Razniewski, “Do Children Texts Hold The Key To Commonsense Knowledge?,” 2022. [Online]. Available: https://arxiv.org/abs/2210.04530. (arXiv: 2210.04530)
Abstract
Compiling comprehensive repositories of commonsense knowledge is a long-standing problem in AI. Many concerns revolve around the issue of reporting bias, i.e., that frequency in text sources is not a good proxy for relevance or truth. This paper explores whether children's texts hold the key to commonsense knowledge compilation, based on the hypothesis that such content makes fewer assumptions on the reader's knowledge, and therefore spells out commonsense more explicitly. An analysis with several corpora shows that children's texts indeed contain much more, and more typical commonsense assertions. Moreover, experiments show that this advantage can be leveraged in popular language-model-based commonsense knowledge extraction settings, where task-unspecific fine-tuning on small amounts of children texts (childBERT) already yields significant improvements. This provides a refreshing perspective different from the common trend of deriving progress from ever larger models and corpora.
Export
BibTeX
% arXiv preprint 2210.04530 (commonsense knowledge from children's texts).
@online{Romero2210.04530,
  TITLE        = {Do Children Texts Hold The Key To Commonsense Knowledge?},
  AUTHOR       = {Romero, Julien and Razniewski, Simon},
  LANGUAGE     = {eng},
  URL          = {https://arxiv.org/abs/2210.04530},
  DOI          = {10.48550/arXiv.2210.04530},
  EPRINT       = {2210.04530},
  EPRINTTYPE   = {arXiv},
  EPRINTCLASS  = {cs.CL},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Compiling comprehensive repositories of commonsense knowledge is a long-standing problem in AI. Many concerns revolve around the issue of reporting bias, i.e., that frequency in text sources is not a good proxy for relevance or truth. This paper explores whether children's texts hold the key to commonsense knowledge compilation, based on the hypothesis that such content makes fewer assumptions on the reader's knowledge, and therefore spells out commonsense more explicitly. An analysis with several corpora shows that children's texts indeed contain much more, and more typical commonsense assertions. Moreover, experiments show that this advantage can be leveraged in popular language-model-based commonsense knowledge extraction settings, where task-unspecific fine-tuning on small amounts of children texts (childBERT) already yields significant improvements. This provides a refreshing perspective different from the common trend of deriving progress from ever larger models and corpora.},
}
Endnote
%0 Report %A Romero, Julien %A Razniewski, Simon %+ External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Do Children Texts Hold The Key To Commonsense Knowledge? : %G eng %U http://hdl.handle.net/21.11116/0000-000B-58AA-3 %U https://arxiv.org/abs/2210.04530 %R 10.48550/arXiv.2210.04530 %D 2022 %X Compiling comprehensive repositories of commonsense knowledge is a long-standing problem in AI. Many concerns revolve around the issue of reporting bias, i.e., that frequency in text sources is not a good proxy for relevance or truth. This paper explores whether children's texts hold the key to commonsense knowledge compilation, based on the hypothesis that such content makes fewer assumptions on the reader's knowledge, and therefore spells out commonsense more explicitly. An analysis with several corpora shows that children's texts indeed contain much more, and more typical commonsense assertions. Moreover, experiments show that this advantage can be leveraged in popular language-model-based commonsense knowledge extraction settings, where task-unspecific fine-tuning on small amounts of children texts (childBERT) already yields significant improvements. This provides a refreshing perspective different from the common trend of deriving progress from ever larger models and corpora. %K Computer Science, Computation and Language, cs.CL,Computer Science, Artificial Intelligence, cs.AI
[22]
S. Singhania, T.-P. Nguyen, and S. Razniewski, Eds., Knowledge Base Construction from Pre-trained Language Models 2022. CEUR-WS, 2022.
Export
BibTeX
% Edited proceedings volume (CEUR-WS Vol. 3274); URN kept in its own field so URL is a single resolvable link.
@proceedings{SinghaniaLMKBC22,
  TITLE        = {Knowledge Base Construction from Pre-trained Language Models 2022 (LM-KBC 2022)},
  EDITOR       = {Singhania, Sneha and Nguyen, Tuan-Phong and Razniewski, Simon},
  LANGUAGE     = {eng},
  URL          = {http://ceur-ws.org/Vol-3274/},
  URN          = {urn:nbn:de:0074-3274-1},
  PUBLISHER    = {CEUR-WS},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  SERIES       = {CEUR Workshop Proceedings},
  VOLUME       = {3274},
  ADDRESS      = {Virtual Event, Hangzhou, China},
}
Endnote
%0 Conference Proceedings %E Singhania, Sneha %E Nguyen, Tuan-Phong %E Razniewski, Simon %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Knowledge Base Construction from Pre-trained Language Models 2022 : Proceedings of the Semantic Web Challenge on Knowledge Base Construction from Pre-trained Language Models 2022 co-located with the 21st International Semantic Web Conference (ISWC2022) %G eng %U http://hdl.handle.net/21.11116/0000-000B-C723-D %U urn:nbn:de:0074-3274-1 %U http://ceur-ws.org/Vol-3274/ %I CEUR-WS %D 2022 %B Semantic Web Challenge on Knowledge Base Construction from Pre-trained Language Models %Z date of event: 2022-10 - 2022-10 %D 2022 %C Virtual Event, Hangzhou, China %S CEUR Workshop Proceedings %V 3274
[23]
S. Singhania, S. Razniewski, and G. Weikum, “Predicting Document Coverage for Relation Extraction,” Transactions of the Association for Computational Linguistics, vol. 10, 2022.
Export
BibTeX
% Journal article (TACL, vol. 10).
@article{Singhania2022,
  TITLE        = {Predicting Document Coverage for Relation Extraction},
  AUTHOR       = {Singhania, Sneha and Razniewski, Simon and Weikum, Gerhard},
  LANGUAGE     = {eng},
  ISSN         = {2307-387X},
  DOI          = {10.1162/tacl_a_00456},
  PUBLISHER    = {MIT Press},
  ADDRESS      = {Cambridge, MA},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {Transactions of the Association for Computational Linguistics},
  VOLUME       = {10},
  PAGES        = {207--223},
}
Endnote
%0 Journal Article %A Singhania, Sneha %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Predicting Document Coverage for Relation Extraction : %G eng %U http://hdl.handle.net/21.11116/0000-000A-27B8-B %R 10.1162/tacl_a_00456 %7 2022 %D 2022 %J Transactions of the Association for Computational Linguistics %V 10 %& 207 %P 207 - 223 %I ACL %C Cambridge, MA %@ 2307-387X
[24]
A. Tigunova, “Extracting Personal Information from Conversations,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers’ personal attributes: • Demographic attributes, age, gender, profession and family status, are inferred by HAMs - hierarchical neural classifiers with attention mechanism. Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. • Long-tailed personal attributes, hobby and profession, are predicted with CHARM - a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. 
By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. • Interpersonal relationships are inferred with PRIDE - a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines.
Export
BibTeX
% PhD thesis (Universitaet des Saarlandes); URL is the repository landing page, URN kept in its own field.
@phdthesis{Tiguphd2022,
  TITLE        = {Extracting Personal Information from Conversations},
  AUTHOR       = {Tigunova, Anna},
  LANGUAGE     = {eng},
  URL          = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32546},
  URN          = {urn:nbn:de:bsz:291--ds-356280},
  DOI          = {10.22028/D291-35628},
  SCHOOL       = {Universit{\"a}t des Saarlandes},
  ADDRESS      = {Saarbr{\"u}cken},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers{\textquoteright} personal attributes: \mbox{$\bullet$} Demographic attributes, age, gender, profession and family status, are inferred by HAMs -- hierarchical neural classifiers with attention mechanism.
Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. \mbox{$\bullet$} Long-tailed personal attributes, hobby and profession, are predicted with CHARM -- a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. \mbox{$\bullet$} Interpersonal relationships are inferred with PRIDE -- a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines.},
}
Endnote
%0 Thesis %A Tigunova, Anna %Y Weikum, Gerhard %A referee: Yates, Andrew %A referee: Demberg, Vera %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Extracting Personal Information from Conversations : %G eng %U http://hdl.handle.net/21.11116/0000-000B-3FE1-1 %R 10.22028/D291-35628 %U nbn:de:bsz:291--ds-356280 %I Universit&#228;t des Saarlandes %C Saarbr&#252;cken %D 2022 %P 139 p. %V phd %9 phd %X Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. 
In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers&#8217; personal attributes: &#8226; Demographic attributes, age, gender, profession and family status, are inferred by HAMs - hierarchical neural classifiers with attention mechanism. Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. &#8226; Long-tailed personal attributes, hobby and profession, are predicted with CHARM - a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. &#8226; Interpersonal relationships are inferred with PRIDE - a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32546
[25]
A. Tigunova, “Extracting personal information from conversations,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers’ personal attributes: • Demographic attributes, age, gender, profession and family status, are inferred by HAMs - hierarchical neural classifiers with attention mechanism. Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. • Long-tailed personal attributes, hobby and profession, are predicted with CHARM - a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. 
By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. • Interpersonal relationships are inferred with PRIDE - a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines.
Export
BibTeX
% PhD thesis (Universitaet des Saarlandes). Same work and DOI as entry Tiguphd2022; kept under its own key
% so existing citations keep resolving. NOTE(review): consider merging the two records upstream.
@phdthesis{Tiguphd2021,
  TITLE        = {Extracting Personal Information from Conversations},
  AUTHOR       = {Tigunova, Anna},
  LANGUAGE     = {eng},
  URL          = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32546},
  URN          = {urn:nbn:de:bsz:291--ds-356280},
  DOI          = {10.22028/D291-35628},
  SCHOOL       = {Universit{\"a}t des Saarlandes},
  ADDRESS      = {Saarbr{\"u}cken},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers{\textquoteright} personal attributes: \mbox{$\bullet$} Demographic attributes, age, gender, profession and family status, are inferred by HAMs -- hierarchical neural classifiers with attention mechanism.
Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. \mbox{$\bullet$} Long-tailed personal attributes, hobby and profession, are predicted with CHARM -- a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. \mbox{$\bullet$} Interpersonal relationships are inferred with PRIDE -- a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines.},
}
Endnote
%0 Thesis %A Tigunova, Anna %Y Weikum, Gerhard %A referee: Yates, Andrew %A referee: Demberg, Vera %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Extracting personal information from conversations : %G eng %U http://hdl.handle.net/21.11116/0000-000A-3C9E-2 %R 10.22028/D291-35628 %U nbn:de:bsz:291--ds-356280 %I Universität des Saarlandes %C Saarbrücken %D 2022 %P 126 p. %V phd %9 phd %X Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. 
In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers&#8217; personal attributes: &#8226; Demographic attributes, age, gender, profession and family status, are inferred by HAMs - hierarchical neural classifiers with attention mechanism. Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. &#8226; Long-tailed personal attributes, hobby and profession, are predicted with CHARM - a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. &#8226; Interpersonal relationships are inferred with PRIDE - a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32546
[26]
A. S. Varde, “Computational Estimation by Scientific Data Mining with Classical Methods to Automate Learning Strategies of Scientists,” ACM Transactions on Knowledge Discovery from Data, vol. 16, no. 5, 2022.
Export
BibTeX
@article{Varde2022b,
  author       = {Varde, Aparna S.},
  title        = {Computational Estimation by Scientific Data Mining with Classical Methods to Automate Learning Strategies of Scientists},
  journal      = {ACM Transactions on Knowledge Discovery from Data},
  volume       = {16},
  number       = {5},
  pages        = {1--52},
  eid          = {86},
  year         = {2022},
  publisher    = {ACM},
  address      = {New York, NY},
  doi          = {10.1145/3502736},
  language     = {eng},
  marginalmark = {$\bullet$},
}
Endnote
%0 Journal Article %A Varde, Aparna S. %+ Databases and Information Systems, MPI for Informatics, Max Planck Society %T Computational Estimation by Scientific Data Mining with Classical Methods to Automate Learning Strategies of Scientists : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9D92-0 %R 10.1145/3502736 %7 2022 %D 2022 %J ACM Transactions on Knowledge Discovery from Data %V 16 %N 5 %& 1 %P 1 - 52 %Z sequence number: 86 %I ACM %C New York, NY
[27]
A. S. Varde, A. Pandey, and X. Du, “Prediction Tool on Fine Particle Pollutants and Air Quality for Environmental Engineering,” SN Computer Science, vol. 3, no. 3, 2022.
Export
BibTeX
@article{Varde2022,
  author       = {Varde, Aparna S. and Pandey, Abidha and Du, Xu},
  title        = {Prediction Tool on Fine Particle Pollutants and Air Quality for Environmental Engineering},
  journal      = {SN Computer Science},
  volume       = {3},
  number       = {3},
  eid          = {184},
  year         = {2022},
  publisher    = {Springer Nature},
  address      = {Singapore},
  issn         = {2661-8907},
  doi          = {10.1007/s42979-022-01068-2},
  language     = {eng},
  marginalmark = {$\bullet$},
}
Endnote
%0 Journal Article %A Varde, Aparna S. %A Pandey, Abidha %A Du, Xu %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations %T Prediction Tool on Fine Particle Pollutants and Air Quality for Environmental Engineering : %G eng %U http://hdl.handle.net/21.11116/0000-000A-2F55-3 %R 10.1007/s42979-022-01068-2 %7 2022 %D 2022 %J SN Computer Science %V 3 %N 3 %Z sequence number: 184 %I Springer Nature %C Singapore %@ 2661-8907