D5
Databases and Information Systems
2022
[1]
H. Arnaout, T.-K. Tran, D. Stepanova, M. H. Gad-Elrab, S. Razniewski, and G. Weikum, “Utilizing Language Model Probes for Knowledge Graph Repair,” in Wiki Workshop 2022, Virtual Event, 2022.
Export
BibTeX
@inproceedings{Arnaout_Wiki2022,
  author    = {Arnaout, Hiba and Tran, Trung-Kien and Stepanova, Daria and Gad-Elrab, Mohamed Hassan and Razniewski, Simon and Weikum, Gerhard},
  title     = {Utilizing Language Model Probes for Knowledge Graph Repair},
  booktitle = {Wiki Workshop 2022},
  address   = {Virtual Event},
  year      = {2022},
  language  = {eng},
  url       = {https://wikiworkshop.org/2022/},
}
Endnote
%0 Conference Proceedings %A Arnaout, Hiba %A Tran, Trung-Kien %A Stepanova, Daria %A Gad-Elrab, Mohamed Hassan %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Utilizing Language Model Probes for Knowledge Graph Repair : %G eng %U http://hdl.handle.net/21.11116/0000-000A-63F4-3 %U https://wikiworkshop.org/2022/ %D 2022 %B Wiki Workshop 2022 %Z date of event: 2022-04-25 - 2022-04-25 %C Virtual Event %B Wiki Workshop 2022
[2]
P. Christmann, R. Saha Roy, and G. Weikum, “Conversational Question Answering on Heterogeneous Sources,” 2022. [Online]. Available: https://arxiv.org/abs/2204.11677v1. (arXiv: 2204.11677)
Abstract
Conversational question answering (ConvQA) tackles sequential information needs where contexts in follow-up questions are left implicit. Current ConvQA systems operate over homogeneous sources of information: either a knowledge base (KB), or a text corpus, or a collection of tables. This paper addresses the novel issue of jointly tapping into all of these together, this way boosting answer coverage and confidence. We present CONVINSE, an end-to-end pipeline for ConvQA over heterogeneous sources, operating in three stages: i) learning an explicit structured representation of an incoming question and its conversational context, ii) harnessing this frame-like representation to uniformly capture relevant evidences from KB, text, and tables, and iii) running a fusion-in-decoder model to generate the answer. We construct and release the first benchmark, ConvMix, for ConvQA over heterogeneous sources, comprising 3000 real-user conversations with 16000 questions, along with entity annotations, completed question utterances, and question paraphrases. Experiments demonstrate the viability and advantages of our method, compared to state-of-the-art baselines.
Export
BibTeX
@online{Christmann2022,
  author     = {Christmann, Philipp and Saha Roy, Rishiraj and Weikum, Gerhard},
  title      = {Conversational Question Answering on Heterogeneous Sources},
  year       = {2022},
  eprint     = {2204.11677},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2204.11677v1},
  language   = {eng},
  abstract   = {Conversational question answering (ConvQA) tackles sequential information needs where contexts in follow-up questions are left implicit. Current ConvQA systems operate over homogeneous sources of information: either a knowledge base (KB), or a text corpus, or a collection of tables. This paper addresses the novel issue of jointly tapping into all of these together, this way boosting answer coverage and confidence. We present CONVINSE, an end-to-end pipeline for ConvQA over heterogeneous sources, operating in three stages: i) learning an explicit structured representation of an incoming question and its conversational context, ii) harnessing this frame-like representation to uniformly capture relevant evidences from KB, text, and tables, and iii) running a fusion-in-decoder model to generate the answer. We construct and release the first benchmark, ConvMix, for ConvQA over heterogeneous sources, comprising 3000 real-user conversations with 16000 questions, along with entity annotations, completed question utterances, and question paraphrases. Experiments demonstrate the viability and advantages of our method, compared to state-of-the-art baselines.},
}
Endnote
%0 Report %A Christmann, Philipp %A Saha Roy, Rishiraj %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Conversational Question Answering on Heterogeneous Sources : %G eng %U http://hdl.handle.net/21.11116/0000-000A-6148-8 %U https://arxiv.org/abs/2204.11677v1 %D 2022 %X Conversational question answering (ConvQA) tackles sequential information needs where contexts in follow-up questions are left implicit. Current ConvQA systems operate over homogeneous sources of information: either a knowledge base (KB), or a text corpus, or a collection of tables. This paper addresses the novel issue of jointly tapping into all of these together, this way boosting answer coverage and confidence. We present CONVINSE, an end-to-end pipeline for ConvQA over heterogeneous sources, operating in three stages: i) learning an explicit structured representation of an incoming question and its conversational context, ii) harnessing this frame-like representation to uniformly capture relevant evidences from KB, text, and tables, and iii) running a fusion-in-decoder model to generate the answer. We construct and release the first benchmark, ConvMix, for ConvQA over heterogeneous sources, comprising 3000 real-user conversations with 16000 questions, along with entity annotations, completed question utterances, and question paraphrases. Experiments demonstrate the viability and advantages of our method, compared to state-of-the-art baselines. %K Computer Science, Information Retrieval, cs.IR,Computer Science, Computation and Language, cs.CL
[3]
P. Christmann, R. Saha Roy, and G. Weikum, “Beyond NED: Fast and Effective Search Space Reduction for Complex Question Answering over Knowledge Bases,” in WSDM ’22, Fifteenth ACM International Conference on Web Search and Data Mining, Tempe, AZ, USA (Virtual Event), 2022.
Export
BibTeX
@inproceedings{Christmann_WSDM22,
  author    = {Christmann, Philipp and Saha Roy, Rishiraj and Weikum, Gerhard},
  title     = {Beyond {NED}: {Fast} and Effective Search Space Reduction for Complex Question Answering over Knowledge Bases},
  booktitle = {WSDM '22, Fifteenth ACM International Conference on Web Search and Data Mining},
  pages     = {172--180},
  publisher = {ACM},
  address   = {Tempe, AZ, USA (Virtual Event)},
  year      = {2022},
  isbn      = {978-1-4503-9132-0},
  doi       = {10.1145/3488560.3498488},
  language  = {eng},
}
Endnote
%0 Conference Proceedings %A Christmann, Philipp %A Saha Roy, Rishiraj %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Beyond NED: Fast and Effective Search Space Reduction for Complex Question Answering over Knowledge Bases : %G eng %U http://hdl.handle.net/21.11116/0000-000A-27C6-B %R 10.1145/3488560.3498488 %D 2022 %B Fifteenth ACM International Conference on Web Search and Data Mining %Z date of event: 2022-02-21 - 2022-02-25 %C Tempe, AZ, USA (Virtual Event) %B WSDM '22 %P 172 - 180 %I ACM %@ 978-1-4503-9132-0
[4]
C. X. Chu, “Knowledge Extraction from Fictional Texts,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
Knowledge extraction from text is a key task in natural language processing, which involves many sub-tasks, such as taxonomy induction, named entity recognition and typing, relation extraction, knowledge canonicalization and so on. By constructing structured knowledge from natural language text, knowledge extraction becomes a key asset for search engines, question answering and other downstream applications. However, current knowledge extraction methods mostly focus on prominent real-world entities with Wikipedia and mainstream news articles as sources. The constructed knowledge bases, therefore, lack information about long-tail domains, with fiction and fantasy as archetypes. Fiction and fantasy are core parts of our human culture, spanning from literature to movies, TV series, comics and video games. With thousands of fictional universes which have been created, knowledge from fictional domains are subject of search-engine queries - by fans as well as cultural analysts. Unlike the real-world domain, knowledge extraction on such specific domains like fiction and fantasy has to tackle several key challenges: - Training data: Sources for fictional domains mostly come from books and fan-built content, which is sparse and noisy, and contains difficult structures of texts, such as dialogues and quotes. Training data for key tasks such as taxonomy induction, named entity typing or relation extraction are also not available. - Domain characteristics and diversity: Fictional universes can be highly sophisticated, containing entities, social structures and sometimes languages that are completely different from the real world. State-of-the-art methods for knowledge extraction make assumptions on entity-class, subclass and entity-entity relations that are often invalid for fictional domains. With different genres of fictional domains, another requirement is to transfer models across domains. 
- Long fictional texts: While state-of-the-art models have limitations on the input sequence length, it is essential to develop methods that are able to deal with very long texts (e.g. entire books), to capture multiple contexts and leverage widely spread cues. This dissertation addresses the above challenges, by developing new methodologies that advance the state of the art on knowledge extraction in fictional domains. - The first contribution is a method, called TiFi, for constructing type systems (taxonomy induction) for fictional domains. By tapping noisy fan-built content from online communities such as Wikia, TiFi induces taxonomies through three main steps: category cleaning, edge cleaning and top-level construction. Exploiting a variety of features from the original input, TiFi is able to construct taxonomies for a diverse range of fictional domains with high precision. - The second contribution is a comprehensive approach, called ENTYFI, for named entity recognition and typing in long fictional texts. Built on 205 automatically induced high-quality type systems for popular fictional domains, ENTYFI exploits the overlap and reuse of these fictional domains on unseen texts. By combining different typing modules with a consolidation stage, ENTYFI is able to do fine-grained entity typing in long fictional texts with high precision and recall. - The third contribution is an end-to-end system, called KnowFi, for extracting relations between entities in very long texts such as entire books. KnowFi leverages background knowledge from 142 popular fictional domains to identify interesting relations and to collect distant training samples. KnowFi devises a similarity-based ranking technique to reduce false positives in training samples and to select potential text passages that contain seed pairs of entities. 
By training a hierarchical neural network for all relations, KnowFi is able to infer relations between entity pairs across long fictional texts, and achieves gains over the best prior methods for relation extraction.
Export
BibTeX
@phdthesis{Chuphd2022,
  author   = {Chu, Cuong Xuan},
  title    = {Knowledge Extraction from Fictional Texts},
  school   = {Universit{\"a}t des Saarlandes},
  address  = {Saarbr{\"u}cken},
  year     = {2022},
  doi      = {10.22028/D291-36107},
  url      = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32914},
  urn      = {urn:nbn:de:bsz:291--ds-361070},
  language = {eng},
  abstract = {Knowledge extraction from text is a key task in natural language processing, which involves many sub-tasks, such as taxonomy induction, named entity recognition and typing, relation extraction, knowledge canonicalization and so on. By constructing structured knowledge from natural language text, knowledge extraction becomes a key asset for search engines, question answering and other downstream applications. However, current knowledge extraction methods mostly focus on prominent real-world entities with Wikipedia and mainstream news articles as sources. The constructed knowledge bases, therefore, lack information about long-tail domains, with fiction and fantasy as archetypes. Fiction and fantasy are core parts of our human culture, spanning from literature to movies, TV series, comics and video games. With thousands of fictional universes which have been created, knowledge from fictional domains are subject of search-engine queries -- by fans as well as cultural analysts. Unlike the real-world domain, knowledge extraction on such specific domains like fiction and fantasy has to tackle several key challenges: -- Training data: Sources for fictional domains mostly come from books and fan-built content, which is sparse and noisy, and contains difficult structures of texts, such as dialogues and quotes. Training data for key tasks such as taxonomy induction, named entity typing or relation extraction are also not available. -- Domain characteristics and diversity: Fictional universes can be highly sophisticated, containing entities, social structures and sometimes languages that are completely different from the real world.
    State-of-the-art methods for knowledge extraction make assumptions on entity-class, subclass and entity-entity relations that are often invalid for fictional domains. With different genres of fictional domains, another requirement is to transfer models across domains. -- Long fictional texts: While state-of-the-art models have limitations on the input sequence length, it is essential to develop methods that are able to deal with very long texts (e.g. entire books), to capture multiple contexts and leverage widely spread cues. This dissertation addresses the above challenges, by developing new methodologies that advance the state of the art on knowledge extraction in fictional domains. -- The first contribution is a method, called TiFi, for constructing type systems (taxonomy induction) for fictional domains. By tapping noisy fan-built content from online communities such as Wikia, TiFi induces taxonomies through three main steps: category cleaning, edge cleaning and top-level construction. Exploiting a variety of features from the original input, TiFi is able to construct taxonomies for a diverse range of fictional domains with high precision. -- The second contribution is a comprehensive approach, called ENTYFI, for named entity recognition and typing in long fictional texts. Built on 205 automatically induced high-quality type systems for popular fictional domains, ENTYFI exploits the overlap and reuse of these fictional domains on unseen texts. By combining different typing modules with a consolidation stage, ENTYFI is able to do fine-grained entity typing in long fictional texts with high precision and recall. -- The third contribution is an end-to-end system, called KnowFi, for extracting relations between entities in very long texts such as entire books. KnowFi leverages background knowledge from 142 popular fictional domains to identify interesting relations and to collect distant training samples.
    KnowFi devises a similarity-based ranking technique to reduce false positives in training samples and to select potential text passages that contain seed pairs of entities. By training a hierarchical neural network for all relations, KnowFi is able to infer relations between entity pairs across long fictional texts, and achieves gains over the best prior methods for relation extraction.},
}
Endnote
%0 Thesis %A Chu, Cuong Xuan %Y Weikum, Gerhard %A referee: Theobald, Martin %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Knowledge Extraction from Fictional Texts : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9598-2 %R 10.22028/D291-36107 %U nbn:de:bsz:291--ds-361070 %I Universität des Saarlandes %C Saarbrücken %D 2022 %P 129 p. %V phd %9 phd %X Knowledge extraction from text is a key task in natural language processing, which involves many sub-tasks, such as taxonomy induction, named entity recognition and typing, relation extraction, knowledge canonicalization and so on. By constructing structured knowledge from natural language text, knowledge extraction becomes a key asset for search engines, question answering and other downstream applications. However, current knowledge extraction methods mostly focus on prominent real-world entities with Wikipedia and mainstream news articles as sources. The constructed knowledge bases, therefore, lack information about long-tail domains, with fiction and fantasy as archetypes. Fiction and fantasy are core parts of our human culture, spanning from literature to movies, TV series, comics and video games. With thousands of fictional universes which have been created, knowledge from fictional domains are subject of search-engine queries - by fans as well as cultural analysts. Unlike the real-world domain, knowledge extraction on such specific domains like fiction and fantasy has to tackle several key challenges: - Training data: Sources for fictional domains mostly come from books and fan-built content, which is sparse and noisy, and contains difficult structures of texts, such as dialogues and quotes. 
Training data for key tasks such as taxonomy induction, named entity typing or relation extraction are also not available. - Domain characteristics and diversity: Fictional universes can be highly sophisticated, containing entities, social structures and sometimes languages that are completely different from the real world. State-of-the-art methods for knowledge extraction make assumptions on entity-class, subclass and entity-entity relations that are often invalid for fictional domains. With different genres of fictional domains, another requirement is to transfer models across domains. - Long fictional texts: While state-of-the-art models have limitations on the input sequence length, it is essential to develop methods that are able to deal with very long texts (e.g. entire books), to capture multiple contexts and leverage widely spread cues. This dissertation addresses the above challenges, by developing new methodologies that advance the state of the art on knowledge extraction in fictional domains. - The first contribution is a method, called TiFi, for constructing type systems (taxonomy induction) for fictional domains. By tapping noisy fan-built content from online communities such as Wikia, TiFi induces taxonomies through three main steps: category cleaning, edge cleaning and top-level construction. Exploiting a variety of features from the original input, TiFi is able to construct taxonomies for a diverse range of fictional domains with high precision. - The second contribution is a comprehensive approach, called ENTYFI, for named entity recognition and typing in long fictional texts. Built on 205 automatically induced high-quality type systems for popular fictional domains, ENTYFI exploits the overlap and reuse of these fictional domains on unseen texts. By combining different typing modules with a consolidation stage, ENTYFI is able to do fine-grained entity typing in long fictional texts with high precision and recall. 
- The third contribution is an end-to-end system, called KnowFi, for extracting relations between entities in very long texts such as entire books. KnowFi leverages background knowledge from 142 popular fictional domains to identify interesting relations and to collect distant training samples. KnowFi devises a similarity-based ranking technique to reduce false positives in training samples and to select potential text passages that contain seed pairs of entities. By training a hierarchical neural network for all relations, KnowFi is able to infer relations between entity pairs across long fictional texts, and achieves gains over the best prior methods for relation extraction. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32914
[5]
S. Ghosh, S. Razniewski, and G. Weikum, “Answering Count Queries with Explanatory Evidence,” 2022. [Online]. Available: https://arxiv.org/abs/2204.05039. (arXiv: 2204.05039)
Abstract
A challenging case in web search and question answering are count queries, such as "number of songs by John Lennon". Prior methods merely answer these with a single, and sometimes puzzling number or return a ranked list of text snippets with different numbers. This paper proposes a methodology for answering count queries with inference, contextualization and explanatory evidence. Unlike previous systems, our method infers final answers from multiple observations, supports semantic qualifiers for the counts, and provides evidence by enumerating representative instances. Experiments with a wide variety of queries show the benefits of our method. To promote further research on this underexplored topic, we release an annotated dataset of 5k queries with 200k relevant text spans.
Export
BibTeX
@online{Ghosh2204.05039,
  author     = {Ghosh, Shrestha and Razniewski, Simon and Weikum, Gerhard},
  title      = {Answering Count Queries with Explanatory Evidence},
  year       = {2022},
  eprint     = {2204.05039},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2204.05039},
  language   = {eng},
  abstract   = {A challenging case in web search and question answering are count queries, such as \textit{"number of songs by John Lennon"}. Prior methods merely answer these with a single, and sometimes puzzling number or return a ranked list of text snippets with different numbers. This paper proposes a methodology for answering count queries with inference, contextualization and explanatory evidence. Unlike previous systems, our method infers final answers from multiple observations, supports semantic qualifiers for the counts, and provides evidence by enumerating representative instances. Experiments with a wide variety of queries show the benefits of our method. To promote further research on this underexplored topic, we release an annotated dataset of 5k queries with 200k relevant text spans.},
}
Endnote
%0 Report %A Ghosh, Shrestha %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Answering Count Queries with Explanatory Evidence : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9E36-8 %U https://arxiv.org/abs/2204.05039 %D 2022 %X A challenging case in web search and question answering are count queries, such as \textit{"number of songs by John Lennon"}. Prior methods merely answer these with a single, and sometimes puzzling number or return a ranked list of text snippets with different numbers. This paper proposes a methodology for answering count queries with inference, contextualization and explanatory evidence. Unlike previous systems, our method infers final answers from multiple observations, supports semantic qualifiers for the counts, and provides evidence by enumerating representative instances. Experiments with a wide variety of queries show the benefits of our method. To promote further research on this underexplored topic, we release an annotated dataset of 5k queries with 200k relevant text spans. %K Computer Science, Information Retrieval, cs.IR
[6]
V. T. Ho, D. Stepanova, D. Milchevski, J. Strötgen, and G. Weikum, “Enhancing Knowledge Bases with Quantity Facts,” in WWW ’22, ACM Web Conference, Virtual Event, Lyon, France, 2022.
Export
BibTeX
@inproceedings{Ho_WWW22,
  author    = {Ho, Vinh Thinh and Stepanova, Daria and Milchevski, Dragan and Str{\"o}tgen, Jannik and Weikum, Gerhard},
  title     = {Enhancing Knowledge Bases with Quantity Facts},
  booktitle = {WWW '22, ACM Web Conference},
  editor    = {Laforest, Fr{\'e}d{\'e}rique and Troncy, Rapha{\"e}l and Simperl, Elena and Agarwal, Deepak and Gionis, Aristides and Herman, Ivan and M{\'e}dini, Lionel},
  pages     = {893--901},
  publisher = {ACM},
  address   = {Virtual Event, Lyon, France},
  year      = {2022},
  isbn      = {978-1-4503-9096-5},
  doi       = {10.1145/3485447.3511932},
  language  = {eng},
}
Endnote
%0 Conference Proceedings %A Ho, Vinh Thinh %A Stepanova, Daria %A Milchevski, Dragan %A Strötgen, Jannik %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Enhancing Knowledge Bases with Quantity Facts : %G eng %U http://hdl.handle.net/21.11116/0000-000A-614E-2 %R 10.1145/3485447.3511932 %D 2022 %B ACM Web Conference %Z date of event: 2022-04-25 - 2022-04-29 %C Virtual Event, Lyon, France %B WWW '22 %E Laforest, Frédérique; Troncy, Raphaël; Simperl, Elena; Agarwal, Deepak; Gionis, Aristides; Herman, Ivan; Médini, Lionel %P 893 - 901 %I ACM %@ 978-1-4503-9096-5
[7]
P. Lahoti, K. Gummadi, and G. Weikum, “Detecting and Mitigating Test-time Failure Risks via Model-agnostic Uncertainty Learning,” in 21st IEEE International Conference on Data Mining (ICDM 2021), Auckland, New Zealand (Virtual Conference), 2022.
Export
BibTeX
@inproceedings{Gummadi_ICDM21,
  author    = {Lahoti, Preethi and Gummadi, Krishna and Weikum, Gerhard},
  title     = {Detecting and Mitigating Test-time Failure Risks via Model-agnostic Uncertainty Learning},
  booktitle = {21st IEEE International Conference on Data Mining (ICDM 2021)},
  editor    = {Bailey, James and Miettinen, Pauli and Koh, Yun Sing and Tao, Dacheng and Wu, Xindong},
  pages     = {1174--1179},
  publisher = {IEEE},
  address   = {Auckland, New Zealand (Virtual Conference)},
  year      = {2022},
  isbn      = {978-1-6654-2398-4},
  doi       = {10.1109/ICDM51629.2021.00141},
  language  = {eng},
}
Endnote
%0 Conference Proceedings %A Lahoti, Preethi %A Gummadi, Krishna %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society %T Detecting and Mitigating Test-time Failure Risks via Model-agnostic Uncertainty Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000A-5E15-6 %R 10.1109/ICDM51629.2021.00141 %D 2022 %B 21st IEEE International Conference on Data Mining %Z date of event: 2021-12-07 - 2021-12-10 %C Auckland, New Zealand (Virtual Conference) %B 21st IEEE International Conference on Data Mining %E Bailey, James; Miettinen, Pauli; Koh, Yun Sing; Tao, Dacheng; Wu, Xindong %P 1174 - 1179 %I IEEE %@ 978-1-6654-2398-4
[8]
A. Marx and J. Fischer, “Estimating Mutual Information via Geodesic kNN,” in Proceedings of the SIAM International Conference on Data Mining (SDM 2022), Alexandria, VA, USA. (Accepted/in press)
Export
BibTeX
@inproceedings{Marx_SDM2022,
  author       = {Marx, Alexander and Fischer, Jonas},
  title        = {Estimating Mutual Information via Geodesic {$k$NN}},
  booktitle    = {Proceedings of the SIAM International Conference on Data Mining (SDM 2022)},
  publisher    = {SIAM},
  address      = {Alexandria, VA, USA},
  year         = {2022},
  doi          = {10.1137/1.9781611976700.44},
  language     = {eng},
  publremark   = {Accepted},
  marginalmark = {$\bullet$},
}
Endnote
%0 Conference Proceedings %A Marx, Alexander %A Fischer, Jonas %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Estimating Mutual Information via Geodesic kNN : %G eng %U http://hdl.handle.net/21.11116/0000-0009-B19D-E %R 10.1137/1.9781611976700.44 %D 2021 %B SIAM International Conference on Data Mining %Z date of event: 2022-04-28 - 2022-04-30 %C Alexandria, VA, USA %B Proceedings of the SIAM International Conference on Data Mining %I SIAM
[9]
R. Pradeep, Y. Liu, X. Zhang, Y. Li, A. Yates, and J. Lin, “Squeezing Water from a Stone: A Bag of Tricks for Further Improving Cross-Encoder Effectiveness for Reranking,” in Advances in Information Retrieval (ECIR 2022), Stavanger, Norway, 2022.
Export
BibTeX
@inproceedings{Pradeep_ECIR2022,
  author    = {Pradeep, Ronak and Liu, Yuqi and Zhang, Xinyu and Li, Yilin and Yates, Andrew and Lin, Jimmy},
  title     = {Squeezing Water from a Stone: {A} Bag of Tricks for Further Improving Cross-Encoder Effectiveness for Reranking},
  booktitle = {Advances in Information Retrieval (ECIR 2022)},
  editor    = {Hagen, Matthias and Verberne, Suzan and Macdonald, Craig and Seifert, Christin and Balog, Krisztian and N{\o}rv{\aa}g, Kjetil and Setty, Vinay},
  series    = {Lecture Notes in Computer Science},
  volume    = {13185},
  pages     = {655--670},
  publisher = {Springer},
  address   = {Stavanger, Norway},
  year      = {2022},
  isbn      = {978-3-030-99736-6},
  doi       = {10.1007/978-3-030-99736-6_44},
  language  = {eng},
}
Endnote
%0 Conference Proceedings %A Pradeep, Ronak %A Liu, Yuqi %A Zhang, Xinyu %A Li, Yilin %A Yates, Andrew %A Lin, Jimmy %+ External Organizations External Organizations External Organizations External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Squeezing Water from a Stone: A Bag of Tricks for Further Improving Cross-Encoder Effectiveness for Reranking : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9E28-8 %R 10.1007/978-3-030-99736-6_44 %D 2022 %B 44th European Conference on IR Research %Z date of event: 2022-04-10 - 2022-04-14 %C Stavanger, Norway %B Advances in Information Retrieval %E Hagen, Matthias; Verberne, Suzan; Macdonald, Craig; Seifert, Christin; Balog, Krisztian; Nørvåg, Kjetil; Setty, Vinay %P 655 - 670 %I Springer %@ 978-3-030-99736-6 %B Lecture Notes in Computer Science %N 13185
[10]
M. Puri, A. Varde, and G. de Melo, “Commonsense Based Text Mining on Urban Policy,” Language Resources and Evaluation, 2022.
Export
BibTeX
@article{Puri2022,
  author    = {Puri, Manish and Varde, Aparna and de Melo, Gerard},
  title     = {Commonsense Based Text Mining on Urban Policy},
  journal   = {Language Resources and Evaluation},
  publisher = {Springer},
  address   = {New York, NY},
  year      = {2022},
  issn      = {1574-020X; 1572-0218; 1572-8412; 1574-0218; 0010-4817},
  doi       = {10.1007/s10579-022-09584-6},
  language  = {eng},
}
Endnote
%0 Journal Article %A Puri, Manish %A Varde, Aparna %A de Melo, Gerard %+ External Organizations Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Commonsense Based Text Mining on Urban Policy : %G eng %U http://hdl.handle.net/21.11116/0000-000A-20AC-0 %R 10.1007/s10579-022-09584-6 %7 2022 %D 2022 %J Language Resources and Evaluation %O Computers and the Humanities Lang Resources & Evaluation %I Springer %C New York, NY %@ false %U https://rdcu.be/cJwGl
[11]
S. Singhania, S. Razniewski, and G. Weikum, “Predicting Document Coverage for Relation Extraction,” Transactions of the Association for Computational Linguistics, vol. 10, 2022.
Export
BibTeX
% Journal article by Singhania, Razniewski, and Weikum; the journal title is stored in full.
@article{Singhania2022,
  title     = {Predicting Document Coverage for Relation Extraction},
  author    = {Singhania, Sneha and Razniewski, Simon and Weikum, Gerhard},
  language  = {eng},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {10},
  pages     = {207--223},
  publisher = {ACL},
  address   = {Cambridge, MA},
  year      = {2022},
  issn      = {2307-387X},
  doi       = {10.1162/tacl_a_00456},
}
Endnote
%0 Journal Article %A Singhania, Sneha %A Razniewski, Simon %A Weikum, Gerhard %+ Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society %T Predicting Document Coverage for Relation Extraction : %G eng %U http://hdl.handle.net/21.11116/0000-000A-27B8-B %R 10.1162/tacl_a_00456 %7 2022 %D 2022 %J Transactions of the Association for Computational Linguistics %V 10 %& 207 %P 207 - 223 %I ACL %C Cambridge, MA %@ 2307-387X
[12]
A. Tigunova, “Extracting personal information from conversations,” Universität des Saarlandes, Saarbrücken, 2022.
Abstract
Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers’ personal attributes: • Demographic attributes, age, gender, profession and family status, are inferred by HAMs - hierarchical neural classifiers with attention mechanism. Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. • Long-tailed personal attributes, hobby and profession, are predicted with CHARM - a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. 
By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. • Interpersonal relationships are inferred with PRIDE - a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines.
Export
BibTeX
% PhD thesis by Anna Tigunova. The citation key is kept unchanged so existing citations still resolve.
@phdthesis{Tiguphd2021,
  title    = {Extracting personal information from conversations},
  author   = {Tigunova, Anna},
  language = {eng},
  school   = {Universit{\"a}t des Saarlandes},
  address  = {Saarbr{\"u}cken},
  year     = {2022},
  doi      = {10.22028/D291-35628},
  url      = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32546},
  urn      = {urn:nbn:de:bsz:291--ds-356280},
  abstract = {Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers{\textquoteright} personal attributes: \mbox{$\bullet$} Demographic attributes, age, gender, profession and family status, are inferred by HAMs -- hierarchical neural classifiers with attention mechanism. 
Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. \mbox{$\bullet$} Long-tailed personal attributes, hobby and profession, are predicted with CHARM -- a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. \mbox{$\bullet$} Interpersonal relationships are inferred with PRIDE -- a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines.},
  internal-note = {The url field previously held the bare identifier nbn:de:bsz:291--ds-356280, which is not a resolvable URL; it now lives in the urn field with the urn: scheme prefix, and url points at the repository handle. The redundant date field was dropped because year carries the same value.},
}
Endnote
%0 Thesis %A Tigunova, Anna %Y Weikum, Gerhard %A referee: Yates, Andrew %A referee: Demberg, Vera %+ Databases and Information Systems, MPI for Informatics, Max Planck Society International Max Planck Research School, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations %T Extracting personal information from conversations : %G eng %U http://hdl.handle.net/21.11116/0000-000A-3C9E-2 %R 10.22028/D291-35628 %U nbn:de:bsz:291--ds-356280 %I Universität des Saarlandes %C Saarbrücken %D 2022 %P 126 p. %V phd %9 phd %X Personal knowledge is a versatile resource that is valuable for a wide range of downstream applications. Background facts about users can allow chatbot assistants to produce more topical and empathic replies. In the context of recommendation and retrieval models, personal facts can be used to customize the ranking results for individual users. A Personal Knowledge Base, populated with personal facts, such as demographic information, interests and interpersonal relationships, is a unique endpoint for storing and querying personal knowledge. Such knowledge bases are easily interpretable and can provide users with full control over their own personal knowledge, including revising stored facts and managing access by downstream services for personalization purposes. To alleviate users from extensive manual effort to build such personal knowledge base, we can leverage automated extraction methods applied to the textual content of the users, such as dialogue transcripts or social media posts. Mainstream extraction methods specialize on well-structured data, such as biographical texts or encyclopedic articles, which are rare for most people. In turn, conversational data is abundant but challenging to process and requires specialized methods for extraction of personal facts. 
In this dissertation we address the acquisition of personal knowledge from conversational data. We propose several novel deep learning models for inferring speakers’ personal attributes: • Demographic attributes, age, gender, profession and family status, are inferred by HAMs - hierarchical neural classifiers with attention mechanism. Trained HAMs can be transferred between different types of conversational data and provide interpretable predictions. • Long-tailed personal attributes, hobby and profession, are predicted with CHARM - a zero-shot learning model, overcoming the lack of labeled training samples for rare attribute values. By linking conversational utterances to external sources, CHARM is able to predict attribute values which it never saw during training. • Interpersonal relationships are inferred with PRIDE - a hierarchical transformer-based model. To accurately predict fine-grained relationships, PRIDE leverages personal traits of the speakers and the style of conversational utterances. Experiments with various conversational texts, including Reddit discussions and movie scripts, demonstrate the viability of our methods and their superior performance compared to state-of-the-art baselines. %U https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/32546
[13]
A. Varde, “Computational Estimation by Scientific Data Mining with Classical Methods to Automate Learning Strategies of Scientists,” ACM Transactions on Knowledge Discovery from Data, vol. 16, no. 5, 2022.
Export
BibTeX
% Single-author journal article; the article number sits in eid next to the page range.
@article{Varde2022,
  author    = {Varde, Aparna},
  title     = {Computational Estimation by Scientific Data Mining with Classical Methods to Automate Learning Strategies of Scientists},
  journal   = {ACM Transactions on Knowledge Discovery from Data},
  volume    = {16},
  number    = {5},
  pages     = {1--52},
  eid       = {86},
  year      = {2022},
  publisher = {ACM},
  address   = {New York, NY},
  language  = {eng},
  doi       = {10.1145/3502736},
}
Endnote
%0 Journal Article %A Varde, Aparna %+ Databases and Information Systems, MPI for Informatics, Max Planck Society %T Computational Estimation by Scientific Data Mining with Classical Methods to Automate Learning Strategies of Scientists : %G eng %U http://hdl.handle.net/21.11116/0000-000A-9D92-0 %R 10.1145/3502736 %7 2022 %D 2022 %J ACM Transactions on Knowledge Discovery from Data %V 16 %N 5 %& 1 %P 1 - 52 %Z sequence number: 86 %I ACM %C New York, NY
[14]
A. Varde, A. Pandey, and X. Du, “Prediction Tool on Fine Particle Pollutants and Air Quality for Environmental Engineering,” SN Computer Science, vol. 3, no. 3, 2022.
Export
BibTeX
% Journal article by Varde, Pandey, and Du. The key carries a "b" suffix because the plain
% author-year key is already taken by the ACM TKDD article earlier in this file; a repeated
% key would make this entry unreachable.
@article{Varde2022b,
  title     = {Prediction Tool on Fine Particle Pollutants and Air Quality for Environmental Engineering},
  author    = {Varde, Aparna and Pandey, Abidha and Du, Xu},
  language  = {eng},
  journal   = {SN Computer Science},
  volume    = {3},
  number    = {3},
  eid       = {184},
  publisher = {Springer Nature},
  address   = {Singapore},
  year      = {2022},
  issn      = {2661-8907},
  doi       = {10.1007/s42979-022-01068-2},
}
Endnote
%0 Journal Article %A Varde, Aparna %A Pandey, Abidha %A Du, Xu %+ Databases and Information Systems, MPI for Informatics, Max Planck Society External Organizations External Organizations %T Prediction Tool on Fine Particle Pollutants and Air Quality for Environmental Engineering : %G eng %U http://hdl.handle.net/21.11116/0000-000A-2F55-3 %R 10.1007/s42979-022-01068-2 %7 2022 %D 2022 %J SN Computer Science %V 3 %N 3 %Z sequence number: 184 %I Springer Nature %C Singapore %@ 2661-8907