Margret Keuper (Research Leader)

Prof. Dr. Margret Keuper

Address: Max-Planck-Institut für Informatik
Saarland Informatics Campus
Campus E1 4
66123 Saarbrücken
Location: E1 4 - 617
Phone: +49 681 9325 2117
Fax: +49 681 9325 2099
E-mail: Get email via email

Publications

2025

Conference paper

T. Medi, S. Jung, and M. Keuper

“FAIR-TAT: Improving Model Fairness Using Targeted Adversarial Training,” in IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025), Tucson, AZ, USA, 2025.

mehr

BibTeX

% WACV 2025 conference paper. The export carried the same value in YEAR and
% DATE; only YEAR is kept so the publication year is stated once.
% ADDRESS holds the conference location, as exported.
@inproceedings{Medi_WACV25,
  author       = {Medi, Tejaswini and Jung, Steffen and Keuper, Margret},
  title        = {{FAIR}-{TAT}: Improving Model Fairness Using Targeted Adversarial Training},
  booktitle    = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025)},
  publisher    = {IEEE},
  address      = {Tucson, AZ, USA},
  year         = {2025},
  pages        = {7827--7836},
  isbn         = {979-8-3315-1083-1},
  doi          = {10.1109/WACV61041.2025.00760},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Medi, Tejaswini
%A Jung, Steffen
%A Keuper, Margret
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T FAIR-TAT: Improving Model Fairness Using Targeted Adversarial Training
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-5E2E-3
%R 10.1109/WACV61041.2025.00760
%D 2025
%B IEEE/CVF Winter Conference on Applications of Computer Vision
%Z date of event: 2025-02-28 - 2025-03-04
%C Tucson, AZ, USA
%B IEEE/CVF Winter Conference on Applications of Computer Vision
%P 7827 - 7836
%I IEEE
%@ 979-8-3315-1083-1

Conference paper

K. Prasse, I. Bravo, S. Walter, and M. Keuper

“I Spy with My Little Eye: A Minimum Cost Multicut Investigation of Dataset Frames,” in IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025), Tucson, AZ, USA, 2025.

mehr

BibTeX

% WACV 2025 conference paper. The export carried the same value in YEAR and
% DATE; only YEAR is kept so the publication year is stated once.
% ADDRESS holds the conference location, as exported.
@inproceedings{Prasse_WACV25,
  author       = {Prasse, Katharina and Bravo, Isaac and Walter, Stefanie and Keuper, Margret},
  title        = {I Spy with My Little Eye: {A} Minimum Cost Multicut Investigation of Dataset Frames},
  booktitle    = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025)},
  publisher    = {IEEE},
  address      = {Tucson, AZ, USA},
  year         = {2025},
  pages        = {2134--2143},
  isbn         = {979-8-3315-1083-1},
  doi          = {10.1109/WACV61041.2025.00214},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Prasse, Katharina
%A Bravo, Isaac
%A Walter, Stefanie
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T I Spy with My Little Eye: A Minimum Cost Multicut Investigation of Dataset Frames
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-0CD1-4
%R 10.1109/WACV61041.2025.00214
%D 2025
%B IEEE/CVF Winter Conference on Applications of Computer Vision
%Z date of event: 2025-02-28 - 2025-03-04
%C Tucson, AZ, USA
%B IEEE/CVF Winter Conference on Applications of Computer Vision
%P 2134 - 2143
%I IEEE
%@ 979-8-3315-1083-1

Conference paper

Y. Liu, C. Graf, M. Spies, and M. Keuper

“Segment any Repeated Object,” in IEEE International Conference on Robotics and Automation (ICRA 2025), Hyderabad, India.

mehr

BibTeX

% ICRA 2025 conference paper, flagged as accepted via PUBLREMARK.
% The export carries no pages, ISBN or DOI for this record.
@inproceedings{Liu_ICRA2025,
  author       = {Liu, Yushi and Graf, Christian and Spies, Markus and Keuper, Margret},
  title        = {Segment any Repeated Object},
  booktitle    = {IEEE International Conference on Robotics and Automation (ICRA 2025)},
  publisher    = {IEEE},
  address      = {Hyderabad, India},
  year         = {2025},
  language     = {eng},
  publremark   = {Accepted},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Liu, Yushi
%A Graf, Christian
%A Spies, Markus
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Segment any Repeated Object
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-0D44-3
%D 2025
%B IEEE International Conference on Robotics and Automation
%Z date of event: 2025-04-06 - 2025-04-11
%C Hyderabad, India
%B IEEE International Conference on Robotics and Automation
%I IEEE

Article

J. Lukasik, M. Moeller, and M. Keuper

“An Evaluation of Zero-Cost Proxies - From Neural Architecture Performance Prediction to Model Robustness,” International Journal of Computer Vision, vol. 133, 2025.

mehr

BibTeX

% Journal article (IJCV, volume 133). The export carried the same value in
% YEAR and DATE; only YEAR is kept so the publication year is stated once.
% The citation key is left untouched so existing citations keep resolving.
@article{Lukasik24,
  author       = {Lukasik, Jovita and Moeller, Michael and Keuper, Margret},
  title        = {An Evaluation of Zero-Cost Proxies -- From Neural Architecture Performance Prediction to Model Robustness},
  journal      = {International Journal of Computer Vision},
  volume       = {133},
  pages        = {2635--2652},
  year         = {2025},
  publisher    = {Kluwer Academic Publishers},
  address      = {Hingham, Mass.},
  issn         = {0920-5691},
  doi          = {10.1007/s11263-024-02265-7},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Journal Article
%A Lukasik, Jovita
%A Moeller, Michael
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T An Evaluation of Zero-Cost Proxies - From Neural Architecture Performance Prediction to Model Robustness
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-6C38-7
%R 10.1007/s11263-024-02265-7
%7 2024-12-09
%D 2025
%J International Journal of Computer Vision
%O Int. J. Comput. Vis.
%V 133
%& 2635
%P 2635 - 2652
%I Kluwer Academic Publishers
%C Hingham, Mass.
%@ 0920-5691

Conference paper

Y. Li, W. Beluch, M. Keuper, D. Zhang, and A. Khoreva

“VSTAR: Generative Temporal Nursing for Longer Dynamic Video Synthesis,” in The Thirteenth International Conference on Learning Representations (ICLR 2025), Singapore, 2025.

mehr

BibTeX

% ICLR 2025 conference paper, published via OpenReview.net.
% ADDRESS holds the conference location, as exported.
@inproceedings{Li_ICLR25,
  author       = {Li, Yumeng and Beluch, William and Keuper, Margret and Zhang, Dan and Khoreva, Anna},
  title        = {{VSTAR}: Generative Temporal Nursing for Longer Dynamic Video Synthesis},
  booktitle    = {The Thirteenth International Conference on Learning Representations (ICLR 2025)},
  publisher    = {OpenReview.net},
  address      = {Singapore},
  year         = {2025},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Li, Yumeng
%A Beluch, William
%A Keuper, Margret
%A Zhang, Dan
%A Khoreva, Anna
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
%T VSTAR: Generative Temporal Nursing for Longer Dynamic Video Synthesis
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-5E1D-6
%D 2025
%B Thirteenth International Conference on Learning Representations
%Z date of event: 2025-04-24 - 2025-04-28
%C Singapore
%B The Thirteenth International Conference on Learning Representations
%I OpenReview.net

Conference paper

P. Gavrikov, J. Lukasik, S. Jung, R. Geirhos, M. J. Mirza, M. Keuper, and J. Keuper

“Can We Talk Models Into Seeing the World Differently?,” in The Thirteenth International Conference on Learning Representations (ICLR 2025), Singapore, 2025.

mehr

BibTeX

% ICLR 2025 conference paper, published via OpenReview.net.
% BOOKTITLE had a stray space before the closing parenthesis ("ICLR 2025 )");
% it now matches the spelling used by the other ICLR 2025 entry.
@inproceedings{Gavrikov_ICLR25,
  author       = {Gavrikov, Paul and Lukasik, Jovita and Jung, Steffen and Geirhos, Robert and Mirza, Muhammad Jehanzeb and Keuper, Margret and Keuper, Janis},
  title        = {Can We Talk Models Into Seeing the World Differently?},
  booktitle    = {The Thirteenth International Conference on Learning Representations (ICLR 2025)},
  publisher    = {OpenReview.net},
  address      = {Singapore},
  year         = {2025},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Gavrikov, Paul
%A Lukasik, Jovita
%A Jung, Steffen
%A Geirhos, Robert
%A Mirza, Muhammad Jehanzeb
%A Keuper, Margret
%A Keuper, Janis
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Can We Talk Models Into Seeing the World Differently?
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-0C79-9
%D 2025
%B Thirteenth International Conference on Learning Representations
%Z date of event: 2025-04-24 - 2025-04-28
%C Singapore
%B The Thirteenth International Conference on Learning Representations
%I OpenReview.net

Paper

S. Agnihotri, D. Schader, N. Sharei, M. E. Kaçar, and M. Keuper

“Are Synthetic Corruptions A Reliable Proxy For Real-World Corruptions?,” 2025. [Online]. Available: https://arxiv.org/abs/2505.04835.

mehr

Abstract

Deep learning (DL) models are widely used in real-world applications but remain vulnerable to distribution shifts, especially due to weather and lighting changes. Collecting diverse real-world data for testing the robustness of DL models is resource-intensive, making synthetic corruptions an attractive alternative for robustness testing. However, are synthetic corruptions a reliable proxy for real-world corruptions? To answer this, we conduct the largest benchmarking study on semantic segmentation models, comparing performance on real-world corruptions and synthetic corruptions datasets. Our results reveal a strong correlation in mean performance, supporting the use of synthetic corruptions for robustness evaluation. We further analyze corruption-specific correlations, providing key insights to understand when synthetic corruptions succeed in representing real-world corruptions. Open-source Code: github.com/shashankskagnihotri/benchmarking_robustness/tree/segmentation_david/semantic_segmentation

BibTeX

% arXiv preprint 2505.04835. Underscores in the repository URL quoted inside
% the abstract are escaped so the field can be typeset by LaTeX; the URL field
% itself is left raw.
@online{Agnihotri_2505.04835,
  author       = {Agnihotri, Shashank and Schader, David and Sharei, Nico and Ka{\c c}ar, Mehmet Ege and Keuper, Margret},
  title        = {Are Synthetic Corruptions A Reliable Proxy For Real-World Corruptions?},
  year         = {2025},
  eprint       = {2505.04835},
  eprinttype   = {arXiv},
  url          = {https://arxiv.org/abs/2505.04835},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {Deep learning (DL) models are widely used in real-world applications but remain vulnerable to distribution shifts, especially due to weather and lighting changes. Collecting diverse real-world data for testing the robustness of DL models is resource-intensive, making synthetic corruptions an attractive alternative for robustness testing. However, are synthetic corruptions a reliable proxy for real-world corruptions? To answer this, we conduct the largest benchmarking study on semantic segmentation models, comparing performance on real-world corruptions and synthetic corruptions datasets. Our results reveal a strong correlation in mean performance, supporting the use of synthetic corruptions for robustness evaluation. We further analyze corruption-specific correlations, providing key insights to understand when synthetic corruptions succeed in representing real-world corruptions. Open-source Code: https://github.com/shashankskagnihotri/benchmarking\_robustness/tree/segmentation\_david/semantic\_segmentation},
}

Endnote

%0 Report
%A Agnihotri, Shashank
%A Schader, David
%A Sharei, Nico
%A Kaçar, Mehmet Ege
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Are Synthetic Corruptions A Reliable Proxy For Real-World Corruptions?
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-7272-C
%U https://arxiv.org/abs/2505.04835
%D 2025
%X Deep learning (DL) models are widely used in real-world applications but remain vulnerable to distribution shifts, especially due to weather and lighting changes. Collecting diverse real-world data for testing the robustness of DL models is resource-intensive, making synthetic corruptions an attractive alternative for robustness testing. However, are synthetic corruptions a reliable proxy for real-world corruptions? To answer this, we conduct the largest benchmarking study on semantic segmentation models, comparing performance on real-world corruptions and synthetic corruptions datasets. Our results reveal a strong correlation in mean performance, supporting the use of synthetic corruptions for robustness evaluation. We further analyze corruption-specific correlations, providing key insights to understand when synthetic corruptions succeed in representing real-world corruptions. Open-source Code: https://github.com/shashankskagnihotri/benchmarking_robustness/tree/segmentation_david/semantic_segmentation
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

S. Agnihotri, A. Ansari, A. Dackermann, F. Rösch, and M. Keuper

“DispBench: Benchmarking Disparity Estimation to Synthetic Corruptions,” 2025. [Online]. Available: https://arxiv.org/abs/2505.05091.

mehr

Abstract

Deep learning (DL) has surpassed human performance on standard benchmarks, driving its widespread adoption in computer vision tasks. One such task is disparity estimation, estimating the disparity between matching pixels in stereo image pairs, which is crucial for safety-critical applications like medical surgeries and autonomous navigation. However, DL-based disparity estimation methods are highly susceptible to distribution shifts and adversarial attacks, raising concerns about their reliability and generalization. Despite these concerns, a standardized benchmark for evaluating the robustness of disparity estimation methods remains absent, hindering progress in the field.
To address this gap, we introduce DispBench, a comprehensive benchmarking tool for systematically assessing the reliability of disparity estimation methods. DispBench evaluates robustness against synthetic image corruptions such as adversarial attacks and out-of-distribution shifts caused by 2D Common Corruptions across multiple datasets and diverse corruption scenarios. We conduct the most extensive performance and robustness analysis of disparity estimation methods to date, uncovering key correlations between accuracy, reliability, and generalization. Open-source code for DispBench: github.com/shashankskagnihotri/benchmarking_robustness/tree/disparity_estimation/final/disparity_estimation

BibTeX

% arXiv preprint 2505.05091. Cleaned relative to the raw export: the HTML
% line-break tag in the abstract is replaced by a space, underscores in the
% repository URL quoted inside the abstract are escaped for LaTeX, and the
% title protects the whole word after the colon instead of a single letter.
@online{Agnihotri_2505.05091,
  author       = {Agnihotri, Shashank and Ansari, Amaan and Dackermann, Annika and R{\"o}sch, Fabian and Keuper, Margret},
  title        = {{DispBench}: {Benchmarking} Disparity Estimation to Synthetic Corruptions},
  year         = {2025},
  eprint       = {2505.05091},
  eprinttype   = {arXiv},
  url          = {https://arxiv.org/abs/2505.05091},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {Deep learning (DL) has surpassed human performance on standard benchmarks, driving its widespread adoption in computer vision tasks. One such task is disparity estimation, estimating the disparity between matching pixels in stereo image pairs, which is crucial for safety-critical applications like medical surgeries and autonomous navigation. However, DL-based disparity estimation methods are highly susceptible to distribution shifts and adversarial attacks, raising concerns about their reliability and generalization. Despite these concerns, a standardized benchmark for evaluating the robustness of disparity estimation methods remains absent, hindering progress in the field. To address this gap, we introduce DispBench, a comprehensive benchmarking tool for systematically assessing the reliability of disparity estimation methods. DispBench evaluates robustness against synthetic image corruptions such as adversarial attacks and out-of-distribution shifts caused by 2D Common Corruptions across multiple datasets and diverse corruption scenarios. We conduct the most extensive performance and robustness analysis of disparity estimation methods to date, uncovering key correlations between accuracy, reliability, and generalization. Open-source code for DispBench: https://github.com/shashankskagnihotri/benchmarking\_robustness/tree/disparity\_estimation/final/disparity\_estimation},
}

Endnote

%0 Report
%A Agnihotri, Shashank
%A Ansari, Amaan
%A Dackermann, Annika
%A Rösch, Fabian
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T DispBench: Benchmarking Disparity Estimation to Synthetic Corruptions
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-72E0-F
%U https://arxiv.org/abs/2505.05091
%D 2025
%X Deep learning (DL) has surpassed human performance on standard benchmarks, driving its widespread adoption in computer vision tasks. One such task is disparity estimation, estimating the disparity between matching pixels in stereo image pairs, which is crucial for safety-critical applications like medical surgeries and autonomous navigation. However, DL-based disparity estimation methods are highly susceptible to distribution shifts and adversarial attacks, raising concerns about their reliability and generalization. Despite these concerns, a standardized benchmark for evaluating the robustness of disparity estimation methods remains absent, hindering progress in the field. To address this gap, we introduce DispBench, a comprehensive benchmarking tool for systematically assessing the reliability of disparity estimation methods. DispBench evaluates robustness against synthetic image corruptions such as adversarial attacks and out-of-distribution shifts caused by 2D Common Corruptions across multiple datasets and diverse corruption scenarios. We conduct the most extensive performance and robustness analysis of disparity estimation methods to date, uncovering key correlations between accuracy, reliability, and generalization. Open-source code for DispBench: https://github.com/shashankskagnihotri/benchmarking_robustness/tree/disparity_estimation/final/disparity_estimation
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG

Paper

S. Agnihotri, D. Schader, J. Jakubassa, N. Sharei, S. Kral, M. E. Kaçar, R. Weber, and M. Keuper

“SemSegBench & DetecBench: Benchmarking Reliability and Generalization Beyond Classification,” 2025. [Online]. Available: https://arxiv.org/abs/2505.18015.

mehr

Abstract

Reliability and generalization in deep learning are predominantly studied in the context of image classification. Yet, real-world applications in safety-critical domains involve a broader set of semantic tasks, such as semantic segmentation and object detection, which come with a diverse set of dedicated model architectures. To facilitate research towards robust model design in segmentation and detection, our primary objective is to provide benchmarking tools regarding robustness to distribution shifts and adversarial manipulations. We propose the benchmarking tools SEMSEGBENCH and DETECBENCH, along with the most extensive evaluation to date on the reliability and generalization of semantic segmentation and object detection models. In particular, we benchmark 76 segmentation models across four datasets and 61 object detectors across two datasets, evaluating their performance under diverse adversarial attacks and common corruptions. Our findings reveal systematic weaknesses in state-of-the-art models and uncover key trends based on architecture, backbone, and model capacity. SEMSEGBENCH and DETECBENCH are open-sourced in our GitHub repository (https://github.com/shashankskagnihotri/benchmarking_reliability_generalization) along with our complete set of total 6139 evaluations. We anticipate the collected data to foster and encourage future research towards improved model reliability beyond classification.

BibTeX

% arXiv preprint 2505.18015, identified by DOI. Underscores in the repository
% URL quoted inside the abstract are escaped so the field can be typeset by
% LaTeX.
@online{Agnihotri_2505.18015,
  author       = {Agnihotri, Shashank and Schader, David and Jakubassa, Jonas and Sharei, Nico and Kral, Simon and Ka{\c c}ar, Mehmet Ege and Weber, Ruben and Keuper, Margret},
  title        = {{SemSegBench} \& {DetecBench}: Benchmarking Reliability and Generalization Beyond Classification},
  year         = {2025},
  eprint       = {2505.18015},
  eprinttype   = {arXiv},
  doi          = {10.48550/arXiv.2505.18015},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {Reliability and generalization in deep learning are predominantly studied in the context of image classification. Yet, real-world applications in safety-critical domains involve a broader set of semantic tasks, such as semantic segmentation and object detection, which come with a diverse set of dedicated model architectures. To facilitate research towards robust model design in segmentation and detection, our primary objective is to provide benchmarking tools regarding robustness to distribution shifts and adversarial manipulations. We propose the benchmarking tools SEMSEGBENCH and DETECBENCH, along with the most extensive evaluation to date on the reliability and generalization of semantic segmentation and object detection models. In particular, we benchmark 76 segmentation models across four datasets and 61 object detectors across two datasets, evaluating their performance under diverse adversarial attacks and common corruptions. Our findings reveal systematic weaknesses in state-of-the-art models and uncover key trends based on architecture, backbone, and model capacity. SEMSEGBENCH and DETECBENCH are open-sourced in our GitHub repository (https://github.com/shashankskagnihotri/benchmarking\_reliability\_generalization) along with our complete set of total 6139 evaluations. We anticipate the collected data to foster and encourage future research towards improved model reliability beyond classification.},
}

Endnote

%0 Report
%A Agnihotri, Shashank
%A Schader, David
%A Jakubassa, Jonas
%A Sharei, Nico
%A Kral, Simon
%A Kaçar, Mehmet Ege
%A Weber, Ruben
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T SemSegBench & DetecBench: Benchmarking Reliability and Generalization Beyond Classification
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-7501-8
%R 10.48550/arXiv.2505.18015
%D 2025
%X Reliability and generalization in deep learning are predominantly studied in the context of image classification. Yet, real-world applications in safety-critical domains involve a broader set of semantic tasks, such as semantic segmentation and object detection, which come with a diverse set of dedicated model architectures. To facilitate research towards robust model design in segmentation and detection, our primary objective is to provide benchmarking tools regarding robustness to distribution shifts and adversarial manipulations. We propose the benchmarking tools SEMSEGBENCH and DETECBENCH, along with the most extensive evaluation to date on the reliability and generalization of semantic segmentation and object detection models. In particular, we benchmark 76 segmentation models across four datasets and 61 object detectors across two datasets, evaluating their performance under diverse adversarial attacks and common corruptions. Our findings reveal systematic weaknesses in state-of-the-art models and uncover key trends based on architecture, backbone, and model capacity. SEMSEGBENCH and DETECBENCH are open-sourced in our GitHub repository (https://github.com/shashankskagnihotri/benchmarking_reliability_generalization) along with our complete set of total 6139 evaluations. We anticipate the collected data to foster and encourage future research towards improved model reliability beyond classification.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG

Paper

J. Belouadi, E. Ilg, M. Keuper, H. Tanaka, M. Utiyama, R. Dabre, S. Eger, and S. P. Ponzetto

“TikZero: Zero-Shot Text-Guided Graphics Program Synthesis,” 2025. [Online]. Available: https://arxiv.org/abs/2503.11509.

mehr

Abstract

With the rise of generative AI, synthesizing figures from text captions
becomes a compelling application. However, achieving high geometric precision
and editability requires representing figures as graphics programs in languages
like TikZ, and aligned training data (i.e., graphics programs with captions)
remains scarce. Meanwhile, large amounts of unaligned graphics programs and
captioned raster images are more readily available. We reconcile these
disparate data sources by presenting TikZero, which decouples graphics program
generation from text understanding by using image representations as an
intermediary bridge. It enables independent training on graphics programs and
captioned images and allows for zero-shot text-guided graphics program
synthesis during inference. We show that our method substantially outperforms
baselines that can only operate with caption-aligned graphics programs.
Furthermore, when leveraging caption-aligned graphics programs as a
complementary training signal, TikZero matches or exceeds the performance of
much larger models, including commercial systems like GPT-4o. Our code,
datasets, and select models are publicly available.

BibTeX

% arXiv preprint 2503.11509. Cleaned relative to the raw export: the HTML
% line-break tags that wrapped the abstract are replaced by spaces (the
% trailing one is dropped), and the title protects whole words rather than
% single letters inside a word.
@online{Belouadi2503.11509,
  author       = {Belouadi, Jonas and Ilg, Eddy and Keuper, Margret and Tanaka, Hideki and Utiyama, Masao and Dabre, Raj and Eger, Steffen and Ponzetto, Simone Paolo},
  title        = {{TikZero}: {Zero}-Shot Text-Guided Graphics Program Synthesis},
  year         = {2025},
  eprint       = {2503.11509},
  eprinttype   = {arXiv},
  url          = {https://arxiv.org/abs/2503.11509},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {With the rise of generative AI, synthesizing figures from text captions becomes a compelling application. However, achieving high geometric precision and editability requires representing figures as graphics programs in languages like TikZ, and aligned training data (i.e., graphics programs with captions) remains scarce. Meanwhile, large amounts of unaligned graphics programs and captioned raster images are more readily available. We reconcile these disparate data sources by presenting TikZero, which decouples graphics program generation from text understanding by using image representations as an intermediary bridge. It enables independent training on graphics programs and captioned images and allows for zero-shot text-guided graphics program synthesis during inference. We show that our method substantially outperforms baselines that can only operate with caption-aligned graphics programs. Furthermore, when leveraging caption-aligned graphics programs as a complementary training signal, TikZero matches or exceeds the performance of much larger models, including commercial systems like GPT-4o. Our code, datasets, and select models are publicly available.},
}

Endnote

%0 Report
%A Belouadi, Jonas
%A Ilg, Eddy
%A Keuper, Margret
%A Tanaka, Hideki
%A Utiyama, Masao
%A Dabre, Raj
%A Eger, Steffen
%A Ponzetto, Simone Paolo
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
%T TikZero: Zero-Shot Text-Guided Graphics Program Synthesis
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-1CE4-D
%U https://arxiv.org/abs/2503.11509
%D 2025
%X With the rise of generative AI, synthesizing figures from text captions becomes a compelling application. However, achieving high geometric precision and editability requires representing figures as graphics programs in languages like TikZ, and aligned training data (i.e., graphics programs with captions) remains scarce. Meanwhile, large amounts of unaligned graphics programs and captioned raster images are more readily available. We reconcile these disparate data sources by presenting TikZero, which decouples graphics program generation from text understanding by using image representations as an intermediary bridge. It enables independent training on graphics programs and captioned images and allows for zero-shot text-guided graphics program synthesis during inference. We show that our method substantially outperforms baselines that can only operate with caption-aligned graphics programs. Furthermore, when leveraging caption-aligned graphics programs as a complementary training signal, TikZero matches or exceeds the performance of much larger models, including commercial systems like GPT-4o. Our code, datasets, and select models are publicly available.
%K Computer Science, Computation and Language, cs.CL,Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

M. Fatima, S. Jung, and M. Keuper

“Corner Cases: How Size and Position of Objects Challenge ImageNet-Trained Models,” 2025. [Online]. Available: https://arxiv.org/abs/2505.03569.

mehr

Abstract

Backgrounds in images play a major role in contributing to spurious correlations among different data points. Owing to aesthetic preferences of humans capturing the images, datasets can exhibit positional (location of the object within a given frame) and size (region-of-interest to image ratio) biases for different classes. In this paper, we show that these biases can impact how much a model relies on spurious features in the background to make its predictions. To better illustrate our findings, we propose a synthetic dataset derived from ImageNet1k, Hard-Spurious-ImageNet, which contains images with various backgrounds, object positions, and object sizes. By evaluating the dataset on different pretrained models, we find that most models rely heavily on spurious features in the background when the region-of-interest (ROI) to image ratio is small and the object is far from the center of the image. Moreover, we also show that current methods that aim to mitigate harmful spurious features, do not take into account these factors, hence fail to achieve considerable performance gains for worst-group accuracies when the size and location of core features in an image change.

BibTeX

% arXiv preprint 2505.03569; "ImageNet" is brace-protected in the title so
% sentence-casing styles keep its capitalisation.
@online{Fatima_2505.03569,
  author       = {Fatima, Mishal and Jung, Steffen and Keuper, Margret},
  title        = {Corner Cases: How Size and Position of Objects Challenge {ImageNet}-Trained Models},
  year         = {2025},
  eprint       = {2505.03569},
  eprinttype   = {arXiv},
  url          = {https://arxiv.org/abs/2505.03569},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {Backgrounds in images play a major role in contributing to spurious correlations among different data points. Owing to aesthetic preferences of humans capturing the images, datasets can exhibit positional (location of the object within a given frame) and size (region-of-interest to image ratio) biases for different classes. In this paper, we show that these biases can impact how much a model relies on spurious features in the background to make its predictions. To better illustrate our findings, we propose a synthetic dataset derived from ImageNet1k, Hard-Spurious-ImageNet, which contains images with various backgrounds, object positions, and object sizes. By evaluating the dataset on different pretrained models, we find that most models rely heavily on spurious features in the background when the region-of-interest (ROI) to image ratio is small and the object is far from the center of the image. Moreover, we also show that current methods that aim to mitigate harmful spurious features, do not take into account these factors, hence fail to achieve considerable performance gains for worst-group accuracies when the size and location of core features in an image change.},
}

Endnote

%0 Report
%A Fatima, Mishal
%A Jung, Steffen
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Corner Cases: How Size and Position of Objects Challenge ImageNet-Trained Models
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-7268-8
%U https://arxiv.org/abs/2505.03569
%D 2025
%X Backgrounds in images play a major role in contributing to spurious correlations among different data points. Owing to aesthetic preferences of humans capturing the images, datasets can exhibit positional (location of the object within a given frame) and size (region-of-interest to image ratio) biases for different classes. In this paper, we show that these biases can impact how much a model relies on spurious features in the background to make its predictions. To better illustrate our findings, we propose a synthetic dataset derived from ImageNet1k, Hard-Spurious-ImageNet, which contains images with various backgrounds, object positions, and object sizes. By evaluating the dataset on different pretrained models, we find that most models rely heavily on spurious features in the background when the region-of-interest (ROI) to image ratio is small and the object is far from the center of the image. Moreover, we also show that current methods that aim to mitigate harmful spurious features, do not take into account these factors, hence fail to achieve considerable performance gains for worst-group accuracies when the size and location of core features in an image change.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

C. Leiter, Y. M. Asano, M. Keuper, and S. Eger

“CROC: Evaluating and Training T2I Metrics with Pseudo- and Human-Labeled Contrastive Robustness Checks,” 2025. [Online]. Available: https://arxiv.org/abs/2505.11314.

mehr

Abstract

The assessment of evaluation metrics (meta-evaluation) is crucial for determining the suitability of existing metrics in text-to-image (T2I) generation tasks. Human-based meta-evaluation is costly and time-intensive, and automated alternatives are scarce. We address this gap and propose CROC: a scalable framework for automated Contrastive Robustness Checks that systematically probes and quantifies metric robustness by synthesizing contrastive test cases across a comprehensive taxonomy of image properties. With CROC, we generate a pseudo-labeled dataset (CROC$^{syn}$) of over one million contrastive prompt-image pairs to enable a fine-grained comparison of evaluation metrics. We also use the dataset to train CROCScore, a new metric that achieves state-of-the-art performance among open-source methods, demonstrating an additional key application of our framework. To complement this dataset, we introduce a human-supervised benchmark (CROC$^{hum}$) targeting especially challenging categories. Our results highlight robustness issues in existing metrics: for example, many fail on prompts involving negation, and all tested open-source metrics fail on at least 25% of cases involving correct identification of body parts.

BibTeX

@online{Leiter_2505.11314,
TITLE = {{CROC}: Evaluating and Training {T2I} Metrics with Pseudo- and Human-Labeled Contrastive Robustness Checks},
AUTHOR = {Leiter, Christoph and Asano, Yuki M. and Keuper, Margret and Eger, Steffen},
LANGUAGE = {eng},
DOI = {10.48550/arXiv.2505.11314},
EPRINT = {2505.11314},
EPRINTTYPE = {arXiv},
YEAR = {2025},
MARGINALMARK = {$\bullet$},
ABSTRACT = {The assessment of evaluation metrics (meta-evaluation) is crucial for determining the suitability of existing metrics in text-to-image (T2I) generation tasks. Human-based meta-evaluation is costly and time-intensive, and automated alternatives are scarce. We address this gap and propose CROC: a scalable framework for automated Contrastive Robustness Checks that systematically probes and quantifies metric robustness by synthesizing contrastive test cases across a comprehensive taxonomy of image properties. With CROC, we generate a pseudo-labeled dataset (CROC$^{syn}$) of over one million contrastive prompt-image pairs to enable a fine-grained comparison of evaluation metrics. We also use the dataset to train CROCScore, a new metric that achieves state-of-the-art performance among open-source methods, demonstrating an additional key application of our framework. To complement this dataset, we introduce a human-supervised benchmark (CROC$^{hum}$) targeting especially challenging categories. Our results highlight robustness issues in existing metrics: for example, many fail on prompts involving negation, and all tested open-source metrics fail on at least 25\% of cases involving correct identification of body parts.},
}

Endnote

%0 Report
%A Leiter, Christoph
%A Asano, Yuki M.
%A Keuper, Margret
%A Eger, Steffen
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T CROC: Evaluating and Training T2I Metrics with Pseudo- and Human-Labeled Contrastive Robustness Checks : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-72EB-4
%R 10.48550/arXiv.2505.11314
%D 2025
%X The assessment of evaluation metrics (meta-evaluation) is crucial for determining the suitability of existing metrics in text-to-image (T2I) generation tasks. Human-based meta-evaluation is costly and time-intensive, and automated alternatives are scarce. We address this gap and propose CROC: a scalable framework for automated Contrastive Robustness Checks that systematically probes and quantifies metric robustness by synthesizing contrastive test cases across a comprehensive taxonomy of image properties. With CROC, we generate a pseudo-labeled dataset (CROC$^{syn}$) of over one million contrastive prompt-image pairs to enable a fine-grained comparison of evaluation metrics. We also use the dataset to train CROCScore, a new metric that achieves state-of-the-art performance among open-source methods, demonstrating an additional key application of our framework. To complement this dataset, we introduce a human-supervised benchmark (CROC$^{hum}$) targeting especially challenging categories. Our results highlight robustness issues in existing metrics: for example, many fail on prompts involving negation, and all tested open-source metrics fail on at least 25% of cases involving correct identification of body parts.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Computation and Language, cs.CL

Paper

P. Müller, A. Braun, and M. Keuper

“Examining the Impact of Optical Aberrations to Image Classification and Object Detection Models,” 2025. [Online]. Available: https://arxiv.org/abs/2504.18510.

mehr

Abstract

Deep neural networks (DNNs) have proven to be successful in various computer
vision applications such that models even infer in safety-critical situations.
Therefore, vision models have to behave in a robust way to disturbances such as
noise or blur. While seminal benchmarks exist to evaluate model robustness to
diverse corruptions, blur is often approximated in an overly simplistic way to
model defocus, while ignoring the different blur kernel shapes that result from
optical systems. To study model robustness against realistic optical blur
effects, this paper proposes two datasets of blur corruptions, which we denote
OpticsBench and LensCorruptions. OpticsBench examines primary aberrations such
as coma, defocus, and astigmatism, i.e. aberrations that can be represented by
varying a single parameter of Zernike polynomials. To go beyond the principled
but synthetic setting of primary aberrations, LensCorruptions samples linear
combinations in the vector space spanned by Zernike polynomials, corresponding
to 100 real lenses. Evaluations for image classification and object detection
on ImageNet and MSCOCO show that for a variety of different pre-trained models,
the performance on OpticsBench and LensCorruptions varies significantly,
indicating the need to consider realistic image corruptions to evaluate a
model's robustness against blur.

BibTeX

@online{Mueller_2504.18510,
TITLE = {Examining the Impact of Optical Aberrations to Image Classification and Object Detection Models},
AUTHOR = {M{\"u}ller, Patrick and Braun, Alexander and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2504.18510},
EPRINT = {2504.18510},
EPRINTTYPE = {arXiv},
YEAR = {2025},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Deep neural networks (DNNs) have proven to be successful in various computer vision applications such that models even infer in safety-critical situations. Therefore, vision models have to behave in a robust way to disturbances such as noise or blur. While seminal benchmarks exist to evaluate model robustness to diverse corruptions, blur is often approximated in an overly simplistic way to model defocus, while ignoring the different blur kernel shapes that result from optical systems. To study model robustness against realistic optical blur effects, this paper proposes two datasets of blur corruptions, which we denote OpticsBench and LensCorruptions. OpticsBench examines primary aberrations such as coma, defocus, and astigmatism, i.e. aberrations that can be represented by varying a single parameter of Zernike polynomials. To go beyond the principled but synthetic setting of primary aberrations, LensCorruptions samples linear combinations in the vector space spanned by Zernike polynomials, corresponding to 100 real lenses. Evaluations for image classification and object detection on ImageNet and MSCOCO show that for a variety of different pre-trained models, the performance on OpticsBench and LensCorruptions varies significantly, indicating the need to consider realistic image corruptions to evaluate a model's robustness against blur.},
}

Endnote

%0 Report
%A Müller, Patrick
%A Braun, Alexander
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Examining the Impact of Optical Aberrations to Image Classification and Object Detection Models : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-4C9E-7
%U https://arxiv.org/abs/2504.18510
%D 2025
%X Deep neural networks (DNNs) have proven to be successful in various computer vision applications such that models even infer in safety-critical situations. Therefore, vision models have to behave in a robust way to disturbances such as noise or blur. While seminal benchmarks exist to evaluate model robustness to diverse corruptions, blur is often approximated in an overly simplistic way to model defocus, while ignoring the different blur kernel shapes that result from optical systems. To study model robustness against realistic optical blur effects, this paper proposes two datasets of blur corruptions, which we denote OpticsBench and LensCorruptions. OpticsBench examines primary aberrations such as coma, defocus, and astigmatism, i.e. aberrations that can be represented by varying a single parameter of Zernike polynomials. To go beyond the principled but synthetic setting of primary aberrations, LensCorruptions samples linear combinations in the vector space spanned by Zernike polynomials, corresponding to 100 real lenses. Evaluations for image classification and object detection on ImageNet and MSCOCO show that for a variety of different pre-trained models, the performance on OpticsBench and LensCorruptions varies significantly, indicating the need to consider realistic image corruptions to evaluate a model's robustness against blur.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

K. Prasse, M. Kleinmann, I. Adam, K. Beckersjuergen, A. Edte, J. Frroku, T. Gumpp, S. Jung, I. Bravo, S. Walter, and M. Keuper

“Deep Learning for Climate Action: Computer Vision Analysis of Visual Narratives on X,” 2025. [Online]. Available: https://arxiv.org/abs/2503.09361.

mehr

Abstract

Climate change is one of the most pressing challenges of the 21st century,
sparking widespread discourse across social media platforms. Activists,
policymakers, and researchers seek to understand public sentiment and
narratives while access to social media data has become increasingly restricted
in the post-API era. In this study, we analyze a dataset of climate
change-related tweets from X (formerly Twitter) shared in 2019, containing 730k
tweets along with the shared images. Our approach integrates statistical
analysis, image classification, object detection, and sentiment analysis to
explore visual narratives in climate discourse. Additionally, we introduce a
graphical user interface (GUI) to facilitate interactive data exploration. Our
findings reveal key themes in climate communication, highlight sentiment
divergence between images and text, and underscore the strengths and
limitations of foundation models in analyzing social media imagery. By
releasing our code and tools, we aim to support future research on the
intersection of climate change, social media, and computer vision.

BibTeX

@online{Prasse2503.09361,
TITLE = {Deep Learning for Climate Action: {Computer} Vision Analysis of Visual Narratives on {X}},
AUTHOR = {Prasse, Katharina and Kleinmann, Marcel and Adam, Inken and Beckersjuergen, Kerstin and Edte, Andreas and Frroku, Jona and Gumpp, Timotheus and Jung, Steffen and Bravo, Isaac and Walter, Stefanie and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2503.09361},
EPRINT = {2503.09361},
EPRINTTYPE = {arXiv},
YEAR = {2025},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Climate change is one of the most pressing challenges of the 21st century, sparking widespread discourse across social media platforms. Activists, policymakers, and researchers seek to understand public sentiment and narratives while access to social media data has become increasingly restricted in the post-API era. In this study, we analyze a dataset of climate change-related tweets from X (formerly Twitter) shared in 2019, containing 730k tweets along with the shared images. Our approach integrates statistical analysis, image classification, object detection, and sentiment analysis to explore visual narratives in climate discourse. Additionally, we introduce a graphical user interface (GUI) to facilitate interactive data exploration. Our findings reveal key themes in climate communication, highlight sentiment divergence between images and text, and underscore the strengths and limitations of foundation models in analyzing social media imagery. By releasing our code and tools, we aim to support future research on the intersection of climate change, social media, and computer vision.},
}

Endnote

%0 Report
%A Prasse, Katharina
%A Kleinmann, Marcel
%A Adam, Inken
%A Beckersjuergen, Kerstin
%A Edte, Andreas
%A Frroku, Jona
%A Gumpp, Timotheus
%A Jung, Steffen
%A Bravo, Isaac
%A Walter, Stefanie
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Deep Learning for Climate Action: Computer Vision Analysis of Visual Narratives on X : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-1CDE-5
%U https://arxiv.org/abs/2503.09361
%D 2025
%X Climate change is one of the most pressing challenges of the 21st century, sparking widespread discourse across social media platforms. Activists, policymakers, and researchers seek to understand public sentiment and narratives while access to social media data has become increasingly restricted in the post-API era. In this study, we analyze a dataset of climate change-related tweets from X (formerly Twitter) shared in 2019, containing 730k tweets along with the shared images. Our approach integrates statistical analysis, image classification, object detection, and sentiment analysis to explore visual narratives in climate discourse. Additionally, we introduce a graphical user interface (GUI) to facilitate interactive data exploration. Our findings reveal key themes in climate communication, highlight sentiment divergence between images and text, and underscore the strengths and limitations of foundation models in analyzing social media imagery. By releasing our code and tools, we aim to support future research on the intersection of climate change, social media, and computer vision.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,cs.SI

Paper

K. Prasse, P. Knab, S. Marton, C. Bartelt, and M. Keuper

“DCBM: Data-Efficient Visual Concept Bottleneck Models,” 2025. [Online]. Available: https://arxiv.org/abs/2412.11576.

mehr

Abstract

Concept Bottleneck Models (CBMs) enhance the interpretability of neural
networks by basing predictions on human-understandable concepts. However,
current CBMs typically rely on concept sets extracted from large language
models or extensive image corpora, limiting their effectiveness in data-sparse
scenarios. We propose Data-efficient CBMs (DCBMs), which reduce the need for
large sample sizes during concept generation while preserving interpretability.
DCBMs define concepts as image regions detected by segmentation or detection
foundation models, allowing each image to generate multiple concepts across
different granularities. This removes reliance on textual descriptions and
large-scale pre-training, making DCBMs applicable for fine-grained
classification and out-of-distribution tasks. Attribution analysis using
Grad-CAM demonstrates that DCBMs deliver visual concepts that can be localized
in test images. By leveraging dataset-specific concepts instead of predefined
ones, DCBMs enhance adaptability to new domains.

BibTeX

@online{Prasse2412.11576,
TITLE = {{DCBM}: Data-Efficient Visual Concept Bottleneck Models},
AUTHOR = {Prasse, Katharina and Knab, Patrick and Marton, Sascha and Bartelt, Christian and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2412.11576},
EPRINT = {2412.11576},
EPRINTTYPE = {arXiv},
YEAR = {2025},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Concept Bottleneck Models (CBMs) enhance the interpretability of neural networks by basing predictions on human-understandable concepts. However, current CBMs typically rely on concept sets extracted from large language models or extensive image corpora, limiting their effectiveness in data-sparse scenarios. We propose Data-efficient CBMs (DCBMs), which reduce the need for large sample sizes during concept generation while preserving interpretability. DCBMs define concepts as image regions detected by segmentation or detection foundation models, allowing each image to generate multiple concepts across different granularities. This removes reliance on textual descriptions and large-scale pre-training, making DCBMs applicable for fine-grained classification and out-of-distribution tasks. Attribution analysis using Grad-CAM demonstrates that DCBMs deliver visual concepts that can be localized in test images. By leveraging dataset-specific concepts instead of predefined ones, DCBMs enhance adaptability to new domains.},
}

Endnote

%0 Report
%A Prasse, Katharina
%A Knab, Patrick
%A Marton, Sascha
%A Bartelt, Christian
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T DCBM: Data-Efficient Visual Concept Bottleneck Models : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-BF46-9
%U https://arxiv.org/abs/2412.11576
%D 2025
%X Concept Bottleneck Models (CBMs) enhance the interpretability of neural networks by basing predictions on human-understandable concepts. However, current CBMs typically rely on concept sets extracted from large language models or extensive image corpora, limiting their effectiveness in data-sparse scenarios. We propose Data-efficient CBMs (DCBMs), which reduce the need for large sample sizes during concept generation while preserving interpretability. DCBMs define concepts as image regions detected by segmentation or detection foundation models, allowing each image to generate multiple concepts across different granularities. This removes reliance on textual descriptions and large-scale pre-training, making DCBMs applicable for fine-grained classification and out-of-distribution tasks. Attribution analysis using Grad-CAM demonstrates that DCBMs deliver visual concepts that can be localized in test images. By leveraging dataset-specific concepts instead of predefined ones, DCBMs enhance adaptability to new domains.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

J. Schmalfuss, V. Oei, L. Mehl, M. Bartsch, S. Agnihotri, M. Keuper, and A. Bruhn

“RobustSpring: Benchmarking Robustness to Image Corruptions for Optical Flow, Scene Flow and Stereo,” 2025. [Online]. Available: https://arxiv.org/abs/2505.09368.

mehr

Abstract

Standard benchmarks for optical flow, scene flow, and stereo vision algorithms generally focus on model accuracy rather than robustness to image corruptions like noise or rain. Hence, the resilience of models to such real-world perturbations is largely unquantified. To address this, we present RobustSpring, a comprehensive dataset and benchmark for evaluating robustness to image corruptions for optical flow, scene flow, and stereo models. RobustSpring applies 20 different image corruptions, including noise, blur, color changes, quality degradations, and weather distortions, in a time-, stereo-, and depth-consistent manner to the high-resolution Spring dataset, creating a suite of 20,000 corrupted images that reflect challenging conditions. RobustSpring enables comparisons of model robustness via a new corruption robustness metric. Integration with the Spring benchmark enables public two-axis evaluations of both accuracy and robustness. We benchmark a curated selection of initial models, observing that accurate models are not necessarily robust and that robustness varies widely by corruption type. RobustSpring is a new computer vision benchmark that treats robustness as a first-class citizen to foster models that combine accuracy with resilience. It will be available at spring-benchmark.org.

BibTeX

@online{Schmalfuss_2505.09368,
  author       = {Schmalfuss, Jenny and Oei, Victor and Mehl, Lukas and Bartsch, Madlen and Agnihotri, Shashank and Keuper, Margret and Bruhn, Andr{\'e}s},
  title        = {{RobustSpring}: Benchmarking Robustness to Image Corruptions for Optical Flow, Scene Flow and Stereo},
  year         = {2025},
  eprinttype   = {arXiv},
  eprint       = {2505.09368},
  doi          = {10.48550/arXiv.2505.09368},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {Standard benchmarks for optical flow, scene flow, and stereo vision algorithms generally focus on model accuracy rather than robustness to image corruptions like noise or rain. Hence, the resilience of models to such real-world perturbations is largely unquantified. To address this, we present RobustSpring, a comprehensive dataset and benchmark for evaluating robustness to image corruptions for optical flow, scene flow, and stereo models. RobustSpring applies 20 different image corruptions, including noise, blur, color changes, quality degradations, and weather distortions, in a time-, stereo-, and depth-consistent manner to the high-resolution Spring dataset, creating a suite of 20,000 corrupted images that reflect challenging conditions. RobustSpring enables comparisons of model robustness via a new corruption robustness metric. Integration with the Spring benchmark enables public two-axis evaluations of both accuracy and robustness. We benchmark a curated selection of initial models, observing that accurate models are not necessarily robust and that robustness varies widely by corruption type. RobustSpring is a new computer vision benchmark that treats robustness as a first-class citizen to foster models that combine accuracy with resilience. It will be available at https://spring-benchmark.org.},
}

Endnote

%0 Report
%A Schmalfuss, Jenny
%A Oei, Victor
%A Mehl, Lukas
%A Bartsch, Madlen
%A Agnihotri, Shashank
%A Keuper, Margret
%A Bruhn, Andrés
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T RobustSpring: Benchmarking Robustness to Image Corruptions for Optical Flow, Scene Flow and Stereo : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-72E3-C
%R 10.48550/arXiv.2505.09368
%D 2025
%X Standard benchmarks for optical flow, scene flow, and stereo vision algorithms generally focus on model accuracy rather than robustness to image corruptions like noise or rain. Hence, the resilience of models to such real-world perturbations is largely unquantified. To address this, we present RobustSpring, a comprehensive dataset and benchmark for evaluating robustness to image corruptions for optical flow, scene flow, and stereo models. RobustSpring applies 20 different image corruptions, including noise, blur, color changes, quality degradations, and weather distortions, in a time-, stereo-, and depth-consistent manner to the high-resolution Spring dataset, creating a suite of 20,000 corrupted images that reflect challenging conditions. RobustSpring enables comparisons of model robustness via a new corruption robustness metric. Integration with the Spring benchmark enables public two-axis evaluations of both accuracy and robustness. We benchmark a curated selection of initial models, observing that accurate models are not necessarily robust and that robustness varies widely by corruption type. RobustSpring is a new computer vision benchmark that treats robustness as a first-class citizen to foster models that combine accuracy with resilience. It will be available at https://spring-benchmark.org.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG

Paper

J. Xu, O. Kao, and M. Keuper

“Informed Mixing -- Improving Open Set Recognition via Attribution-based Augmentation,” 2025. [Online]. Available: https://arxiv.org/abs/2505.12803.

mehr

Abstract

Open set recognition (OSR) is devised to address the problem of detecting novel classes during model inference. Even in recent vision models, this remains an open issue which is receiving increasing attention. Thereby, a crucial challenge is to learn features that are relevant for unseen categories from given data, for which these features might not be discriminative. To facilitate this process and "optimize to learn" more diverse features, we propose GradMix, a data augmentation method that dynamically leverages gradient-based attribution maps of the model during training to mask out already learned concepts. Thus GradMix encourages the model to learn a more complete set of representative features from the same data source. Extensive experiments on open set recognition, close set classification, and out-of-distribution detection reveal that our method can often outperform the state-of-the-art. GradMix can further increase model robustness to corruptions as well as downstream classification performance for self-supervised learning, indicating its benefit for model generalization.

BibTeX

@online{Xu_2505.12803,
TITLE = {Informed Mixing -- Improving Open Set Recognition via Attribution-based Augmentation},
AUTHOR = {Xu, Jiawen and Kao, Odej and Keuper, Margret},
LANGUAGE = {eng},
DOI = {10.48550/arXiv.2505.12803},
EPRINT = {2505.12803},
EPRINTTYPE = {arXiv},
YEAR = {2025},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Open set recognition (OSR) is devised to address the problem of detecting novel classes during model inference. Even in recent vision models, this remains an open issue which is receiving increasing attention. Thereby, a crucial challenge is to learn features that are relevant for unseen categories from given data, for which these features might not be discriminative. To facilitate this process and ``optimize to learn'' more diverse features, we propose GradMix, a data augmentation method that dynamically leverages gradient-based attribution maps of the model during training to mask out already learned concepts. Thus GradMix encourages the model to learn a more complete set of representative features from the same data source. Extensive experiments on open set recognition, close set classification, and out-of-distribution detection reveal that our method can often outperform the state-of-the-art. GradMix can further increase model robustness to corruptions as well as downstream classification performance for self-supervised learning, indicating its benefit for model generalization.},
}

Endnote

%0 Report
%A Xu, Jiawen
%A Kao, Odej
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Informed Mixing -- Improving Open Set Recognition via Attribution-based Augmentation : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0011-72EE-1
%R 10.48550/arXiv.2505.12803
%D 2025
%X Open set recognition (OSR) is devised to address the problem of detecting novel classes during model inference. Even in recent vision models, this remains an open issue which is receiving increasing attention. Thereby, a crucial challenge is to learn features that are relevant for unseen categories from given data, for which these features might not be discriminative. To facilitate this process and "optimize to learn" more diverse features, we propose GradMix, a data augmentation method that dynamically leverages gradient-based attribution maps of the model during training to mask out already learned concepts. Thus GradMix encourages the model to learn a more complete set of representative features from the same data source. Extensive experiments on open set recognition, close set classification, and out-of-distribution detection reveal that our method can often outperform the state-of-the-art. GradMix can further increase model robustness to corruptions as well as downstream classification performance for self-supervised learning, indicating its benefit for model generalization.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG

2024

Conference paper

S. Agnihotri, J. Grabinski, and M. Keuper

“Improving Feature Stability during Upsampling - Spectral Artifacts and the Importance of Spatial Context,” in Computer Vision -- ECCV 2024, Milano, Italy, 2024.

mehr

BibTeX

@inproceedings{AgnihotriECCV24,
TITLE = {Improving Feature Stability during Upsampling -- Spectral Artifacts and the Importance of Spatial Context},
AUTHOR = {Agnihotri, Shashank and Grabinski, Julia and Keuper, Margret},
LANGUAGE = {eng},
ISBN = {978-3-031-73635-3},
DOI = {10.1007/978-3-031-73636-0_21},
PUBLISHER = {Springer},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
BOOKTITLE = {Computer Vision -- ECCV 2024},
EDITOR = {Leonardis, Ale{\v s} and Ricci, Elisa and Roth, Stefan and Russakovsky, Olga and Sattler, Torsten and Varol, G{\"u}l},
PAGES = {357--376},
SERIES = {Lecture Notes in Computer Science},
VOLUME = {15116},
ADDRESS = {Milano, Italy},
}

Endnote

%0 Conference Proceedings
%A Agnihotri, Shashank
%A Grabinski, Julia
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Improving Feature Stability during Upsampling - Spectral Artifacts and the Importance of Spatial Context : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-D878-7
%R 10.1007/978-3-031-73636-0_21
%D 2024
%B 18th European Conference on Computer Vision 
%Z date of event: 2024-09-29 - 2024-10-04
%C Milano, Italy
%B Computer Vision -- ECCV 2024
%E Leonardis, Aleš; Ricci, Elisa; Roth, Stefan; Russakovsky, Olga; Sattler, Torsten; Varol, Gül
%P 357 - 376
%I Springer
%@ 978-3-031-73635-3
%B Lecture Notes in Computer Science
%N 15116
%U https://rdcu.be/d2ndn

Conference paper

U. A. Kaplan, Y. Li, M. Keuper, A. Khoreva, and D. Zhang

“Domain-Aware Fine-Tuning of Foundation Models,” in ICML 2024 Workshop on Foundation Models in the Wild (ICML 2024 FM-Wild Workshop), Vienna, Austria, 2024.

mehr

BibTeX

@inproceedings{kaplan2024domainaware,
TITLE = {Domain-Aware Fine-Tuning of Foundation Models},
AUTHOR = {Kaplan, U{\u g}ur Ali and Li, Yumeng and Keuper, Margret and Khoreva, Anna and Zhang, Dan},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=fIc8BXTKVc},
PUBLISHER = {OpenReview.net},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
BOOKTITLE = {ICML 2024 Workshop on Foundation Models in the Wild (ICML 2024 FM-Wild Workshop)},
ADDRESS = {Vienna, Austria},
}

Endnote

%0 Conference Proceedings
%A Kaplan, Uğur Ali
%A Li, Yumeng
%A Keuper, Margret
%A Khoreva, Anna
%A Zhang, Dan
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Domain-Aware Fine-Tuning of Foundation Models : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-4386-B
%U https://openreview.net/forum?id=fIc8BXTKVc
%D 2024
%B ICML 2024 Workshop on Foundation Models in the Wild
%Z date of event: 2024-07-27 - 2024-07-27
%C Vienna, Austria
%B ICML 2024 Workshop on Foundation Models in the Wild
%I OpenReview.net

Conference paper

H. Sommerhoff, S. Agnihotri, M. Saleh, M. Moeller, M. Keuper, and B. Choubey

“Task Driven Sensor Layouts - Joint Optimization of Pixel Layout and Network Parameters,” in IEEE International Conference on Computational Photography (ICCP 2024), Lausanne, Switzerland, 2024.

mehr

BibTeX

@inproceedings{Sommerhoff24,
TITLE = {Task Driven Sensor Layouts -- Joint Optimization of Pixel Layout and Network Parameters},
AUTHOR = {Sommerhoff, Hendrik and Agnihotri, Shashank and Saleh, Mohamed and Moeller, Michael and Keuper, Margret and Choubey, Bhaskar},
LANGUAGE = {eng},
ISBN = {979-8-3503-6155-1},
DOI = {10.1109/ICCP61108.2024.10644474},
PUBLISHER = {IEEE},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
BOOKTITLE = {IEEE International Conference on Computational Photography (ICCP 2024)},
PAGES = {1--10},
ADDRESS = {Lausanne, Switzerland},
}

Endnote

%0 Conference Proceedings
%A Sommerhoff, Hendrik
%A Agnihotri, Shashank
%A Saleh, Mohamed
%A Moeller, Michael
%A Keuper, Margret
%A Choubey, Bhaskar
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Task Driven Sensor Layouts - Joint Optimization of Pixel Layout and Network Parameters : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-2CF6-8
%R 10.1109/ICCP61108.2024.10644474
%D 2024
%B IEEE International Conference on Computational Photography
%Z date of event: 2024-07-22 - 2024-07-24
%C Lausanne, Switzerland
%B IEEE International Conference on Computational Photography
%P 1 - 10
%I IEEE
%@ 979-8-3503-6155-1

Article

Y. Li, D. Zhang, M. Keuper, and A. Khoreva

“Intra- & Extra-Source Exemplar-Based Style Synthesis for Improved Domain Generalization,” International Journal of Computer Vision, vol. 132, 2024.

mehr

BibTeX

% Journal article, International Journal of Computer Vision, volume 132 (2024).
@article{Li_2023,
  author       = {Li, Yumeng and Zhang, Dan and Keuper, Margret and Khoreva, Anna},
  title        = {Intra- \& Extra-Source Exemplar-Based Style Synthesis for Improved Domain Generalization},
  journal      = {International Journal of Computer Vision},
  volume       = {132},
  pages        = {446--465},
  publisher    = {Springer},
  address      = {New York, NY},
  year         = {2024},
  date         = {2024},
  issn         = {0920-5691},
  doi          = {10.1007/s11263-023-01878-8},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Journal Article
%A Li, Yumeng
%A Zhang, Dan
%A Keuper, Margret
%A Khoreva, Anna
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Intra- & Extra-Source Exemplar-Based Style Synthesis for Improved
Domain Generalization : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000D-CA41-6
%R 10.1007/s11263-023-01878-8
%7 2023
%D 2024
%J International Journal of Computer Vision
%O Int. J. Comput. Vis.
%V 132
%& 446
%P 446 - 465
%I Springer
%C New York, NY
%@ 0920-5691

Conference paper

P. Gavrikov, S. Agnihotri, M. Keuper, and J. Keuper

“How Do Training Methods Influence the Utilization of Vision Models?,” in Interpretable AI: Past, Present and Future (IAI Workshop @ NeurIPS 2024), Vancouver, Canada, 2024.

mehr

BibTeX

% Workshop paper (Interpretable AI workshop at NeurIPS 2024), published via OpenReview.
% URL carries only the paper's OpenReview page so that it remains a single valid link;
% the workshop homepage is kept in the non-standard (ignored) EVENTURL field.
@inproceedings{Gavrikov_2410.14470,
TITLE = {How Do Training Methods Influence the Utilization of Vision Models?},
AUTHOR = {Gavrikov, Paul and Agnihotri, Shashank and Keuper, Margret and Keuper, Janis},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=zJFvjdW9JS},
EVENTURL = {https://interpretable-ai-workshop.github.io/},
PUBLISHER = {OpenReview.net},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
BOOKTITLE = {Interpretable AI: Past, Present and Future (IAI Workshop @ NeurIPS 2024)},
PAGES = {1--13},
ADDRESS = {Vancouver, Canada},
}

Endnote

%0 Conference Proceedings
%A Gavrikov, Paul
%A Agnihotri, Shashank
%A Keuper, Margret
%A Keuper, Janis
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T How Do Training Methods Influence the Utilization of Vision Models? : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-5E29-8
%U https://openreview.net/forum?id=zJFvjdW9JS
%D 2024
%B Interpretable AI Workshop
%Z date of event: 2024-12-15 - 2024-12-15
%C Vancouver, Canada
%B Interpretable AI: Past, Present and Future
%P 1 - 13
%I OpenReview.net
%U https://github.com/paulgavrikov/layer_criticality

Conference paper

K. Prasse, S. Jung, Y. Zhou, and M. Keuper

“Local Spherical Harmonics Improve Skeleton-Based Hand Action Recognition,” in Pattern Recognition (DAGM GCPR 2024), Munich, Germany.

mehr

Abstract

Hand action recognition is essential. Communication, human-robot
interactions, and gesture control are dependent on it. Skeleton-based action
recognition traditionally includes hands, which belong to the classes which
remain challenging to correctly recognize to date. We propose a method
specifically designed for hand action recognition which uses relative angular
embeddings and local Spherical Harmonics to create novel hand representations.
The use of Spherical Harmonics creates rotation-invariant representations which
make hand action recognition even more robust against inter-subject differences
and viewpoint changes. We conduct extensive experiments on the hand joints in
the First-Person Hand Action Benchmark with RGB-D Videos and 3D Hand Pose
Annotations, and on the NTU RGB+D 120 dataset, demonstrating the benefit of
using Local Spherical Harmonics Representations. Our code is available at
github.com/KathPra/LSHR_LSHT.

BibTeX

% Conference paper accepted at DAGM GCPR 2024 (Springer, Lecture Notes in Computer Science).
% The abstract is plain LaTeX-safe text: HTML line-break tags removed, underscore escaped.
@inproceedings{Prasse_DAGMGCPR24,
TITLE = {Local Spherical Harmonics Improve Skeleton-Based Hand Action Recognition},
AUTHOR = {Prasse, Katharina and Jung, Steffen and Zhou, Yuxuan and Keuper, Margret},
LANGUAGE = {eng},
PUBLISHER = {Springer},
YEAR = {2024},
PUBLREMARK = {Accepted},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Hand action recognition is essential. Communication, human-robot interactions, and gesture control are dependent on it. Skeleton-based action recognition traditionally includes hands, which belong to the classes which remain challenging to correctly recognize to date. We propose a method specifically designed for hand action recognition which uses relative angular embeddings and local Spherical Harmonics to create novel hand representations. The use of Spherical Harmonics creates rotation-invariant representations which make hand action recognition even more robust against inter-subject differences and viewpoint changes. We conduct extensive experiments on the hand joints in the First-Person Hand Action Benchmark with RGB-D Videos and 3D Hand Pose Annotations, and on the NTU RGB+D 120 dataset, demonstrating the benefit of using Local Spherical Harmonics Representations. Our code is available at https://github.com/KathPra/LSHR\_LSHT.},
BOOKTITLE = {Pattern Recognition (DAGM GCPR 2024)},
SERIES = {Lecture Notes in Computer Science},
ADDRESS = {Munich, Germany},
}

Endnote

%0 Conference Proceedings
%A Prasse, Katharina
%A Jung, Steffen
%A Zhou, Yuxuan
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Local Spherical Harmonics Improve Skeleton-Based Hand Action Recognition : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-8B4A-F
%D 2024
%B German Conference on Pattern Recognition
%Z date of event: 2024-09-10 - 2024-09-13
%C Munich, Germany
%X   Hand action recognition is essential. Communication, human-robot interactions, and gesture control are dependent on it. Skeleton-based action recognition traditionally includes hands, which belong to the classes which remain challenging to correctly recognize to date. We propose a method specifically designed for hand action recognition which uses relative angular embeddings and local Spherical Harmonics to create novel hand representations. The use of Spherical Harmonics creates rotation-invariant representations which make hand action recognition even more robust against inter-subject differences and viewpoint changes. We conduct extensive experiments on the hand joints in the First-Person Hand Action Benchmark with RGB-D Videos and 3D Hand Pose Annotations, and on the NTU RGB+D 120 dataset, demonstrating the benefit of using Local Spherical Harmonics Representations. Our code is available at https://github.com/KathPra/LSHR_LSHT.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV
%B Pattern Recognition
%I Springer
%B Lecture Notes in Computer Science

Conference paper

S. Agnihotri, S. Jung, and M. Keuper

“CosPGD: An Efficient White-Box Adversarial Attack for Pixel-Wise Prediction Tasks,” in Proceedings of the 41st International Conference on Machine Learning (ICML 2024), Vienna, Austria, 2024.

mehr

BibTeX

% ICML 2024 conference paper, published in PMLR volume 235, pages 416--451.
% Case-protected words in the title are braced as whole words.
@inproceedings{Agnihotri_ICML24,
TITLE = {{CosPGD}: {An} Efficient White-Box Adversarial Attack for Pixel-Wise Prediction Tasks},
AUTHOR = {Agnihotri, Shashank and Jung, Steffen and Keuper, Margret},
LANGUAGE = {eng},
ISSN = {1938-7228},
URL = {https://proceedings.mlr.press/v235/},
PUBLISHER = {MLR Press},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
BOOKTITLE = {Proceedings of the 41st International Conference on Machine Learning (ICML 2024)},
EDITOR = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
PAGES = {416--451},
SERIES = {Proceedings of Machine Learning Research},
VOLUME = {235},
ADDRESS = {Vienna, Austria},
}

Endnote

%0 Conference Proceedings
%A Agnihotri, Shashank
%A Jung, Steffen
%A Keuper, Margret
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T CosPGD: An Efficient White-Box Adversarial Attack for Pixel-Wise 
Prediction Tasks : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-3F1E-B
%D 2024
%B 41st International Conference on Machine Learning
%Z date of event: 2024-07-21 - 2024-07-27
%C Vienna, Austria
%B Proceedings of the 41st International Conference on Machine Learning
%E Salakhutdinov, Ruslan; Kolter, Zico; Heller, Katherine; Weller, Adrian; Oliver, Nuria; Scarlett, Jonathan; Berkenkamp, Felix
%P 416 - 451
%I MLR Press
%U https://proceedings.mlr.press/v235/
%B Proceedings of Machine Learning Research
%N 235
%@ 1938-7228

Conference paper

J. P. Schneider, M. Fatima, J. Lukasik, A. Kolb, M. Keuper, and M. Moeller

“Implicit Representations for Constrained Image Segmentation,” in Proceedings of the 41st International Conference on Machine Learning (ICML 2024), Vienna, Austria, 2024.

mehr

BibTeX

% ICML 2024 conference paper, published in PMLR volume 235, pages 43765--43790.
@inproceedings{convexSegICML2024,
TITLE = {Implicit Representations for Constrained Image Segmentation},
AUTHOR = {Schneider, Jan Philipp and Fatima, Mishal and Lukasik, Jovita and Kolb, Andreas and Keuper, Margret and Moeller, Michael},
LANGUAGE = {eng},
ISSN = {1938-7228},
URL = {https://proceedings.mlr.press/v235/},
PUBLISHER = {MLR Press},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
BOOKTITLE = {Proceedings of the 41st International Conference on Machine Learning (ICML 2024)},
EDITOR = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
PAGES = {43765--43790},
SERIES = {Proceedings of Machine Learning Research},
VOLUME = {235},
ADDRESS = {Vienna, Austria},
}

Endnote

%0 Conference Proceedings
%A Schneider, Jan Philipp
%A Fatima, Mishal
%A Lukasik, Jovita
%A Kolb, Andreas
%A Keuper, Margret
%A Moeller, Michael
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Implicit Representations for Constrained Image Segmentation : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-610A-9
%D 2024
%B 41st International Conference on Machine Learning
%Z date of event: 2024-07-21 - 2024-07-27
%C Vienna, Austria
%B Proceedings of the 41st International Conference on Machine Learning
%E Salakhutdinov, Ruslan; Kolter, Zico; Heller, Katherine; Weller, Adrian; Oliver, Nuria; Scarlett, Jonathan; Berkenkamp, Felix
%P 43765 - 43790
%I MLR Press
%U https://proceedings.mlr.press/v235/
%B Proceedings of Machine Learning Research
%N 235
%@ 1938-7228

Conference paper

Y. Zhou, M. Fritz, and M. Keuper

“MultiMax: Sparse and Multi-Modal Attention Learning,” in Proceedings of the 41st International Conference on Machine Learning (ICML 2024), Vienna, Austria, 2024.

mehr

BibTeX

% ICML 2024 conference paper, published in PMLR volume 235, pages 61897--61912.
% Case-protected words in the title are braced as whole words.
@inproceedings{multimaxICML2024,
TITLE = {{MultiMax}: {Sparse} and Multi-Modal Attention Learning},
AUTHOR = {Zhou, Yuxuan and Fritz, Mario and Keuper, Margret},
LANGUAGE = {eng},
ISSN = {1938-7228},
URL = {https://proceedings.mlr.press/v235/},
PUBLISHER = {MLR Press},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
BOOKTITLE = {Proceedings of the 41st International Conference on Machine Learning (ICML 2024)},
EDITOR = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
PAGES = {61897--61912},
SERIES = {Proceedings of Machine Learning Research},
VOLUME = {235},
ADDRESS = {Vienna, Austria},
}

Endnote

%0 Conference Proceedings
%A Zhou, Yuxuan
%A Fritz, Mario
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T MultiMax: Sparse and Multi-Modal Attention Learning : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-6103-0
%D 2024
%B 41st International Conference on Machine Learning
%Z date of event: 2024-07-21 - 2024-07-27
%C Vienna, Austria
%B Proceedings of the 41st International Conference on Machine Learning
%E Salakhutdinov, Ruslan; Kolter, Zico; Heller, Katherine; Weller, Adrian; Oliver, Nuria; Scarlett, Jonathan; Berkenkamp, Felix
%P 61897 - 61912
%I MLR Press
%U https://proceedings.mlr.press/v235/
%B Proceedings of Machine Learning Research
%N 235
%@ 1938-7228

Conference paper

Y. Li, M. Keuper, D. Zhang, and A. Khoreva

“Adversarial Supervision Makes Layout-to-Image Diffusion Models Thrive,” in The Twelfth International Conference on Learning Representations (ICLR 2024), Vienna, Austria, 2024.

mehr

BibTeX

% ICLR 2024 conference paper, published via OpenReview.
% URL carries only the paper's OpenReview page so that it remains a single valid link;
% the conference homepage is kept in the non-standard (ignored) EVENTURL field.
@inproceedings{li2024aldm,
TITLE = {Adversarial Supervision Makes Layout-to-Image Diffusion Models Thrive},
AUTHOR = {Li, Yumeng and Keuper, Margret and Zhang, Dan and Khoreva, Anna},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=EJPIzl7mgc},
EVENTURL = {https://iclr.cc/Conferences/2024},
PUBLISHER = {OpenReview.net},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
BOOKTITLE = {The Twelfth International Conference on Learning Representations (ICLR 2024)},
PAGES = {1--23},
ADDRESS = {Vienna, Austria},
}

Endnote

%0 Conference Proceedings
%A Li, Yumeng
%A Keuper, Margret
%A Zhang, Dan
%A Khoreva, Anna
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
%T Adversarial Supervision Makes Layout-to-Image Diffusion Models Thrive : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-23F1-9
%U https://openreview.net/forum?id=EJPIzl7mgc
%D 2024
%B Twelfth International Conference on Learning Representations
%Z date of event: 2024-05-07 - 2024-05-11
%C Vienna, Austria
%B The Twelfth International Conference on Learning Representations
%P 1 - 23
%I OpenReview.net


%U https://yumengli007.github.io/ALDM/

Article

K. Bäuerle, P. Müller, S. M. Kazim, I. Ihrke, and M. Keuper

“Learning the essential in less than 2k additional weights - a simple approach to improve image classification stability under corruptions,” Transactions on Machine Learning Research, vol. 2024, no. 6, 2024.

mehr

BibTeX

% Journal article, Transactions on Machine Learning Research, volume 2024, number 6.
% The entry has no DOI, so the OpenReview page listed for this record serves as its locator.
@article{BaeuerleTMLR24,
TITLE = {Learning the essential in less than 2k additional weights -- a simple approach to improve image classification stability under corruptions},
AUTHOR = {B{\"a}uerle, Kai and M{\"u}ller, Patrick and Kazim, Syed Muhammad and Ihrke, Ivo and Keuper, Margret},
LANGUAGE = {eng},
ISSN = {2835-8856},
URL = {https://openreview.net/forum?id=i2SuGWtIIm},
PUBLISHER = {TMLR},
ADDRESS = {New York, NY},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
JOURNAL = {Transactions on Machine Learning Research},
VOLUME = {2024},
NUMBER = {6},
PAGES = {1--18},
}

Endnote

%0 Journal Article
%A Bäuerle, Kai
%A Müller, Patrick
%A Kazim, Syed Muhammad
%A Ihrke, Ivo
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Learning the essential in less than 2k additional weights - a simple approach to improve image classification stability under corruptions : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-4378-C
%7 2024-09-17
%D 2024
%J Transactions on Machine Learning Research
%V 2024
%N 6
%& 1
%P 1 - 18
%I TMLR
%C New York, NY
%@ 2835-8856
%U https://openreview.net/forum?id=i2SuGWtIIm

Article

J. Grabinski, J. Keuper, and M. Keuper

“As large as it gets - Studying Infinitely Large Convolutions via Neural Implicit Frequency Filters,” Transactions on Machine Learning Research, vol. 2024, 2024.

mehr

BibTeX

% Journal article, Transactions on Machine Learning Research, volume 2024, pages 1--42.
@article{grabinski2024as,
  author       = {Grabinski, Julia and Keuper, Janis and Keuper, Margret},
  title        = {As large as it gets -- Studying Infinitely Large Convolutions via Neural Implicit Frequency Filters},
  journal      = {Transactions on Machine Learning Research},
  volume       = {2024},
  pages        = {1--42},
  publisher    = {TMLR},
  address      = {New York, NY},
  year         = {2024},
  date         = {2024},
  issn         = {2835-8856},
  url          = {https://openreview.net/forum?id=xRy1YRcHWj},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Journal Article
%A Grabinski, Julia
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T As large as it gets - Studying Infinitely Large Convolutions via Neural Implicit Frequency Filters : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-60FF-6
%U https://openreview.net/forum?id=xRy1YRcHWj
%7 2024
%D 2024
%J Transactions on Machine Learning Research
%V 2024
%& 1
%P 1 - 42
%I TMLR
%C New York, NY
%@ 2835-8856

Paper

S. Agnihotri, J. Grabinski, J. Keuper, and M. Keuper

“Beware of Aliases -- Signal Preservation is Crucial for Robust Image Restoration,” 2024. [Online]. Available: https://arxiv.org/abs/2406.07435.

mehr

Abstract

Image restoration networks are usually comprised of an encoder and a decoder,
responsible for aggregating image content from noisy, distorted data and to
restore clean, undistorted images, respectively. Data aggregation as well as
high-resolution image generation both usually come at the risk of involving
aliases, i.e. standard architectures put their ability to reconstruct the model
input in jeopardy to reach high PSNR values on validation data. The price to be
paid is low model robustness. In this work, we show that simply providing
alias-free paths in state-of-the-art reconstruction transformers supports
improved model robustness at low costs on the restoration performance. We do so
by proposing BOA-Restormer, a transformer-based image restoration model that
executes downsampling and upsampling operations partly in the frequency domain
to ensure alias-free paths along the entire model while potentially preserving
all relevant high-frequency information.

BibTeX

% arXiv preprint 2406.07435 (2024).
% The abstract is plain LaTeX text with the HTML line-break tags removed.
@online{Agnihotri_2406.07435,
TITLE = {Beware of Aliases -- Signal Preservation is Crucial for Robust Image Restoration},
AUTHOR = {Agnihotri, Shashank and Grabinski, Julia and Keuper, Janis and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2406.07435},
EPRINT = {2406.07435},
EPRINTTYPE = {arXiv},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Image restoration networks are usually comprised of an encoder and a decoder, responsible for aggregating image content from noisy, distorted data and to restore clean, undistorted images, respectively. Data aggregation as well as high-resolution image generation both usually come at the risk of involving aliases, i.e.~standard architectures put their ability to reconstruct the model input in jeopardy to reach high PSNR values on validation data. The price to be paid is low model robustness. In this work, we show that simply providing alias-free paths in state-of-the-art reconstruction transformers supports improved model robustness at low costs on the restoration performance. We do so by proposing BOA-Restormer, a transformer-based image restoration model that executes downsampling and upsampling operations partly in the frequency domain to ensure alias-free paths along the entire model while potentially preserving all relevant high-frequency information.},
}

Endnote

%0 Report
%A Agnihotri, Shashank
%A Grabinski, Julia
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Beware of Aliases -- Signal Preservation is Crucial for Robust Image
  Restoration : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-5E20-1
%U https://arxiv.org/abs/2406.07435
%D 2024
%X   Image restoration networks are usually comprised of an encoder and a decoder, responsible for aggregating image content from noisy, distorted data and to restore clean, undistorted images, respectively. Data aggregation as well as high-resolution image generation both usually come at the risk of involving aliases, i.e. standard architectures put their ability to reconstruct the model input in jeopardy to reach high PSNR values on validation data. The price to be paid is low model robustness. In this work, we show that simply providing alias-free paths in state-of-the-art reconstruction transformers supports improved model robustness at low costs on the restoration performance. We do so by proposing BOA-Restormer, a transformer-based image restoration model that executes downsampling and upsampling operations partly in the frequency domain to ensure alias-free paths along the entire model while potentially preserving all relevant high-frequency information.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG,eess.IV

Paper

P. Gavrikov, J. Lukasik, S. Jung, R. Geirhos, B. Lamm, M. J. Mirza, M. Keuper, and J. Keuper

“Are Vision Language Models Texture or Shape Biased and Can We Steer Them?,” 2024. [Online]. Available: https://arxiv.org/abs/2403.09193.

mehr

Abstract

Vision language models (VLMs) have drastically changed the computer vision
model landscape in only a few years, opening an exciting array of new
applications from zero-shot image classification, over to image captioning, and
visual question answering. Unlike pure vision models, they offer an intuitive
way to access visual content through language prompting. The wide applicability
of such models encourages us to ask whether they also align with human vision -
specifically, how far they adopt human-induced visual biases through multimodal
fusion, or whether they simply inherit biases from pure vision models. One
important visual bias is the texture vs. shape bias, or the dominance of local
over global information. In this paper, we study this bias in a wide range of
popular VLMs. Interestingly, we find that VLMs are often more shape-biased than
their vision encoders, indicating that visual biases are modulated to some
extent through text in multimodal models. If text does indeed influence visual
biases, this suggests that we may be able to steer visual biases not just
through visual input but also through language: a hypothesis that we confirm
through extensive experiments. For instance, we are able to steer shape bias
from as low as 49% to as high as 72% through prompting alone. For now, the
strong human bias towards shape (96%) remains out of reach for all tested VLMs.

BibTeX

% arXiv preprint 2403.09193 (2024).
% The abstract is plain LaTeX-safe text: HTML line-break tags removed and percent
% signs escaped, because a bare percent sign starts a TeX comment when typeset.
@online{Gavrikov_2403.09193,
TITLE = {Are Vision Language Models Texture or Shape Biased and Can We Steer Them?},
AUTHOR = {Gavrikov, Paul and Lukasik, Jovita and Jung, Steffen and Geirhos, Robert and Lamm, Bianca and Mirza, Muhammad Jehanzeb and Keuper, Margret and Keuper, Janis},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2403.09193},
EPRINT = {2403.09193},
EPRINTTYPE = {arXiv},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Vision language models (VLMs) have drastically changed the computer vision model landscape in only a few years, opening an exciting array of new applications from zero-shot image classification, over to image captioning, and visual question answering. Unlike pure vision models, they offer an intuitive way to access visual content through language prompting. The wide applicability of such models encourages us to ask whether they also align with human vision - specifically, how far they adopt human-induced visual biases through multimodal fusion, or whether they simply inherit biases from pure vision models. One important visual bias is the texture vs. shape bias, or the dominance of local over global information. In this paper, we study this bias in a wide range of popular VLMs. Interestingly, we find that VLMs are often more shape-biased than their vision encoders, indicating that visual biases are modulated to some extent through text in multimodal models. If text does indeed influence visual biases, this suggests that we may be able to steer visual biases not just through visual input but also through language: a hypothesis that we confirm through extensive experiments. For instance, we are able to steer shape bias from as low as 49\% to as high as 72\% through prompting alone. For now, the strong human bias towards shape (96\%) remains out of reach for all tested VLMs.},
}

Endnote

%0 Report
%A Gavrikov, Paul
%A Lukasik, Jovita
%A Jung, Steffen
%A Geirhos, Robert
%A Lamm, Bianca
%A Mirza, Muhammad Jehanzeb
%A Keuper, Margret
%A Keuper, Janis
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Are Vision Language Models Texture or Shape Biased and Can We Steer
  Them? : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-5DEE-B
%U https://arxiv.org/abs/2403.09193
%D 2024
%X   Vision language models (VLMs) have drastically changed the computer vision model landscape in only a few years, opening an exciting array of new applications from zero-shot image classification, over to image captioning, and visual question answering. Unlike pure vision models, they offer an intuitive way to access visual content through language prompting. The wide applicability of such models encourages us to ask whether they also align with human vision - specifically, how far they adopt human-induced visual biases through multimodal fusion, or whether they simply inherit biases from pure vision models. One important visual bias is the texture vs. shape bias, or the dominance of local over global information. In this paper, we study this bias in a wide range of popular VLMs. Interestingly, we find that VLMs are often more shape-biased than their vision encoders, indicating that visual biases are modulated to some extent through text in multimodal models. If text does indeed influence visual biases, this suggests that we may be able to steer visual biases not just through visual input but also through language: a hypothesis that we confirm through extensive experiments. For instance, we are able to steer shape bias from as low as 49% to as high as 72% through prompting alone. For now, the strong human bias towards shape (96%) remains out of reach for all tested VLMs.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Artificial Intelligence, cs.AI,Computer Science, Learning, cs.LG,Quantitative Biology, Neurons and Cognition, q-bio.NC

Paper

T. Medi, A. Rampini, P. Reddy, P. K. Jayaraman, and M. Keuper

“3D-WAG: Hierarchical Wavelet-Guided Autoregressive Generation for High-Fidelity 3D Shapes,” 2024. [Online]. Available: https://arxiv.org/abs/2411.19037.

mehr

Abstract

Autoregressive (AR) models have achieved remarkable success in natural
language and image generation, but their application to 3D shape modeling
remains largely unexplored. Unlike diffusion models, AR models enable more
efficient and controllable generation with faster inference times, making them
especially suitable for data-intensive domains. Traditional 3D generative
models using AR approaches often rely on “next-token” predictions at the voxel
or point level. While effective for certain applications, these methods can be
restrictive and computationally expensive when dealing with large-scale 3D
data. To tackle these challenges, we introduce 3D-WAG, an AR model for 3D
implicit distance fields that can perform unconditional shape generation,
class-conditioned and also text-conditioned shape generation. Our key idea is
to encode shapes as multi-scale wavelet token maps and use a Transformer to
predict the “next higher-resolution token map” in an autoregressive manner. By
redefining 3D AR generation task as “next-scale” prediction, we reduce the
computational cost of generation compared to traditional “next-token”
prediction models, while preserving essential geometric details of 3D shapes in
a more structured and hierarchical manner. We evaluate 3D-WAG to showcase its
benefit by quantitative and qualitative comparisons with state-of-the-art
methods on widely used benchmarks. Our results show 3D-WAG achieves superior
performance in key metrics like Coverage and MMD, generating high-fidelity 3D
shapes that closely match the real data distribution.

BibTeX

% arXiv preprint 2411.19037. ABSTRACT is plain LaTeX text: no HTML markup, quotations written as ``...''.
@online{Medi2411.19037,
TITLE = {{3D}-{WAG}: Hierarchical Wavelet-Guided Autoregressive Generation for High-Fidelity {3D} Shapes},
AUTHOR = {Medi, Tejaswini and Rampini, Arianna and Reddy, Pradyumna and Jayaraman, Pradeep Kumar and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2411.19037},
EPRINT = {2411.19037},
EPRINTTYPE = {arXiv},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Autoregressive (AR) models have achieved remarkable success in natural language and image generation, but their application to 3D shape modeling remains largely unexplored. Unlike diffusion models, AR models enable more efficient and controllable generation with faster inference times, making them especially suitable for data-intensive domains. Traditional 3D generative models using AR approaches often rely on ``next-token'' predictions at the voxel or point level. While effective for certain applications, these methods can be restrictive and computationally expensive when dealing with large-scale 3D data. To tackle these challenges, we introduce 3D-WAG, an AR model for 3D implicit distance fields that can perform unconditional shape generation, class-conditioned and also text-conditioned shape generation. Our key idea is to encode shapes as multi-scale wavelet token maps and use a Transformer to predict the ``next higher-resolution token map'' in an autoregressive manner. By redefining 3D AR generation task as ``next-scale'' prediction, we reduce the computational cost of generation compared to traditional ``next-token'' prediction models, while preserving essential geometric details of 3D shapes in a more structured and hierarchical manner. We evaluate 3D-WAG to showcase its benefit by quantitative and qualitative comparisons with state-of-the-art methods on widely used benchmarks. Our results show 3D-WAG achieves superior performance in key metrics like Coverage and MMD, generating high-fidelity 3D shapes that closely match the real data distribution.},
}

Endnote

%0 Report
%A Medi, Tejaswini
%A Rampini, Arianna
%A Reddy, Pradyumna
%A Jayaraman, Pradeep Kumar
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T 3D-WAG: Hierarchical Wavelet-Guided Autoregressive Generation for
  High-Fidelity 3D Shapes : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-BF26-D
%U https://arxiv.org/abs/2411.19037
%D 2024
%X Autoregressive (AR) models have achieved remarkable success in natural language and image generation, but their application to 3D shape modeling remains largely unexplored. Unlike diffusion models, AR models enable more efficient and controllable generation with faster inference times, making them especially suitable for data-intensive domains. Traditional 3D generative models using AR approaches often rely on "next-token" predictions at the voxel or point level. While effective for certain applications, these methods can be restrictive and computationally expensive when dealing with large-scale 3D data. To tackle these challenges, we introduce 3D-WAG, an AR model for 3D implicit distance fields that can perform unconditional shape generation, class-conditioned and also text-conditioned shape generation. Our key idea is to encode shapes as multi-scale wavelet token maps and use a Transformer to predict the "next higher-resolution token map" in an autoregressive manner. By redefining 3D AR generation task as "next-scale" prediction, we reduce the computational cost of generation compared to traditional "next-token" prediction models, while preserving essential geometric details of 3D shapes in a more structured and hierarchical manner. We evaluate 3D-WAG to showcase its benefit by quantitative and qualitative comparisons with state-of-the-art methods on widely used benchmarks. Our results show 3D-WAG achieves superior performance in key metrics like Coverage and MMD, generating high-fidelity 3D shapes that closely match the real data distribution.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

T. Medi, J. Grabinski, and M. Keuper

“Towards Class-wise Robustness Analysis,” 2024. [Online]. Available: https://arxiv.org/abs/2411.19853.

mehr

Abstract

While being very successful in solving many downstream tasks, the application
of deep neural networks is limited in real-life scenarios because of their
susceptibility to domain shifts such as common corruptions, and adversarial
attacks. The existence of adversarial examples and data corruption
significantly reduces the performance of deep classification models.
Researchers have made strides in developing robust neural architectures to
bolster decisions of deep classifiers. However, most of these works rely on
effective adversarial training methods, and predominantly focus on overall
model robustness, disregarding class-wise differences in robustness, which are
critical. Exploiting weakly robust classes is a potential avenue for attackers
to fool the image recognition models. Therefore, this study investigates
class-to-class biases across adversarially trained robust classification models
to understand their latent space structures and analyze their strong and weak
class-wise properties. We further assess the robustness of classes against
common corruptions and adversarial attacks, recognizing that class
vulnerability extends beyond the number of correct classifications for a
specific class. We find that the number of false positives of classes as
specific target classes significantly impacts their vulnerability to attacks.
Through our analysis on the Class False Positive Score, we assess a fair
evaluation of how susceptible each class is to misclassification.

BibTeX

% arXiv preprint 2411.19853. ABSTRACT is plain text without HTML line-break markup.
@online{Medi2411.19853,
TITLE = {Towards Class-wise Robustness Analysis},
AUTHOR = {Medi, Tejaswini and Grabinski, Julia and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2411.19853},
EPRINT = {2411.19853},
EPRINTTYPE = {arXiv},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
ABSTRACT = {While being very successful in solving many downstream tasks, the application of deep neural networks is limited in real-life scenarios because of their susceptibility to domain shifts such as common corruptions, and adversarial attacks. The existence of adversarial examples and data corruption significantly reduces the performance of deep classification models. Researchers have made strides in developing robust neural architectures to bolster decisions of deep classifiers. However, most of these works rely on effective adversarial training methods, and predominantly focus on overall model robustness, disregarding class-wise differences in robustness, which are critical. Exploiting weakly robust classes is a potential avenue for attackers to fool the image recognition models. Therefore, this study investigates class-to-class biases across adversarially trained robust classification models to understand their latent space structures and analyze their strong and weak class-wise properties. We further assess the robustness of classes against common corruptions and adversarial attacks, recognizing that class vulnerability extends beyond the number of correct classifications for a specific class. We find that the number of false positives of classes as specific target classes significantly impacts their vulnerability to attacks. Through our analysis on the Class False Positive Score, we assess a fair evaluation of how susceptible each class is to misclassification.},
}

Endnote

%0 Report
%A Medi, Tejaswini
%A Grabinski, Julia
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Towards Class-wise Robustness Analysis : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-BF33-E
%U https://arxiv.org/abs/2411.19853
%D 2024
%X While being very successful in solving many downstream tasks, the application of deep neural networks is limited in real-life scenarios because of their susceptibility to domain shifts such as common corruptions, and adversarial attacks. The existence of adversarial examples and data corruption significantly reduces the performance of deep classification models. Researchers have made strides in developing robust neural architectures to bolster decisions of deep classifiers. However, most of these works rely on effective adversarial training methods, and predominantly focus on overall model robustness, disregarding class-wise differences in robustness, which are critical. Exploiting weakly robust classes is a potential avenue for attackers to fool the image recognition models. Therefore, this study investigates class-to-class biases across adversarially trained robust classification models to understand their latent space structures and analyze their strong and weak class-wise properties. We further assess the robustness of classes against common corruptions and adversarial attacks, recognizing that class vulnerability extends beyond the number of correct classifications for a specific class. We find that the number of false positives of classes as specific target classes significantly impacts their vulnerability to attacks. Through our analysis on the Class False Positive Score, we assess a fair evaluation of how susceptible each class is to misclassification.
%K Computer Science, Learning, cs.LG,Computer Science, Computer Vision and Pattern Recognition, cs.CV

Paper

Y. Zhou, M. Keuper, and M. Fritz

“Balancing Diversity and Risk in LLM Sampling: How to Select Your Method and Parameter for Open-Ended Text Generation,” 2024. [Online]. Available: https://arxiv.org/abs/2408.13586.

mehr

Abstract

Sampling-based decoding strategies have been widely adopted for Large
Language Models (LLMs) in numerous applications, which target a balance between
diversity and quality via temperature tuning and tail truncation (e.g., top-k
and top-p sampling). Considering the high dynamic range of the candidate
next-token given different prefixes, recent studies propose to adaptively
truncate the tail of LLM's predicted distribution. Although improved results
have been reported with these methods on open-ended text generation tasks, the
results are highly dependent on the curated truncation parameters and exemplar
text. In this paper, we propose a systematic way to estimate the intrinsic
capacity of a truncation sampling method by considering the trade-off between
diversity and risk at each decoding step, based on our collected prefix tree
which preserves the context of a full sentence. Our work provides a
comprehensive comparison between existing truncation sampling methods, as well
as their recommended parameters as a guideline for users.

BibTeX

% arXiv preprint 2408.13586. ABSTRACT is plain text without HTML line-break markup.
@online{Zhou_2408.13586,
TITLE = {Balancing Diversity and Risk in {LLM} Sampling: How to Select Your Method and Parameter for Open-Ended Text Generation},
AUTHOR = {Zhou, Yuxuan and Keuper, Margret and Fritz, Mario},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2408.13586},
EPRINT = {2408.13586},
EPRINTTYPE = {arXiv},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Sampling-based decoding strategies have been widely adopted for Large Language Models (LLMs) in numerous applications, which target a balance between diversity and quality via temperature tuning and tail truncation (e.g., top-k and top-p sampling). Considering the high dynamic range of the candidate next-token given different prefixes, recent studies propose to adaptively truncate the tail of LLM's predicted distribution. Although improved results have been reported with these methods on open-ended text generation tasks, the results are highly dependent on the curated truncation parameters and exemplar text. In this paper, we propose a systematic way to estimate the intrinsic capacity of a truncation sampling method by considering the trade-off between diversity and risk at each decoding step, based on our collected prefix tree which preserves the context of a full sentence. Our work provides a comprehensive comparison between existing truncation sampling methods, as well as their recommended parameters as a guideline for users.},
}

Endnote

%0 Report
%A Zhou, Yuxuan
%A Keuper, Margret
%A Fritz, Mario
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Balancing Diversity and Risk in LLM Sampling: How to Select Your Method
  and Parameter for Open-Ended Text Generation : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-5E23-E
%U https://arxiv.org/abs/2408.13586
%D 2024
%X Sampling-based decoding strategies have been widely adopted for Large Language Models (LLMs) in numerous applications, which target a balance between diversity and quality via temperature tuning and tail truncation (e.g., top-k and top-p sampling). Considering the high dynamic range of the candidate next-token given different prefixes, recent studies propose to adaptively truncate the tail of LLM's predicted distribution. Although improved results have been reported with these methods on open-ended text generation tasks, the results are highly dependent on the curated truncation parameters and exemplar text. In this paper, we propose a systematic way to estimate the intrinsic capacity of a truncation sampling method by considering the trade-off between diversity and risk at each decoding step, based on our collected prefix tree which preserves the context of a full sentence. Our work provides a comprehensive comparison between existing truncation sampling methods, as well as their recommended parameters as a guideline for users.
%K Computer Science, Computation and Language, cs.CL,Computer Science, Artificial Intelligence, cs.AI

2023

Conference paper

Y. Li, M. Keuper, D. Zhang, and A. Khoreva

“Divide & Bind Your Attention for Improved Generative Semantic Nursing,” in 34th British Machine Vision Conference (BMVC 2023), Aberdeen, UK, 2023.

mehr

BibTeX

% BMVC 2023 conference paper; identified by EID (no page range is given).
@inproceedings{LiBMVC23,
  author       = {Li, Yumeng and Keuper, Margret and Zhang, Dan and Khoreva, Anna},
  title        = {Divide \& Bind Your Attention for Improved Generative Semantic Nursing},
  booktitle    = {34th British Machine Vision Conference (BMVC 2023)},
  eid          = {366},
  publisher    = {BMVA Press},
  address      = {Aberdeen, UK},
  year         = {2023},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Li, Yumeng
%A Keuper, Margret
%A Zhang, Dan
%A Khoreva, Anna
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
%T Divide & Bind Your Attention for Improved Generative Semantic Nursing : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-971E-3
%D 2023
%B 34th British Machine Vision Conference
%Z date of event: 2023-11-20 - 2023-11-24
%C Aberdeen, UK
%B 34th British Machine Vision Conference
%Z sequence number: 366
%I BMVA Press

Conference paper

J. Lukasik, J. Geiping, M. Moeller, and M. Keuper

“Differentiable Architecture Search: a One-Shot Method?,” in AutoML Conference 2023, Potsdam/Berlin, Germany, 2023.

mehr

BibTeX

% URL holds only the paper's OpenReview page. The conference website is kept
% in the non-standard EVENTURL field so that URL is a single valid address.
@inproceedings{lukasik2023differentiable,
TITLE = {Differentiable Architecture Search: a One-Shot Method?},
AUTHOR = {Lukasik, Jovita and Geiping, Jonas and Moeller, Michael and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=LV-5kHj-uV5},
EVENTURL = {https://2023.automl.cc/},
PUBLISHER = {OpenReview.net},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
BOOKTITLE = {AutoML Conference 2023},
PAGES = {1--18},
ADDRESS = {Potsdam/Berlin, Germany},
}

Endnote

%0 Conference Proceedings
%A Lukasik, Jovita
%A Geiping, Jonas
%A Moeller, Michael
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Differentiable Architecture Search: a One-Shot Method?  : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-2428-C
%U https://openreview.net/forum?id=LV-5kHj-uV5
%D 2023
%B International Conference on Automated Machine Learning
%Z date of event: 2023-09-12 - 2023-09-15
%C Potsdam/Berlin, Germany
%B AutoML Conference 2023
%P 1 - 18
%I OpenReview.net

Conference paper

S. Jung, J. Lukasik, and M. Keuper

“Neural Architecture Design and Robustness: A Dataset,” in Eleventh International Conference on Learning Representations (ICLR 2023), Kigali, Rwanda, 2023.

mehr

Abstract

Deep learning models have proven to be successful in a wide
range of machine learning tasks. Yet, they are often highly sensitive to
perturbations on the input data which can lead to incorrect decisions
with high confidence, hampering their deployment for practical
use-cases. Thus, finding architectures that are (more) robust against
perturbations has received much attention in recent years. Just like the
search for well-performing architectures in terms of clean accuracy,
this usually involves a tedious trial-and-error process with one
additional challenge: the evaluation of a network's robustness is
significantly more expensive than its evaluation for clean accuracy.
Thus, the aim of this paper is to facilitate better streamlined research
on architectural design choices with respect to their impact on
robustness as well as, for example, the evaluation of surrogate measures
for robustness. We therefore borrow one of the most commonly considered
search spaces for neural architecture search for image classification,
NAS-Bench-201, which contains a manageable size of 6466 non-isomorphic
network designs. We evaluate all these networks on a range of common
adversarial attacks and corruption types and introduce a database on
neural architecture design and robustness evaluations. We further
present three exemplary use cases of this dataset, in which we (i)
benchmark robustness measurements based on Jacobian and Hessian matrices
for their robustness predictability, (ii) perform neural architecture
search on robust accuracies, and (iii) provide an initial analysis of
how architectural design choices affect robustness. We find that
carefully crafting the topology of a network can have substantial impact
on its robustness, where networks with the same parameter count range in
mean adversarial robust accuracy from 20%-41%.

BibTeX

% ICLR 2023 conference paper. ABSTRACT is LaTeX-safe: percent signs are
% escaped and it contains no HTML markup.
@inproceedings{Jung_ICLR23,
TITLE = {Neural Architecture Design and Robustness: {A} Dataset},
AUTHOR = {Jung, Steffen and Lukasik, Jovita and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=p8coElqiSDw},
PUBLISHER = {OpenReview.net},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Deep learning models have proven to be successful in a wide range of machine learning tasks. Yet, they are often highly sensitive to perturbations on the input data which can lead to incorrect decisions with high confidence, hampering their deployment for practical use-cases. Thus, finding architectures that are (more) robust against perturbations has received much attention in recent years. Just like the search for well-performing architectures in terms of clean accuracy, this usually involves a tedious trial-and-error process with one additional challenge: the evaluation of a network's robustness is significantly more expensive than its evaluation for clean accuracy. Thus, the aim of this paper is to facilitate better streamlined research on architectural design choices with respect to their impact on robustness as well as, for example, the evaluation of surrogate measures for robustness. We therefore borrow one of the most commonly considered search spaces for neural architecture search for image classification, NAS-Bench-201, which contains a manageable size of 6466 non-isomorphic network designs. We evaluate all these networks on a range of common adversarial attacks and corruption types and introduce a database on neural architecture design and robustness evaluations. We further present three exemplary use cases of this dataset, in which we (i) benchmark robustness measurements based on Jacobian and Hessian matrices for their robustness predictability, (ii) perform neural architecture search on robust accuracies, and (iii) provide an initial analysis of how architectural design choices affect robustness. We find that carefully crafting the topology of a network can have substantial impact on its robustness, where networks with the same parameter count range in mean adversarial robust accuracy from 20\%--41\%.},
BOOKTITLE = {Eleventh International Conference on Learning Representations (ICLR 2023)},
ADDRESS = {Kigali, Rwanda},
}

Endnote

%0 Conference Proceedings
%A Jung, Steffen
%A Lukasik, Jovita
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Neural Architecture Design and Robustness: A Dataset : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000C-738F-2
%U https://openreview.net/forum?id=p8coElqiSDw
%D 2023
%B Eleventh International Conference on Learning Representations
%Z date of event: 2023-05-01 - 2023-05-05
%C Kigali, Rwanda
%X Deep learning models have proven to be successful in a wide range of machine learning tasks. Yet, they are often highly sensitive to perturbations on the input data which can lead to incorrect decisions with high confidence, hampering their deployment for practical use-cases. Thus, finding architectures that are (more) robust against perturbations has received much attention in recent years. Just like the search for well-performing architectures in terms of clean accuracy, this usually involves a tedious trial-and-error process with one additional challenge: the evaluation of a network's robustness is significantly more expensive than its evaluation for clean accuracy. Thus, the aim of this paper is to facilitate better streamlined research on architectural design choices with respect to their impact on robustness as well as, for example, the evaluation of surrogate measures for robustness. We therefore borrow one of the most commonly considered search spaces for neural architecture search for image classification, NAS-Bench-201, which contains a manageable size of 6466 non-isomorphic network designs. We evaluate all these networks on a range of common adversarial attacks and corruption types and introduce a database on neural architecture design and robustness evaluations. We further present three exemplary use cases of this dataset, in which we (i) benchmark robustness measurements based on Jacobian and Hessian matrices for their robustness predictability, (ii) perform neural architecture search on robust accuracies, and (iii) provide an initial analysis of how architectural design choices affect robustness. We find that carefully crafting the topology of a network can have substantial impact on its robustness, where networks with the same parameter count range in mean adversarial robust accuracy from 20%-41%.
%B Eleventh International Conference on Learning Representations
%I OpenReview.net

Conference paper

S. Agnihotri, K. V. Gandikota, J. Grabinski, P. Chandramouli, and M. Keuper

“On the Unreasonable Vulnerability of Transformers for Image Restoration – and an Easy Fix,” in IEEE/CVF International Conference on Computer Vision Workshops (ICCVW 2023), Paris, France, 2023.

mehr

BibTeX

% Paper in the ICCVW 2023 proceedings (IEEE), pages 3709--3719.
@inproceedings{ICCVW_Agnihotri23,
  author       = {Agnihotri, Shashank and Gandikota, Kanchana Vaishnavi and Grabinski, Julia and Chandramouli, Paramanand and Keuper, Margret},
  title        = {On the Unreasonable Vulnerability of Transformers for Image Restoration -- and an Easy Fix},
  booktitle    = {IEEE/CVF International Conference on Computer Vision Workshops (ICCVW 2023)},
  pages        = {3709--3719},
  publisher    = {IEEE},
  address      = {Paris, France},
  year         = {2023},
  date         = {2023},
  isbn         = {979-8-3503-0744-3},
  doi          = {10.1109/ICCVW60793.2023.00398},
  language     = {eng},
  marginalmark = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Agnihotri, Shashank
%A Gandikota, Kanchana Vaishnavi
%A Grabinski, Julia
%A Chandramouli, Paramanand
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T On the Unreasonable Vulnerability of Transformers for Image Restoration – and an Easy Fix : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-1ABE-F
%R 10.1109/ICCVW60793.2023.00398
%D 2023
%B 4th Workshop on Adversarial Robustness in the Real World
%Z date of event: 2023-10-02 - 2023-10-02
%C Paris, France
%B IEEE/CVF International Conference on Computer Vision Workshops 
%P 3709 - 3719
%I IEEE
%@ 979-8-3503-0744-3

Conference paper

P. Müller, A. Braun, and M. Keuper

“Classification Robustness to Common Optical Aberrations,” in IEEE/CVF International Conference on Computer Vision Workshops (ICCVW 2023), Paris, France, 2023.

mehr

BibTeX

% Paper in the ICCVW 2023 proceedings (IEEE), pages 3634--3645.
@inproceedings{ICCVW_Mueller23,
TITLE = {Classification Robustness to Common Optical Aberrations},
AUTHOR = {M{\"u}ller, Patrick and Braun, Alexander and Keuper, Margret},
LANGUAGE = {eng},
ISBN = {979-8-3503-0744-3},
DOI = {10.1109/ICCVW60793.2023.00391},
PUBLISHER = {IEEE},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
DATE = {2023},
BOOKTITLE = {IEEE/CVF International Conference on Computer Vision Workshops (ICCVW 2023)},
PAGES = {3634--3645},
ADDRESS = {Paris, France},
}

Endnote

%0 Conference Proceedings
%A Müller, Patrick
%A Braun, Alexander
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Classification Robustness to Common Optical Aberrations : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-1AC3-8
%R 10.1109/ICCVW60793.2023.00391
%D 2023
%B 4th Workshop on Adversarial Robustness in the Real World
%Z date of event: 2023-10-02 - 2023-10-02
%C Paris, France
%B IEEE/CVF International Conference on Computer Vision Workshops 
%P 3634 - 3645
%I IEEE
%@ 979-8-3503-0744-3

Article

E. Levinkov, A. Kardoost, B. Andres, and M. Keuper

“Higher-Order Multicuts for Geometric Model Fitting and Motion Segmentation,” IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 45, no. 1, 2023.

mehr

Abstract

Minimum cost lifted multicut problem is a generalization of the multicut problem and is a means to optimizing a decomposition of a graph w.r.t. both positive and negative edge costs. Its main advantage is that multicut-based formulations do not require the number of components given a priori; instead, it is deduced from the solution. However, the standard multicut cost function is limited to pairwise relationships between nodes, while several important applications either require or can benefit from a higher-order cost function, i.e. hyper-edges. In this paper, we propose a pseudo-boolean formulation for a multiple model fitting problem. It is based on a formulation of any-order minimum cost lifted multicuts, which allows to partition an undirected graph with pairwise connectivity such as to minimize costs defined over any set of hyper-edges. As the proposed formulation is NP-hard and the branch-and-bound algorithm is too slow in practice, we propose an efficient local search algorithm for inference into resulting problems. We demonstrate versatility and effectiveness of our approach in several applications: geometric multiple model fitting, homography and motion estimation, motion segmentation.

BibTeX

% Journal article in IEEE TPAMI, volume 45, number 1, pages 608--622.
@article{Keuper22,
  author       = {Levinkov, Evgeny and Kardoost, Amirhossein and Andres, Bjoern and Keuper, Margret},
  title        = {Higher-Order Multicuts for Geometric Model Fitting and Motion Segmentation},
  journal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume       = {45},
  number       = {1},
  pages        = {608--622},
  year         = {2023},
  date         = {2023},
  publisher    = {IEEE},
  address      = {Piscataway, NJ},
  issn         = {0162-8828},
  doi          = {10.1109/TPAMI.2022.3148795},
  language     = {eng},
  marginalmark = {$\bullet$},
  abstract     = {Minimum cost lifted multicut problem is a generalization of the multicut problem and is a means to optimizing a decomposition of a graph w.r.t. both positive and negative edge costs. Its main advantage is that multicut-based formulations do not require the number of components given a priori; instead, it is deduced from the solution. However, the standard multicut cost function is limited to pairwise relationships between nodes, while several important applications either require or can benefit from a higher-order cost function, i.e. hyper-edges. In this paper, we propose a pseudo-boolean formulation for a multiple model fitting problem. It is based on a formulation of any-order minimum cost lifted multicuts, which allows to partition an undirected graph with pairwise connectivity such as to minimize costs defined over any set of hyper-edges. As the proposed formulation is NP-hard and the branch-and-bound algorithm is too slow in practice, we propose an efficient local search algorithm for inference into resulting problems. We demonstrate versatility and effectiveness of our approach in several applications: geometric multiple model fitting, homography and motion estimation, motion segmentation.},
}

Endnote

%0 Journal Article
%A Levinkov, Evgeny
%A Kardoost, Amirhossein
%A Andres, Bjoern
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Higher-Order Multicuts for Geometric Model Fitting and Motion Segmentation :
%G eng
%U http://hdl.handle.net/21.11116/0000-0009-F784-B
%R 10.1109/TPAMI.2022.3148795
%7 2022
%D 2023
%X Minimum cost lifted multicut problem is a generalization of the multicut problem and is a means to optimizing a decomposition of a graph w.r.t. both positive and negative edge costs. Its main advantage is that multicut-based formulations do not require the number of components given a priori; instead, it is deduced from the solution. However, the standard multicut cost function is limited to pairwise relationships between nodes, while several important applications either require or can benefit from a higher-order cost function, i.e. hyper-edges. In this paper, we propose a pseudo-boolean formulation for a multiple model fitting problem. It is based on a formulation of any-order minimum cost lifted multicuts, which allows to partition an undirected graph with pairwise connectivity such as to minimize costs defined over any set of hyper-edges. As the proposed formulation is NP-hard and the branch-and-bound algorithm is too slow in practice, we propose an efficient local search algorithm for inference into resulting problems. We demonstrate versatility and effectiveness of our approach in several applications: geometric multiple model fitting, homography and motion estimation, motion segmentation.
%J IEEE Transactions on Pattern Analysis and Machine Intelligence
%O IEEE Trans. Pattern Anal. Mach. Intell.
%V 45
%N 1
%& 608
%P 608 - 622
%I IEEE
%C Piscataway, NJ
%@ 0162-8828

Conference paper

Y. Li, D. Zhang, M. Keuper, and A. Khoreva

“Intra-Source Style Augmentation for Improved Domain Generalization,” in 2023 IEEE Winter Conference on Applications of Computer Vision (WACV 2023), Waikoloa, HI, USA, 2023.

mehr

BibTeX

@inproceedings{Li_WACV23,
  AUTHOR       = {Li, Yumeng and Zhang, Dan and Keuper, Margret and Khoreva, Anna},
  TITLE        = {Intra-Source Style Augmentation for Improved Domain Generalization},
  BOOKTITLE    = {2023 IEEE Winter Conference on Applications of Computer Vision (WACV 2023)},
  PAGES        = {509--519},
  ADDRESS      = {Waikoloa, HI, USA},
  PUBLISHER    = {IEEE},
  YEAR         = {2023},
  ISBN         = {978-1-6654-9346-8},
  DOI          = {10.1109/WACV56688.2023.00058},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Li, Yumeng
%A Zhang, Dan
%A Keuper, Margret
%A Khoreva, Anna
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Intra-Source Style Augmentation for Improved Domain Generalization : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000B-67FC-6
%R 10.1109/WACV56688.2023.00058
%D 2023
%B IEEE Winter Conference on Applications of Computer Vision
%Z date of event: 2023-01-03 - 2023-01-07
%C Waikoloa, HI, USA
%B 2023 IEEE Winter Conference on Applications of Computer Vision
%P 509 - 519
%I IEEE
%@ 978-1-6654-9346-8

Article

V. Kostyukhin, M. Keuper, I. Ibragimov, N. Owtscharenko, and M. Cristinziani

“Improving Primary-Vertex Reconstruction with a Minimum-Cost Lifted Multicut Graph Partitioning Algorithm,” Journal of Instrumentation, vol. 18, 2023.

mehr

BibTeX

@article{Kostyukhin:2023kcv,
  AUTHOR       = {Kostyukhin, V. and Keuper, Margret and Ibragimov, I. and Owtscharenko, N. and Cristinziani, M.},
  TITLE        = {Improving Primary-Vertex Reconstruction with a Minimum-Cost Lifted Multicut Graph Partitioning Algorithm},
  JOURNAL      = {Journal of Instrumentation},
  VOLUME       = {18},
  PAGES        = {1--24},
  PUBLISHER    = {IOP Publishing},
  ADDRESS      = {Bristol},
  YEAR         = {2023},
  DATE         = {2023},
  ISSN         = {1748-0221},
  DOI          = {10.1088/1748-0221/18/07/P07013},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}

Endnote

%0 Journal Article
%A Kostyukhin, V.
%A Keuper, Margret
%A Ibragimov, I.
%A Owtscharenko, N.
%A Cristinziani, M.
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
External Organizations
%T Improving Primary-Vertex Reconstruction with a Minimum-Cost Lifted Multicut Graph Partitioning Algorithm : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-242F-5
%R 10.1088/1748-0221/18/07/P07013
%7 2023
%D 2023
%J Journal of Instrumentation
%O JINST
%V 18
%& 1
%P 1 - 24
%I IOP Publishing
%C Bristol
%@ 1748-0221

Conference paper

K. Prasse, S. Jung, I. B. Bravo, S. Walter, and M. Keuper

“Towards Understanding Climate Change Perceptions: A Social Media Dataset,” in NeurIPS 2023 Workshop on Tackling Climate Change with Machine Learning, New Orleans, LA, USA, 2023.

mehr

BibTeX

@inproceedings{prasse2023towards,
TITLE = {Towards Understanding Climate Change Perceptions: {A} Social Media Dataset},
AUTHOR = {Prasse, Katharina and Jung, Steffen and Bravo, Isaac B. and Walter, Stefanie and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://www.climatechange.ai/papers/neurips2023/3},
PUBLISHER = {climatechange.ai},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
BOOKTITLE = {NeurIPS 2023 Workshop on Tackling Climate Change with Machine Learning},
EDITOR = {Bhalerao, Rasika and Roth, Mark and Jeggle, Kai and Montalvo Arvizu, Jorge and Madadkhani, Shiva and Bengio, Yoshua},
PAGES = {1--20},
ADDRESS = {New Orleans, LA, USA},
}

Endnote

%0 Conference Proceedings
%A Prasse, Katharina
%A Jung, Steffen
%A Bravo, Isaac B
%A Walter, Stefanie
%A Keuper, Margret
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Towards Understanding Climate Change Perceptions: A Social Media Dataset : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-2458-6
%U https://www.climatechange.ai/papers/neurips2023/3
%D 2023
%B Workshop on Tackling Climate Change with Machine Learning
%Z date of event: 2023-12-16 - 2023-12-16
%C New Orleans, LA, USA
%B NeurIPS 2023 Workshop on Tackling Climate Change with Machine Learning
%E Bhalerao, Rasika; Roth, Mark; Jeggle, Kai; Montalvo Arvizu, Jorge; Madadkhani, Shiva; Bengio, Yoshua
%P 1 - 20
%I climatechange.ai

Conference paper

J. Lukasik, M. Moeller, and M. Keuper

“An Evaluation of Zero-Cost Proxies - From Neural Architecture Performance Prediction to Model Robustness,” in Pattern Recognition (DAGM GCPR 2023), Heidelberg, Germany, 2023.

mehr

BibTeX

@inproceedings{10.1007/978-3-031-54605-1_40,
  AUTHOR       = {Lukasik, Jovita and Moeller, Michael and Keuper, Margret},
  TITLE        = {An Evaluation of Zero-Cost Proxies -- From Neural Architecture Performance Prediction to Model Robustness},
  BOOKTITLE    = {Pattern Recognition (DAGM GCPR 2023)},
  EDITOR       = {K{\"o}the, Ullrich and Rother, Carsten},
  SERIES       = {Lecture Notes in Computer Science},
  VOLUME       = {14264},
  PAGES        = {624--638},
  ADDRESS      = {Heidelberg, Germany},
  PUBLISHER    = {Springer},
  YEAR         = {2023},
  DATE         = {2023},
  ISBN         = {978-3-031-54604-4; 978-3-031-54605-1},
  DOI          = {10.1007/978-3-031-54605-1_40},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Lukasik, Jovita
%A Moeller, Michael
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T An Evaluation of Zero-Cost Proxies - From Neural Architecture Performance Prediction to Model Robustness : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-243C-6
%R 10.1007/978-3-031-54605-1_40
%D 2023
%B 45th German Conference on Pattern Recognition
%Z date of event: 2023-09-19 - 2023-09-22
%C Heidelberg, Germany
%B Pattern Recognition
%E Köthe, Ullrich; Rother, Carsten
%P 624 - 638
%I Springer
%@ 978-3-031-54604-4 978-3-031-54605-1
%B Lecture Notes in Computer Science
%N 14264
%U https://rdcu.be/dErn5

Conference paper

T. Medi, J. Tayyub, M. Sarmad, F. Lindseth, and M. Keuper

“FullFormer: Generating Shapes Inside Shapes,” in Pattern Recognition (DAGM GCPR 2023), Heidelberg, Germany, 2024.

mehr

BibTeX

@inproceedings{10.1007/978-3-031-54605-1_10,
TITLE = {{FullFormer}: {G}enerating Shapes Inside Shapes},
AUTHOR = {Medi, Tejaswini and Tayyub, Jawad and Sarmad, Muhammad and Lindseth, Frank and Keuper, Margret},
LANGUAGE = {eng},
ISBN = {978-3-031-54604-4; 978-3-031-54605-1},
DOI = {10.1007/978-3-031-54605-1_10},
PUBLISHER = {Springer},
YEAR = {2024},
MARGINALMARK = {$\bullet$},
DATE = {2024},
BOOKTITLE = {Pattern Recognition (DAGM GCPR 2023)},
EDITOR = {K{\"o}the, Ullrich and Rother, Carsten},
PAGES = {147--162},
SERIES = {Lecture Notes in Computer Science},
VOLUME = {14264},
ADDRESS = {Heidelberg, Germany},
}

Endnote

%0 Conference Proceedings
%A Medi, Tejaswini
%A Tayyub, Jawad
%A Sarmad, Muhammad
%A Lindseth, Frank
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T FullFormer: Generating Shapes Inside Shapes : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-2464-8
%R 10.1007/978-3-031-54605-1_10
%D 2024
%B 45th German Conference on Pattern Recognition
%Z date of event: 2023-09-19 - 2023-09-22
%C Heidelberg, Germany
%B Pattern Recognition
%E Köthe, Ullrich; Rother, Carsten
%P 147 - 162
%I Springer
%@ 978-3-031-54604-4 978-3-031-54605-1
%B Lecture Notes in Computer Science
%N 14264
%U https://rdcu.be/dErAg

Conference paper

P. Lorenz, M. Keuper, and J. Keuper

“Unfolding Local Growth Rate Estimates for (Almost) Perfect Adversarial Detection,” in Proceedings of the 18th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications. - Vol. 5, VISAPP (VISIGRAPP 2023), Lisbon, Portugal, 2023.

mehr

BibTeX

@inproceedings{LorenzVISAPP23,
TITLE = {Unfolding Local Growth Rate Estimates for (Almost) Perfect Adversarial Detection},
AUTHOR = {Lorenz, Peter and Keuper, Margret and Keuper, Janis},
LANGUAGE = {eng},
ISBN = {978-989-758-634-7},
DOI = {10.5220/0011586500003417},
PUBLISHER = {SciTePress},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
DATE = {2023},
BOOKTITLE = {Proceedings of the 18th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications. -- Vol. 5, VISAPP (VISIGRAPP 2023)},
EDITOR = {Radeva, Petia and Farinella, Giovanni Maria and Bouatouch, Kadi},
PAGES = {27--38},
ADDRESS = {Lisbon, Portugal},
}

Endnote

%0 Conference Proceedings
%A Lorenz, Peter
%A Keuper, Margret
%A Keuper, Janis
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Unfolding Local Growth Rate Estimates for (Almost) Perfect Adversarial Detection : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-A31A-9
%R 10.5220/0011586500003417
%D 2023
%B 18th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications 
%Z date of event: 2023-02-19 - 2023-02-21
%C Lisbon, Portugal
%B Proceedings of the 18th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications. - Vol. 5, VISAPP
%E Radeva, Petia; Farinella, Giovanni Maria; Bouatouch, Kadi
%P 27 - 38
%I SciTePress
%@ 978-989-758-634-7

Conference paper

P. Gavrikov, J. Keuper, and M. Keuper

“An Extended Study of Human-like Behavior under Adversarial Training,” in Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW 2023), Vancouver, Canada, 2023.

mehr

BibTeX

@inproceedings{Gavrikov_CVPRW23,
  AUTHOR       = {Gavrikov, Paul and Keuper, Janis and Keuper, Margret},
  TITLE        = {An Extended Study of Human-like Behavior under Adversarial Training},
  BOOKTITLE    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW 2023)},
  PAGES        = {2361--2368},
  ADDRESS      = {Vancouver, Canada},
  PUBLISHER    = {IEEE},
  YEAR         = {2023},
  DATE         = {2023},
  ISBN         = {979-8-3503-0249-3},
  DOI          = {10.1109/CVPRW59228.2023.00233},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}

Endnote

%0 Conference Proceedings
%A Gavrikov, Paul
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T An Extended Study of Human-like Behavior under Adversarial Training : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-A279-F
%R 10.1109/CVPRW59228.2023.00233
%D 2023
%B The 3rd Workshop of Adversarial Machine Learning on Computer Vision: Art of Robustness
%Z date of event: 2023-06-19 - 2023-06-19
%C Vancouver, Canada
%B Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops
%P 2361 - 2368
%I IEEE
%@ 979-8-3503-0249-3

Article

J. Lukasik, P. Gavrikov, J. Keuper, and M. Keuper

“Improving Native CNN Robustness with Filter Frequency Regularization,” Transactions on Machine Learning Research, vol. 2023, 2023.

mehr

BibTeX

@article{lukasik2023improving,
  AUTHOR       = {Lukasik, Jovita and Gavrikov, Paul and Keuper, Janis and Keuper, Margret},
  TITLE        = {Improving Native {CNN} Robustness with Filter Frequency Regularization},
  JOURNAL      = {Transactions on Machine Learning Research},
  VOLUME       = {2023},
  PAGES        = {1--36},
  PUBLISHER    = {TMLR},
  ADDRESS      = {New York, NY},
  YEAR         = {2023},
  ISSN         = {2835-8856},
  URL          = {https://openreview.net/forum?id=2wecNCpZ7Y},
  LANGUAGE     = {eng},
  MARGINALMARK = {$\bullet$},
}

Endnote

%0 Journal Article
%A Lukasik, Jovita
%A Gavrikov, Paul
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Improving Native CNN Robustness with Filter Frequency Regularization : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-241E-8
%U https://openreview.net/forum?id=2wecNCpZ7Y
%7 2023
%D 2023
%J Transactions on Machine Learning Research
%V 2023
%& 1
%P 1 - 36
%I TMLR
%C New York, NY
%@ 2835-8856
%U https://github.com/jovitalukasik/filter_freq_reg

Conference paper

J. P. Schneider, F. Mishal, J. Lukasik, A. Kolb, M. Keuper, and M. Moeller

“Implicit Representations for Image Segmentation,” in UniReps: The First Workshop on Unifying Representations in Neural Models, New Orleans, LA, USA, 2023.

mehr

BibTeX

@inproceedings{schneider2023implicit,
TITLE = {Implicit Representations for Image Segmentation},
AUTHOR = {Schneider, Jan Philipp and Mishal, Fatima and Lukasik, Jovita and Kolb, Andreas and Keuper, Margret and Moeller, Michael},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=LSSiDy7fG1},
PUBLISHER = {OpenReview.net},
YEAR = {2023},
BOOKTITLE = {UniReps: The First Workshop on Unifying Representations in Neural Models},
PAGES = {1--6},
ADDRESS = {New Orleans, LA, USA},
}

Endnote

%0 Conference Proceedings
%A Schneider, Jan Philipp
%A Mishal, Fatima
%A Lukasik, Jovita
%A Kolb, Andreas 
%A Keuper, Margret
%A Moeller, Michael
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Implicit Representations for Image Segmentation : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000F-240B-D
%U https://openreview.net/forum?id=LSSiDy7fG1
%D 2023
%B First Workshop on Unifying Representations in Neural Models
%Z date of event: 2023-12-15 - 2023-12-15
%C New Orleans, LA, USA
%B UniReps: The First Workshop on Unifying Representations in Neural Models
%P 1 - 6
%I OpenReview.net

Conference paper

D2D6

S. Jung, J. C. Schwedhelm, C. Schillings, and M. Keuper

“Happy People -- Image Synthesis as Black-Box Optimization Problem in the Discrete Latent Space of Deep Generative Models,” in Workshop Generative Models for Computer Vision, Vancouver, Canada, 2023.

mehr

BibTeX

@inproceedings{JungCVPR23,
TITLE = {Happy People -- Image Synthesis as Black-Box Optimization Problem in the Discrete Latent Space of Deep Generative Models},
AUTHOR = {Jung, Steffen and Schwedhelm, Jan Christian and Schillings, Claudia and Keuper, Margret},
LANGUAGE = {eng},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
BOOKTITLE = {Workshop Generative Models for Computer Vision},
EDITOR = {Kortylewski, Adam and Zhan, Fangneng and Liu, Lingjie and Sitzmann, Vincent and Yuille, Alan and Theobalt, Christian},
ADDRESS = {Vancouver, Canada},
}

Endnote

%0 Conference Proceedings
%A Jung, Steffen
%A Schwedhelm, Jan Christian
%A Schillings, Claudia
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Happy People -- Image Synthesis as Black-Box Optimization Problem in the Discrete Latent Space of Deep Generative Models : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-A088-F
%D 2023
%B Workshop Generative Models for Computer Vision 
%Z date of event: 2023-06-18 - 2023-06-18
%C Vancouver, Canada
%B Workshop Generative Models for Computer Vision 
%E Kortylewski, Adam; Zhan, Fangneng; Liu, Lingjie; Sitzmann, Vincent; Yuille, Alan; Theobalt, Christian
%U https://generative-vision.github.io/workshop-CVPR-23/data/29.pdf

Paper

J. Grabinski, J. Keuper, and M. Keuper

“Fix your downsampling ASAP! Be natively more robust via Aliasing and Spectral Artifact free Pooling,” 2023. [Online]. Available: https://arxiv.org/abs/2307.09804.

mehr

Abstract

Convolutional neural networks encode images through a sequence of
convolutions, normalizations and non-linearities as well as downsampling
operations into potentially strong semantic embeddings. Yet, previous work
showed that even slight mistakes during sampling, leading to aliasing, can be
directly attributed to the networks' lack in robustness. To address such issues
and facilitate simpler and faster adversarial training, [12] recently proposed
FLC pooling, a method for provably alias-free downsampling - in theory. In this
work, we conduct a further analysis through the lens of signal processing and
find that such current pooling methods, which address aliasing in the frequency
domain, are still prone to spectral leakage artifacts. Hence, we propose
aliasing and spectral artifact-free pooling, short ASAP. While only introducing
a few modifications to FLC pooling, networks using ASAP as downsampling method
exhibit higher native robustness against common corruptions, a property that
FLC pooling was missing. ASAP also increases native robustness against
adversarial attacks on high and low resolution data while maintaining similar
clean accuracy or even outperforming the baseline.

BibTeX

@online{Grabinski2307.09804,
TITLE = {Fix your downsampling {ASAP}! Be natively more robust via Aliasing and Spectral Artifact free Pooling},
AUTHOR = {Grabinski, Julia and Keuper, Janis and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2307.09804},
EPRINT = {2307.09804},
EPRINTTYPE = {arXiv},
YEAR = {2023},
MARGINALMARK = {$\bullet$},
ABSTRACT = {Convolutional neural networks encode images through a sequence of convolutions, normalizations and non-linearities as well as downsampling operations into potentially strong semantic embeddings. Yet, previous work showed that even slight mistakes during sampling, leading to aliasing, can be directly attributed to the networks' lack in robustness. To address such issues and facilitate simpler and faster adversarial training, [12] recently proposed FLC pooling, a method for provably alias-free downsampling -- in theory. In this work, we conduct a further analysis through the lens of signal processing and find that such current pooling methods, which address aliasing in the frequency domain, are still prone to spectral leakage artifacts. Hence, we propose aliasing and spectral artifact-free pooling, short ASAP. While only introducing a few modifications to FLC pooling, networks using ASAP as downsampling method exhibit higher native robustness against common corruptions, a property that FLC pooling was missing. ASAP also increases native robustness against adversarial attacks on high and low resolution data while maintaining similar clean accuracy or even outperforming the baseline.},
}

Endnote

%0 Report
%A Grabinski, Julia
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Fix your downsampling ASAP! Be natively more robust via Aliasing and Spectral Artifact free Pooling : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0010-8B50-7
%U https://arxiv.org/abs/2307.09804
%D 2023
%X Convolutional neural networks encode images through a sequence of convolutions, normalizations and non-linearities as well as downsampling operations into potentially strong semantic embeddings. Yet, previous work showed that even slight mistakes during sampling, leading to aliasing, can be directly attributed to the networks' lack in robustness. To address such issues and facilitate simpler and faster adversarial training, [12] recently proposed FLC pooling, a method for provably alias-free downsampling - in theory. In this work, we conduct a further analysis through the lens of signal processing and find that such current pooling methods, which address aliasing in the frequency domain, are still prone to spectral leakage artifacts. Hence, we propose aliasing and spectral artifact-free pooling, short ASAP. While only introducing a few modifications to FLC pooling, networks using ASAP as downsampling method exhibit higher native robustness against common corruptions, a property that FLC pooling was missing. ASAP also increases native robustness against adversarial attacks on high and low resolution data while maintaining similar clean accuracy or even outperforming the baseline.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV,eess.IV

2022

Conference paper

Y. Zhou, W. Xiang, C. Li, B. Wang, X. Wei, L. Zhang, M. Keuper, and X. Hua

“SP-ViT: Learning 2D Spatial Priors for Vision Transformers,” in 33rd British Machine Vision Conference (BMVC 2022), London, UK, 2022.

mehr

BibTeX

@inproceedings{ZhouBMVC22,
  AUTHOR    = {Zhou, Yuxuan and Xiang, Wangmeng and Li, Chao and Wang, Biao and Wei, Xihan and Zhang, Lei and Keuper, Margret and Hua, Xiansheng},
  TITLE     = {{SP-ViT}: {L}earning {2D} Spatial Priors for Vision Transformers},
  BOOKTITLE = {33rd British Machine Vision Conference (BMVC 2022)},
  EID       = {564},
  ADDRESS   = {London, UK},
  PUBLISHER = {BMVA Press},
  YEAR      = {2022},
  URL       = {https://bmvc2022.mpi-inf.mpg.de/564/},
  LANGUAGE  = {eng},
}

Endnote

%0 Conference Proceedings
%A Zhou, Yuxuan
%A Xiang, Wangmeng
%A Li, Chao
%A Wang, Biao
%A Wei, Xihan
%A Zhang, Lei
%A Keuper, Margret
%A Hua, Xiansheng
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T SP-ViT: Learning 2D Spatial Priors for Vision Transformers : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000B-680B-5
%U https://bmvc2022.mpi-inf.mpg.de/564/
%D 2022
%B 33rd British Machine Vision Conference
%Z date of event: 2022-11-21 - 2022-11-24
%C London, UK
%B 33rd British Machine Vision Conference
%Z sequence number: 564
%I BMVA Press

Conference paper

J. Grabinski, P. Gavrikov, J. Keuper, and M. Keuper

“Robust Models are less Over-Confident,” in Advances in Neural Information Processing Systems 35 (NeurIPS 2022), New Orleans, LA, USA, 2022.

mehr

BibTeX

@inproceedings{Grabinski_Neurips22,
TITLE = {Robust Models are less Over-Confident},
AUTHOR = {Grabinski, Julia and Gavrikov, Paul and Keuper, Janis and Keuper, Margret},
LANGUAGE = {eng},
PUBLISHER = {Curran Associates, Inc.},
YEAR = {2022},
BOOKTITLE = {Advances in Neural Information Processing Systems 35 (NeurIPS 2022)},
EDITOR = {Koyejo, S. and Mohamed, S. and Agarwal, A. and Belgrave, D. and Cho, K. and Oh, A.},
PAGES = {39059--39075},
ADDRESS = {New Orleans, LA, USA},
}

Endnote

%0 Conference Proceedings
%A Grabinski, Julia
%A Gavrikov, Paul
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Robust Models are less Over-Confident : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000B-67C7-1
%D 2022
%B 36th Conference on Neural Information Processing Systems
%Z date of event: 2022-11-28 - 2022-12-09
%C New Orleans, LA, USA
%B Advances in Neural Information Processing Systems 35
%E Koyejo, S.; Mohamed, S.; Agarwal, A.; Belgrave, D.; Cho, K.; Oh, A.
%P 39059 - 39075
%I Curran Associates, Inc.
%U https://openreview.net/forum?id=5K3uopkizS

Conference paper

A. Saseendran, K. Skubch, and M. Keuper

“Trading off Image Quality for Robustness is not Necessary with Regularized Deterministic Autoencoders,” in Advances in Neural Information Processing Systems 35 (NeurIPS 2022), New Orleans, LA, USA, 2022.

mehr

BibTeX

@inproceedings{Saseendran_Neurips22,
TITLE = {Trading off Image Quality for Robustness is not Necessary with Regularized Deterministic Autoencoders},
AUTHOR = {Saseendran, Amrutha and Skubch, Kathrin and Keuper, Margret},
LANGUAGE = {eng},
PUBLISHER = {Curran Associates, Inc.},
YEAR = {2022},
BOOKTITLE = {Advances in Neural Information Processing Systems 35 (NeurIPS 2022)},
EDITOR = {Koyejo, S. and Mohamed, S. and Agarwal, Alekh and Belgrave, Danielle and Cho, Kyunghyun and Oh, A.},
PAGES = {26751--26763},
ADDRESS = {New Orleans, LA, USA},
}

Endnote

%0 Conference Proceedings
%A Saseendran, Amrutha
%A Skubch, Kathrin
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Trading off Image Quality for Robustness is not Necessary with Regularized Deterministic Autoencoders : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000B-67CC-C
%D 2022
%B 36th Conference on Neural Information Processing Systems
%Z date of event: 2022-11-28 - 2022-12-09
%C New Orleans, LA, USA
%B Advances in Neural Information Processing Systems 35
%E Koyejo, S.; Mohamed, S.; Agarwal, Alekh; Belgrave, Danielle; Cho, Kyunghyun; Oh, A.
%P 26751 - 26763
%I Curran Associates, Inc
%U https://openreview.net/forum?id=9YasTgzma8c

Conference paper

J. Grabinski, S. Jung, J. Keuper, and M. Keuper

“FrequencyLowCut Pooling - Plug & Play against Catastrophic Overfitting,” in Computer Vision -- ECCV 2022, Tel Aviv, Israel, 2022.

mehr

BibTeX

@inproceedings{Grabinski_ECCV2022,
TITLE = {{FrequencyLowCut} Pooling -- Plug {\&} Play against Catastrophic Overfitting},
AUTHOR = {Grabinski, Julia and Jung, Steffen and Keuper, Janis and Keuper, Margret},
LANGUAGE = {eng},
ISBN = {978-3-031-19780-2},
DOI = {10.1007/978-3-031-19781-9_3},
PUBLISHER = {Springer},
YEAR = {2022},
DATE = {2022},
BOOKTITLE = {Computer Vision -- ECCV 2022},
EDITOR = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni and Hassner, Tal},
PAGES = {36--57},
SERIES = {Lecture Notes in Computer Science},
VOLUME = {13674},
ADDRESS = {Tel Aviv, Israel},
}

Endnote

%0 Conference Proceedings
%A Grabinski, Julia
%A Jung, Steffen
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T FrequencyLowCut Pooling - Plug & Play against Catastrophic Overfitting : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000A-C016-4
%R 10.1007/978-3-031-19781-9_3
%D 2022
%B 17th European Conference on Computer Vision
%Z date of event: 2022-10-23 - 2022-10-27
%C Tel Aviv, Israel
%B Computer Vision -- ECCV 2022
%E Avidan, Shai; Brostow, Gabriel; Cissé, Moustapha; Farinella, Giovanni; Hassner, Tal
%P 36 - 57
%I Springer
%@ 978-3-031-19780-2
%B Lecture Notes in Computer Science
%N 13674

Conference paper

J. Lukasik, S. Jung, and M. Keuper

“Learning Where To Look - Generative NAS is Surprisingly Efficient,” in Computer Vision -- ECCV 2022, Tel Aviv, Israel, 2022.

mehr

BibTeX

@inproceedings{Lukasik_ECCV2022,
  AUTHOR    = {Lukasik, Jovita and Jung, Steffen and Keuper, Margret},
  TITLE     = {Learning Where To Look -- Generative {NAS} is Surprisingly Efficient},
  BOOKTITLE = {Computer Vision -- ECCV 2022},
  EDITOR    = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni and Hassner, Tal},
  SERIES    = {Lecture Notes in Computer Science},
  VOLUME    = {13683},
  PAGES     = {257--273},
  ADDRESS   = {Tel Aviv, Israel},
  PUBLISHER = {Springer},
  YEAR      = {2022},
  DATE      = {2022},
  ISBN      = {978-3-031-20049-6},
  DOI       = {10.1007/978-3-031-20050-2_16},
  LANGUAGE  = {eng},
}

Endnote

%0 Conference Proceedings
%A Lukasik, Jovita
%A Jung, Steffen
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Learning Where To Look - Generative NAS is Surprisingly Efficient : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000A-C00C-0
%R 10.1007/978-3-031-20050-2_16
%D 2022
%B 17th European Conference on Computer Vision
%Z date of event: 2022-10-23 - 2022-10-27
%C Tel Aviv, Israel
%B Computer Vision -- ECCV 2022
%E Avidan, Shai; Brostow, Gabriel; Cissé, Moustapha; Farinella, Giovanni; Hassner, Tal
%P 257 - 273
%I Springer
%@ 978-3-031-20049-6
%B Lecture Notes in Computer Science
%N 13683
%U https://github.com/jovitalukasik/AG-Net

Article

J. Grabinski, J. Keuper, and M. Keuper

“Aliasing and Adversarial Robust Generalization of CNNs,” Machine Learning, vol. 111, 2022.

mehr

BibTeX

@article{Grabinski22a,
TITLE = {Aliasing and Adversarial Robust Generalization of {CNNs}},
AUTHOR = {Grabinski, Julia and Keuper, Janis and Keuper, Margret},
LANGUAGE = {eng},
ISSN = {0885-6125},
DOI = {10.1007/s10994-022-06222-8},
PUBLISHER = {Springer},
ADDRESS = {Dordrecht},
YEAR = {2022},
DATE = {2022},
JOURNAL = {Machine Learning},
VOLUME = {111},
PAGES = {3925--3951},
}

Endnote

%0 Journal Article
%A Grabinski, Julia
%A Keuper, Janis
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Aliasing and Adversarial Robust Generalization of CNNs : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000C-1BA7-A
%R 10.1007/s10994-022-06222-8
%7 2022
%D 2022
%J Machine Learning
%V 111
%& 3925
%P 3925 - 3951
%I Springer
%C Dordrecht
%@ 0885-6125

Conference paper

S. Jung and M. Keuper

“Learning to solve Minimum Cost Multicuts efficiently using Edge-Weighted Graph Convolutional Neural Networks,” in Machine Learning and Knowledge Discovery in Databases (ECML PKDD 2022), Grenoble, France, 2022.

mehr

Abstract

The minimum cost multicut problem is the NP-hard/APX-hard combinatorial
optimization problem of partitioning a real-valued edge-weighted graph such as
to minimize the total cost of the partition. While graph convolutional neural
networks (GNN) have proven to be promising in the context of combinatorial
optimization, most of them are only tailored to or tested on positive-valued
edge weights, i.e. they do not comply to the nature of the multicut problem. We
therefore adapt various GNN architectures including Graph Convolutional
Networks, Signed Graph Convolutional Networks and Graph Isomorphic Networks to
facilitate the efficient encoding of real-valued edge costs. Moreover, we
employ a reformulation of the multicut ILP constraints to a polynomial program
as loss function that allows to learn feasible multicut solutions in a scalable
way. Thus, we provide the first approach towards end-to-end trainable
multicuts. Our findings support that GNN approaches can produce good solutions
in practice while providing lower computation times and largely improved
scalability compared to LP solvers and optimized heuristics, especially when
considering large instances.

BibTeX

@inproceedings{Jung_ECML22,
TITLE = {Learning to solve Minimum Cost Multicuts efficiently using Edge-Weighted Graph Convolutional Neural Networks},
AUTHOR = {Jung, Steffen and Keuper, Margret},
LANGUAGE = {eng},
PUBLISHER = {ecmlpkdd.org},
YEAR = {2022},
ABSTRACT = {The minimum cost multicut problem is the NP-hard/APX-hard combinatorial optimization problem of partitioning a real-valued edge-weighted graph such as to minimize the total cost of the partition. While graph convolutional neural networks (GNN) have proven to be promising in the context of combinatorial optimization, most of them are only tailored to or tested on positive-valued edge weights, i.e. they do not comply to the nature of the multicut problem. We therefore adapt various GNN architectures including Graph Convolutional Networks, Signed Graph Convolutional Networks and Graph Isomorphic Networks to facilitate the efficient encoding of real-valued edge costs. Moreover, we employ a reformulation of the multicut ILP constraints to a polynomial program as loss function that allows to learn feasible multicut solutions in a scalable way. Thus, we provide the first approach towards end-to-end trainable multicuts. Our findings support that GNN approaches can produce good solutions in practice while providing lower computation times and largely improved scalability compared to LP solvers and optimized heuristics, especially when considering large instances.},
BOOKTITLE = {Machine Learning and Knowledge Discovery in Databases (ECML PKDD 2022)},
PAGES = {1--17},
EID = {486},
ADDRESS = {Grenoble, France},
}

Endnote

%0 Conference Proceedings
%A Jung, Steffen
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Learning to solve Minimum Cost Multicuts efficiently using Edge-Weighted
  Graph Convolutional Neural Networks : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000A-C01E-C
%D 2022
%B European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases
%Z date of event: 2022-09-19 - 2022-09-23
%C Grenoble, France
%X   The minimum cost multicut problem is the NP-hard/APX-hard combinatorial optimization problem of partitioning a real-valued edge-weighted graph such as to minimize the total cost of the partition. While graph convolutional neural networks (GNN) have proven to be promising in the context of combinatorial optimization, most of them are only tailored to or tested on positive-valued edge weights, i.e. they do not comply to the nature of the multicut problem. We therefore adapt various GNN architectures including Graph Convolutional Networks, Signed Graph Convolutional Networks and Graph Isomorphic Networks to facilitate the efficient encoding of real-valued edge costs. Moreover, we employ a reformulation of the multicut ILP constraints to a polynomial program as loss function that allows to learn feasible multicut solutions in a scalable way. Thus, we provide the first approach towards end-to-end trainable multicuts. Our findings support that GNN approaches can produce good solutions in practice while providing lower computation times and largely improved scalability compared to LP solvers and optimized heuristics, especially when considering large instances.
%B Machine Learning and Knowledge Discovery in Databases
%P 1 - 17
%Z sequence number: 486
%I ecmlpkdd.org
%U https://2022.ecmlpkdd.org/wp-content/uploads/2022/09/sub_486.pdf

Conference paper

P. Müller, A. Braun, and M. Keuper

“Impact of Realistic Properties of the Point Spread Function on Classification Tasks to Reveal a Possible Distribution Shift,” in NeurIPS 2022 Workshop on Distribution Shifts: Connecting Methods and Applications (NeurIPS 2022 Workshop DistShift), New Orleans, LA, USA, 2022.

mehr

BibTeX

@inproceedings{Mueller_NEURIPSW22,
TITLE = {Impact of Realistic Properties of the Point Spread Function on Classification Tasks to Reveal a Possible Distribution Shift},
AUTHOR = {M{\"u}ller, Patrick and Braun, Alexander and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=r7WJpE3oy0},
PUBLISHER = {OpenReview.net},
YEAR = {2022},
BOOKTITLE = {NeurIPS 2022 Workshop on Distribution Shifts: Connecting Methods and Applications (NeurIPS 2022 Workshop DistShift)},
ADDRESS = {New Orleans, LA, USA},
}

Endnote

%0 Conference Proceedings
%A M&#252;ller, Patrick
%A Braun, Alexander
%A Keuper, Margret
%+ External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Impact of Realistic Properties of the Point Spread Function on Classification Tasks to Reveal a Possible Distribution Shift : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000B-67EE-6
%U https://openreview.net/forum?id=r7WJpE3oy0
%D 2022
%B NeurIPS 2022 Workshop on Distribution Shifts: Connecting Methods and Applications
%Z date of event: 2022-12-03 - 2022-12-03
%C New Orleans, LA, USA
%B NeurIPS 2022 Workshop on Distribution Shifts: Connecting Methods and Applications
%I OpenReview.net


%U https://openreview.net/forum?id=r7WJpE3oy0

Conference paper

S. Jung, S. Ziegler, A. Kardoost, and M. Keuper

“Optimizing Edge Detection for Image Segmentation with Multicut Penalties,” in Pattern Recognition (DAGM GCPR 2022), Konstanz, Germany, 2022.

mehr

Abstract

The Minimum Cost Multicut Problem (MP) is a popular way for obtaining a graph
decomposition by optimizing binary edge labels over edge costs. While the
formulation of a MP from independently estimated costs per edge is highly
flexible and intuitive, solving the MP is NP-hard and time-expensive. As a
remedy, recent work proposed to predict edge probabilities with awareness to
potential conflicts by incorporating cycle constraints in the prediction
process. We argue that such formulation, while providing a first step towards
end-to-end learnable edge weights, is suboptimal, since it is built upon a
loose relaxation of the MP. We therefore propose an adaptive CRF that allows to
progressively consider more violated constraints and, in consequence, to issue
solutions with higher validity. Experiments on the BSDS500 benchmark for
natural image segmentation as well as on electron microscopic recordings show
that our approach yields more precise edge detection and image segmentation.

BibTeX

@inproceedings{Jung_GCPR2022,
TITLE = {Optimizing Edge Detection for Image Segmentation with Multicut Penalties},
AUTHOR = {Jung, Steffen and Ziegler, Sebastian and Kardoost, Amirhossein and Keuper, Margret},
LANGUAGE = {eng},
ISBN = {978-3-031-16787-4},
DOI = {10.1007/978-3-031-16788-1_12},
PUBLISHER = {Springer},
YEAR = {2022},
DATE = {2022},
ABSTRACT = {The Minimum Cost Multicut Problem (MP) is a popular way for obtaining a graph decomposition by optimizing binary edge labels over edge costs. While the formulation of a MP from independently estimated costs per edge is highly flexible and intuitive, solving the MP is NP-hard and time-expensive. As a remedy, recent work proposed to predict edge probabilities with awareness to potential conflicts by incorporating cycle constraints in the prediction process. We argue that such formulation, while providing a first step towards end-to-end learnable edge weights, is suboptimal, since it is built upon a loose relaxation of the MP. We therefore propose an adaptive CRF that allows to progressively consider more violated constraints and, in consequence, to issue solutions with higher validity. Experiments on the BSDS500 benchmark for natural image segmentation as well as on electron microscopic recordings show that our approach yields more precise edge detection and image segmentation.},
BOOKTITLE = {Pattern Recognition (DAGM GCPR 2022)},
EDITOR = {Andres, Bj{\"o}rn and Bernard, Florian and Cremers, Daniel and Frintrop, Simone and Goldl{\"u}cke, Bastian and Ihrke, Ivo},
PAGES = {182--197},
SERIES = {Lecture Notes in Computer Science},
VOLUME = {13485},
ADDRESS = {Konstanz, Germany},
}

Endnote

%0 Conference Proceedings
%A Jung, Steffen
%A Ziegler, Sebastian
%A Kardoost, Amirhossein
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Optimizing Edge Detection for Image Segmentation with Multicut Penalties : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000A-C025-3
%R 10.1007/978-3-031-16788-1_12
%D 2022
%B 44th German Conference on Pattern Recognition
%Z date of event: 2022-09-27 - 2022-09-30
%C Konstanz, Germany
%X   The Minimum Cost Multicut Problem (MP) is a popular way for obtaining a graph decomposition by optimizing binary edge labels over edge costs. While the formulation of a MP from independently estimated costs per edge is highly flexible and intuitive, solving the MP is NP-hard and time-expensive. As a remedy, recent work proposed to predict edge probabilities with awareness to potential conflicts by incorporating cycle constraints in the prediction process. We argue that such formulation, while providing a first step towards end-to-end learnable edge weights, is suboptimal, since it is built upon a loose relaxation of the MP. We therefore propose an adaptive CRF that allows to progressively consider more violated constraints and, in consequence, to issue solutions with higher validity. Experiments on the BSDS500 benchmark for natural image segmentation as well as on electron microscopic recordings show that our approach yields more precise edge detection and image segmentation.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV
%B Pattern Recognition
%E Andres, Bj&#246;rn; Bernard, Florian; Cremers, Daniel; Frintrop, Simone; Goldl&#252;cke, Bastian; Ihrke, Ivo
%P 182 - 197
%I Springer
%@ 978-3-031-16787-4
%B Lecture Notes in Computer Science
%N 13485

Paper

Y. Zhou, C. Li, Z.-Q. Cheng, Y. Geng, X. Xie, and M. Keuper

“Hypergraph Transformer for Skeleton-based Action Recognition,” 2022. [Online]. Available: https://arxiv.org/abs/2211.09590.

mehr

Abstract

Skeleton-based action recognition aims to predict human actions given human
joint coordinates with skeletal interconnections. To model such off-grid data
points and their co-occurrences, Transformer-based formulations would be a
natural choice. However, Transformers still lag behind state-of-the-art methods
using graph convolutional networks (GCNs). Transformers assume that the input
is permutation-invariant and homogeneous (partially alleviated by positional
encoding), which ignores an important characteristic of skeleton data, i.e.,
bone connectivity. Furthermore, each type of body joint has a clear physical
meaning in human motion, i.e., motion retains an intrinsic relationship
regardless of the joint coordinates, which is not explored in Transformers. In
fact, certain re-occurring groups of body joints are often involved in specific
actions, such as the subconscious hand movement for keeping balance. Vanilla
attention is incapable of describing such underlying relations that are
persistent and beyond pair-wise. In this work, we aim to exploit these unique
aspects of skeleton data to close the performance gap between Transformers and
GCNs. Specifically, we propose a new self-attention (SA) extension, named
Hypergraph Self-Attention (HyperSA), to incorporate inherently higher-order
relations into the model. The K-hop relative positional embeddings are also
employed to take bone connectivity into account. We name the resulting model
Hyperformer, and it achieves comparable or better performance w.r.t. accuracy
and efficiency than state-of-the-art GCN architectures on NTU RGB+D, NTU RGB+D
120, and Northwestern-UCLA datasets. On the largest NTU RGB+D 120 dataset, the
significantly improved performance reached by our Hyperformer demonstrates the
underestimated potential of Transformer models in this field.

BibTeX

@online{Zhou2211.09590,
TITLE = {Hypergraph Transformer for Skeleton-based Action Recognition},
AUTHOR = {Zhou, Yuxuan and Li, Chao and Cheng, Zhi-Qi and Geng, Yifeng and Xie, Xuansong and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://arxiv.org/abs/2211.09590},
EPRINT = {2211.09590},
EPRINTTYPE = {arXiv},
YEAR = {2022},
ABSTRACT = {Skeleton-based action recognition aims to predict human actions given human joint coordinates with skeletal interconnections. To model such off-grid data points and their co-occurrences, Transformer-based formulations would be a natural choice. However, Transformers still lag behind state-of-the-art methods using graph convolutional networks (GCNs). Transformers assume that the input is permutation-invariant and homogeneous (partially alleviated by positional encoding), which ignores an important characteristic of skeleton data, i.e., bone connectivity. Furthermore, each type of body joint has a clear physical meaning in human motion, i.e., motion retains an intrinsic relationship regardless of the joint coordinates, which is not explored in Transformers. In fact, certain re-occurring groups of body joints are often involved in specific actions, such as the subconscious hand movement for keeping balance. Vanilla attention is incapable of describing such underlying relations that are persistent and beyond pair-wise. In this work, we aim to exploit these unique aspects of skeleton data to close the performance gap between Transformers and GCNs. Specifically, we propose a new self-attention (SA) extension, named Hypergraph Self-Attention (HyperSA), to incorporate inherently higher-order relations into the model. The K-hop relative positional embeddings are also employed to take bone connectivity into account. We name the resulting model Hyperformer, and it achieves comparable or better performance w.r.t. accuracy and efficiency than state-of-the-art GCN architectures on NTU RGB+D, NTU RGB+D 120, and Northwestern-UCLA datasets. On the largest NTU RGB+D 120 dataset, the significantly improved performance reached by our Hyperformer demonstrates the underestimated potential of Transformer models in this field.},
}

Endnote

%0 Report
%A Zhou, Yuxuan
%A Li, Chao
%A Cheng, Zhi-Qi
%A Geng, Yifeng
%A Xie, Xuansong
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
External Organizations
External Organizations
Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
%T Hypergraph Transformer for Skeleton-based Action Recognition : 
%G eng
%U http://hdl.handle.net/21.11116/0000-000C-1BC0-D
%U https://arxiv.org/abs/2211.09590
%D 2022
%X   Skeleton-based action recognition aims to predict human actions given human joint coordinates with skeletal interconnections. To model such off-grid data points and their co-occurrences, Transformer-based formulations would be a natural choice. However, Transformers still lag behind state-of-the-art methods using graph convolutional networks (GCNs). Transformers assume that the input is permutation-invariant and homogeneous (partially alleviated by positional encoding), which ignores an important characteristic of skeleton data, i.e., bone connectivity. Furthermore, each type of body joint has a clear physical meaning in human motion, i.e., motion retains an intrinsic relationship regardless of the joint coordinates, which is not explored in Transformers. In fact, certain re-occurring groups of body joints are often involved in specific actions, such as the subconscious hand movement for keeping balance. Vanilla attention is incapable of describing such underlying relations that are persistent and beyond pair-wise. In this work, we aim to exploit these unique aspects of skeleton data to close the performance gap between Transformers and GCNs. Specifically, we propose a new self-attention (SA) extension, named Hypergraph Self-Attention (HyperSA), to incorporate inherently higher-order relations into the model. The K-hop relative positional embeddings are also employed to take bone connectivity into account. We name the resulting model Hyperformer, and it achieves comparable or better performance w.r.t. accuracy and efficiency than state-of-the-art GCN architectures on NTU RGB+D, NTU RGB+D 120, and Northwestern-UCLA datasets. On the largest NTU RGB+D 120 dataset, the significantly improved performance reached by our Hyperformer demonstrates the underestimated potential of Transformer models in this field.
%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

2021

Conference paper

A. Saseendran, K. Skubch, S. Falkner, and M. Keuper

“Shape your Space: A Gaussian Mixture Regularization Approach to Deterministic Autoencoders,” in Advances in Neural Information Processing Systems 34 pre-proceedings (NeurIPS 2021), Virtual Event, 2021.

mehr

BibTeX

@inproceedings{Saseendran_NeurIPs2021,
TITLE = {Shape your Space: {A} {Gaussian} Mixture Regularization Approach to Deterministic Autoencoders},
AUTHOR = {Saseendran, Amrutha and Skubch, Kathrin and Falkner, Stefan and Keuper, Margret},
LANGUAGE = {eng},
ISBN = {9781713845393},
PUBLISHER = {Curran Associates, Inc.},
YEAR = {2021},
BOOKTITLE = {Advances in Neural Information Processing Systems 34 pre-proceedings (NeurIPS 2021)},
EDITOR = {Ranzato, M. and Beygelzimer, A. and Liang, P. S. and Vaughan, J. W. and Dauphin, Y.},
PAGES = {7319--7332},
ADDRESS = {Virtual Event},
}

Endnote

%0 Conference Proceedings
%A Saseendran, Amrutha
%A Skubch, Kathrin
%A Falkner, Stefan
%A Keuper, Margret
%+ External Organizations
External Organizations
External Organizations
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
%T Shape your Space: A Gaussian Mixture Regularization Approach to Deterministic Autoencoders : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0009-882C-D
%D 2021
%B 35th Conference on Neural Information Processing Systems
%Z date of event: 2021-12-06 - 2021-12-14
%C Virtual Event
%B Advances in Neural Information Processing Systems 34 pre-proceedings
%E Ranzato, M.; Beygelzimer, A.; Liang, P. S.; Vaughan, J. W.; Dauphin, Y.
%P 7319 - 7332
%I Curran Associates, Inc.

%@ 9781713845393
%U https://papers.nips.cc/paper/2021/file/3c057cb2b41f22c0e740974d7a428918-Paper.pdf

Conference paper

J. Geiping, J. Lukasik, M. Keuper, and M. Moeller

“DARTS for Inverse Problems: a Study on Stability,” in NeurIPS 2021 Workshop on Deep Learning and Inverse Problems (NeurIPS 2021 Deep Inverse Workshop), Virtual, 2021.

mehr

BibTeX

@inproceedings{Geiping_NEURIPSW21,
TITLE = {{DARTS} for Inverse Problems: {A} Study on Stability},
AUTHOR = {Geiping, Jonas and Lukasik, Jovita and Keuper, Margret and Moeller, Michael},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=ty5XCitJfLA},
PUBLISHER = {OpenReview.net},
YEAR = {2021},
BOOKTITLE = {NeurIPS 2021 Workshop on Deep Learning and Inverse Problems (NeurIPS 2021 Deep Inverse Workshop)},
ADDRESS = {Virtual},
}

Endnote

%0 Conference Proceedings
%A Geiping, Jonas
%A Lukasik, Jovita
%A Keuper, Margret
%A Moeller, Michael
%+ External Organizations
External Organizations
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
External Organizations
%T DARTS for Inverse Problems: a Study on Stability : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0009-87A6-3
%U https://openreview.net/forum?id=ty5XCitJfLA
%D 2021
%B NeurIPS 2021 Workshop on Deep Learning and Inverse Problems
%Z date of event: 2021-12-10 - 2021-12-10
%C Virtual
%B NeurIPS 2021 Workshop on Deep Learning and Inverse Problems
%I OpenReview.net

Conference paper

S. Jung and M. Keuper

“Internalized Biases in Fréchet Inception Distance,” in NeurIPS 2021 Workshop on Distribution Shifts: Connecting Methods and Applications (NeurIPS 2021 Workshop DistShift), Virtual, 2021.

mehr

BibTeX

@inproceedings{Jung_NEURIPSW21,
TITLE = {Internalized Biases in {Fr{\'e}chet} Inception Distance},
AUTHOR = {Jung, Steffen and Keuper, Margret},
LANGUAGE = {eng},
URL = {https://openreview.net/forum?id=mLG96UpmbYz},
PUBLISHER = {OpenReview.net},
YEAR = {2021},
BOOKTITLE = {NeurIPS 2021 Workshop on Distribution Shifts: Connecting Methods and Applications (NeurIPS 2021 Workshop DistShift)},
ADDRESS = {Virtual},
}

Endnote

%0 Conference Proceedings
%A Jung, Steffen
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Internalized Biases in Fr&#233;chet Inception Distance : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0009-9CA2-0
%U https://openreview.net/forum?id=mLG96UpmbYz
%D 2021
%B NeurIPS 2021 Workshop on Distribution Shifts: Connecting Methods and Applications
%Z date of event: 2021-12-13 - 2021-12-13
%C Virtual
%B NeurIPS 2021 Workshop on Distribution Shifts: Connecting Methods and Applications
%I OpenReview.net


%U https://openreview.net/pdf?id=mLG96UpmbYz

Conference paper

Y. He, N. Yu, M. Keuper, and M. Fritz

“Beyond the Spectrum: Detecting Deepfakes via Re-Synthesis,” in Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence (IJCAI 2021), Montreal, Canada, 2021.

mehr

BibTeX

@inproceedings{He_IJCAI2021,
TITLE = {Beyond the Spectrum: {D}etecting Deepfakes via Re-Synthesis},
AUTHOR = {He, Yang and Yu, Ning and Keuper, Margret and Fritz, Mario},
LANGUAGE = {eng},
ISBN = {978-0-9992411-9-6},
DOI = {10.24963/ijcai.2021/349},
PUBLISHER = {IJCAI},
YEAR = {2021},
BOOKTITLE = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence (IJCAI 2021)},
EDITOR = {Zhou, Zhi-Hua},
PAGES = {2534--2541},
ADDRESS = {Montreal, Canada},
}

Endnote

%0 Conference Proceedings
%A He, Yang
%A Yu, Ning
%A Keuper, Margret
%A Fritz, Mario
%+ External Organizations
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
%T Beyond the Spectrum: Detecting Deepfakes via Re-Synthesis : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0009-8833-4
%R 10.24963/ijcai.2021/349 
%D 2021
%B Thirtieth International Joint Conference on Artificial Intelligence
%Z date of event: 2021-08-19 - 2021-08-27
%C Montreal, Canada
%B Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence
%E Zhou, Zhi-Hua
%P 2534 - 2541
%I IJCAI
%@ 978-0-9992411-9-6
%U https://www.ijcai.org/proceedings/2021/0349.pdf

Conference paper

S. Jung and M. Keuper

“Spectral Distribution Aware Image Generation,” in Thirty-Fifth AAAI Conference on Artificial Intelligence Technical Tracks 2, Virtual Conference, 2021.

mehr

BibTeX

@inproceedings{Jung_AAAI21,
  author    = {Jung, Steffen and Keuper, Margret},
  title     = {Spectral Distribution Aware Image Generation},
  booktitle = {Thirty-Fifth AAAI Conference on Artificial Intelligence Technical Tracks 2},
  pages     = {1734--1742},
  publisher = {AAAI},
  address   = {Virtual Conference},
  year      = {2021},
  isbn      = {978-1-57735-866-4},
  doi       = {10.1609/aaai.v35i2.16267},
  language  = {eng},
}

Endnote

%0 Conference Proceedings
%A Jung, Steffen
%A Keuper, Margret
%+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society
External Organizations
%T Spectral Distribution Aware Image Generation : 
%G eng
%U http://hdl.handle.net/21.11116/0000-0007-A808-3
%R 10.1609/aaai.v35i2.16267
%D 2021
%B Thirty-Fifth AAAI Conference on Artificial Intelligence
%Z date of event: 2021-02-02 - 2021-02-09
%C Virtual Conference
%B Thirty-Fifth AAAI Conference on Artificial Intelligence Technical Tracks 2
%P 1734 - 1742
%I AAAI
%@ 978-1-57735-866-4

2017

Conference paper

Y. He, W.-C. Chiu, M. Keuper, and M. Fritz

“STD2P: RGBD Semantic Segmentation Using Spatio-Temporal Data-Driven Pooling,” in 30th IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017), Honolulu, HI, USA, 2017.

mehr

BibTeX

@inproceedings{yang_cvpr17,
  author    = {He, Yang and Chiu, Wei-Chen and Keuper, Margret and Fritz, Mario},
  title     = {{STD2P}: {RGBD} Semantic Segmentation Using Spatio-Temporal Data-Driven Pooling},
  booktitle = {30th IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017)},
  pages     = {7158--7167},
  publisher = {IEEE Computer Society},
  address   = {Honolulu, HI, USA},
  year      = {2017},
  date      = {2017},
  isbn      = {978-1-5386-0458-8},
  doi       = {10.1109/CVPR.2017.757},
  language  = {eng},
}

Endnote

%0 Conference Proceedings
%A He, Yang
%A Chiu, Wei-Chen
%A Keuper, Margret
%A Fritz, Mario
%+ Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
%T STD2P: RGBD Semantic Segmentation Using Spatio-Temporal Data-Driven Pooling : 
%G eng
%U http://hdl.handle.net/11858/00-001M-0000-002D-B8E6-C
%R 10.1109/CVPR.2017.757
%D 2017
%B 30th IEEE Conference on Computer Vision and Pattern Recognition
%Z date of event: 2017-07-22 - 2017-07-25
%C Honolulu, HI, USA
%B 30th IEEE Conference on Computer Vision and Pattern Recognition
%P 7158 - 7167
%I IEEE Computer Society
%@ 978-1-5386-0458-8

Conference paper

Y. He, M. Keuper, B. Schiele, and M. Fritz

“Learning Dilation Factors for Semantic Segmentation of Street Scenes,” in Pattern Recognition (GCPR 2017), Basel, Switzerland, 2017.

mehr

BibTeX

@inproceedings{he17gcpr,
  author    = {He, Yang and Keuper, Margret and Schiele, Bernt and Fritz, Mario},
  title     = {Learning Dilation Factors for Semantic Segmentation of Street Scenes},
  booktitle = {Pattern Recognition (GCPR 2017)},
  editor    = {Roth, Volker and Vetter, Thomas},
  series    = {Lecture Notes in Computer Science},
  volume    = {10496},
  pages     = {41--51},
  publisher = {Springer},
  address   = {Basel, Switzerland},
  year      = {2017},
  date      = {2017},
  isbn      = {978-3-319-66708-9},
  doi       = {10.1007/978-3-319-66709-6_4},
  language  = {eng},
}

Endnote

%0 Conference Proceedings
%A He, Yang
%A Keuper, Margret
%A Schiele, Bernt
%A Fritz, Mario
%+ Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
External Organizations
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
%T Learning Dilation Factors for Semantic Segmentation of Street Scenes : 
%G eng
%U http://hdl.handle.net/11858/00-001M-0000-002D-B8F6-8
%R 10.1007/978-3-319-66709-6_4
%D 2017
%B 39th German Conference on Pattern Recognition
%Z date of event: 2017-09-13 - 2017-09-15
%C Basel, Switzerland
%B Pattern Recognition
%E Roth, Volker; Vetter, Thomas
%P 41 - 51
%I Springer
%@ 978-3-319-66708-9
%B Lecture Notes in Computer Science
%N 10496

2016

Paper

Y. He, W.-C. Chiu, M. Keuper, and M. Fritz

“RGBD Semantic Segmentation Using Spatio-Temporal Data-Driven Pooling,” 2016. [Online]. Available: http://arxiv.org/abs/1604.02388.

mehr

Abstract

Beyond the success in classification, neural networks have recently shown
strong results on pixel-wise prediction tasks like image semantic segmentation
on RGBD data. However, the commonly used deconvolutional layers for upsampling
intermediate representations to the full-resolution output still show different
failure modes, like imprecise segmentation boundaries and label mistakes in
particular on large, weakly textured objects (e.g. fridge, whiteboard, door).
We attribute these errors in part to the rigid way, current network aggregate
information, that can be either too local (missing context) or too global
(inaccurate boundaries). Therefore we propose a data-driven pooling layer that
integrates with fully convolutional architectures and utilizes boundary
detection from RGBD image segmentation approaches. We extend our approach to
leverage region-level correspondences across images with an additional temporal
pooling stage. We evaluate our approach on the NYU-Depth-V2 dataset comprised
of indoor RGBD video sequences and compare it to various state-of-the-art
baselines. Besides a general improvement over the state-of-the-art, our
approach shows particularly good results in terms of accuracy of the predicted
boundaries and in segmenting previously problematic classes.

BibTeX

@online{He_arXiv2016,
  author     = {He, Yang and Chiu, Wei-Chen and Keuper, Margret and Fritz, Mario},
  title      = {{RGBD} Semantic Segmentation Using Spatio-Temporal Data-Driven Pooling},
  year       = {2016},
  eprint     = {1604.02388},
  eprinttype = {arXiv},
  url        = {http://arxiv.org/abs/1604.02388},
  language   = {eng},
  abstract   = {Beyond the success in classification, neural networks have recently shown strong results on pixel-wise prediction tasks like image semantic segmentation on RGBD data. However, the commonly used deconvolutional layers for upsampling intermediate representations to the full-resolution output still show different failure modes, like imprecise segmentation boundaries and label mistakes in particular on large, weakly textured objects (e.g. fridge, whiteboard, door). We attribute these errors in part to the rigid way, current network aggregate information, that can be either too local (missing context) or too global (inaccurate boundaries). Therefore we propose a data-driven pooling layer that integrates with fully convolutional architectures and utilizes boundary detection from RGBD image segmentation approaches. We extend our approach to leverage region-level correspondences across images with an additional temporal pooling stage. We evaluate our approach on the NYU-Depth-V2 dataset comprised of indoor RGBD video sequences and compare it to various state-of-the-art baselines. Besides a general improvement over the state-of-the-art, our approach shows particularly good results in terms of accuracy of the predicted boundaries and in segmenting previously problematic classes.},
}

Endnote

%0 Report
%A He, Yang
%A Chiu, Wei-Chen
%A Keuper, Margret
%A Fritz, Mario
%+ Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
%T RGBD Semantic Segmentation Using Spatio-Temporal Data-Driven Pooling : 
%G eng
%U http://hdl.handle.net/11858/00-001M-0000-002B-063C-5
%U http://arxiv.org/abs/1604.02388
%D 2016
%X   Beyond the success in classification, neural networks have recently shown
strong results on pixel-wise prediction tasks like image semantic segmentation
on RGBD data. However, the commonly used deconvolutional layers for upsampling
intermediate representations to the full-resolution output still show different
failure modes, like imprecise segmentation boundaries and label mistakes in
particular on large, weakly textured objects (e.g. fridge, whiteboard, door).
We attribute these errors in part to the rigid way, current network aggregate
information, that can be either too local (missing context) or too global
(inaccurate boundaries). Therefore we propose a data-driven pooling layer that
integrates with fully convolutional architectures and utilizes boundary
detection from RGBD image segmentation approaches. We extend our approach to
leverage region-level correspondences across images with an additional temporal
pooling stage. We evaluate our approach on the NYU-Depth-V2 dataset comprised
of indoor RGBD video sequences and compare it to various state-of-the-art
baselines. Besides a general improvement over the state-of-the-art, our
approach shows particularly good results in terms of accuracy of the predicted
boundaries and in segmenting previously problematic classes.

%K Computer Science, Computer Vision and Pattern Recognition, cs.CV

2015

Conference paper

M. Keuper, E. Levinkov, N. Bonneel, G. Lavoué, T. Brox, and B. Andres

“Efficient Decomposition of Image and Mesh Graphs by Lifted Multicuts,” in ICCV 2015, IEEE International Conference on Computer Vision, Santiago, Chile, 2015.

mehr

BibTeX

% ICCV 2015 conference paper (IEEE), pages 1751--1759.
@inproceedings{keuper-2015a,
TITLE = {Efficient Decomposition of Image and Mesh Graphs by Lifted Multicuts},
AUTHOR = {Keuper, Margret and Levinkov, Evgeny and Bonneel, Nicolas and Lavou{\'e}, Guillaume and Brox, Thomas and Andres, Bjoern},
LANGUAGE = {eng},
ISBN = {978-1-4673-8390-5},
DOI = {10.1109/ICCV.2015.204},
PUBLISHER = {IEEE},
YEAR = {2015},
DATE = {2015},
BOOKTITLE = {ICCV 2015, IEEE International Conference on Computer Vision},
PAGES = {1751--1759},
ADDRESS = {Santiago, Chile},
}

Endnote

%0 Conference Proceedings
%A Keuper, Margret
%A Levinkov, Evgeny
%A Bonneel, Nicolas
%A Lavoué, Guillaume
%A Brox, Thomas
%A Andres, Bjoern
%+ External Organizations
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
External Organizations
External Organizations
External Organizations
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
%T Efficient Decomposition of Image and Mesh Graphs by Lifted Multicuts : 
%G eng
%U http://hdl.handle.net/11858/00-001M-0000-0028-DC59-C
%R 10.1109/ICCV.2015.204
%D 2015
%B IEEE International Conference on Computer Vision
%Z date of event: 2015-12-13 - 2015-12-16
%C Santiago, Chile
%B ICCV 2015
%P 1751 - 1759
%I IEEE
%@ 978-1-4673-8390-5

Conference paper

M. Keuper, B. Andres, and T. Brox

“Motion Trajectory Segmentation via Minimum Cost Multicuts,” in ICCV 2015, IEEE International Conference on Computer Vision, Santiago, Chile, 2015.

mehr

BibTeX

% ICCV 2015 conference paper (IEEE), pages 3271--3279.
@inproceedings{keuper-2015b,
TITLE = {Motion Trajectory Segmentation via Minimum Cost Multicuts},
AUTHOR = {Keuper, Margret and Andres, Bjoern and Brox, Thomas},
LANGUAGE = {eng},
ISBN = {978-1-4673-8390-5},
DOI = {10.1109/ICCV.2015.374},
PUBLISHER = {IEEE},
YEAR = {2015},
DATE = {2015},
BOOKTITLE = {ICCV 2015, IEEE International Conference on Computer Vision},
PAGES = {3271--3279},
ADDRESS = {Santiago, Chile},
}

Endnote

%0 Conference Proceedings
%A Keuper, Margret
%A Andres, Bjoern
%A Brox, Thomas
%+ Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
Computer Vision and Multimodal Computing, MPI for Informatics, Max Planck Society
External Organizations
%T Motion Trajectory Segmentation via Minimum Cost Multicuts : 
%G eng
%U http://hdl.handle.net/11858/00-001M-0000-0028-DC42-F
%R 10.1109/ICCV.2015.374
%D 2015
%B IEEE International Conference on Computer Vision
%Z date of event: 2015-12-13 - 2015-12-16
%C Santiago, Chile
%B ICCV 2015
%P 3271 - 3279
%I IEEE
%@ 978-1-4673-8390-5