D2
Computer Vision and Machine Learning

Dengxin Dai (Senior Researcher)

Dr. Dengxin Dai

Address
Max-Planck-Institut für Informatik
Saarland Informatics Campus
Campus E1 4
66123 Saarbrücken
Location
E1 4 - 604
Phone
+49 681 9325 2104
Fax
+49 681 9325 2099

Vision for Autonomous Systems (VAS) Group

My Group Website: VAS

Hiring

  • We are hiring PostDocs, PhD students, and Research Interns; we also offer projects for master's theses. If you are interested, please contact me <ddai@mpi-inf.mpg.de> with your CV and transcripts. I also accept applicants with CSC scholarship.
  • We focus on deep learning-based perception for autonomous driving, especially on scaling existing visual perception models to novel domains, to new data modalities, to unseen classes, and to more tasks.

Publications

Dai, D., Vasudevan, A. B., Matas, J., & Van Gool, L. (2023). Binaural SoundNet: Predicting Semantics, Depth and Motion with Binaural Sounds. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(1). doi:10.1109/TPAMI.2022.3155643
Export
BibTeX
@article{Dai2109.02763,
  TITLE = {Binaural {SoundNet}: {Predicting} Semantics, Depth and Motion with Binaural Sounds},
  AUTHOR = {Dai, Dengxin and Vasudevan, Arun Balajee and Matas, Jiri and Van Gool, Luc},
  LANGUAGE = {eng},
  ISSN = {0162-8828},
  DOI = {10.1109/TPAMI.2022.3155643},
  PUBLISHER = {IEEE},
  ADDRESS = {Piscataway, NJ},
  YEAR = {2023},
  MARGINALMARK = {$\bullet$},
  DATE = {2023},
  JOURNAL = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  VOLUME = {45},
  NUMBER = {1},
  PAGES = {123--136},
}
Endnote
%0 Journal Article %A Dai, Dengxin %A Vasudevan, Arun Balajee %A Matas, Jiri %A Van Gool, Luc %+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations %T Binaural SoundNet: Predicting Semantics, Depth and Motion with Binaural Sounds : %G eng %U http://hdl.handle.net/21.11116/0000-0009-444C-6 %R 10.1109/TPAMI.2022.3155643 %7 2022 %D 2023 %J IEEE Transactions on Pattern Analysis and Machine Intelligence %O IEEE Trans. Pattern Anal. Mach. Intell. %V 45 %N 1 %& 123 %P 123 - 136 %I IEEE %C Piscataway, NJ %@ false
Shi, S., Jiang, L., Dai, D., & Schiele, B. (n.d.). Motion Transformer with Global Intention Localization and Local Movement Refinement. In Advances in Neural Information Processing Systems 35 (NeurIPS 2022). New Orleans, LA, USA.
(Accepted/in press)
Export
BibTeX
@inproceedings{Shi_Neurips22,
  TITLE = {Motion Transformer with Global Intention Localization and Local Movement Refinement},
  AUTHOR = {Shi, Shaoshuai and Jiang, Li and Dai, Dengxin and Schiele, Bernt},
  LANGUAGE = {eng},
  YEAR = {2022},
  PUBLREMARK = {Accepted},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE = {Advances in Neural Information Processing Systems 35 (NeurIPS 2022)},
  ADDRESS = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Shi, Shaoshuai %A Jiang, Li %A Dai, Dengxin %A Schiele, Bernt %+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Motion Transformer with Global Intention Localization and Local Movement Refinement : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1853-C %D 2022 %B 36th Conference on Neural Information Processing Systems %Z date of event: 2022-11-28 - 2022-12-09 %C New Orleans, LA, USA %B Advances in Neural Information Processing Systems 35 %U https://openreview.net/forum?id=9t-j3xDm7_Q
Shi, S., Jiang, L., Dai, D., & Schiele, B. (2022). MTR-A: 1st Place Solution for 2022 Waymo Open Dataset Challenge -- Motion Prediction. Retrieved from https://arxiv.org/abs/2209.10033
(arXiv: 2209.10033)
Abstract
In this report, we present the 1st place solution for motion prediction track<br>in 2022 Waymo Open Dataset Challenges. We propose a novel Motion Transformer<br>framework for multimodal motion prediction, which introduces a small set of<br>novel motion query pairs for generating better multimodal future trajectories<br>by jointly performing the intention localization and iterative motion<br>refinement. A simple model ensemble strategy with non-maximum-suppression is<br>adopted to further boost the final performance. Our approach achieves the 1st<br>place on the motion prediction leaderboard of 2022 Waymo Open Dataset<br>Challenges, outperforming other methods with remarkable margins. Code will be<br>available at https://github.com/sshaoshuai/MTR.<br>
Export
BibTeX
@online{Shi2209.10033,
  TITLE = {{MTR}-A: 1st Place Solution for 2022 Waymo Open Dataset Challenge -- Motion Prediction},
  AUTHOR = {Shi, Shaoshuai and Jiang, Li and Dai, Dengxin and Schiele, Bernt},
  LANGUAGE = {eng},
  URL = {https://arxiv.org/abs/2209.10033},
  EPRINT = {2209.10033},
  EPRINTTYPE = {arXiv},
  YEAR = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT = {In this report, we present the 1st place solution for motion prediction track in 2022 Waymo Open Dataset Challenges. We propose a novel Motion Transformer framework for multimodal motion prediction, which introduces a small set of novel motion query pairs for generating better multimodal future trajectories by jointly performing the intention localization and iterative motion refinement. A simple model ensemble strategy with non-maximum-suppression is adopted to further boost the final performance. Our approach achieves the 1st place on the motion prediction leaderboard of 2022 Waymo Open Dataset Challenges, outperforming other methods with remarkable margins. Code will be available at https://github.com/sshaoshuai/MTR.},
}
Endnote
%0 Report %A Shi, Shaoshuai %A Jiang, Li %A Dai, Dengxin %A Schiele, Bernt %+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T MTR-A: 1st Place Solution for 2022 Waymo Open Dataset Challenge -- Motion Prediction : %G eng %U http://hdl.handle.net/21.11116/0000-000C-184C-5 %U https://arxiv.org/abs/2209.10033 %D 2022 %X In this report, we present the 1st place solution for motion prediction track<br>in 2022 Waymo Open Dataset Challenges. We propose a novel Motion Transformer<br>framework for multimodal motion prediction, which introduces a small set of<br>novel motion query pairs for generating better multimodal future trajectories<br>by jointly performing the intention localization and iterative motion<br>refinement. A simple model ensemble strategy with non-maximum-suppression is<br>adopted to further boost the final performance. Our approach achieves the 1st<br>place on the motion prediction leaderboard of 2022 Waymo Open Dataset<br>Challenges, outperforming other methods with remarkable margins. Code will be<br>available at https://github.com/sshaoshuai/MTR.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV
Ding, J., Xue, N., Xia, G.-S., & Dai, D. (2022). Decoupling Zero-Shot Semantic Segmentation. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.01129
Abstract
Zero-shot semantic segmentation (ZS3) aims to segment the novel categories<br>that have not been seen in the training. Existing works formulate ZS3 as a<br>pixel-level zero-shot classification problem, and transfer semantic knowledge<br>from seen classes to unseen ones with the help of language models pre-trained<br>only with texts. While simple, the pixel-level ZS3 formulation shows the<br>limited capability to integrate vision-language models that are often<br>pre-trained with image-text pairs and currently demonstrate great potential for<br>vision tasks. Inspired by the observation that humans often perform<br>segment-level semantic labeling, we propose to decouple the ZS3 into two<br>sub-tasks: 1) a class-agnostic grouping task to group the pixels into segments.<br>2) a zero-shot classification task on segments. The former sub-task does not<br>involve category information and can be directly transferred to group pixels<br>for unseen classes. The latter subtask performs at segment-level and provides a<br>natural way to leverage large-scale vision-language models pre-trained with<br>image-text pairs (e.g. CLIP) for ZS3. Based on the decoupling formulation, we<br>propose a simple and effective zero-shot semantic segmentation model, called<br>ZegFormer, which outperforms the previous methods on ZS3 standard benchmarks by<br>large margins, e.g., 35 points on the PASCAL VOC and 3 points on the COCO-Stuff<br>in terms of mIoU for unseen classes. Code will be released at<br>https://github.com/dingjiansw101/ZegFormer.<br>
Export
BibTeX
@inproceedings{Ding_CVPR2022,
  TITLE = {Decoupling Zero-Shot Semantic Segmentation},
  AUTHOR = {Ding, Jian and Xue, Nan and Xia, Gui-Song and Dai, Dengxin},
  LANGUAGE = {eng},
  ISBN = {978-1-6654-6946-3},
  DOI = {10.1109/CVPR52688.2022.01129},
  PUBLISHER = {IEEE},
  YEAR = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT = {Zero-shot semantic segmentation (ZS3) aims to segment the novel categories that have not been seen in the training. Existing works formulate ZS3 as a pixel-level zero-shot classification problem, and transfer semantic knowledge from seen classes to unseen ones with the help of language models pre-trained only with texts. While simple, the pixel-level ZS3 formulation shows the limited capability to integrate vision-language models that are often pre-trained with image-text pairs and currently demonstrate great potential for vision tasks. Inspired by the observation that humans often perform segment-level semantic labeling, we propose to decouple the ZS3 into two sub-tasks: 1) a class-agnostic grouping task to group the pixels into segments. 2) a zero-shot classification task on segments. The former sub-task does not involve category information and can be directly transferred to group pixels for unseen classes. The latter subtask performs at segment-level and provides a natural way to leverage large-scale vision-language models pre-trained with image-text pairs (e.g. CLIP) for ZS3. Based on the decoupling formulation, we propose a simple and effective zero-shot semantic segmentation model, called ZegFormer, which outperforms the previous methods on ZS3 standard benchmarks by large margins, e.g., 35 points on the PASCAL VOC and 3 points on the COCO-Stuff in terms of mIoU for unseen classes. Code will be released at https://github.com/dingjiansw101/ZegFormer.},
  BOOKTITLE = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES = {11573--11582},
  ADDRESS = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Ding, Jian %A Xue, Nan %A Xia, Gui-Song %A Dai, Dengxin %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Decoupling Zero-Shot Semantic Segmentation : %G eng %U http://hdl.handle.net/21.11116/0000-000A-16BD-9 %R 10.1109/CVPR52688.2022.01129 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %X Zero-shot semantic segmentation (ZS3) aims to segment the novel categories<br>that have not been seen in the training. Existing works formulate ZS3 as a<br>pixel-level zero-shot classification problem, and transfer semantic knowledge<br>from seen classes to unseen ones with the help of language models pre-trained<br>only with texts. While simple, the pixel-level ZS3 formulation shows the<br>limited capability to integrate vision-language models that are often<br>pre-trained with image-text pairs and currently demonstrate great potential for<br>vision tasks. Inspired by the observation that humans often perform<br>segment-level semantic labeling, we propose to decouple the ZS3 into two<br>sub-tasks: 1) a class-agnostic grouping task to group the pixels into segments.<br>2) a zero-shot classification task on segments. The former sub-task does not<br>involve category information and can be directly transferred to group pixels<br>for unseen classes. The latter subtask performs at segment-level and provides a<br>natural way to leverage large-scale vision-language models pre-trained with<br>image-text pairs (e.g. CLIP) for ZS3. Based on the decoupling formulation, we<br>propose a simple and effective zero-shot semantic segmentation model, called<br>ZegFormer, which outperforms the previous methods on ZS3 standard benchmarks by<br>large margins, e.g., 35 points on the PASCAL VOC and 3 points on the COCO-Stuff<br>in terms of mIoU for unseen classes. 
Code will be released at<br>https://github.com/dingjiansw101/ZegFormer.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 11573 - 11582 %I IEEE %@ 978-1-6654-6946-3
Fan, Q., Segu, M., Tai, Y.-W., Yu, F., Tang, C.-K., Schiele, B., & Dai, D. (2022). Normalization Perturbation: A Simple Domain Generalization Method for Real-World Domain Shifts. Retrieved from https://arxiv.org/abs/2211.04393
(arXiv: 2211.04393)
Abstract
Improving model's generalizability against domain shifts is crucial,<br>especially for safety-critical applications such as autonomous driving.<br>Real-world domain styles can vary substantially due to environment changes and<br>sensor noises, but deep models only know the training domain style. Such domain<br>style gap impedes model generalization on diverse real-world domains. Our<br>proposed Normalization Perturbation (NP) can effectively overcome this domain<br>style overfitting problem. We observe that this problem is mainly caused by the<br>biased distribution of low-level features learned in shallow CNN layers. Thus,<br>we propose to perturb the channel statistics of source domain features to<br>synthesize various latent styles, so that the trained deep model can perceive<br>diverse potential domains and generalizes well even without observations of<br>target domain data in training. We further explore the style-sensitive channels<br>for effective style synthesis. Normalization Perturbation only relies on a<br>single source domain and is surprisingly effective and extremely easy to<br>implement. Extensive experiments verify the effectiveness of our method for<br>generalizing models under real-world domain shifts.<br>
Export
BibTeX
@online{Fan2211.04393,
  TITLE = {Normalization Perturbation: A Simple Domain Generalization Method for Real-World Domain Shifts},
  AUTHOR = {Fan, Qi and Segu, Mattia and Tai, Yu-Wing and Yu, Fisher and Tang, Chi-Keung and Schiele, Bernt and Dai, Dengxin},
  LANGUAGE = {eng},
  URL = {https://arxiv.org/abs/2211.04393},
  EPRINT = {2211.04393},
  EPRINTTYPE = {arXiv},
  YEAR = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT = {Improving model's generalizability against domain shifts is crucial, especially for safety-critical applications such as autonomous driving. Real-world domain styles can vary substantially due to environment changes and sensor noises, but deep models only know the training domain style. Such domain style gap impedes model generalization on diverse real-world domains. Our proposed Normalization Perturbation (NP) can effectively overcome this domain style overfitting problem. We observe that this problem is mainly caused by the biased distribution of low-level features learned in shallow CNN layers. Thus, we propose to perturb the channel statistics of source domain features to synthesize various latent styles, so that the trained deep model can perceive diverse potential domains and generalizes well even without observations of target domain data in training. We further explore the style-sensitive channels for effective style synthesis. Normalization Perturbation only relies on a single source domain and is surprisingly effective and extremely easy to implement. Extensive experiments verify the effectiveness of our method for generalizing models under real-world domain shifts.},
}
Endnote
%0 Report %A Fan, Qi %A Segu, Mattia %A Tai, Yu-Wing %A Yu, Fisher %A Tang, Chi-Keung %A Schiele, Bernt %A Dai, Dengxin %+ External Organizations External Organizations External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Normalization Perturbation: A Simple Domain Generalization Method for Real-World Domain Shifts : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1857-8 %U https://arxiv.org/abs/2211.04393 %D 2022 %X Improving model's generalizability against domain shifts is crucial,<br>especially for safety-critical applications such as autonomous driving.<br>Real-world domain styles can vary substantially due to environment changes and<br>sensor noises, but deep models only know the training domain style. Such domain<br>style gap impedes model generalization on diverse real-world domains. Our<br>proposed Normalization Perturbation (NP) can effectively overcome this domain<br>style overfitting problem. We observe that this problem is mainly caused by the<br>biased distribution of low-level features learned in shallow CNN layers. Thus,<br>we propose to perturb the channel statistics of source domain features to<br>synthesize various latent styles, so that the trained deep model can perceive<br>diverse potential domains and generalizes well even without observations of<br>target domain data in training. We further explore the style-sensitive channels<br>for effective style synthesis. Normalization Perturbation only relies on a<br>single source domain and is surprisingly effective and extremely easy to<br>implement. Extensive experiments verify the effectiveness of our method for<br>generalizing models under real-world domain shifts.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV
Zhang, Z., Liniger, A., Dai, D., Yu, F., & Van Gool, L. (2021). End-to-End Urban Driving by Imitating a Reinforcement Learning Coach. In ICCV 2021, IEEE/CVF International Conference on Computer Vision. Virtual Event: IEEE. doi:10.1109/ICCV48922.2021.01494
Export
BibTeX
@inproceedings{zhang2021roach,
  TITLE = {End-to-End Urban Driving by Imitating a Reinforcement Learning Coach},
  AUTHOR = {Zhang, Zhejun and Liniger, Alexander and Dai, Dengxin and Yu, Fisher and Van Gool, Luc},
  LANGUAGE = {eng},
  ISBN = {978-1-6654-2812-5},
  DOI = {10.1109/ICCV48922.2021.01494},
  PUBLISHER = {IEEE},
  YEAR = {2021},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE = {ICCV 2021, IEEE/CVF International Conference on Computer Vision},
  PAGES = {15202--15212},
  ADDRESS = {Virtual Event},
}
Endnote
%0 Conference Proceedings %A Zhang, Zhejun %A Liniger, Alexander %A Dai, Dengxin %A Yu, Fisher %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations %T End-to-End Urban Driving by Imitating a Reinforcement Learning Coach : %G eng %U http://hdl.handle.net/21.11116/0000-0009-4452-E %R 10.1109/ICCV48922.2021.01494 %D 2021 %B IEEE/CVF International Conference on Computer Vision %Z date of event: 2021-10-11 - 2021-10-17 %C Virtual Event %B ICCV 2021 %P 15202 - 15212 %I IEEE %@ 978-1-6654-2812-5 %U https://github.com/zhejz/carla-roach
Gong, R., Danelljan, M., Dai, D., Wang, W., Paudel, D. P., Chhatkuli, A., … Van Gool, L. (2021). TADA: Taxonomy Adaptive Domain Adaptation. Retrieved from https://arxiv.org/abs/2109.04813
(arXiv: 2109.04813)
Abstract
Traditional domain adaptation addresses the task of adapting a model to a<br>novel target domain under limited or no additional supervision. While tackling<br>the input domain gap, the standard domain adaptation settings assume no domain<br>change in the output space. In semantic prediction tasks, different datasets<br>are often labeled according to different semantic taxonomies. In many<br>real-world settings, the target domain task requires a different taxonomy than<br>the one imposed by the source domain. We therefore introduce the more general<br>taxonomy adaptive domain adaptation (TADA) problem, allowing for inconsistent<br>taxonomies between the two domains. We further propose an approach that jointly<br>addresses the image-level and label-level domain adaptation. On the<br>label-level, we employ a bilateral mixed sampling strategy to augment the<br>target domain, and a relabelling method to unify and align the label spaces. We<br>address the image-level domain gap by proposing an uncertainty-rectified<br>contrastive learning method, leading to more domain-invariant and class<br>discriminative features. We extensively evaluate the effectiveness of our<br>framework under different TADA settings: open taxonomy, coarse-to-fine<br>taxonomy, and partially-overlapping taxonomy. Our framework outperforms<br>previous state-of-the-art by a large margin, while capable of adapting to<br>target taxonomies.<br>
Export
BibTeX
@online{Gong2109.04813,
  TITLE = {{TADA}: {Taxonomy} Adaptive Domain Adaptation},
  AUTHOR = {Gong, Rui and Danelljan, Martin and Dai, Dengxin and Wang, Wenguan and Paudel, Danda Pani and Chhatkuli, Ajad and Yu, Fisher and Van Gool, Luc},
  LANGUAGE = {eng},
  URL = {https://arxiv.org/abs/2109.04813},
  EPRINT = {2109.04813},
  EPRINTTYPE = {arXiv},
  YEAR = {2021},
  MARGINALMARK = {$\bullet$},
  ABSTRACT = {Traditional domain adaptation addresses the task of adapting a model to a novel target domain under limited or no additional supervision. While tackling the input domain gap, the standard domain adaptation settings assume no domain change in the output space. In semantic prediction tasks, different datasets are often labeled according to different semantic taxonomies. In many real-world settings, the target domain task requires a different taxonomy than the one imposed by the source domain. We therefore introduce the more general taxonomy adaptive domain adaptation (TADA) problem, allowing for inconsistent taxonomies between the two domains. We further propose an approach that jointly addresses the image-level and label-level domain adaptation. On the label-level, we employ a bilateral mixed sampling strategy to augment the target domain, and a relabelling method to unify and align the label spaces. We address the image-level domain gap by proposing an uncertainty-rectified contrastive learning method, leading to more domain-invariant and class discriminative features. We extensively evaluate the effectiveness of our framework under different TADA settings: open taxonomy, coarse-to-fine taxonomy, and partially-overlapping taxonomy. Our framework outperforms previous state-of-the-art by a large margin, while capable of adapting to target taxonomies.},
}
Endnote
%0 Report %A Gong, Rui %A Danelljan, Martin %A Dai, Dengxin %A Wang, Wenguan %A Paudel, Danda Pani %A Chhatkuli, Ajad %A Yu, Fisher %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations External Organizations External Organizations %T TADA: Taxonomy Adaptive Domain Adaptation : %G eng %U http://hdl.handle.net/21.11116/0000-0009-89F0-D %U https://arxiv.org/abs/2109.04813 %D 2021 %X Traditional domain adaptation addresses the task of adapting a model to a<br>novel target domain under limited or no additional supervision. While tackling<br>the input domain gap, the standard domain adaptation settings assume no domain<br>change in the output space. In semantic prediction tasks, different datasets<br>are often labeled according to different semantic taxonomies. In many<br>real-world settings, the target domain task requires a different taxonomy than<br>the one imposed by the source domain. We therefore introduce the more general<br>taxonomy adaptive domain adaptation (TADA) problem, allowing for inconsistent<br>taxonomies between the two domains. We further propose an approach that jointly<br>addresses the image-level and label-level domain adaptation. On the<br>label-level, we employ a bilateral mixed sampling strategy to augment the<br>target domain, and a relabelling method to unify and align the label spaces. We<br>address the image-level domain gap by proposing an uncertainty-rectified<br>contrastive learning method, leading to more domain-invariant and class<br>discriminative features. We extensively evaluate the effectiveness of our<br>framework under different TADA settings: open taxonomy, coarse-to-fine<br>taxonomy, and partially-overlapping taxonomy. 
Our framework outperforms<br>previous state-of-the-art by a large margin, while capable of adapting to<br>target taxonomies.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV
Ji, G.-P., Fan, D.-P., Chou, Y.-C., Dai, D., Liniger, A., & Van Gool, L. (2022). Deep Gradient Learning for Efficient Camouflaged Object Detection. Retrieved from https://arxiv.org/pdf/2205.12853.pdf
(arXiv: 2205.12853)
Abstract
This paper introduces DGNet, a novel deep framework that exploits object<br>gradient supervision for camouflaged object detection (COD). It decouples the<br>task into two connected branches, i.e., a context and a texture encoder. The<br>essential connection is the gradient-induced transition, representing a soft<br>grouping between context and texture features. Benefiting from the simple but<br>efficient framework, DGNet outperforms existing state-of-the-art COD models by<br>a large margin. Notably, our efficient version, DGNet-S, runs in real-time (80<br>fps) and achieves comparable results to the cutting-edge model<br>JCSOD-CVPR$_{21}$ with only 6.82% parameters. Application results also show<br>that the proposed DGNet performs well in polyp segmentation, defect detection,<br>and transparent object segmentation tasks. Codes will be made available at<br>https://github.com/GewelsJI/DGNet.<br>
Export
BibTeX
@online{Ji2205.12853,
  TITLE = {Deep Gradient Learning for Efficient Camouflaged Object Detection},
  AUTHOR = {Ji, Ge-Peng and Fan, Deng-Ping and Chou, Yu-Cheng and Dai, Dengxin and Liniger, Alexander and Van Gool, Luc},
  LANGUAGE = {eng},
  URL = {https://arxiv.org/pdf/2205.12853.pdf},
  EPRINT = {2205.12853},
  EPRINTTYPE = {arXiv},
  YEAR = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT = {This paper introduces DGNet, a novel deep framework that exploits object gradient supervision for camouflaged object detection (COD). It decouples the task into two connected branches, i.e., a context and a texture encoder. The essential connection is the gradient-induced transition, representing a soft grouping between context and texture features. Benefiting from the simple but efficient framework, DGNet outperforms existing state-of-the-art COD models by a large margin. Notably, our efficient version, DGNet-S, runs in real-time (80 fps) and achieves comparable results to the cutting-edge model JCSOD-CVPR$_{21}$ with only 6.82\% parameters. Application results also show that the proposed DGNet performs well in polyp segmentation, defect detection, and transparent object segmentation tasks. Codes will be made available at https://github.com/GewelsJI/DGNet.},
}
Endnote
%0 Report %A Ji, Ge-Peng %A Fan, Deng-Ping %A Chou, Yu-Cheng %A Dai, Dengxin %A Liniger, Alexander %A Van Gool, Luc %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations %T Deep Gradient Learning for Efficient Camouflaged Object Detection : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1B97-C %U https://arxiv.org/pdf/2205.12853.pdf %D 2022 %X This paper introduces DGNet, a novel deep framework that exploits object<br>gradient supervision for camouflaged object detection (COD). It decouples the<br>task into two connected branches, i.e., a context and a texture encoder. The<br>essential connection is the gradient-induced transition, representing a soft<br>grouping between context and texture features. Benefiting from the simple but<br>efficient framework, DGNet outperforms existing state-of-the-art COD models by<br>a large margin. Notably, our efficient version, DGNet-S, runs in real-time (80<br>fps) and achieves comparable results to the cutting-edge model<br>JCSOD-CVPR$_{21}$ with only 6.82% parameters. Application results also show<br>that the proposed DGNet performs well in polyp segmentation, defect detection,<br>and transparent object segmentation tasks. Codes will be made available at<br>https://github.com/GewelsJI/DGNet.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV
Hoyer, L., Dai, D., & Van Gool, L. (2022a). HRDA: Context-Aware High-Resolution Domain-Adaptive Semantic Segmentation. In Computer Vision -- ECCV 2022. Tel Aviv, Israel: Springer. doi:10.1007/978-3-031-20056-4_22
Export
BibTeX
@inproceedings{Hoyer_ECCV2022b,
  TITLE = {{HRDA}: {Context-Aware} High-Resolution Domain-Adaptive Semantic Segmentation},
  AUTHOR = {Hoyer, Lukas and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE = {eng},
  ISBN = {978-3-031-20055-7},
  DOI = {10.1007/978-3-031-20056-4_22},
  PUBLISHER = {Springer},
  YEAR = {2022},
  MARGINALMARK = {$\bullet$},
  DATE = {2022},
  BOOKTITLE = {Computer Vision -- ECCV 2022},
  EDITOR = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
  PAGES = {372--391},
  SERIES = {Lecture Notes in Computer Science},
  VOLUME = {13690},
  ADDRESS = {Tel Aviv, Israel},
}
Endnote
%0 Conference Proceedings %A Hoyer, Lukas %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T HRDA: Context-Aware High-Resolution Domain-Adaptive Semantic Segmentation : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1B92-1 %R 10.1007/978-3-031-20056-4_22 %D 2022 %B 17th European Conference on Computer Vision %Z date of event: 2022-10-23 - 2022-10-27 %C Tel Aviv, Israel %B Computer Vision -- ECCV 2022 %E Avidan, Shai; Brostow, Gabriel; Ciss&#233;, Moustapha; Farinella, Giovanni Maria; Hassner, Tal %P 372 - 391 %I Springer %@ 978-3-031-20055-7 %B Lecture Notes in Computer Science %N 13690
Hahner, M., Sakaridis, C., Bijelic, M., Heide, F., Yu, F., Dai, D., & Van Gool, L. (2022). LiDAR Snowfall Simulation for Robust 3D Object Detection. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.01588
Export
BibTeX
@inproceedings{Hahner_CVPR22,
  TITLE        = {{LiDAR} Snowfall Simulation for Robust {3D} Object Detection},
  AUTHOR       = {Hahner, Martin and Sakaridis, Christos and Bijelic, Mario and Heide, Felix and Yu, Fisher and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.01588},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {16343--16353},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Hahner, Martin %A Sakaridis, Christos %A Bijelic, Mario %A Heide, Felix %A Yu, Fisher %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T LiDAR Snowfall Simulation for Robust 3D Object Detection : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1B50-C %R 10.1109/CVPR52688.2022.01588 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 16343 - 16353 %I IEEE %@ 978-1-6654-6946-3
Gong, R., Danelljan, M., Dai, D., Paudel, D. P., Chhatkuli, A., Yu, F., & Van Gool, L. (2022). TACS: Taxonomy Adaptive Cross-Domain Semantic Segmentation. In Computer Vision -- ECCV 2022. Tel Aviv, Israel: Springer. doi:10.1007/978-3-031-19830-4_2
Export
BibTeX
@inproceedings{Gong_ECCV2022b,
  TITLE        = {{TACS}: {T}axonomy Adaptive Cross-Domain Semantic Segmentation},
  AUTHOR       = {Gong, Rui and Danelljan, Martin and Dai, Dengxin and Paudel, Danda Pani and Chhatkuli, Ajad and Yu, Fisher and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-3-031-19829-8},
  DOI          = {10.1007/978-3-031-19830-4_2},
  PUBLISHER    = {Springer},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  DATE         = {2022},
  BOOKTITLE    = {Computer Vision -- ECCV 2022},
  EDITOR       = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
  PAGES        = {19--35},
  SERIES       = {Lecture Notes in Computer Science},
  VOLUME       = {13694},
  ADDRESS      = {Tel Aviv, Israel},
}
Endnote
%0 Conference Proceedings %A Gong, Rui %A Danelljan, Martin %A Dai, Dengxin %A Paudel, Danda Pani %A Chhatkuli, Ajad %A Yu, Fisher %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations External Organizations %T TACS: Taxonomy Adaptive Cross-Domain Semantic Segmentation : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1B59-3 %R 10.1007/978-3-031-19830-4_2 %D 2022 %B 17th European Conference on Computer Vision %Z date of event: 2022-10-23 - 2022-10-27 %C Tel Aviv, Israel %B Computer Vision -- ECCV 2022 %E Avidan, Shai; Brostow, Gabriel; Ciss&#233;, Moustapha; Farinella, Giovanni Maria; Hassner, Tal %P 19 - 35 %I Springer %@ 978-3-031-19829-8 %B Lecture Notes in Computer Science %N 13694
Broedermann, T., Sakaridis, C., Dai, D., & Van Gool, L. (2022). HRFuser: A Multi-resolution Sensor Fusion Architecture for 2D Object Detection. Retrieved from https://arxiv.org/abs/2206.15157
(arXiv: 2206.15157)
Abstract
Besides standard cameras, autonomous vehicles typically include multiple additional sensors, such as lidars and radars, which help acquire richer information for perceiving the content of the driving scene. While several recent works focus on fusing certain pairs of sensors - such as camera and lidar or camera and radar - by using architectural components specific to the examined setting, a generic and modular sensor fusion architecture is missing from the literature. In this work, we focus on 2D object detection, a fundamental high-level task which is defined on the 2D image domain, and propose HRFuser, a multi-resolution sensor fusion architecture that scales straightforwardly to an arbitrary number of input modalities. The design of HRFuser is based on state-of-the-art high-resolution networks for image-only dense prediction and incorporates a novel multi-window cross-attention block as the means to perform fusion of multiple modalities at multiple resolutions. Even though cameras alone provide very informative features for 2D detection, we demonstrate via extensive experiments on the nuScenes and Seeing Through Fog datasets that our model effectively leverages complementary features from additional modalities, substantially improving upon camera-only performance and consistently outperforming state-of-the-art fusion methods for 2D detection both in normal and adverse conditions. The source code will be made publicly available.
Export
BibTeX
@online{Broedermann2206.15157,
  TITLE        = {{HRFuser}: A Multi-resolution Sensor Fusion Architecture for {2D} Object Detection},
  AUTHOR       = {Broedermann, Tim and Sakaridis, Christos and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  URL          = {https://arxiv.org/abs/2206.15157},
  EPRINT       = {2206.15157},
  EPRINTTYPE   = {arXiv},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Besides standard cameras, autonomous vehicles typically include multiple additional sensors, such as lidars and radars, which help acquire richer information for perceiving the content of the driving scene. While several recent works focus on fusing certain pairs of sensors -- such as camera and lidar or camera and radar -- by using architectural components specific to the examined setting, a generic and modular sensor fusion architecture is missing from the literature. In this work, we focus on 2D object detection, a fundamental high-level task which is defined on the 2D image domain, and propose HRFuser, a multi-resolution sensor fusion architecture that scales straightforwardly to an arbitrary number of input modalities. The design of HRFuser is based on state-of-the-art high-resolution networks for image-only dense prediction and incorporates a novel multi-window cross-attention block as the means to perform fusion of multiple modalities at multiple resolutions. Even though cameras alone provide very informative features for 2D detection, we demonstrate via extensive experiments on the nuScenes and Seeing Through Fog datasets that our model effectively leverages complementary features from additional modalities, substantially improving upon camera-only performance and consistently outperforming state-of-the-art fusion methods for 2D detection both in normal and adverse conditions. The source code will be made publicly available.},
}
Endnote
%0 Report %A Broedermann, Tim %A Sakaridis, Christos %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T HRFuser: A Multi-resolution Sensor Fusion Architecture for 2D Object Detection : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1B9D-6 %U https://arxiv.org/abs/2206.15157 %D 2022 %X Besides standard cameras, autonomous vehicles typically include multiple<br>additional sensors, such as lidars and radars, which help acquire richer<br>information for perceiving the content of the driving scene. While several<br>recent works focus on fusing certain pairs of sensors - such as camera and<br>lidar or camera and radar - by using architectural components specific to the<br>examined setting, a generic and modular sensor fusion architecture is missing<br>from the literature. In this work, we focus on 2D object detection, a<br>fundamental high-level task which is defined on the 2D image domain, and<br>propose HRFuser, a multi-resolution sensor fusion architecture that scales<br>straightforwardly to an arbitrary number of input modalities. The design of<br>HRFuser is based on state-of-the-art high-resolution networks for image-only<br>dense prediction and incorporates a novel multi-window cross-attention block as<br>the means to perform fusion of multiple modalities at multiple resolutions.<br>Even though cameras alone provide very informative features for 2D detection,<br>we demonstrate via extensive experiments on the nuScenes and Seeing Through Fog<br>datasets that our model effectively leverages complementary features from<br>additional modalities, substantially improving upon camera-only performance and<br>consistently outperforming state-of-the-art fusion methods for 2D detection<br>both in normal and adverse conditions. 
The source code will be made publicly<br>available.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG
Wu, Y.-H., Zhang, D., Zhang, L., Zhan, X., Dai, D., Liu, Y., & Cheng, M.-M. (2022). Ret3D: Rethinking Object Relations for Efficient 3D Object Detection in Driving Scenes. Retrieved from https://arxiv.org/abs/2208.08621
(arXiv: 2208.08621)
Abstract
Current efficient LiDAR-based detection frameworks are lacking in exploiting object relations, which naturally present in both spatial and temporal manners. To this end, we introduce a simple, efficient, and effective two-stage detector, termed as Ret3D. At the core of Ret3D is the utilization of novel intra-frame and inter-frame relation modules to capture the spatial and temporal relations accordingly. More Specifically, intra-frame relation module (IntraRM) encapsulates the intra-frame objects into a sparse graph and thus allows us to refine the object features through efficient message passing. On the other hand, inter-frame relation module (InterRM) densely connects each object in its corresponding tracked sequences dynamically, and leverages such temporal information to further enhance its representations efficiently through a lightweight transformer network. We instantiate our novel designs of IntraRM and InterRM with general center-based or anchor-based detectors and evaluate them on Waymo Open Dataset (WOD). With negligible extra overhead, Ret3D achieves the state-of-the-art performance, being 5.5% and 3.2% higher than the recent competitor in terms of the LEVEL 1 and LEVEL 2 mAPH metrics on vehicle detection, respectively.
Export
BibTeX
@online{Wu2208.08621,
  TITLE        = {{Ret3D}: Rethinking Object Relations for Efficient {3D} Object Detection in Driving Scenes},
  AUTHOR       = {Wu, Yu-Huan and Zhang, Da and Zhang, Le and Zhan, Xin and Dai, Dengxin and Liu, Yun and Cheng, Ming-Ming},
  LANGUAGE     = {eng},
  URL          = {https://arxiv.org/abs/2208.08621},
  EPRINT       = {2208.08621},
  EPRINTTYPE   = {arXiv},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Current efficient LiDAR-based detection frameworks are lacking in exploiting object relations, which naturally present in both spatial and temporal manners. To this end, we introduce a simple, efficient, and effective two-stage detector, termed as Ret3D. At the core of Ret3D is the utilization of novel intra-frame and inter-frame relation modules to capture the spatial and temporal relations accordingly. More Specifically, intra-frame relation module (IntraRM) encapsulates the intra-frame objects into a sparse graph and thus allows us to refine the object features through efficient message passing. On the other hand, inter-frame relation module (InterRM) densely connects each object in its corresponding tracked sequences dynamically, and leverages such temporal information to further enhance its representations efficiently through a lightweight transformer network. We instantiate our novel designs of IntraRM and InterRM with general center-based or anchor-based detectors and evaluate them on Waymo Open Dataset (WOD). With negligible extra overhead, Ret3D achieves the state-of-the-art performance, being 5.5% and 3.2% higher than the recent competitor in terms of the LEVEL 1 and LEVEL 2 mAPH metrics on vehicle detection, respectively.},
}
Endnote
%0 Report %A Wu, Yu-Huan %A Zhang, Da %A Zhang, Le %A Zhan, Xin %A Dai, Dengxin %A Liu, Yun %A Cheng, Ming-Ming %+ External Organizations External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations %T Ret3D: Rethinking Object Relations for Efficient 3D Object Detection in Driving Scenes : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1BA0-1 %U https://arxiv.org/abs/2208.08621 %D 2022 %X Current efficient LiDAR-based detection frameworks are lacking in exploiting<br>object relations, which naturally present in both spatial and temporal manners.<br>To this end, we introduce a simple, efficient, and effective two-stage<br>detector, termed as Ret3D. At the core of Ret3D is the utilization of novel<br>intra-frame and inter-frame relation modules to capture the spatial and<br>temporal relations accordingly. More Specifically, intra-frame relation module<br>(IntraRM) encapsulates the intra-frame objects into a sparse graph and thus<br>allows us to refine the object features through efficient message passing. On<br>the other hand, inter-frame relation module (InterRM) densely connects each<br>object in its corresponding tracked sequences dynamically, and leverages such<br>temporal information to further enhance its representations efficiently through<br>a lightweight transformer network. We instantiate our novel designs of IntraRM<br>and InterRM with general center-based or anchor-based detectors and evaluate<br>them on Waymo Open Dataset (WOD). With negligible extra overhead, Ret3D<br>achieves the state-of-the-art performance, being 5.5% and 3.2% higher than the<br>recent competitor in terms of the LEVEL 1 and LEVEL 2 mAPH metrics on vehicle<br>detection, respectively.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Artificial Intelligence, cs.AI
Hoyer, L., Dai, D., Wang, H., & Van Gool, L. (2022). MIC: Masked Image Consistency for Context-Enhanced Domain Adaptation. Retrieved from https://arxiv.org/abs/2212.01322
(arXiv: 2212.01322)
Abstract
In unsupervised domain adaptation (UDA), a model trained on source data (e.g. synthetic) is adapted to target data (e.g. real-world) without access to target annotation. Most previous UDA methods struggle with classes that have a similar visual appearance on the target domain as no ground truth is available to learn the slight appearance differences. To address this problem, we propose a Masked Image Consistency (MIC) module to enhance UDA by learning spatial context relations of the target domain as additional clues for robust visual recognition. MIC enforces the consistency between predictions of masked target images, where random patches are withheld, and pseudo-labels that are generated based on the complete image by an exponential moving average teacher. To minimize the consistency loss, the network has to learn to infer the predictions of the masked regions from their context. Due to its simple and universal concept, MIC can be integrated into various UDA methods across different visual recognition tasks such as image classification, semantic segmentation, and object detection. MIC significantly improves the state-of-the-art performance across the different recognition tasks for synthetic-to-real, day-to-nighttime, and clear-to-adverse-weather UDA. For instance, MIC achieves an unprecedented UDA performance of 75.9 mIoU and 92.8% on GTA-to-Cityscapes and VisDA-2017, respectively, which corresponds to an improvement of +2.1 and +3.0 percent points over the previous state of the art. The implementation is available at https://github.com/lhoyer/MIC.
Export
BibTeX
@online{Hoyer2212.01322,
  TITLE        = {{MIC}: Masked Image Consistency for Context-Enhanced Domain Adaptation},
  AUTHOR       = {Hoyer, Lukas and Dai, Dengxin and Wang, Haoran and Van Gool, Luc},
  LANGUAGE     = {eng},
  URL          = {https://arxiv.org/abs/2212.01322},
  EPRINT       = {2212.01322},
  EPRINTTYPE   = {arXiv},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {In unsupervised domain adaptation (UDA), a model trained on source data (e.g. synthetic) is adapted to target data (e.g. real-world) without access to target annotation. Most previous UDA methods struggle with classes that have a similar visual appearance on the target domain as no ground truth is available to learn the slight appearance differences. To address this problem, we propose a Masked Image Consistency (MIC) module to enhance UDA by learning spatial context relations of the target domain as additional clues for robust visual recognition. MIC enforces the consistency between predictions of masked target images, where random patches are withheld, and pseudo-labels that are generated based on the complete image by an exponential moving average teacher. To minimize the consistency loss, the network has to learn to infer the predictions of the masked regions from their context. Due to its simple and universal concept, MIC can be integrated into various UDA methods across different visual recognition tasks such as image classification, semantic segmentation, and object detection. MIC significantly improves the state-of-the-art performance across the different recognition tasks for synthetic-to-real, day-to-nighttime, and clear-to-adverse-weather UDA. For instance, MIC achieves an unprecedented UDA performance of 75.9 mIoU and 92.8% on GTA-to-Cityscapes and VisDA-2017, respectively, which corresponds to an improvement of +2.1 and +3.0 percent points over the previous state of the art. The implementation is available at https://github.com/lhoyer/MIC.},
}
Endnote
%0 Report %A Hoyer, Lukas %A Dai, Dengxin %A Wang, Haoran %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations %T MIC: Masked Image Consistency for Context-Enhanced Domain Adaptation : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1BA3-E %U https://arxiv.org/abs/2212.01322 %D 2022 %X In unsupervised domain adaptation (UDA), a model trained on source data (e.g.<br>synthetic) is adapted to target data (e.g. real-world) without access to target<br>annotation. Most previous UDA methods struggle with classes that have a similar<br>visual appearance on the target domain as no ground truth is available to learn<br>the slight appearance differences. To address this problem, we propose a Masked<br>Image Consistency (MIC) module to enhance UDA by learning spatial context<br>relations of the target domain as additional clues for robust visual<br>recognition. MIC enforces the consistency between predictions of masked target<br>images, where random patches are withheld, and pseudo-labels that are generated<br>based on the complete image by an exponential moving average teacher. To<br>minimize the consistency loss, the network has to learn to infer the<br>predictions of the masked regions from their context. Due to its simple and<br>universal concept, MIC can be integrated into various UDA methods across<br>different visual recognition tasks such as image classification, semantic<br>segmentation, and object detection. MIC significantly improves the<br>state-of-the-art performance across the different recognition tasks for<br>synthetic-to-real, day-to-nighttime, and clear-to-adverse-weather UDA. 
For<br>instance, MIC achieves an unprecedented UDA performance of 75.9 mIoU and 92.8%<br>on GTA-to-Cityscapes and VisDA-2017, respectively, which corresponds to an<br>improvement of +2.1 and +3.0 percent points over the previous state of the art.<br>The implementation is available at https://github.com/lhoyer/MIC.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV
Gong, R., Dai, D., Chen, Y., Li, W., & Van Gool, L. (2021). mDALU: Multi-Source Domain Adaptation and Label Unification with Partial Datasets. In ICCV 2021, IEEE/CVF International Conference on Computer Vision. Virtual Event: IEEE. doi:10.1109/ICCV48922.2021.00875
Export
BibTeX
@inproceedings{GongICCV21,
  TITLE        = {{mDALU}: {M}ulti-Source Domain Adaptation and Label Unification with Partial Datasets},
  AUTHOR       = {Gong, Rui and Dai, Dengxin and Chen, Yuhua and Li, Wen and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-2812-5},
  DOI          = {10.1109/ICCV48922.2021.00875},
  PUBLISHER    = {IEEE},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {ICCV 2021, IEEE/CVF International Conference on Computer Vision},
  PAGES        = {8856--8865},
  ADDRESS      = {Virtual Event},
}
Endnote
%0 Conference Proceedings %A Gong, Rui %A Dai, Dengxin %A Chen, Yuhua %A Li, Wen %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations %T mDALU: Multi-Source Domain Adaptation and Label Unification with Partial Datasets : %G eng %U http://hdl.handle.net/21.11116/0000-0009-4476-6 %R 10.1109/ICCV48922.2021.00875 %D 2021 %B IEEE/CVF International Conference on Computer Vision %Z date of event: 2021-10-11 - 2021-10-17 %C Virtual Event %B ICCV 2021 %P 8856 - 8865 %I IEEE %@ 978-1-6654-2812-5
Wang, Q., Dai, D., Hoyer, L., Van Gool, L., & Fink, O. (2021). Domain Adaptive Semantic Segmentation with Self-Supervised Depth Estimation. In ICCV 2021, IEEE/CVF International Conference on Computer Vision. Virtual Event: IEEE. doi:10.1109/ICCV48922.2021.00840
Export
BibTeX
@inproceedings{wang2021domain,
  TITLE        = {Domain Adaptive Semantic Segmentation with Self-Supervised Depth Estimation},
  AUTHOR       = {Wang, Qin and Dai, Dengxin and Hoyer, Lukas and Van Gool, Luc and Fink, Olga},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-2812-5},
  DOI          = {10.1109/ICCV48922.2021.00840},
  PUBLISHER    = {IEEE},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {ICCV 2021, IEEE/CVF International Conference on Computer Vision},
  PAGES        = {8495--8505},
  ADDRESS      = {Virtual Event},
}
Endnote
%0 Conference Proceedings %A Wang, Qin %A Dai, Dengxin %A Hoyer, Lukas %A Van Gool, Luc %A Fink, Olga %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations %T Domain Adaptive Semantic Segmentation with Self-Supervised Depth Estimation : %G eng %U http://hdl.handle.net/21.11116/0000-0009-44AE-7 %R 10.1109/ICCV48922.2021.00840 %D 2021 %B IEEE/CVF International Conference on Computer Vision %Z date of event: 2021-10-11 - 2021-10-17 %C Virtual Event %B ICCV 2021 %P 8495 - 8505 %I IEEE %@ 978-1-6654-2812-5 %U https://github.com/qinenergy/corda
Patil, V., Liniger, A., Dai, D., & Van Gool, L. (2022). Improving Depth Estimation Using Map-Based Depth Priors. IEEE Robotics and Automation Letters, 7(2). doi:10.1109/LRA.2022.3146914
Export
BibTeX
@article{Patil2022,
  TITLE        = {Improving Depth Estimation Using Map-Based Depth Priors},
  AUTHOR       = {Patil, Vaishakh and Liniger, Alexander and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISSN         = {2377-3766},
  DOI          = {10.1109/LRA.2022.3146914},
  PUBLISHER    = {IEEE},
  ADDRESS      = {New York, NY},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {IEEE Robotics and Automation Letters},
  VOLUME       = {7},
  NUMBER       = {2},
  PAGES        = {3640--3647},
}
Endnote
%0 Journal Article %A Patil, Vaishakh %A Liniger, Alexander %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Improving Depth Estimation Using Map-Based Depth Priors : %G eng %U http://hdl.handle.net/21.11116/0000-000A-1531-7 %R 10.1109/LRA.2022.3146914 %7 2022 %D 2022 %J IEEE Robotics and Automation Letters %V 7 %N 2 %& 3640 %P 3640 - 3647 %I IEEE %C New York, NY %@ false
Hahner, M., Sakaridis, C., Dai, D., & Van Gool, L. (2021). Fog Simulation on Real LiDAR Point Clouds for 3D Object Detection in Adverse Weather. In ICCV 2021, IEEE/CVF International Conference on Computer Vision. Virtual Event: IEEE. doi:10.1109/ICCV48922.2021.01500
Export
BibTeX
@inproceedings{HahnerICCV21,
  TITLE        = {Fog Simulation on Real {LiDAR} Point Clouds for {3D} Object Detection in Adverse Weather},
  AUTHOR       = {Hahner, Martin and Sakaridis, Christos and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-2812-5},
  DOI          = {10.1109/ICCV48922.2021.01500},
  PUBLISHER    = {IEEE},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {ICCV 2021, IEEE/CVF International Conference on Computer Vision},
  PAGES        = {15263--15272},
  ADDRESS      = {Virtual Event},
}
Endnote
%0 Conference Proceedings %A Hahner, Martin %A Sakaridis, Christos %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Fog Simulation on Real LiDAR Point Clouds for 3D Object Detection in Adverse Weather : %G eng %U http://hdl.handle.net/21.11116/0000-0009-445F-1 %R 10.1109/ICCV48922.2021.01500 %D 2021 %B IEEE/CVF International Conference on Computer Vision %Z date of event: 2021-10-11 - 2021-10-17 %C Virtual Event %B ICCV 2021 %P 15263 - 15272 %I IEEE %@ 978-1-6654-2812-5 %U https://github.com/MartinHahner/LiDAR_fog_sim
Li, S., Chen, X., Liu, Y., Dai, D., Stachniss, C., & Gall, J. (2022). Multi-Scale Interaction for Real-Time LiDAR Data Segmentation on an Embedded Platform. IEEE Robotics and Automation Letters, 7(2). doi:10.1109/LRA.2021.3132059
Export
BibTeX
@article{Li2022,
  TITLE        = {Multi-Scale Interaction for Real-Time {LiDAR} Data Segmentation on an Embedded Platform},
  AUTHOR       = {Li, Shijie and Chen, Xieyuanli and Liu, Yun and Dai, Dengxin and Stachniss, Cyrill and Gall, J{\"u}rgen},
  LANGUAGE     = {eng},
  ISSN         = {2377-3766},
  DOI          = {10.1109/LRA.2021.3132059},
  PUBLISHER    = {IEEE},
  ADDRESS      = {Piscataway, NJ},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  DATE         = {2022},
  JOURNAL      = {IEEE Robotics and Automation Letters},
  VOLUME       = {7},
  NUMBER       = {2},
  PAGES        = {738--745},
}
Endnote
%0 Journal Article %A Li, Shijie %A Chen, Xieyuanli %A Liu, Yun %A Dai, Dengxin %A Stachniss, Cyrill %A Gall, J&#252;rgen %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations %T Multi-Scale Interaction for Real-Time LiDAR Data Segmentation on an Embedded Platform : %G eng %U http://hdl.handle.net/21.11116/0000-0009-B1AD-C %R 10.1109/LRA.2021.3132059 %7 2022 %D 2022 %J IEEE Robotics and Automation Letters %V 7 %N 2 %& 738 %P 738 - 745 %I IEEE %C Piscataway, NJ %@ false
Vasudevan, A. B., Dai, D., & Van Gool, L. (2022). Sound and Visual Representation Learning with Multiple Pretraining Tasks. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.01421
Export
BibTeX
@inproceedings{Vasudevan_CVPR2022,
  TITLE        = {Sound and Visual Representation Learning with Multiple Pretraining Tasks},
  AUTHOR       = {Vasudevan, Arun Balajee and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.01421},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {14596--14606},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Vasudevan, Arun Balajee %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Sound and Visual Representation Learning with Multiple Pretraining Tasks : %G eng %U http://hdl.handle.net/21.11116/0000-000A-16C0-4 %R 10.1109/CVPR52688.2022.01421 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 14596 - 14606 %I IEEE %@ 978-1-6654-6946-3
Zaech, J.-N., Dai, D., Liniger, A., Danelljan, M., & Van Gool, L. (2022). Learnable Online Graph Representations for 3D Multi-Object Tracking. IEEE Robotics and Automation Letters. doi:10.1109/LRA.2022.3145952
Export
BibTeX
@article{Zaech2104.11747,
  TITLE        = {Learnable Online Graph Representations for {3D} Multi-Object Tracking},
  AUTHOR       = {Zaech, Jan-Nico and Dai, Dengxin and Liniger, Alexander and Danelljan, Martin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISSN         = {2377-3766},
  DOI          = {10.1109/LRA.2022.3145952},
  PUBLISHER    = {IEEE},
  ADDRESS      = {Piscataway, NJ},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {IEEE Robotics and Automation Letters},
}
Endnote
%0 Journal Article %A Zaech, Jan-Nico %A Dai, Dengxin %A Liniger, Alexander %A Danelljan, Martin %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations %T Learnable Online Graph Representations for 3D Multi-Object Tracking : %G eng %U http://hdl.handle.net/21.11116/0000-0009-444F-3 %R 10.1109/LRA.2022.3145952 %7 2022 %D 2022 %J IEEE Robotics and Automation Letters %I IEEE %C Piscataway, NJ %@ false
Gong, R., Li, W., Chen, Y., Dai, D., & Van Gool, L. (2021). DLOW: Domain Flow and Applications. International Journal of Computer Vision, 129. doi:10.1007/s11263-021-01496-2
Export
BibTeX
@article{Gong2021,
  TITLE        = {{DLOW}: {D}omain Flow and Applications},
  AUTHOR       = {Gong, Rui and Li, Wen and Chen, Yuhua and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISSN         = {0920-5691},
  DOI          = {10.1007/s11263-021-01496-2},
  PUBLISHER    = {Springer},
  ADDRESS      = {New York, NY},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {International Journal of Computer Vision},
  VOLUME       = {129},
  PAGES        = {2865--2888},
}
Endnote
%0 Journal Article %A Gong, Rui %A Li, Wen %A Chen, Yuhua %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T DLOW: Domain Flow and Applications : %G eng %U http://hdl.handle.net/21.11116/0000-0009-2A6C-0 %R 10.1007/s11263-021-01496-2 %7 2021 %D 2021 %J International Journal of Computer Vision %O Int. J. Comput. Vis. %V 129 %& 2865 %P 2865 - 2888 %I Springer %C New York, NY %@ false %U https://github.com/ETHRuiGong/DLOW
Sakaridis, C., Dai, D., & Van Gool, L. (2021). ACDC: The Adverse Conditions Dataset with Correspondences for Semantic Driving Scene Understanding. In ICCV 2021, IEEE/CVF International Conference on Computer Vision. Virtual Event: IEEE. doi:10.1109/ICCV48922.2021.01059
Export
BibTeX
@inproceedings{SakaridisICCV21,
  TITLE        = {{ACDC}: {The} Adverse Conditions Dataset with Correspondences for Semantic Driving Scene Understanding},
  AUTHOR       = {Sakaridis, Christos and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  DOI          = {10.1109/ICCV48922.2021.01059},
  PUBLISHER    = {IEEE},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {ICCV 2021, IEEE/CVF International Conference on Computer Vision},
  PAGES        = {10745--10755},
  ADDRESS      = {Virtual Event},
}
Endnote
%0 Conference Proceedings %A Sakaridis, Christos %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T ACDC: The Adverse Conditions Dataset with Correspondences for Semantic Driving Scene Understanding : %G eng %U http://hdl.handle.net/21.11116/0000-0009-446A-4 %R 10.1109/ICCV48922.2021.01059 %D 2021 %B IEEE/CVF International Conference on Computer Vision %Z date of event: 2021-10-11 - 2021-10-17 %C Virtual Event %B ICCV 2021 %P 10745 - 10755 %I IEEE %U https://acdc.vision.ee.ethz.ch/
Hoyer, L., Dai, D., Wang, Q., Chen, Y., & Van Gool, L. (2021). Improving Semi-Supervised and Domain-Adaptive Semantic Segmentation with Self-Supervised Depth Estimation. Retrieved from https://arxiv.org/abs/2108.12545
(arXiv: 2108.12545)
Abstract
Training deep networks for semantic segmentation requires large amounts of labeled training data, which presents a major challenge in practice, as labeling segmentation masks is a highly labor-intensive process. To address this issue, we present a framework for semi-supervised and domain-adaptive semantic segmentation, which is enhanced by self-supervised monocular depth estimation (SDE) trained only on unlabeled image sequences. In particular, we utilize SDE as an auxiliary task comprehensively across the entire learning framework: First, we automatically select the most useful samples to be annotated for semantic segmentation based on the correlation of sample diversity and difficulty between SDE and semantic segmentation. Second, we implement a strong data augmentation by mixing images and labels using the geometry of the scene. Third, we transfer knowledge from features learned during SDE to semantic segmentation by means of transfer and multi-task learning. And fourth, we exploit additional labeled synthetic data with Cross-Domain DepthMix and Matching Geometry Sampling to align synthetic and real data. We validate the proposed model on the Cityscapes dataset, where all four contributions demonstrate significant performance gains, and achieve state-of-the-art results for semi-supervised semantic segmentation as well as for semi-supervised domain adaptation. In particular, with only 1/30 of the Cityscapes labels, our method achieves 92% of the fully-supervised baseline performance and even 97% when exploiting additional data from GTA. The source code is available at https://github.com/lhoyer/improving_segmentation_with_selfsupervised_depth.
Export
BibTeX
@online{Hoyer2108.12545,
  TITLE        = {Improving Semi-Supervised and Domain-Adaptive Semantic Segmentation with Self-Supervised Depth Estimation},
  AUTHOR       = {Hoyer, Lukas and Dai, Dengxin and Wang, Qin and Chen, Yuhua and Van Gool, Luc},
  LANGUAGE     = {eng},
  URL          = {https://arxiv.org/abs/2108.12545},
  EPRINT       = {2108.12545},
  EPRINTTYPE   = {arXiv},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Training deep networks for semantic segmentation requires large amounts of labeled training data, which presents a major challenge in practice, as labeling segmentation masks is a highly labor-intensive process. To address this issue, we present a framework for semi-supervised and domain-adaptive semantic segmentation, which is enhanced by self-supervised monocular depth estimation (SDE) trained only on unlabeled image sequences. In particular, we utilize SDE as an auxiliary task comprehensively across the entire learning framework: First, we automatically select the most useful samples to be annotated for semantic segmentation based on the correlation of sample diversity and difficulty between SDE and semantic segmentation. Second, we implement a strong data augmentation by mixing images and labels using the geometry of the scene. Third, we transfer knowledge from features learned during SDE to semantic segmentation by means of transfer and multi-task learning. And fourth, we exploit additional labeled synthetic data with Cross-Domain DepthMix and Matching Geometry Sampling to align synthetic and real data. We validate the proposed model on the Cityscapes dataset, where all four contributions demonstrate significant performance gains, and achieve state-of-the-art results for semi-supervised semantic segmentation as well as for semi-supervised domain adaptation. In particular, with only 1/30 of the Cityscapes labels, our method achieves 92\% of the fully-supervised baseline performance and even 97\% when exploiting additional data from GTA. The source code is available at https://github.com/lhoyer/improving\_segmentation\_with\_selfsupervised\_depth.},
}
Endnote
%0 Report %A Hoyer, Lukas %A Dai, Dengxin %A Wang, Qin %A Chen, Yuhua %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations External Organizations %T Improving Semi-Supervised and Domain-Adaptive Semantic Segmentation with Self-Supervised Depth Estimation : %G eng %U http://hdl.handle.net/21.11116/0000-0009-4449-9 %U https://arxiv.org/abs/2108.12545 %D 2021 %X Training deep networks for semantic segmentation requires large amounts of<br>labeled training data, which presents a major challenge in practice, as<br>labeling segmentation masks is a highly labor-intensive process. To address<br>this issue, we present a framework for semi-supervised and domain-adaptive<br>semantic segmentation, which is enhanced by self-supervised monocular depth<br>estimation (SDE) trained only on unlabeled image sequences.<br> In particular, we utilize SDE as an auxiliary task comprehensively across the<br>entire learning framework: First, we automatically select the most useful<br>samples to be annotated for semantic segmentation based on the correlation of<br>sample diversity and difficulty between SDE and semantic segmentation. Second,<br>we implement a strong data augmentation by mixing images and labels using the<br>geometry of the scene. Third, we transfer knowledge from features learned<br>during SDE to semantic segmentation by means of transfer and multi-task<br>learning. And fourth, we exploit additional labeled synthetic data with<br>Cross-Domain DepthMix and Matching Geometry Sampling to align synthetic and<br>real data.<br> We validate the proposed model on the Cityscapes dataset, where all four<br>contributions demonstrate significant performance gains, and achieve<br>state-of-the-art results for semi-supervised semantic segmentation as well as<br>for semi-supervised domain adaptation. 
In particular, with only 1/30 of the<br>Cityscapes labels, our method achieves 92% of the fully-supervised baseline<br>performance and even 97% when exploiting additional data from GTA. The source<br>code is available at<br>https://github.com/lhoyer/improving_segmentation_with_selfsupervised_depth.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV
Fan, Y., Dai, D., & Schiele, B. (2022). CoSSL: Co-Learning of Representation and Classifier for Imbalanced Semi-Supervised Learning. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.01417
Abstract
In this paper, we propose a novel co-learning framework (CoSSL) with decoupled representation learning and classifier learning for imbalanced SSL. To handle the data imbalance, we devise Tail-class Feature Enhancement (TFE) for classifier learning. Furthermore, the current evaluation protocol for imbalanced SSL focuses only on balanced test sets, which has limited practicality in real-world scenarios. Therefore, we further conduct a comprehensive evaluation under various shifted test distributions. In experiments, we show that our approach outperforms other methods over a large range of shifted distributions, achieving state-of-the-art performance on benchmark datasets ranging from CIFAR-10, CIFAR-100, ImageNet, to Food-101. Our code will be made publicly available.
Export
BibTeX
@inproceedings{Fan_CVPR2022,
  TITLE        = {{CoSSL}: {C}o-Learning of Representation and Classifier for Imbalanced Semi-Supervised Learning},
  AUTHOR       = {Fan, Yue and Dai, Dengxin and Schiele, Bernt},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.01417},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {In this paper, we propose a novel co-learning framework (CoSSL) with decoupled representation learning and classifier learning for imbalanced SSL. To handle the data imbalance, we devise Tail-class Feature Enhancement (TFE) for classifier learning. Furthermore, the current evaluation protocol for imbalanced SSL focuses only on balanced test sets, which has limited practicality in real-world scenarios. Therefore, we further conduct a comprehensive evaluation under various shifted test distributions. In experiments, we show that our approach outperforms other methods over a large range of shifted distributions, achieving state-of-the-art performance on benchmark datasets ranging from CIFAR-10, CIFAR-100, ImageNet, to Food-101. Our code will be made publicly available.},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {14554--14564},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Fan, Yue %A Dai, Dengxin %A Schiele, Bernt %+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T CoSSL: Co-Learning of Representation and Classifier for Imbalanced Semi-Supervised Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000A-16BA-C %R 10.1109/CVPR52688.2022.01417 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %X In this paper, we propose a novel co-learning framework (CoSSL) with<br>decoupled representation learning and classifier learning for imbalanced SSL.<br>To handle the data imbalance, we devise Tail-class Feature Enhancement (TFE)<br>for classifier learning. Furthermore, the current evaluation protocol for<br>imbalanced SSL focuses only on balanced test sets, which has limited<br>practicality in real-world scenarios. Therefore, we further conduct a<br>comprehensive evaluation under various shifted test distributions. In<br>experiments, we show that our approach outperforms other methods over a large<br>range of shifted distributions, achieving state-of-the-art performance on<br>benchmark datasets ranging from CIFAR-10, CIFAR-100, ImageNet, to Food-101. Our<br>code will be made publicly available.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV,Computer Science, Learning, cs.LG %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 14554 - 14564 %I IEEE %@ 978-1-6654-6946-3
Fan, Y., Kukleva, A., Dai, D., & Schiele, B. (2022). Revisiting Consistency Regularization for Semi-supervised Learning. International Journal of Computer Vision. doi:10.1007/s11263-022-01723-4
Export
BibTeX
@article{Fan22,
  TITLE        = {Revisiting Consistency Regularization for Semi-supervised Learning},
  AUTHOR       = {Fan, Yue and Kukleva, Anna and Dai, Dengxin and Schiele, Bernt},
  LANGUAGE     = {eng},
  ISSN         = {0920-5691},
  DOI          = {10.1007/s11263-022-01723-4},
  PUBLISHER    = {Springer},
  ADDRESS      = {New York, NY},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {International Journal of Computer Vision},
}
Endnote
%0 Journal Article %A Fan, Yue %A Kukleva, Anna %A Dai, Dengxin %A Schiele, Bernt %+ Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Revisiting Consistency Regularization for Semi-supervised Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000C-73A9-4 %R 10.1007/s11263-022-01723-4 %7 2022 %D 2022 %J International Journal of Computer Vision %O Int. J. Comput. Vis. %I Springer %C New York, NY %@ false
Cai, S., Obukhov, A., Dai, D., & Van Gool, L. (2022). Pix2NeRF: Unsupervised Conditional Pi-GAN for Single Image to Neural Radiance Fields Translation. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.00395
Export
BibTeX
@inproceedings{Cai_CVPR2022,
  TITLE        = {{Pix2NeRF}: {U}nsupervised Conditional $\pi$-{GAN} for Single Image to Neural Radiance Fields Translation},
  AUTHOR       = {Cai, Shengqu and Obukhov, Anton and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.00395},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {3971--3980},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Cai, Shengqu %A Obukhov, Anton %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Pix2NeRF: Unsupervised Conditional Pi-GAN for Single Image to Neural Radiance Fields Translation : %G eng %U http://hdl.handle.net/21.11116/0000-000A-160D-0 %R 10.1109/CVPR52688.2022.00395 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 3971 - 3980 %I IEEE %@ 978-1-6654-6946-3
Ma, X., Wang, Z., Zhan, Y., Zheng, Y., Wang, Z., Dai, D., & Lin, C.-W. (2022). Both Style and Fog Matter: Cumulative Domain Adaptation for Semantic Foggy Scene Understanding. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.01835
Abstract
Although considerable progress has been made in semantic scene understanding under clear weather, it is still a tough problem under adverse weather conditions, such as dense fog, due to the uncertainty caused by imperfect observations. Besides, difficulties in collecting and labeling foggy images hinder the progress of this field. Considering the success in semantic scene understanding under clear weather, we think it is reasonable to transfer knowledge learned from clear images to the foggy domain. As such, the problem becomes to bridge the domain gap between clear images and foggy images. Unlike previous methods that mainly focus on closing the domain gap caused by fog -- defogging the foggy images or fogging the clear images, we propose to alleviate the domain gap by considering fog influence and style variation simultaneously. The motivation is based on our finding that the style-related gap and the fog-related gap can be divided and closed respectively, by adding an intermediate domain. Thus, we propose a new pipeline to cumulatively adapt style, fog and the dual-factor (style and fog). Specifically, we devise a unified framework to disentangle the style factor and the fog factor separately, and then the dual-factor from images in different domains. Furthermore, we collaborate the disentanglement of three factors with a novel cumulative loss to thoroughly disentangle these three factors. Our method achieves the state-of-the-art performance on three benchmarks and shows generalization ability in rainy and snowy scenes.
Export
BibTeX
@inproceedings{Ma_CVPR2022,
  TITLE        = {Both Style and Fog Matter: {C}umulative Domain Adaptation for Semantic Foggy Scene Understanding},
  AUTHOR       = {Ma, Xianzheng and Wang, Zhixiang and Zhan, Yacheng and Zheng, Yinqiang and Wang, Zheng and Dai, Dengxin and Lin, Chia-Wen},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.01835},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  ABSTRACT     = {Although considerable progress has been made in semantic scene understanding under clear weather, it is still a tough problem under adverse weather conditions, such as dense fog, due to the uncertainty caused by imperfect observations. Besides, difficulties in collecting and labeling foggy images hinder the progress of this field. Considering the success in semantic scene understanding under clear weather, we think it is reasonable to transfer knowledge learned from clear images to the foggy domain. As such, the problem becomes to bridge the domain gap between clear images and foggy images. Unlike previous methods that mainly focus on closing the domain gap caused by fog -- defogging the foggy images or fogging the clear images, we propose to alleviate the domain gap by considering fog influence and style variation simultaneously. The motivation is based on our finding that the style-related gap and the fog-related gap can be divided and closed respectively, by adding an intermediate domain. Thus, we propose a new pipeline to cumulatively adapt style, fog and the dual-factor (style and fog). Specifically, we devise a unified framework to disentangle the style factor and the fog factor separately, and then the dual-factor from images in different domains. Furthermore, we collaborate the disentanglement of three factors with a novel cumulative loss to thoroughly disentangle these three factors. Our method achieves the state-of-the-art performance on three benchmarks and shows generalization ability in rainy and snowy scenes.},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {18900--18909},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Ma, Xianzheng %A Wang, Zhixiang %A Zhan, Yacheng %A Zheng, Yinqiang %A Wang, Zheng %A Dai, Dengxin %A Lin, Chia-Wen %+ External Organizations External Organizations External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Both Style and Fog Matter: Cumulative Domain Adaptation for Semantic Foggy Scene Understanding : %G eng %U http://hdl.handle.net/21.11116/0000-000A-165D-6 %R 10.1109/CVPR52688.2022.01835 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %X Although considerable progress has been made in semantic scene understanding<br>under clear weather, it is still a tough problem under adverse weather<br>conditions, such as dense fog, due to the uncertainty caused by imperfect<br>observations. Besides, difficulties in collecting and labeling foggy images<br>hinder the progress of this field. Considering the success in semantic scene<br>understanding under clear weather, we think it is reasonable to transfer<br>knowledge learned from clear images to the foggy domain. As such, the problem<br>becomes to bridge the domain gap between clear images and foggy images. Unlike<br>previous methods that mainly focus on closing the domain gap caused by fog --<br>defogging the foggy images or fogging the clear images, we propose to alleviate<br>the domain gap by considering fog influence and style variation simultaneously.<br>The motivation is based on our finding that the style-related gap and the<br>fog-related gap can be divided and closed respectively, by adding an<br>intermediate domain. Thus, we propose a new pipeline to cumulatively adapt<br>style, fog and the dual-factor (style and fog). 
Specifically, we devise a<br>unified framework to disentangle the style factor and the fog factor<br>separately, and then the dual-factor from images in different domains.<br>Furthermore, we collaborate the disentanglement of three factors with a novel<br>cumulative loss to thoroughly disentangle these three factors. Our method<br>achieves the state-of-the-art performance on three benchmarks and shows<br>generalization ability in rainy and snowy scenes.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 18900 - 18909 %I IEEE %@ 978-1-6654-6946-3
Gong, S., Zhang, S., Yang, J., Dai, D., & Schiele, B. (2022a). Bi-level Alignment for Cross-Domain Crowd Counting. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.00739
Export
BibTeX
@inproceedings{Gong_CVPR2022,
  TITLE        = {Bi-level Alignment for Cross-Domain Crowd Counting},
  AUTHOR       = {Gong, Shenjian and Zhang, Shanshan and Yang, Jian and Dai, Dengxin and Schiele, Bernt},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.00739},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {7532--7540},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Gong, Shenjian %A Zhang, Shanshan %A Yang, Jian %A Dai, Dengxin %A Schiele, Bernt %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Bi-level Alignment for Cross-Domain Crowd Counting : %G eng %U http://hdl.handle.net/21.11116/0000-000C-138E-F %R 10.1109/CVPR52688.2022.00739 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 7532 - 7540 %I IEEE %@ 978-1-6654-6946-3
Vödisch, N., Unal, O., Li, K., Van Gool, L., & Dai, D. (2022). End-to-End Optimization of LiDAR Beam Configuration for 3D Object Detection and Localization. IEEE Robotics and Automation Letters, 7(2). doi:10.1109/LRA.2022.3142738
Export
BibTeX
@article{Voedisch2022,
  TITLE        = {End-to-End Optimization of {LiDAR} Beam Configuration for {3D} Object Detection and Localization},
  AUTHOR       = {V{\"o}disch, Niclas and Unal, Ozan and Li, Ke and Van Gool, Luc and Dai, Dengxin},
  LANGUAGE     = {eng},
  ISSN         = {2377-3766},
  DOI          = {10.1109/LRA.2022.3142738},
  PUBLISHER    = {IEEE},
  ADDRESS      = {New York, NY},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  JOURNAL      = {IEEE Robotics and Automation Letters},
  VOLUME       = {7},
  NUMBER       = {2},
  PAGES        = {2242--2249},
}
Endnote
%0 Journal Article %A V&#246;disch, Niclas %A Unal, Ozan %A Li, Ke %A Van Gool, Luc %A Dai, Dengxin %+ External Organizations External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T End-to-End Optimization of LiDAR Beam Configuration for 3D Object Detection and Localization : %G eng %U http://hdl.handle.net/21.11116/0000-000A-0A71-C %R 10.1109/LRA.2022.3142738 %7 2022 %D 2022 %J IEEE Robotics and Automation Letters %V 7 %N 2 %& 2242 %P 2242 - 2249 %I IEEE %C New York, NY %@ false
Wang, Q., Fink, O., Van Gool, L., & Dai, D. (2022). Continual Test-Time Domain Adaptation. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.00706
Export
BibTeX
@inproceedings{Wang_CVPR22b,
  TITLE        = {Continual Test-Time Domain Adaptation},
  AUTHOR       = {Wang, Qin and Fink, Olga and Van Gool, Luc and Dai, Dengxin},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.00706},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {7191--7201},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Wang, Qin %A Fink, Olga %A Van Gool, Luc %A Dai, Dengxin %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Continual Test-Time Domain Adaptation : %G eng %U http://hdl.handle.net/21.11116/0000-000C-13E7-A %R 10.1109/CVPR52688.2022.00706 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 7191 - 7201 %I IEEE %@ 978-1-6654-6946-3
Unal, O., Dai, D., & Van Gool, L. (2022). Scribble-Supervised LiDAR Semantic Segmentation. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.00272
Export
BibTeX
@inproceedings{Unal_CVPR22,
  TITLE        = {Scribble-Supervised {LiDAR} Semantic Segmentation},
  AUTHOR       = {Unal, Ozan and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-6946-3},
  DOI          = {10.1109/CVPR52688.2022.00272},
  PUBLISHER    = {IEEE},
  YEAR         = {2022},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  PAGES        = {2687--2697},
  ADDRESS      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Unal, Ozan %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Scribble-Supervised LiDAR Semantic Segmentation : %G eng %U http://hdl.handle.net/21.11116/0000-000C-13E3-E %R 10.1109/CVPR52688.2022.00272 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 2687 - 2697 %I IEEE %@ 978-1-6654-6946-3
Sun, G., Probst, T., Paudel, D. P., Popovic, N., Kanakis, M., Patel, J., … Van Gool, L. (2021). Task Switching Network for Multi-task Learning. In ICCV 2021, IEEE/CVF International Conference on Computer Vision. Virtual Event: IEEE. doi:10.1109/ICCV48922.2021.00818
Export
BibTeX
@inproceedings{Sun_ICCV21,
  TITLE        = {Task Switching Network for Multi-task Learning},
  AUTHOR       = {Sun, Guolei and Probst, Thomas and Paudel, Danda Pani and Popovic, Nikola and Kanakis, Menelaos and Patel, Jagruti and Dai, Dengxin and Van Gool, Luc},
  LANGUAGE     = {eng},
  ISBN         = {978-1-6654-2812-5},
  DOI          = {10.1109/ICCV48922.2021.00818},
  PUBLISHER    = {IEEE},
  YEAR         = {2021},
  MARGINALMARK = {$\bullet$},
  BOOKTITLE    = {ICCV 2021, IEEE/CVF International Conference on Computer Vision},
  PAGES        = {8271--8280},
  ADDRESS      = {Virtual Event},
}
Endnote
%0 Conference Proceedings %A Sun, Guolei %A Probst, Thomas %A Paudel, Danda Pani %A Popovic, Nikola %A Kanakis, Menelaos %A Patel, Jagruti %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations External Organizations External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Task Switching Network for Multi-task Learning : %G eng %U http://hdl.handle.net/21.11116/0000-000A-C9C4-6 %R 10.1109/ICCV48922.2021.00818 %D 2021 %B IEEE/CVF International Conference on Computer Vision %Z date of event: 2021-10-11 - 2021-10-17 %C Virtual Event %B ICCV 2021 %P 8271 - 8280 %I IEEE %@ 978-1-6654-2812-5
Hoyer, L., Dai, D., & Van Gool, L. (2022b). DAFormer: Improving Network Architectures and Training Strategies for Domain-Adaptive Semantic Segmentation. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.00969
Abstract
As acquiring pixel-wise annotations of real-world images for semantic segmentation is a costly process, a model can instead be trained with more accessible synthetic data and adapted to real images without requiring their annotations. This process is studied in unsupervised domain adaptation (UDA). Even though a large number of methods propose new adaptation strategies, they are mostly based on outdated network architectures. As the influence of recent network architectures has not been systematically studied, we first benchmark different network architectures for UDA and then propose a novel UDA method, DAFormer, based on the benchmark results. The DAFormer network consists of a Transformer encoder and a multi-level context-aware feature fusion decoder. It is enabled by three simple but crucial training strategies to stabilize the training and to avoid overfitting DAFormer to the source domain: While the Rare Class Sampling on the source domain improves the quality of pseudo-labels by mitigating the confirmation bias of self-training towards common classes, the Thing-Class ImageNet Feature Distance and a learning rate warmup promote feature transfer from ImageNet pretraining. DAFormer significantly improves the state-of-the-art performance by 10.8 mIoU for GTA->Cityscapes and 5.4 mIoU for Synthia->Cityscapes and enables learning even difficult classes such as train, bus, and truck well. The implementation is available at https://github.com/lhoyer/DAFormer.
Export
BibTeX
@inproceedings{Hoyer_CVPR2022,
  title        = {{DAFormer}: {I}mproving Network Architectures and Training Strategies for Domain-Adaptive Semantic Segmentation},
  author       = {Hoyer, Lukas and Dai, Dengxin and Van Gool, Luc},
  language     = {eng},
  isbn         = {978-1-6654-6946-3},
  doi          = {10.1109/CVPR52688.2022.00969},
  publisher    = {IEEE},
  year         = {2022},
  marginalmark = {$\bullet$},
  abstract     = {As acquiring pixel-wise annotations of real-world images for semantic segmentation is a costly process, a model can instead be trained with more accessible synthetic data and adapted to real images without requiring their annotations. This process is studied in unsupervised domain adaptation (UDA). Even though a large number of methods propose new adaptation strategies, they are mostly based on outdated network architectures. As the influence of recent network architectures has not been systematically studied, we first benchmark different network architectures for UDA and then propose a novel UDA method, DAFormer, based on the benchmark results. The DAFormer network consists of a Transformer encoder and a multi-level context-aware feature fusion decoder. It is enabled by three simple but crucial training strategies to stabilize the training and to avoid overfitting DAFormer to the source domain: While the Rare Class Sampling on the source domain improves the quality of pseudo-labels by mitigating the confirmation bias of self-training towards common classes, the Thing-Class ImageNet Feature Distance and a learning rate warmup promote feature transfer from ImageNet pretraining. DAFormer significantly improves the state-of-the-art performance by 10.8 mIoU for GTA{$\rightarrow$}Cityscapes and 5.4 mIoU for Synthia{$\rightarrow$}Cityscapes and enables learning even difficult classes such as train, bus, and truck well. The implementation is available at https://github.com/lhoyer/DAFormer.},
  booktitle    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  pages        = {9914--9925},
  address      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Hoyer, Lukas %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T DAFormer: Improving Network Architectures and Training Strategies for Domain-Adaptive Semantic Segmentation : %G eng %U http://hdl.handle.net/21.11116/0000-000A-16B5-1 %R 10.1109/CVPR52688.2022.00969 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %X As acquiring pixel-wise annotations of real-world images for semantic<br>segmentation is a costly process, a model can instead be trained with more<br>accessible synthetic data and adapted to real images without requiring their<br>annotations. This process is studied in unsupervised domain adaptation (UDA).<br>Even though a large number of methods propose new adaptation strategies, they<br>are mostly based on outdated network architectures. As the influence of recent<br>network architectures has not been systematically studied, we first benchmark<br>different network architectures for UDA and then propose a novel UDA method,<br>DAFormer, based on the benchmark results. The DAFormer network consists of a<br>Transformer encoder and a multi-level context-aware feature fusion decoder. It<br>is enabled by three simple but crucial training strategies to stabilize the<br>training and to avoid overfitting DAFormer to the source domain: While the Rare<br>Class Sampling on the source domain improves the quality of pseudo-labels by<br>mitigating the confirmation bias of self-training towards common classes, the<br>Thing-Class ImageNet Feature Distance and a learning rate warmup promote<br>feature transfer from ImageNet pretraining. 
DAFormer significantly improves the<br>state-of-the-art performance by 10.8 mIoU for GTA->Cityscapes and 5.4 mIoU for<br>Synthia->Cityscapes and enables learning even difficult classes such as train,<br>bus, and truck well. The implementation is available at<br>https://github.com/lhoyer/DAFormer.<br> %K Computer Science, Computer Vision and Pattern Recognition, cs.CV %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 9914 - 9925 %I IEEE %@ 978-1-6654-6946-3
Li, K., Dai, D., & van Gool, L. (2022). Hyperspectral Image Super-Resolution with RGB Image Super-Resolution as an Auxiliary Task. In 2022 IEEE Winter Conference on Applications of Computer Vision (WACV 2022). Waikoloa Village, HI, USA: IEEE. doi:10.1109/WACV51458.2022.00409
Export
BibTeX
@inproceedings{Li_WACV22,
  title        = {Hyperspectral Image Super-Resolution with {RGB} Image Super-Resolution as an Auxiliary Task},
  author       = {Li, Ke and Dai, Dengxin and Van Gool, Luc},
  language     = {eng},
  isbn         = {978-1-6654-0915-5},
  doi          = {10.1109/WACV51458.2022.00409},
  publisher    = {IEEE},
  year         = {2022},
  marginalmark = {$\bullet$},
  booktitle    = {2022 IEEE Winter Conference on Applications of Computer Vision (WACV 2022)},
  pages        = {4039--4048},
  address      = {Waikoloa Village, HI, USA},
}
Endnote
%0 Conference Proceedings %A Li, Ke %A Dai, Dengxin %A van Gool, Luc %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Hyperspectral Image Super-Resolution with RGB Image Super-Resolution as an Auxiliary Task : %G eng %U http://hdl.handle.net/21.11116/0000-000A-CD2C-F %R 10.1109/WACV51458.2022.00409 %D 2022 %B IEEE Winter Conference on Applications of Computer Vision %Z date of event: 2022-01-04 - 2022-01-08 %C Waikoloa Village, HI, USA %B 2022 IEEE Winter Conference on Applications of Computer Vision %P 4039 - 4048 %I IEEE %@ 978-1-6654-0915-5
Zaech, J.-N., Liniger, A., Danelljan, M., Dai, D., & Van Gool, L. (2022). Adiabatic Quantum Computing for Multi Object Tracking. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). New Orleans, LA, USA: IEEE. doi:10.1109/CVPR52688.2022.00861
Export
BibTeX
@inproceedings{Zaech_CVPR2022,
  title        = {Adiabatic Quantum Computing for Multi Object Tracking},
  author       = {Zaech, Jan-Nico and Liniger, Alexander and Danelljan, Martin and Dai, Dengxin and Van Gool, Luc},
  language     = {eng},
  isbn         = {978-1-6654-6946-3},
  doi          = {10.1109/CVPR52688.2022.00861},
  publisher    = {IEEE},
  year         = {2022},
  marginalmark = {$\bullet$},
  booktitle    = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022)},
  pages        = {8801--8812},
  address      = {New Orleans, LA, USA},
}
Endnote
%0 Conference Proceedings %A Zaech, Jan-Nico %A Liniger, Alexander %A Danelljan, Martin %A Dai, Dengxin %A Van Gool, Luc %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations %T Adiabatic Quantum Computing for Multi Object Tracking : %G eng %U http://hdl.handle.net/21.11116/0000-000A-16C3-1 %R 10.1109/CVPR52688.2022.00861 %D 2022 %B 35th IEEE/CVF Conference on Computer Vision and Pattern Recognition %Z date of event: 2022-06-19 - 2022-06-24 %C New Orleans, LA, USA %B IEEE/CVF Conference on Computer Vision and Pattern Recognition %P 8801 - 8812 %I IEEE %@ 978-1-6654-6946-3
Gong, S., Zhang, S., Yang, J., Dai, D., & Schiele, B. (2022b). Class-Agnostic Object Counting Robust to Intraclass Diversity. In Computer Vision -- ECCV 2022. Tel Aviv, Israel: Springer. doi:10.1007/978-3-031-19827-4_23
Export
BibTeX
@inproceedings{Gong_ECCV2022c,
  title        = {Class-Agnostic Object Counting Robust to Intraclass Diversity},
  author       = {Gong, Shenjian and Zhang, Shanshan and Yang, Jian and Dai, Dengxin and Schiele, Bernt},
  language     = {eng},
  isbn         = {978-3-031-20073-1},
  doi          = {10.1007/978-3-031-19827-4_23},
  publisher    = {Springer},
  year         = {2022},
  marginalmark = {$\bullet$},
  date         = {2022},
  booktitle    = {Computer Vision -- ECCV 2022},
  editor       = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
  pages        = {388--403},
  series       = {Lecture Notes in Computer Science},
  volume       = {13693},
  address      = {Tel Aviv, Israel},
}
Endnote
%0 Conference Proceedings %A Gong, Shenjian %A Zhang, Shanshan %A Yang, Jian %A Dai, Dengxin %A Schiele, Bernt %+ External Organizations External Organizations External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society %T Class-Agnostic Object Counting Robust to Intraclass Diversity : %G eng %U http://hdl.handle.net/21.11116/0000-000C-1841-0 %R 10.1007/978-3-031-19827-4_23 %D 2022 %B 17th European Conference on Computer Vision %Z date of event: 2022-10-23 - 2022-10-27 %C Tel Aviv, Israel %B Computer Vision -- ECCV 2022 %E Avidan, Shai; Brostow, Gabriel; Ciss&#233;, Moustapha; Farinella, Giovanni Maria; Hassner, Tal %P 388 - 403 %I Springer %@ 978-3-031-20073-1 %B Lecture Notes in Computer Science %N 13693 %U https://rdcu.be/c2vcR