Gerard Pons-Moll (Senior Researcher)

Dr. Gerard Pons-Moll

Address
Max-Planck-Institut für Informatik
Saarland Informatics Campus
Campus E1 4
66123 Saarbrücken
Location
E1 4 - Room 605
Phone
+49 681 9325 2135
Fax
+49 681 9325 2099

Group Homepage


Please visit my group website: http://virtualhumans.mpi-inf.mpg.de


    Offers

    If you are interested in doing a PhD in related areas, contact me directly or send your application to d2-application@mpi-inf.mpg.de. We also offer projects for bachelor and master theses, as well as six-month research internships.

    Publications -- MPII only

    Alldieck, T., Magnor, M. A., Bhatnagar, B. L., Theobalt, C., & Pons-Moll, G. (2019). Learning to Reconstruct People in Clothing from a Single RGB Camera. In 32nd IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2019). Long Beach, CA, USA: IEEE.
    (Accepted/in press)
    Habibie, I., Xu, W., Mehta, D., Pons-Moll, G., & Theobalt, C. (2019). In the Wild Human Pose Estimation using Explicit 2D Features and Intermediate 3D Representations. In 32nd IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2019). Long Beach, CA, USA: IEEE.
    (Accepted/in press)
    Yu, T., Zheng, Z., Zhong, Y., Zhao, J., Dai, Q., Pons-Moll, G., & Liu, Y. (2019). SimulCap: Single-View Human Performance Capture with Cloth Simulation. In 32nd IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2019). Long Beach, CA, USA: IEEE.
    (Accepted/in press)
    Sattar, H., Pons-Moll, G., & Fritz, M. (2019). Fashion is Taking Shape: Understanding Clothing Preference Based on Body Shape From Online Sources. In 2019 IEEE Winter Conference on Applications of Computer Vision (WACV 2019). Waikoloa Village, HI, USA: IEEE. doi:10.1109/WACV.2019.00108
    Sattar, H., Krombholz, K., Pons-Moll, G., & Fritz, M. (2019). Shape Evasion: Preventing Body Shape Inference of Multi-Stage Approaches. Retrieved from http://arxiv.org/abs/1905.11503
    (arXiv: 1905.11503)
    Abstract
    Modern approaches to pose and body shape estimation have recently achieved strong performance even under challenging real-world conditions. Even from a single image of a clothed person, a realistic looking body shape can be inferred that captures a users' weight group and body shape type well. This opens up a whole spectrum of applications -- in particular in fashion -- where virtual try-on and recommendation systems can make use of these new and automatized cues. However, a realistic depiction of the undressed body is regarded highly private and therefore might not be consented by most people. Hence, we ask if the automatic extraction of such information can be effectively evaded. While adversarial perturbations have been shown to be effective for manipulating the output of machine learning models -- in particular, end-to-end deep learning approaches -- state of the art shape estimation methods are composed of multiple stages. We perform the first investigation of different strategies that can be used to effectively manipulate the automatic shape estimation while preserving the overall appearance of the original image.
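    The attack setting the abstract describes can be illustrated with a generic iterative gradient perturbation against a single differentiable stage. Below is a minimal PyTorch sketch, assuming a hypothetical first-stage network `keypoint_net` and target output `target_kpts`; the strategies investigated in the paper differ and specifically address multi-stage pipelines.

    ```python
    import torch

    def perturb(image, keypoint_net, target_kpts, eps=8 / 255, alpha=1 / 255, steps=40):
        """PGD-style perturbation: push the first stage's keypoint output toward
        a target while keeping the image change inside an L-inf ball of radius eps."""
        adv = image.clone().detach().requires_grad_(True)
        for _ in range(steps):
            loss = torch.nn.functional.mse_loss(keypoint_net(adv), target_kpts)
            grad, = torch.autograd.grad(loss, adv)
            with torch.no_grad():
                adv -= alpha * grad.sign()                         # step toward the target output
                adv.copy_(image + (adv - image).clamp(-eps, eps))  # project back to the eps-ball
                adv.clamp_(0, 1)                                   # stay a valid image
        return adv.detach()
    ```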
    Habermann, M., Xu, W., Rhodin, H., Zollhöfer, M., Pons-Moll, G., & Theobalt, C. (2019). NRST: Non-rigid Surface Tracking from Monocular Video. In Pattern Recognition (GCPR 2018). Stuttgart, Germany: Springer. doi:10.1007/978-3-030-12939-2_23
    Habermann, M., Xu, W., Zollhöfer, M., Pons-Moll, G., & Theobalt, C. (2019). LiveCap: Real-time Human Performance Capture from Monocular Video. ACM Transactions on Graphics, 38(2). doi:10.1145/3311970
    Mahmood, N., Ghorbani, N., Troje, N. F., Pons-Moll, G., & Black, M. J. (2019). AMASS: Archive of Motion Capture as Surface Shapes. Retrieved from http://arxiv.org/abs/1904.03278
    (arXiv: 1904.03278)
    Abstract
    Large datasets are the cornerstone of recent advances in computer vision using deep learning. In contrast, existing human motion capture (mocap) datasets are small and the motions limited, hampering progress on learning models of human motion. While there are many different datasets available, they each use a different parameterization of the body, making it difficult to integrate them into a single meta dataset. To address this, we introduce AMASS, a large and varied database of human motion that unifies 15 different optical marker-based mocap datasets by representing them within a common framework and parameterization. We achieve this using a new method, MoSh++, that converts mocap data into realistic 3D human meshes represented by a rigged body model; here we use SMPL [doi:10.1145/2816795.2818013], which is widely used and provides a standard skeletal representation as well as a fully rigged surface mesh. The method works for arbitrary marker sets, while recovering soft-tissue dynamics and realistic hand motion. We evaluate MoSh++ and tune its hyperparameters using a new dataset of 4D body scans that are jointly recorded with marker-based mocap. The consistent representation of AMASS makes it readily useful for animation, visualization, and generating training data for deep learning. Our dataset is significantly richer than previous human motion collections, having more than 40 hours of motion data, spanning over 300 subjects, more than 11,000 motions, and will be publicly available to the research community.
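    The practical benefit of a common parameterization is that every sequence, whatever its original marker set, reduces to the same per-frame body-model parameters. A minimal Python sketch of consuming such data, with illustrative field names and array shapes rather than the actual AMASS file layout:

    ```python
    import numpy as np

    # Synthetic stand-in for one unified sequence: T frames of SMPL-style
    # parameters. The layout is illustrative, not the actual AMASS file format.
    T = 120
    poses = np.zeros((T, 72))   # per-frame axis-angle rotations for 24 SMPL joints
    betas = np.zeros(10)        # shape coefficients, constant per subject
    trans = np.zeros((T, 3))    # per-frame root translation

    def frames(poses, betas, trans):
        """Yield per-frame parameters in the shared parameterization; any
        SMPL-compatible layer turns these into a posed, skinned surface mesh."""
        for t in range(len(poses)):
            yield {"pose": poses[t], "shape": betas, "trans": trans[t]}

    for frame in frames(poses, betas, trans):
        pass  # feed into an SMPL layer, a renderer, or a training batch
    ```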
    Alldieck, T., Pons-Moll, G., Theobalt, C., & Magnor, M. A. (2019). Tex2Shape: Detailed Full Human Body Geometry from a Single Image. Retrieved from http://arxiv.org/abs/1904.08645
    (arXiv: 1904.08645)
    Abstract
    We present a simple yet effective method to infer detailed full human body shape from only a single photograph. Our model can infer full-body shape including face, hair, and clothing including wrinkles at interactive frame-rates. Results feature details even on parts that are occluded in the input image. Our main idea is to turn shape regression into an aligned image-to-image translation problem. The input to our method is a partial texture map of the visible region obtained from off-the-shelf methods. From a partial texture, we estimate detailed normal and vector displacement maps, which can be applied to a low-resolution smooth body model to add detail and clothing. Despite being trained purely with synthetic data, our model generalizes well to real-world photographs. Numerous results demonstrate the versatility and robustness of our method.
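    The core idea, regressing detail as image-space maps aligned with the body's UV parameterization, can be sketched as below; the function name and the nearest-neighbour lookup are illustrative simplifications, not the paper's implementation:

    ```python
    import numpy as np

    def displace(verts, uv, disp_map):
        """verts: (V, 3) smooth body-model vertices; uv: (V, 2) coords in [0, 1];
        disp_map: (H, W, 3) predicted vector displacements in UV space."""
        h, w, _ = disp_map.shape
        px = np.clip((uv[:, 0] * (w - 1)).round().astype(int), 0, w - 1)
        py = np.clip((uv[:, 1] * (h - 1)).round().astype(int), 0, h - 1)
        return verts + disp_map[py, px]   # nearest-neighbour UV lookup for brevity

    # Toy usage: a zero displacement map leaves the smooth template unchanged.
    V = 6890  # SMPL vertex count
    detailed = displace(np.zeros((V, 3)), np.random.rand(V, 2), np.zeros((64, 64, 3)))
    ```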
    Mehta, D., Sotnychenko, O., Mueller, F., Xu, W., Elgharib, M., Fua, P., … Theobalt, C. (2019). XNect: Real-time Multi-person 3D Human Pose Estimation with a Single RGB Camera. Retrieved from http://arxiv.org/abs/1907.00837
    (arXiv: 1907.00837)
    Abstract
    We present a real-time approach for multi-person 3D motion capture at over 30 fps using a single RGB camera. It operates in generic scenes and is robust to difficult occlusions both by other people and objects. Our method operates in subsequent stages. The first stage is a convolutional neural network (CNN) that estimates 2D and 3D pose features along with identity assignments for all visible joints of all individuals. We contribute a new architecture for this CNN, called SelecSLS Net, that uses novel selective long and short range skip connections to improve the information flow allowing for a drastically faster network without compromising accuracy. In the second stage, a fully-connected neural network turns the possibly partial (on account of occlusion) 2D pose and 3D pose features for each subject into a complete 3D pose estimate per individual. The third stage applies space-time skeletal model fitting to the predicted 2D and 3D pose per subject to further reconcile the 2D and 3D pose, and enforce temporal coherence. Our method returns the full skeletal pose in joint angles for each subject. This is a further key distinction from previous work that neither extracted global body positions nor joint angle results of a coherent skeleton in real time for multi-person scenes. The proposed system runs on consumer hardware at a previously unseen speed of more than 30 fps given 512x320 images as input while achieving state-of-the-art accuracy, which we will demonstrate on a range of challenging real-world scenes.
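    The three-stage decomposition the abstract describes can be sketched structurally as follows; every function below is a hypothetical placeholder rather than the actual SelecSLS Net or model-fitting code, and identity tracking is simplified:

    ```python
    def stage1_cnn(image):
        """CNN (SelecSLS-style in the paper): 2D/3D pose features and identity
        assignments for all visible joints of all people; may be partial."""
        return [{"feats_2d": ..., "feats_3d": ...}]   # one entry per person

    def stage2_lift(person):
        """Fully connected net: a complete 3D pose per individual, inferred
        even from partially observed (occluded) features."""
        return {"pose_3d": ...}

    def stage3_fit(track):
        """Space-time skeletal model fit: reconciles 2D and 3D evidence over
        time and yields joint angles of a coherent skeleton per subject."""
        return {"joint_angles": ..., "root_trajectory": ...}

    def xnect_like(video_frames):
        tracks = {}                                   # person id -> lifted poses
        for image in video_frames:
            for pid, person in enumerate(stage1_cnn(image)):
                tracks.setdefault(pid, []).append(stage2_lift(person))
        return {pid: stage3_fit(track) for pid, track in tracks.items()}
    ```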
    Yu, T., Zheng, Z., Guo, K., Zhao, J., Dai, Q., Li, H., … Liu, Y. (2018). DoubleFusion: Real-time Capture of Human Performances with Inner Body Shapes from a Single Depth Sensor. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2018). Salt Lake City, UT, USA: IEEE. doi:10.1109/CVPR.2018.00761
    Omran, M., Lassner, C., Pons-Moll, G., Gehler, P., & Schiele, B. (2018). Neural Body Fitting: Unifying Deep Learning and Model Based Human Pose and Shape Estimation. In 3DV 2018, International Conference on 3D Vision. Verona, Italy: IEEE. doi:10.1109/3DV.2018.00062
    Alldieck, T., Magnor, M. A., Xu, W., Theobalt, C., & Pons-Moll, G. (2018a). Video Based Reconstruction of 3D People Models. In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2018). Salt Lake City, UT, USA: IEEE. doi:10.1109/CVPR.2018.00875
    Von Marcard, T., Henschel, R., Black, M. J., Rosenhahn, B., & Pons-Moll, G. (2018). Recovering Accurate 3D Human Pose in the Wild Using IMUs and a Moving Camera. In Computer Vision -- ECCV 2018. Munich, Germany: Springer. doi:10.1007/978-3-030-01249-6_37
    Alldieck, T., Magnor, M. A., Xu, W., Theobalt, C., & Pons-Moll, G. (2018b). Detailed Human Avatars from Monocular Video. In 3DV 2018, International Conference on 3D Vision. Verona, Italy: IEEE. doi:10.1109/3DV.2018.00022
    Mehta, D., Sotnychenko, O., Mueller, F., Xu, W., Sridhar, S., Pons-Moll, G., & Theobalt, C. (2018). Single-Shot Multi-person 3D Pose Estimation from Monocular RGB. In 3DV 2018, International Conference on 3D Vision. Verona, Italy: IEEE. doi:10.1109/3DV.2018.00024
    Mehta, D., Sotnychenko, O., Mueller, F., Rhodin, H., Xu, W., Pons-Moll, G., & Theobalt, C. (2018). Demo of XNect: Real-time Multi-person 3D Human Pose Estimation with a Single RGB Camera. In ECCV 2018 Demo Sessions. Munich, Germany. Retrieved from http://gvv.mpi-inf.mpg.de/projects/XNectDemo/
    Huang, Y., Kaufmann, M., Aksan, E., Black, M. J., Hilliges, O., & Pons-Moll, G. (2018). Deep Inertial Poser: Learning to Reconstruct Human Pose from Sparse Inertial Measurements in Real Time. ACM Transactions on Graphics (Proc. ACM SIGGRAPH Asia 2018), 37(6). doi:10.1145/3272127.3275108