% PhD thesis record; the abstract is stored as plain text (no HTML markup).
@phdthesis{Khorevaphd2017,
  author   = {Khoreva, Anna},
  title    = {Learning to Segment in Images and Videos with Different Forms of Supervision},
  school   = {Universit{\"a}t des Saarlandes},
  address  = {Saarbr{\"u}cken},
  year     = {2017},
  language = {eng},
  doi      = {10.22028/D291-26995},
  url      = {https://nbn-resolving.org/urn:nbn:de:bsz:291-scidok-ds-269954},
  abstract = {Much progress has been made in image and video segmentation
    over the last years. To a large extent, the success can be attributed to
    the strong appearance models completely learned from data, in particular
    using deep learning methods. However, to perform best these methods require
    large representative datasets for training with expensive pixel-level
    annotations, which in case of videos are prohibitive to obtain. Therefore,
    there is a need to relax this constraint and to consider alternative forms
    of supervision, which are easier and cheaper to collect. In this thesis,
    we aim to develop algorithms for learning to segment in images and videos
    with different levels of supervision.
    First, we develop approaches for training convolutional networks with weaker
    forms of supervision, such as bounding boxes or image labels, for object
    boundary estimation and semantic/instance labelling tasks. We propose to
    generate pixel-level approximate groundtruth from these weaker forms of
    annotations to train a network, which allows to achieve high-quality
    results comparable to the full supervision quality without any
    modifications of the network architecture or the training procedure.
    Second, we address the problem of the excessive computational and memory
    costs inherent to solving video segmentation via graphs. We propose
    approaches to improve the runtime and memory efficiency as well as the
    output segmentation quality by learning from the available training data
    the best representation of the graph. In particular, we contribute with
    learning must-link constraints, the topology and edge weights of the graph
    as well as enhancing the graph nodes -- superpixels -- themselves.
    Third, we tackle the task of pixel-level object tracking and address the
    problem of the limited amount of densely annotated video data for training
    convolutional networks. We introduce an architecture which allows training
    with static images only and propose an elaborate data synthesis scheme
    which creates a large number of training examples close to the target
    domain from the given first frame mask. With the proposed techniques we
    show that densely annotated consequent video data is not necessary to
    achieve high-quality temporally coherent video segmentation results.
    In summary, this thesis advances the state of the art in weakly supervised
    image segmentation, graph-based video segmentation and pixel-level object
    tracking and contributes with the new ways of training convolutional
    networks with a limited amount of pixel-level annotated training data.},
}
