@inproceedings{acff762acb6e4b1e8287878aeabff71b,
  title         = {Learning {3D} Scene Semantics and Structure from a Single Depth Image},
  abstract      = {In this paper, we aim to understand the semantics and 3D structure of a scene from a single depth image. Recent deep neural networks based methods aim to simultaneously learn object class labels and infer the 3D shape of a scene represented by a large voxel grid. However, individual objects within the scene are usually only represented by a few voxels leading to a loss of geometric detail. In addition, significant computational and memory resources are required to process the large scale voxel grid of a whole scene. To address this, we propose an efficient and holistic pipeline, 3R-Depth, to simultaneously learn the semantics and structure of a scene from a single depth image. Our key idea is to deeply fuse an efficient 3D shape estimator with existing recognition (e.g., ResNets) and segmentation (e.g., MaskR-CNN) techniques. Object level semantics and latent feature maps are extracted and then fed to a shape estimator to extract the 3D shape. Extensive experiments are conducted on large-scale synthesized indoor scene datasets, quantitatively and qualitatively demonstrating the merits and superior performance of 3R-Depth.},
  author        = {Yang, Bo and Lai, Zihang and Lu, Xiaoxuan and Lin, Shuyu and Wen, Hongkai and Markham, Andrew and Trigoni, Niki},
  year          = {2018},
  month         = dec,
  day           = {17},
  doi           = {10.1109/CVPRW.2018.00069},
  language      = {English},
  isbn          = {978-1-5386-6101-7},
  publisher     = {Institute of Electrical and Electronics Engineers},
  pages         = {422--4223},
  booktitle     = {2018 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition Workshops ({CVPRW})},
  address       = {United States},
  internal-note = {NOTE(review): pages value 422--4223 looks malformed (likely auto-export mangling) -- verify against IEEE Xplore via the DOI; address holds a country, but it should be the publisher's city -- confirm and correct.},
}