% Ma et al., "Language-Driven Synthesis of 3D Scenes from Scene Databases",
% ACM Transactions on Graphics 37(6), article 212, 2018.
% Notes for maintainers:
%   - The accent in the given name is written as a brace group starting with
%     the control sequence so BibTeX treats it as one letter when sorting.
%   - "3D" in the title is brace-protected so sentence-casing styles keep it
%     uppercase.
%   - The journal name is stored in full; the bibliography style abbreviates
%     it if required.
@article{10.1145/3272127.3275035,
  author     = {Ma, Rui and Patil, Akshay Gadi and Fisher, Matthew and Li, Manyi and Pirk, S{\"o}ren and Hua, Binh-Son and Yeung, Sai-Kit and Tong, Xin and Guibas, Leonidas and Zhang, Hao},
  title      = {Language-Driven Synthesis of {3D} Scenes from Scene Databases},
  journal    = {ACM Transactions on Graphics},
  year       = {2018},
  month      = dec,
  issue_date = {November 2018},
  volume     = {37},
  number     = {6},
  articleno  = {212},
  numpages   = {16},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0730-0301},
  doi        = {10.1145/3272127.3275035},
  url        = {https://doi.org/10.1145/3272127.3275035},
  abstract   = {We introduce a novel framework for using natural language to generate and edit 3D
indoor scenes, harnessing scene semantics and text-scene grounding knowledge learned
from large annotated 3D scene databases. The advantage of natural language editing
interfaces is strongest when performing semantic operations at the sub-scene level,
acting on groups of objects. We learn how to manipulate these sub-scenes by analyzing
existing 3D scenes. We perform edits by first parsing a natural language command from
the user and transforming it into a semantic scene graph that is used to retrieve
corresponding sub-scenes from the databases that match the command. We then augment
this retrieved sub-scene by incorporating other objects that may be implied by the
scene context. Finally, a new 3D scene is synthesized by aligning the augmented sub-scene
with the user's current scene, where new objects are spliced into the environment,
possibly triggering appropriate adjustments to the existing scene arrangement. A suggestive
modeling interface with multiple interpretations of user commands is used to alleviate
ambiguities in natural language. We conduct studies comparing our approach against
both prior text-to-scene work and artist-made scenes and find that our method significantly
outperforms prior work and is comparable to handmade scenes even when complex and
varied natural sentences are used.},
  keywords   = {data-driven 3D scene generation and editing, natural language interface, relational model, semantic scene graph},
}