% Conference paper introducing TaLNet (voice reconstruction from ultrasound tongue and optical lip video).
% NOTE(review): `series` repeats `booktitle` verbatim, as exported; confirm against the publisher record
% before relying on it, since some styles print both fields.
@inproceedings{8809624733474b2685db1c18d7a7dcc0,
  author    = {Zhang, Jing-Xuan and Richmond, Korin and Ling, Zhen-Hua and Dai, Li-Rong},
  title     = {{TaLNet}: Voice reconstruction from tongue and lip articulation with transfer learning from text-to-speech synthesis},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  series    = {Proceedings of the AAAI Conference on Artificial Intelligence},
  publisher = {Publishing Services Network},
  pages     = {14402--14410},
  year      = {2021},
  month     = may,
  day       = {18},
  isbn      = {9781577358664},
  language  = {English},
  keywords  = {speech synthesis},
  abstract  = {This paper presents TaLNet, a model for voice reconstruction with ultrasound tongue and optical lip videos as inputs. TaLNet is based on an encoder-decoder architecture. Separate encoders are dedicated to processing the tongue and lip data streams respectively. The decoder predicts acoustic features conditioned on encoder outputs and speaker codes. To mitigate for having only relatively small amounts of dual articulatory-acoustic data available for training, and since our task here shares with text-to-speech (TTS) the common goal of speech generation, we propose a novel transfer learning strategy to exploit the much larger amounts of acoustic-only data available to train TTS models. For this, a Tacotron 2 TTS model is first trained, and then the parameters of its decoder are transferred to the TaLNet decoder. We have evaluated our approach on an unconstrained multi-speaker voice recovery task. Our results show the effectiveness of both the proposed model and the transfer learning strategy. Speech reconstructed using our proposed method significantly outperformed all baselines (DNN, BLSTM and without transfer learning) in terms of both naturalness and intelligibility. When using an ASR model decoding the recovery speech, the WER of our proposed method shows a relative reduction of over 30\% compared to baselines.},
}