@inproceedings{0a2959c681ad48f1a41bc9c53f276119,
  title     = {{CA-Wav2Lip}: Coordinate Attention-Based Speech to Lip Synthesis in the Wild},
  abstract  = {With the growing consumption of online visual contents, there is an urgent need for video translation in order to reach a wider audience from around the world. However, the materials after direct translation and dubbing are unable to create a natural audio-visual experience since the translated speech and lip movement are often out of sync. To improve the viewing experience, an accurate automatic lip-movement synchronization generation system is necessary. To improve the accuracy and visual quality of speech to lip generation, this research proposes two techniques: Embedding Attention Mechanisms in Convolution Layers and Deploying SSIM as Loss Function in Visual Quality Discriminator. The proposed system as well as several other ones are tested on three audiovisual datasets. The results show that our proposed methods achieve superior performance over the state-of-the-art speech to lip synthesis on not only the accuracy but also the visual quality of audio-lip synchronization generation.},
  keywords  = {channel attention, lip synthesis, spatial attention, talking face generation},
  author    = {Wang, {Kuan Chien} and Zhang, Jie and Huang, Jingquan and Li, Qi and Sun, {Min Te} and Sakai, Kazuya and Ku, {Wei Shinn}},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 9th IEEE International Conference on Smart Computing, SMARTCOMP 2023 ; Conference date: 26-06-2023 Through 29-06-2023},
  year      = {2023},
  doi       = {10.1109/SMARTCOMP58114.2023.00018},
  language  = {English},
  series    = {Proceedings - 2023 IEEE International Conference on Smart Computing, SMARTCOMP 2023},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {1--8},
  booktitle = {Proceedings - 2023 IEEE International Conference on Smart Computing, SMARTCOMP 2023},
}