% ICASSP 2023 conference paper: code-switching speech synthesis using
% self-supervised learning and a domain-adaptive speaker encoder.
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{36a2363c8c3e49d0b81209acc0306157,
author    = "Lin, {Yi Xing} and Pai, {Cheng Hsun} and Le, {Phuong Thi} and Bima Prihasto and Huang, {Chien Ling} and Wang, {Jia Ching}",
title     = "Code-Switching Speech Synthesis Based on Self-Supervised Learning and Domain Adaptive Speaker Encoder",
booktitle = "{ICASSP} 2023 - 2023 {IEEE} International Conference on Acoustics, Speech and Signal Processing, Proceedings",
series    = "{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
year      = "2023",
doi       = "10.1109/ICASSP49357.2023.10096027",
language  = "english",
keywords  = "Code Switching, Domain Adaptation, Self-Supervised Learning, Speech synthesis",
note      = "Publisher Copyright: {\textcopyright} 2023 IEEE. 48th IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2023; conference date: 2023-06-04 through 2023-06-10",
abstract  = "Recently, end-to-end speech synthesis models based on deep learning have made great progress in speech quality, and gradually replaced traditional speech synthesis methods into the mainstream. However, these methods are still challenging to synthesize highly natural speech. In order to solve the above problems, we introduce self-supervised learning and frame-level domain adversarial training into the speaker encoder based on the speaker verification task, so that the speaker vectors of different languages keep a consistent distribution in the speaker space, and the performance of speech synthesis is improved. In addition, we use a non-autoregressive speech synthesis model in the selection of speech synthesis model, so as to solve the problem of unnatural speech rate caused by cross-language speech synthesis. We first demonstrate that in the mixed language dataset of LibriTTS and AISHELL3, the speaker encoder trained with self-supervised representation has a 4.968% absolute EER reduction compared to the traditional MFCC on the speaker verification task, indicating that self-supervised representation has better generalization for domain-complex datasets. Then we obtain MOS scores of 3.635 and 3.675 for speech naturalness and speaker similarity in the code-switching speech synthesis task, respectively. Our approach simplifies the need to use multiple monolingual encoders to model linguistic information in the past literature, and adds frame-level domain adversarial training to optimize the speaker vectors in the speaker feature space to facilitate the code-switching speech synthesis task.",
}