% Conference paper in the RIVF 2023 proceedings (IEEE), pp. 585--589.
% Cleaned from a research-portal export: the unresolved language placeholder
% was replaced, the `series` field that merely duplicated `booktitle` was
% dropped, and all author names were normalised to "Last, First" form.
% NOTE(review): "Liao" is assumed to be the family name of the fifth author
% (the export gave "Liao Chu-Xin" without a comma) -- confirm against the paper.
@inproceedings{7cb63e0b5ef64315b1b1f13f4e37bd85,
  author    = {Lin, {Yi Xing} and Cheng, {Chun Hsiang} and Le, {Phuong Thi} and Huang, {Bing Jhih} and Liao, Chu-Xin and Huang, {Chien Lin} and Wang, {Jia Ching}},
  title     = {Zero-Shot Voice Conversion Based on Speaker Embedding Domain Generalization},
  booktitle = {Proceedings - 2023 {RIVF} International Conference on Computing and Communication Technologies, {RIVF} 2023},
  editor    = {Bao, {Vo Nguyen Quoc} and Chau, {Le Hai}},
  pages     = {585--589},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2023},
  doi       = {10.1109/RIVF60135.2023.10471830},
  language  = {english},
  keywords  = {domain generalization, speaker embedding, speech synthesis, Zero-shot voice conversion},
  abstract  = {In this paper, a zero-shot voice conversion framework is constructed by effectively decoupling the semantic and speaker features in speech. The proposed method is based on the pre-trained wav2vec 2.0 model to extract semantic features from source speakers and a WavLM model to extract speaker features from target speakers. We propose the Robust-MAML model to map the speaker feature of the target speaker into a domain generalization space, making it directly applicable to any unregistered speaker domain. Finally, through transfer learning, the speech synthesis model FastSpeech2 integrates the semantic feature and domain-generalized speaker features to synthesize the target speaker's voice. Experimental results show that the proposed method outperforms the common baseline systems in both naturalness and speaker similarity.},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 2023 RIVF International Conference on Computing and Communication Technologies, RIVF 2023 ; Conference date: 23-12-2023 Through 25-12-2023},
}