% Workshop paper in the ICASSPW 2024 proceedings.
% The citation key is the exporter-generated identifier; it is kept as-is so
% that existing \cite commands continue to resolve.
% The exporter's copyright/conference-date boilerplate lives in `annote`
% (not printed by the standard styles) instead of `note` (printed verbatim).
% NOTE(review): `address` holds a country rather than the publisher's city --
% confirm the correct city before changing it.
@inproceedings{3c997af9ed50489d82a00bfc90e67928,
  author    = {Churchwell, Cameron and Morrison, Max and Pardo, Bryan},
  title     = {High-Fidelity Neural Phonetic Posteriorgrams},
  booktitle = {2024 {IEEE} International Conference on Acoustics, Speech, and Signal Processing Workshops, {ICASSPW} 2024 - Proceedings},
  pages     = {823--827},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2024},
  doi       = {10.1109/ICASSPW62465.2024.10669905},
  language  = {English (US)},
  keywords  = {interpretable, ppg, pronunciation, representation},
  abstract  = {A phonetic posteriorgram (PPG) is a time-varying categorical distribution over acoustic units of speech (e.g., phonemes). PPGs are a popular representation in speech generation due to their ability to disentangle pronunciation features from speaker identity, allowing accurate reconstruction of pronunciation (e.g., voice conversion) and coarse-grained pronunciation editing (e.g., foreign accent conversion). In this paper, we demonstrably improve the quality of PPGs to produce a state-of-the-art interpretable PPG representation. We train an off-the-shelf speech synthesizer using our PPG representation and show that high-quality PPGs yield independent control over pitch and pronunciation. We further demonstrate novel uses of PPGs, such as an acoustic pronunciation distance and fine-grained pronunciation control.},
  annote    = {Publisher Copyright: {\textcopyright}2024 IEEE.; 2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops, ICASSPW 2024 ; Conference date: 14-04-2024 Through 19-04-2024},
}