@inproceedings{70960c96549b4c168cd2d40f015d2139,
title = "MASKMARK: ROBUST NEURAL WATERMARKING FOR REAL AND SYNTHETIC SPEECH",
abstract = "High-quality speech synthesis models may be used to spread misinformation or impersonate voices. Audio watermarking can combat misuse by embedding a traceable signature in generated audio. However, existing audio watermarks typically demonstrate robustness to only a small set of transformations of the watermarked audio. To address this, we propose MaskMark, a neural network-based digital audio watermarking technique optimized for speech. MaskMark embeds a secret key vector in audio via a multiplicative spectrogram mask, allowing the detection of watermarked speech segments even under substantial signal-processing or neural network-based transformations. Comparisons to a state-of-the-art baseline on natural and synthetic speech corpora and a human subjects evaluation demonstrate MaskMark's superior robustness in detecting watermarked speech while maintaining high perceptual transparency.",
keywords = "Watermarking, speech synthesis, synthetic media",
author = "Patrick O'Reilly and Zeyu Jin and Jiaqi Su and Bryan Pardo",
note = "Publisher Copyright: {\textcopyright} 2024 IEEE.; 2024 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2024 ; Conference date: 14-04-2024 Through 19-04-2024",
year = "2024",
doi = "10.1109/ICASSP48485.2024.10447253",
language = "English (US)",
series = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "4650--4654",
booktitle = "2024 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2024 - Proceedings",
address = "United States",
}