% ICASSP 2020 conference paper by Manilow, Seetharaman and Pardo on the Cerberus
% network (joint source separation and transcription). Event details live in
% eventtitle/eventdate rather than in a printed note.
@inproceedings{d26e29b01a3d46e49d1a31ea3d382904,
  author     = {Manilow, Ethan and Seetharaman, Prem and Pardo, Bryan},
  title      = {Simultaneous Separation and Transcription of Mixtures with Multiple Polyphonic and Percussive Instruments},
  booktitle  = {2020 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2020 - Proceedings},
  series     = {{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  pages      = {771--775},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  address    = {United States},
  year       = {2020},
  month      = may,
  doi        = {10.1109/ICASSP40776.2020.9054340},
  eventtitle = {2020 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2020},
  eventdate  = {2020-05-04/2020-05-08},
  copyright  = {Publisher Copyright: {\textcopyright} 2020 IEEE.},
  language   = {English (US)},
  keywords   = {computer audition, deep clustering, multitask learning, music transcription, source separation},
  abstract   = {We present a single deep learning architecture that can both separate an audio recording of a musical mixture into constituent single-instrument recordings and transcribe these instruments into a human-readable format at the same time, learning a shared musical representation for both tasks. This novel architecture, which we call Cerberus, builds on the Chimera network for source separation by adding a third head for transcription. By training each head with different losses, we are able to jointly learn how to separate and transcribe up to five instruments with a single network. We show that separation and transcription are highly complementary with one another and when learned jointly, lead to Cerberus networks that are better at both separation and transcription and generalize better to unseen mixtures.},
}