@inproceedings{2500ef9d37ce411396390ad0e8ba2ff4,
  title     = {Product {HMMs} for Audio-Visual Continuous Speech Recognition Using Facial Animation Parameters},
  author    = {Aleksic, P. S. and Katsaggelos, A. K.},
  booktitle = {Proceedings - 2003 International Conference on Multimedia and Expo, {ICME}},
  series    = {Proceedings - IEEE International Conference on Multimedia and Expo},
  publisher = {IEEE Computer Society},
  address   = {United States},
  pages     = {481--484},
  year      = {2003},
  doi       = {10.1109/ICME.2003.1221658},
  language  = {English (US)},
  abstract  = {The use of visual information in addition to acoustic can improve automatic speech recognition. In this paper we compare different approaches for audio-visual information integration and show how they affect automatic speech recognition performance. We utilize facial animation parameters (FAPs), supported by the MPEG-4 standard for the visual representation as visual features. We use both single-stream and multi-stream hidden Markov models (HMM) to integrate audio and visual information. We performed both state and phone synchronous multi-stream integration. Product HMM topology is used to model the phone-synchronous integration. ASR experiments were performed under noisy audio conditions using a relatively large vocabulary (approximately 1000 words) audio-visual database. The proposed phone-synchronous system, which performed the best, reduces the word error rate (WER) by approximately 20% relatively to audio-only ASR (A-ASR) WERs, at various SNRs with additive white Gaussian noise.},
  note      = {Publisher Copyright: {\textcopyright} 2003 IEEE.; 2003 International Conference on Multimedia and Expo, ICME 2003 ; Conference date: 06-07-2003 Through 09-07-2003},
}