Citation

BibTeX format

@inproceedings{Hogg:2021:10.1109/ICASSP39728.2021.9414130,
  author    = {Hogg, A and Naylor, P and Evers, C},
  title     = {Multichannel overlapping speaker segmentation using multiple hypothesis tracking of acoustic and spatial features},
  booktitle = {2021 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  publisher = {IEEE},
  year      = {2021},
  doi       = {10.1109/ICASSP39728.2021.9414130},
  url       = {http://dx.doi.org/10.1109/ICASSP39728.2021.9414130},
}

RIS format (EndNote, RefMan)

TY  - CPAPER
AB  - An essential part of any diarization system is the task of speaker segmentation which is important for many applications including speaker indexing and automatic speech recognition (ASR) in multi-speaker environments. Segmentation of overlapping speech has recently been a key focus of this work. In this paper we explore the use of a new multimodal approach for overlapping speaker segmentation that tracks both the fundamental frequency (F0) of the speaker and the speaker’s direction of arrival (DOA) simultaneously. Our proposed multiple hypothesis tracking system, which simultaneously tracks both features, shows an improvement in segmentation performance when compared to tracking these features separately. An illustrative example of overlapping speech demonstrates the effectiveness of our proposed system. We also undertake a statistical analysis on 12 meetings from the AMI corpus and show an improvement in the HIT rate of 14.1% on average against a commonly used deep learning bidirectional long short term memory network (BLSTM) approach.
AU  - Hogg,A
AU  - Naylor,P
AU  - Evers,C
DO  - 10.1109/ICASSP39728.2021.9414130
PB  - IEEE
PY  - 2021///
TI  - Multichannel overlapping speaker segmentation using multiple hypothesis tracking of acoustic and spatial features
UR  - http://dx.doi.org/10.1109/ICASSP39728.2021.9414130
UR  - http://hdl.handle.net/10044/1/87556
ER  - 