BibTeX format
@comment{
  Journal article record.
  - The citation key is kept exactly as exported so existing cite commands keep resolving.
  - issn is copied from the SN field of the RIS record for the same article in this file.
  - url points at the https doi.org resolver; it resolves the same DOI stored in the doi field.
  - NOTE(review): author given names are initials only, and no issue number or
    page range is recorded here. Complete them from the publisher record; TODO confirm.
}
@article{Caspe:2025:10.17743/jaes.2022.0204,
  author  = {Caspe, F and Shier, J and Sandler, M and Saitis, C and McPherson, A},
  title   = {Designing neural synthesizers for low-latency interaction},
  journal = {AES: Journal of the Audio Engineering Society},
  volume  = {73},
  year    = {2025},
  issn    = {1549-4950},
  doi     = {10.17743/jaes.2022.0204},
  url     = {https://doi.org/10.17743/jaes.2022.0204},
}
RIS format (EndNote, RefMan)
TY  - JOUR
AB  - Neural audio synthesis (NAS) models offer interactive musical control over high-quality, expressive audio generators. While these models can operate in real time, they often suffer from high latency, making them unsuitable for intimate musical interaction. The impact of architectural choices in deep learning models on audio latency remains largely unexplored in the NAS literature. In this work, the authors investigate the sources of latency and jitter typically found in interactive NAS models. They then apply this analysis to the task of timbre transfer using the RAVE model (Realtime Audio Variational autoEncoder), a convolutional variational autoencoder for audio waveforms introduced by Caillon and Esling in 2021. Finally, an iterative design approach for optimizing latency is presented. This culminates with a model the authors call BRAVE (Bravely Realtime Audio Variational autoEncoder), which is low-latency and exhibits better pitch and loudness replication while showing timbre modification capabilities similar to RAVE. It is implemented in a specialized inference framework for low-latency, real-time inference, and a proof-of-concept audio plugin compatible with audio signals from musical instruments is presented. The authors expect the challenges and guidelines described in this document to support NAS researchers in designing models for low-latency inference from the ground up, enriching the landscape of possibilities for musicians.
AU  - Caspe,F
AU  - Shier,J
AU  - Sandler,M
AU  - Saitis,C
AU  - McPherson,A
DO  - 10.17743/jaes.2022.0204
PY  - 2025///
SN  - 1549-4950
TI  - Designing neural synthesizers for low-latency interaction
T2  - AES: Journal of the Audio Engineering Society
UR  - http://dx.doi.org/10.17743/jaes.2022.0204
VL  - 73
ER  - 