Publications

180 / 3,937 publications found.


  •  Petermann, D., Wichern, G., Subramanian, A.S., Wang, Z.-Q., Le Roux, J., "Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2023.3290428, Vol. 31, pp. 2592-2605, September 2023.
    BibTeX TR2023-113 PDF
    • @article{Petermann2023sep,
    • author = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
    • title = {{Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks}},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2023,
    • volume = 31,
    • pages = {2592--2605},
    • month = sep,
    • doi = {10.1109/TASLP.2023.3290428},
    • issn = {2329-9304},
    • url = {https://www.merl.com/publications/TR2023-113}
    • }
  •  Hori, C., Peng, P., Harwath, D., Liu, X., Ota, K., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Style-transfer based Speech and Audio-visual Scene understanding for Robot Action Sequence Acquisition from Videos", Interspeech, DOI: 10.21437/Interspeech.2023-1983, August 2023, pp. 4663-4667.
    BibTeX TR2023-104 PDF
    • @inproceedings{Hori2023aug,
    • author = {Hori, Chiori and Peng, Puyuan and Harwath, David and Liu, Xinyu and Ota, Kei and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and {Le Roux}, Jonathan},
    • title = {{Style-transfer based Speech and Audio-visual Scene understanding for Robot Action Sequence Acquisition from Videos}},
    • booktitle = {Interspeech},
    • year = 2023,
    • pages = {4663--4667},
    • month = aug,
    • doi = {10.21437/Interspeech.2023-1983},
    • url = {https://www.merl.com/publications/TR2023-104}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F., Le Roux, J., Watanabe, S., "BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up," Tech. Rep. TR2023-068, DCASE2023 Challenge, May 2023.
    BibTeX TR2023-068 PDF
    • @techreport{Wu2023may,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, Francois and {Le Roux}, Jonathan and Watanabe, Shinji},
    • title = {{BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up}},
    • institution = {DCASE2023 Challenge},
    • year = 2023,
    • month = may,
    • url = {https://www.merl.com/publications/TR2023-068}
    • }
  •  Chen, K., Wichern, G., Germain, F., Le Roux, J., "Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT", IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB), DOI: 10.1109/ICASSPW59220.2023.10193575, May 2023.
    BibTeX TR2023-030 PDF
    • @inproceedings{Chen2023may,
    • author = {Chen, Ke and Wichern, Gordon and Germain, Francois and {Le Roux}, Jonathan},
    • title = {{Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT}},
    • booktitle = {IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB)},
    • year = 2023,
    • month = may,
    • doi = {10.1109/ICASSPW59220.2023.10193575},
    • isbn = {979-8-3503-0261-5},
    • url = {https://www.merl.com/publications/TR2023-030}
    • }
  •  Aralikatti, R., Boeddeker, C., Wichern, G., Subramanian, A.S., Le Roux, J., "Reverberation as Supervision for Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10095022, May 2023, pp. 1-5.
    BibTeX TR2023-016 PDF
    • @inproceedings{Aralikatti2023may,
    • author = {Aralikatti, Rohith and Boeddeker, Christoph and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Reverberation as Supervision for Speech Separation}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2023,
    • pages = {1--5},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP49357.2023.10095022},
    • url = {https://www.merl.com/publications/TR2023-016}
    • }
  •  Bralios, D., Tzinis, E., Wichern, G., Smaragdis, P., Le Roux, J., "Latent Iterative Refinement for Modular Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10096897, May 2023, pp. 1-5.
    BibTeX TR2023-019 PDF
    • @inproceedings{Bralios2023may,
    • author = {Bralios, Dimitrios and Tzinis, Efthymios and Wichern, Gordon and Smaragdis, Paris and {Le Roux}, Jonathan},
    • title = {{Latent Iterative Refinement for Modular Source Separation}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2023,
    • pages = {1--5},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP49357.2023.10096897},
    • url = {https://www.merl.com/publications/TR2023-019}
    • }
  •  Petermann, D., Wichern, G., Subramanian, A.S., Le Roux, J., "Hyperbolic Audio Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10094943, May 2023, pp. 1-5.
    BibTeX TR2023-017 PDF Video Software
    • @inproceedings{Petermann2023may,
    • author = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Hyperbolic Audio Source Separation}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2023,
    • pages = {1--5},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP49357.2023.10094943},
    • url = {https://www.merl.com/publications/TR2023-017}
    • }
  •  Tzinis, E., Wichern, G., Smaragdis, P., Le Roux, J., "Optimal Condition Training for Target Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10095128, May 2023, pp. 1-5.
    BibTeX TR2023-018 PDF
    • @inproceedings{Tzinis2023may,
    • author = {Tzinis, Efthymios and Wichern, Gordon and Smaragdis, Paris and {Le Roux}, Jonathan},
    • title = {{Optimal Condition Training for Target Source Separation}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2023,
    • pages = {1--5},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP49357.2023.10095128},
    • url = {https://www.merl.com/publications/TR2023-018}
    • }
  •  Yen, H., Germain, F., Wichern, G., Le Roux, J., "Cold Diffusion for Speech Enhancement", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10096064, May 2023, pp. 1-5.
    BibTeX TR2023-020 PDF
    • @inproceedings{Yen2023may,
    • author = {Yen, Hao and Germain, Francois and Wichern, Gordon and {Le Roux}, Jonathan},
    • title = {{Cold Diffusion for Speech Enhancement}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2023,
    • pages = {1--5},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP49357.2023.10096064},
    • url = {https://www.merl.com/publications/TR2023-020}
    • }
  •  Wang, Z.-Q., Wichern, G., Watanabe, S., Le Roux, J., "STFT-Domain Neural Speech Enhancement with Very Low Algorithmic Latency", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2022.3224285, Vol. 31, pp. 397-410, December 2022.
    BibTeX TR2022-166 PDF
    • @article{Wang2022dec2,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and Watanabe, Shinji and {Le Roux}, Jonathan},
    • title = {{STFT-Domain Neural Speech Enhancement with Very Low Algorithmic Latency}},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2022,
    • volume = 31,
    • pages = {397--410},
    • month = dec,
    • doi = {10.1109/TASLP.2022.3224285},
    • issn = {2329-9304},
    • url = {https://www.merl.com/publications/TR2022-166}
    • }
  •  Venkatesh, S., Wichern, G., Subramanian, A.S., Le Roux, J., "Improved Domain Generalization via Disentangled Multi-Task Learning in Unsupervised Anomalous Sound Detection", DCASE Workshop, Lagrange, M., Mesaros, A., Pellegrini, T., Richard, G., Serizel, R., Stowell, D., Eds., November 2022.
    BibTeX TR2022-146 PDF Presentation
    • @inproceedings{Venkatesh2022nov,
    • author = {Venkatesh, Satvik and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Improved Domain Generalization via Disentangled Multi-Task Learning in Unsupervised Anomalous Sound Detection}},
    • booktitle = {DCASE Workshop},
    • year = 2022,
    • editor = {Lagrange, M. and Mesaros, A. and Pellegrini, T. and Richard, G. and Serizel, R. and Stowell, D.},
    • month = nov,
    • isbn = {978-952-03-2677-7},
    • url = {https://www.merl.com/publications/TR2022-146}
    • }
  •  Pan, Z., Wichern, G., Germain, F., Subramanian, A.S., Le Roux, J., "Towards End-to-end Speaker Diarization in the Wild", arXiv, November 2022.
    BibTeX arXiv
    • @article{Pan2022nov,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, Francois and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Towards End-to-end Speaker Diarization in the Wild}},
    • journal = {arXiv},
    • year = 2022,
    • month = nov,
    • url = {https://arxiv.org/abs/2211.01299}
    • }
  •  Hori, C., Hori, T., Le Roux, J., "Low-Latency Streaming Scene-aware Interaction Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/Interspeech.2022-10891, September 2022, pp. 4511-4515.
    BibTeX TR2022-116 PDF
    • @inproceedings{Hori2022sep,
    • author = {Hori, Chiori and Hori, Takaaki and {Le Roux}, Jonathan},
    • title = {{Low-Latency Streaming Scene-aware Interaction Using Audio-Visual Transformers}},
    • booktitle = {Interspeech},
    • year = 2022,
    • pages = {4511--4515},
    • month = sep,
    • doi = {10.21437/Interspeech.2022-10891},
    • url = {https://www.merl.com/publications/TR2022-116}
    • }
  •  Tzinis, E., Wichern, G., Subramanian, A.S., Smaragdis, P., Le Roux, J., "Heterogeneous Target Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2022-10717, September 2022, pp. 1796-1800.
    BibTeX TR2022-115 PDF Video Presentation
    • @inproceedings{Tzinis2022sep,
    • author = {Tzinis, Efthymios and Wichern, Gordon and Subramanian, Aswin Shanmugam and Smaragdis, Paris and {Le Roux}, Jonathan},
    • title = {{Heterogeneous Target Speech Separation}},
    • booktitle = {Interspeech},
    • year = 2022,
    • pages = {1796--1800},
    • month = sep,
    • doi = {10.21437/Interspeech.2022-10717},
    • url = {https://www.merl.com/publications/TR2022-115}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2022.3195367, Vol. 16, No. 6, pp. 1424-1438, September 2022.
    BibTeX TR2022-112 PDF
    • @article{Higuchi2022sep,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels}},
    • journal = {IEEE Journal of Selected Topics in Signal Processing},
    • year = 2022,
    • volume = 16,
    • number = 6,
    • pages = {1424--1438},
    • month = sep,
    • doi = {10.1109/JSTSP.2022.3195367},
    • issn = {1941-0484},
    • url = {https://www.merl.com/publications/TR2022-112}
    • }
  •  Venkatesh, S., Wichern, G., Subramanian, A.S., Le Roux, J., "Disentangled Surrogate Task Learning for Improved Domain Generalization in Unsupervised Anomalous Sound Detection," Tech. Rep. TR2022-092, Detection and Classification of Acoustic Scenes and Events (DCASE) Challenge 2022, July 2022.
    BibTeX TR2022-092 PDF Presentation
    • @techreport{Venkatesh2022jul,
    • author = {Venkatesh, Satvik and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Disentangled Surrogate Task Learning for Improved Domain Generalization in Unsupervised Anomalous Sound Detection}},
    • institution = {DCASE2022 Challenge},
    • year = 2022,
    • month = jul,
    • url = {https://www.merl.com/publications/TR2022-092}
    • }
  •  Chang, X., Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747375, April 2022, pp. 7322-7326.
    BibTeX TR2022-021 PDF
    • @inproceedings{Chang2022apr,
    • author = {Chang, Xuankai and Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan},
    • title = {{Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7322--7326},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9747375},
    • url = {https://www.merl.com/publications/TR2022-021}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746275, April 2022, pp. 7672-7676.
    BibTeX TR2022-026 PDF
    • @inproceedings{Higuchi2022apr,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7672--7676},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9746275},
    • url = {https://www.merl.com/publications/TR2022-026}
    • }
  •  Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Sequence Transduction with Graph-based Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747788, April 2022, pp. 7212-7216.
    BibTeX TR2022-024 PDF
    • @inproceedings{Moritz2022apr,
    • author = {Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan},
    • title = {{Sequence Transduction with Graph-based Supervision}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7212--7216},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9747788},
    • url = {https://www.merl.com/publications/TR2022-024}
    • }
  •  Petermann, D., Wichern, G., Wang, Z.-Q., Le Roux, J., "The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746005, April 2022, pp. 526-530.
    BibTeX TR2022-022 PDF Video Software
    • @inproceedings{Petermann2022apr,
    • author = {Petermann, Darius and Wichern, Gordon and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
    • title = {{The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {526--530},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9746005},
    • url = {https://www.merl.com/publications/TR2022-022}
    • }
  •  Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
    BibTeX TR2022-019 PDF
    • @inproceedings{Shah2022apr,
    • author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and {Le Roux}, Jonathan and Hori, Chiori},
    • title = {{Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7732--7736},
    • month = apr,
    • publisher = {IEEE},
    • issn = {1520-6149},
    • isbn = {978-1-6654-0540-9},
    • url = {https://www.merl.com/publications/TR2022-019}
    • }
  •  Slizovskaia, O., Wichern, G., Wang, Z.-Q., Le Roux, J., "Locate This, Not That: Class-Conditioned Sound Event DOA Estimation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747604, April 2022, pp. 711-715.
    BibTeX TR2022-023 PDF
    • @inproceedings{Slizovskaia2022mar,
    • author = {Slizovskaia, Olga and Wichern, Gordon and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
    • title = {{Locate This, Not That: Class-Conditioned Sound Event DOA Estimation}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {711--715},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9747604},
    • url = {https://www.merl.com/publications/TR2022-023}
    • }
  •  Hori, C., Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Le Roux, J., Marks, T.K., "Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
    BibTeX TR2022-016 PDF
    • @inproceedings{Hori2022feb,
    • author = {Hori, Chiori and Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and {Le Roux}, Jonathan and Marks, Tim K.},
    • title = {{Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10}},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-016}
    • }
  •  Shah, A.P., Hori, T., Le Roux, J., Hori, C., "DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning", The 10th Dialog System Technology Challenge Workshop at AAAI 2022, February 2022.
    BibTeX TR2022-025 PDF
    • @inproceedings{Shah2022feb,
    • author = {Shah, Ankit Parag and Hori, Takaaki and {Le Roux}, Jonathan and Hori, Chiori},
    • title = {{DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning}},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI 2022},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-025}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labelingによる半教師ありEnd-to-End音声認識", Acoustical Society of Japan Spring Meeting (ASJ), February 2022.
    BibTeX
    • @inproceedings{Higuchi2022feb,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Momentum Pseudo-Labelingによる半教師ありEnd-to-End音声認識}},
    • booktitle = {Acoustical Society of Japan Spring Meeting (ASJ)},
    • year = 2022,
    • month = feb
    • }