Publications

169 / 3,803 publications found.


  •  Pan, Z., Wichern, G., Germain, F., Subramanian, A.S., Le Roux, J., "Towards End-to-end Speaker Diarization in the Wild", arXiv, November 2022.
    BibTeX arXiv
    • @article{Pan2022nov,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, Francois and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Towards End-to-end Speaker Diarization in the Wild}},
    • journal = {arXiv},
    • year = 2022,
    • month = nov,
    • url = {https://arxiv.org/abs/2211.01299}
    • }
  •  Hori, C., Hori, T., Le Roux, J., "Low-Latency Streaming Scene-aware Interaction Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/​Interspeech.2022-10891, September 2022, pp. 4511-4515.
    BibTeX TR2022-116 PDF
    • @inproceedings{Hori2022sep,
    • author = {Hori, Chiori and Hori, Takaaki and {Le Roux}, Jonathan},
    • title = {{Low-Latency Streaming Scene-aware Interaction Using Audio-Visual Transformers}},
    • booktitle = {Interspeech},
    • year = 2022,
    • pages = {4511--4515},
    • month = sep,
    • doi = {10.21437/Interspeech.2022-10891},
    • url = {https://www.merl.com/publications/TR2022-116}
    • }
  •  Tzinis, E., Wichern, G., Subramanian, A.S., Smaragdis, P., Le Roux, J., "Heterogeneous Target Speech Separation", Interspeech, DOI: 10.21437/​Interspeech.2022-10717, September 2022, pp. 1796-1800.
    BibTeX TR2022-115 PDF Video Presentation
    • @inproceedings{Tzinis2022sep,
    • author = {Tzinis, Efthymios and Wichern, Gordon and Subramanian, Aswin Shanmugam and Smaragdis, Paris and {Le Roux}, Jonathan},
    • title = {{Heterogeneous Target Speech Separation}},
    • booktitle = {Interspeech},
    • year = 2022,
    • pages = {1796--1800},
    • month = sep,
    • doi = {10.21437/Interspeech.2022-10717},
    • url = {https://www.merl.com/publications/TR2022-115}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/​JSTSP.2022.3195367, Vol. 16, No. 6, pp. 1424-1438, September 2022.
    BibTeX TR2022-112 PDF
    • @article{Higuchi2022sep,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Momentum Pseudo-Labeling: Semi-Supervised ASR with Continuously Improving Pseudo-Labels}},
    • journal = {IEEE Journal of Selected Topics in Signal Processing},
    • year = 2022,
    • volume = 16,
    • number = 6,
    • pages = {1424--1438},
    • month = sep,
    • doi = {10.1109/JSTSP.2022.3195367},
    • issn = {1941-0484},
    • url = {https://www.merl.com/publications/TR2022-112}
    • }
  •  Venkatesh, S., Wichern, G., Subramanian, A.S., Le Roux, J., "Disentangled Surrogate Task Learning for Improved Domain Generalization in Unsupervised Anomalous Sound Detection," Tech. Rep. TR2022-092, Detection and Classification of Acoustic Scenes and Events (DCASE) Challenge 2022, July 2022.
    BibTeX TR2022-092 PDF Presentation
    • @techreport{Venkatesh2022jul,
    • author = {Venkatesh, Satvik and Wichern, Gordon and Subramanian, Aswin Shanmugam and {Le Roux}, Jonathan},
    • title = {{Disentangled Surrogate Task Learning for Improved Domain Generalization in Unsupervised Anomalous Sound Detection}},
    • institution = {DCASE2022 Challenge},
    • year = 2022,
    • month = jul,
    • url = {https://www.merl.com/publications/TR2022-092}
    • }
  •  Chang, X., Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9747375, April 2022, pp. 7322-7326.
    BibTeX TR2022-021 PDF
    • @inproceedings{Chang2022apr,
    • author = {Chang, Xuankai and Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan},
    • title = {{Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7322--7326},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9747375},
    • url = {https://www.merl.com/publications/TR2022-021}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9746275, April 2022, pp. 7672-7676.
    BibTeX TR2022-026 PDF
    • @inproceedings{Higuchi2022apr,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7672--7676},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9746275},
    • url = {https://www.merl.com/publications/TR2022-026}
    • }
  •  Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Sequence Transduction with Graph-based Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9747788, April 2022, pp. 7212-7216.
    BibTeX TR2022-024 PDF
    • @inproceedings{Moritz2022apr,
    • author = {Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan},
    • title = {{Sequence Transduction with Graph-based Supervision}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7212--7216},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP43922.2022.9747788},
    • url = {https://www.merl.com/publications/TR2022-024}
    • }
  •  Petermann, D., Wichern, G., Wang, Z.-Q., Le Roux, J., "The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9746005, April 2022, pp. 526-530.
    BibTeX TR2022-022 PDF Video Software
    • @inproceedings{Petermann2022apr,
    • author = {Petermann, Darius and Wichern, Gordon and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
    • title = {{The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {526--530},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9746005},
    • url = {https://www.merl.com/publications/TR2022-022}
    • }
  •  Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
    BibTeX TR2022-019 PDF
    • @inproceedings{Shah2022apr,
    • author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and {Le Roux}, Jonathan and Hori, Chiori},
    • title = {{Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7732--7736},
    • month = apr,
    • publisher = {IEEE},
    • issn = {1520-6149},
    • isbn = {978-1-6654-0540-9},
    • url = {https://www.merl.com/publications/TR2022-019}
    • }
  •  Slizovskaia, O., Wichern, G., Wang, Z.-Q., Le Roux, J., "Locate This, Not That: Class-Conditioned Sound Event DOA Estimation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP43922.2022.9747604, April 2022, pp. 711-715.
    BibTeX TR2022-023 PDF
    • @inproceedings{Slizovskaia2022mar,
    • author = {Slizovskaia, Olga and Wichern, Gordon and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
    • title = {{Locate This, Not That: Class-Conditioned Sound Event DOA Estimation}},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {711--715},
    • month = apr,
    • doi = {10.1109/ICASSP43922.2022.9747604},
    • url = {https://www.merl.com/publications/TR2022-023}
    • }
  •  Hori, C., Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Le Roux, J., Marks, T.K., "Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
    BibTeX TR2022-016 PDF
    • @inproceedings{Hori2022feb,
    • author = {Hori, Chiori and Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and {Le Roux}, Jonathan and Marks, Tim K.},
    • title = {{Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10}},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-016}
    • }
  •  Shah, A.P., Hori, T., Le Roux, J., Hori, C., "DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning", The 10th Dialog System Technology Challenge Workshop at AAAI 2022, February 2022.
    BibTeX TR2022-025 PDF
    • @inproceedings{Shah2022feb,
    • author = {Shah, Ankit Parag and Hori, Takaaki and {Le Roux}, Jonathan and Hori, Chiori},
    • title = {{DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning}},
    • booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI 2022},
    • year = 2022,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2022-025}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labelingによる半教師ありEnd-to-End音声認識", Acoustical Society of Japan Spring Meeting (ASJ), February 2022.
    BibTeX
    • @inproceedings{Higuchi2022feb,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Momentum Pseudo-Labelingによる半教師ありEnd-to-End音声認識}},
    • booktitle = {Acoustical Society of Japan Spring Meeting (ASJ)},
    • year = 2022,
    • month = feb
    • }
  •  Cherian, A., Hori, C., Marks, T.K., Le Roux, J., "(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering", AAAI Conference on Artificial Intelligence, DOI: 10.1609/​aaai.v36i1.19922, February 2022, pp. 444-453.
    BibTeX TR2022-014 PDF Video Presentation
    • @inproceedings{Cherian2022feb,
    • author = {Cherian, Anoop and Hori, Chiori and Marks, Tim K. and {Le Roux}, Jonathan},
    • title = {{(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering}},
    • booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
    • year = 2022,
    • pages = {444--453},
    • month = feb,
    • doi = {10.1609/aaai.v36i1.19922},
    • url = {https://www.merl.com/publications/TR2022-014}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/​TASLP.2021.3129363, Vol. 29, pp. 3476-3490, December 2021.
    BibTeX TR2021-144 PDF
    • @article{Wang2021dec,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and {Le Roux}, Jonathan},
    • title = {{Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation}},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2021,
    • volume = 29,
    • pages = {3476--3490},
    • month = dec,
    • doi = {10.1109/TASLP.2021.3129363},
    • url = {https://www.merl.com/publications/TR2021-144}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "On The Compensation Between Magnitude and Phase in Speech Separation", IEEE Signal Processing Letters, DOI: 10.1109/​LSP.2021.3116502, Vol. 28, pp. 2018-2022, November 2021.
    BibTeX TR2021-137 PDF
    • @article{Wang2021nov2,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and {Le Roux}, Jonathan},
    • title = {{On The Compensation Between Magnitude and Phase in Speech Separation}},
    • journal = {IEEE Signal Processing Letters},
    • year = 2021,
    • volume = 28,
    • pages = {2018--2022},
    • month = nov,
    • doi = {10.1109/LSP.2021.3116502},
    • url = {https://www.merl.com/publications/TR2021-137}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Reverberant Speech Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA52581.2021.9632667, October 2021, pp. 56-60.
    BibTeX TR2021-127 PDF
    • @inproceedings{Wang2021oct4,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and {Le Roux}, Jonathan},
    • title = {{Convolutive Prediction for Reverberant Speech Separation}},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2021,
    • pages = {56--60},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA52581.2021.9632667},
    • url = {https://www.merl.com/publications/TR2021-127}
    • }
  •  Wichern, G., Chakrabarty, A., Wang, Z.-Q., Le Roux, J., "Anomalous sound detection using attentive neural processes", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA52581.2021.9632762, October 2021, pp. 186-190.
    BibTeX TR2021-129 PDF
    • @inproceedings{Wichern2021oct,
    • author = {Wichern, Gordon and Chakrabarty, Ankush and Wang, Zhong-Qiu and {Le Roux}, Jonathan},
    • title = {{Anomalous sound detection using attentive neural processes}},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2021,
    • pages = {186--190},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA52581.2021.9632762},
    • url = {https://www.merl.com/publications/TR2021-129}
    • }
  •  Chatterjee, M., Le Roux, J., Ahuja, N., Cherian, A., "Visual Scene Graphs for Audio Source Separation", IEEE International Conference on Computer Vision (ICCV), October 2021, pp. 1204-1213.
    BibTeX TR2021-095 PDF Video Software
    • @inproceedings{Chatterjee2021oct,
    • author = {Chatterjee, Moitreya and {Le Roux}, Jonathan and Ahuja, Narendra and Cherian, Anoop},
    • title = {{Visual Scene Graphs for Audio Source Separation}},
    • booktitle = {IEEE International Conference on Computer Vision (ICCV)},
    • year = 2021,
    • pages = {1204--1213},
    • month = oct,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2021-095}
    • }
  •  Wang, Z.-Q., Wichern, G., Le Roux, J., "Leveraging Low-Distortion Target Estimates for Improved Speech Enhancement", arXiv, October 2021.
    BibTeX arXiv
    • @article{Wang2021oct,
    • author = {Wang, Zhong-Qiu and Wichern, Gordon and {Le Roux}, Jonathan},
    • title = {{Leveraging Low-Distortion Target Estimates for Improved Speech Enhancement}},
    • journal = {arXiv},
    • year = 2021,
    • month = oct,
    • url = {https://arxiv.org/abs/2110.00570}
    • }
  •  Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2021-571, September 2021, pp. 726-730.
    BibTeX TR2021-103 PDF
    • @inproceedings{Higuchi2021sep,
    • author = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
    • title = {{Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition}},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {726--730},
    • month = sep,
    • doi = {10.21437/Interspeech.2021-571},
    • url = {https://www.merl.com/publications/TR2021-103}
    • }
  •  Hori, T., Moritz, N., Hori, C., Le Roux, J., "Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers", Interspeech, DOI: 10.21437/​Interspeech.2021-1643, August 2021, pp. 2097-2101.
    BibTeX TR2021-100 PDF
    • @inproceedings{Hori2021aug3,
    • author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and {Le Roux}, Jonathan},
    • title = {{Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers}},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {2097--2101},
    • month = aug,
    • doi = {10.21437/Interspeech.2021-1643},
    • url = {https://www.merl.com/publications/TR2021-100}
    • }
  •  Hori, C., Hori, T., Le Roux, J., "Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/​Interspeech.2021-1975, August 2021, pp. 586-590.
    BibTeX TR2021-093 PDF
    • @inproceedings{Hori2021aug2,
    • author = {Hori, Chiori and Hori, Takaaki and {Le Roux}, Jonathan},
    • title = {{Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers}},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {586--590},
    • month = aug,
    • publisher = {ISCA},
    • doi = {10.21437/Interspeech.2021-1975},
    • url = {https://www.merl.com/publications/TR2021-093}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition", Interspeech, DOI: 10.21437/​Interspeech.2021-1693, August 2021, pp. 1822-1826.
    BibTeX TR2021-094 PDF
    • @inproceedings{Moritz2021aug,
    • author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
    • title = {{Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition}},
    • booktitle = {Interspeech},
    • year = 2021,
    • pages = {1822--1826},
    • month = aug,
    • doi = {10.21437/Interspeech.2021-1693},
    • url = {https://www.merl.com/publications/TR2021-094}
    • }