Publications

Masuyama, Y., "Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition," Tech. Rep. TR2025-097, Jelinek Summer Workshop on Speech and Language Technology (JSALT), June 2025.
BibTeX TR2025-097 PDF
- @techreport{Masuyama2025jun,
- author = {Masuyama, Yoshiki},
- title = {{Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition}},
- institution = {Jelinek Summer Workshop on Speech and Language Technology (JSALT)},
- number = {TR2025-097},
- year = 2025,
- month = jun,
- url = {https://www.merl.com/publications/TR2025-097}
- }
Masuyama, Y., Chang, X., Zhang, W., Cornell, S., Wang, Z.-Q., Ono, N., Qian, Y., Watanabe, S., "An End-to-End Integration of Speech Separation and Recognition with Self-Supervised Learning Representation", Computer Speech & Language, DOI: 10.1016/j.csl.2025.101813, Vol. 95, pp. 101813, May 2025.
BibTeX TR2025-054 PDF
- @article{Masuyama2025may,
- author  = {Masuyama, Yoshiki and Chang, Xuankai and Zhang, Wangyou and Cornell, Samuele and Wang, Zhong-Qiu and Ono, Nobutaka and Qian, Yanmin and Watanabe, Shinji},
- title   = {{An End-to-End Integration of Speech Separation and Recognition with Self-Supervised Learning Representation}},
- journal = {Computer Speech \& Language},
- volume  = {95},
- pages   = {101813},
- year    = {2025},
- month   = may,
- issn    = {0885-2308},
- doi     = {10.1016/j.csl.2025.101813},
- url     = {https://www.merl.com/publications/TR2025-054}
- }
Saijo, K., Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Task-Aware Unified Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49660.2025.10887819, April 2025.
BibTeX TR2025-032 PDF
- @inproceedings{Saijo2025mar,
- author = {Saijo, Kohei and Ebbers, Janek and Germain, Fran{\c{c}}ois G. and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Task-Aware Unified Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = apr,
- doi = {10.1109/ICASSP49660.2025.10887819},
- url = {https://www.merl.com/publications/TR2025-032}
- }
Yataka, R., Wang, P., Boufounos, P.T., Takahashi, R., "Multi-View Radar Detection Transformer with Differentiable Positional Encoding", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2025.
BibTeX TR2025-027 PDF
- @inproceedings{Yataka2025mar,
- author = {Yataka, Ryoma and Wang, Pu and Boufounos, Petros T. and Takahashi, Ryuhei},
- title = {{Multi-View Radar Detection Transformer with Differentiable Positional Encoding}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = apr,
- url = {https://www.merl.com/publications/TR2025-027}
- }
Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/IWAENC61483.2024.10694313, September 2024, pp. 205-209.
BibTeX TR2024-126 PDF Software
- @inproceedings{Saijo2024sep2,
- author = {Saijo, Kohei and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and Pan, Zexu and {Le Roux}, Jonathan},
- title = {{TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement}},
- booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
- year = 2024,
- pages = {205--209},
- month = sep,
- doi = {10.1109/IWAENC61483.2024.10694313},
- issn = {2835-3439},
- isbn = {979-8-3503-6185-8},
- url = {https://www.merl.com/publications/TR2024-126}
- }
Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "Enhanced Reverberation as Supervision for Unsupervised Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2024-1241, September 2024, pp. 607-611.
BibTeX TR2024-116 PDF Software
- @inproceedings{Saijo2024sep,
- author = {Saijo, Kohei and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and Pan, Zexu and {Le Roux}, Jonathan},
- title = {{Enhanced Reverberation as Supervision for Unsupervised Speech Separation}},
- booktitle = {Interspeech},
- year = 2024,
- pages = {607--611},
- month = sep,
- doi = {10.21437/Interspeech.2024-1241},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-116}
- }
Yataka, R., Wang, P., Boufounos, P.T., Takahashi, R., "Radar Perception with Scalable Connective Temporal Relations for Autonomous Driving", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446449, March 2024, pp. 13266-13270.
BibTeX TR2024-023 PDF
- @inproceedings{Yataka2024mar,
- author    = {Yataka, Ryoma and Wang, Pu and Boufounos, Petros T. and Takahashi, Ryuhei},
- title     = {{Radar Perception with Scalable Connective Temporal Relations for Autonomous Driving}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- publisher = {IEEE},
- pages     = {13266--13270},
- year      = {2024},
- month     = mar,
- isbn      = {979-8-3503-4485-1},
- issn      = {2379-190X},
- doi       = {10.1109/ICASSP48485.2024.10446449},
- url       = {https://www.merl.com/publications/TR2024-023}
- }
Baoueb, T., Liu, H., Fontaine, M., Le Roux, J., Richard, G., "SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10446830, March 2024, pp. 986-990.
BibTeX TR2024-013 PDF
- @inproceedings{Baoueb2024mar,
- author = {Baoueb, Teysir and Liu, Haocheng and Fontaine, Mathieu and {Le Roux}, Jonathan and Richard, Ga{\"e}l},
- title = {{SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {986--990},
- month = mar,
- doi = {10.1109/ICASSP48485.2024.10446830},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-013}
- }
Hori, C., Wang, P., Rahman, M., Vaca-Rubio, C., Khurana, S., Cherian, A., Le Roux, J., "Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447600, March 2024, pp. 13296-13300.
BibTeX TR2024-012 PDF
- @inproceedings{Hori2024mar,
- author    = {Hori, Chiori and Wang, Pu and Rahman, Mahbub and Vaca-Rubio, Cristian and Khurana, Sameer and Cherian, Anoop and {Le Roux}, Jonathan},
- title     = {{Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- publisher = {IEEE},
- pages     = {13296--13300},
- year      = {2024},
- month     = mar,
- isbn      = {979-8-3503-4485-1},
- issn      = {2379-190X},
- doi       = {10.1109/ICASSP48485.2024.10447600},
- url       = {https://www.merl.com/publications/TR2024-012}
- }
Pan, Z., Wichern, G., Masuyama, Y., Germain, F.G., Khurana, S., Hori, C., Le Roux, J., "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU57964.2023.10389618, December 2023.
BibTeX TR2023-152 PDF Video
- @inproceedings{Pan2023dec2,
- author = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, Fran{\c{c}}ois G. and Khurana, Sameer and Hori, Chiori and {Le Roux}, Jonathan},
- title = {{Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2023,
- month = dec,
- doi = {10.1109/ASRU57964.2023.10389618},
- isbn = {979-8-3503-0689-7},
- url = {https://www.merl.com/publications/TR2023-152}
- }
Yen, H., Germain, F., Wichern, G., Le Roux, J., "Cold Diffusion for Speech Enhancement", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49357.2023.10096064, May 2023, pp. 1-5.
BibTeX TR2023-020 PDF
- @inproceedings{Yen2023may,
- author = {Yen, Hao and Germain, Fran{\c{c}}ois G. and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Cold Diffusion for Speech Enhancement}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2023,
- pages = {1--5},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICASSP49357.2023.10096064},
- url = {https://www.merl.com/publications/TR2023-020}
- }
Wang, Z.-Q., Wichern, G., Watanabe, S., Le Roux, J., "STFT-Domain Neural Speech Enhancement with Very Low Algorithmic Latency", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2022.3224285, Vol. 31, pp. 397-410, December 2022.
BibTeX TR2022-166 PDF
- @article{Wang2022dec2,
- author  = {Wang, Zhong-Qiu and Wichern, Gordon and Watanabe, Shinji and {Le Roux}, Jonathan},
- title   = {{STFT-Domain Neural Speech Enhancement with Very Low Algorithmic Latency}},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- volume  = {31},
- pages   = {397--410},
- year    = {2022},
- month   = dec,
- issn    = {2329-9304},
- doi     = {10.1109/TASLP.2022.3224285},
- url     = {https://www.merl.com/publications/TR2022-166}
- }
Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746275, April 2022, pp. 7672-7676.
BibTeX TR2022-026 PDF
- @inproceedings{Higuchi2022apr,
- author    = {Higuchi, Yosuke and Moritz, Niko and {Le Roux}, Jonathan and Hori, Takaaki},
- title     = {{Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- publisher = {IEEE},
- pages     = {7672--7676},
- year      = {2022},
- month     = apr,
- doi       = {10.1109/ICASSP43922.2022.9746275},
- url       = {https://www.merl.com/publications/TR2022-026}
- }
Wang, Z.-Q., Wichern, G., Le Roux, J., "Leveraging Low-Distortion Target Estimates for Improved Speech Enhancement", arXiv, October 2021.
BibTeX arXiv
- @article{Wang2021oct,
- author = {Wang, Zhong-Qiu and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Leveraging Low-Distortion Target Estimates for Improved Speech Enhancement}},
- journal = {arXiv},
- year = 2021,
- month = oct,
- eprint = {2110.00570},
- archiveprefix = {arXiv},
- url = {https://arxiv.org/abs/2110.00570}
- }
Watanabe, S., Boyer, F., Chang, X., Guo, P., Hayashi, T., Higuchi, Y., Hori, T., Huang, W.-C., Inaguma, H., Kamo, N., Karita, S., Li, C., Shi, J., Subramanian, A.S., Zhang, W., "The 2020 ESPNET Update: New Features, Broadened Applications, Performance Improvements, and Future Plans", IEEE Data Science and Learning Workshop (DSLW), DOI: 10.1109/DSLW51110.2021.9523402, June 2021, pp. 1-6.
BibTeX TR2021-073 PDF
- @inproceedings{Watanabe2021jun,
- author = {Watanabe, Shinji and Boyer, Florian and Chang, Xuankai and Guo, Pengcheng and Hayashi, Tomoki and Higuchi, Yosuke and Hori, Takaaki and Huang, Wen-Chin and Inaguma, Hirofumi and Kamo, Naoyuki and Karita, Shigeki and Li, Chenda and Shi, Jing and Subramanian, Aswin S. and Zhang, Wangyou},
- title = {{The 2020 ESPNET Update: New Features, Broadened Applications, Performance Improvements, and Future Plans}},
- booktitle = {IEEE Data Science and Learning Workshop (DSLW)},
- year = 2021,
- pages = {1--6},
- month = jun,
- publisher = {IEEE},
- doi = {10.1109/DSLW51110.2021.9523402},
- isbn = {978-1-6654-2826-2},
- url = {https://www.merl.com/publications/TR2021-073}
- }
Chang, X., Zhang, W., Qian, Y., Le Roux, J., Watanabe, S., "MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), December 2019, pp. 237-244.
BibTeX TR2019-157 PDF
- @inproceedings{Chang2019dec,
- author = {Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and {Le Roux}, Jonathan and Watanabe, Shinji},
- title = {{MIMO-Speech: End-to-End Multi-Channel Multi-Speaker Speech Recognition}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2019,
- pages = {237--244},
- month = dec,
- isbn = {978-1-7281-0305-1},
- url = {https://www.merl.com/publications/TR2019-157}
- }
Kavalerov, I., Wisdom, S., Erdogan, H., Patton, B., Wilson, K., Le Roux, J., Hershey, J., "Universal Sound Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937253, October 2019, pp. 170-174.
BibTeX TR2019-123 PDF
- @inproceedings{Kavalerov2019oct,
- author    = {Kavalerov, Ilya and Wisdom, Scott and Erdogan, Hakan and Patton, Brian and Wilson, Kevin and {Le Roux}, Jonathan and Hershey, John},
- title     = {{Universal Sound Separation}},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- pages     = {170--174},
- year      = {2019},
- month     = oct,
- isbn      = {978-1-7281-1123-0},
- issn      = {1947-1629},
- doi       = {10.1109/WASPAA.2019.8937253},
- url       = {https://www.merl.com/publications/TR2019-123}
- }
Kadu, A., Mansour, H., Boufounos, P.T., Liu, D., "Reflection Tomographic Imaging of Highly Scattering Objects Using Incremental Frequency Inversion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682393, May 2019.
BibTeX TR2019-012 PDF Video
- @inproceedings{Kadu2019may,
- author    = {Kadu, Ajinkya and Mansour, Hassan and Boufounos, Petros T. and Liu, Dehong},
- title     = {{Reflection Tomographic Imaging of Highly Scattering Objects Using Incremental Frequency Inversion}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year      = {2019},
- month     = may,
- doi       = {10.1109/ICASSP.2019.8682393},
- url       = {https://www.merl.com/publications/TR2019-012}
- }
Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "The Phasebook: Building Complex Masks via Discrete Representations for Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682587, May 2019.
BibTeX TR2019-008 PDF
- @inproceedings{LeRoux2019may2,
- author    = {{Le Roux}, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
- title     = {{The Phasebook: Building Complex Masks via Discrete Representations for Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year      = {2019},
- month     = may,
- doi       = {10.1109/ICASSP.2019.8682587},
- url       = {https://www.merl.com/publications/TR2019-008}
- }
Le Roux, J., Wisdom, S., Erdogan, H., Hershey, J., "SDR -- Half-Baked or Well Done?", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683855, May 2019.
BibTeX TR2019-013 PDF
- @inproceedings{LeRoux2019may,
- author = {{Le Roux}, Jonathan and Wisdom, Scott and Erdogan, Hakan and Hershey, John},
- title = {{SDR -- Half-Baked or Well Done?}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683855},
- url = {https://www.merl.com/publications/TR2019-013}
- }
Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "Phasebook and Friends: Leveraging discrete representations for source separation", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2019.2904183, Vol. 13, No. 2, pp. 370-382, March 2019.
BibTeX TR2018-199 PDF
- @article{LeRoux2019mar,
- author  = {{Le Roux}, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
- title   = {{Phasebook and Friends: Leveraging discrete representations for source separation}},
- journal = {IEEE Journal of Selected Topics in Signal Processing},
- volume  = {13},
- number  = {2},
- pages   = {370--382},
- year    = {2019},
- month   = mar,
- doi     = {10.1109/JSTSP.2019.2904183},
- url     = {https://www.merl.com/publications/TR2018-199}
- }
Wichern, G., Le Roux, J., "Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/IWAENC.2018.8521243, September 2018.
BibTeX TR2018-146 PDF
- @inproceedings{Wichern2018sep,
- author    = {Wichern, Gordon and {Le Roux}, Jonathan},
- title     = {{Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation}},
- booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
- year      = {2018},
- month     = sep,
- doi       = {10.1109/IWAENC.2018.8521243},
- url       = {https://www.merl.com/publications/TR2018-146}
- }
Xiao, X., Watanabe, S., Erdogan, H., Mandel, M., Lu, L., Hershey, J., Seltzer, M., Chen, G., Zhang, Y., Yu, D., "Discriminative beamforming with phase aware neural networks for speech enhancement and recognition" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J.R., Eds., chapter 4, Springer, July 9, 2018.
BibTeX
- @incollection{Xiao2018jul2,
- author = {Xiao, Xiong and Watanabe, Shinji and Erdogan, Hakan and Mandel, Michael and Lu, Liang and Hershey, John and Seltzer, Mike and Chen, Guoguo and Zhang, Yu and Yu, Dong},
- title = {{Discriminative beamforming with phase aware neural networks for speech enhancement and recognition}},
- booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
- year = 2018,
- editor = {Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J. R.},
- chapter = 4,
- month = jul,
- publisher = {Springer}
- }
Ochiai, T., Watanabe, S., Katagiri, S., Hori, T., Hershey, J.R., "Speaker Adaptation for Multichannel End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462161, April 2018, pp. 6707-6711.
BibTeX TR2018-006 PDF
- @inproceedings{Ochiai2018apr,
- author    = {Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru and Hori, Takaaki and Hershey, John R.},
- title     = {{Speaker Adaptation for Multichannel End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- pages     = {6707--6711},
- year      = {2018},
- month     = apr,
- doi       = {10.1109/ICASSP.2018.8462161},
- url       = {https://www.merl.com/publications/TR2018-006}
- }
Ochiai, T., Watanabe, S., Katagiri, S., "Does speech enhancement work with end-to-end ASR objectives?: Experimental analysis of multichannel end-to-end ASR", IEEE International Workshop on Machine Learning for Signal Processing (MLSP), DOI: 10.1109/MLSP.2017.8168188, October 2017.
BibTeX TR2017-139 PDF
- @inproceedings{Ochiai2017oct,
- author = {Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru},
- title = {{Does speech enhancement work with end-to-end ASR objectives?: Experimental analysis of multichannel end-to-end ASR}},
- booktitle = {IEEE International Workshop on Machine Learning for Signal Processing (MLSP)},
- year = 2017,
- month = oct,
- doi = {10.1109/MLSP.2017.8168188},
- url = {https://www.merl.com/publications/TR2017-139}
- }