Publications

Richter, J., Masuyama, Y., Boeddeker, C., Edo, T., Wichern, G., Le Roux, J., "Predictive-Generative Drift Decomposition for Speech Enhancement and Separation", arXiv, May 2026.
BibTeX arXiv
- @article{Richter2026may,
-   author        = {Richter, Julius and Masuyama, Yoshiki and Boeddeker, Christoph and Edo, Takahiro and Wichern, Gordon and {Le Roux}, Jonathan},
-   title         = {{Predictive-Generative Drift Decomposition for Speech Enhancement and Separation}},
-   journal       = {arXiv},
-   month         = may,
-   year          = {2026},
-   eprint        = {2605.06189},
-   archiveprefix = {arXiv},
-   url           = {https://arxiv.org/abs/2605.06189},
- }
Tandi, K., Ali, W.H., Rapp, J., Mansour, H., "Single View Camera-Based Dynamic Airflow Sensing", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-038 PDF
- @inproceedings{Tandi2026may,
-   author    = {Tandi, Kevin and Ali, Wael H. and Rapp, Joshua and Mansour, Hassan},
-   title     = {{Single View Camera-Based Dynamic Airflow Sensing}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-038},
- }
Aihara, R., Masuyama, Y., Paissan, F., Germain, F.G., Wichern, G., Le Roux, J., "SUNAC: Source-aware Unified Neural Audio Codec", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-032 PDF
- @inproceedings{Aihara2026may,
-   author    = {Aihara, Ryo and Masuyama, Yoshiki and Paissan, Francesco and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{SUNAC: Source-aware Unified Neural Audio Codec}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-032},
- }
Han, J., Wang, R., Masuyama, Y., Delcroix, M., Rohdin, J., Du, J., Burget, L., "Spatially Aware Self-Supervised Models for Multi-Channel Neural Speaker Diarization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-047 PDF
- @inproceedings{Han2026may,
-   author    = {Han, Jiangyu and Wang, Ruoyu and Masuyama, Yoshiki and Delcroix, Marc and Rohdin, Johan and Du, Jun and Burget, Lukáš},
-   title     = {{Spatially Aware Self-Supervised Models for Multi-Channel Neural Speaker Diarization}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-047},
- }
Kato, S., Wang, P., Fujihashi, T., Markham, A., "Heatmap-to-SMPL Multi-View Radar Transformer for Multi-Person 3D Pose Estimation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP55912.2026.11463668, May 2026.
BibTeX TR2026-040 PDF
- @inproceedings{Kato2026may,
-   author    = {Kato, Sorachi and Wang, Pu and Fujihashi, Takuya and Markham, Andrew},
-   title     = {{Heatmap-to-SMPL Multi-View Radar Transformer for Multi-Person 3D Pose Estimation}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   publisher = {IEEE},
-   doi       = {10.1109/ICASSP55912.2026.11463668},
-   issn      = {2379-190X},
-   isbn      = {979-8-3315-6701-9},
-   url       = {https://www.merl.com/publications/TR2026-040},
- }
Masuyama, Y., Germain, F.G., Wichern, G., Hori, C., Le Roux, J., "Velocity Potential Neural Field for Efficient Ambisonics Impulse Response Modeling", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-033 PDF
- @inproceedings{Masuyama2026may,
-   author    = {Masuyama, Yoshiki and Germain, François G. and Wichern, Gordon and Hori, Chiori and {Le Roux}, Jonathan},
-   title     = {{Velocity Potential Neural Field for Efficient Ambisonics Impulse Response Modeling}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-033},
- }
Masuyama, Y., Saijo, K., Paissan, F., Han, J., Delcroix, M., Aihara, R., Germain, F.G., Wichern, G., Le Roux, J., "FlexIO: Flexible Single- and Multi-Channel Speech Separation and Enhancement", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-034 PDF
- @inproceedings{Masuyama2026may2,
-   author    = {Masuyama, Yoshiki and Saijo, Kohei and Paissan, Francesco and Han, Jiangyu and Delcroix, Marc and Aihara, Ryo and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{FlexIO: Flexible Single- and Multi-Channel Speech Separation and Enhancement}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-034},
- }
Takahashi, R., Mansour, H., Boufounos, P.T., "Dual-Regularized Iterative Adaptive Approach for DOA Spectrum Reconstruction in Limited Angle Sector", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-039 PDF
- @inproceedings{Takahashi2026may,
-   author    = {Takahashi, Ryuhei and Mansour, Hassan and Boufounos, Petros T.},
-   title     = {{Dual-Regularized Iterative Adaptive Approach for DOA Spectrum Reconstruction in Limited Angle Sector}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-039},
- }
Zhang, H., Ma, Y., Kitichotkul, R., Rapp, J., Boufounos, P.T., "ProxiCBO: A Consensus-based Method for Composite Optimization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), May 2026.
BibTeX TR2026-041 PDF
- @inproceedings{Zhang2026may,
-   author    = {Zhang, Haoyu and Ma, Yanting and Kitichotkul, Ruangrawee and Rapp, Joshua and Boufounos, Petros T.},
-   title     = {{ProxiCBO: A Consensus-based Method for Composite Optimization}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-041},
- }
Aihara, R., Masuyama, Y., Germain, F.G., Wichern, G., Le Roux, J., "Exploring Disentangled Neural Speech Codecs from Self-Supervised Representations", IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW), May 2026.
BibTeX TR2026-035 PDF
- @inproceedings{Aihara2026may2,
-   author    = {Aihara, Ryo and Masuyama, Yoshiki and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{Exploring Disentangled Neural Speech Codecs from Self-Supervised Representations}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)},
-   month     = may,
-   year      = {2026},
-   url       = {https://www.merl.com/publications/TR2026-035},
- }
Wilkinghoff, K., Yang, H., Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Local Density-Based Anomaly Score Normalization for Domain Generalization", IEEE Transactions on Audio, Speech and Language Processing, DOI: 10.1109/TASLPRO.2025.3629236, Vol. 33, pp. 4642-4652, January 2026.
BibTeX TR2026-010 PDF Software
- @article{Wilkinghoff2026jan,
-   author  = {Wilkinghoff, Kevin and Yang, Haici and Ebbers, Janek and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
-   title   = {{Local Density-Based Anomaly Score Normalization for Domain Generalization}},
-   journal = {IEEE Transactions on Audio, Speech and Language Processing},
-   volume  = {33},
-   pages   = {4642--4652},
-   month   = jan,
-   year    = {2026},
-   doi     = {10.1109/TASLPRO.2025.3629236},
-   issn    = {2998-4173},
-   url     = {https://www.merl.com/publications/TR2026-010},
- }
Cornell, S., Boeddeker, C., Park, T., Huang, H., Raj, D., Wiesner, M., Masuyama, Y., Chang, X., Wang, Z.-Q., Squartini, S., Garcia, P., Watanabe, S., "Recent Trends in Distant Conversational Speech Recognition: A Review of CHiME-7 and 8 DASR Challenges", Computer Speech & Language, DOI: 10.1016/j.csl.2025.101901, Vol. 97, pp. 101901, December 2025.
BibTeX TR2026-008 PDF
- @article{Cornell2025dec,
-   author  = {Cornell, Samuele and Boeddeker, Christoph and Park, Taejin and Huang, He and Raj, Desh and Wiesner, Matthew and Masuyama, Yoshiki and Chang, Xuankai and Wang, Zhong-Qiu and Squartini, Stefano and Garcia, Paola and Watanabe, Shinji},
-   title   = {{Recent Trends in Distant Conversational Speech Recognition: A Review of CHiME-7 and 8 DASR Challenges}},
-   journal = {Computer Speech \& Language},
-   volume  = {97},
-   pages   = {101901},
-   month   = dec,
-   year    = {2025},
-   doi     = {10.1016/j.csl.2025.101901},
-   url     = {https://www.merl.com/publications/TR2026-008},
- }
Hori, C., Masuyama, Y., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Robot Confirmation Generation and Action Planning Using Long-context Q-Former Integrated with Multimodal LLM", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU65441.2025.11434641, December 2025.
BibTeX TR2025-167 PDF
- @inproceedings{Hori2025dec,
-   author    = {Hori, Chiori and Masuyama, Yoshiki and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and {Le Roux}, Jonathan},
-   title     = {{Robot Confirmation Generation and Action Planning Using Long-context Q-Former Integrated with Multimodal LLM}},
-   booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
-   month     = dec,
-   year      = {2025},
-   doi       = {10.1109/ASRU65441.2025.11434641},
-   issn      = {2997-6995},
-   isbn      = {979-8-3315-4426-3},
-   url       = {https://www.merl.com/publications/TR2025-167},
- }
Masuyama, Y., "Neural Fields for Spatial Audio Modeling," Tech. Rep. TR2025-171, Speech and Audio in the Northeast (SANE), November 2025.
BibTeX TR2025-171 PDF
- @techreport{Masuyama2025nov,
-   author      = {Masuyama, Yoshiki},
-   title       = {{Neural Fields for Spatial Audio Modeling}},
-   institution = {Speech and Audio in the Northeast (SANE)},
-   number      = {TR2025-171},
-   month       = nov,
-   year        = {2025},
-   url         = {https://www.merl.com/publications/TR2025-171},
- }
Paissan, F., Wichern, G., Masuyama, Y., Aihara, R., Germain, F.G., Saijo, K., Le Roux, J., "FasTUSS: Faster Task-Aware Unified Source Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA66052.2025.11230943, October 2025.
BibTeX TR2025-143 PDF
- @inproceedings{Paissan2025oct,
-   author    = {Paissan, Francesco and Wichern, Gordon and Masuyama, Yoshiki and Aihara, Ryo and Germain, François G. and Saijo, Kohei and {Le Roux}, Jonathan},
-   title     = {{FasTUSS: Faster Task-Aware Unified Source Separation}},
-   booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
-   month     = oct,
-   year      = {2025},
-   doi       = {10.1109/WASPAA66052.2025.11230943},
-   url       = {https://www.merl.com/publications/TR2025-143},
- }
Hussein, A., Khurana, S., Wichern, G., Germain, F.G., Le Roux, J., "HASRD: Hierarchical Acoustic and Semantic Representation Disentanglement", Interspeech, DOI: 10.21437/Interspeech.2025-2063, August 2025, pp. 5393-5397.
BibTeX TR2025-122 PDF
- @inproceedings{Hussein2025aug,
-   author    = {Hussein, Amir and Khurana, Sameer and Wichern, Gordon and Germain, François G. and {Le Roux}, Jonathan},
-   title     = {{HASRD: Hierarchical Acoustic and Semantic Representation Disentanglement}},
-   booktitle = {Interspeech},
-   pages     = {5393--5397},
-   month     = aug,
-   year      = {2025},
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-2063},
-   url       = {https://www.merl.com/publications/TR2025-122},
- }
Khurana, S., Klement, D., Laurent, A., Bobos, D., Novosad, J., Gazdik, P., Zhang, E., Huang, Z., Hussein, A., Marxer, R., Masuyama, Y., Aihara, R., Hori, C., Germain, F.G., Wichern, G., Le Roux, J., "Factorized RVQ-GAN For Disentangled Speech Tokenization", Interspeech, DOI: 10.21437/Interspeech.2025-2612, August 2025, pp. 3514-3518.
BibTeX TR2025-123 PDF
- @inproceedings{Khurana2025aug,
-   author    = {Khurana, Sameer and Klement, Dominik and Laurent, Antoine and Bobos, Dominik and Novosad, Juraj and Gazdik, Peter and Zhang, Ellen and Huang, Zilli and Hussein, Amir and Marxer, Ricard and Masuyama, Yoshiki and Aihara, Ryo and Hori, Chiori and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{Factorized RVQ-GAN For Disentangled Speech Tokenization}},
-   booktitle = {Interspeech},
-   pages     = {3514--3518},
-   month     = aug,
-   year      = {2025},
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-2612},
-   url       = {https://www.merl.com/publications/TR2025-123},
- }
Yang, H., Wichern, G., Aihara, R., Masuyama, Y., Khurana, S., Germain, F.G., Le Roux, J., "Investigating Continuous Autoregressive Generative Speech Enhancement", Interspeech, DOI: 10.21437/Interspeech.2025-2335, August 2025, pp. 2360-2364.
BibTeX TR2025-119 PDF
- @inproceedings{Yang2025aug,
-   author    = {Yang, Haici and Wichern, Gordon and Aihara, Ryo and Masuyama, Yoshiki and Khurana, Sameer and Germain, François G. and {Le Roux}, Jonathan},
-   title     = {{Investigating Continuous Autoregressive Generative Speech Enhancement}},
-   booktitle = {Interspeech},
-   pages     = {2360--2364},
-   month     = aug,
-   year      = {2025},
-   publisher = {ISCA},
-   doi       = {10.21437/Interspeech.2025-2335},
-   url       = {https://www.merl.com/publications/TR2025-119},
- }
Aihara, R., Masuyama, Y., Germain, F.G., Wichern, G., Le Roux, J., "Exploring Disentangled Neural Speech Codecs from Self-Supervised Representations", arXiv, August 2025.
BibTeX arXiv
- @article{Aihara2025aug,
-   author        = {Aihara, Ryo and Masuyama, Yoshiki and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
-   title         = {{Exploring Disentangled Neural Speech Codecs from Self-Supervised Representations}},
-   journal       = {arXiv},
-   month         = aug,
-   year          = {2025},
-   eprint        = {2508.08399},
-   archiveprefix = {arXiv},
-   url           = {https://arxiv.org/abs/2508.08399},
- }
Steinmetz, C., Uhle, C., Everardo, F., Mitcheltree, C., McElveen, J.K., Jot, J.-M., Wichern, G., "Audio Signal Processing in the Artificial Intelligence Era: Challenges and Directions", Journal of the Audio Engineering Society, DOI: 10.17743/jaes.2022.0209, Vol. 73, No. 7/8, pp. 406-428, August 2025.
BibTeX TR2025-116 PDF
- @article{Steinmetz2025aug,
-   author  = {Steinmetz, Christian and Uhle, Christian and Everardo, Flavio and Mitcheltree, Christopher and McElveen, J. Keith and Jot, Jean-Marc and Wichern, Gordon},
-   title   = {{Audio Signal Processing in the Artificial Intelligence Era: Challenges and Directions}},
-   journal = {Journal of the Audio Engineering Society},
-   volume  = {73},
-   number  = {7/8},
-   pages   = {406--428},
-   month   = aug,
-   year    = {2025},
-   doi     = {10.17743/jaes.2022.0209},
-   url     = {https://www.merl.com/publications/TR2025-116},
- }
Masuyama, Y., "Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition," Tech. Rep. TR2025-097, Jelinek Summer Workshop on Speech and Language Technology (JSALT), June 2025.
BibTeX TR2025-097 PDF
- @techreport{Masuyama2025jun,
-   author      = {Masuyama, Yoshiki},
-   title       = {{Single- and Multi-Channel Speech Enhancement and Separation for Far-Field Conversation Recognition}},
-   institution = {Jelinek Summer Workshop on Speech and Language Technology (JSALT)},
-   number      = {TR2025-097},
-   month       = jun,
-   year        = {2025},
-   url         = {https://www.merl.com/publications/TR2025-097},
- }
Masuyama, Y., Chang, X., Zhang, W., Cornell, S., Wang, Z.-Q., Ono, N., Qian, Y., Watanabe, S., "An End-to-End Integration of Speech Separation and Recognition with Self-Supervised Learning Representation", Computer Speech & Language, DOI: 10.1016/j.csl.2025.101813, Vol. 95, pp. 101813, May 2025.
BibTeX TR2025-054 PDF
- @article{Masuyama2025may,
-   author  = {Masuyama, Yoshiki and Chang, Xuankai and Zhang, Wangyou and Cornell, Samuele and Wang, Zhong-Qiu and Ono, Nobutaka and Qian, Yanmin and Watanabe, Shinji},
-   title   = {{An End-to-End Integration of Speech Separation and Recognition with Self-Supervised Learning Representation}},
-   journal = {Computer Speech \& Language},
-   volume  = {95},
-   pages   = {101813},
-   month   = may,
-   year    = {2025},
-   doi     = {10.1016/j.csl.2025.101813},
-   issn    = {0885-2308},
-   url     = {https://www.merl.com/publications/TR2025-054},
- }
Araki, S., Ito, N., Haeb-Umbach, R., Wichern, G., Wang, Z.-Q., Mitsufuji, Y., "30+ Years of Source Separation Research: Achievements and Future Challenges", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49660.2025.10889006, April 2025, pp. 1-5.
BibTeX TR2025-036 PDF
- @inproceedings{Araki2025mar,
-   author    = {Araki, Shoko and Ito, Nobutaka and Haeb-Umbach, Reinhold and Wichern, Gordon and Wang, Zhong-Qiu and Mitsufuji, Yuki},
-   title     = {{30+ Years of Source Separation Research: Achievements and Future Challenges}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   pages     = {1--5},
-   month     = apr,
-   year      = {2025},
-   doi       = {10.1109/ICASSP49660.2025.10889006},
-   url       = {https://www.merl.com/publications/TR2025-036},
- }
Attiah, K., Wang, P., Mansour, H., Koike-Akino, T., Boufounos, P.T., "Enabling DMG Wi-Fi Sensing in Data Transmission Intervals by Exploiting Beam Training Codebook", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49660.2025.10889300, April 2025.
BibTeX TR2025-026 PDF
- @inproceedings{Attiah2025mar,
-   author    = {Attiah, Kareem and Wang, Pu and Mansour, Hassan and Koike-Akino, Toshiaki and Boufounos, Petros T.},
-   title     = {{Enabling DMG Wi-Fi Sensing in Data Transmission Intervals by Exploiting Beam Training Codebook}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = apr,
-   year      = {2025},
-   publisher = {IEEE},
-   doi       = {10.1109/ICASSP49660.2025.10889300},
-   issn      = {2379-190X},
-   isbn      = {979-8-3503-6874-1},
-   url       = {https://www.merl.com/publications/TR2025-026},
- }
Ebbers, J., Germain, F.G., Wilkinghoff, K., Wichern, G., Le Roux, J., "No Class Left Behind: A Closer Look at Class Balancing for Audio Tagging", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP49660.2025.10890706, April 2025.
BibTeX TR2025-037 PDF
- @inproceedings{Ebbers2025mar,
-   author    = {Ebbers, Janek and Germain, François G. and Wilkinghoff, Kevin and Wichern, Gordon and {Le Roux}, Jonathan},
-   title     = {{No Class Left Behind: A Closer Look at Class Balancing for Audio Tagging}},
-   booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
-   month     = apr,
-   year      = {2025},
-   doi       = {10.1109/ICASSP49660.2025.10890706},
-   url       = {https://www.merl.com/publications/TR2025-037},
- }