- Chatterjee, M., Ahuja, N., Cherian, A., "Quantifying Predictive Uncertainty for Stochastic Video Synthesis from Audio", IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), June 2022.
BibTeX TR2022-082 PDF- @inproceedings{Chatterjee2022jun,
- author = {Chatterjee, Moitreya and Ahuja, Narendra and Cherian, Anoop},
- title = {Quantifying Predictive Uncertainty for Stochastic Video Synthesis from Audio},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)},
- year = 2022,
- month = jun,
- url = {https://www.merl.com/publications/TR2022-082}
- }
- Chang, X., Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747375, April 2022, pp. 7322-7326.
BibTeX TR2022-021 PDF- @inproceedings{Chang2022apr,
- author = {Chang, Xuankai and Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan},
- title = {Extended Graph Temporal Classification for Multi-Speaker End-to-End ASR},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7322--7326},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP43922.2022.9747375},
- url = {https://www.merl.com/publications/TR2022-021}
- }
- Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746275, April 2022, pp. 7672-7676.
BibTeX TR2022-026 PDF- @inproceedings{Higuchi2022apr,
- author = {Higuchi, Yosuke and Moritz, Niko and Le Roux, Jonathan and Hori, Takaaki},
- title = {Advancing Momentum Pseudo-Labeling with Conformer and Initialization Strategy},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7672--7676},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP43922.2022.9746275},
- url = {https://www.merl.com/publications/TR2022-026}
- }
- Moritz, N., Hori, T., Watanabe, S., Le Roux, J., "Sequence Transduction with Graph-based Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747788, April 2022, pp. 7212-7216.
BibTeX TR2022-024 PDF- @inproceedings{Moritz2022apr,
- author = {Moritz, Niko and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan},
- title = {Sequence Transduction with Graph-based Supervision},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7212--7216},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSP43922.2022.9747788},
- url = {https://www.merl.com/publications/TR2022-024}
- }
- Petermann, D., Wichern, G., Wang, Z.-Q., Le Roux, J., "The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9746005, April 2022, pp. 526-530.
BibTeX TR2022-022 PDF Video Software- @inproceedings{Petermann2022apr,
- author = {Petermann, Darius and Wichern, Gordon and Wang, Zhong-Qiu and Le Roux, Jonathan},
- title = {The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {526--530},
- month = apr,
- doi = {10.1109/ICASSP43922.2022.9746005},
- url = {https://www.merl.com/publications/TR2022-022}
- }
- Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
BibTeX TR2022-019 PDF- @inproceedings{Shah2022apr,
- author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and Le Roux, Jonathan and Hori, Chiori},
- title = {Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {7732--7736},
- month = apr,
- publisher = {IEEE},
- issn = {1520-6149},
- isbn = {978-1-6654-0540-9},
- url = {https://www.merl.com/publications/TR2022-019}
- }
- Slizovskaia, O., Wichern, G., Wang, Z.-Q., Le Roux, J., "Locate This, Not That: Class-Conditioned Sound Event DOA Estimation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP43922.2022.9747604, April 2022, pp. 711-715.
BibTeX TR2022-023 PDF- @inproceedings{Slizovskaia2022mar,
- author = {Slizovskaia, Olga and Wichern, Gordon and Wang, Zhong-Qiu and Le Roux, Jonathan},
- title = {Locate This, Not That: Class-Conditioned Sound Event DOA Estimation},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2022,
- pages = {711--715},
- month = apr,
- doi = {10.1109/ICASSP43922.2022.9747604},
- url = {https://www.merl.com/publications/TR2022-023}
- }
- Hori, C., Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Le Roux, J., Marks, T.K., "Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10", The 10th Dialog System Technology Challenge Workshop at AAAI, February 2022.
BibTeX TR2022-016 PDF- @inproceedings{Hori2022feb,
- author = {Hori, Chiori and Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Le Roux, Jonathan and Marks, Tim K.},
- title = {Overview of Audio Visual Scene-Aware Dialog with Reasoning Track for Natural Language Generation in DSTC10},
- booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI},
- year = 2022,
- month = feb,
- url = {https://www.merl.com/publications/TR2022-016}
- }
- Shah, A.P., Hori, T., Le Roux, J., Hori, C., "DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning", The 10th Dialog System Technology Challenge Workshop at AAAI 2022, February 2022.
BibTeX TR2022-025 PDF- @inproceedings{Shah2022feb,
- author = {Shah, Ankit Parag and Hori, Takaaki and Le Roux, Jonathan and Hori, Chiori},
- title = {DSTC10-AVSD Submission System with Reasoning using Audio-Visual Transformers with Joint Student-Teacher Learning},
- booktitle = {The 10th Dialog System Technology Challenge Workshop at AAAI 2022},
- year = 2022,
- month = feb,
- url = {https://www.merl.com/publications/TR2022-025}
- }
- Cherian, A., Hori, C., Marks, T.K., Le Roux, J., "(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering", AAAI Conference on Artificial Intelligence, DOI: 10.1609/aaai.v36i1.19922, February 2022, pp. 444-453.
BibTeX TR2022-014 PDF Video Presentation- @inproceedings{Cherian2022feb,
- author = {Cherian, Anoop and Hori, Chiori and Marks, Tim K. and Le Roux, Jonathan},
- title = {(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering},
- booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
- year = 2022,
- pages = {444--453},
- month = feb,
- doi = {10.1609/aaai.v36i1.19922},
- url = {https://www.merl.com/publications/TR2022-014}
- }
- Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2021.3129363, Vol. 29, pp. 3476-3490, December 2021.
BibTeX TR2021-144 PDF- @article{Wang2021dec,
- author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
- title = {Convolutive Prediction for Monaural Speech Dereverberation and Noisy-Reverberant Speaker Separation},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2021,
- volume = 29,
- pages = {3476--3490},
- month = dec,
- doi = {10.1109/TASLP.2021.3129363},
- url = {https://www.merl.com/publications/TR2021-144}
- }
- Wang, Z.-Q., Wichern, G., Le Roux, J., "On The Compensation Between Magnitude and Phase in Speech Separation", IEEE Signal Processing Letters, DOI: 10.1109/LSP.2021.3116502, Vol. 28, pp. 2018-2022, November 2021.
BibTeX TR2021-137 PDF- @article{Wang2021nov2,
- author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
- title = {On The Compensation Between Magnitude and Phase in Speech Separation},
- journal = {IEEE Signal Processing Letters},
- year = 2021,
- volume = 28,
- pages = {2018--2022},
- month = nov,
- doi = {10.1109/LSP.2021.3116502},
- url = {https://www.merl.com/publications/TR2021-137}
- }
- Wang, Z.-Q., Wichern, G., Le Roux, J., "Convolutive Prediction for Reverberant Speech Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA52581.2021.9632667, October 2021, pp. 56-60.
BibTeX TR2021-127 PDF- @inproceedings{Wang2021oct4,
- author = {Wang, Zhong-Qiu and Wichern, Gordon and Le Roux, Jonathan},
- title = {Convolutive Prediction for Reverberant Speech Separation},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- year = 2021,
- pages = {56--60},
- month = oct,
- publisher = {IEEE},
- doi = {10.1109/WASPAA52581.2021.9632667},
- url = {https://www.merl.com/publications/TR2021-127}
- }
- Wichern, G., Chakrabarty, A., Wang, Z.-Q., Le Roux, J., "Anomalous Sound Detection Using Attentive Neural Processes", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA52581.2021.9632762, October 2021, pp. 186-190.
BibTeX TR2021-129 PDF- @inproceedings{Wichern2021oct,
- author = {Wichern, Gordon and Chakrabarty, Ankush and Wang, Zhong-Qiu and Le Roux, Jonathan},
- title = {Anomalous Sound Detection Using Attentive Neural Processes},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- year = 2021,
- pages = {186--190},
- month = oct,
- publisher = {IEEE},
- doi = {10.1109/WASPAA52581.2021.9632762},
- url = {https://www.merl.com/publications/TR2021-129}
- }
- Chatterjee, M., Le Roux, J., Ahuja, N., Cherian, A., "Visual Scene Graphs for Audio Source Separation", IEEE International Conference on Computer Vision (ICCV), October 2021, pp. 1204-1213.
BibTeX TR2021-095 PDF Video Software- @inproceedings{Chatterjee2021oct,
- author = {Chatterjee, Moitreya and Le Roux, Jonathan and Ahuja, Narendra and Cherian, Anoop},
- title = {Visual Scene Graphs for Audio Source Separation},
- booktitle = {IEEE International Conference on Computer Vision (ICCV)},
- year = 2021,
- pages = {1204--1213},
- month = oct,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2021-095}
- }
- Higuchi, Y., Moritz, N., Le Roux, J., Hori, T., "Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2021-571, September 2021, pp. 726-730.
BibTeX TR2021-103 PDF- @inproceedings{Higuchi2021sep,
- author = {Higuchi, Yosuke and Moritz, Niko and Le Roux, Jonathan and Hori, Takaaki},
- title = {Momentum Pseudo-Labeling for Semi-Supervised Speech Recognition},
- booktitle = {Interspeech},
- year = 2021,
- pages = {726--730},
- month = sep,
- doi = {10.21437/Interspeech.2021-571},
- url = {https://www.merl.com/publications/TR2021-103}
- }
- Hori, T., Moritz, N., Hori, C., Le Roux, J., "Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers", Interspeech, DOI: 10.21437/Interspeech.2021-1643, August 2021, pp. 2097-2101.
BibTeX TR2021-100 PDF- @inproceedings{Hori2021aug3,
- author = {Hori, Takaaki and Moritz, Niko and Hori, Chiori and Le Roux, Jonathan},
- title = {Advanced Long-context End-to-end Speech Recognition Using Context-expanded Transformers},
- booktitle = {Interspeech},
- year = 2021,
- pages = {2097--2101},
- month = aug,
- doi = {10.21437/Interspeech.2021-1643},
- url = {https://www.merl.com/publications/TR2021-100}
- }
- Hori, C., Hori, T., Le Roux, J., "Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers", Interspeech, DOI: 10.21437/Interspeech.2021-1975, August 2021, pp. 586-590.
BibTeX TR2021-093 PDF- @inproceedings{Hori2021aug2,
- author = {Hori, Chiori and Hori, Takaaki and Le Roux, Jonathan},
- title = {Optimizing Latency for Online Video Captioning Using Audio-Visual Transformers},
- booktitle = {Interspeech},
- year = 2021,
- pages = {586--590},
- month = aug,
- publisher = {ISCA},
- doi = {10.21437/Interspeech.2021-1975},
- url = {https://www.merl.com/publications/TR2021-093}
- }
- Moritz, N., Hori, T., Le Roux, J., "Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2021-1693, August 2021, pp. 1822-1826.
BibTeX TR2021-094 PDF- @inproceedings{Moritz2021aug,
- author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
- title = {Dual Causal/Non-Causal Self-Attention for Streaming End-to-End Speech Recognition},
- booktitle = {Interspeech},
- year = 2021,
- pages = {1822--1826},
- month = aug,
- doi = {10.21437/Interspeech.2021-1693},
- url = {https://www.merl.com/publications/TR2021-094}
- }
- Hori, C., "Human Perspective Scene Understanding via Multimodal Sensing", Tech. Rep. TR2022-151, Audio-Visual Scene Understanding Tutorial at CVPR 2021, June 2021.
BibTeX TR2022-151 PDF Video- @techreport{Hori2021jun,
- author = {Hori, Chiori},
- title = {Human Perspective Scene Understanding via Multimodal Sensing},
- institution = {Mitsubishi Electric Research Laboratories},
- number = {TR2022-151},
- year = 2021,
- month = jun,
- url = {https://www.merl.com/publications/TR2022-151}
- }
- Moritz, N., Hori, T., Le Roux, J., "Capturing Multi-Resolution Context by Dilated Self-Attention", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9415001, June 2021, pp. 5869-5873.
BibTeX TR2021-036 PDF- @inproceedings{Moritz2021jun,
- author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
- title = {Capturing Multi-Resolution Context by Dilated Self-Attention},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {5869--5873},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9415001},
- url = {https://www.merl.com/publications/TR2021-036}
- }
- Hung, Y.-N., Wichern, G., Le Roux, J., "Transcription Is All You Need: Learning to Separate Musical Mixtures with Score as Supervision", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9413358, June 2021, pp. 46-50.
BibTeX TR2021-069 PDF- @inproceedings{Hung2021jun,
- author = {Hung, Yun-Ning and Wichern, Gordon and Le Roux, Jonathan},
- title = {Transcription Is All You Need: Learning to Separate Musical Mixtures with Score as Supervision},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {46--50},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9413358},
- issn = {2379-190X},
- isbn = {978-1-7281-7605-5},
- url = {https://www.merl.com/publications/TR2021-069}
- }
- Khurana, S., Moritz, N., Hori, T., Le Roux, J., "Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9414299, June 2021, pp. 6553-6557.
BibTeX TR2021-039 PDF- @inproceedings{Khurana2021jun,
- author = {Khurana, Sameer and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
- title = {Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {6553--6557},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9414299},
- url = {https://www.merl.com/publications/TR2021-039}
- }
- Moritz, N., Hori, T., Le Roux, J., "Semi-Supervised Speech Recognition via Graph-Based Temporal Classification", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9414058, June 2021, pp. 6548-6552.
BibTeX TR2021-037 PDF- @inproceedings{Moritz2021jun2,
- author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
- title = {Semi-Supervised Speech Recognition via Graph-Based Temporal Classification},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2021,
- pages = {6548--6552},
- month = jun,
- doi = {10.1109/ICASSP39728.2021.9414058},
- url = {https://www.merl.com/publications/TR2021-037}
- }
- Watanabe, S., Boyer, F., Chang, X., Guo, P., Hayashi, T., Higuchi, Y., Hori, T., Huang, W.-C., Inaguma, H., Kamo, N., Karita, S., Li, C., Shi, J., Subramanian, A.S., Zhang, W., "The 2020 ESPNET Update: New Features, Broadened Applications, Performance Improvements, and Future Plans", IEEE Data Science and Learning Workshop (DSLW), DOI: 10.1109/DSLW51110, June 2021, pp. 1-6.
BibTeX TR2021-073 PDF- @inproceedings{Watanabe2021jun,
- author = {Watanabe, Shinji and Boyer, Florian and Chang, Xuankai and Guo, Pengcheng and Hayashi, Tomoki and Higuchi, Yosuke and Hori, Takaaki and Huang, Wen-Chin and Inaguma, Hirofumi and Kamo, Naoyuki and Karita, Shigeki and Li, Chenda and Shi, Jing and Subramanian, Aswin S. and Zhang, Wangyou},
- title = {The 2020 ESPNET Update: New Features, Broadened Applications, Performance Improvements, and Future Plans},
- booktitle = {IEEE Data Science and Learning Workshop (DSLW)},
- year = 2021,
- pages = {1--6},
- month = jun,
- publisher = {IEEE},
- doi = {10.1109/DSLW51110},
- isbn = {978-1-6654-2826-2},
- url = {https://www.merl.com/publications/TR2021-073}
- }