Publications

Baskar, M.K., Watanabe, S., Astudillo, R., Hori, T., Burget, L., Cernocky, J.H., "Semi-supervised Sequence-to-sequence ASR using Unpaired Speech and Text", Interspeech, DOI: 10.21437/Interspeech.2019-3167, September 2019, pp. 3790-3794.
BibTeX TR2019-100 PDF
- @inproceedings{Baskar2019sep,
- author = {Baskar, Murali Karthick and Watanabe, Shinji and Astudillo, Ramon and Hori, Takaaki and Burget, Lukas and Cernocky, Jan Honza},
- title = {{Semi-supervised Sequence-to-sequence ASR using Unpaired Speech and Text}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {3790--3794},
- month = sep,
- doi = {10.21437/Interspeech.2019-3167},
- issn = {1990-9772},
- url = {https://www.merl.com/publications/TR2019-100}
- }
Karafiat, M., Baskar, M.K., Watanabe, S., Hori, T., Wiesner, M., Cernocky, J.H., "Analysis of Multilingual Sequence-to-Sequence Speech Recognition Systems", Interspeech, DOI: 10.21437/Interspeech.2019-2355, September 2019.
BibTeX TR2019-103 PDF
- @inproceedings{Karafiat2019sep,
- author = {Karafiat, Martin and Baskar, Murali Karthick and Watanabe, Shinji and Hori, Takaaki and Wiesner, Matthew and Cernocky, Jan Honza},
- title = {{Analysis of Multilingual Sequence-to-Sequence Speech Recognition Systems}},
- booktitle = {Interspeech},
- year = 2019,
- month = sep,
- doi = {10.21437/Interspeech.2019-2355},
- url = {https://www.merl.com/publications/TR2019-103}
- }
Moritz, N., Hori, T., Le Roux, J., "Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2837, September 2019, pp. 76-80.
BibTeX TR2019-098 PDF
- @inproceedings{Moritz2019sep,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {76--80},
- month = sep,
- doi = {10.21437/Interspeech.2019-2837},
- url = {https://www.merl.com/publications/TR2019-098}
- }
Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "End-to-End Multilingual Multi-Speaker Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-3038, September 2019, pp. 3755-3759.
BibTeX TR2019-101 PDF
- @inproceedings{Seki2019sep,
- author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan and Hershey, John},
- title = {{End-to-End Multilingual Multi-Speaker Speech Recognition}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {3755--3759},
- month = sep,
- doi = {10.21437/Interspeech.2019-3038},
- url = {https://www.merl.com/publications/TR2019-101}
- }
Seki, H., Hori, T., Watanabe, S., Moritz, N., Le Roux, J., "Vectorized Beam Search for CTC-Attention-based Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2860, September 2019, pp. 3825-3829.
BibTeX TR2019-102 PDF
- @inproceedings{Seki2019sep2,
- author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Moritz, Niko and {Le Roux}, Jonathan},
- title = {{Vectorized Beam Search for CTC-Attention-based Speech Recognition}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {3825--3829},
- month = sep,
- doi = {10.21437/Interspeech.2019-2860},
- url = {https://www.merl.com/publications/TR2019-102}
- }
Baskar, M.K., Burget, L., Watanabe, S., Karafiat, M., Hori, T., Cernocky, J.H., "Promising Accurate Prefix Boosting for Sequence-to-Sequence ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682782, May 2019, pp. 5646-5650.
BibTeX TR2019-006 PDF
- @inproceedings{Baskar2019may,
- author = {Baskar, Murali Karthick and Burget, Lukas and Watanabe, Shinji and Karafiat, Martin and Hori, Takaaki and Cernocky, Jan Honza},
- title = {{Promising Accurate Prefix Boosting for Sequence-to-Sequence ASR}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- pages = {5646--5650},
- month = may,
- doi = {10.1109/ICASSP.2019.8682782},
- issn = {2379-190X},
- isbn = {978-1-4799-8131-1},
- url = {https://www.merl.com/publications/TR2019-006}
- }
Cho, J., Watanabe, S., Hori, T., Baskar, M.K., Inaguma, H., Villalba, J., Dehak, N., "Language Model Integration Based on Memory Control for Sequence to Sequence Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683380, May 2019.
BibTeX TR2019-007 PDF
- @inproceedings{Cho2019may,
- author = {Cho, Jaejin and Watanabe, Shinji and Hori, Takaaki and Baskar, Murali Karthick and Inaguma, Hirofumi and Villalba, Jesus and Dehak, Najim},
- title = {{Language Model Integration Based on Memory Control for Sequence to Sequence Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683380},
- url = {https://www.merl.com/publications/TR2019-007}
- }
Hori, T., Astudillo, R., Hayashi, T., Zhang, Y., Watanabe, S., Le Roux, J., "Cycle-Consistency Training for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683307, May 2019.
BibTeX TR2019-002 PDF
- @inproceedings{Hori2019may,
- author = {Hori, Takaaki and Astudillo, Ramon and Hayashi, Tomoki and Zhang, Yu and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Cycle-Consistency Training for End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683307},
- url = {https://www.merl.com/publications/TR2019-002}
- }
Moritz, N., Hori, T., Le Roux, J., "Triggered Attention for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683510, May 2019.
BibTeX TR2019-015 PDF
- @inproceedings{Moritz2019may,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Triggered Attention for End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683510},
- url = {https://www.merl.com/publications/TR2019-015}
- }
Wang, X., Li, R., Mallidi, S.H., Hori, T., Watanabe, S., Hermansky, H., "Stream Attention-Based Multi-Array End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682650, May 2019.
BibTeX TR2019-005 PDF
- @inproceedings{Wang2019may,
- author = {Wang, Xiaofei and Li, Ruizhi and Mallidi, Sri Harish and Hori, Takaaki and Watanabe, Shinji and Hermansky, Hynek},
- title = {{Stream Attention-Based Multi-Array End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8682650},
- url = {https://www.merl.com/publications/TR2019-005}
- }
Cho, J., Baskar, M.K., Li, R., Wiesner, M., Mallidi, S.H., Yalta, N., Karafiat, M., Watanabe, S., Hori, T., "Multilingual Sequence-to-Sequence Speech Recognition: Architecture, Transfer Learning, and Language Modeling", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639655, December 2018.
BibTeX TR2018-175 PDF
- @inproceedings{Cho2018dec,
- author = {Cho, Jaejin and Baskar, Murali Karthick and Li, Ruizhi and Wiesner, Matthew and Mallidi, Sri Harish and Yalta, Nelson and Karafiat, Martin and Watanabe, Shinji and Hori, Takaaki},
- title = {{Multilingual Sequence-to-Sequence Speech Recognition: Architecture, Transfer Learning, and Language Modeling}},
- booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
- year = 2018,
- month = dec,
- doi = {10.1109/SLT.2018.8639655},
- url = {https://www.merl.com/publications/TR2018-175}
- }
Hayashi, T., Watanabe, S., Zhang, Y., Toda, T., Hori, T., Astudillo, R., Takeda, K., "Back-Translation-Style Data Augmentation for End-to-End ASR", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639619, December 2018.
BibTeX TR2018-174 PDF
- @inproceedings{Hayashi2018dec,
- author = {Hayashi, Tomoki and Watanabe, Shinji and Zhang, Yu and Toda, Tomoki and Hori, Takaaki and Astudillo, Ramon and Takeda, Kazuya},
- title = {{Back-Translation-Style Data Augmentation for End-to-End ASR}},
- booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
- year = 2018,
- month = dec,
- doi = {10.1109/SLT.2018.8639619},
- url = {https://www.merl.com/publications/TR2018-174}
- }
Hori, T., Cho, J., Watanabe, S., "End-to-End Speech Recognition with Word-Based RNN Language Models", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639693, December 2018.
BibTeX TR2018-176 PDF
- @inproceedings{Hori2018dec,
- author = {Hori, Takaaki and Cho, Jaejin and Watanabe, Shinji},
- title = {{End-to-End Speech Recognition with Word-Based RNN Language Models}},
- booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
- year = 2018,
- month = dec,
- doi = {10.1109/SLT.2018.8639693},
- url = {https://www.merl.com/publications/TR2018-176}
- }
Watanabe, S., Hori, T., Karita, S., Hayashi, T., Nishitoba, J., Unno, Y., Yalta Soplin, N.E., Heymann, J., Wiesner, M., Chen, N., Renduchintala, A., Ochiai, T., "ESPnet: End-to-End Speech Processing Toolkit", Interspeech, September 2018.
BibTeX TR2018-136 PDF
- @inproceedings{Watanabe2018sep,
- author = {Watanabe, Shinji and Hori, Takaaki and Karita, Shigeki and Hayashi, Tomoki and Nishitoba, Jiro and Unno, Yuya and {Yalta Soplin}, Nelson Enrique and Heymann, Jahn and Wiesner, Matthew and Chen, Nanxin and Renduchintala, Adithya and Ochiai, Tsubasa},
- title = {{ESPnet: End-to-End Speech Processing Toolkit}},
- booktitle = {Interspeech},
- year = 2018,
- month = sep,
- url = {https://www.merl.com/publications/TR2018-136}
- }
Ochiai, T., Watanabe, S., Katagiri, S., Hori, T., Hershey, J.R., "Speaker Adaptation for Multichannel End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462161, April 2018, pp. 6707-6711.
BibTeX TR2018-006 PDF
- @inproceedings{Ochiai2018apr,
- author = {Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru and Hori, Takaaki and Hershey, John R.},
- title = {{Speaker Adaptation for Multichannel End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {6707--6711},
- month = apr,
- doi = {10.1109/ICASSP.2018.8462161},
- url = {https://www.merl.com/publications/TR2018-006}
- }
Seki, H., Watanabe, S., Hori, T., Le Roux, J., Hershey, J.R., "An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462180, April 2018, pp. 4919-4923.
BibTeX TR2018-002 PDF Video
- @inproceedings{Seki2018apr,
- author = {Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and {Le Roux}, Jonathan and Hershey, John R.},
- title = {{An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {4919--4923},
- month = apr,
- doi = {10.1109/ICASSP.2018.8462180},
- url = {https://www.merl.com/publications/TR2018-002}
- }
Settle, S., Le Roux, J., Hori, T., Watanabe, S., Hershey, J.R., "End-to-End Multi-Speaker Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461893, April 2018, pp. 4819-4823.
BibTeX TR2018-001 PDF Video
- @inproceedings{Settle2018apr,
- author = {Settle, Shane and {Le Roux}, Jonathan and Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
- title = {{End-to-End Multi-Speaker Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {4819--4823},
- month = apr,
- doi = {10.1109/ICASSP.2018.8461893},
- url = {https://www.merl.com/publications/TR2018-001}
- }
Hori, T., Watanabe, S., Hershey, J.R., "Multi-level Language Modeling and Decoding for Open Vocabulary End-to-End Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU.2017.8268948, December 2017.
BibTeX TR2017-181 PDF
- @inproceedings{Hori2017dec,
- author = {Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
- title = {{Multi-level Language Modeling and Decoding for Open Vocabulary End-to-End Speech Recognition}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2017,
- month = dec,
- doi = {10.1109/ASRU.2017.8268948},
- url = {https://www.merl.com/publications/TR2017-181}
- }
Watanabe, S., Hori, T., Hershey, J.R., "Language Independent End-to-End Architecture For Joint Language and Speech Recognition", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU.2017.8268945, December 2017.
BibTeX TR2017-182 PDF Video
- @inproceedings{Watanabe2017dec,
- author = {Watanabe, Shinji and Hori, Takaaki and Hershey, John R.},
- title = {{Language Independent End-to-End Architecture For Joint Language and Speech Recognition}},
- booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
- year = 2017,
- month = dec,
- doi = {10.1109/ASRU.2017.8268945},
- url = {https://www.merl.com/publications/TR2017-182}
- }
Ochiai, T., Watanabe, S., Katagiri, S., "Does speech enhancement work with end-to-end ASR objectives?: Experimental analysis of multichannel end-to-end ASR", IEEE International Workshop on Machine Learning for Signal Processing (MLSP), October 2017.
BibTeX TR2017-139 PDF
- @inproceedings{Ochiai2017oct,
- author = {Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru},
- title = {{Does speech enhancement work with end-to-end ASR objectives?: Experimental analysis of multichannel end-to-end ASR}},
- booktitle = {IEEE International Workshop on Machine Learning for Signal Processing (MLSP)},
- year = 2017,
- month = oct,
- url = {https://www.merl.com/publications/TR2017-139}
- }
Ochiai, T., Watanabe, S., Hori, T., Hershey, J.R., Xiao, X., "Unified Architecture for Multichannel End-to-End Speech Recognition with Neural Beamforming", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2017.2764276, Vol. 11, No. 8, pp. 1274-1288, October 2017.
BibTeX TR2017-192 PDF
- @article{Ochiai2017oct2,
- author = {Ochiai, Tsubasa and Watanabe, Shinji and Hori, Takaaki and Hershey, John R. and Xiao, Xiong},
- title = {{Unified Architecture for Multichannel End-to-End Speech Recognition with Neural Beamforming}},
- journal = {IEEE Journal of Selected Topics in Signal Processing},
- year = 2017,
- volume = 11,
- number = 8,
- pages = {1274--1288},
- month = oct,
- doi = {10.1109/JSTSP.2017.2764276},
- issn = {1941-0484},
- url = {https://www.merl.com/publications/TR2017-192}
- }
Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T., "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2017.2763455, Vol. 11, No. 8, pp. 1240-1253, October 2017.
BibTeX TR2017-190 PDF Video
- @article{Watanabe2017oct,
- author = {Watanabe, Shinji and Hori, Takaaki and Kim, Suyoun and Hershey, John R. and Hayashi, Tomoki},
- title = {{Hybrid CTC/Attention Architecture for End-to-End Speech Recognition}},
- journal = {IEEE Journal of Selected Topics in Signal Processing},
- year = 2017,
- volume = 11,
- number = 8,
- pages = {1240--1253},
- month = oct,
- doi = {10.1109/JSTSP.2017.2763455},
- issn = {1941-0484},
- url = {https://www.merl.com/publications/TR2017-190}
- }
Hori, T., Watanabe, S., Zhang, Y., Chan, W., "Advances in Joint CTC-Attention based End-to-End Speech Recognition with a Deep CNN Encoder and RNN-LM", Interspeech, August 2017.
BibTeX TR2017-132 PDF Video
- @inproceedings{Hori2017aug,
- author = {Hori, Takaaki and Watanabe, Shinji and Zhang, Yu and Chan, William},
- title = {{Advances in Joint CTC-Attention based End-to-End Speech Recognition with a Deep CNN Encoder and RNN-LM}},
- booktitle = {Interspeech},
- year = 2017,
- month = aug,
- url = {https://www.merl.com/publications/TR2017-132}
- }
Tachioka, Y., Narita, T., Miura, I., Uramoto, T., Monta, N., Uenohara, S., Furuya, K., Watanabe, S., Le Roux, J., "Coupled initialization of multi-channel non-negative matrix factorization based on spatial and spectral information", Interspeech, August 2017.
BibTeX TR2017-134 PDF
- @inproceedings{Tachioka2017aug,
- author = {Tachioka, Yuuki and Narita, Tomohiro and Miura, Iori and Uramoto, Takanobu and Monta, Natsuki and Uenohara, Shingo and Furuya, Kenichi and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Coupled initialization of multi-channel non-negative matrix factorization based on spatial and spectral information}},
- booktitle = {Interspeech},
- year = 2017,
- month = aug,
- url = {https://www.merl.com/publications/TR2017-134}
- }
Hori, T., Watanabe, S., Hershey, J.R., "Joint CTC/attention decoding for end-to-end speech recognition", Association for Computational Linguistics (ACL), DOI: 10.18653/v1/P17-1048, July 2017, pp. 518-529.
BibTeX TR2017-103 PDF Video
- @inproceedings{Hori2017jul,
- author = {Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
- title = {{Joint CTC/attention decoding for end-to-end speech recognition}},
- booktitle = {Association for Computational Linguistics (ACL)},
- year = 2017,
- pages = {518--529},
- month = jul,
- doi = {10.18653/v1/P17-1048},
- url = {https://www.merl.com/publications/TR2017-103}
- }