Sameer Khurana

- Phone: 617-621-7598
- Email:
-
Position:
Research / Technical Staff
Visiting Research Scientist -
Education:
Ph.D., Massachusetts Institute of Technology, 2023 -
Research Areas:
External Links:
Sameer's Quick Links
-
Biography
Sameer's research interests include multimodal, transfer, and self-supervised learning applied to the speech and audio domains. He conducted his Ph.D. research in the Spoken Language Systems Lab at the MIT Computer Science and Artificial Intelligence Laboratory (CSAIL), where he developed transfer learning methods for spoken language processing applications.
-
Research Highlights
-
Internships with Sameer
-
SA2072: Multimodal Representation Learning
MERL is offering internship positions for PhD candidates interested in audio-visual-language multimodal learning. The role involves understanding the complex interplay between sound, visuals, and language, aiming to drive next-generation AI applications. Interns will work closely with a group of researchers at MERL to develop and implement models, with an emphasis on integrating different sensory modalities. Internships regularly lead to one or more publications in top-tier venues, which can later become part of the intern's doctoral work. Ideal candidates are senior Ph.D. students in fields such as Audio Machine Learning, Computer Vision, or Natural Language Processing. Experience in multimodal learning is preferable. Good programming skills in Python and knowledge of deep learning frameworks such as PyTorch are essential. Multiple positions are available with flexible start dates (not just Spring/Summer but throughout 2024) and durations (typically 3-6 months).
-
-
MERL Publications
- "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", arXiv, October 2023.BibTeX arXiv
@article{Pan2023oct,
  author        = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, Fran{\c{c}}ois G. and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
  title         = {Scenario-Aware Audio-Visual {TF-GridNet} for Target Speech Extraction},
  journal       = {arXiv},
  year          = 2023,
  month         = oct,
  eprint        = {2310.19644},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2310.19644},
}
, - "Generation or Replication: Auscultating Audio Latent Diffusion Models", arXiv, October 2023.BibTeX arXiv
@article{Bralios2023oct,
  author        = {Bralios, Dimitrios and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
  title         = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
  journal       = {arXiv},
  year          = 2023,
  month         = oct,
  eprint        = {2310.10604},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2310.10604},
}
, - "Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP39728.2021.9414299, June 2021, pp. 6553-6557.BibTeX TR2021-039 PDF
@inproceedings{Khurana2021jun,
  author    = {Khurana, Sameer and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
  title     = {Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training},
  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = 2021,
  month     = jun,
  pages     = {6553--6557},
  doi       = {10.1109/ICASSP39728.2021.9414299},
  url       = {https://www.merl.com/publications/TR2021-039},
}
,
- "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", arXiv, October 2023.
-
Other Publications
- "Whisper-AT: Noise-Robust Automatic Speech Recognizers are Also Strong General Audio Event Taggers", Interspeech 2023, 2023.BibTeX
@inproceedings{gong2023whisper,
  author    = {Gong, Yuan and Khurana, Sameer and Karlinsky, Leonid and Glass, James},
  title     = {{Whisper-AT}: Noise-Robust Automatic Speech Recognizers are Also Strong General Audio Event Taggers},
  booktitle = {Interspeech 2023},
  year      = 2023,
}
, - "Improved Cross-Lingual Transfer Learning For Automatic Speech Translation", Preprint 2023, 2023.BibTeX
@unpublished{khurana2023improved,
  author = {Khurana, Sameer and Dawalatabad, Nauman and Laurent, Antoine and Vicente, Luis and Gimeno, Pablo and Mingote, Victoria and Glass, James},
  title  = {Improved Cross-Lingual Transfer Learning For Automatic Speech Translation},
  note   = {Preprint},
  year   = 2023,
}
, - "Transfer Learning For Spoken Language Processing", 2023, Massachusetts Institute of Technology.BibTeX
@phdthesis{khurana2023transfer,
  author = {Khurana, Sameer},
  title  = {Transfer Learning For Spoken Language Processing},
  school = {Massachusetts Institute of Technology},
  year   = 2023,
}
, - "ON-TRAC consortium systems for the IWSLT 2023 dialectal and low-resource speech translation tasks", IWSLT 2023, 2023.BibTeX
@inproceedings{laurent2023trac,
  author    = {Laurent, Antoine and Gahbiche, Souhir and Nguyen, Ha and Elleuch, Haroun and Bougares, Fethi and Thiol, Antoine and Riguidel, Hugo and Mdhaffar, Salima and Laperri{\`e}re, Ga{\"e}lle and Maison, Lucas and others},
  title     = {{ON-TRAC} consortium systems for the {IWSLT} 2023 dialectal and low-resource speech translation tasks},
  booktitle = {IWSLT 2023},
  year      = 2023,
}
, - "Direct Text to Speech Translation System Using Acoustic Units", IEEE Signal Processing Letters 2023, 2023.BibTeX
@article{mingote2023direct,
  author    = {Mingote, Victoria and Gimeno, Pablo and Vicente, Luis and Khurana, Sameer and Laurent, Antoine and Duret, Jarod},
  title     = {Direct Text to Speech Translation System Using Acoustic Units},
  journal   = {IEEE Signal Processing Letters},
  year      = 2023,
  publisher = {IEEE},
}
, - "Comparison of Multilingual Self-Supervised and Weakly-Supervised Speech Pre-Training for Adaptation to Unseen Languages", Interspeech 2023, 2023.BibTeX
@inproceedings{rouditchenko2023comparison,
  author    = {Rouditchenko, Andrew and Khurana, Sameer and Thomas, Samuel and Feris, Rogerio and Karlinsky, Leonid and Kuehne, Hilde and Harwath, David and Kingsbury, Brian and Glass, James},
  title     = {Comparison of Multilingual Self-Supervised and Weakly-Supervised Speech Pre-Training for Adaptation to Unseen Languages},
  booktitle = {Interspeech 2023},
  year      = 2023,
}
, - "Detecting Dementia from Long Neuropsychological Interviews", EMNLP 2022, 2022.BibTeX
@inproceedings{dawalatabad2022detecting,
  author    = {Dawalatabad, Nauman and Gong, Yuan and Khurana, Sameer and Au, Rhoda and Glass, James},
  title     = {Detecting Dementia from Long Neuropsychological Interviews},
  booktitle = {EMNLP 2022},
  year      = 2022,
}
, - "On Unsupervised Uncertainty-Driven Speech Pseudo-Label Filtering and Model Calibration", ICASSP 2023, 2022.BibTeX
@inproceedings{dawalatabad2022unsupervised,
  author    = {Dawalatabad, Nauman and Khurana, Sameer and Laurent, Antoine and Glass, James},
  title     = {On Unsupervised Uncertainty-Driven Speech Pseudo-Label Filtering and Model Calibration},
  booktitle = {ICASSP 2023},
  year      = 2022,
}
, - "CMKD: CNN/Transformer-Based Cross-Model Knowledge Distillation for Audio Classification", Preprint 2022, 2022.BibTeX
@unpublished{gong2022cmkd,
  author = {Gong, Yuan and Khurana, Sameer and Rouditchenko, Andrew and Glass, James},
  title  = {{CMKD}: {CNN/Transformer}-Based Cross-Model Knowledge Distillation for Audio Classification},
  note   = {Preprint},
  year   = 2022,
}
, - "Magic dust for cross-lingual adaptation of monolingual wav2vec-2.0", ICASSP 2022, 2022.BibTeX
@inproceedings{khurana2022magic,
  author       = {Khurana, Sameer and Laurent, Antoine and Glass, James},
  title        = {Magic dust for cross-lingual adaptation of monolingual wav2vec-2.0},
  booktitle    = {ICASSP 2022},
  year         = 2022,
  organization = {IEEE},
}
, - "SAMU-XLSR: Semantically-Aligned Multimodal Utterance-level Cross-Lingual Speech Representation", IEEE Journal of Selected Topics in Signal Processing 2022, 2022.BibTeX
@article{khurana2022samu,
  author  = {Khurana, Sameer and Laurent, Antoine and Glass, James},
  title   = {{SAMU-XLSR}: Semantically-Aligned Multimodal Utterance-level Cross-Lingual Speech Representation},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  year    = 2022,
}
, - "Multi-lingual Speech to Speech Translation for Under-Resourced Languages", 2022.BibTeX
@inproceedings{larcher2022multi,
  author    = {Larcher, Anthony and Est{\`e}ve, Yannick and Rouvier, Mickael and Tomashenko, Natalia and Duret, Jarod and Laperri{\`e}re, Ga{\"e}lle and Kesijaru, Santosh and Sarvas, Marek and Kohlova, Renata and Li, Henry and others},
  title     = {Multi-lingual Speech to Speech Translation for Under-Resourced Languages},
  booktitle = {Jelinek Summer Workshop on Speech and Language Technology 2022},
  year      = 2022,
}
, - "Unsupervised domain adaptation for speech recognition via uncertainty driven self-training", ICASSP 2021, 2021.BibTeX
@inproceedings{khurana2021unsupervised,
  author       = {Khurana, Sameer and Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
  title        = {Unsupervised Domain Adaptation For Speech Recognition via Uncertainty Driven Self-Training},
  booktitle    = {ICASSP 2021},
  year         = 2021,
  pages        = {6553--6557},
  organization = {IEEE},
  doi          = {10.1109/ICASSP39728.2021.9414299},
}
, - "Parp: Prune, adjust and re-prune for self-supervised speech recognition", NeurIPS 2021, 2021.BibTeX
@inproceedings{lai2021parp,
  author    = {Lai, Cheng-I Jeff and Zhang, Yang and Liu, Alexander H. and Chang, Shiyu and Liao, Yi-Lun and Chuang, Yung-Sung and Qian, Kaizhi and Khurana, Sameer and Cox, David and Glass, Jim},
  title     = {{PARP}: Prune, adjust and re-prune for self-supervised speech recognition},
  booktitle = {NeurIPS 2021},
  year      = 2021,
}
, - "A convolutional deep markov model for unsupervised speech representation learning", Interspeech 2020, 2020.BibTeX
@inproceedings{khurana2020convolutional,
  author    = {Khurana, Sameer and Laurent, Antoine and Hsu, Wei-Ning and Chorowski, Jan and {\L}a{\'n}cucki, Adrian and Marxer, Ricard and Glass, James},
  title     = {A convolutional deep {Markov} model for unsupervised speech representation learning},
  booktitle = {Interspeech 2020},
  year      = 2020,
}
, - "Cstnet: Contrastive speech translation network for self-supervised speech representation learning", Preprint, 2020.BibTeX
@unpublished{khurana2020cstnet,
  author = {Khurana, Sameer and Laurent, Antoine and Glass, James},
  title  = {{CSTNet}: Contrastive speech translation network for self-supervised speech representation learning},
  note   = {Preprint},
  year   = 2020,
}
, - "Robust training of vector quantized bottleneck models", IJCNN 2020, 2020.BibTeX
@inproceedings{lancucki2020robust,
  author       = {{\L}a{\'n}cucki, Adrian and Chorowski, Jan and Sanchez, Guillaume and Marxer, Ricard and Chen, Nanxin and Dolfing, Hans J. G. A. and Khurana, Sameer and Alum{\"a}e, Tanel and Laurent, Antoine},
  title        = {Robust training of vector quantized bottleneck models},
  booktitle    = {IJCNN 2020},
  year         = 2020,
  organization = {IEEE},
}
, - "DARTS: Dialectal Arabic transcription system", Preprint, 2019.BibTeX
@unpublished{khurana2019darts,
  author = {Khurana, Sameer and Ali, Ahmed and Glass, James},
  title  = {{DARTS}: Dialectal {Arabic} transcription system},
  note   = {Preprint},
  year   = 2019,
}
, - "A Factorial Deep Markov Model For Unsupervised Disentangled Representation Learning From Speech", ICASSP 2019, 2019.BibTeX
@inproceedings{khurana2019factorial,
  author    = {Khurana, Sameer and Joty, Shafiq Rayhan and Ali, Ahmed and Glass, James},
  title     = {A Factorial Deep {Markov} Model For Unsupervised Disentangled Representation Learning From Speech},
  booktitle = {ICASSP 2019},
  year      = 2019,
}
, - "DeepSol: A Deep Learning Framework for Sequence-Based Protein Solubility Prediction", Bioinformatics 2018, 2018.BibTeX
@article{khurana2018deepsol,
  author  = {Khurana, Sameer and Rawi, Reda and Kunji, Khalid and Chuang, Gwo-Yu and Bensmail, Halima and Mall, Raghvendra and Valencia, Alfonso},
  title   = {{DeepSol}: A Deep Learning Framework for Sequence-Based Protein Solubility Prediction},
  journal = {Bioinformatics},
  year    = 2018,
}
, - "Exploiting convolutional neural networks for phonotactic based dialect identification", ICASSP 2018, 2018.BibTeX
@inproceedings{najafian2018exploiting,
  author       = {Najafian, Maryam and Khurana, Sameer and Shon, Suwon and Ali, Ahmed and Glass, James},
  title        = {Exploiting convolutional neural networks for phonotactic based dialect identification},
  booktitle    = {ICASSP 2018},
  year         = 2018,
  organization = {IEEE},
}
,
- "Whisper-AT: Noise-Robust Automatic Speech Recognizers are Also Strong General Audio Event Taggers", Interspeech 2023, 2023.