Publications

594 / 3,591 publications found.


  •  Pan, Z., Wichern, G., Germain, F.G., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", Hands-free Speech Communication and Microphone Arrays (HSCMA), April 2024.
    BibTeX TR2024-029 PDF
    • @inproceedings{Pan2024apr,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, François G and Subramanian, Aswin and Le Roux, Jonathan},
    • title = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
    • booktitle = {Hands-free Speech Communication and Microphone Arrays (HSCMA)},
    • year = 2024,
    • month = apr,
    • url = {https://www.merl.com/publications/TR2024-029}
    • }
  •  Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), April 2024.
    BibTeX TR2024-032 PDF
    • @inproceedings{Koo2024apr,
    • author = {Koo, Junghyun and Wichern, Gordon and Germain, François G and Khurana, Sameer and Le Roux, Jonathan},
    • title = {Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads},
    • booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
    • year = 2024,
    • month = apr,
    • url = {https://www.merl.com/publications/TR2024-032}
    • }
  •  Jeon, C.-B., Wichern, G., Germain, F.G., Le Roux, J., "Why does music source separation benefit from cacophony?", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), March 2024.
    BibTeX TR2024-030 PDF
    • @inproceedings{Jeon2024mar,
    • author = {Jeon, Chang-Bin and Wichern, Gordon and Germain, François G and Le Roux, Jonathan},
    • title = {Why does music source separation benefit from cacophony?},
    • booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-030}
    • }
  •  Bralios, D., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "Generation or Replication: Auscultating Audio Latent Diffusion Models", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-027 PDF
    • @inproceedings{Bralios2024mar,
    • author = {Bralios, Dimitrios and Wichern, Gordon and Germain, François G and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-027}
    • }
  •  Masuyama, Y., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-026 PDF
    • @inproceedings{Masuyama2024mar,
    • author = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-026}
    • }
  •  Pan, Z., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-025 PDF
    • @inproceedings{Pan2024mar,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, François G and Khurana, Sameer and Le Roux, Jonathan},
    • title = {NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-025}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-028 PDF
    • @inproceedings{Wu2024mar,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-028}
    • }
  •  Baoueb, T., Liu, H., Fontaine, M., Le Roux, J., Richard, G., "SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-013 PDF
    • @inproceedings{Baoueb2024mar,
    • author = {Baoueb, Teysir and Liu, Haocheng and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
    • title = {SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-013}
    • }
  •  Hori, C., Wang, P., Rahman, M., Vaca-Rubio, C., Khurana, S., Cherian, A., Le Roux, J., "Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-012 PDF
    • @inproceedings{Hori2024mar,
    • author = {Hori, Chiori and Wang, Pu and Rahman, Mahbub and Vaca-Rubio, Cristian and Khurana, Sameer and Cherian, Anoop and Le Roux, Jonathan},
    • title = {Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-012}
    • }
  •  Liu, H., Baoueb, T., Fontaine, M., Le Roux, J., Richard, G., "GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2024.
    BibTeX TR2024-014 PDF
    • @inproceedings{Liu2024mar,
    • author = {Liu, Haocheng and Baoueb, Teysir and Fontaine, Mathieu and Le Roux, Jonathan and Richard, Gaël},
    • title = {GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • month = mar,
    • url = {https://www.merl.com/publications/TR2024-014}
    • }
  •  Lowy, A., Li, Z., Liu, J., Koike-Akino, T., Parsons, K., Wang, Y., "Why Does Differential Privacy with Large ε Defend Against Practical Membership Inference Attacks?", AAAI Workshop on Privacy-Preserving Artificial Intelligence, February 2024.
    BibTeX TR2024-009 PDF
    • @inproceedings{Lowy2024feb2,
    • author = {Lowy, Andrew and Li, Zhuohang and Liu, Jing and Koike-Akino, Toshiaki and Parsons, Kieran and Wang, Ye},
    • title = {Why Does Differential Privacy with Large ε Defend Against Practical Membership Inference Attacks?},
    • booktitle = {AAAI Workshop on Privacy-Preserving Artificial Intelligence},
    • year = 2024,
    • month = feb,
    • url = {https://www.merl.com/publications/TR2024-009}
    • }
  •  Boeddeker, C., Subramanian, A.S., Wichern, G., Haeb-Umbach, R., Le Roux, J., "TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2024.3350887, Vol. 32, pp. 1185-1197, February 2024.
    BibTeX TR2024-006 PDF
    • @article{Boeddeker2024feb,
    • author = {Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan},
    • title = {TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2024,
    • volume = 32,
    • pages = {1185--1197},
    • month = feb,
    • doi = {10.1109/TASLP.2024.3350887},
    • issn = {2329-9304},
    • url = {https://www.merl.com/publications/TR2024-006}
    • }
  •  Carmichael, Z., Lohit, S., Cherian, A., Jones, M.J., Scheirer, W., "Pixel-Grounded Prototypical Part Networks", IEEE Winter Conference on Applications of Computer Vision (WACV), January 2024.
    BibTeX TR2024-002 PDF Presentation
    • @inproceedings{Carmichael2024jan,
    • author = {Carmichael, Zachariah and Lohit, Suhas and Cherian, Anoop and Jones, Michael J. and Scheirer, Walter},
    • title = {Pixel-Grounded Prototypical Part Networks},
    • booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
    • year = 2024,
    • month = jan,
    • url = {https://www.merl.com/publications/TR2024-002}
    • }
  •  Liu, X., Paul, S., Chatterjee, M., Cherian, A., "CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments", AAAI Conference on Artificial Intelligence, December 2023.
    BibTeX TR2023-154 PDF
    • @inproceedings{Liu2023dec2,
    • author = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
    • title = {CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments},
    • booktitle = {AAAI Conference on Artificial Intelligence},
    • year = 2023,
    • month = dec,
    • url = {https://www.merl.com/publications/TR2023-154}
    • }
  •  Liu, J., Koike-Akino, T., Wang, P., Brand, M., Wang, Y., Parsons, K., "LoDA: Low-Dimensional Adaptation of Large Language Models", Advances in Neural Information Processing Systems (NeurIPS) workshop, December 2023.
    BibTeX TR2023-150 PDF
    • @inproceedings{Liu2023dec,
    • author = {Liu, Jing and Koike-Akino, Toshiaki and Wang, Pu and Brand, Matthew and Wang, Ye and Parsons, Kieran},
    • title = {LoDA: Low-Dimensional Adaptation of Large Language Models},
    • booktitle = {Advances in Neural Information Processing Systems (NeurIPS) workshop},
    • year = 2023,
    • month = dec,
    • url = {https://www.merl.com/publications/TR2023-150}
    • }
  •  Pan, Z., Wichern, G., Masuyama, Y., Germain, F.G., Khurana, S., Hori, C., Le Roux, J., "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/ASRU57964.2023.10389618, December 2023.
    BibTeX TR2023-152 PDF
    • @inproceedings{Pan2023dec2,
    • author = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction},
    • booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    • year = 2023,
    • month = dec,
    • doi = {10.1109/ASRU57964.2023.10389618},
    • isbn = {979-8-3503-0689-7},
    • url = {https://www.merl.com/publications/TR2023-152}
    • }
  •  Li, Z., Lowy, A., Liu, J., Koike-Akino, T., Malin, B., Parsons, K., Wang, Y., "Exploring User-level Gradient Inversion with a Diffusion Prior", International Workshop on Federated Learning in the Age of Foundation Models in Conjunction with NeurIPS, December 2023.
    BibTeX TR2023-149 PDF
    • @inproceedings{Li2023dec,
    • author = {Li, Zhuohang and Lowy, Andrew and Liu, Jing and Koike-Akino, Toshiaki and Malin, Bradley and Parsons, Kieran and Wang, Ye},
    • title = {Exploring User-level Gradient Inversion with a Diffusion Prior},
    • booktitle = {International Workshop on Federated Learning in the Age of Foundation Models in Conjunction with NeurIPS},
    • year = 2023,
    • month = dec,
    • url = {https://www.merl.com/publications/TR2023-149}
    • }
  •  He, Y., Shin, S., Cherian, A., Markham, A., Trigoni, N., "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2023.
    BibTeX TR2023-144 PDF
    • @inproceedings{He2023dec,
    • author = {He, Yuhang and Shin, Sangyun and Cherian, Anoop and Markham, Andrew and Trigoni, Niki},
    • title = {Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images},
    • booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
    • year = 2023,
    • month = dec,
    • url = {https://www.merl.com/publications/TR2023-144}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks", International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero), November 2023.
    BibTeX TR2023-141 PDF
    • @inproceedings{Wu2023nov,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks},
    • booktitle = {International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero)},
    • year = 2023,
    • month = nov,
    • url = {https://www.merl.com/publications/TR2023-141}
    • }
  •  Nair, N.G., Cherian, A., Lohit, S., Wang, Y., Koike-Akino, T., Patel, V.M., Marks, T.K., "Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis", IEEE International Conference on Computer Vision (ICCV), October 2023, pp. 20850-20860.
    BibTeX TR2023-126 PDF Presentation
    • @inproceedings{Nair2023sep,
    • author = {Nair, Nithin Gopalakrishnan and Cherian, Anoop and Lohit, Suhas and Wang, Ye and Koike-Akino, Toshiaki and Patel, Vishal M. and Marks, Tim K.},
    • title = {Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis},
    • booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
    • year = 2023,
    • pages = {20850--20860},
    • month = oct,
    • publisher = {IEEE/CVF},
    • url = {https://www.merl.com/publications/TR2023-126}
    • }
  •  Sharma, M., Chatterjee, M., Peng, K.-C., Lohit, S., Jones, M.J., "Tensor Factorization for Leveraging Cross-Modal Knowledge in Data-Constrained Infrared Object Detection", IEEE International Conference on Computer Vision Workshops (ICCV), October 2023, pp. 924-932.
    BibTeX TR2023-125 PDF Presentation
    • @inproceedings{Sharma2023oct,
    • author = {Sharma, Manish and Chatterjee, Moitreya and Peng, Kuan-Chuan and Lohit, Suhas and Jones, Michael J.},
    • title = {Tensor Factorization for Leveraging Cross-Modal Knowledge in Data-Constrained Infrared Object Detection},
    • booktitle = {IEEE International Conference on Computer Vision Workshops (ICCV)},
    • year = 2023,
    • pages = {924--932},
    • month = oct,
    • url = {https://www.merl.com/publications/TR2023-125}
    • }
  •  Huang, B., Yu, J., Jain, S., "EARL: Eye-on-Hand Reinforcement Learner for Dynamic Grasping with Active Pose Estimation", 2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), DOI: 10.1109/IROS55552.2023.10341988, October 2023, pp. 2963-2970.
    BibTeX TR2023-118 PDF Video
    • @inproceedings{Huang2023oct,
    • author = {Huang, Baichuan and Yu, Jingjin and Jain, Siddarth},
    • title = {EARL: Eye-on-Hand Reinforcement Learner for Dynamic Grasping with Active Pose Estimation},
    • booktitle = {2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
    • year = 2023,
    • pages = {2963--2970},
    • month = oct,
    • publisher = {IEEE},
    • doi = {10.1109/IROS55552.2023.10341988},
    • issn = {2153-0866},
    • isbn = {978-1-6654-9190-7},
    • url = {https://www.merl.com/publications/TR2023-118}
    • }
  •  Falcon Perez, R., Wichern, G., Germain, F., Le Roux, J., "Location as supervision for weakly supervised multi-channel source separation of machine sounds", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248128, September 2023.
    BibTeX TR2023-119 PDF Presentation
    • @inproceedings{FalconPerez2023aug,
    • author = {Falcon Perez, Ricardo and Wichern, Gordon and Germain, Francois and Le Roux, Jonathan},
    • title = {Location as supervision for weakly supervised multi-channel source separation of machine sounds},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2023,
    • month = sep,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA58266.2023.10248128},
    • issn = {1947-1629},
    • isbn = {979-8-3503-2372-6},
    • url = {https://www.merl.com/publications/TR2023-119}
    • }
  •  Germain, F., Wichern, G., Le Roux, J., "Hyperbolic Unsupervised Anomalous Sound Detection", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA58266.2023.10248092, September 2023.
    BibTeX TR2023-108 PDF Video Presentation
    • @inproceedings{Germain2023aug,
    • author = {Germain, Francois and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Hyperbolic Unsupervised Anomalous Sound Detection},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2023,
    • month = sep,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA58266.2023.10248092},
    • issn = {1947-1629},
    • isbn = {979-8-3503-2372-6},
    • url = {https://www.merl.com/publications/TR2023-108}
    • }
  •  Petermann, D., Wichern, G., Subramanian, A.S., Wang, Z.-Q., Le Roux, J., "Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2023.3290428, Vol. 31, pp. 2592-2605, September 2023.
    BibTeX TR2023-113 PDF
    • @article{Petermann2023sep,
    • author = {Petermann, Darius and Wichern, Gordon and Subramanian, Aswin Shanmugam and Wang, Zhong-Qiu and Le Roux, Jonathan},
    • title = {Tackling the Cocktail Fork Problem for Separation and Transcription of Real-World Soundtracks},
    • journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
    • year = 2023,
    • volume = 31,
    • pages = {2592--2605},
    • month = sep,
    • doi = {10.1109/TASLP.2023.3290428},
    • issn = {2329-9304},
    • url = {https://www.merl.com/publications/TR2023-113}
    • }