Publications

25 / 3,757 publications found.


  •  Ick, Christopher, Wichern, Gordon, Masuyama, Yoshiki, Germain, François G, Le Roux, Jonathan, "Spatially-Aware Losses for Enhanced Neural Acoustic Fields", Tech. Rep. TR2024-169, Mitsubishi Electric Research Laboratories, Cambridge, MA, December 2024.
    BibTeX TR2024-169 PDF
    • @techreport{MERL_TR2024-169,
    • author = {Ick, Christopher and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G and Le Roux, Jonathan},
    • title = {Spatially-Aware Losses for Enhanced Neural Acoustic Fields},
    • institution = {MERL - Mitsubishi Electric Research Laboratories},
    • address = {Cambridge, MA 02139},
    • number = {TR2024-169},
    • month = dec,
    • year = 2024,
    • url = {https://www.merl.com/publications/TR2024-169/}
    • }
  •  Ick, C., Wichern, G., Masuyama, Y., Germain, F.G., Le Roux, J., "Spatially-Aware Losses for Enhanced Neural Acoustic Fields", NeurIPS 2024 Audio Imagination Workshop, December 2024.
    BibTeX
    • @inproceedings{Ick2024dec,
    • author = {Ick, Christopher and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G and Le Roux, Jonathan},
    • title = {Spatially-Aware Losses for Enhanced Neural Acoustic Fields},
    • booktitle = {NeurIPS 2024 Audio Imagination Workshop},
    • year = 2024,
    • month = dec
    • }
  •  Saijo, K., Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Task-Aware Unified Source Separation", arXiv, October 2024.
    BibTeX arXiv
    • @article{Saijo2024oct,
    • author = {Saijo, Kohei and Ebbers, Janek and Germain, François G and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Task-Aware Unified Source Separation},
    • journal = {arXiv},
    • year = 2024,
    • month = oct,
    • url = {https://arxiv.org/abs/2410.23987v1}
    • }
  •  Saijo, K., Ebbers, J., Germain, F.G., Khurana, S., Wichern, G., Le Roux, J., "Leveraging Audio-Only Data for Text-Queried Target Sound Extraction", arXiv, September 2024.
    BibTeX arXiv
    • @article{Saijo2024sep3,
    • author = {Saijo, Kohei and Ebbers, Janek and Germain, François G and Khurana, Sameer and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Leveraging Audio-Only Data for Text-Queried Target Sound Extraction},
    • journal = {arXiv},
    • year = 2024,
    • month = sep,
    • url = {https://arxiv.org/abs/2409.13152v1}
    • }
  •  Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/​IWAENC61483.2024.10694313, September 2024, pp. 205-209.
    BibTeX TR2024-126 PDF Software
    • @inproceedings{Saijo2024sep2,
    • author = {Saijo, Kohei and Wichern, Gordon and Germain, François G and Pan, Zexu and Le Roux, Jonathan},
    • title = {TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement},
    • booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
    • year = 2024,
    • pages = {205--209},
    • month = sep,
    • doi = {10.1109/IWAENC61483.2024.10694313},
    • issn = {2835-3439},
    • isbn = {979-8-3503-6185-8},
    • url = {https://www.merl.com/publications/TR2024-126}
    • }
  •  Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Sound Event Bounding Boxes", Interspeech, DOI: 10.21437/​Interspeech.2024-2075, September 2024, pp. 562-566.
    BibTeX TR2024-118 PDF Software
    • @inproceedings{Ebbers2024sep,
    • author = {Ebbers, Janek and Germain, François G and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Sound Event Bounding Boxes},
    • booktitle = {Interspeech},
    • year = 2024,
    • pages = {562--566},
    • month = sep,
    • doi = {10.21437/Interspeech.2024-2075},
    • issn = {2958-1796},
    • url = {https://www.merl.com/publications/TR2024-118}
    • }
  •  Pan, Z., Wichern, G., Germain, F.G., Saijo, K., Le Roux, J., "PARIS: Pseudo-AutoRegressIve Siamese Training for Online Speech Separation", Interspeech, DOI: 10.21437/​Interspeech.2024-1066, September 2024, pp. 582-586.
    BibTeX TR2024-124 PDF
    • @inproceedings{Pan2024sep,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, François G and Saijo, Kohei and Le Roux, Jonathan},
    • title = {PARIS: Pseudo-AutoRegressIve Siamese Training for Online Speech Separation},
    • booktitle = {Interspeech},
    • year = 2024,
    • pages = {582--586},
    • month = sep,
    • doi = {10.21437/Interspeech.2024-1066},
    • issn = {2958-1796},
    • url = {https://www.merl.com/publications/TR2024-124}
    • }
  •  Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "Enhanced Reverberation as Supervision for Unsupervised Speech Separation", Interspeech, DOI: 10.21437/​Interspeech.2024-1241, September 2024, pp. 607-611.
    BibTeX TR2024-116 PDF Software
    • @inproceedings{Saijo2024sep,
    • author = {Saijo, Kohei and Wichern, Gordon and Germain, François G and Pan, Zexu and Le Roux, Jonathan},
    • title = {Enhanced Reverberation as Supervision for Unsupervised Speech Separation},
    • booktitle = {Interspeech},
    • year = 2024,
    • pages = {607--611},
    • month = sep,
    • doi = {10.21437/Interspeech.2024-1241},
    • issn = {2958-1796},
    • url = {https://www.merl.com/publications/TR2024-116}
    • }
  •  Pan, Z., Wichern, G., Germain, F.G., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", Hands-free Speech Communication and Microphone Arrays (HSCMA), DOI: 10.1109/​ICASSPW62465.2024.10626914, April 2024, pp. 174-178.
    BibTeX TR2024-029 PDF
    • @inproceedings{Pan2024apr,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, François G and Subramanian, Aswin and Le Roux, Jonathan},
    • title = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
    • booktitle = {Hands-free Speech Communication and Microphone Arrays (HSCMA)},
    • year = 2024,
    • pages = {174--178},
    • month = apr,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSPW62465.2024.10626914},
    • isbn = {979-8-3503-7451-3},
    • url = {https://www.merl.com/publications/TR2024-029}
    • }
  •  Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "SMITIN: Self-Monitored Inference-Time INtervention for Generative Music Transformers", arXiv, April 2024.
    BibTeX arXiv
    • @article{Koo2024apr2,
    • author = {Koo, Junghyun and Wichern, Gordon and Germain, François G and Khurana, Sameer and Le Roux, Jonathan},
    • title = {SMITIN: Self-Monitored Inference-Time INtervention for Generative Music Transformers},
    • journal = {arXiv},
    • year = 2024,
    • month = apr,
    • url = {https://arxiv.org/abs/2404.02252}
    • }
  •  Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), April 2024.
    BibTeX TR2024-032 PDF
    • @inproceedings{Koo2024apr,
    • author = {Koo, Junghyun and Wichern, Gordon and Germain, François G and Khurana, Sameer and Le Roux, Jonathan},
    • title = {Understanding and Controlling Generative Music Transformers by Probing Individual Attention Heads},
    • booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
    • year = 2024,
    • month = apr,
    • url = {https://www.merl.com/publications/TR2024-032}
    • }
  •  Jeon, C.-B., Wichern, G., Germain, F.G., Le Roux, J., "Why does music source separation benefit from cacophony?", IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA), DOI: 10.1109/​ICASSPW62465.2024.10669899, March 2024, pp. 873-877.
    BibTeX TR2024-030 PDF Video
    • @inproceedings{Jeon2024mar,
    • author = {Jeon, Chang-Bin and Wichern, Gordon and Germain, François G and Le Roux, Jonathan},
    • title = {Why does music source separation benefit from cacophony?},
    • booktitle = {IEEE ICASSP Satellite Workshop on Explainable Machine Learning for Speech and Audio (XAI-SA)},
    • year = 2024,
    • pages = {873--877},
    • month = mar,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSPW62465.2024.10669899},
    • isbn = {979-8-3503-7451-3},
    • url = {https://www.merl.com/publications/TR2024-030}
    • }
  •  Bralios, D., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "Generation or Replication: Auscultating Audio Latent Diffusion Models", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP48485.2024.10447705, March 2024, pp. 1156-1160.
    BibTeX TR2024-027 PDF
    • @inproceedings{Bralios2024mar,
    • author = {Bralios, Dimitrios and Wichern, Gordon and Germain, François G and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • pages = {1156--1160},
    • month = mar,
    • doi = {10.1109/ICASSP48485.2024.10447705},
    • url = {https://www.merl.com/publications/TR2024-027}
    • }
  •  Masuyama, Y., Wichern, G., Germain, F.G., Pan, Z., Khurana, S., Hori, C., Le Roux, J., "NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP48485.2024.10448477, March 2024, pp. 1016-1020.
    BibTeX TR2024-026 PDF Software
    • @inproceedings{Masuyama2024mar,
    • author = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {NIIRF: Neural IIR Filter Field for HRTF Upsampling and Personalization},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • pages = {1016--1020},
    • month = mar,
    • doi = {10.1109/ICASSP48485.2024.10448477},
    • url = {https://www.merl.com/publications/TR2024-026}
    • }
  •  Pan, Z., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP48485.2024.10446333, March 2024, pp. 11456-11460.
    BibTeX TR2024-025 PDF
    • @inproceedings{Pan2024mar,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, François G and Khurana, Sameer and Le Roux, Jonathan},
    • title = {NeuroHeed+: Improving Neuro-steered Speaker Extraction with Joint Auditory Attention Detection},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • pages = {11456--11460},
    • month = mar,
    • doi = {10.1109/ICASSP48485.2024.10446333},
    • url = {https://www.merl.com/publications/TR2024-025}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP48485.2024.10447215, March 2024, pp. 316-320.
    BibTeX TR2024-028 PDF
    • @inproceedings{Wu2024mar,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {Improving Audio Captioning Models with Fine-grained Audio Features, Text Embedding Supervision, and LLM Mix-up Augmentation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • pages = {316--320},
    • month = mar,
    • doi = {10.1109/ICASSP48485.2024.10447215},
    • url = {https://www.merl.com/publications/TR2024-028}
    • }
  •  Pan, Z., Wichern, G., Masuyama, Y., Germain, F.G., Khurana, S., Hori, C., Le Roux, J., "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), DOI: 10.1109/​ASRU57964.2023.10389618, December 2023.
    BibTeX TR2023-152 PDF Video
    • @inproceedings{Pan2023dec2,
    • author = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
    • title = {Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction},
    • booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
    • year = 2023,
    • month = dec,
    • doi = {10.1109/ASRU57964.2023.10389618},
    • isbn = {979-8-3503-0689-7},
    • url = {https://www.merl.com/publications/TR2023-152}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F.G., Le Roux, J., Watanabe, S., "On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks", International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero), November 2023.
    BibTeX TR2023-141 PDF
    • @inproceedings{Wu2023nov,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, François G and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {On the Use of Pretrained Deep Audio Encoders for Automated Audio Captioning Tasks},
    • booktitle = {International Symposium on Future Active Safety Technology toward zero traffic accidents (FAST-zero)},
    • year = 2023,
    • month = nov,
    • url = {https://www.merl.com/publications/TR2023-141}
    • }
  •  Pan, Z., Wichern, G., Germain, F., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", arXiv, DOI: 10.48550/​arXiv.2211.01299, September 2023.
    BibTeX arXiv
    • @article{Pan2023sep,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, Francois and Subramanian, Aswin and Le Roux, Jonathan},
    • title = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
    • journal = {arXiv},
    • year = 2023,
    • month = sep,
    • doi = {10.48550/arXiv.2211.01299},
    • url = {https://arxiv.org/abs/2211.01299}
    • }
  •  Falcon Perez, R., Wichern, G., Germain, F., Le Roux, J., "Location as supervision for weakly supervised multi-channel source separation of machine sounds", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA58266.2023.10248128, September 2023.
    BibTeX TR2023-119 PDF Presentation
    • @inproceedings{FalconPerez2023aug,
    • author = {Falcon Perez, Ricardo and Wichern, Gordon and Germain, Francois and Le Roux, Jonathan},
    • title = {Location as supervision for weakly supervised multi-channel source separation of machine sounds},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2023,
    • month = sep,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA58266.2023.10248128},
    • issn = {1947-1629},
    • isbn = {979-8-3503-2372-6},
    • url = {https://www.merl.com/publications/TR2023-119}
    • }
  •  Germain, F., Wichern, G., Le Roux, J., "Hyperbolic Unsupervised Anomalous Sound Detection", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/​WASPAA58266.2023.10248092, September 2023.
    BibTeX TR2023-108 PDF Video Presentation
    • @inproceedings{Germain2023aug,
    • author = {Germain, Francois and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Hyperbolic Unsupervised Anomalous Sound Detection},
    • booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
    • year = 2023,
    • month = sep,
    • publisher = {IEEE},
    • doi = {10.1109/WASPAA58266.2023.10248092},
    • issn = {1947-1629},
    • isbn = {979-8-3503-2372-6},
    • url = {https://www.merl.com/publications/TR2023-108}
    • }
  •  Wu, S.-L., Chang, X., Wichern, G., Jung, J.-W., Germain, F., Le Roux, J., Watanabe, S., "BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up", Tech. Rep. TR2023-068, DCASE2023 Challenge, May 2023.
    BibTeX TR2023-068 PDF
    • @techreport{Wu2023may,
    • author = {Wu, Shih-Lun and Chang, Xuankai and Wichern, Gordon and Jung, Jee-weon and Germain, Francois and Le Roux, Jonathan and Watanabe, Shinji},
    • title = {BEATs-based Audio Captioning Model with Instructor Embedding Supervision and ChatGPT Mix-up},
    • institution = {DCASE2023 Challenge},
    • year = 2023,
    • month = may,
    • url = {https://www.merl.com/publications/TR2023-068}
    • }
  •  Chen, K., Wichern, G., Germain, F., Le Roux, J., "Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT", IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB), DOI: 10.1109/​ICASSPW59220.2023.10193575, May 2023.
    BibTeX TR2023-030 PDF
    • @inproceedings{Chen2023may,
    • author = {Chen, Ke and Wichern, Gordon and Germain, Francois and Le Roux, Jonathan},
    • title = {Pac-HuBERT: Self-Supervised Music Source Separation via Primitive Auditory Clustering and Hidden-Unit BERT},
    • booktitle = {IEEE ICASSP Satellite Workshop on Self-supervision in Audio, Speech and Beyond (SASB)},
    • year = 2023,
    • month = may,
    • doi = {10.1109/ICASSPW59220.2023.10193575},
    • isbn = {979-8-3503-0261-5},
    • url = {https://www.merl.com/publications/TR2023-030}
    • }
  •  Yen, H., Germain, F., Wichern, G., Le Roux, J., "Cold Diffusion for Speech Enhancement", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/​ICASSP49357.2023.10096064, May 2023, pp. 1-5.
    BibTeX TR2023-020 PDF
    • @inproceedings{Yen2023may,
    • author = {Yen, Hao and Germain, Francois and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Cold Diffusion for Speech Enhancement},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2023,
    • pages = {1--5},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP49357.2023.10096064},
    • url = {https://www.merl.com/publications/TR2023-020}
    • }
  •  Pan, Z., Wichern, G., Germain, F., Subramanian, A.S., Le Roux, J., "Towards End-to-end Speaker Diarization in the Wild", arXiv, November 2022.
    BibTeX arXiv
    • @article{Pan2022nov,
    • author = {Pan, Zexu and Wichern, Gordon and Germain, Francois and Subramanian, Aswin Shanmugam and Le Roux, Jonathan},
    • title = {Towards End-to-end Speaker Diarization in the Wild},
    • journal = {arXiv},
    • year = 2022,
    • month = nov,
    • url = {https://arxiv.org/abs/2211.01299}
    • }