Zexu Pan

- Phone: 617-621-7551
- Email:
-
Position:
Research / Technical Staff
Visiting Research Scientist -
Education:
Ph.D. National University of Singapore 2023 -
Research Areas:
External Links:
Zexu's Quick Links
-
Biography
Zexu joined MERL after completing his Ph.D. at the National University of Singapore in 2023. His research interests are artificial intelligence and deep learning, and their applications to areas including, but not limited to, speech processing, such as multi-modal speech enhancement, speaker extraction, speaker diarization, robust automatic speech recognition, multi-modal representation learning, and auditory attention detection.
-
Research Highlights
-
MERL Publications
- "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", arXiv, October 2023.BibTeX arXiv
- @article{Pan2023oct,
-   author        = {Pan, Zexu and Wichern, Gordon and Masuyama, Yoshiki and Germain, Fran{\c{c}}ois G. and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
-   title         = {Scenario-Aware Audio-Visual {TF-GridNet} for Target Speech Extraction},
-   journal       = {arXiv},
-   year          = {2023},
-   month         = oct,
-   eprint        = {2310.19644},
-   archiveprefix = {arXiv},
-   url           = {https://arxiv.org/abs/2310.19644}
- }
, - "Generation or Replication: Auscultating Audio Latent Diffusion Models", arXiv, October 2023.BibTeX arXiv
- @article{Bralios2023oct,
-   author        = {Bralios, Dimitrios and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and Pan, Zexu and Khurana, Sameer and Hori, Chiori and Le Roux, Jonathan},
-   title         = {Generation or Replication: Auscultating Audio Latent Diffusion Models},
-   journal       = {arXiv},
-   year          = {2023},
-   month         = oct,
-   eprint        = {2310.10604},
-   archiveprefix = {arXiv},
-   url           = {https://arxiv.org/abs/2310.10604}
- }
, - "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", arXiv, DOI: 10.48550/arXiv.2211.01299, September 2023.BibTeX arXiv
- @article{Pan2023sep,
-   author        = {Pan, Zexu and Wichern, Gordon and Germain, Fran{\c{c}}ois G. and Subramanian, Aswin and Le Roux, Jonathan},
-   title         = {Late Audio-Visual Fusion for In-The-Wild Speaker Diarization},
-   journal       = {arXiv},
-   year          = {2023},
-   month         = sep,
-   doi           = {10.48550/arXiv.2211.01299},
-   eprint        = {2211.01299},
-   archiveprefix = {arXiv},
-   url           = {https://arxiv.org/abs/2211.01299}
- }
, - "Towards End-to-end Speaker Diarization in the Wild", arXiv, November 2022. ,
- "Scenario-Aware Audio-Visual TF-GridNet for Target Speech Extraction", arXiv, October 2023.
-
Other Publications
- "Target Active Speaker Detection with Audio-visual Cues", Proc. INTERSPEECH, 2023.BibTeX
- @inproceedings{jiang2023target,
-   author    = {Jiang, Yidi and Tao, Ruijie and Pan, Zexu and Li, Haizhou},
-   title     = {Target Active Speaker Detection with Audio-visual Cues},
-   booktitle = {Proc. INTERSPEECH},
-   year      = {2023}
- }
, - "Rethinking the Visual Cues in Audio-visual Speaker Extraction", Proc. INTERSPEECH, 2023.BibTeX
- @inproceedings{li2023rethinking,
-   author    = {Li, Junjie and Ge, Meng and Pan, Zexu and Cao, Rui and Wang, Longbiao and Dang, Jianwu and Zhang, Shiliang},
-   title     = {Rethinking the Visual Cues in Audio-visual Speaker Extraction},
-   booktitle = {Proc. INTERSPEECH},
-   year      = {2023}
- }
, - "ImagineNet: Target Speaker Extraction with Intermittent Visual Cue Through Embedding Inpainting", Proc. IEEE Int. Conf. Acoust., Speech, Signal Process., 2023.BibTeX
- @inproceedings{pan2023imaginenet,
-   author    = {Pan, Zexu and Wang, Wupeng and Borsdorf, Marvin and Li, Haizhou},
-   title     = {{ImagineNet}: Target Speaker Extraction with Intermittent Visual Cue Through Embedding Inpainting},
-   booktitle = {Proc. IEEE Int. Conf. Acoust., Speech, Signal Process.},
-   year      = {2023}
- }
, - "Time-Domain Speech Separation Networks With Graph Encoding Auxiliary", IEEE Signal Processing Letters, Vol. 30, pp. 110-114, 2023.BibTeX
- @article{wang2023graph,
-   author  = {Wang, Tingting and Pan, Zexu and Ge, Meng and Yang, Zhen and Li, Haizhou},
-   title   = {Time-Domain Speech Separation Networks With Graph Encoding Auxiliary},
-   journal = {IEEE Signal Processing Letters},
-   year    = {2023},
-   volume  = {30},
-   pages   = {110--114}
- }
, - "Selective Listening by Synchronizing Speech with Lips", IEEE/ACM Trans. Audio, Speech, Lang. Process., Vol. 30, pp. 1650-1664, 2022.BibTeX
- @article{pan2021reentry,
-   author  = {Pan, Zexu and Tao, Ruijie and Xu, Chenglin and Li, Haizhou},
-   title   = {Selective Listening by Synchronizing Speech with Lips},
-   journal = {IEEE/ACM Trans. Audio, Speech, Lang. Process.},
-   year    = {2022},
-   volume  = {30},
-   pages   = {1650--1664}
- }
, - "A Hybrid Continuity Loss to Reduce Over-Suppression for Time-domain Target Speaker Extraction", Proc. INTERSPEECH, 2022, pp. 1786-1790.BibTeX
- @inproceedings{pan2022hybrid,
-   author    = {Pan, Zexu and Ge, Meng and Li, Haizhou},
-   title     = {A Hybrid Continuity Loss to Reduce Over-Suppression for Time-domain Target Speaker Extraction},
-   booktitle = {Proc. INTERSPEECH},
-   year      = {2022},
-   pages     = {1786--1790}
- }
, - "Speaker Extraction with Co-Speech Gestures Cue", IEEE Signal Processing Letters, Vol. 29, pp. 1467-1471, 2022.BibTeX
- @article{pan2022seg,
-   author  = {Pan, Zexu and Qian, Xinyuan and Li, Haizhou},
-   title   = {Speaker Extraction with Co-Speech Gestures Cue},
-   journal = {IEEE Signal Processing Letters},
-   year    = {2022},
-   volume  = {29},
-   pages   = {1467--1471}
- }
, - "VCSE: Time-Domain Visual-Contextual Speaker Extraction Network", Proc. INTERSPEECH, 2022, pp. 906-910.BibTeX
- @inproceedings{tavcse2022,
-   author    = {Li, Junjie and Ge, Meng and Pan, Zexu and Wang, Longbiao and Dang, Jianwu},
-   title     = {{VCSE}: Time-Domain Visual-Contextual Speaker Extraction Network},
-   booktitle = {Proc. INTERSPEECH},
-   year      = {2022},
-   pages     = {906--910}
- }
, - "USEV: Universal Speaker Extraction With Visual Cue", IEEE/ACM Trans. Audio, Speech, Lang. Process., Vol. 30, pp. 3032-3045, 2022.BibTeX
- @article{usev21,
-   author  = {Pan, Zexu and Ge, Meng and Li, Haizhou},
-   title   = {{USEV}: Universal Speaker Extraction With Visual Cue},
-   journal = {IEEE/ACM Trans. Audio, Speech, Lang. Process.},
-   year    = {2022},
-   volume  = {30},
-   pages   = {3032--3045}
- }
, - "Muse: Multi-Modal Target Speaker Extraction with Visual Cues", Proc. IEEE Int. Conf. Acoust., Speech, Signal Process., 2021, pp. 6678-6682.BibTeX
- @inproceedings{pan2020muse,
-   author    = {Pan, Zexu and Tao, Ruijie and Xu, Chenglin and Li, Haizhou},
-   title     = {Muse: Multi-Modal Target Speaker Extraction with Visual Cues},
-   booktitle = {Proc. IEEE Int. Conf. Acoust., Speech, Signal Process.},
-   year      = {2021},
-   pages     = {6678--6682}
- }
, - "Multi-target DoA Estimation with an Audio-visual Fusion Mechanism", Proc. IEEE Int. Conf. Acoust., Speech, Signal Process., 2021, pp. 4280-4284.BibTeX
- @inproceedings{qian2021multi,
-   author    = {Qian, Xinyuan and Madhavi, Maulik and Pan, Zexu and Wang, Jiadong and Li, Haizhou},
-   title     = {Multi-target {DoA} Estimation with an Audio-visual Fusion Mechanism},
-   booktitle = {Proc. IEEE Int. Conf. Acoust., Speech, Signal Process.},
-   year      = {2021},
-   pages     = {4280--4284}
- }
, - "Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection", Proc. of the 29th ACM Int. Conf. on Multimedia, 2021, pp. 3927-3935.BibTeX
- @inproceedings{tao2021someone,
-   author    = {Tao, Ruijie and Pan, Zexu and Das, Rohan Kumar and Qian, Xinyuan and Shou, Mike Zheng and Li, Haizhou},
-   title     = {Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection},
-   booktitle = {Proc. of the 29th ACM Int. Conf. on Multimedia},
-   year      = {2021},
-   pages     = {3927--3935}
- }
, - "Multi-Modal Attention for Speech Emotion Recognition", Proc. INTERSPEECH, 2020, pp. 364-368.BibTeX
- @inproceedings{pan2020multi,
-   author    = {Pan, Zexu and Luo, Zhaojie and Yang, Jichen and Li, Haizhou},
-   title     = {Multi-Modal Attention for Speech Emotion Recognition},
-   booktitle = {Proc. INTERSPEECH},
-   year      = {2020},
-   pages     = {364--368}
- }
,
- "Target Active Speaker Detection with Audio-visual Cues", Proc. INTERSPEECH, 2023.