Publications

55 / 3,739 publications found.


  •  Cherian, A., Corcodel, R., Jain, S., Romeres, D., "LLMPhy: Complex Physical Reasoning Using Large Language Models and World Models", International Conference on Learning Representations (ICLR), October 2024.
    BibTeX
    • @inproceedings{Cherian2024oct,
    • author = {Cherian, Anoop and Corcodel, Radu and Jain, Siddarth and Romeres, Diego},
    • title = {LLMPhy: Complex Physical Reasoning Using Large Language Models and World Models},
    • booktitle = {International Conference on Learning Representations (ICLR)},
    • year = 2024,
    • month = oct
    • }
  •  Cherian, A., Jain, S., Marks, T.K., "Few-shot Transparent Instance Segmentation for Bin Picking", IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), September 2024.
    BibTeX TR2024-127 PDF
    • @inproceedings{Cherian2024sep,
    • author = {Cherian, Anoop and Jain, Siddarth and Marks, Tim K.},
    • title = {Few-shot Transparent Instance Segmentation for Bin Picking},
    • booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
    • year = 2024,
    • month = sep,
    • url = {https://www.merl.com/publications/TR2024-127}
    • }
  •  Yin, J., Luo, A., Du, Y., Cherian, A., Marks, T.K., Le Roux, J., Gan, C., "Disentangled Acoustic Fields For Multimodal Physical Scene Understanding", IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), September 2024.
    BibTeX TR2024-125 PDF
    • @inproceedings{Yin2024sep,
    • author = {Yin, Jie and Luo, Andrew and Du, Yilun and Cherian, Anoop and Marks, Tim K. and Le Roux, Jonathan and Gan, Chuang}},
    • title = {Disentangled Acoustic Fields For Multimodal Physical Scene Understanding},
    • booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
    • year = 2024,
    • month = sep,
    • url = {https://www.merl.com/publications/TR2024-125}
    • }
  •  Zhang, J., Zhang, F., Rodriguez, C., Ben-Shabat, I., Cherian, A., Gould, S., "Temporally Grounding Instructional Diagrams in Unconstrained Videos", arXiv, July 2024.
    BibTeX arXiv
    • @article{Zhang2024jul4,
    • author = {Zhang, Jiahao and Zhang, Frederic and Rodriguez, Cristian and Ben-Shabat, Itzik and Cherian, Anoop and Gould, Stephen}},
    • title = {Temporally Grounding Instructional Diagrams in Unconstrained Videos},
    • journal = {arXiv},
    • year = 2024,
    • month = jul,
    • url = {https://arxiv.org/abs/2407.12066}
    • }
  •  Cherian, A., Peng, K.-C., Lohit, S., Matthiesen, J., Smith, K., Tenenbaum, J.B., "Evaluating Large Vision-and-Language Models on Children's Mathematical Olympiads", arXiv, June 2024.
    BibTeX arXiv
    • @article{Cherian2024jun,
    • author = {Cherian, Anoop and Peng, Kuan-Chuan and Lohit, Suhas and Matthiesen, Joanna and Smith, Kevin and Tenenbaum, Joshua B.}},
    • title = {Evaluating Large Vision-and-Language Models on Children's Mathematical Olympiads},
    • journal = {arXiv},
    • year = 2024,
    • month = jun,
    • url = {https://arxiv.org/abs/2406.15736}
    • }
  •  Ni, H., Egger, B., Lohit, S., Cherian, A., Wang, Y., Koike-Akino, T., Huang, S.X., Marks, T.K., "TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion Models", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2024, pp. 9015-9025.
    BibTeX TR2024-059 PDF Video Software Presentation
    • @inproceedings{Ni2024jun,
    • author = {Ni, Haomiao and Egger, Bernhard and Lohit, Suhas and Cherian, Anoop and Wang, Ye and Koike-Akino, Toshiaki and Huang, Sharon X. and Marks, Tim K.},
    • title = {TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion Models},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2024,
    • pages = {9015--9025},
    • month = jun,
    • url = {https://www.merl.com/publications/TR2024-059}
    • }
  •  He, Y., Cherian, A., Wichern, G., Markham, A., "Deep Neural Room Acoustics Primitive", International Conference on Machine Learning (ICML), June 2024, pp. 17842-17857.
    BibTeX TR2024-072 PDF
    • @inproceedings{He2024jun,
    • author = {He, Yuhang and Cherian, Anoop and Wichern, Gordon and Markham, Andrew},
    • title = {Deep Neural Room Acoustics Primitive},
    • booktitle = {International Conference on Machine Learning (ICML)},
    • year = 2024,
    • pages = {17842--17857},
    • month = jun,
    • url = {https://www.merl.com/publications/TR2024-072}
    • }
  •  Yang, Z., Liu, J., Chen, P., Cherian, A., Marks, T.K., Le Roux, J., Gan, C., "RILA: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), April 2024, pp. 16251-16261.
    BibTeX TR2024-043 PDF
    • @inproceedings{Yang2024apr,
    • author = {Yang, Zeyuan and Liu, Jiageng and Chen, Peihao and Cherian, Anoop and Marks, Tim K. and Le Roux, Jonathan and Gan, Chuang},
    • title = {RILA: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2024,
    • pages = {16251--16261},
    • month = apr,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2024-043}
    • }
  •  Zhu, X., Jha, D.K., Romeres, D., Sun, L., Tomizuka, M., Cherian, A., "Multi-level Reasoning for Robotic Assembly: From Sequence Inference to Contact Selection", IEEE International Conference on Robotics and Automation (ICRA), March 2024, pp. 816-823.
    BibTeX TR2024-033 PDF Video
    • @inproceedings{Zhu2024mar,
    • author = {Zhu, Xinghao and Jha, Devesh K. and Romeres, Diego and Sun, Lingfeng and Tomizuka, Masayoshi and Cherian, Anoop},
    • title = {Multi-level Reasoning for Robotic Assembly: From Sequence Inference to Contact Selection},
    • booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
    • year = 2024,
    • pages = {816--823},
    • month = mar,
    • publisher = {IEEE},
    • url = {https://www.merl.com/publications/TR2024-033}
    • }
  •  Hori, C., Wang, P., Rahman, M., Vaca-Rubio, C., Khurana, S., Cherian, A., Le Roux, J., "Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447600, March 2024, pp. 13296-13300.
    BibTeX TR2024-012 PDF
    • @inproceedings{Hori2024mar,
    • author = {Hori, Chiori and Wang, Pu and Rahman, Mahbub and Vaca-Rubio, Cristian and Khurana, Sameer and Cherian, Anoop and Le Roux, Jonathan},
    • title = {Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2024,
    • pages = {13296--13300},
    • month = mar,
    • publisher = {IEEE},
    • doi = {10.1109/ICASSP48485.2024.10447600},
    • issn = {2379-190X},
    • isbn = {979-8-3503-4485-1},
    • url = {https://www.merl.com/publications/TR2024-012}
    • }
  •  Carmichael, Z., Lohit, S., Cherian, A., Jones, M.J., Scheirer, W., "Pixel-Grounded Prototypical Part Networks", IEEE Winter Conference on Applications of Computer Vision (WACV), DOI: 10.1109/WACV57701.2024.00470, January 2024.
    BibTeX TR2024-002 PDF Video Software Presentation
    • @inproceedings{Carmichael2024jan,
    • author = {Carmichael, Zachariah and Lohit, Suhas and Cherian, Anoop and Jones, Michael J. and Scheirer, Walter},
    • title = {Pixel-Grounded Prototypical Part Networks},
    • booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
    • year = 2024,
    • month = jan,
    • doi = {10.1109/WACV57701.2024.00470},
    • url = {https://www.merl.com/publications/TR2024-002}
    • }
  •  Liu, X., Paul, S., Chatterjee, M., Cherian, A., "CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments", AAAI Conference on Artificial Intelligence, DOI: 10.1609/aaai.v38i4.28167, December 2023, pp. 3765-3773.
    BibTeX TR2023-154 PDF
    • @inproceedings{Liu2023dec2,
    • author = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
    • title = {CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments},
    • booktitle = {Proceedings of the 38th AAAI Conference on Artificial Intelligence},
    • year = 2023,
    • pages = {3765--3773},
    • month = dec,
    • doi = {10.1609/aaai.v38i4.28167},
    • url = {https://www.merl.com/publications/TR2023-154}
    • }
  •  He, Y., Shin, S., Cherian, A., Markham, A., Trigoni, N., "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2023, pp. 5496-5507.
    BibTeX TR2023-144 PDF
    • @inproceedings{He2023dec,
    • author = {He, Yuhang and Shin, Sangyun and Cherian, Anoop and Markham, Andrew and Trigoni, Niki},
    • title = {Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images},
    • booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
    • year = 2023,
    • pages = {5496--5507},
    • month = dec,
    • url = {https://www.merl.com/publications/TR2023-144}
    • }
  •  Nair, N.G., Cherian, A., Lohit, S., Wang, Y., Koike-Akino, T., Patel, V.M., Marks, T.K., "Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis", IEEE International Conference on Computer Vision (ICCV), October 2023, pp. 20850-20860.
    BibTeX TR2023-126 PDF Software Presentation
    • @inproceedings{Nair2023sep,
    • author = {Nair, Nithin Gopalakrishnan and Cherian, Anoop and Lohit, Suhas and Wang, Ye and Koike-Akino, Toshiaki and Patel, Vishal M. and Marks, Tim K.},
    • title = {Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis},
    • booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
    • year = 2023,
    • pages = {20850--20860},
    • month = oct,
    • publisher = {IEEE/CVF},
    • url = {https://www.merl.com/publications/TR2023-126}
    • }
  •  Liu, X., Paul, S., Chatterjee, M., Cherian, A., "Active Sparse Conversations for Improved Audio-Visual Embodied Navigation", arXiv, June 2023.
    BibTeX arXiv
    • @article{Liu2023jun,
    • author = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
    • title = {Active Sparse Conversations for Improved Audio-Visual Embodied Navigation},
    • journal = {arXiv},
    • year = 2023,
    • month = jun,
    • url = {https://arxiv.org/abs/2306.04047}
    • }
  •  Cherian, A., Jain, S., Marks, T.K., Sullivan, A., "Discriminative 3D Shape Modeling for Few-Shot Instance Segmentation", IEEE International Conference on Robotics and Automation (ICRA), DOI: 10.1109/ICRA48891.2023.10160644, May 2023, pp. 9296-9302.
    BibTeX TR2023-010 PDF Presentation
    • @inproceedings{Cherian2023may,
    • author = {Cherian, Anoop and Jain, Siddarth and Marks, Tim K. and Sullivan, Alan},
    • title = {Discriminative 3D Shape Modeling for Few-Shot Instance Segmentation},
    • booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
    • year = 2023,
    • pages = {9296--9302},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICRA48891.2023.10160644},
    • url = {https://www.merl.com/publications/TR2023-010}
    • }
  •  Ota, K., Tung, H.-Y., Smith, K., Cherian, A., Marks, T.K., Sullivan, A., Kanezaki, A., Tenenbaum, J.B., "H-SAUR: Hypothesize, Simulate, Act, Update, and Repeat for Understanding Object Articulations from Interactions", IEEE International Conference on Robotics and Automation (ICRA), DOI: 10.1109/ICRA48891.2023.10160575, May 2023, pp. 7272-7278.
    BibTeX TR2023-009 PDF
    • @inproceedings{Ota2023may,
    • author = {Ota, Kei and Tung, Hsiao-Yu and Smith, Kevin and Cherian, Anoop and Marks, Tim K. and Sullivan, Alan and Kanezaki, Asako and Tenenbaum, Joshua B.},
    • title = {H-SAUR: Hypothesize, Simulate, Act, Update, and Repeat for Understanding Object Articulations from Interactions},
    • booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
    • year = 2023,
    • pages = {7272--7278},
    • month = may,
    • publisher = {IEEE},
    • doi = {10.1109/ICRA48891.2023.10160575},
    • url = {https://www.merl.com/publications/TR2023-009}
    • }
  •  Shah, A., Roy, A., Shah, K., Mishra, S.K., Jacobs, D., Cherian, A., Chellappa, R., "HaLP: Hallucinating Latent Positives for Skeleton-based Self-Supervised Learning of Actions", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), May 2023, pp. 18846-18856.
    BibTeX TR2023-035 PDF
    • @inproceedings{Shah2023may,
    • author = {Shah, Anshul and Roy, Aniket and Shah, Ketul and Mishra, Shlok Kumar and Jacobs, David and Cherian, Anoop and Chellappa, Rama},
    • title = {HaLP: Hallucinating Latent Positives for Skeleton-based Self-Supervised Learning of Actions},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2023,
    • pages = {18846--18856},
    • month = may,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2023-035}
    • }
  •  Zhang, J., Cherian, A., Liu, Y., Shabat, I.B., Rodriguez, C., Gould, S., "Aligning Step-by-Step Instructional Diagrams to Video Demonstrations", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), May 2023, pp. 2483-2492.
    BibTeX TR2023-034 PDF
    • @inproceedings{Zhang2023may,
    • author = {Zhang, Jiahao and Cherian, Anoop and Liu, Yanbin and Shabat, Itzik Ben and Rodriguez, Cristian and Gould, Stephen},
    • title = {Aligning Step-by-Step Instructional Diagrams to Video Demonstrations},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2023,
    • pages = {2483--2492},
    • month = may,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2023-034}
    • }
  •  Cherian, A., Peng, K.-C., Lohit, S., Smith, K., Tenenbaum, J.B., "Are Deep Neural Networks SMARTer than Second Graders?", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), March 2023, pp. 10834-10844.
    BibTeX TR2023-014 PDF Video Data Software Presentation
    • @inproceedings{Cherian2023mar,
    • author = {Cherian, Anoop and Peng, Kuan-Chuan and Lohit, Suhas and Smith, Kevin and Tenenbaum, Joshua B.},
    • title = {Are Deep Neural Networks SMARTer than Second Graders?},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2023,
    • pages = {10834--10844},
    • month = mar,
    • publisher = {CVF},
    • url = {https://www.merl.com/publications/TR2023-014}
    • }
  •  Liu, T., Cherian, A., "Learning a Constrained Optimizer: A Primal Method", AAAI Bridge on Constraint Programming and Machine Learning, January 2023.
    BibTeX TR2023-003 PDF
    • @inproceedings{Liu2023jan,
    • author = {Liu, Tao and Cherian, Anoop},
    • title = {Learning a Constrained Optimizer: A Primal Method},
    • booktitle = {AAAI Bridge on Constraint Programming and Machine Learning},
    • year = 2023,
    • month = jan,
    • url = {https://www.merl.com/publications/TR2023-003}
    • }
  •  Chatterjee, M., Ahuja, N., Cherian, A., "Learning Audio-Visual Dynamics Using Scene Graphs for Audio Source Separation", Advances in Neural Information Processing Systems (NeurIPS), November 2022.
    BibTeX TR2022-140 PDF Presentation
    • @inproceedings{Chatterjee2022nov,
    • author = {Chatterjee, Moitreya and Ahuja, Narendra and Cherian, Anoop},
    • title = {Learning Audio-Visual Dynamics Using Scene Graphs for Audio Source Separation},
    • booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
    • year = 2022,
    • month = nov,
    • url = {https://www.merl.com/publications/TR2022-140}
    • }
  •  Paul, S., Roy Chowdhury, A.K., Cherian, A., "AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments", Advances in Neural Information Processing Systems (NeurIPS), October 2022, pp. 6236-6249.
    BibTeX TR2022-131 PDF Video Data Software
    • @inproceedings{Paul2022oct2,
    • author = {Paul, Sudipta and Roy Chowdhury, Amit K. and Cherian, Anoop},
    • title = {AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments},
    • booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
    • year = 2022,
    • pages = {6236--6249},
    • month = oct,
    • url = {https://www.merl.com/publications/TR2022-131}
    • }
  •  Chatterjee, M., Ahuja, N., Cherian, A., "Quantifying Predictive Uncertainty for Stochastic Video Synthesis from Audio", IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), June 2022.
    BibTeX TR2022-082 PDF
    • @inproceedings{Chatterjee2022jun,
    • author = {Chatterjee, Moitreya and Ahuja, Narendra and Cherian, Anoop},
    • title = {Quantifying Predictive Uncertainty for Stochastic Video Synthesis from Audio},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)},
    • year = 2022,
    • month = jun,
    • url = {https://www.merl.com/publications/TR2022-082}
    • }
  •  Shah, A.P., Geng, S., Gao, P., Cherian, A., Hori, T., Marks, T.K., Le Roux, J., Hori, C., "Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), April 2022, pp. 7732-7736.
    BibTeX TR2022-019 PDF
    • @inproceedings{Shah2022apr,
    • author = {Shah, Ankit Parag and Geng, Shijie and Gao, Peng and Cherian, Anoop and Hori, Takaaki and Marks, Tim K. and Le Roux, Jonathan and Hori, Chiori},
    • title = {Audio-Visual Scene-Aware Dialog and Reasoning Using Audio-Visual Transformers with Joint Student-Teacher Learning},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2022,
    • pages = {7732--7736},
    • month = apr,
    • publisher = {IEEE},
    • issn = {1520-6149},
    • isbn = {978-1-6654-0540-9},
    • url = {https://www.merl.com/publications/TR2022-019}
    • }