{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T05:10:12Z","timestamp":1730697012603,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681622","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"8402-8411","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Detecting Multimodal Situations with Insufficient Context and Abstaining from Baseless Predictions"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0009-0002-1004-3998","authenticated-orcid":false,"given":"Junzhang","family":"Liu","sequence":"first","affiliation":[{"name":"Columbia University, New York, NY, USA"}]},{"ORCID":"http:\/\/orcid.org\/0009-0003-7785-4637","authenticated-orcid":false,"given":"Zhecan","family":"Wang","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}]},{"ORCID":"http:\/\/orcid.org\/0009-0005-9836-6039","authenticated-orcid":false,"given":"Hammad","family":"Ayyubi","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-7912-4035","authenticated-orcid":false,"given":"Haoxuan","family":"You","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-3226-396X","authenticated-orcid":false,"given":"Chris","family":"Thomas","sequence":"additional","affiliation":[{"name":"Virginia Tech, Blacksburg, VA, USA"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-9927-8392","authenticated-orcid":false,"given":"Rui","family":"Sun","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-1444-1205","authenticated-orcid":false,"given":"Shih-Fu","family":"Chang","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-5365-0072","authenticated-orcid":false,"given":"Kai-Wei","family":"Chang","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--540--76298-0_52"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00437"},{"key":"e_1_3_2_1_4_1","volume-title":"arxiv","author":"Biten Ali Furkan","year":"1904","unstructured":"Ali Furkan Biten, Lluis Gomez, Mar\u00e7al Rusi\u00f1ol, and Dimosthenis Karatzas. 2019. Good News, Everyone! Context driven entity-aware captioning for news images. arxiv: 1904.01475 [cs.CV]"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TEC.1957.5222035"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02303"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/5326.827457"},{"key":"e_1_3_2_1_8_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. 
arxiv","author":"Dosovitskiy Alexey","year":"2010","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arxiv: 2010.11929 [cs.CV]"},{"key":"e_1_3_2_1_9_1","article-title":"On the Foundations of Noise-free Selective Classification","volume":"11","author":"El-Yaniv Ran","year":"2010","unstructured":"Ran El-Yaniv and Yair Wiener. 2010. On the Foundations of Noise-free Selective Classification. Journal of Machine Learning Research, Vol. 11 (05 2010), 1605--1641.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 961--970","author":"Fabian Caba Heilbron Bernard Ghanem","year":"2015","unstructured":"Bernard Ghanem Fabian Caba Heilbron, Victor Escorcia and Juan Carlos Niebles. 2015. ActivityNet: A Large-Scale Video Benchmark for Human Activity Understanding. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 961--970."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8967670"},{"key":"e_1_3_2_1_13_1","volume-title":"PromptCap: Prompt-Guided Task-Aware Image Captioning. arXiv preprint arXiv:2211.09699","author":"Hu Yushi","year":"2022","unstructured":"Yushi Hu, Hang Hua, Zhengyuan Yang, Weijia Shi, Noah A Smith, and Jiebo Luo. 2022. PromptCap: Prompt-Guided Task-Aware Image Captioning. arXiv preprint arXiv:2211.09699 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02238"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337791"},{"key":"e_1_3_2_1_16_1","volume-title":"Pythia v0. 1: the winning entry to the vqa challenge","author":"Jiang Yu","year":"2018","unstructured":"Yu Jiang, Vivek Natarajan, Xinlei Chen, Marcus Rohrbach, Dhruv Batra, and Devi Parikh. 2018. Pythia v0. 1: the winning entry to the vqa challenge 2018. arXiv preprint arXiv:1807.09956 (2018)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.217"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_19_1","volume-title":"Switzerland)","author":"Lee Jaeyun","year":"2021","unstructured":"Jaeyun Lee and Incheol Kim. 2021. Vision--Language--Knowledge Co-Embedding for Visual Commonsense Reasoning. Sensors (Basel, Switzerland), Vol. 21 (2021). https:\/\/api.semanticscholar.org\/CorpusID:233462391"},{"key":"e_1_3_2_1_20_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv: 2301.12597 [cs.CV]"},{"key":"e_1_3_2_1_21_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arxiv: 2201.12086 [cs.CV]","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. 
arxiv: 2201.12086 [cs.CV]"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-61616-8_63"},{"key":"e_1_3_2_1_23_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Retrieval augmented visual question answering with outside knowledge. arXiv preprint arXiv:2210.03809","author":"Lin Weizhe","year":"2022","unstructured":"Weizhe Lin and Bill Byrne. 2022. Retrieval augmented visual question answering with outside knowledge. arXiv preprint arXiv:2210.03809 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"Medical visual question answering: A survey. Artificial Intelligence in Medicine","author":"Lin Zhihong","year":"2023","unstructured":"Zhihong Lin, Donghao Zhang, Qingyi Tao, Danli Shi, Gholamreza Haffari, Qi Wu, Mingguang He, and Zongyuan Ge. 2023. Medical visual question answering: A survey. Artificial Intelligence in Medicine (2023), 102611."},{"key":"e_1_3_2_1_26_1","volume-title":"Negative object presence evaluation (nope) to measure object hallucination in vision-language models. arXiv preprint arXiv:2310.05338","author":"Lovenia Holy","year":"2023","unstructured":"Holy Lovenia, Wenliang Dai, Samuel Cahyawijaya, Ziwei Ji, and Pascale Fung. 2023. Negative object presence evaluation (nope) to measure object hallucination in vision-language models. arXiv preprint arXiv:2310.05338 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1097"},{"key":"e_1_3_2_1_28_1","volume-title":"OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Marino Kenneth","year":"2019","unstructured":"Kenneth Marino, Mohammad Rastegari, Ali Farhadi, and Roozbeh Mottaghi. 2019. OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00301"},{"key":"e_1_3_2_1_30_1","volume-title":"VisualCOMET: Reasoning about the Dynamic Context of a Still Image. arxiv","author":"Park Jae Sung","year":"2004","unstructured":"Jae Sung Park, Chandra Bhagavatula, Roozbeh Mottaghi, Ali Farhadi, and Yejin Choi. 2020. VisualCOMET: Reasoning about the Dynamic Context of a Still Image. arxiv: 2004.10796 [cs.CV]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","unstructured":"Yury Puzis Yevgen Borodin Rami Puzis and Iv Ramakrishnan. 2013. Predictive web automation assistant for people with vision impairments. 1031--1040. https:\/\/doi.org\/10.1145\/2488388.2488478","DOI":"10.1145\/2488388.2488478"},{"key":"e_1_3_2_1_32_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. 
arxiv: 2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_33_1","volume-title":"Question relevance in VQA: identifying non-visual and false-premise questions. arXiv preprint arXiv:1606.06622","author":"Ray Arijit","year":"2016","unstructured":"Arijit Ray, Gordon Christie, Mohit Bansal, Dhruv Batra, and Devi Parikh. 2016. Question relevance in VQA: identifying non-visual and false-premise questions. arXiv preprint arXiv:1606.06622 (2016)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_2_1_36_1","volume-title":"Anand Mishra and Partha Pratim Talukdar","author":"Sanket Shah Naganand Yadati","year":"2019","unstructured":"Naganand Yadati Sanket Shah, Anand Mishra and Partha Pratim Talukdar. 2019. KVQA: Knowledge-Aware Visual Question Answering. In AAAI."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013027"},{"key":"e_1_3_2_1_38_1","volume-title":"A-okvqa: A benchmark for visual question answering using world knowledge. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-okvqa: A benchmark for visual question answering using world knowledge. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part VIII. Springer, 146--162."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Zhenwei Shao Zhou Yu Meng Wang and Jun Yu. 2023. Prompting Large Language Models with Answer Heuristics for Knowledge-based Visual Question Answering. In Computer Vision and Pattern Recognition (CVPR). 14974--14983.","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"e_1_3_2_1_40_1","volume-title":"KVL-BERT: Knowledge Enhanced Visual-and-Linguistic BERT for Visual Commonsense Reasoning. arxiv","author":"Song Dandan","year":"2012","unstructured":"Dandan Song, Siyi Ma, Zhanchen Sun, Sicheng Yang, and Lejian Liao. 2020. KVL-BERT: Knowledge Enhanced Visual-and-Linguistic BERT for Visual Commonsense Reasoning. arxiv: 2012.07000 [cs.AI]"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Robyn Speer Joshua Chin and Catherine Havasi. 2018. ConceptNet 5.5: An Open Multilingual Graph of General Knowledge. arxiv: 1612.03975 [cs.CL]","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"e_1_3_2_1_42_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. arxiv","author":"Su Weijie","year":"1908","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. arxiv: 1908.08530 [cs.CV]"},{"key":"e_1_3_2_1_43_1","volume-title":"Hoi","author":"Huat Tiong Anthony Meng","year":"2023","unstructured":"Anthony Meng Huat Tiong, Junnan Li, Boyang Li, Silvio Savarese, and Steven C. H. Hoi. 2023. Plug-and-Play VQA: Zero-shot VQA by Conjoining Large Pretrained Models with Zero Training. arxiv: 2210.08773 [cs.CV]"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"Andeep Toor Harry Wechsler and Michele Nappi. 2017. Question Part Relevance and Editing for Cooperative and Context-Aware VQA (C2VQA). 1--6. 
https:\/\/doi.org\/10.1145\/3095713.3095718","DOI":"10.1145\/3095713.3095718"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.02.013"},{"key":"e_1_3_2_1_46_1","volume-title":"Transform and Tell: Entity-Aware News Image Captioning. arxiv","author":"Tran Alasdair","year":"2004","unstructured":"Alasdair Tran, Alexander Mathews, and Lexing Xie. 2020. Transform and Tell: Entity-Aware News Image Captioning. arxiv: 2004.08070 [cs.CV]"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.158"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"e_1_3_2_1_49_1","volume-title":"OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. arxiv: 2202.03052 [cs.CV]","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. arxiv: 2202.03052 [cs.CV]"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_9"},{"key":"e_1_3_2_1_51_1","volume-title":"say, and segment: Teaching lmms to overcome false premises. arXiv preprint arXiv:2312.08366","author":"Wu Tsung-Han","year":"2023","unstructured":"Tsung-Han Wu, Giscard Biamby, David Chan, Lisa Dunlap, Ritwik Gupta, Xudong Wang, Joseph E Gonzalez, and Trevor Darrell. 2023. See, say, and segment: Teaching lmms to overcome false premises. arXiv preprint arXiv:2312.08366 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"e_1_3_2_1_54_1","volume-title":"From Recognition to Cognition: Visual Commonsense Reasoning. arxiv","author":"Zellers Rowan","year":"1811","unstructured":"Rowan Zellers, Yonatan Bisk, Ali Farhadi, and Yejin Choi. 2019. From Recognition to Cognition: Visual Commonsense Reasoning. arxiv: 1811.10830 [cs.CV]"},{"key":"e_1_3_2_1_55_1","volume-title":"SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference. arxiv","author":"Zellers Rowan","year":"2018","unstructured":"Rowan Zellers, Yonatan Bisk, Roy Schwartz, and Yejin Choi. 2018. SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference. arxiv: 1808.05326 [cs.CL]"},{"key":"e_1_3_2_1_56_1","volume-title":"Jize Cao, Ali Farhadi, and Yejin Choi.","author":"Zellers Rowan","year":"2021","unstructured":"Rowan Zellers, Ximing Lu, Jack Hessel, Youngjae Yu, Jae Sung Park, Jize Cao, Ali Farhadi, and Yejin Choi. 2021. MERLOT: Multimodal Neural Script Knowledge Models. arxiv: 2106.02636 [cs.CV]"},{"key":"e_1_3_2_1_57_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arxiv: 2304.10592 [cs.CV]"},{"key":"e_1_3_2_1_58_1","unstructured":"Matej Zorec Tim Carrington and Matija Marolt. 2020. Visual Assistant-TV for the visually impaired. In ICCHP. 
43."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681622","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T04:29:48Z","timestamp":1730694588000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681622"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3681622","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681622","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}