Article(id=1249044011021964066, tenantId=1146029695717560320, journalId=1249024232475115590, issueId=1249044006114628363, articleNumber=null, orderNo=null, doi=10.11834/jig.240535, pmid=null, cstr=null, oa=null, hot=null, price=null, onlineType=0, articleFormat=0, articleType=null, articleTypeStr=null, receivedDate=1725552000000, receivedDateStr=2024-09-06, revisedDate=1745078400000, revisedDateStr=2025-04-20, acceptedDate=null, acceptedDateStr=null, onlineDate=1775724898343, onlineDateStr=2026-04-09, pubDate=1765814400000, pubDateStr=2025-12-16, doiRegisterDate=null, doiRegisterDateStr=null, onlineIssueDate=1775724898343, onlineIssueDateStr=2026-04-09, onlineJustAcceptDate=null, onlineJustAcceptDateStr=null, onlineFirstDate=null, onlineFirstDateStr=null, sourceXml=null, magXml=null, createTime=1775724898342, creator=13041195026, updateTime=1775724898342, updator=13041195026, issue=Issue{id=1249044006114628363, tenantId=1146029695717560320, journalId=1249024232475115590, year='2025', volume='30', issue='12', pageStart='3707', pageEnd='3968', issueExtLink='null', onlineDate='null', pubDate='null', beforeIssueId=null, nextIssueId=null, price=null, status=1, issueComplete=1, articleOrder=1, issueType=1, specialIssue=null, createTime=1775724897161, creator=13041195026, updateTime=1775726353303, updator=13041195026, preIssue=null, nextIssue=null, ext={EN=IssueExt(id=1249050113662984471, tenantId=1146029695717560320, journalId=1249024232475115590, issueId=1249044006114628363, language=EN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=), CN=IssueExt(id=1249050113667178776, tenantId=1146029695717560320, journalId=1249024232475115590, issueId=1249044006114628363, language=CN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=)}, issueFiles=null}, startPage=3760, endPage=3781, ext={EN=ArticleExt(id=1249044011311371045, articleId=1249044011021964066, tenantId=1146029695717560320, 
journalId=1249024232475115590, language=EN, title=Video question answering with large language models: a survey, columnId=1249044010699002654, journalTitle=Journal of Image and Graphics, columnName=Review, runingTitle=null, highlight=null, articleAbstract=
In recent years, large language models (LLMs) have achieved remarkable progress in natural language processing (NLP), demonstrating exceptional capabilities in language understanding and generation. These advancements have driven widespread applications in tasks such as text generation, machine translation, question answering, text summarization, and text classification. However, despite their impressive performance in handling and generating text, LLMs face notable limitations when handling highly complex multimodal tasks, particularly in the domain of video question answering (Video QA). Video QA is a particularly challenging task that requires models to comprehend and generate responses based on dynamic visual content, which often includes temporal and auditory information. Unlike static images or purely textual contents, video data contains inherent temporal dependencies, where the meaning of events and actions unfolds over time. This temporal dimension adds substantial complexity to the understanding process because models must not only interpret individual frames but also maintain coherent understanding across sequences of frames within the broader video context. Consequently, effective Video QA demands advanced temporal information processing capabilities that many LLMs, primarily designed for static text, often struggle to handle adequately. Moreover, the multimodal nature of video, which often involves the integration of visual, auditory, and occasionally textual cues, further complicates the task. Effective Video QA requires the model to seamlessly fuse information across these different modalities, ensuring accurate interpretation and response to questions regarding video content. This process involves understanding visual scenes, recognizing speech or background sounds, and correlating them with the corresponding textual information. 
The challenge lies not only in processing each modality independently but also in establishing meaningful connections between them to generate coherent and contextually appropriate responses. This paper presents a comprehensive review of the current state of research on Video QA models based on large language models. The technical characteristics, strengths, and weaknesses of non-real-time and real-time Video QA models are also investigated. Non-real-time Video QA models typically operate on pre-recorded video content, allowing them to access and analyze the entire video sequence before generating responses. These models can leverage global contextual information, making them particularly effective for tasks that require video content analysis, such as video summarization or detailed scene interpretation. However, they may struggle with efficiency and scalability, particularly when handling long videos or large datasets. In contrast, real-time Video QA models are designed to process video streams as they are received, increasing their suitability for applications requiring immediate responses, such as live video monitoring or interactive video systems. However, these models must maintain a balance between processing speed and accuracy because they frequently have limited access to the full temporal context of the video. The paper discusses the challenges encountered by these models in maintaining performance under real-time constraints, including efficient computation and prediction capability based on partial information. Additionally, the paper explores the commonly used datasets in Video QA research, highlighting their features, limitations, and the types of tasks they are designed to address. The evaluation of Video QA models is also examined, focusing on the metrics and benchmarks used to assess their performance. 
Understanding the strengths and weaknesses of different datasets is crucial for advancing the field, helping in the identification of gaps in current research and guiding the development of robust and versatile models. Finally, the paper addresses the extensive challenges and bottlenecks in the field of Video QA, including the difficulties in scaling models to handle large and diverse video datasets, the need for efficient multimodal fusion techniques, and the computational demands associated with real-time video data processing. The discussion is further extended to consider the potential future research directions in Video QA, with particular emphasis on improving the temporal reasoning capabilities of LLMs, enhancing their multimodal integration, and developing efficient model architectures that can operate effectively under resource constraints. Overall, while large language models have presented new possibilities in the field of video interpretation, considerable challenges remain in adapting these models to the specific demands of Video QA. Through the systematic review of the current advancements and the presentation of the key obstacles and future directions, this paper aims to contribute to the ongoing efforts to develop highly capable and intelligent multimodal AI systems. The field must continue to innovate in the following areas: temporal modeling, where novel architectures that can effectively capture long-range dependencies in video sequences are needed; and multimodal representation learning, where sophisticated approaches for integrating visual, auditory, and textual features could yield substantial improvements. Furthermore, the development of highly efficient training paradigms that can address the computational intensity of video processing while retaining model performance is essential for practical applications. 
Another critical area for future work focuses on the creation of highly comprehensive and challenging benchmark datasets that effectively reflect real-world scenarios, pushing the boundaries of what current models can achieve. As research in this area progresses, addressing these challenges will be crucial for realizing the full potential of LLMs in video interpretation applications. Achieving this goal will require AI systems that can interpret and reason about dynamic visual content with a level of proficiency comparable to human cognition. The integration of advanced techniques from computer vision, speech processing, and natural language understanding will be pivotal in developing truly multimodal systems capable of managing the complexity and variability in real-world video data. Through continued innovation and interdisciplinary collaboration, the field can overcome current limitations and drive the development of next-generation video understanding technologies with broad applicability across domains such as education, entertainment, surveillance, and human-computer interaction.
, correspAuthors=Guanbin Li, authorNote=null, correspAuthorsNote=null, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=null, magXml=null, pdfUrl=null, pdf=null, pdfFileSize=null, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=null, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=null, mapNumber=null, authorCompany=null, fund=null, authors=null, authorsList=Junlin Xie, Ruifei Zhang, Guanbin Li), CN=ArticleExt(id=1249044022266892328, articleId=1249044011021964066, tenantId=1146029695717560320, journalId=1249024232475115590, language=CN, title=大语言模型下的视频问答方法综述, columnId=1249044010992603937, journalTitle=中国图象图形学报, columnName=综述, runingTitle=null, highlight=null, articleAbstract=
大语言模型在自然语言处理领域取得显著进展,展现出卓越的语言理解和生成能力。然而,尽管这些模型在文本处理方面表现出色,但在应对复杂多模态任务时,尤其在视频问答领域局限性逐渐显现。视频作为一种动态的视觉模态,具有显著的时序依赖性和跨模态信息融合的复杂性,对模型的时序信息处理能力和计算效率提出更高的要求。本文系统回顾基于大语言模型的视频问答模型的研究进展,详细分析非实时视频问答模型与实时视频问答模型的技术特点、优势及其在不同应用场景中的表现。同时,探讨了现有研究中常用的数据集及其评测标准,并总结了当前技术面临的挑战与瓶颈。在此基础上,对未来视频问答模型的发展方向进行前瞻性展望,旨在推动多模态人工智能的进一步发展与应用。
, correspAuthors=李冠彬, authorNote=null, correspAuthorsNote=
, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=YUYZuFiqDA4sVn8JPqk3jw==, magXml=Gk77myihG7IhpnQiUkm1fA==, pdfUrl=null, pdf=mG4ZwWAAtDmHXXBlmiulYA==, pdfFileSize=2503704, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=G32E8gveFz/z5H+3BCyRmw==, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=3f6VbltraIYxlWzJ3gcPfQ==, mapNumber=null, authorCompany=null, fund=null, authors=
, authorsList=谢君琳, 张锐斐, 李冠彬)}, authors=[Author(id=1249322281642693155, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, orderNo=0, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=223010150@link.cuhk.edu.cn, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1249322281705607718, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, authorId=1249322281642693155, language=EN, stringName=Junlin Xie, firstName=Junlin, middleName=null, lastName=Xie, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, address=
1School of Science and Engineering, The Chinese University of Hong Kong (Shenzhen), Shenzhen 518116, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1249322281776910887, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, authorId=1249322281642693155, language=CN, stringName=谢君琳, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, address=
1香港中文大学(深圳)理工学院,深圳518116, bio={"content":"
谢君琳,女,博士研究生,主要研究方向为多模态大模型。E-mail: 223010150@link.cuhk.edu.cn
"}, bioImg=null, bioContent=
谢君琳,女,博士研究生,主要研究方向为多模态大模型。E-mail: 223010150@link.cuhk.edu.cn
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1249322281500086811, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, xref=1, ext=[AuthorCompanyExt(id=1249322281508475420, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281500086811, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1School of Science and Engineering, The Chinese University of Hong Kong (Shenzhen), Shenzhen 518116, China), AuthorCompanyExt(id=1249322281516864029, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281500086811, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1香港中文大学(深圳)理工学院,深圳518116)])]), Author(id=1249322281844019753, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, orderNo=1, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=liguanbin@mail.sysu.edu.cn, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1249322281923711531, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, authorId=1249322281844019753, language=EN, stringName=Ruifei Zhang, firstName=Ruifei, middleName=null, lastName=Zhang, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, address=
1School of Science and Engineering, The Chinese University of Hong Kong (Shenzhen), Shenzhen 518116, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1249322281999209004, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, authorId=1249322281844019753, language=CN, stringName=张锐斐, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, address=
1香港中文大学(深圳)理工学院,深圳518116, bio={"content":"
李冠彬,通信作者,男,教授,主要研究方向为视觉理解与生成、多模态大模型、具身智能。E-mail:liguanbin@mail.sysu.edu.cn
"}, bioImg=null, bioContent=
李冠彬,通信作者,男,教授,主要研究方向为视觉理解与生成、多模态大模型、具身智能。E-mail:liguanbin@mail.sysu.edu.cn
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1249322281500086811, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, xref=1, ext=[AuthorCompanyExt(id=1249322281508475420, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281500086811, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1School of Science and Engineering, The Chinese University of Hong Kong (Shenzhen), Shenzhen 518116, China), AuthorCompanyExt(id=1249322281516864029, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281500086811, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1香港中文大学(深圳)理工学院,深圳518116)])]), Author(id=1249322282066317870, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, orderNo=2, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=223010140@link.cuhk.edu.cn, emailSecond=null, emailThird=null, correspondingAuthor=1, authorType=1, ext={EN=AuthorExt(id=1249322282133426736, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, authorId=1249322282066317870, language=EN, stringName=Guanbin Li, firstName=Guanbin, middleName=null, lastName=Li, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
2, *, address=
2School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou 510006, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1249322282200535601, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, authorId=1249322282066317870, language=CN, stringName=李冠彬, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
2, *, address=
2中山大学计算机学院,广州510006, bio={"content":"
张锐斐,男,博士研究生,主要研究方向为多模态大模型。E-mail: 223010140@link.cuhk.edu.cn
"}, bioImg=null, bioContent=
张锐斐,男,博士研究生,主要研究方向为多模态大模型。E-mail: 223010140@link.cuhk.edu.cn
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1249322281575584286, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, xref=2, ext=[AuthorCompanyExt(id=1249322281579778591, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281575584286, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou 510006, China), AuthorCompanyExt(id=1249322281588167200, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281575584286, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2中山大学计算机学院,广州510006)])])], keywords=[Keyword(id=1249322282368307762, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, orderNo=1, keyword=large language models(LLMs)), Keyword(id=1249322282431222323, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, orderNo=2, keyword=video question answering(Video QA)), Keyword(id=1249322282485748276, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, orderNo=3, keyword=multimodal information fusion), Keyword(id=1249322282544468533, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, orderNo=4, keyword=temporal information processing), Keyword(id=1249322282607383094, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, orderNo=5, keyword=video understanding), Keyword(id=1249322282670297655, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, orderNo=1, keyword=大语言模型(LLMs)), Keyword(id=1249322282741600824, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, orderNo=2, keyword=视频问答(Video QA)), Keyword(id=1249322282796126777, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, orderNo=3, keyword=多模态信息融合), Keyword(id=1249322282850652730, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, orderNo=4, keyword=时序信息处理), Keyword(id=1249322282930344507, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, orderNo=5, keyword=视频理解)], refs=[Reference(id=1249322285237211716, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, 
issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=0, authorNames=Achiam J, Adler S, Agarwal S, Ahmad L, Akkaya I, Aleman F L, Almeida D, Altenschmidt J, Altman S, Anadkat S, Avila R, Babuschkin I, Balaji S, Balcom V, Baltescu P, Bao H M, Bavarian M, Belgum J, Bello I, Berdine J, Bernadett-Shapiro G, Berner C, Bogdonoff L, Boiko O, Boyd M, Brakman A L, Brockman G, Brooks T, Brundage M, Button K, Cai T, Campbell R, Cann A, Carey B, Carlson C, Carmichael R, Chan B, Chang C, Chantzis F, Chen D, Chen S, Chen R, Chen J, Chen M, Chess B, Cho C, Chu C, Chung H W, Cummings D, Currier J, Dai Y X, Decareaux C, Degry T, Deutsch N, Deville D, Dhar A, Dohan D, Dowling S, Dunning S, Ecoffet A, Eleti A, Eloundou T, Farhi D, Fedus L, Felix N, Fishman S P, Forte J, Fulford I, Gao L, Georges E, Gibson C, Goel V, Gogineni T, Goh G, Gontijo-Lopes R, Gordon J, Grafstein M, Gray S, Greene R, Gross J, Gu S S, Guo Y F, Hallacy C, Han J, Harris J, He Y C, Heaton M, Heidecke J, Hesse C, Hickey A, Hickey W, Hoeschele P, Houghton B, Hsu K, Hu S L, Hu X, Huizinga J, Jain S, Jai S, Jang J, Jiang A, Jiang R, Jin H Z, Jin D, Jomoto S, Jonn B, Jun H, Kaftan T, Kaiser Ł, Kamali A, Kanitscheider I, Keskar N S, Khan T, Kilpatrick L, Kim J W, Kim C, Kim Y, Kirchner J H, Kiros J, Knight M, Kokotajlo D, Kondraciuk Ł, Kondrich A, Konstantinidis A, Kosic K, Krueger G, Kuo V, Lampe M, Lan I, Lee T, Leike J, Leung J, Levy D, Li C M, Lim R, Lin M, Lin S, Litwin M, Lopez T, Lowe R, Lue P, Makanju A, Malfacini K, Manning S, Markov T, Markovski Y, Martin B, Mayer K, Mayne A, McGrew B, McKinney S M, McLeavey C, McMillan P, McNeil J, Medina D, Mehta A, Menick J, Metz L, Mishchenko A, Mishkin P, Monaco V, Morikawa E, Mossing D, Mu T, Murati M, Murk O, Mély D, Nair A, Nakano R, Nayak R, Neelakantan A, Ngo R, Noh H, Ouyang L, O’Keefe C, Pachocki J, Paino A, Palermo J, Pantuliano A, Parascandolo G, Parish J, Parparita E, Passos A, Pavlov M, Peng A, Perelman A, de Avila Belbute 
Peres F, Petrov M, de Oliveira Pinto H P, Pokorny M, Pokrass M, Pong V H, Powell T, Power A, Power B, Proehl E, Puri R, Radford A, Rae J, Ramesh A, Raymond C, Real F, Rimbach K, Ross C, Rotsted B, Roussez H, Ryder N, Saltarelli M, Sanders T, Santurkar S, Sastry G, Schmidt H, Schnurr D, Schulman J, Selsam D, Sheppard K, Sherbakov T, Shieh J, Shoker S, Shyam P, Sidor S, Sigler E, Simens M, Sitkin J, Slama K, Sohl I, Sokolowsky B, Song Y, Staudacher N, Such F P, Summers N, Sutskever I, Tang J, Tezak N, Thompson M B, Tillet P, Tootoonchian A, Tseng E, Tuggle P, Turley N, Tworek J, Uribe J F C, Vallone A, Vijayvergiya A, Voss C, Wainwright C, Wang J J, Wang A, Wang B, Ward J, Wei J, Weinmann C J, Welihinda A, Welinder P, Weng J Y, Weng L L, Wiethoff M, Willner D, Winter C, Wolrich S, Wong H, Workman L, Wu S, Wu J, Wu M, Xiao K, Xu T, Yoo S, Yu K, Yuan Q M, Zaremba W, Zellers R, Zhang C, Zhang M, Zhao S J, Zheng T H, Zhuang J, Zhuk W, Zoph B, journalName=null, refType=null, unstructuredReference=
Achiam J,
Adler S,
Agarwal S,
Ahmad L,
Akkaya I,
Aleman F L,
Almeida D,
Altenschmidt J,
Altman S,
Anadkat S,
Avila R,
Babuschkin I,
Balaji S,
Balcom V,
Baltescu P,
Bao H M,
Bavarian M,
Belgum J,
Bello I,
Berdine J,
Bernadett-Shapiro G,
Berner C,
Bogdonoff L,
Boiko O,
Boyd M,
Brakman A L,
Brockman G,
Brooks T,
Brundage M,
Button K,
Cai T,
Campbell R,
Cann A,
Carey B,
Carlson C,
Carmichael R,
Chan B,
Chang C,
Chantzis F,
Chen D,
Chen S,
Chen R,
Chen J,
Chen M,
Chess B,
Cho C,
Chu C,
Chung H W,
Cummings D,
Currier J,
Dai Y X,
Decareaux C,
Degry T,
Deutsch N,
Deville D,
Dhar A,
Dohan D,
Dowling S,
Dunning S,
Ecoffet A,
Eleti A,
Eloundou T,
Farhi D,
Fedus L,
Felix N,
Fishman S P,
Forte J,
Fulford I,
Gao L,
Georges E,
Gibson C,
Goel V,
Gogineni T,
Goh G,
Gontijo-Lopes R,
Gordon J,
Grafstein M,
Gray S,
Greene R,
Gross J,
Gu S S,
Guo Y F,
Hallacy C,
Han J,
Harris J,
He Y C,
Heaton M,
Heidecke J,
Hesse C,
Hickey A,
Hickey W,
Hoeschele P,
Houghton B,
Hsu K,
Hu S L,
Hu X,
Huizinga J,
Jain S,
Jain S,
Jang J,
Jiang A,
Jiang R,
Jin H Z,
Jin D,
Jomoto S,
Jonn B,
Jun H,
Kaftan T,
Kaiser Ł,
Kamali A,
Kanitscheider I,
Keskar N S,
Khan T,
Kilpatrick L,
Kim J W,
Kim C,
Kim Y,
Kirchner J H,
Kiros J,
Knight M,
Kokotajlo D,
Kondraciuk Ł,
Kondrich A,
Konstantinidis A,
Kosic K,
Krueger G,
Kuo V,
Lampe M,
Lan I,
Lee T,
Leike J,
Leung J,
Levy D,
Li C M,
Lim R,
Lin M,
Lin S,
Litwin M,
Lopez T,
Lowe R,
Lue P,
Makanju A,
Malfacini K,
Manning S,
Markov T,
Markovski Y,
Martin B,
Mayer K,
Mayne A,
McGrew B,
McKinney S M,
McLeavey C,
McMillan P,
McNeil J,
Medina D,
Mehta A,
Menick J,
Metz L,
Mishchenko A,
Mishkin P,
Monaco V,
Morikawa E,
Mossing D,
Mu T,
Murati M,
Murk O,
Mély D,
Nair A,
Nakano R,
Nayak R,
Neelakantan A,
Ngo R,
Noh H,
Ouyang L,
O’Keefe C,
Pachocki J,
Paino A,
Palermo J,
Pantuliano A,
Parascandolo G,
Parish J,
Parparita E,
Passos A,
Pavlov M,
Peng A,
Perelman A,
de Avila Belbute Peres F,
Petrov M,
de Oliveira Pinto H P,
Pokorny M,
Pokrass M,
Pong V H,
Powell T,
Power A,
Power B,
Proehl E,
Puri R,
Radford A,
Rae J,
Ramesh A,
Raymond C,
Real F,
Rimbach K,
Ross C,
Rotsted B,
Roussez H,
Ryder N,
Saltarelli M,
Sanders T,
Santurkar S,
Sastry G,
Schmidt H,
Schnurr D,
Schulman J,
Selsam D,
Sheppard K,
Sherbakov T,
Shieh J,
Shoker S,
Shyam P,
Sidor S,
Sigler E,
Simens M,
Sitkin J,
Slama K,
Sohl I,
Sokolowsky B,
Song Y,
Staudacher N,
Such F P,
Summers N,
Sutskever I,
Tang J,
Tezak N,
Thompson M B,
Tillet P,
Tootoonchian A,
Tseng E,
Tuggle P,
Turley N,
Tworek J,
Uribe J F C,
Vallone A,
Vijayvergiya A,
Voss C,
Wainwright C,
Wang J J,
Wang A,
Wang B,
Ward J,
Wei J,
Weinmann C J,
Welihinda A,
Welinder P,
Weng J Y,
Weng L L,
Wiethoff M,
Willner D,
Winter C,
Wolrich S,
Wong H,
Workman L,
Wu S,
Wu J,
Wu M,
Xiao K,
Xu T,
Yoo S,
Yu K,
Yuan Q M,
Zaremba W,
Zellers R,
Zhang C,
Zhang M,
Zhao S J,
Zheng T H,
Zhuang J,
Zhuk W and
Zoph B.
2024. GPT-4 Technical Report [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2303.08774.pdf, articleTitle=GPT-4 Technical Report, refAbstract=null), Reference(id=1249322285350457925, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=1, authorNames=Anil R, Dai A M, Firat O, Johnson M, Lepikhin D, Passos A, Shakeri S, Taropa E, Bailey P, Chen Z F, Chu E, Clark J H, El Shafey L, Huang Y P, Meier-Hellstern K, Mishra G, Moreira E, Omernick M, Robinson K, Ruder S, Tay Y, Xiao K F, Xu Y Z, Zhang Y J, Abrego G H, Ahn J, Austin J, Barham P, Botha J, Bradbury J, Brahma S, Brooks K, Catasta M, Cheng Y, Cherry C, Choquette-Choo C A, Chowdhery A, Crepy C, Dave S, Dehghani M, Dev S, Devlin J, Díaz M, Du N, Dyer E, Feinberg V, Feng F, Fienberg V, Freitag M, Garcia X, Gehrmann S, Gonzalez L, Gur-Ari G, Hand S, Hashemi H, Hou L, Howland J, Hu A, Hui J, Hurwitz J, Isard M, Ittycheriah A, Jagielski M, Jia W H, Kenealy K, Krikun M, Kudugunta S, Lan C, Lee K, Lee B, Li E, Li M, Li W, Li Y G, Li J, Lim H, Lin H Z, Liu Z T, Liu F, Maggioni M, Mahendru A, Maynez J, Misra V, Moussalem M, Nado Z, Nham J, Ni E, Nystrom A, Parrish A, Pellat M, Polacek M, Polozov A, Pope R, Qiao S Y, Reif E, Richter B, Riley P, Ros A C, Roy A, Saeta B, Samuel R, Shelby R, Slone A, Smilkov D, So D R, Sohn D, Tokumine S, Valter D, Vasudevan V, Vodrahalli K, Wang X Z, Wang P D, Wang Z R, Wang T, Wieting J, Wu Y H, Xu K, Xu Y H, Xue L T, Yin P C, Yu J H, Zhang Q, Zheng S, Zheng C, Zhou W K, Zhou D, Petrov S, Wu Y H, journalName=null, refType=null, unstructuredReference=
Anil R,
Dai A M,
Firat O,
Johnson M,
Lepikhin D,
Passos A,
Shakeri S,
Taropa E,
Bailey P,
Chen Z F,
Chu E,
Clark J H,
El Shafey L,
Huang Y P,
Meier-Hellstern K,
Mishra G,
Moreira E,
Omernick M,
Robinson K,
Ruder S,
Tay Y,
Xiao K F,
Xu Y Z,
Zhang Y J,
Abrego G H,
Ahn J,
Austin J,
Barham P,
Botha J,
Bradbury J,
Brahma S,
Brooks K,
Catasta M,
Cheng Y,
Cherry C,
Choquette-Choo C A,
Chowdhery A,
Crepy C,
Dave S,
Dehghani M,
Dev S,
Devlin J,
Díaz M,
Du N,
Dyer E,
Feinberg V,
Feng F,
Fienberg V,
Freitag M,
Garcia X,
Gehrmann S,
Gonzalez L,
Gur-Ari G,
Hand S,
Hashemi H,
Hou L,
Howland J,
Hu A,
Hui J,
Hurwitz J,
Isard M,
Ittycheriah A,
Jagielski M,
Jia W H,
Kenealy K,
Krikun M,
Kudugunta S,
Lan C,
Lee K,
Lee B,
Li E,
Li M,
Li W,
Li Y G,
Li J,
Lim H,
Lin H Z,
Liu Z T,
Liu F,
Maggioni M,
Mahendru A,
Maynez J,
Misra V,
Moussalem M,
Nado Z,
Nham J,
Ni E,
Nystrom A,
Parrish A,
Pellat M,
Polacek M,
Polozov A,
Pope R,
Qiao S Y,
Reif E,
Richter B,
Riley P,
Ros A C,
Roy A,
Saeta B,
Samuel R,
Shelby R,
Slone A,
Smilkov D,
So D R,
Sohn D,
Tokumine S,
Valter D,
Vasudevan V,
Vodrahalli K,
Wang X Z,
Wang P D,
Wang Z R,
Wang T,
Wieting J,
Wu Y H,
Xu K,
Xu Y H,
Xue L T,
Yin P C,
Yu J H,
Zhang Q,
Zheng S,
Zheng C,
Zhou W K,
Zhou D,
Petrov S and
Wu Y H.
2023. PaLM 2 Technical Report [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2305.10403.pdf, articleTitle=PaLM 2 Technical Report, refAbstract=null), Reference(id=1249322285430149702, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=1708, pageEnd=1718, url=null, language=null, rfNumber=null, rfOrder=2, authorNames=Bain M, Nagrani A, Varol G, Zisserman A, journalName=null, refType=null, unstructuredReference=
Bain M,
Nagrani A,
Varol G and
Zisserman A.
2021. Frozen in time: a joint video and image encoder for end-to-end retrieval//Proceedings of 2021 IEEE/CVF International Conference on Computer Vision. Montreal, Canada: IEEE:1708-1718 [DOI:
10.1109/ICCV48922.2021.00175], articleTitle=Frozen in time: a joint video and image encoder for end-to-end retrieval, refAbstract=null), Reference(id=1249322285555978823, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024a, volume=null, issue=null, pageStart=18407, pageEnd=18418, url=null, language=null, rfNumber=null, rfOrder=3, authorNames=Chen J, Lyu Z Y, Wu S W, Lin K Q, Song C N, Gao D F, Liu J W, Gao Z T, Mao D X, Shou M Z, journalName=null, refType=null, unstructuredReference=
Chen J,
Lyu Z Y,
Wu S W,
Lin K Q,
Song C N,
Gao D F,
Liu J W,
Gao Z T,
Mao D X and
Shou M Z.
2024a. VideoLLM-online: online video large language model for streaming video//Proceedings of 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle, USA: IEEE:18407-18418 [DOI:
10.1109/CVPR52733.2024.01742], articleTitle=VideoLLM-online: online video large language model for streaming video, refAbstract=null), Reference(id=1249322285660836424, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024b, volume=null, issue=null, pageStart=#614, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=4, authorNames=Chen L, Wei X L, Li J S, Dong X Y, Zhang P, Zang Y H, Chen Z H, Duan H D, Lin B, Tang Z Y, Yuan L, Qiao Y, Lin D H, Zhao F, Wang J Q, journalName=null, refType=null, unstructuredReference=
Chen L,
Wei X L,
Li J S,
Dong X Y,
Zhang P,
Zang Y H,
Chen Z H,
Duan H D,
Lin B,
Tang Z Y,
Yuan L,
Qiao Y,
Lin D H,
Zhao F and
Wang J Q.
2024b. ShareGPT4Video: improving video understanding and generation with better captions//Proceedings of the 38th International Conference on Neural Information Processing Systems. Vancouver, Canada: Curran Associates Inc.:#614, articleTitle=ShareGPT4Video: improving video understanding and generation with better captions, refAbstract=null), Reference(id=1249322285761499721, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=1999, pageEnd=2007, url=null, language=null, rfNumber=null, rfOrder=5, authorNames=Fan C Y, Zhang X F, Zhang S, Wang W S, Zhang C, Huang H, journalName=null, refType=null, unstructuredReference=
Fan C Y,
Zhang X F,
Zhang S,
Wang W S,
Zhang C and
Huang H.
2019. Heterogeneous memory enhanced multimodal attention model for video question answering//Proceedings of 2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Long Beach, USA: IEEE:1999-2007 [DOI:
10.1109/CVPR.2019.00210], articleTitle=Heterogeneous memory enhanced multimodal attention model for video question answering, refAbstract=null), Reference(id=1249322285878940234, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=3783, pageEnd=3786, url=null, language=null, rfNumber=null, rfOrder=6, authorNames=Fan H Q, Murrell T, Wang H, Alwala K V, Li Y H, Li Y L, Xiong B, Ravi N, Li M, Yang H C, Malik J, Girshick R, Feiszli M, Adcock A, Lo W Y, Feichtenhofer C, journalName=null, refType=null, unstructuredReference=
Fan H Q,
Murrell T,
Wang H,
Alwala K V,
Li Y H,
Li Y L,
Xiong B,
Ravi N,
Li M,
Yang H C,
Malik J,
Girshick R,
Feiszli M,
Adcock A,
Lo W Y and
Feichtenhofer C.
2021. PyTorchVideo: a deep learning library for video understanding//Proceedings of the 29th ACM International Conference on Multimedia. [s.l.]: Association for Computing Machinery:3783-3786 [DOI:
10.1145/3474085.3478329], articleTitle=PyTorchVideo: a deep learning library for video understanding, refAbstract=null), Reference(id=1249322285962826315, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=75, pageEnd=92, url=null, language=null, rfNumber=null, rfOrder=7, authorNames=Fan Y, Ma X J, Wu R J, Du Y T, Li J Q, Gao Z, Li Q, journalName=null, refType=null, unstructuredReference=
Fan Y,
Ma X J,
Wu R J,
Du Y T,
Li J Q,
Gao Z and
Li Q.
2024. VideoAgent: a memory-augmented multimodal agent for video understanding//Proceedings of the 18th European Conference on Computer Vision. Milan, Italy: Springer:75-92 [DOI:
10.1007/978-3-031-72670-5_5], articleTitle=VideoAgent: a memory-augmented multimodal agent for video understanding, refAbstract=null), Reference(id=1249322286017352268, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=3922, pageEnd=3938, url=null, language=null, rfNumber=null, rfOrder=8, authorNames=Feng Z P, Zhang Y, Li H, Wu B, Liao J Y, Liu W Q, Lang J, Feng Y, Wu J, Liu Z Z, journalName=Findings of the Association for Computational Linguistics, refType=null, unstructuredReference=
Feng Z P,
Zhang Y,
Li H,
Wu B,
Liao J Y,
Liu W Q,
Lang J,
Feng Y,
Wu J and
Liu Z Z.
2025. TEaR: improving LLM-based machine translation with systematic self-refinement//
Findings of the Association for Computational Linguistics. Albuquerque, New Mexico, USA: Association for Computational Linguistics:3922-3938 [DOI:
10.18653/v1/2025.findings-naacl.218], articleTitle=TEaR: improving LLM-based machine translation with systematic self-refinement, refAbstract=null), Reference(id=1249322286071878221, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=9, authorNames=Fu C Y, Dai Y H, Luo Y D, Li L, Ren S H, Zhang R R, Zhou C Y, Shen Y H, Zhang M D, Chen P X, Li Y W, Lin S H, Zhao S R, Li K, Xu T, Zheng X W, Chen E H, Shan C F, He R, Sun X, journalName=null, refType=null, unstructuredReference=
Fu C Y,
Dai Y H,
Luo Y D,
Li L,
Ren S H,
Zhang R R,
Zhou C Y,
Shen Y H,
Zhang M D,
Chen P X,
Li Y W,
Lin S H,
Zhao S R,
Li K,
Xu T,
Zheng X W,
Chen E H,
Shan C F,
He R and
Sun X.
2024. Video-MME: the first-ever comprehensive evaluation benchmark of multi-modal LLMs in video analysis [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2405.21075.pdf, articleTitle=Video-MME: the first-ever comprehensive evaluation benchmark of multi-modal LLMs in video analysis, refAbstract=null), Reference(id=1249322286138987086, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=638, pageEnd=642, url=null, language=null, rfNumber=null, rfOrder=10, authorNames=Fu R G, Li B, Gao Y H, Wang P, journalName=null, refType=null, unstructuredReference=
Fu R G,
Li B,
Gao Y H and
Wang P.
2016. Content-based image retrieval based on CNN and SVM//Proceedings of the 2nd IEEE International Conference on Computer and Communications. Chengdu, China: IEEE:638-642 [DOI:
10.1109/CompComm.2016.7924779], articleTitle=Content-based image retrieval based on CNN and SVM, refAbstract=null), Reference(id=1249322286210290255, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=457, pageEnd=468, url=null, language=null, rfNumber=null, rfOrder=11, authorNames=Fukui A, Park D H, Yang D, Rohrbach A, Darrell T, Rohrbach M, journalName=null, refType=null, unstructuredReference=
Fukui A,
Park D H,
Yang D,
Rohrbach A,
Darrell T and
Rohrbach M.
2016. Multimodal compact bilinear pooling for visual question answering and visual grounding//Proceedings of 2016 Conference on Empirical Methods in Natural Language Processing. Austin, Texas, USA: Association for Computational Linguistics:457-468 [DOI:
10.18653/v1/D16-1044], articleTitle=Multimodal compact bilinear pooling for visual question answering and visual grounding, refAbstract=null), Reference(id=1249322286281593424, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=6639, pageEnd=6648, url=null, language=null, rfNumber=null, rfOrder=12, authorNames=Gao P, Jiang Z K, You H X, Lu P, Hoi S C H, Wang X G, Li H S, journalName=null, refType=null, unstructuredReference=
Gao P,
Jiang Z K,
You H X,
Lu P,
Hoi S C H,
Wang X G and
Li H S.
2019. Dynamic fusion with intra-and inter-modality attention flow for visual question answering//Proceedings of 2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Long Beach, USA: IEEE:6639-6648 [DOI:
10.1109/CVPR.2019.00680], articleTitle=Dynamic fusion with intra-and inter-modality attention flow for visual question answering, refAbstract=null), Reference(id=1249322286348702289, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=13504, pageEnd=13514, url=null, language=null, rfNumber=null, rfOrder=13, authorNames=He B, Li H D, Jang Y K, Jia M L, Cao X F, Shah A, Shrivastava A, Lim S N, journalName=null, refType=null, unstructuredReference=
He B,
Li H D,
Jang Y K,
Jia M L,
Cao X F,
Shah A,
Shrivastava A and
Lim S N.
2024. MA-LMM: memory-augmented large multimodal model for long-term video understanding//Proceedings of 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle, USA: IEEE:13504-13514 [DOI:
10.1109/CVPR52733.2024.01282], articleTitle=MA-LMM: memory-augmented large multimodal model for long-term video understanding, refAbstract=null), Reference(id=1249322286420005458, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=709, pageEnd=727, url=null, language=null, rfNumber=null, rfOrder=14, authorNames=Huang Q Q, Xiong Y, Rao A Y, Wang J Z, Lin D H, journalName=null, refType=null, unstructuredReference=
Huang Q Q,
Xiong Y,
Rao A Y,
Wang J Z and
Lin D H.
2020. MovieNet: a holistic dataset for movie understanding//Proceedings of the 16th European Conference on Computer Vision. Glasgow, UK: Springer:709-727 [DOI:
10.1007/978-3-030-58548-8_41], articleTitle=MovieNet: a holistic dataset for movie understanding, refAbstract=null), Reference(id=1249322286491308627, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=18198, pageEnd=18208, url=null, language=null, rfNumber=null, rfOrder=15, authorNames=Islam M M, Ho N, Yang X T, Nagarajan T, Torresani L, Bertasius G, journalName=null, refType=null, unstructuredReference=
Islam M M,
Ho N,
Yang X T,
Nagarajan T,
Torresani L and
Bertasius G.
2024. Video ReCap: recursive captioning of hour-long videos//Proceedings of 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle, USA: IEEE:18198-18208 [DOI:
10.1109/CVPR52733.2024.01723], articleTitle=Video ReCap: recursive captioning of hour-long videos, refAbstract=null), Reference(id=1249322286558417492, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=1359, pageEnd=1367, url=null, language=null, rfNumber=null, rfOrder=16, authorNames=Jang Y, Song Y L, Yu Y, Kim Y, Kim G, journalName=null, refType=null, unstructuredReference=
Jang Y,
Song Y L,
Yu Y,
Kim Y and
Kim G.
2017. TGIF-QA: toward spatio-temporal reasoning in visual question answering//Proceedings of 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). Honolulu, USA: IEEE:1359-1367 [DOI:
10.1109/CVPR.2017.149], articleTitle=TGIF-QA: toward spatio-temporal reasoning in visual question answering, refAbstract=null), Reference(id=1249322286646497877, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=13700, pageEnd=13710, url=null, language=null, rfNumber=null, rfOrder=17, authorNames=Jin P, Takanobu R, Zhang W C, Cao X C, Yuan L, journalName=null, refType=null, unstructuredReference=
Jin P,
Takanobu R,
Zhang W C,
Cao X C and
Yuan L.
2024. Chat-UniVi: unified visual representation empowers large language models with image and video understanding//Proceedings of 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle, USA: IEEE:13700-13710 [DOI:
10.1109/CVPR52733.2024.01300], articleTitle=Chat-UniVi: unified visual representation empowers large language models with image and video understanding, refAbstract=null), Reference(id=1249322286730383958, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=18, authorNames=Kahatapitiya K, Ranasinghe K, Park J, Ryoo M S, journalName=null, refType=null, unstructuredReference=
Kahatapitiya K,
Ranasinghe K,
Park J and
Ryoo M S.
2025. Language repository for long video understanding//Proceedings of the 13th International Conference on Learning Representations. Singapore, Singapore: OpenReview.net, articleTitle=Language repository for long video understanding, refAbstract=null), Reference(id=1249322286810075735, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=12, issue=null, pageStart=193057, pageEnd=193075, url=null, language=null, rfNumber=null, rfOrder=19, authorNames=Kim W, Choi C, Lee W, Rhee W, journalName=IEEE Access, refType=null, unstructuredReference=
Kim W,
Choi C,
Lee W and
Rhee W.
2024. An image grid can be worth a video: zero-shot video question answering using a VLM.
IEEE Access,
12: 193057-193075 [DOI:
10.1109/ACCESS.2024.3517625], articleTitle=An image grid can be worth a video: zero-shot video question answering using a VLM, refAbstract=null), Reference(id=1249322286877184600, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2009, volume=39, issue=5, pageStart=489, pageEnd=504, url=null, language=null, rfNumber=null, rfOrder=20, authorNames=Lavee G, Rivlin E, Rudzsky M, journalName=IEEE Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews), refType=null, unstructuredReference=
Lavee G,
Rivlin E and
Rudzsky M.
2009. Understanding video events: a survey of methods for automatic interpretation of semantic occurrences in video.
IEEE Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews),
39(5): 489-504 [DOI:
10.1109/TSMCC.2009.2023380], articleTitle=Understanding video events: a survey of methods for automatic interpretation of semantic occurrences in video, refAbstract=null), Reference(id=1249322286935904857, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=5971, pageEnd=5984, url=null, language=null, rfNumber=null, rfOrder=21, authorNames=Lin B, Ye Y, Zhu B, Cui J, Ning M, Jin P, Yuan L, journalName=null, refType=null, unstructuredReference=
Lin B,
Ye Y,
Zhu B,
Cui J,
Ning M,
Jin P and
Yuan L.
2024. Video-LLaVA: learning united visual representation by alignment before projection//Proceedings of 2024 Conference on Empirical Methods in Natural Language Processing. Florida, USA: ACL:5971-5984, articleTitle=Video-LLaVA: learning united visual representation by alignment before projection, refAbstract=null), Reference(id=1249322287023985242, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=9969, pageEnd=9978, url=null, language=null, rfNumber=null, rfOrder=22, authorNames=Le T M, Le V, Venkatesh S, Tran T, journalName=null, refType=null, unstructuredReference=
Le T M,
Le V,
Venkatesh S and
Tran T.
2020. Hierarchical conditional relation networks for video question answering//Proceedings of 2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle, USA: IEEE:9969-9978 [DOI:
10.1109/CVPR42600.2020.00999], articleTitle=Hierarchical conditional relation networks for video question answering, refAbstract=null), Reference(id=1249322287116259931, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=8211, pageEnd=8225, url=null, language=null, rfNumber=null, rfOrder=23, authorNames=Lei J, Yu L C, Berg T, Bansal M, journalName=null, refType=null, unstructuredReference=
Lei J,
Yu L C,
Berg T and
Bansal M.
2020. TVQA+: spatio-temporal grounding for video question answering//Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. [s.l.]: Association for Computational Linguistics:8211-8225 [DOI:
10.18653/v1/2020.acl-main.730], articleTitle=TVQA+: spatio-temporal grounding for video question answering, refAbstract=null), Reference(id=1249322287174980188, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024a, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=24, authorNames=Li K C, He Y N, Wang Y, Li Y Z, Wang W H, Luo P, Wang Y L, Wang L M, Qiao Y, journalName=null, refType=null, unstructuredReference=
Li K C,
He Y N,
Wang Y,
Li Y Z,
Wang W H,
Luo P,
Wang Y L,
Wang L M and
Qiao Y.
2024a. VideoChat: chat-centric video understanding [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2305.06355.pdf, articleTitle=VideoChat: chat-centric video understanding, refAbstract=null), Reference(id=1249322287284032093, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024c, volume=null, issue=null, pageStart=323, pageEnd=340, url=null, language=null, rfNumber=null, rfOrder=25, authorNames=Li Y W, Wang C Y, Jia J Y, journalName=null, refType=null, unstructuredReference=
Li Y W,
Wang C Y and
Jia J Y.
2024c. LLaMA-VID: an image is worth 2 tokens in large language models//Proceedings of the 18th European Conference on Computer Vision. Milan, Italy: Springer:323-340 [DOI:
10.1007/978-3-031-72952-2_19], articleTitle=LLaMA-VID: an image is worth 2 tokens in large language models, refAbstract=null), Reference(id=1249322287351140958, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024b, volume=null, issue=null, pageStart=1, pageEnd=18, url=null, language=null, rfNumber=null, rfOrder=26, authorNames=Liu R Y, Li C, Tang H R, Ge Y X, Shan Y, Li G, journalName=null, refType=null, unstructuredReference=
Liu R Y,
Li C,
Tang H R,
Ge Y X,
Shan Y and
Li G.
2024b. ST-LLM: large language models are effective temporal learners//Proceedings of the 18th European Conference on Computer Vision. Milan, Italy: Springer:1-18 [DOI:
10.1007/978-3-031-72998-0_1], articleTitle=ST-LLM: large language models are effective temporal learners, refAbstract=null), Reference(id=1249322287435027039, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024c, volume=null, issue=null, pageStart=216, pageEnd=233, url=null, language=null, rfNumber=null, rfOrder=27, authorNames=Liu Y, Duan H D, Zhang Y H, Li B, Zhang S Y, Zhao W B, Yuan Y K, Wang J Q, He C H, Liu Z W, Chen K, Lin D H, journalName=null, refType=null, unstructuredReference=
Liu Y,
Duan H D,
Zhang Y H,
Li B,
Zhang S Y,
Zhao W B,
Yuan Y K,
Wang J Q,
He C H,
Liu Z W,
Chen K and
Lin D H.
2024c. MMBench: is your multi-modal model an all-around player?//Proceedings of the 18th European Conference on Computer Vision. Milan, Italy: Springer:216-233 [DOI:
10.1007/978-3-031-72658-3_13], articleTitle=MMBench: is your multi-modal model an all-around player?, refAbstract=null), Reference(id=1249322287565050464, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=28, authorNames=Luo R P, Zhao Z W, Yang M, Dong J W, Li D, Wang T, Qiu M H, Hu L M, Wei Z Y, journalName=null, refType=null, unstructuredReference=
Luo R P,
Zhao Z W,
Yang M,
Dong J W,
Li D,
Wang T,
Qiu M H,
Hu L M and
Wei Z Y.
2024. Valley: video assistant with large language model enhanced ability//Proceedings of the 12th International Conference on Learning Representations. Vienna, Austria: OpenReview.net, articleTitle=Valley: video assistant with large language model enhanced ability, refAbstract=null), Reference(id=1249322287648936545, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=12585, pageEnd=12602, url=null, language=null, rfNumber=null, rfOrder=29, authorNames=Maaz M, Rasheed H, Khan S, Khan F S, journalName=null, refType=null, unstructuredReference=
Maaz M,
Rasheed H,
Khan S and
Khan F S.
2024. Video-ChatGPT: towards detailed video understanding via large vision and language models//Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics. [s.l.]: Association for Computational Linguistics:12585-12602, articleTitle=Video-ChatGPT: towards detailed video understanding via large vision and language models, refAbstract=null), Reference(id=1249322287732822626, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=30, authorNames=Mo Y H, Qin H, Dong Y S, Zhu Z Y, Li Z L, journalName=null, refType=null, unstructuredReference=
Mo Y H,
Qin H,
Dong Y S,
Zhu Z Y and
Li Z L.
2024. Large language model (LLM) AI text generation detection based on transformer deep learning algorithm [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2405.06652.pdf, articleTitle=Large language model (LLM) AI text generation detection based on transformer deep learning algorithm, refAbstract=null), Reference(id=1249322289259549283, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=272, pageEnd=283, url=null, language=null, rfNumber=null, rfOrder=31, authorNames=Pan J T, Lin Z Y, Ge Y Y, Zhu X T, Zhang R R, Wang Y, Qiao Y, Li H S, journalName=null, refType=null, unstructuredReference=
Pan J T,
Lin Z Y,
Ge Y Y,
Zhu X T,
Zhang R R,
Wang Y,
Qiao Y and
Li H S.
2023. Retrieving-to-answer: zero-shot video question answering with frozen large language models//Proceedings of 2023 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW). Paris, France: IEEE:272-283 [DOI: 10.1109/ICCVW60793.2023.00035]., articleTitle=Retrieving-to-answer: zero-shot video question answering with frozen large language models, refAbstract=null), Reference(id=1249322289356018276, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=#3792, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=32, authorNames=Qian R, Dong X Y, Zhang P, Zang Y H, Ding S R, Lin D H, Wang J Q, journalName=null, refType=null, unstructuredReference=
Qian R,
Dong X Y,
Zhang P,
Zang Y H,
Ding S R,
Lin D H and
Wang J Q.
2024. Streaming long video understanding with large language models//Proceedings of the 38th International Conference on Neural Information Processing Systems. Vancouver, Canada: Curran Associates Inc.:#3792, articleTitle=Streaming long video understanding with large language models, refAbstract=null), Reference(id=1249322289435710053, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=33, authorNames=Singh G, Cuzzolin F, journalName=null, refType=null, unstructuredReference=
Singh G and
Cuzzolin F.
2016. Untrimmed video classification for activity detection: submission to ActivityNet challenge [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/1607.01979.pdf, articleTitle=Untrimmed video classification for activity detection: submission to ActivityNet challenge, refAbstract=null), Reference(id=1249322289519596134, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=18221, pageEnd=18232, url=null, language=null, rfNumber=null, rfOrder=34, authorNames=Song E X, Chai W H, Wang G H, Zhang Y C, Zhou H Y, Wu F Y, Chi H Z, Guo X, Ye T, Zhang Y T, Lu Y, Hwang J N, Wang G A, journalName=null, refType=null, unstructuredReference=
Song E X,
Chai W H,
Wang G H,
Zhang Y C,
Zhou H Y,
Wu F Y,
Chi H Z,
Guo X,
Ye T,
Zhang Y T,
Lu Y,
Hwang J N and
Wang G A.
2024. MovieChat: from dense token to sparse memory for long video understanding//Proceedings of 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle, USA: IEEE:18221-18232 [DOI:
10.1109/CVPR52733.2024.01725], articleTitle=MovieChat: from dense token to sparse memory for long video understanding, refAbstract=null), Reference(id=1249322289595093607, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=35, authorNames=Tang Y L, Bi J, Xu S T, Song L C, Liang S S, Wang T, Zhang D A, An J, Lin J Y, Zhu R Y, Vosoughi A, Huang C, Zhang Z L, Liu P X, Feng M Q, Zheng F, Zhang J G, Luo P, Luo J B, Xu C L, journalName=null, refType=null, unstructuredReference=
Tang Y L,
Bi J,
Xu S T,
Song L C,
Liang S S,
Wang T,
Zhang D A,
An J,
Lin J Y,
Zhu R Y,
Vosoughi A,
Huang C,
Zhang Z L,
Liu P X,
Feng M Q,
Zheng F,
Zhang J G,
Luo P,
Luo J B and
Xu C L.
2023. Video understanding with large language models: a survey [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2312.17432.pdf, articleTitle=Video understanding with large language models: a survey, refAbstract=null), Reference(id=1249322289683173992, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=36, authorNames=Touvron H, Lavril T, Izacard G, Martinet X, Lachaux M A, Lacroix T, Rozière B, Goyal N, Hambro E, Azhar F, Rodriguez A, Joulin A, Grave E, Lample G, journalName=null, refType=null, unstructuredReference=
Touvron H,
Lavril T,
Izacard G,
Martinet X,
Lachaux M A,
Lacroix T,
Rozière B,
Goyal N,
Hambro E,
Azhar F,
Rodriguez A,
Joulin A,
Grave E and
Lample G.
2023. LLAMA: open and efficient foundation language models [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2302.13971.pdf, articleTitle=LLAMA: open and efficient foundation language models, refAbstract=null), Reference(id=1249322289762865769, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=5354, pageEnd=5357, url=null, language=null, rfNumber=null, rfOrder=37, authorNames=Trummer I, journalName=null, refType=null, unstructuredReference=
Trummer I.
2024. Large language models: principles and practice//Proceedings of the 40th IEEE International Conference on Data Engineering (ICDE). Utrecht, the Netherlands: IEEE:5354-5357 [DOI:
10.1109/ICDE60146.2024.00404], articleTitle=Large language models: principles and practice, refAbstract=null), Reference(id=1249322289825780330, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=58, pageEnd=76, url=null, language=null, rfNumber=null, rfOrder=38, authorNames=Wang X, Zhang Y, Zohar O, Yeung-Levy S, journalName=null, refType=null, unstructuredReference=
Wang X,
Zhang Y,
Zohar O and
Yeung-Levy S.
2024. VideoAgent: long-form video understanding with large language model as agent//Proceedings of 2024 European Conference on Computer Vision. Cham: Springer Nature, Switzerland:58-76 [DOI:
10.1007/978-3-031-72989-8_4], articleTitle=VideoAgent: long-form video understanding with large language model as agent, refAbstract=null), Reference(id=1249322289905472107, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=2786, pageEnd=2792, url=null, language=null, rfNumber=null, rfOrder=39, authorNames=Wei F S, Keeling R, Huber-Fliflet N, Zhang J P, Dabrowski A, Yang J C, Mao Q, Qin H, journalName=null, refType=null, unstructuredReference=
Wei F S,
Keeling R,
Huber-Fliflet N,
Zhang J P,
Dabrowski A,
Yang J C,
Mao Q and
Qin H.
2023. Empirical study of LLM fine-tuning for text classification in legal document review//Proceedings of 2023 IEEE International Conference on Big Data (BigData). Sorrento, Italy: IEEE:2786-2792 [DOI:
10.1109/BigData59044.2023.10386911], articleTitle=Empirical study of LLM fine-tuning for text classification in legal document review, refAbstract=null), Reference(id=1249322290001941100, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=453, pageEnd=470, url=null, language=null, rfNumber=null, rfOrder=40, authorNames=Weng Y, Han M, He H, Chang X, Zhuang B, journalName=null, refType=null, unstructuredReference=
Weng Y,
Han M,
He H,
Chang X and
Zhuang B.
2024. LongVLM: efficient long video understanding via large language models//Proceedings of 2024 European Conference on Computer Vision. Milan, Italy: European Computer Vision Association:453-470 [DOI:
10.1007/978-3-031-70070-1_25], articleTitle=LongVLM: efficient long video understanding via large language models, refAbstract=null), Reference(id=1249322290069049965, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=9772, pageEnd=9781, url=null, language=null, rfNumber=null, rfOrder=41, authorNames=Xiao J B, Shang X D, Yao A, Chua T S, journalName=null, refType=null, unstructuredReference=
Xiao J B,
Shang X D,
Yao A and
Chua T S.
2021. NExT-QA: next phase of question-answering to explaining temporal actions//Proceedings of 2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition. Nashville, USA: IEEE:9772-9781 [DOI:
10.1109/CVPR46437.2021.00965], articleTitle=NExT-QA: next phase of question-answering to explaining temporal actions, refAbstract=null), Reference(id=1249322290131964526, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=1645, pageEnd=1653, url=null, language=null, rfNumber=null, rfOrder=42, authorNames=Xu D J, Zhao Z, Xiao J, Wu F, Zhang H W, He X N, Zhuang Y T, journalName=null, refType=null, unstructuredReference=
Xu D J,
Zhao Z,
Xiao J,
Wu F,
Zhang H W,
He X N and
Zhuang Y T.
2017. Video question answering via gradually refined attention over appearance and motion//Proceedings of the 25th ACM International Conference on Multimedia. Mountain View, USA: Association for Computing Machinery:1645-1653 [DOI:
10.1145/3123266.3123427], articleTitle=Video question answering via gradually refined attention over appearance and motion, refAbstract=null), Reference(id=1249322290207461999, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=43, authorNames=Xu L, Zhao Y L, Zhou D Q, Lin Z J, Ng S K, Feng J S, journalName=null, refType=null, unstructuredReference=
Xu L,
Zhao Y L,
Zhou D Q,
Lin Z J,
Ng S K and
Feng J S.
2024. PLLaVA: parameter-free LLaVA extension from images to videos for video dense captioning [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2404.16994.pdf, articleTitle=PLLaVA: parameter-free LLaVA extension from images to videos for video dense captioning, refAbstract=null), Reference(id=1249322290261987952, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2022, volume=52, issue=1, pageStart=1, pageEnd=53, url=null, language=null, rfNumber=null, rfOrder=44, authorNames=Yang Z, He X W, Wu J H, Wang X, Zhao Y, journalName=Scientia Sinica Informationis, refType=null, unstructuredReference=
Yang Z,
He X W,
Wu J H,
Wang X and
Zhao Y.
2022. Edge computing technologies for streaming video analytics.
Scientia Sinica Informationis,
52(1): 1-53, articleTitle=Edge computing technologies for streaming video analytics, refAbstract=null), Reference(id=1249322290316513905, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2022, volume=52, issue=1, pageStart=1, pageEnd=53, url=null, language=null, rfNumber=null, rfOrder=45, authorNames=杨铮, 贺骁武, 吴家行, 王需, 赵毅, journalName=面向实时视频流分析的边缘计算技术, refType=null, unstructuredReference=杨铮, 贺骁武, 吴家行, 王需, 赵毅.
2022.
面向实时视频流分析的边缘计算技术. 中国科学: 信息科学,
52(1): 1-53 [DOI:
10.1360/SSI-2021-0133], articleTitle=null, refAbstract=null), Reference(id=1249322290371039858, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024a, volume=null, issue=null, pageStart=146, pageEnd=164, url=null, language=null, rfNumber=null, rfOrder=46, authorNames=Ye Q L, Yu Z T, Shao R, Xie X Y, Torr P, Cao X C, journalName=null, refType=null, unstructuredReference=
Ye Q L,
Yu Z T,
Shao R,
Xie X Y,
Torr P and
Cao X C.
2024a. CAT: enhancing multimodal large language model to answer questions in dynamic audio-visual scenarios//Proceedings of the 18th European Conference on Computer Vision. Milan, Italy: Springer:146-164 [DOI:
10.1007/978-3-031-72684-2_9], articleTitle=CAT: enhancing multimodal large language model to answer questions in dynamic audio-visual scenarios, refAbstract=null), Reference(id=1249322290425565811, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024b, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=47, authorNames=Ye X B, Gan Y K, Huang X K, Ge Y X, Tang Y S, journalName=null, refType=null, unstructuredReference=
Ye X B,
Gan Y K,
Huang X K,
Ge Y X and
Tang Y S.
2024b. VoCo-LLaMA: towards vision compression with large language models [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2406.12275.pdf, articleTitle=VoCo-LLaMA: towards vision compression with large language models, refAbstract=null), Reference(id=1249322290509451892, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2025, volume=35, issue=5, pageStart=4592, pageEnd=4607, url=null, language=null, rfNumber=null, rfOrder=48, authorNames=You Z, Wen Z Q, Chen Y F, Li X, Zeng R H, Wang Y W, Tan M K, journalName=IEEE Transactions on Circuits and Systems for Video Technology, refType=null, unstructuredReference=
You Z,
Wen Z Q,
Chen Y F,
Li X,
Zeng R H,
Wang Y W and
Tan M K.
2025. Toward long video understanding via fine-detailed video story generation.
IEEE Transactions on Circuits and Systems for Video Technology,
35(5): 4592-4607 [DOI:
10.1109/TCSVT.2024.3514820], articleTitle=Toward long video understanding via fine-detailed video story generation, refAbstract=null), Reference(id=1249322290584949365, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=9127, pageEnd=9134, url=null, language=null, rfNumber=null, rfOrder=49, authorNames=Yu Z, Xu D J, Yu J, Yu T, Zhao Z, Zhuang Y T, Tao D C, journalName=null, refType=null, unstructuredReference=
Yu Z,
Xu D J,
Yu J,
Yu T,
Zhao Z,
Zhuang Y T and
Tao D C.
2019. ActivityNet-QA: a dataset for understanding complex web videos via question answering//Proceedings of 2019 AAAI Conference on Artificial Intelligence. Honolulu, USA: AAAI Press:9127-9134 [DOI:
10.1609/aaai.v33i01.33019127], articleTitle=ActivityNet-QA: a dataset for understanding complex web videos via question answering, refAbstract=null), Reference(id=1249322290668835446, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024a, volume=null, issue=null, pageStart=21715, pageEnd=21737, url=null, language=null, rfNumber=null, rfOrder=50, authorNames=Zhang C, Lu T X, Islam M M, Wang Z Y, Yu S B, Bansal M, Bertasius G, journalName=null, refType=null, unstructuredReference=
Zhang C,
Lu T X,
Islam M M,
Wang Z Y,
Yu S B,
Bansal M and
Bertasius G.
2024a. A simple LLM framework for long-range video question-answering//Proceedings of 2024 Conference on Empirical Methods in Natural Language Processing. Miami, USA: Association for Computational Linguistics:21715-21737 [DOI:
10.18653/v1/2024.emnlp-main.1209], articleTitle=A simple LLM framework for long-range video question-answering, refAbstract=null), Reference(id=1249322290735944311, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024b, volume=null, issue=null, pageStart=12401, pageEnd=12430, url=null, language=null, rfNumber=null, rfOrder=51, authorNames=Zhang D Z, Yu Y H, Dong J H, Li C X, Su D, Chu C H, Yu D, journalName=Findings of the Association for Computational Linguistics, refType=null, unstructuredReference=
Zhang D Z,
Yu Y H,
Dong J H,
Li C X,
Su D,
Chu C H and
Yu D.
2024b. MM-LLMs: recent advances in multimodal large language models//
Findings of the Association for Computational Linguistics. Bangkok, Thailand: Association for Computational Linguistics:12401-12430 [DOI:
10.18653/v1/2024.findings-acl.738], articleTitle=MM-LLMs: recent advances in multimodal large language models, refAbstract=null), Reference(id=1249322290811441784, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=543, pageEnd=553, url=null, language=null, rfNumber=null, rfOrder=52, authorNames=Zhang H, Li X, Bing L D, journalName=null, refType=null, unstructuredReference=
Zhang H,
Li X and
Bing L D.
2023. Video-LLaMA: an instruction-tuned audio-visual language model for video understanding//Proceedings of 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Singapore, Singapore: Association for Computational Linguistics:543-553 [DOI:
10.18653/v1/2023.emnlp-demo.49], articleTitle=Video-LLaMA: an instruction-tuned audio-visual language model for video understanding, refAbstract=null), Reference(id=1249322290882744953, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024c, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=53, authorNames=Zhang H J, Wang Y Q, Tang Y S, Liu Y, Feng J S, Dai J F, Jin X J, journalName=null, refType=null, unstructuredReference=
Zhang H J,
Wang Y Q,
Tang Y S,
Liu Y,
Feng J S,
Dai J F and
Jin X J.
2024c. Flash-VStream: memory-based real-time understanding for long video streams [EB/OL]. [2024-09-06].
https://arxiv.org/pdf/2406.08085.pdf, articleTitle=Flash-VStream: memory-based real-time understanding for long video streams, refAbstract=null), Reference(id=1249322290954048122, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2024d, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=54, authorNames=Zhang P Y, Zhang K C, Li B, Zeng G T, Yang J K, Zhang Y H, Wang Z Y, Tan H R, Li C Y, Liu Z W, journalName=null, refType=null, unstructuredReference=
Zhang P Y,
Zhang K C,
Li B,
Zeng G T,
Yang J K,
Zhang Y H,
Wang Z Y,
Tan H R,
Li C Y and
Liu Z W.
2024d. Long context transfer from language to vision//Proceedings of the 13th International Conference on Learning Representations. Singapore, Singapore: OpenReview.net, articleTitle=Long context transfer from language to vision, refAbstract=null), Reference(id=1249322291016962683, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=1, pageEnd=6, url=null, language=null, rfNumber=null, rfOrder=55, authorNames=Zhang S F, Zhai J H, Xie B J, Zhan Y, Wang X, journalName=null, refType=null, unstructuredReference=
Zhang S F,
Zhai J H,
Xie B J,
Zhan Y and
Wang X.
2019. Multimodal representation learning: advances, trends and challenges//Proceedings of 2019 International Conference on Machine Learning and Cybernetics (ICMLC). Kobe, Japan: IEEE:1-6 [DOI:
10.1109/ICMLC48188.2019.8949228], articleTitle=Multimodal representation learning: advances, trends and challenges, refAbstract=null), Reference(id=1249322291109237372, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=3518, pageEnd=3524, url=null, language=null, rfNumber=null, rfOrder=56, authorNames=Zhao Z, Yang Q F, Cai D, He X F, Zhuang Y T, journalName=null, refType=null, unstructuredReference=
Zhao Z,
Yang Q F,
Cai D,
He X F and
Zhuang Y T.
2017. Video question answering via hierarchical spatio-temporal attention networks//Proceedings of the 26th International Joint Conference on Artificial Intelligence. Melbourne, Australia: AAAI Press:3518-3524, articleTitle=Video question answering via hierarchical spatio-temporal attention networks, refAbstract=null), Reference(id=1249322291167957629, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=null, rfOrder=57, authorNames=Zhou L W, Xu C L, Corso J, journalName=null, refType=null, unstructuredReference=
Zhou L W,
Xu C L and
Corso J.
2018. Towards automatic learning of procedures from web instructional videos//Proceedings of 2018 AAAI Conference on Artificial Intelligence. New Orleans, USA: AAAI Press [DOI:
10.1609/aaai.v32i1.12342], articleTitle=Towards automatic learning of procedures from web instructional videos, refAbstract=null)], funds=null, companyList=[AuthorCompany(id=1249322281500086811, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, xref=1, ext=[AuthorCompanyExt(id=1249322281508475420, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281500086811, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1School of Science and Engineering, The Chinese University of Hong Kong (Shenzhen), Shenzhen 518116, China), AuthorCompanyExt(id=1249322281516864029, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281500086811, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1香港中文大学(深圳)理工学院,深圳518116)]), AuthorCompany(id=1249322281575584286, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, xref=2, ext=[AuthorCompanyExt(id=1249322281579778591, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281575584286, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou 510006, China), AuthorCompanyExt(id=1249322281588167200, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, companyId=1249322281575584286, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2中山大学计算机学院,广州510006)])], figs=[ArticleFig(id=1249322283106505276, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, label=Fig.1, caption=
Overall framework of this paper, figureFileSmall=F5x/032vz3n0VtwNgH5qaA==, figureFileBig=G32E8gveFz/z5H+3BCyRmw==, tableContent=null), ArticleFig(id=1249322283156836925, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, label=图1, caption=
本文结构框架, figureFileSmall=F5x/032vz3n0VtwNgH5qaA==, figureFileBig=G32E8gveFz/z5H+3BCyRmw==, tableContent=null), ArticleFig(id=1249322283253305918, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, label=Fig.2, caption=
Visualization of the current methodology model, figureFileSmall=bllJ82FDp6oYvJZ56GIz4g==, figureFileBig=NJ39+Ak8U5/48gwqXSr0xw==, tableContent=null), ArticleFig(id=1249322284771643967, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, label=图2, caption=
当前方法模型可视化展示, figureFileSmall=bllJ82FDp6oYvJZ56GIz4g==, figureFileBig=NJ39+Ak8U5/48gwqXSr0xw==, tableContent=null), ArticleFig(id=1249322284847141440, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, label=Tab.1, caption=
Comprehensive overview and summary of representative algorithms for video question answering with large language models
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法类型 | | 代表性方法 | 核心特点/代码链接 |
|---|---|---|---|
| 无需训练 | | VideoAgent | 将视频理解过程定义为状态、动作和观察的序列,并使用大语言模型作为控制这一过程的核心代理。公开代码链接:https://github.com/wxh1996/VideoAgent |
| | | VideoAgentM | 将视频中信息整理成一个结构化的记忆库,帮助大语言模型像人类一样理解和推理视频内容,同时利用现有工具模型回答问题。公开代码链接: https://github.com/YueFan1014/VideoAgent |
| | | Video ReCap | 引入递归的视频语言架构,利用语言层次结构逐步生成视频的多层次描述。 公开代码链接:https://github.com/md-mohaiminul/VideoRecap |
| 需要训练 | 特征池化 | Video-LLaVA | 利用空间和时间池化技术将视频处理为一系列时序密集图像,从中提取关键的空间和时间特征。 |
| | | Valley | 提出两种改进的平均池化类模型方法的结构,以增强模型对视频时序信息的理解。 |
| | 令牌压缩 | LLaMA-VID | 将每个视频帧图像表示为两种不同的令牌:上下文令牌和内容令牌。 公开代码链接: https://github.com/dvlab-research/LLaMA-VID |
| | | VideoChat | 基于动态令牌选择和聚合的机制,能够在视频处理时动态地选择视频帧特征中最具代表性的特征令牌聚合,通过引入一种简单有效的池化策略来平滑沿时间维度的特征分布,从而减少极端特征的主导影响。公开代码链接: https://github.com/magic-research/PLLaVA |
| | | Chat-UniVi | 通过引入基于K 近邻的密度峰值聚类算法的令牌合并方法,逐步合并具有相似语义意义的视觉令牌,从而获得动态视觉令牌。公开代码链接: https://github.com/PKU-YuanGroup/Chat-UniVi |
| | | VoCo-LLaMA | 首个利用大语言模型内在功能进行视觉压缩的方法。 公开代码链接:https://github.com/Yxxxb/VoCo-LLaMA |
| | 记忆机制 | MA-LMM | 引入一种长时记忆库,通过顺序处理视频帧并将提取的特征存储在记忆库中,实现长时间视频的有效建模。公开代码链接:https://github.com/boheumd/MA-LMM |
| | | MovieChat | 将短期记忆作为视频信息的快速处理和存储单元,长期记忆用于整合和保持关键信息。公开代码链接:https://github.com/rese1f/MovieChat |
| | 拼成大图 | LongVA | 将视频表示为扩展的图像形式,通过这种编码方式,大语言模型可以将整个视频视做一个整体。 |
), ArticleFig(id=1249322284914250305, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, label=表1, caption=
大语言模型下视频问答的代表性算法归纳总结和总览
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法类型 | | 代表性方法 | 核心特点/代码链接 |
|---|---|---|---|
| 无需训练 | | VideoAgent | 将视频理解过程定义为状态、动作和观察的序列,并使用大语言模型作为控制这一过程的核心代理。公开代码链接:https://github.com/wxh1996/VideoAgent |
| | | VideoAgentM | 将视频中信息整理成一个结构化的记忆库,帮助大语言模型像人类一样理解和推理视频内容,同时利用现有工具模型回答问题。公开代码链接: https://github.com/YueFan1014/VideoAgent |
| | | Video ReCap | 引入递归的视频语言架构,利用语言层次结构逐步生成视频的多层次描述。 公开代码链接:https://github.com/md-mohaiminul/VideoRecap |
| 需要训练 | 特征池化 | Video-LLaVA | 利用空间和时间池化技术将视频处理为一系列时序密集图像,从中提取关键的空间和时间特征。 |
| | | Valley | 提出两种改进的平均池化类模型方法的结构,以增强模型对视频时序信息的理解。 |
| | 令牌压缩 | LLaMA-VID | 将每个视频帧图像表示为两种不同的令牌:上下文令牌和内容令牌。 公开代码链接: https://github.com/dvlab-research/LLaMA-VID |
| | | VideoChat | 基于动态令牌选择和聚合的机制,能够在视频处理时动态地选择视频帧特征中最具代表性的特征令牌聚合,通过引入一种简单有效的池化策略来平滑沿时间维度的特征分布,从而减少极端特征的主导影响。公开代码链接: https://github.com/magic-research/PLLaVA |
| | | Chat-UniVi | 通过引入基于K 近邻的密度峰值聚类算法的令牌合并方法,逐步合并具有相似语义意义的视觉令牌,从而获得动态视觉令牌。公开代码链接: https://github.com/PKU-YuanGroup/Chat-UniVi |
| | | VoCo-LLaMA | 首个利用大语言模型内在功能进行视觉压缩的方法。 公开代码链接:https://github.com/Yxxxb/VoCo-LLaMA |
| | 记忆机制 | MA-LMM | 引入一种长时记忆库,通过顺序处理视频帧并将提取的特征存储在记忆库中,实现长时间视频的有效建模。公开代码链接:https://github.com/boheumd/MA-LMM |
| | | MovieChat | 将短期记忆作为视频信息的快速处理和存储单元,长期记忆用于整合和保持关键信息。公开代码链接:https://github.com/rese1f/MovieChat |
| | 拼成大图 | LongVA | 将视频表示为扩展的图像形式,通过这种编码方式,大语言模型可以将整个视频视做一个整体。 |
), ArticleFig(id=1249322284977164866, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=EN, label=Tab.2, caption=
Overview and summary of the current video question answering datasets
, figureFileSmall=null, figureFileBig=null, tableContent=
), ArticleFig(id=1249322285128159811, tenantId=1146029695717560320, journalId=1249024232475115590, articleId=1249044011021964066, language=CN, label=表2, caption=
视频问答当前数据集总览和归纳
, figureFileSmall=null, figureFileBig=null, tableContent=
)], attaches=null, journal=Journal(id=1249023527618129992, delFlag=0, nameCn=中国图象图形学报, nameEn=Journal of Image and Graphics, nameHistory1=null, nameHistory2=null, issn=1006-8961, eissn=null, cn=11-3758, coden=CODEN ZTTXFZ, periodic=0, language=CN, oaType=null, ccby=null, superviseOffice=null, ownerOffice=null, pubOffice=null, editorOffice=null, officeType=null, aims=null, clcCode=null, officeProv=null, officeCity=null, officeAddr=null, officeZip=null, officeEmail=null, officePhone=null, editDirector=null, officeDirector=null, officeDirectorPhone=null, officeStaffNum=null, officeEmpNum=null, coverPicUrl=uirXtX858YS3zEpFXZttJA==, journalPrice=null, startedYear=null, abbrevIsoEn=Journal of Image and Graphics, journalRemark=null, publicationField=null, createdTime=1775720014721, updatedTime=1775720337198, createdBy=18614031015, updatedBy=13701087609, firstLetterCn=J, firstLetterEn=J, subjectCode=Engineering, subjectName=null, subjectCodeEn=Engineering, subjectNameEn=null, picCn=uirXtX858YS3zEpFXZttJA==, picEn=bud7qaxfvWHeFsbyBTAiKQ==, jcr=null, cjcr=null, exts=[JournalExt(id=1249024880377786590, language=CN, name=中国图象图形学报, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=, createdTime=1775720337242, updatedTime=1775720337242, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=https://journal.ids.fzyun.cn/auth/realms/journal/protocol/openid-connect/auth?client_id=journal-cjig-author&redirect_uri=https%3A%2F%2Fcjig.portal.founderss.cn%2Foauth%2Fcallback&response_type=code&scope=phone+openid+email+profile&state=e6369def-2842-41d8, 
submissionEditorUrl=https://journal.ids.fzyun.cn/auth/realms/journal/protocol/openid-connect/auth?client_id=journal-cjig-editor&redirect_uri=https%3A%2F%2Fcjigeditor.portal.founderss.cn%2Foauth%2Fcallback&response_type=code&scope=phone+openid+email+profile&state=9ccec05b-6bd, submissionReviewUrl=https://journal.ids.fzyun.cn/auth/realms/journal/protocol/openid-connect/auth?client_id=journal-cjig-author&redirect_uri=https%3A%2F%2Fcjig.portal.founderss.cn%2Foauth%2Fcallback&response_type=code&scope=phone+openid+email+profile&state=1e8a31c8-5434-4f78, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""}), JournalExt(id=1249024880449089759, language=EN, name=Journal of Image and Graphics, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=, createdTime=1775720337259, updatedTime=1775720337259, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=https://journal.ids.fzyun.cn/auth/realms/journal/protocol/openid-connect/auth?client_id=journal-cjig-author&redirect_uri=https%3A%2F%2Fcjig.portal.founderss.cn%2Foauth%2Fcallback&response_type=code&scope=phone+openid+email+profile&state=e6369def-2842-41d8, submissionEditorUrl=https://journal.ids.fzyun.cn/auth/realms/journal/protocol/openid-connect/auth?client_id=journal-cjig-editor&redirect_uri=https%3A%2F%2Fcjigeditor.portal.founderss.cn%2Foauth%2Fcallback&response_type=code&scope=phone+openid+email+profile&state=9ccec05b-6bd, submissionReviewUrl=https://journal.ids.fzyun.cn/auth/realms/journal/protocol/openid-connect/auth?client_id=journal-cjig-author&redirect_uri=https%3A%2F%2Fcjig.portal.founderss.cn%2Foauth%2Fcallback&response_type=code&scope=phone+openid+email+profile&state=1e8a31c8-5434-4f78, submissionCeEditorUrl=, submissionAeEditorUrl=, 
option={"copyright":""})], databaseList=null, tenantJournalId=1249024232475115590, websiteList=[Website(id=1249025782459334881, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1249024232475115590, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/zgtxtxxb/CN, language=CN, createTime=1775720552315, createBy=18614031015, updateTime=1775720586268, updateBy=18614031015, name=中国图象图形学报-中文, tplId=1146099689490845704, title=中国图象图形学报, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1249026166254928133, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=articleTextType, value=kx, createTime=1775720643819, updateTime=1775720643819, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166221373698, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=banner, value=null, createTime=1775720643811, updateTime=1775720643811, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166271705352, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=grayFlag, value=0, createTime=1775720643823, updateTime=1775720643823, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166212985089, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=logo, value=https://castjournals.cast.org.cn/joweb/zgtxtxxb/CN/file/pic?fileId=TDRjKTHfgAnvFKZaDA70wA==, createTime=1775720643809, updateTime=1775720643809, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166288482570, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, 
code=minRunFlag, value=0, createTime=1775720643827, updateTime=1775720643827, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166246539524, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/zgtxtxxb/CN/file/pic, createTime=1775720643817, updateTime=1775720643817, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166280093961, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=silenceFlag, value=0, createTime=1775720643825, updateTime=1775720643825, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166233956611, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1775720643814, updateTime=1775720643814, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166259122438, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=themeColor, value=null, createTime=1775720643820, updateTime=1775720643820, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026166267511047, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782459334881, code=themeStyle, value=null, createTime=1775720643822, updateTime=1775720643822, creator=18614031015, updator=18614031015)]), Website(id=1249025782681633001, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1249024232475115590, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/zgtxtxxb/EN, language=EN, 
createTime=1775720552368, createBy=18614031015, updateTime=1775720607118, updateBy=18614031015, name=中国图象图形学报-英文, tplId=1146101810881728533, title=Journal of Image and Graphics, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1249026195371786511, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=articleTextType, value=kx, createTime=1775720650761, updateTime=1775720650761, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195355009292, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=banner, value=null, createTime=1775720650757, updateTime=1775720650757, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195392758034, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=grayFlag, value=0, createTime=1775720650766, updateTime=1775720650766, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195342426379, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=logo, value=https://castjournals.cast.org.cn/joweb/zgtxtxxb/EN/file/pic?fileId=TDRjKTHfgAnvFKZaDA70wA==, createTime=1775720650754, updateTime=1775720650754, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195409535252, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=minRunFlag, value=0, createTime=1775720650770, updateTime=1775720650770, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195367592206, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/zgtxtxxb/EN/file/pic, createTime=1775720650760, updateTime=1775720650760, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195401146643, tenantId=1146029695717560320, journalId=null, journalGroupId=null, 
siteId=1249025782681633001, code=silenceFlag, value=0, createTime=1775720650768, updateTime=1775720650768, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195359203597, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_en_623/, createTime=1775720650758, updateTime=1775720650758, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195380175120, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=themeColor, value=null, createTime=1775720650763, updateTime=1775720650763, creator=18614031015, updator=18614031015), WebsiteProps(id=1249026195388563729, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1249025782681633001, code=themeStyle, value=null, createTime=1775720650765, updateTime=1775720650765, creator=18614031015, updator=18614031015)])], journalTitle=中国图象图形学报, weixinUrl=null, journalUrl=https://www.cjig.cn/, iacademicId=null, status=1, seqNo=null, journalTitleEn=Journal of Image and Graphics, journalPhotoCn=uirXtX858YS3zEpFXZttJA==, journalPhotoEn=bud7qaxfvWHeFsbyBTAiKQ==, journalFirstLetter=J, journalRecommend=null, journalNew=null, journalCollection=null, jcrJf=null, cjcrJf=null, jcrJfStr=null, cjcrJfStr=null, submissionFirstDecision=null, sciSubjectClassification=null, casSubjectClassification=null, citeScore=null, totalCitationFrequency=null, icpCode=null, psCode=null, advertisingLicenseCode=null, copyrightInformation=null, country=null, option=, provinceCode=null, provinceName=null, collectFlag=false), detailUrlCn=https://castjournals.cast.org.cn/joweb/zgtxtxxb/CN/10.11834/jig.240535, detailUrlEn=https://castjournals.cast.org.cn/joweb/zgtxtxxb/EN/10.11834/jig.240535, pdfUrlCn=https://castjournals.cast.org.cn/joweb/zgtxtxxb/CN/PDF/10.11834/jig.240535, 
pdfUrlEn=https://castjournals.cast.org.cn/joweb/zgtxtxxb/EN/PDF/10.11834/jig.240535, aliStartDate=null, aliEndDate=null, collectionFlag=false, citedCount=null, citedUrl=null, reference=null)