Article(id=1251856529385796289, tenantId=1146029695717560320, journalId=1251234268282663017, issueId=1251856520619700745, articleNumber=null, orderNo=null, doi=10.3969/j.issn.1003-3106.2025.11.014, pmid=null, cstr=null, oa=null, hot=null, price=null, onlineType=0, articleFormat=0, articleType=null, articleTypeStr=null, receivedDate=1754064000000, receivedDateStr=2025-08-02, revisedDate=null, revisedDateStr=null, acceptedDate=null, acceptedDateStr=null, onlineDate=1776395454974, onlineDateStr=2026-04-17, pubDate=1762272000000, pubDateStr=2025-11-05, doiRegisterDate=null, doiRegisterDateStr=null, onlineIssueDate=1776395454974, onlineIssueDateStr=2026-04-17, onlineJustAcceptDate=null, onlineJustAcceptDateStr=null, onlineFirstDate=null, onlineFirstDateStr=null, sourceXml=null, magXml=null, createTime=1776395454974, creator=13701087609, updateTime=1776395454974, updator=13701087609, issue=Issue{id=1251856520619700745, tenantId=1146029695717560320, journalId=1251234268282663017, year='2025', volume='55', issue='11', pageStart='2131', pageEnd='2324', issueExtLink='null', onlineDate='null', pubDate='null', beforeIssueId=null, nextIssueId=null, price=null, status=1, issueComplete=1, articleOrder=1, issueType=1, specialIssue=null, createTime=1776395452885, creator=13701087609, updateTime=1776395571911, updator=13701087609, preIssue=null, nextIssue=null, ext={EN=IssueExt(id=1251857019939013255, tenantId=1146029695717560320, journalId=1251234268282663017, issueId=1251856520619700745, language=EN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=), CN=IssueExt(id=1251857019939013256, tenantId=1146029695717560320, journalId=1251234268282663017, issueId=1251856520619700745, language=CN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=)}, issueFiles=null}, startPage=2256, endPage=2273, ext={EN=ArticleExt(id=1251856529738117853, articleId=1251856529385796289, tenantId=1146029695717560320, 
journalId=1251234268282663017, language=EN, title=A Comprehensive Review and Future Perspectives on Embodied AI Large Models, columnId=1251856524038058520, journalTitle=Radio Engineering, columnName=Engineering & Application, runingTitle=null, highlight=null, articleAbstract=
Vision-Language-Action (VLA) models are a core technology for achieving general embodied artificial intelligence, aiming to integrate visual perception, language understanding, and action decision-making within a unified end-to-end framework. The current research status and development trajectory of VLA models are comprehensively and systematically reviewed. The theoretical origins of VLA models are traced, and the paradigm shift from modular designs to unified architectures is clarified. Along the evolutionary path of VLA, representative works such as SpatialVLA, TLA, and GR00T N1 are presented with a focus on multimodal fusion and cognitive hierarchies. A detailed taxonomy of VLA models is constructed from two key dimensions-macro architecture and system hierarchy. Key technologies and design principles are deeply analyzed, ranging from pioneering works such as RT-1, to models introducing large-scale knowledge transfer such as RT-2, OpenVLA, and ECOT, and further to cutting-edge dual-system architectures such as Helix, OpenHelix, DexVLA, and DexGraspVLA. Mainstream simulation environments, core datasets, and benchmarks supporting VLA research are systematically integrated and reviewed. The application status and prospects of VLA models in robotic manipulation, autonomous navigation, and industrial automation are explored. Core challenges in current VLA research are analyzed, including generalization and data efficiency, long-horizon task planning, and real-time responsiveness. Future research directions are discussed, including integration with world models and enhancement of data efficiency.
, correspAuthors=null, authorNote=null, correspAuthorsNote=null, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=null, magXml=null, pdfUrl=null, pdf=null, pdfFileSize=null, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=null, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=null, mapNumber=null, authorCompany=null, fund=null, authors=null, authorsList=Tingyu YUAN, Kai LIU, Biaoliang GUAN, Wen YE, Yacui ZHAO, Chaoyang ZHAO, Jinqiao WANG), CN=ArticleExt(id=1251856533890478990, articleId=1251856529385796289, tenantId=1146029695717560320, journalId=1251234268282663017, language=CN, title=具身智能大模型综述及展望, columnId=1251856524432323108, journalTitle=无线电工程, columnName=工程与应用, runingTitle=null, highlight=null, articleAbstract=
视觉-语言-行动(Vision-Language-Action,VLA)模型是实现通用具身人工智能的核心技术,旨在在统一的端到端框架内融合视觉感知、语言理解与动作决策。对VLA模型的研究现状与发展脉络进行了全面而系统的梳理。追溯了VLA模型的理论起源,阐明了其从分离式模块向统一架构演进的范式变迁。针对VLA的演进路线,以多模态融合与认知分层为重点阐述了SpatialVLA、TLA与GR00T N1等工作。构建了一个详尽的VLA模型分类体系,从宏观架构和系统分层2个核心维度,深入剖析了从RT-1等开创性工作到引入大规模知识迁移的RT-2、OpenVLA、ECOT等工作,再到双系统架构的Helix、OpenHelix、DexVLA、DexGraspVLA等前沿模型的关键技术与设计思想。系统性地整合与评述了支撑VLA研究的主流仿真环境、核心数据集与基准,并探讨了其在机器人操作、自主导航、工业自动化等领域的应用现状与前景。深入剖析了当前VLA研究在泛化性与数据效率、长时程任务规划、实时响应速度等方面面临的核心挑战,并对融合世界模型、提升数据效率等未来研究方向进行了展望。
, correspAuthors=null, authorNote=null, correspAuthorsNote=null, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=3go7C+PhDkWskLeWhtgbFA==, magXml=SYJbbPn8bbDdIUq1k2KlZw==, pdfUrl=null, pdf=FyW/4ZsnaOFvxmYLm01Awg==, pdfFileSize=8725065, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=i9WgRNQIfkcX36GehDV3qw==, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=1AM31H8snqmXoiJXMis5yA==, mapNumber=null, authorCompany=null, fund=null, authors=
袁霆宇 男,(2002—),博士研究生。主要研究方向:具身智能、多模态大模型、3D生成、强化学习等。
刘凯 男,(2002—),硕士研究生。主要研究方向:具身智能、多模态大模型、计算机视觉等。
关标良 男,(2002—),硕士研究生。主要研究方向:具身智能、强化学习、计算机视觉、多模态大模型等。
叶雯 女,(2003—),博士研究生。主要研究方向:具身智能、多模态大模型智能体、多智能体系统、AIGC、计算机视觉等。
赵雅萃 女,(2002—),硕士研究生。主要研究方向:具身智能、多模态大模型等。
赵朝阳 男,(1985—),博士,副研究员。主要研究方向:视频图像分析、多模态大模型、具身智能等。
王金桥 男,(1978—),博士,研究员。主要研究方向:具身智能、视频图像分析、多模态大模型、自监督学习、目标检测与跟踪、细粒度识别、行为识别。
, authorsList=袁霆宇, 刘凯, 关标良, 叶雯, 赵雅萃, 赵朝阳, 王金桥)}, authors=[Author(id=1251856536281232352, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=0, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856536365118438, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856536281232352, language=EN, stringName=Tingyu YUAN, firstName=Tingyu, middleName=null, lastName=YUAN, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, 2, address=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856536469976044, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856536281232352, language=CN, stringName=袁霆宇, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, 2, address=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083
2.中国科学院大学 人工智能学院,北京 100083, bio={"content":"
袁霆宇 男,(2002—),博士研究生。主要研究方向:具身智能、多模态大模型、3D生成、强化学习等。
"}, bioImg=null, bioContent=
袁霆宇 男,(2002—),博士研究生。主要研究方向:具身智能、多模态大模型、3D生成、强化学习等。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856534184080291, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=1., ext=[AuthorCompanyExt(id=1251856534200857510, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534213440426, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083)]), AuthorCompany(id=1251856534309909421, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=2., ext=[AuthorCompanyExt(id=1251856534314103726, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534322492335, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.中国科学院大学 人工智能学院,北京 100083)])]), Author(id=1251856536574833652, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=1, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856536709051392, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856536574833652, language=EN, stringName=Kai LIU, firstName=Kai, middleName=null, lastName=LIU, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
3, address=
3.School of Software Engineering, Xi’an Jiaotong University, Xi’an 710049, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856536809713669, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856536574833652, language=CN, stringName=刘凯, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
3, address=
3.西安交通大学 软件学院,陕西 西安 710049, bio={"content":"
刘凯 男,(2002—),硕士研究生。主要研究方向:具身智能、多模态大模型、计算机视觉等。
"}, bioImg=null, bioContent=
刘凯 男,(2002—),硕士研究生。主要研究方向:具身智能、多模态大模型、计算机视觉等。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856534414767034, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=3., ext=[AuthorCompanyExt(id=1251856534423155641, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534414767034, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
3.School of Software Engineering, Xi’an Jiaotong University, Xi’an 710049, China), AuthorCompanyExt(id=1251856534427349946, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534414767034, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
3.西安交通大学 软件学院,陕西 西安 710049)])]), Author(id=1251856536889405451, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=2, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856537015234582, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856536889405451, language=EN, stringName=Biaoliang GUAN, firstName=Biaoliang, middleName=null, lastName=GUAN, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
3, address=
3.School of Software Engineering, Xi’an Jiaotong University, Xi’an 710049, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856537128480796, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856536889405451, language=CN, stringName=关标良, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
3, address=
3.西安交通大学 软件学院,陕西 西安 710049, bio={"content":"
关标良 男,(2002—),硕士研究生。主要研究方向:具身智能、强化学习、计算机视觉、多模态大模型等。
"}, bioImg=null, bioContent=
关标良 男,(2002—),硕士研究生。主要研究方向:具身智能、强化学习、计算机视觉、多模态大模型等。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856534414767034, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=3., ext=[AuthorCompanyExt(id=1251856534423155641, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534414767034, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
3.School of Software Engineering, Xi’an Jiaotong University, Xi’an 710049, China), AuthorCompanyExt(id=1251856534427349946, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534414767034, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
3.西安交通大学 软件学院,陕西 西安 710049)])]), Author(id=1251856537245921317, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=3, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856537371750446, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856537245921317, language=EN, stringName=Wen YE, firstName=Wen, middleName=null, lastName=YE, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
2, 4, address=
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China
4.New Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856537468219446, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856537245921317, language=CN, stringName=叶雯, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
2, 4, address=
2.中国科学院大学 人工智能学院,北京 100083
4.中国科学院自动化研究所 模式识别实验室,北京 100083, bio={"content":"
叶雯 女,(2003—),博士研究生。主要研究方向:具身智能、多模态大模型智能体、多智能体系统、AIGC、计算机视觉等。
"}, bioImg=null, bioContent=
叶雯 女,(2003—),博士研究生。主要研究方向:具身智能、多模态大模型智能体、多智能体系统、AIGC、计算机视觉等。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856534309909421, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=2., ext=[AuthorCompanyExt(id=1251856534314103726, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534322492335, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.中国科学院大学 人工智能学院,北京 100083)]), AuthorCompany(id=1251856534528013250, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=4., ext=[AuthorCompanyExt(id=1251856534536401859, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534528013250, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
4.New Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534544790468, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534528013250, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
4.中国科学院自动化研究所 模式识别实验室,北京 100083)])]), Author(id=1251856537556299836, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=4, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856537656963138, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856537556299836, language=EN, stringName=Yacui ZHAO, firstName=Yacui, middleName=null, lastName=ZHAO, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
5, address=
5.The Hamlyn Centre, Imperial College London, London SW7 2AZ , United Kingdom, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856537786986572, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856537556299836, language=CN, stringName=赵雅萃, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
5, address=
5.帝国理工学院 哈姆林中心,伦敦SW7 2AZ, bio={"content":"
赵雅萃 女,(2002—),硕士研究生。主要研究方向:具身智能、多模态大模型等。
"}, bioImg=null, bioContent=
赵雅萃 女,(2002—),硕士研究生。主要研究方向:具身智能、多模态大模型等。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856536067322829, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=5., ext=[AuthorCompanyExt(id=1251856536071517133, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536067322829, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
5.The Hamlyn Centre, Imperial College London, London SW7 2AZ , United Kingdom), AuthorCompanyExt(id=1251856536075711438, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536067322829, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
5.帝国理工学院 哈姆林中心,伦敦SW7 2AZ)])]), Author(id=1251856537912815699, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=5, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856538017673309, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856537912815699, language=EN, stringName=Chaoyang ZHAO, firstName=Chaoyang, middleName=null, lastName=ZHAO, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, 6, address=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China
6.Objecteye. Inc, Beijing 100083, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856538202222692, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856537912815699, language=CN, stringName=赵朝阳, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, 6, address=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083
6.中科视语(北京)科技有限公司,北京 100083, bio={"content":"
赵朝阳 男,(1985—),博士,副研究员。主要研究方向:视频图像分析、多模态大模型、具身智能等。
"}, bioImg=null, bioContent=
赵朝阳 男,(1985—),博士,副研究员。主要研究方向:视频图像分析、多模态大模型、具身智能等。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856534184080291, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=1., ext=[AuthorCompanyExt(id=1251856534200857510, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534213440426, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083)]), AuthorCompany(id=1251856536163791830, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=6., ext=[AuthorCompanyExt(id=1251856536172180438, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536163791830, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
6.Objecteye. Inc, Beijing 100083, China), AuthorCompanyExt(id=1251856536176374743, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536163791830, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
6.中科视语(北京)科技有限公司,北京 100083)])]), Author(id=1251856538290303080, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, orderNo=6, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251856538407743601, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856538290303080, language=EN, stringName=Jinqiao WANG, firstName=Jinqiao, middleName=null, lastName=WANG, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, 2, address=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251856538504212600, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, authorId=1251856538290303080, language=CN, stringName=王金桥, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
1, 2, address=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083
2.中国科学院大学 人工智能学院,北京 100083, bio={"content":"
王金桥 男,(1978—),博士,研究员。主要研究方向:具身智能、视频图像分析、多模态大模型、自监督学习、目标检测与跟踪、细粒度识别、行为识别。
"}, bioImg=null, bioContent=
王金桥 男,(1978—),博士,研究员。主要研究方向:具身智能、视频图像分析、多模态大模型、自监督学习、目标检测与跟踪、细粒度识别、行为识别。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251856534184080291, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=1., ext=[AuthorCompanyExt(id=1251856534200857510, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534213440426, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083)]), AuthorCompany(id=1251856534309909421, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=2., ext=[AuthorCompanyExt(id=1251856534314103726, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534322492335, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.中国科学院大学 人工智能学院,北京 100083)])])], keywords=[Keyword(id=1251856538734899330, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, orderNo=1, keyword=VLA models), Keyword(id=1251856538814591112, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, orderNo=2, keyword=large models), Keyword(id=1251856538898477197, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, orderNo=3, keyword=embodied AI), Keyword(id=1251856540513284244, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, orderNo=4, keyword=robot learning), Keyword(id=1251856540697833629, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, orderNo=5, keyword=multimodal learning), Keyword(id=1251856540802691235, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, orderNo=1, keyword=视觉-语言-行动模型), Keyword(id=1251856540932714665, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, orderNo=2, keyword=大模型), Keyword(id=1251856541075321007, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, orderNo=3, keyword=具身智能), Keyword(id=1251856541167595695, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, orderNo=4, keyword=机器人学习), Keyword(id=1251856541285036215, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, orderNo=5, keyword=多模态学习)], refs=[Reference(id=1251856545206710581, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, 
rfNumber=[1], rfOrder=0, authorNames=MA Y E, SONG Z X, ZHUANG Y Z, journalName=null, refType=null, unstructuredReference=
MA Y E,
SONG Z X,
ZHUANG Y Z,
et al. A Survey on Vision-Language-Action Models for Embodied AI[EB/OL]. (2025-03-04)[
2025-07-10].
https://arxiv.org/abs/2405.14093., articleTitle=A Survey on Vision-Language-Action Models for Embodied AI, refAbstract=null), Reference(id=1251856545278013752, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2016, volume=35, issue=5, pageStart=1285, pageEnd=1298, url=null, language=null, rfNumber=[2], rfOrder=1, authorNames=SHIN H C, ROTH H R, GAO M C, journalName=IEEE Transactions on Medical Imaging, refType=null, unstructuredReference=
SHIN H C,
ROTH H R,
GAO M C,
et al. Deep Convolutional Neural Networks for Computer-aided Detection:CNN Architectures, Dataset Characteristics and Transfer Learning[J].
IEEE Transactions on Medical Imaging,
2016,
35(5):1285-1298., articleTitle=Deep Convolutional Neural Networks for Computer-aided Detection:CNN Architectures, Dataset Characteristics and Transfer Learning, refAbstract=null), Reference(id=1251856545345122620, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=2165, pageEnd=2183, url=null, language=null, rfNumber=[3], rfOrder=2, authorNames=BROHAN A, BROWN N, CARBAJAL J, journalName=null, refType=null, unstructuredReference=
BROHAN A,
BROWN N,
CARBAJAL J,
et al. RT-2:Vision-Language-Action Models Transfer Web Knowledge to Robotic Control[C]//Proceedings of Conference on Robot Learning. Atlanta:PMLR,
2023:2165-2183., articleTitle=RT-2:Vision-Language-Action Models Transfer Web Knowledge to Robotic Control, refAbstract=null), Reference(id=1251856545416425791, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[4], rfOrder=3, authorNames=DOSOVITSKIY A, BEYER L, KOLESNIKOV A, journalName=null, refType=null, unstructuredReference=
DOSOVITSKIY A,
BEYER L,
KOLESNIKOV A,
et al. An Image Is Worth 16×16 Words: Transformers for Image Recognition at Scale[EB/OL]. (2020-10-22)[
2025-07-10].
https://arxiv.org/abs/2010.11929., articleTitle=An Image Is Worth 16×16 Words: Transformers for Image Recognition at Scale, refAbstract=null), Reference(id=1251856545487728963, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2024, volume=15, issue=3, pageStart=1, pageEnd=45, url=null, language=null, rfNumber=[5], rfOrder=4, authorNames=CHANG Y P, WANG X, WANG J D, journalName=ACM Transactions on Intelligent Systems and Technology, refType=null, unstructuredReference=
CHANG Y P,
WANG X,
WANG J D,
et al. A Survey on Evaluation of Large Language Models[J].
ACM Transactions on Intelligent Systems and Technology,
2024,
15(3):1-45., articleTitle=A Survey on Evaluation of Large Language Models, refAbstract=null), Reference(id=1251856545617752391, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2024, volume=46, issue=8, pageStart=5625, pageEnd=5644, url=null, language=null, rfNumber=[6], rfOrder=5, authorNames=ZHANG J Y, HUANG J X, JIN S, journalName=IEEE Transactions on Pattern Analysis and Machine Intelligence, refType=null, unstructuredReference=
ZHANG J Y,
HUANG J X,
JIN S,
et al. Vision-Language Models for Vision Tasks: A Survey[J].
IEEE Transactions on Pattern Analysis and Machine Intelligence,
2024,
46(8):5625-5644., articleTitle=Vision-Language Models for Vision Tasks: A Survey, refAbstract=null), Reference(id=1251856545705832779, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[7], rfOrder=6, authorNames=VASWANI A, SHAZEER N, PARMAR N, journalName=null, refType=null, unstructuredReference=
VASWANI A,
SHAZEER N,
PARMAR N,
et al. Attention Is All You Need[EB/OL]. (2017-06-12)[
2025-07-10].
https://arxiv.org/abs/1706.03762., articleTitle=Attention Is All You Need, refAbstract=null), Reference(id=1251856545814884689, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[8], rfOrder=7, authorNames=BROHAN A, BROWN N, CARBAJAL J, journalName=null, refType=null, unstructuredReference=
BROHAN A,
BROWN N,
CARBAJAL J,
et al. RT-1:Robotics Transformer for Real-world Control at Scale[EB/OL]. (2022-12-13)[
2025-07-10].
https://arxiv.org/abs/2212.06817., articleTitle=RT-1:Robotics Transformer for Real-world Control at Scale, refAbstract=null), Reference(id=1251856545898770772, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2009, volume=null, issue=null, pageStart=248, pageEnd=255, url=null, language=null, rfNumber=[9], rfOrder=8, authorNames=DENG J, DONG W, SOCHER R, journalName=null, refType=null, unstructuredReference=
DENG J,
DONG W,
SOCHER R,
et al. ImageNet: A Large-scale Hierarchical Image Database[C]//2009 IEEE Conference on Computer Vision and Pattern Recognition. Miami: IEEE,
2009:248-255., articleTitle=ImageNet: A Large-scale Hierarchical Image Database, refAbstract=null), Reference(id=1251856545974268247, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=6105, pageEnd=6114, url=null, language=null, rfNumber=[10], rfOrder=9, authorNames=TAN M X, LE Q V, journalName=null, refType=null, unstructuredReference=
TAN M X,
LE Q V. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks[C]//International Conference on Machine Learning. Long Beach: PMLR,
2019:6105-6114., articleTitle=EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks, refAbstract=null), Reference(id=1251856546032988506, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=770, pageEnd=778, url=null, language=null, rfNumber=[11], rfOrder=10, authorNames=HE K M, ZHANG X Y, REN S Q, journalName=null, refType=null, unstructuredReference=
HE K M,
ZHANG X Y,
REN S Q,
et al. Deep Residual Learning for Image Recognition[C]//Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Las Vegas: IEEE,
2016:770-778., articleTitle=Deep Residual Learning for Image Recognition, refAbstract=null), Reference(id=1251856546091708767, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[12], rfOrder=11, authorNames=OQUAB M, DARCET T, MOUTAKANNI T, journalName=null, refType=null, unstructuredReference=
OQUAB M,
DARCET T,
MOUTAKANNI T,
et al. DINOv2: Learning Robust Visual Features Without Supervision[EB/OL]. (2023-04-14)[
2025-07-10].
https://arxiv.org/abs/2304.07193., articleTitle=DINOv2: Learning Robust Visual Features Without Supervision, refAbstract=null), Reference(id=1251856546158817635, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=11941, pageEnd=11952, url=null, language=null, rfNumber=[13], rfOrder=12, authorNames=ZHAI X H, MUSTAFA B, KOLESNIKOV A, journalName=null, refType=null, unstructuredReference=
ZHAI X H,
MUSTAFA B,
KOLESNIKOV A,
et al. Sigmoid Loss for Language Image Pre-training[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. Paris:IEEE/CVF,
2023:11941-11952., articleTitle=Sigmoid Loss for Language Image Pre-training, refAbstract=null), Reference(id=1251856546251092325, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=8469, pageEnd=8488, url=null, language=null, rfNumber=[14], rfOrder=13, authorNames=DRIESS D, XIA F, SAJJADI M S M, journalName=null, refType=null, unstructuredReference=
DRIESS D,
XIA F,
SAJJADI M S M,
et al. PaLM-E:An Embodied Multimodal Language Model[C]// Proceedings of the 40th International Conference on Machine Learning. Honolulu:PMLR,
2023: 8469-8488., articleTitle=PaLM-E:An Embodied Multimodal Language Model, refAbstract=null), Reference(id=1251856546334978409, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[15], rfOrder=14, authorNames=TOUVRON H, LAVRIL T, IZACARD G, journalName=null, refType=null, unstructuredReference=
TOUVRON H,
LAVRIL T,
IZACARD G,
et al. LLaMA:Open and Efficient Foundation Language Models[EB/OL]. (2023-02-27)[
2025-07-10].
https://arxiv.org/abs/2302.13971., articleTitle=LLaMA:Open and Efficient Foundation Language Models, refAbstract=null), Reference(id=1251856546402087276, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[16], rfOrder=15, authorNames=MESNARD T, HARDIN C, DADASHI R, journalName=null, refType=null, unstructuredReference=
MESNARD T,
HARDIN C,
DADASHI R,
et al. Gemma: Open Models Based on Gemini Research and Technology[EB/OL]. (2024-03-13)[
2025-07-10].
https://arxiv.org/abs/2403.08295., articleTitle=Gemma: Open Models Based on Gemini Research and Technology, refAbstract=null), Reference(id=1251856546473390446, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[17], rfOrder=16, authorNames=SAPKOTA R, CAO Y, ROUMELIOTIS K I, journalName=null, refType=null, unstructuredReference=
SAPKOTA R,
CAO Y,
ROUMELIOTIS K I,
et al. Vision-Language-Action Models: Concepts, Progress, Applications and Challenges[EB/OL]. (2025-05-07)[
2025-07-10].
https://arxiv.org/abs/2505.04769., articleTitle=Vision-Language-Action Models: Concepts, Progress, Applications and Challenges, refAbstract=null), Reference(id=1251856546557276528, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=3942, pageEnd=3951, url=null, language=null, rfNumber=[18], rfOrder=17, authorNames=PEREZ E, STRUB F, DE VRIES H, journalName=null, refType=null, unstructuredReference=
PEREZ E,
STRUB F,
DE VRIES H,
et al. FiLM: Visual Reasoning with a General Conditioning Layer[C]//Proceedings of the AAAI Conference on Artificial Intelligence. New Orleans:AAAI Press,
2018:3942-3951., articleTitle=FiLM: Visual Reasoning with a General Conditioning Layer, refAbstract=null), Reference(id=1251856546645356913, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=5582, pageEnd=5590, url=null, language=null, rfNumber=[19], rfOrder=18, authorNames=DU Y, LIU Z, LI J, journalName=null, refType=null, unstructuredReference=
DU Y,
LIU Z,
LI J,
et al. A Survey of Vision-Language Pre-trained Models[C]// Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence (IJCAI 2022). Vienna: IJCAI Organization,
2022: 5582-5590., articleTitle=A Survey of Vision-Language Pre-trained Models, refAbstract=null), Reference(id=1251856546720854386, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=1, pageEnd=42, url=null, language=null, rfNumber=[20], rfOrder=19, authorNames=JIANG Y F, GUPTA A, ZHANG Z C, journalName=null, refType=null, unstructuredReference=
JIANG Y F,
GUPTA A,
ZHANG Z C,
et al. VIMA:General Robot Manipulation with Multimodal Prompts[C]//Proceedings of the 40th International Conference on Machine Learning. Honolulu:PMLR,
2023:1-42., articleTitle=VIMA:General Robot Manipulation with Multimodal Prompts, refAbstract=null), Reference(id=1251856546800546167, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[21], rfOrder=20, authorNames=KIM M J, PERTSCH K, KARAMCHETI S, journalName=null, refType=null, unstructuredReference=
KIM M J,
PERTSCH K,
KARAMCHETI S,
et al. OpenVLA: An Open-source Vision-Language-Action Model[EB/OL]. (2024-06-13)[
2025-07-10].
https://arxiv.org/abs/2406.09246., articleTitle=OpenVLA: An Open-source Vision-Language-Action Model, refAbstract=null), Reference(id=1251856546867655036, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[22], rfOrder=21, authorNames=WEN J J, ZHU Y C, LI J M, journalName=null, refType=null, unstructuredReference=
WEN J J,
ZHU Y C,
LI J M,
et al. TinyVLA: Towards Fast, Data-efficient Vision-Language-Action Models for Robotic Manipulation[EB/OL]. (2024-09-19)[
2025-07-10].
https://arxiv.org/abs/2409.12514., articleTitle=TinyVLA: Towards Fast, Data-efficient Vision-Language-Action Models for Robotic Manipulation, refAbstract=null), Reference(id=1251856546938958208, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[23], rfOrder=22, authorNames=ZHEN H Y, QIU X W, CHEN P H, journalName=null, refType=null, unstructuredReference=
ZHEN H Y,
QIU X W,
CHEN P H,
et al. 3D-VLA: A 3D Vision-Language-Action Generative World Model[EB/OL]. (2024-03-14)[
2025-07-10].
https://arxiv.org/abs/2403.09631., articleTitle=3D-VLA: A 3D Vision-Language-Action Generative World Model, refAbstract=null), Reference(id=1251856547010261383, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[24], rfOrder=23, authorNames=REED S, ZOLNA K, PARISOTTO E, journalName=null, refType=null, unstructuredReference=
REED S,
ZOLNA K,
PARISOTTO E,
et al. A Generalist Agent[EB/OL]. (2022-05-12)[
2025-07-10].
https://arxiv.org/abs/2205.06175., articleTitle=A Generalist Agent, refAbstract=null), Reference(id=1251856547115118984, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[25], rfOrder=24, authorNames=HO J, JAIN A, ABBEEL P, journalName=null, refType=null, unstructuredReference=
HO J,
JAIN A,
ABBEEL P. Denoising Diffusion Probabilistic Models[EB/OL]. (2020-06-19)[
2025-07-10].
https://arxiv.org/abs/2006.11239., articleTitle=Denoising Diffusion Probabilistic Models, refAbstract=null), Reference(id=1251856547190616457, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[26], rfOrder=25, authorNames=CHI C, XU Z J, FENG S Y, journalName=null, refType=null, unstructuredReference=
CHI C,
XU Z J,
FENG S Y,
et al. Diffusion Policy:Visuomotor Policy Learning via Action Diffusion[EB/OL]. (2023-03-07)[
2025-07-10].
https://arxiv.org/abs/2303.04137., articleTitle=Diffusion Policy:Visuomotor Policy Learning via Action Diffusion, refAbstract=null), Reference(id=1251856547278696844, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[27], rfOrder=26, authorNames=BLACK K, BROWN N, DRIESS D, journalName=null, refType=null, unstructuredReference=
BLACK K,
BROWN N,
DRIESS D,
et al.
π0: A Vision-Language-Action Flow Model for General Robot Control[EB/OL]. (2024-10-31)[
2025-07-10].
https://arxiv.org/abs/2410.24164., articleTitle=
π0: A Vision-Language-Action Flow Model for General Robot Control, refAbstract=null), Reference(id=1251856547350000016, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[28], rfOrder=27, authorNames=OCTO MODEL TEAM, GHOSH D, WALKE H, journalName=null, refType=null, unstructuredReference=OCTO MODEL TEAM,
GHOSH D,
WALKE H,
et al. Octo: An Open-source Generalist Robot Policy[EB/OL]. (2024-05-20)[
2025-07-10].
https://arxiv.org/abs/2405.12213., articleTitle=Octo: An Open-source Generalist Robot Policy, refAbstract=null), Reference(id=1251856547408720274, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[29], rfOrder=28, authorNames=LIPMAN Y, CHEN R T Q, BEN-HAMU H, journalName=null, refType=null, unstructuredReference=
LIPMAN Y,
CHEN R T Q,
BEN-HAMU H,
et al. Flow Matching for Generative Modeling[EB/OL]. (2022-10-06)[
2025-07-10].
https://arxiv.org/abs/2210.02747., articleTitle=Flow Matching for Generative Modeling, refAbstract=null), Reference(id=1251856547484217749, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[30], rfOrder=29, authorNames=PHYSICAL INTELLIGENCE, BLACK K, BROWN N, journalName=null, refType=null, unstructuredReference=PHYSICAL INTELLIGENCE,
BLACK K,
BROWN N,
et al.
π0.5: A Vision-Language-Action Model with Open-World Generalization[EB/OL]. (2025-04-22)[
2025-07-10].
https://arxiv.org/abs/2504.16054., articleTitle=
π0.5: A Vision-Language-Action Model with Open-World Generalization, refAbstract=null), Reference(id=1251856547559715225, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[31], rfOrder=30, authorNames=JIANG S C, HUANG Z L, QIAN K G, journalName=null, refType=null, unstructuredReference=
JIANG S C,
HUANG Z L,
QIAN K G,
et al. A Survey on Vision-Language-Action Models for Autonomous Driving[EB/OL]. (2025-06-30)[
2025-07-10].
https://arxiv.org/abs/2506.24044., articleTitle=A Survey on Vision-Language-Action Models for Autonomous Driving, refAbstract=null), Reference(id=1251856547631018396, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[32], rfOrder=31, authorNames=HONG Y C, WU Q, QI Y K, journalName=null, refType=null, unstructuredReference=
HONG Y C,
WU Q,
QI Y K,
et al. A Recurrent Vision-and-Language BERT for Navigation[EB/OL]. (2020-11-26)[
2025-07-10].
https://arxiv.org/abs/2011.13922., articleTitle=A Recurrent Vision-and-Language BERT for Navigation, refAbstract=null), Reference(id=1251856547689738653, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=4171, pageEnd=4186, url=null, language=null, rfNumber=[33], rfOrder=32, authorNames=DEVLIN J, CHANG M W, LEE K, journalName=null, refType=null, unstructuredReference=
DEVLIN J,
CHANG M W,
LEE K,
et al. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding[C]//Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Minneapolis: Association for Computational Linguistics,
2019:4171-4186., articleTitle=BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, refAbstract=null), Reference(id=1251856547765236129, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[34], rfOrder=33, authorNames=SHRIDHAR M, MANUELLI L, FOX D, journalName=null, refType=null, unstructuredReference=
SHRIDHAR M,
MANUELLI L,
FOX D. CLIPort: What and Where Pathways for Robotic Manipulation[EB/OL]. (2021-09-24)[
2025-07-10].
https://arxiv.org/abs/2109.12098., articleTitle=CLIPort: What and Where Pathways for Robotic Manipulation, refAbstract=null), Reference(id=1251856547849122210, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=8748, pageEnd=8763, url=null, language=null, rfNumber=[35], rfOrder=34, authorNames=RADFORD A, KIM J W, HALLACY C, journalName=null, refType=null, unstructuredReference=
RADFORD A,
KIM J W,
HALLACY C,
et al. Learning Transferable Visual Models from Natural Language Supervision[C]// Proceedings of the 38th International Conference on Machine Learning.[S. l.]:PMLR,
2021:8748-8763., articleTitle=Learning Transferable Visual Models from Natural Language Supervision, refAbstract=null), Reference(id=1251856547907842469, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025, volume=13, issue=null, pageStart=162467, pageEnd=162504, url=null, language=null, rfNumber=[36], rfOrder=35, authorNames=KAWAHARAZUKA K, OH J, YAMADA J, journalName=IEEE Access, refType=null, unstructuredReference=
KAWAHARAZUKA K,
OH J,
YAMADA J,
et al. Vision-Language-Action Models for Robotics: A Review Towards Real-world Applications[J].
IEEE Access,
2025,
13:162467-162504., articleTitle=Vision-Language-Action Models for Robotics: A Review Towards Real-world Applications, refAbstract=null), Reference(id=1251856549484900775, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[37], rfOrder=36, authorNames=CHEN X, DJOLONGA J, PADLEWSKI P, journalName=null, refType=null, unstructuredReference=
CHEN X,
DJOLONGA J,
PADLEWSKI P,
et al. PaLI-X:On Scaling up a Multilingual Vision and Language Model[EB/OL]. (2023-05-29)[
2025-07-10].
https://arxiv.org/abs/2305.18565., articleTitle=PaLI-X:On Scaling up a Multilingual Vision and Language Model, refAbstract=null), Reference(id=1251856549560398251, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[38], rfOrder=37, authorNames=TOUVRON H, MARTIN L, STONE K, journalName=null, refType=null, unstructuredReference=
TOUVRON H,
MARTIN L,
STONE K,
et al. Llama 2:Open Foundation and Fine-tuned Chat Models[EB/OL]. (2023-07-18)[
2025-07-10].
https://arxiv.org/abs/2307.09288., articleTitle=Llama 2:Open Foundation and Fine-tuned Chat Models, refAbstract=null), Reference(id=1251856549640090030, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[39], rfOrder=38, authorNames=O'NEILL A, REHMAN A, MASSUKURI A, journalName=null, refType=null, unstructuredReference=
O'NEILL A,
REHMAN A,
MASSUKURI A,
et al. Open X-Embodiment: Robotic Learning Datasets and RT-X Models[EB/OL]. (2023-10-13)[
2025-07-10].
https://arxiv.org/abs/2310.08864., articleTitle=Open X-Embodiment: Robotic Learning Datasets and RT-X Models, refAbstract=null), Reference(id=1251856549719781809, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[40], rfOrder=39, authorNames=HU E J, SHEN Y L, WALLIS P, journalName=null, refType=null, unstructuredReference=
HU E J,
SHEN Y L,
WALLIS P,
et al. LoRA: Low-rank Adaptation of Large Language Models[EB/OL]. (2021-06-17)[
2025-07-10].
https://arxiv.org/abs/2106.09685., articleTitle=LoRA: Low-rank Adaptation of Large Language Models, refAbstract=null), Reference(id=1251856549807862199, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[41], rfOrder=40, authorNames=ZHU M J, ZHU Y C, LI J M, journalName=null, refType=null, unstructuredReference=
ZHU M J,
ZHU Y C,
LI J M,
et al. ObjectVLA: End-to-End Open-World Object Manipulation Without Demonstration[EB/OL]. (2025-02-26)[
2025-07-10].
https://arxiv.org/abs/2502.19250., articleTitle=ObjectVLA: End-to-End Open-World Object Manipulation Without Demonstration, refAbstract=null), Reference(id=1251856549887553977, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[42], rfOrder=41, authorNames=HAN X F, CHEN S P, FU Z H, journalName=null, refType=null, unstructuredReference=
HAN X F,
CHEN S P,
FU Z H,
et al. Multimodal Fusion and Vision-Language Models: A Survey for Robot Vision[EB/OL]. (2025-04-03)[
2025-07-10].
https://arxiv.org/abs/2504.02477., articleTitle=Multimodal Fusion and Vision-Language Models: A Survey for Robot Vision, refAbstract=null), Reference(id=1251856549954662844, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[43], rfOrder=42, authorNames=HUANG W L, WANG C, ZHANG R H, journalName=null, refType=null, unstructuredReference=
HUANG W L,
WANG C,
ZHANG R H,
et al. VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models[EB/OL]. (2023-07-12)[
2025-07-10].
https://arxiv.org/abs/2307.05973., articleTitle=VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models, refAbstract=null), Reference(id=1251856550034354621, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[44], rfOrder=43, authorNames=LI C M, WEN J J, PENG Y, journalName=null, refType=null, unstructuredReference=
LI C M,
WEN J J,
PENG Y,
et al. PointVLA: Injecting the 3D World into Vision-Language-Action Models[EB/OL]. (2025-03-10)[
2025-07-10].
https://arxiv.org/abs/2503.07511., articleTitle=PointVLA: Injecting the 3D World into Vision-Language-Action Models, refAbstract=null), Reference(id=1251856550109852096, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=1, pageEnd=19, url=null, language=null, rfNumber=[45], rfOrder=44, authorNames=QU D L, SONG H M, CHEN Q Z, journalName=null, refType=null, unstructuredReference=
QU D L,
SONG H M,
CHEN Q Z,
et al. SpatialVLA:Exploring Spatial Representations for Visual-Language-Action Model[C]//Robotics: Science and Systems (RSS 2025). Utrecht: Robotics: Science and Systems Foundation,
2025:1-19., articleTitle=SpatialVLA:Exploring Spatial Representations for Visual-Language-Action Model, refAbstract=null), Reference(id=1251856550206321091, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[46], rfOrder=45, authorNames=PAN M Y, ZHANG J M, WU T S, journalName=null, refType=null, unstructuredReference=
PAN M Y,
ZHANG J M,
WU T S,
et al. OmniManip:Towards General Robotic Manipulation via Object-centric Interaction Primitives as Spatial Constraints[EB/OL]. (2025-01-07)[
2025-07-10].
https://arxiv.org/abs/2501.03841., articleTitle=OmniManip:Towards General Robotic Manipulation via Object-centric Interaction Primitives as Spatial Constraints, refAbstract=null), Reference(id=1251856550327955909, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[47], rfOrder=46, authorNames=ZHANG C H, HAO P Z, CAO X K, journalName=null, refType=null, unstructuredReference=
ZHANG C H,
HAO P Z,
CAO X K,
et al. VTLA: Vision-Tactile-Language-Action Model with Preference Learning for Insertion Manipulation[EB/OL]. (2025-05-14)[
2025-07-10].
https://arxiv.org/abs/2505.09577., articleTitle=VTLA: Vision-Tactile-Language-Action Model with Preference Learning for Insertion Manipulation, refAbstract=null), Reference(id=1251856550428619208, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[48], rfOrder=47, authorNames=HAO P, ZHANG C H, LI D Z, journalName=null, refType=null, unstructuredReference=
HAO P,
ZHANG C H,
LI D Z,
et al. TLA: Tactile-Language-Action Model for Contact-rich Manipulation[EB/OL]. (2025-03-11)[
2025-07-10].
https://arxiv.org/abs/2503.08548., articleTitle=TLA: Tactile-Language-Action Model for Contact-rich Manipulation, refAbstract=null), Reference(id=1251856550499922379, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[49], rfOrder=48, authorNames=SHI L X Y, ICHTER B, EQUI M, journalName=null, refType=null, unstructuredReference=
SHI L X Y,
ICHTER B,
EQUI M,
et al. Hi Robot:Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models[EB/OL]. (2025-02-26)[
2025-07-10].
https://arxiv.org/abs/2502.19417., articleTitle=Hi Robot:Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models, refAbstract=null), Reference(id=1251856550596391373, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[50], rfOrder=49, authorNames=BJORCK J, CASTAÑEDA F, CHERNIADEV N, journalName=null, refType=null, unstructuredReference=
BJORCK J,
CASTAÑEDA F,
CHERNIADEV N,
et al. GR00T N1: An Open Foundation Model for Generalist Humanoid Robots[EB/OL]. (2025-03-18)[
2025-07-10].
https://arxiv.org/abs/2503.14734., articleTitle=GR00T N1: An Open Foundation Model for Generalist Humanoid Robots, refAbstract=null), Reference(id=1251856550671888848, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[51], rfOrder=50, authorNames=LI Y H, WEI F Y, ZHANG C, journalName=null, refType=null, unstructuredReference=
LI Y H,
WEI F Y,
ZHANG C,
et al. EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees[EB/OL]. (2024-06-24)[
2025-07-10].
https://arxiv.org/abs/2406.16858., articleTitle=EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees, refAbstract=null), Reference(id=1251856550768357843, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=4172, pageEnd=4182, url=null, language=null, rfNumber=[52], rfOrder=51, authorNames=PEEBLES W, XIE S N, journalName=null, refType=null, unstructuredReference=
PEEBLES W,
XIE S N. Scalable Diffusion Models with Transformers[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. Paris: IEEE,
2023: 4172-4182., articleTitle=Scalable Diffusion Models with Transformers, refAbstract=null), Reference(id=1251856550860632534, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[53], rfOrder=52, authorNames=null, journalName=null, refType=null, unstructuredReference=Figure. Helix: A Vision-Language-Action Model for Generalist Humanoid Control[EB/OL]. (2025-02-20)[
2025-07-10].
https://www.figure.ai/news/helix., articleTitle=Figure. Helix: A Vision-Language-Action Model for Generalist Humanoid Control, refAbstract=null), Reference(id=1251856550940324312, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[54], rfOrder=53, authorNames=WEN J J, ZHU Y C, LI J M, journalName=null, refType=null, unstructuredReference=
WEN J J,
ZHU Y C,
LI J M,
et al. DexVLA: Vision-Language Model with Plug-in Diffusion Expert for General Robot Control[EB/OL]. (2025-02-10)[
2025-07-10].
https://arxiv.org/abs/2502.05855., articleTitle=DexVLA: Vision-Language Model with Plug-in Diffusion Expert for General Robot Control, refAbstract=null), Reference(id=1251856551024210395, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[55], rfOrder=54, authorNames=XIONG J Z, LIU G C, HUANG L, journalName=null, refType=null, unstructuredReference=
XIONG J Z,
LIU G C,
HUANG L,
et al. Autoregressive Models in Vision: A Survey[EB/OL]. (2024-11-08)[
2025-07-10].
https://arxiv.org/abs/2411.05902., articleTitle=Autoregressive Models in Vision: A Survey, refAbstract=null), Reference(id=1251856551078736349, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=26429, pageEnd=26455, url=null, language=null, rfNumber=[56], rfOrder=55, authorNames=LU J S, CLARK C, LEE S, journalName=null, refType=null, unstructuredReference=
LU J S,
CLARK C,
LEE S,
et al. Unified-IO 2: Scaling Autoregressive Multimodal Models with Vision Language Audio and Action[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle:IEEE,
2024:26429-26455., articleTitle=Unified-IO 2: Scaling Autoregressive Multimodal Models with Vision Language Audio and Action, refAbstract=null), Reference(id=1251856551162622431, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[57], rfOrder=56, authorNames=ZAWALSKI M, CHEN W, PERTSCH K, journalName=null, refType=null, unstructuredReference=
ZAWALSKI M,
CHEN W,
PERTSCH K,
et al. Robotic Control via Embodied Chain-of-Thought Reasoning[EB/OL]. (2024-07-11)[
2025-07-10].
https://arxiv.org/abs/2407.08693., articleTitle=Robotic Control via Embodied Chain-of-Thought Reasoning, refAbstract=null), Reference(id=1251856551259091426, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=24824, pageEnd=24837, url=null, language=null, rfNumber=[58], rfOrder=57, authorNames=WEI J, WANG X Z, SCHUURMANS D, journalName=null, refType=null, unstructuredReference=
WEI J,
WANG X Z,
SCHUURMANS D,
et al. Chain-of-thought Prompting Elicits Reasoning in Large Language Models[C]//Advances in Neural Information Processing Systems 36 (NeurIPS 2022). New Orleans: NeurIPS,
2022:24824-24837., articleTitle=Chain-of-thought Prompting Elicits Reasoning in Large Language Models, refAbstract=null), Reference(id=1251856551330394597, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=1, pageEnd=17, url=null, language=null, rfNumber=[59], rfOrder=58, authorNames=DU Y L, YANG M J, DAI B, journalName=null, refType=null, unstructuredReference=
DU Y L,
YANG M J,
DAI B,
et al. Learning Universal Policies via Text-guided Video Generation[C]//Advances in Neural Information Processing Systems 37 (NeurIPS 2023). New Orleans: NeurIPS,
2023:1-17., articleTitle=Learning Universal Policies via Text-guided Video Generation, refAbstract=null), Reference(id=1251856551414280680, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[60], rfOrder=59, authorNames=ZHAO T Z, KUMAR V, LEVINE S, journalName=null, refType=null, unstructuredReference=
ZHAO T Z,
KUMAR V,
LEVINE S,
et al. Learning Fine-grained Bimanual Manipulation with Low-cost Hardware[EB/OL]. (2023-04-23)[
2025-07-10].
https://arxiv.org/abs/2304.13705., articleTitle=Learning Fine-grained Bimanual Manipulation with Low-cost Hardware, refAbstract=null), Reference(id=1251856551481389545, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=1, pageEnd=24, url=null, language=null, rfNumber=[61], rfOrder=60, authorNames=KIM M J, FINN C, LIANG P, journalName=null, refType=null, unstructuredReference=
KIM M J,
FINN C,
LIANG P,
et al. Fine-tuning Vision-Language-Action Models: Optimizing Speed and Success[C]//Robotics: Science and Systems (RSS 2025). Utrecht:Robotics: Science and Systems Foundation,
2025:1-24., articleTitle=Fine-tuning Vision-Language-Action Models: Optimizing Speed and Success, refAbstract=null), Reference(id=1251856551569469930, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[62], rfOrder=61, authorNames=CEN J, YU C H, YUAN H J, journalName=null, refType=null, unstructuredReference=
CEN J,
YU C H,
YUAN H J,
et al. WorldVLA: Towards Autoregressive Action World Model[EB/OL]. (2025-06-26)[
2025-07-10].
https://arxiv.org/abs/2506.21539., articleTitle=WorldVLA: Towards Autoregressive Action World Model, refAbstract=null), Reference(id=1251856551636578796, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[63], rfOrder=62, authorNames=JIANG A X, GAO Y, SUN Z G, journalName=null, refType=null, unstructuredReference=
JIANG A X,
GAO Y,
SUN Z G,
et al. DiffVLA: Vision-Language Guided Diffusion Planning for Autonomous Driving[EB/OL]. (2025-05-26)[
2025-07-10].
https://arxiv.org/abs/2505.19381., articleTitle=DiffVLA: Vision-Language Guided Diffusion Planning for Autonomous Driving, refAbstract=null), Reference(id=1251856551733047790, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[64], rfOrder=63, authorNames=SHUKOR M, AUBAKIROVA D, CAPUANO F, journalName=null, refType=null, unstructuredReference=
SHUKOR M,
AUBAKIROVA D,
CAPUANO F,
et al. SmolVLA: A Vision-Language-Action Model for Affordable and Efficient Robotics[EB/OL]. (2025-06-02)[
2025-07-10].
https://arxiv.org/abs/2506.01844., articleTitle=SmolVLA: A Vision-Language-Action Model for Affordable and Efficient Robotics, refAbstract=null), Reference(id=1251856551800156656, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[65], rfOrder=64, authorNames=LIU S M, WU L X, LI B G, journalName=null, refType=null, unstructuredReference=
LIU S M,
WU L X,
LI B G,
et al. RDT-1B: A Diffusion Foundation Model for Bimanual Manipulation[EB/OL]. (2024-10-10)[
2025-07-10].
https://arxiv.org/abs/2410.07864, articleTitle=RDT-1B: A Diffusion Foundation Model for Bimanual Manipulation, refAbstract=null), Reference(id=1251856551879848434, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[66], rfOrder=65, authorNames=XUE H R, REN J J, CHEN W D, journalName=null, refType=null, unstructuredReference=
XUE H R,
REN J J,
CHEN W D,
et al. Reactive Diffusion Policy: Slow-fast Visual-tactile Policy Learning for Contact-rich Manipulation[EB/OL]. (2025-03-04)[
2025-07-10].
https://arxiv.org/abs/2503.02881., articleTitle=Reactive Diffusion Policy: Slow-fast Visual-tactile Policy Learning for Contact-rich Manipulation, refAbstract=null), Reference(id=1251856551963734517, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[67], rfOrder=66, authorNames=LIU J M, CHEN H R, AN P J, journalName=null, refType=null, unstructuredReference=
LIU J M,
CHEN H R,
AN P J,
et al. HybridVLA: Collaborative Diffusion and Autoregression in a Unified Vision-Language-Action Model[EB/OL]. (2025-03-13)[
2025-07-10].
https://arxiv.org/abs/2503.10631., articleTitle=HybridVLA: Collaborative Diffusion and Autoregression in a Unified Vision-Language-Action Model, refAbstract=null), Reference(id=1251856552043426293, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[68], rfOrder=67, authorNames=GHOSH A, ACHARYA A, SAHA S, journalName=null, refType=null, unstructuredReference=
GHOSH A,
ACHARYA A,
SAHA S,
et al. Exploring the Frontier of Vision-Language Models: A Survey of Current Methodologies and Future Directions[EB/OL]. (2024-04-12)[
2025-07-10].
https://arxiv.org/abs/2404.07214v2., articleTitle=Exploring the Frontier of Vision-Language Models: A Survey of Current Methodologies and Future Directions, refAbstract=null), Reference(id=1251856552144089592, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[69], rfOrder=68, authorNames=null, journalName=null, refType=null, unstructuredReference=PsiBot. The Second Wave of Real VLA: Psi R1 Achieves Generalized Intelligence at the Brain Level![EB/OL]. (2025-04-29)[
2025-07-10].
https://www.psibot.ai/en/008_en/., articleTitle=PsiBot. The Second Wave of Real VLA: Psi R1 Achieves Generalized Intelligence at the Brain Level!, refAbstract=null), Reference(id=1251856552219587065, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[70], rfOrder=69, authorNames=ZHONG Y F, HUANG X C, LI R C, journalName=null, refType=null, unstructuredReference=
ZHONG Y F,
HUANG X C,
LI R C,
et al. DexGraspVLA: A Vision-Language-Action Framework Towards General Dexterous Grasping[EB/OL]. (2025-02-28)[
2025-07-10].
https://arxiv.org/abs/2502.20900., articleTitle=DexGraspVLA: A Vision-Language-Action Framework Towards General Dexterous Grasping, refAbstract=null), Reference(id=1251856552299278843, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[71], rfOrder=70, authorNames=LIN F Q, NAI R Q, HU Y D, journalName=null, refType=null, unstructuredReference=
LIN F Q,
NAI R Q,
HU Y D,
et al. OneTwoVLA: A Unified Vision-Language-Action Model with Adaptive Reasoning[EB/OL]. (2025-05-17)[
2025-07-10].
https://arxiv.org/abs/2505.11917., articleTitle=OneTwoVLA: A Unified Vision-Language-Action Model with Adaptive Reasoning, refAbstract=null), Reference(id=1251856552366387707, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[72], rfOrder=71, authorNames=LI Z X, WU X Y, DU H Y, journalName=null, refType=null, unstructuredReference=
LI Z X,
WU X Y,
DU H Y,
et al. A Survey of State of the Art Large Vision Language Models: Alignment, Benchmark, Evaluations and Challenges[EB/OL]. (2025-01-04)[
2025-07-10].
https://arxiv.org/abs/2501.02189., articleTitle=A Survey of State of the Art Large Vision Language Models: Alignment, Benchmark, Evaluations and Challenges, refAbstract=null), Reference(id=1251856553964417534, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[73], rfOrder=72, authorNames=KOLVE E, MOTTAGHI R, HAN W, journalName=null, refType=null, unstructuredReference=
KOLVE E,
MOTTAGHI R,
HAN W,
et al. AI2-THOR:An Interactive 3D Environment for Visual AI[EB/OL]. (2017-12-14)[
2025-07-10].
https://arxiv.org/abs/1712.05474., articleTitle=AI2-THOR:An Interactive 3D Environment for Visual AI, refAbstract=null), Reference(id=1251856554048303614, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=10737, pageEnd=10746, url=null, language=null, rfNumber=[74], rfOrder=73, authorNames=SHRIDHAR M, THOMASON J, GORDON D, journalName=null, refType=null, unstructuredReference=
SHRIDHAR M,
THOMASON J,
GORDON D,
et al. ALFRED: A Benchmark for Interpreting Grounded Instructions for Everyday Tasks[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle:IEEE,
2020:10737-10746., articleTitle=ALFRED: A Benchmark for Interpreting Grounded Instructions for Everyday Tasks, refAbstract=null), Reference(id=1251856554123801088, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=9339, pageEnd=9347, url=null, language=null, rfNumber=[75], rfOrder=74, authorNames=SAVVA M, KADIAN A, MAKSYMETS O, journalName=null, refType=null, unstructuredReference=
SAVVA M,
KADIAN A,
MAKSYMETS O,
et al. Habitat: A Platform for Embodied AI Research[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. Seoul: IEEE,
2019: 9339-9347., articleTitle=Habitat: A Platform for Embodied AI Research, refAbstract=null), Reference(id=1251856554203492865, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=667, pageEnd=676, url=null, language=null, rfNumber=[76], rfOrder=75, authorNames=CHANG A, DAI A, FUNKHOUSER T, journalName=null, refType=null, unstructuredReference=
CHANG A,
DAI A,
FUNKHOUSER T,
et al. Matterport3D: Learning from RGB-D Data in Indoor Environments[C]//Proceedings of the IEEE International Conference on Computer Vision. Qingdao:IEEE,
2017:667-676., articleTitle=Matterport3D: Learning from RGB-D Data in Indoor Environments, refAbstract=null), Reference(id=1251856554278990339, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=7520, pageEnd=7527, url=null, language=null, rfNumber=[77], rfOrder=76, authorNames=SHEN B K, XIA F, LI C S, journalName=null, refType=null, unstructuredReference=
SHEN B K,
XIA F,
LI C S,
et al. iGibson 1.0: A Simulation Environment for Interactive Tasks in Large Realistic Scenes[C]//2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS). Prague:IEEE,
2021:7520-7527., articleTitle=iGibson 1.0: A Simulation Environment for Interactive Tasks in Large Realistic Scenes, refAbstract=null), Reference(id=1251856554383847941, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[78], rfOrder=77, authorNames=LI C S, XIA F, MARTÍN-MARTÍN R, journalName=null, refType=null, unstructuredReference=
LI C S,
XIA F,
MARTÍN-MARTÍN R,
et al. iGibson 2.0: Object-Centric Simulation for Robot Learning of Everyday Household Tasks[EB/OL]. (2021-08-06)[
2025-07-10].
https://arxiv.org/abs/2108.03272., articleTitle=iGibson 2.0: Object-Centric Simulation for Robot Learning of Everyday Household Tasks, refAbstract=null), Reference(id=1251856554459345415, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=11097, pageEnd=11107, url=null, language=null, rfNumber=[79], rfOrder=78, authorNames=XIANG F B, QIN Y Z, MO K C, journalName=null, refType=null, unstructuredReference=
XIANG F B,
QIN Y Z,
MO K C,
et al. SAPIEN: A Simulated Part-based Interactive Environment[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. Seattle:IEEE,
2020:11097-11107., articleTitle=SAPIEN: A Simulated Part-based Interactive Environment, refAbstract=null), Reference(id=1251856554522259977, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[80], rfOrder=79, authorNames=MAKOVIYCHUK V, WAWRZYNIAK L, GUO Y R, journalName=null, refType=null, unstructuredReference=
MAKOVIYCHUK V,
WAWRZYNIAK L,
GUO Y R,
et al. Isaac Gym: High Performance GPU-based Physics Simulation for Robot Learning[EB/OL]. (2021-08-24)[
2025-07-10].
https://arxiv.org/abs/2108.10470., articleTitle=Isaac Gym: High Performance GPU-based Physics Simulation for Robot Learning, refAbstract=null), Reference(id=1251856554593563147, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=44323, pageEnd=44340, url=null, language=null, rfNumber=[81], rfOrder=80, authorNames=KUMAR V, SHAH R, ZHOU G Y, journalName=null, refType=null, unstructuredReference=
KUMAR V,
SHAH R,
ZHOU G Y,
et al. RoboHive: A Unified Framework for Robot Learning[C]//37th Conference on Neural Information Processing Systems (NeurIPS 2023) Track on Datasets and Benchmarks. New Orleans:NeurIPS,
2023:44323-44340., articleTitle=RoboHive: A Unified Framework for Robot Learning, refAbstract=null), Reference(id=1251856554677449229, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2012, volume=null, issue=null, pageStart=5026, pageEnd=5033, url=null, language=null, rfNumber=[82], rfOrder=81, authorNames=TODOROV E, EREZ T, TASSA Y, journalName=null, refType=null, unstructuredReference=
TODOROV E,
EREZ T,
TASSA Y. MuJoCo: A Physics Engine for Model-based Control[C]//2012 IEEE/RSJ International Conference on Intelligent Robots and Systems. Vilamoura-Algarve: IEEE,
2012: 5026-5033., articleTitle=MuJoCo: A Physics Engine for Model-based Control, refAbstract=null), Reference(id=1251856554731975183, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[83], rfOrder=82, authorNames=GURUPRASAD P, SIKKA H, SONG J W, journalName=null, refType=null, unstructuredReference=
GURUPRASAD P,
SIKKA H,
SONG J W,
et al. Benchmarking Vision, Language, & Action Models on Robotic Learning Tasks[EB/OL]. (2024-11-04)[
2025-07-10].
https://arxiv.org/abs/2411.05821., articleTitle=Benchmarking Vision, Language, & Action Models on Robotic Learning Tasks, refAbstract=null), Reference(id=1251856554799084049, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[84], rfOrder=83, authorNames=EBERT F, YANG Y F, SCHMECKPEPER K, journalName=null, refType=null, unstructuredReference=
EBERT F,
YANG Y F,
SCHMECKPEPER K,
et al. Bridge Data: Boosting Generalization of Robotic Skills with Cross-domain Datasets[EB/OL]. (2021-09-27)[
2025-07-10].
https://arxiv.org/abs/2109.13396., articleTitle=Bridge Data: Boosting Generalization of Robotic Skills with Cross-domain Datasets, refAbstract=null), Reference(id=1251856554870387219, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2022, volume=7, issue=2, pageStart=7327, pageEnd=7334, url=null, language=null, rfNumber=[85], rfOrder=84, authorNames=MEES O, HERMANN L, ROSETE-BEAS E, journalName=IEEE Robotics and Automation Letters, refType=null, unstructuredReference=
MEES O,
HERMANN L,
ROSETE-BEAS E,
et al. CALVIN:A Benchmark for Language-Conditioned Policy Learning for Long-horizon Robot Manipulation Tasks[J].
IEEE Robotics and Automation Letters,
2022,
7(2):7327-7334., articleTitle=CALVIN:A Benchmark for Language-Conditioned Policy Learning for Long-horizon Robot Manipulation Tasks, refAbstract=null), Reference(id=1251856554958467605, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=80, pageEnd=93, url=null, language=null, rfNumber=[86], rfOrder=85, authorNames=LI C, ZHANG R, WONG J, journalName=null, refType=null, unstructuredReference=
LI C,
ZHANG R,
WONG J,
et al. Behavior-1K: A Benchmark for Embodied AI with 1,000 Everyday Activities and Realistic Simulation[C]//Conference on Robot Learning. Atlanta:PMLR,
2023: 80-93., articleTitle=Behavior-1K: A Benchmark for Embodied AI with 1,000 Everyday Activities and Realistic Simulation, refAbstract=null), Reference(id=1251856555042353687, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=1, pageEnd=30, url=null, language=null, rfNumber=[87], rfOrder=86, authorNames=GU J Y, XIANG F B, LI X L, journalName=null, refType=null, unstructuredReference=
GU J Y,
XIANG F B,
LI X L,
et al. ManiSkill2:A Unified Benchmark for Generalizable Manipulation Skills[C]//International Conference on Learning Representations (ICLR 2023). Kigali:PMLR,
2023:1-30., articleTitle=ManiSkill2:A Unified Benchmark for Generalizable Manipulation Skills, refAbstract=null), Reference(id=1251856555105268249, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2020, volume=5, issue=2, pageStart=3019, pageEnd=3026, url=null, language=null, rfNumber=[88], rfOrder=87, authorNames=JAMES S, MA Z C, ARROJO D R, journalName=IEEE Robotics and Automation Letters, refType=null, unstructuredReference=
JAMES S,
MA Z C,
ARROJO D R,
et al. RLBench:The Robot Learning Benchmark & Learning Environment[J].
IEEE Robotics and Automation Letters,
2020,
5(2):3019-3026., articleTitle=RLBench:The Robot Learning Benchmark & Learning Environment, refAbstract=null), Reference(id=1251856555172377115, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=44776, pageEnd=44791, url=null, language=null, rfNumber=[89], rfOrder=88, authorNames=LIU B, ZHU Y F, GAO C K, journalName=null, refType=null, unstructuredReference=
LIU B,
ZHU Y F,
GAO C K,
et al. LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning[C]// NIPS’23: Proceedings of the 37th International Conference on Neural Information Processing Systems. New Orleans: Curran Associates Inc.,
2023:44776-44791., articleTitle=LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning, refAbstract=null), Reference(id=1251856555243680285, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[90], rfOrder=89, authorNames=GURUPRASAD P, WANG Y Y, CHOWDHURY S, journalName=null, refType=null, unstructuredReference=
GURUPRASAD P,
WANG Y Y,
CHOWDHURY S,
et al. Benchmarking Vision,Language,& Action Models in Procedurally Generated, Open Ended Action Environments[EB/OL]. (2025-05-08)[
2025-07-10].
https://arxiv.org/abs/2505.05540., articleTitle=Benchmarking Vision,Language,& Action Models in Procedurally Generated, Open Ended Action Environments, refAbstract=null), Reference(id=1251856555310789151, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=7606, pageEnd=7623, url=null, language=null, rfNumber=[91], rfOrder=90, authorNames=GU J, STEFANI E, WU Q, journalName=null, refType=null, unstructuredReference=
GU J,
STEFANI E,
WU Q,
et al. Vision-and-Language Navigation: A Survey of Tasks, Methods, and Future Directions[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL 2022). Dublin: Association for Computational Linguistics,
2022:7606-7623., articleTitle=Vision-and-Language Navigation: A Survey of Tasks, Methods, and Future Directions, refAbstract=null), Reference(id=1251856555373703713, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=1724, pageEnd=1734, url=null, language=null, rfNumber=[92], rfOrder=91, authorNames=JI Y H, TAN H J, SHI J Y, journalName=null, refType=null, unstructuredReference=
JI Y H,
TAN H J,
SHI J Y,
et al. RoboBrain: A Unified Brain Model for Robotic Manipulation from Abstract to Concrete[C]//Proceedings of the Computer Vision and Pattern Recognition Conference. Nashville:IEEE,
2025:1724-1734., articleTitle=RoboBrain: A Unified Brain Model for Robotic Manipulation from Abstract to Concrete, refAbstract=null), Reference(id=1251856555440812579, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2023, volume=49, issue=1, pageStart=1, pageEnd=14, url=null, language=null, rfNumber=[93], rfOrder=92, authorNames=司马双霖, 黄岩, 何科技, journalName=自动化学报, refType=null, unstructuredReference=司马双霖,黄岩,何科技,
等.视觉语言导航研究进展[J].
自动化学报,
2023,
49(1):1-14., articleTitle=视觉语言导航研究进展, refAbstract=null), Reference(id=1251856555507921445, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2024, volume=7, issue=5, pageStart=99, pageEnd=110, url=null, language=null, rfNumber=[94], rfOrder=93, authorNames=杨玉琪, 王梦云, 刘运卓, journalName=无人系统技术, refType=null, unstructuredReference=杨玉琪,王梦云,刘运卓,
等.具身智能及其在自主无人系统的应用研究[J].
无人系统技术,
2024,
7(5):99-110., articleTitle=具身智能及其在自主无人系统的应用研究, refAbstract=null), Reference(id=1251856555583418919, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[95], rfOrder=94, authorNames=SERPIVA V, LYKOV A, MYSHLYAEV A, journalName=null, refType=null, unstructuredReference=
SERPIVA V,
LYKOV A,
MYSHLYAEV A,
et al. RaceVLA: VLA-based Racing Drone Navigation with Humanlike Behaviour[EB/OL]. (2025-03-04)[
2025-07-10].
https://arxiv.org/abs/2503.02572., articleTitle=RaceVLA: VLA-based Racing Drone Navigation with Humanlike Behaviour, refAbstract=null), Reference(id=1251856555671499305, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[96], rfOrder=95, authorNames=CHENG A C, JI Y Z, YANG Z J, journalName=null, refType=null, unstructuredReference=
CHENG A C,
JI Y Z,
YANG Z J,
et al. NaviLa: Legged Robot Vision-Language-Action Model for Navigation[EB/OL]. (2024-12-05)[
2025-07-10].
https://arxiv.org/abs/2412.04453., articleTitle=NaviLa: Legged Robot Vision-Language-Action Model for Navigation, refAbstract=null), Reference(id=1251856555738608171, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[97], rfOrder=96, authorNames=ZHOU X C, HAN X Y, YANG F, journalName=null, refType=null, unstructuredReference=
ZHOU X C,
HAN X Y,
YANG F,
et al. OpenDriveVLA: Towards End-to-End Autonomous Driving with Large Vision Language Action Model[EB/OL]. (2025-03-30)[
2025-07-10].
https://arxiv.org/abs/2503.23463., articleTitle=OpenDriveVLA: Towards End-to-End Autonomous Driving with Large Vision Language Action Model, refAbstract=null), Reference(id=1251856555814105645, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[98], rfOrder=97, authorNames=QIAN K, SUN T Y, WANG W H, journalName=null, refType=null, unstructuredReference=
QIAN K,
SUN T Y,
WANG W H. Exploring Large Vision-Language Models for Robust and Efficient Industrial Anomaly Detection[EB/OL]. (2024-12-01)[
2025-07-10].
https://arxiv.org/abs/2412.00890., articleTitle=Exploring Large Vision-Language Models for Robust and Efficient Industrial Anomaly Detection, refAbstract=null), Reference(id=1251856555868631599, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[99], rfOrder=98, authorNames=LI Q X, LIANG Y B, WANG Z Y, journalName=null, refType=null, unstructuredReference=
LI Q X,
LIANG Y B,
WANG Z Y,
et al. CogAct: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation[EB/OL]. (2024-11-29)[
2025-07-10].
https://arxiv.org/abs/2411.19650., articleTitle=CogAct: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation, refAbstract=null), Reference(id=1251856555931546160, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2022, volume=74, issue=null, pageStart=459, pageEnd=515, url=null, language=null, rfNumber=[100], rfOrder=99, authorNames=FRANCIS J, KITAMURA N, LABELLE F, journalName=Journal of Artificial Intelligence Research, refType=null, unstructuredReference=
FRANCIS J,
KITAMURA N,
LABELLE F,
et al. Core Challenges in Embodied Vision-Language Planning[J].
Journal of Artificial Intelligence Research,
2022,
74:459-515., articleTitle=Core Challenges in Embodied Vision-Language Planning, refAbstract=null), Reference(id=1251856556007043633, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2024, volume=38, issue=12, pageStart=12, pageEnd=25, url=null, language=null, rfNumber=[101], rfOrder=100, authorNames=邓鹏, 唐文涛, 罗静, journalName=电子测量与仪器学报, refType=null, unstructuredReference=邓鹏,唐文涛,罗静.机器人大模型发展与挑战[J].
电子测量与仪器学报,
2024,
38(12):12-25., articleTitle=机器人大模型发展与挑战, refAbstract=null), Reference(id=1251856556074152498, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2021, volume=71, issue=null, pageStart=1183, pageEnd=1317, url=null, language=null, rfNumber=[102], rfOrder=101, authorNames=MOGADALA A, KALIMUTHU M, KLAKOW D, journalName=Journal of Artificial Intelligence Research, refType=null, unstructuredReference=
MOGADALA A,
KALIMUTHU M,
KLAKOW D. Trends in Integration of Vision and Language Research: A Survey of Tasks, Datasets, and Methods[J].
Journal of Artificial Intelligence Research,
2021,
71:1183-1317., articleTitle=Trends in Integration of Vision and Language Research: A Survey of Tasks, Datasets, and Methods, refAbstract=null), Reference(id=1251856556128678451, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[103], rfOrder=102, authorNames=CHERN E, HU Z L, CHERN S, journalName=null, refType=null, unstructuredReference=
CHERN E,
HU Z L,
CHERN S,
et al. Thinking with Generated Images[EB/OL]. (2025-05-28)[
2025-07-10].
https://arxiv.org/abs/2505.22525., articleTitle=Thinking with Generated Images, refAbstract=null), Reference(id=1251856556187398708, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, doi=null, pmid=null, pmcid=null, year=2025-07-10, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[104], rfOrder=103, authorNames=LI C Z, WU W S, ZHANG H Y, journalName=null, refType=null, unstructuredReference=
LI C Z,
WU W S,
ZHANG H Y,
et al. Imagine While Reasoning in Space: Multimodal Visualization-of-Thought[EB/OL]. (2025-01-13)[
2025-07-10].
https://arxiv.org/abs/2501.07542., articleTitle=Imagine While Reasoning in Space: Multimodal Visualization-of-Thought, refAbstract=null)], funds=null, companyList=[AuthorCompany(id=1251856534184080291, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=1., ext=[AuthorCompanyExt(id=1251856534200857510, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534213440426, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534184080291, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
1.中国科学院自动化研究所 紫东太初大模型研究中心,北京 100083)]), AuthorCompany(id=1251856534309909421, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=2., ext=[AuthorCompanyExt(id=1251856534314103726, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534322492335, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534309909421, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
2.中国科学院大学 人工智能学院,北京 100083)]), AuthorCompany(id=1251856534414767034, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=3., ext=[AuthorCompanyExt(id=1251856534423155641, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534414767034, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
3.School of Software Engineering, Xi’an Jiaotong University, Xi’an 710049, China), AuthorCompanyExt(id=1251856534427349946, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534414767034, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
3.西安交通大学 软件学院,陕西 西安 710049)]), AuthorCompany(id=1251856534528013250, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=4., ext=[AuthorCompanyExt(id=1251856534536401859, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534528013250, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
4.New Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing 100083, China), AuthorCompanyExt(id=1251856534544790468, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856534528013250, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
4.中国科学院自动化研究所 模式识别实验室,北京 100083)]), AuthorCompany(id=1251856536067322829, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=5., ext=[AuthorCompanyExt(id=1251856536071517133, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536067322829, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
5.The Hamlyn Centre, Imperial College London, London SW7 2AZ, United Kingdom), AuthorCompanyExt(id=1251856536075711438, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536067322829, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
5.帝国理工学院 哈姆林中心,伦敦SW7 2AZ)]), AuthorCompany(id=1251856536163791830, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, xref=6., ext=[AuthorCompanyExt(id=1251856536172180438, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536163791830, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
6.Objecteye Inc., Beijing 100083, China), AuthorCompanyExt(id=1251856536176374743, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, companyId=1251856536163791830, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=
6.中科视语(北京)科技有限公司,北京 100083)])], figs=[ArticleFig(id=1251856541389893820, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Fig.1, caption=
Typical VLA model architecture comprising language, visual, and action models, figureFileSmall=LvCQUNLIFv/lnId0PDBt8A==, figureFileBig=i9WgRNQIfkcX36GehDV3qw==, tableContent=null), ArticleFig(id=1251856541473779905, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=图1, caption=
典型VLA模型架构,包含语言模型、视觉模型、动作模型, figureFileSmall=LvCQUNLIFv/lnId0PDBt8A==, figureFileBig=i9WgRNQIfkcX36GehDV3qw==, tableContent=null), ArticleFig(id=1251856541662523594, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Fig.2, caption=
Prevalent multimodal fusion strategies, figureFileSmall=aqbjCvbgT/2xDlcMTebFfg==, figureFileBig=hA5Q1hfBhVWUB2ySG8tJ0w==, tableContent=null), ArticleFig(id=1251856541775769810, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=图2, caption=
常见的多模态融合策略, figureFileSmall=aqbjCvbgT/2xDlcMTebFfg==, figureFileBig=hA5Q1hfBhVWUB2ySG8tJ0w==, tableContent=null), ArticleFig(id=1251856541859655896, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Fig.3, caption=
Comparison between π0 and other models, figureFileSmall=teAWA7Efk+Hj2mMxOpZovA==, figureFileBig=KQNDg2HmwhsK1j2AwxfOrg==, tableContent=null), ArticleFig(id=1251856541964513501, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=图3, caption=
π0与其他模型的对比, figureFileSmall=teAWA7Efk+Hj2mMxOpZovA==, figureFileBig=KQNDg2HmwhsK1j2AwxfOrg==, tableContent=null), ArticleFig(id=1251856542098731236, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Fig.4, caption=
VLA architecture of the RT-2, figureFileSmall=yZBHL74IdJgBZOFaxFT0DA==, figureFileBig=2Q8sm2G3bJMvZT22n/mWGg==, tableContent=null), ArticleFig(id=1251856542165840105, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=图4, caption=
RT-2的VLA架构, figureFileSmall=yZBHL74IdJgBZOFaxFT0DA==, figureFileBig=2Q8sm2G3bJMvZT22n/mWGg==, tableContent=null), ArticleFig(id=1251856542274892015, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Fig.5, caption=
Classification of VLA models, figureFileSmall=P1zTu0L6W77mUJuyaXXanQ==, figureFileBig=Ab7ogPCOw5mcJUTXWwXTtw==, tableContent=null), ArticleFig(id=1251856542358778099, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=图5, caption=
VLA模型的分类, figureFileSmall=P1zTu0L6W77mUJuyaXXanQ==, figureFileBig=Ab7ogPCOw5mcJUTXWwXTtw==, tableContent=null), ArticleFig(id=1251856542459441398, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Fig.6, caption=
Application fields of VLA models, figureFileSmall=HUGrHgfhLx2VQf66U2Bq6g==, figureFileBig=/IH9P/T+oZWyE0aPjMGtcA==, tableContent=null), ArticleFig(id=1251856542547521786, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=图6, caption=
VLA模型的应用领域, figureFileSmall=HUGrHgfhLx2VQf66U2Bq6g==, figureFileBig=/IH9P/T+oZWyE0aPjMGtcA==, tableContent=null), ArticleFig(id=1251856542635602175, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Tab.1, caption=
Comparison of ViT and CNN architectures on different datasets
, figureFileSmall=null, figureFileBig=null, tableContent=
| Dataset | Ours-JFT (ViT-H/14) | Ours-JFT (ViT-L/16) | Ours-I21k (ViT-L/16) | BiT-L (ResNet152×4) | Noisy Student (EfficientNet-L2) |
|---|---|---|---|---|---|
| ImageNet | 88.55±0.04 | 87.76±0.03 | 85.30±0.02 | 87.54±0.04 | 88.4/88.5∗ |
| ImageNet Real | 90.72±0.05 | 90.54±0.03 | 88.62±0.05 | 90.54 | 90.55 |
| CIFAR-10 | 99.50±0.06 | 99.42±0.03 | 99.15±0.03 | 99.37±0.06 | — |
| CIFAR-100 | 94.55±0.04 | 93.90±0.05 | 93.25±0.05 | 93.51±0.08 | — |
| Oxford-IIIT Pets | 97.56±0.03 | 97.32±0.11 | 94.67±0.15 | 96.62±0.23 | — |
| Oxford Flowers-102 | 99.68±0.02 | 99.74±0.00 | 99.61±0.02 | 99.63±0.03 | — |
| VTAB (19 tasks) | 77.63±0.23 | 76.28±0.46 | 72.72±0.21 | 76.29±1.70 | — |
| TPUv3-core-days | 2.5k | 0.68k | 0.23k | 9.9k | 12.3k |
), ArticleFig(id=1251856542736265475, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=表1, caption=
ViT与CNN架构在不同数据集的对比
, figureFileSmall=null, figureFileBig=null, tableContent=
| Dataset | Ours-JFT (ViT-H/14) | Ours-JFT (ViT-L/16) | Ours-I21k (ViT-L/16) | BiT-L (ResNet152×4) | Noisy Student (EfficientNet-L2) |
|---|---|---|---|---|---|
| ImageNet | 88.55±0.04 | 87.76±0.03 | 85.30±0.02 | 87.54±0.04 | 88.4/88.5∗ |
| ImageNet Real | 90.72±0.05 | 90.54±0.03 | 88.62±0.05 | 90.54 | 90.55 |
| CIFAR-10 | 99.50±0.06 | 99.42±0.03 | 99.15±0.03 | 99.37±0.06 | — |
| CIFAR-100 | 94.55±0.04 | 93.90±0.05 | 93.25±0.05 | 93.51±0.08 | — |
| Oxford-IIIT Pets | 97.56±0.03 | 97.32±0.11 | 94.67±0.15 | 96.62±0.23 | — |
| Oxford Flowers-102 | 99.68±0.02 | 99.74±0.00 | 99.61±0.02 | 99.63±0.03 | — |
| VTAB (19 tasks) | 77.63±0.23 | 76.28±0.46 | 72.72±0.21 | 76.29±1.70 | — |
| TPUv3-core-days | 2.5k | 0.68k | 0.23k | 9.9k | 12.3k |
), ArticleFig(id=1251856542878871818, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Tab.2, caption=
Classification based on macro architecture
, figureFileSmall=null, figureFileBig=null, tableContent=
| 分类 | 核心思想 | 代表模型 |
|---|---|---|
| 预测离散动作词元的自回归式 | 将动作映射为离散词元 | RT-2、ACT、ECOT、UniPi、OpenVLA-OFT、WorldVLA |
| 生成连续动作序列的生成式 | 学习动作轨迹的条件概率分布 | π0、Diffusion Policy、RDT-1B、TacAR、HybridVLA |
), ArticleFig(id=1251856542975340814, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=表2, caption=
基于宏观架构的分类
, figureFileSmall=null, figureFileBig=null, tableContent=
| 分类 | 核心思想 | 代表模型 |
|---|---|---|
| 预测离散动作词元的自回归式 | 将动作映射为离散词元 | RT-2、ACT、ECOT、UniPi、OpenVLA-OFT、WorldVLA |
| 生成连续动作序列的生成式 | 学习动作轨迹的条件概率分布 | π0、Diffusion Policy、RDT-1B、TacAR、HybridVLA |
), ArticleFig(id=1251856543101169942, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Tab.3, caption=
Classification based on cognitive hierarchies
, figureFileSmall=null, figureFileBig=null, tableContent=
| 分类 | 核心思想 | 代表模型 |
|---|---|---|
| 单系统端到端 | 端到端多模态到动作 | RT-1、ACT、RT-2、OpenVLA、ECOT、UniPi |
| 双系统分层架构 | 高层任务规划与低层控制结合 | Hi-Robot、GR00T N1、Helix、DexVLA、Psi-R1、DexGraspVLA |
), ArticleFig(id=1251856543176667419, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=表3, caption=
基于认知分层的分类
, figureFileSmall=null, figureFileBig=null, tableContent=
| 分类 | 核心思想 | 代表模型 |
|---|---|---|
| 单系统端到端 | 端到端多模态到动作 | RT-1、ACT、RT-2、OpenVLA、ECOT、UniPi |
| 双系统分层架构 | 高层任务规划与低层控制结合 | Hi-Robot、GR00T N1、Helix、DexVLA、Psi-R1、DexGraspVLA |
), ArticleFig(id=1251856543277330719, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Tab.4, caption=
Mainstream embodied simulators
, figureFileSmall=null, figureFileBig=null, tableContent=
| 类别 | 开发机构 | 核心特点 |
|---|---|---|
| AI2-THOR | AI2 | 专注室内场景 |
| Habitat | MetaAI | 室内场景,关注人机交互 |
| iGibson | 斯坦福大学 | 强调真实的物理交互 |
| SAPIEN | 加州大学圣迭戈分校 | 高精度物理操控 |
| Isaac Sim / Gym | NVIDIA | 照片级渲染与GPU加速 |
| RoboHive | 斯坦福大学 | 标准化多任务基准 |
), ArticleFig(id=1251856543373799716, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=表4, caption=
主流具身仿真器
, figureFileSmall=null, figureFileBig=null, tableContent=
| 类别 | 开发机构 | 核心特点 |
|---|---|---|
| AI2-THOR | AI2 | 专注室内场景 |
| Habitat | MetaAI | 室内场景,关注人机交互 |
| iGibson | 斯坦福大学 | 强调真实的物理交互 |
| SAPIEN | 加州大学圣迭戈分校 | 高精度物理操控 |
| Isaac Sim / Gym | NVIDIA | 照片级渲染与GPU加速 |
| RoboHive | 斯坦福大学 | 标准化多任务基准 |
), ArticleFig(id=1251856544946663722, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=EN, label=Tab.5, caption=
Embodied intelligence task datasets and benchmarks
, figureFileSmall=null, figureFileBig=null, tableContent=
| 仿真器 | 核心聚焦点 |
|---|---|
| OXE | 高保真模拟与跨平台部署能力,支持Sim2Real的模仿学习研究 |
| BridgeData V2 | 多样化机器人操作数据集 |
| ALFRED | 视觉-语言任务中多阶段目标分解与长指令理解执行 |
| BEHAVIOR-1K | 面向日常家庭活动的大规模任务库与常识泛化评估框架 |
| CALVIN | 多任务混合训练下的高鲁棒性连续操控与零样本泛化能力 |
| ManiSkill | 强物理属性建模支持下的操控策略物体类别泛化研究 |
| RLBench | 语言+视觉+动作模态融合的多任务模仿学习与低样本适应 |
| LIBERO | 多任务语言指令下的持续学习评估框架,关注抗遗忘与泛化能力 |
), ArticleFig(id=1251856545034744110, tenantId=1146029695717560320, journalId=1251234268282663017, articleId=1251856529385796289, language=CN, label=表5, caption=
具身智能任务数据集与基准
, figureFileSmall=null, figureFileBig=null, tableContent=
| 仿真器 | 核心聚焦点 |
|---|---|
| OXE | 高保真模拟与跨平台部署能力,支持Sim2Real的模仿学习研究 |
| BridgeData V2 | 多样化机器人操作数据集 |
| ALFRED | 视觉-语言任务中多阶段目标分解与长指令理解执行 |
| BEHAVIOR-1K | 面向日常家庭活动的大规模任务库与常识泛化评估框架 |
| CALVIN | 多任务混合训练下的高鲁棒性连续操控与零样本泛化能力 |
| ManiSkill | 强物理属性建模支持下的操控策略物体类别泛化研究 |
| RLBench | 语言+视觉+动作模态融合的多任务模仿学习与低样本适应 |
| LIBERO | 多任务语言指令下的持续学习评估框架,关注抗遗忘与泛化能力 |
)], attaches=null, journal=Journal(id=1251231494547484770, delFlag=0, nameCn=无线电工程, nameEn=Radio Engineering, nameHistory1=null, nameHistory2=null, issn=1003-3106, eissn=, cn=13-1097/TN, coden=null, periodic=月刊, language=CN, oaType=1, ccby=null, superviseOffice=null, ownerOffice=null, pubOffice=null, editorOffice=null, officeType=null, aims=null, clcCode=null, officeProv=null, officeCity=null, officeAddr=null, officeZip=null, officeEmail=, officePhone=, editDirector=null, officeDirector=null, officeDirectorPhone=null, officeStaffNum=null, officeEmpNum=null, coverPicUrl=g0Lcfmfo7oUYDLtQTsujLQ==, journalPrice=null, startedYear=null, abbrevIsoEn=Radio Engineering, journalRemark=null, publicationField=null, createdTime=1776246435060, updatedTime=1776252047215, createdBy=18614031015, updatedBy=13701087609, firstLetterCn=R, firstLetterEn=R, subjectCode=Engineering, subjectName=工程, subjectCodeEn=Engineering, subjectNameEn=null, picCn=g0Lcfmfo7oUYDLtQTsujLQ==, picEn=fe/NRE6hx/Z5ZHweFTw2gA==, jcr=null, cjcr=null, exts=[JournalExt(id=1251255033736676338, language=CN, name=无线电工程, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=, createdTime=1776252047237, updatedTime=1776252047237, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=https://wxdg.cbpt.cnki.net/index.aspx?t=1, submissionEditorUrl=https://wxdg.cbpt.cnki.net/index.aspx?t=3, submissionReviewUrl=https://wxdg.cbpt.cnki.net/index.aspx?t=2, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""}), JournalExt(id=1251255033787007987, language=EN, name=Radio Engineering, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=, 
officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=, createdTime=1776252047249, updatedTime=1776252047249, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=https://wxdg.cbpt.cnki.net/index.aspx?t=1, submissionEditorUrl=https://wxdg.cbpt.cnki.net/index.aspx?t=3, submissionReviewUrl=https://wxdg.cbpt.cnki.net/index.aspx?t=2, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""})], databaseList=null, tenantJournalId=1251234268282663017, websiteList=[Website(id=1251257283494232110, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1251234268282663017, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/wxdgc/CN, language=CN, createTime=1776252583625, createBy=18614031015, updateTime=1776253601640, updateBy=18614031015, name=无线电工程-中文, tplId=1146099689490845704, title=无线电工程, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1251261682236997912, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=articleTextType, value=kx, createTime=1776253632363, updateTime=1776253632363, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682199249173, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=banner, value=null, createTime=1776253632354, updateTime=1776253632354, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682262163739, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=grayFlag, value=0, createTime=1776253632369, updateTime=1776253632369, creator=18614031015, updator=18614031015), 
WebsiteProps(id=1251261682190860564, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=logo, value=https://castjournals.cast.org.cn/joweb/wxdgc/CN/file/pic?fileId=AnvFcVcgnutwkVR+TFSYqg==, createTime=1776253632352, updateTime=1776253632352, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682283135261, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=minRunFlag, value=0, createTime=1776253632374, updateTime=1776253632374, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682224414999, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/wxdgc/CN/file/pic, createTime=1776253632360, updateTime=1776253632360, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682274746652, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=silenceFlag, value=0, createTime=1776253632372, updateTime=1776253632372, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682207637782, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1776253632356, updateTime=1776253632356, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682245386521, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=themeColor, value=null, createTime=1776253632365, updateTime=1776253632365, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261682249580826, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283494232110, code=themeStyle, value=null, createTime=1776253632366, updateTime=1776253632366, creator=18614031015, updator=18614031015)]), 
Website(id=1251257283603284042, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1251234268282663017, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/wxdgc/EN, language=EN, createTime=1776252583647, createBy=18614031015, updateTime=1776253597767, updateBy=18614031015, name=无线电工程-英文, tplId=1146101810881728533, title=Radio Engineering, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1251261640977625682, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=articleTextType, value=kx, createTime=1776253622526, updateTime=1776253622526, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261640956654159, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=banner, value=null, createTime=1776253622521, updateTime=1776253622521, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261641002791509, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=grayFlag, value=0, createTime=1776253622532, updateTime=1776253622532, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261640948265550, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=logo, value=https://castjournals.cast.org.cn/joweb/wxdgc/EN/file/pic?fileId=AnvFcVcgnutwkVR+TFSYqg==, createTime=1776253622519, updateTime=1776253622519, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261641015374423, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=minRunFlag, value=0, createTime=1776253622535, updateTime=1776253622535, creator=18614031015, 
updator=18614031015), WebsiteProps(id=1251261640969237073, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/wxdgc/EN/file/pic, createTime=1776253622524, updateTime=1776253622524, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261641011180118, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=silenceFlag, value=0, createTime=1776253622534, updateTime=1776253622534, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261640965042768, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_en_623/, createTime=1776253622523, updateTime=1776253622523, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261640986014291, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=themeColor, value=null, createTime=1776253622528, updateTime=1776253622528, creator=18614031015, updator=18614031015), WebsiteProps(id=1251261640994402900, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283603284042, code=themeStyle, value=null, createTime=1776253622530, updateTime=1776253622530, creator=18614031015, updator=18614031015)])], journalTitle=无线电工程, weixinUrl=null, journalUrl=https://wxdg.cbpt.cnki.net/, iacademicId=null, status=1, seqNo=null, journalTitleEn=Radio Engineering, journalPhotoCn=g0Lcfmfo7oUYDLtQTsujLQ==, journalPhotoEn=fe/NRE6hx/Z5ZHweFTw2gA==, journalFirstLetter=R, journalRecommend=null, journalNew=null, journalCollection=null, jcrJf=null, cjcrJf=null, jcrJfStr=null, cjcrJfStr=null, submissionFirstDecision=null, sciSubjectClassification=null, casSubjectClassification=null, citeScore=null, totalCitationFrequency=null, icpCode=null, psCode=null, advertisingLicenseCode=null, 
copyrightInformation=null, country=null, option=, provinceCode=null, provinceName=null, collectFlag=false), detailUrlCn=https://castjournals.cast.org.cn/joweb/wxdgc/CN/10.3969/j.issn.1003-3106.2025.11.014, detailUrlEn=https://castjournals.cast.org.cn/joweb/wxdgc/EN/10.3969/j.issn.1003-3106.2025.11.014, pdfUrlCn=https://castjournals.cast.org.cn/joweb/wxdgc/CN/PDF/10.3969/j.issn.1003-3106.2025.11.014, pdfUrlEn=https://castjournals.cast.org.cn/joweb/wxdgc/EN/PDF/10.3969/j.issn.1003-3106.2025.11.014, aliStartDate=null, aliEndDate=null, collectionFlag=false, citedCount=null, citedUrl=null, reference=null)