Article(id=1251480537668465256, tenantId=1146029695717560320, journalId=1251234078029037663, issueId=1251480531381207309, articleNumber=null, orderNo=null, doi=10.11887/j.issn.1001-2486.25050035, pmid=null, cstr=null, oa=null, hot=null, price=null, onlineType=0, articleFormat=0, articleType=null, articleTypeStr=null, receivedDate=1748016000000, receivedDateStr=2025-05-24, revisedDate=null, revisedDateStr=null, acceptedDate=null, acceptedDateStr=null, onlineDate=1776305811563, onlineDateStr=2026-04-16, pubDate=1766851200000, pubDateStr=2025-12-28, doiRegisterDate=null, doiRegisterDateStr=null, onlineIssueDate=1776305811563, onlineIssueDateStr=2026-04-16, onlineJustAcceptDate=null, onlineJustAcceptDateStr=null, onlineFirstDate=null, onlineFirstDateStr=null, sourceXml=null, magXml=null, createTime=1776305811563, creator=13701087609, updateTime=1776305811563, updator=13701087609, issue=Issue{id=1251480531381207309, tenantId=1146029695717560320, journalId=1251234078029037663, year='2025', volume='47', issue='6', pageStart='1', pageEnd='306', issueExtLink='null', onlineDate='null', pubDate='null', beforeIssueId=null, nextIssueId=null, price=null, status=1, issueComplete=1, articleOrder=1, issueType=1, specialIssue=null, createTime=1776305810065, creator=13701087609, updateTime=1776305899308, updator=13701087609, preIssue=null, nextIssue=null, ext={EN=IssueExt(id=1251480905865446141, tenantId=1146029695717560320, journalId=1251234078029037663, issueId=1251480531381207309, language=EN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=), CN=IssueExt(id=1251480905865446142, tenantId=1146029695717560320, journalId=1251234078029037663, issueId=1251480531381207309, language=CN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=)}, issueFiles=null}, startPage=60, endPage=70, ext={EN=ArticleExt(id=1251480539379741348, articleId=1251480537668465256, tenantId=1146029695717560320, journalId=1251234078029037663, 
language=EN, title=Operator-aware tensor offloading approach for large language model inference in resource-constrained scenarios, columnId=1251480536670220899, journalTitle=Journal of National University of Defense Technology, columnName=Computer System and technology, runingTitle=null, highlight=null, articleAbstract=
Efficient inference deployment of large language models faces severe challenges in resource-constrained scenarios. Although current mainstream inference optimization techniques have improved model inference efficiency to some extent, they still suffer from issues like coarse-grained deployment and poor inference accuracy. Based on the discovery that different operators exhibit varying degrees of GPU affinity, an OATO (operator-aware tensor offloading) approach was proposed. OATO could extract operators' semantic knowledge and use it to design an intelligent scheduling algorithm, which further yielded a globally optimal model-deployment plan. Meanwhile, the OATO approach was integrated into the latest large model inference framework Llama.cpp to implement an operator-aware tensor offloading enhanced inference engine, referred to as OALlama.cpp. Experimental results show that compared with the state-of-the-art inference engines Llama.cpp and FlexGen, OALlama.cpp achieves the best inference performance on three large models. Notably, in the scenario where 75% of the LlaMA3-8B model weights are loaded on the GPU, the first-token generation speed of OALlama.cpp is nearly doubled compared with FlexGen and Llama.cpp.
, correspAuthors=Songlei JIAN, authorNote=null, correspAuthorsNote=null, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=null, magXml=null, pdfUrl=null, pdf=null, pdfFileSize=null, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=null, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=null, mapNumber=null, authorCompany=null, fund=null, authors=null, authorsList=Jianfeng ZHANG, Dong XIE, Songlei JIAN, Bao LI, Xiaochuan WANG, Yong GUO, Jie YU), CN=ArticleExt(id=1251480545302098840, articleId=1251480537668465256, tenantId=1146029695717560320, journalId=1251234078029037663, language=CN, title=资源受限场景下基于算子感知的大模型推理张量卸载方法, columnId=1251480538381496943, journalTitle=国防科技大学学报, columnName=计算机系统与技术, runingTitle=null, highlight=null, articleAbstract=
在一些资源受限场景下,大语言模型的高效推理部署面临严峻挑战。当前主流的模型推理优化技术,虽然在一定程度上提高了模型推理效率,但是仍然存在部署粒度较为粗糙、推理精度较差等问题。根据不同算子对GPU亲和度不同的发现,提出算子感知张量卸载(operator-aware tensor offloading,OATO)方法。OATO能够提取算子的语义知识,并基于此设计了智能算子调度算法,可以生成全局最优模型部署方案。同时,将OATO方法集成进最新的大模型推理框架Llama.cpp中,实现了算子感知的张量卸载增强推理引擎OALlama.cpp。实验结果表明,相比于业内最先进的推理引擎Llama.cpp和FlexGen,OALlama.cpp在3种大模型上均取得最好的推理性能,尤其是在LlaMA3-8B模型GPU加载75%权重的场景下,OALlama.cpp的首词生成速度相比FlexGen和Llama.cpp提升近1倍。
, correspAuthors=蹇松雷, authorNote=null, correspAuthorsNote=
, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=TYnlBYq2+/w/Kxyja845zQ==, magXml=AFTGqiYW/qLeeS7VTdPFcg==, pdfUrl=null, pdf=35KwPYpwGPELgorSW+pFsg==, pdfFileSize=5429351, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=mrok1ikjnN1rLjLrdUWi6Q==, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=wwLKOuoCBnXOrUq/7UE0jQ==, mapNumber=null, authorCompany=null, fund=null, authors=
, authorsList=张建锋, 谢栋, 蹇松雷, 李宝, 王晓川, 郭勇, 余杰)}, authors=[Author(id=1251480547185341367, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=0, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=jfzhang@nudt.edu.cn, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251480547290198971, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480547185341367, language=EN, stringName=Jianfeng ZHANG, firstName=Jianfeng, middleName=null, lastName=ZHANG, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480547395056580, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480547185341367, language=CN, stringName=张建锋, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=国防科技大学 计算机学院,湖南 长沙 410073, bio={"content":"
张建锋(1984—),男,陕西宝鸡人,副研究员,博士,E-mail:jfzhang@nudt.edu.cn
"}, bioImg=null, bioContent=
张建锋(1984—),男,陕西宝鸡人,副研究员,博士,E-mail:jfzhang@nudt.edu.cn
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])]), Author(id=1251480547537662919, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=1, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251480547629937612, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480547537662919, language=EN, stringName=Dong XIE, firstName=Dong, middleName=null, lastName=XIE, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480547747378132, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480547537662919, language=CN, stringName=谢栋, firstName=null, middleName=null, 
lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=国防科技大学 计算机学院,湖南 长沙 410073, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])]), Author(id=1251480547848041434, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=2, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=jiansonglei@nudt.edu.cn, emailSecond=null, emailThird=null, correspondingAuthor=1, authorType=1, ext={EN=AuthorExt(id=1251480547973870565, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480547848041434, language=EN, stringName=Songlei JIAN, firstName=Songlei, middleName=null, lastName=JIAN, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
*, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480548066145259, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480547848041434, language=CN, stringName=蹇松雷, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
*, address=国防科技大学 计算机学院,湖南 长沙 410073, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])]), Author(id=1251480548145837041, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=3, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251480548238111736, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480548145837041, language=EN, stringName=Bao LI, firstName=Bao, middleName=null, lastName=LI, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480548313609214, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480548145837041, 
language=CN, stringName=李宝, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=国防科技大学 计算机学院,湖南 长沙 410073, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])]), Author(id=1251480548405882884, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=4, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251480548498157578, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480548405882884, language=EN, stringName=Xiaochuan WANG, firstName=Xiaochuan, middleName=null, lastName=WANG, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, 
bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480548603015184, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480548405882884, language=CN, stringName=王晓川, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=国防科技大学 计算机学院,湖南 长沙 410073, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])]), Author(id=1251480548703678488, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=5, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251480548804341791, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480548703678488, language=EN, stringName=Yong GUO, firstName=Yong, middleName=null, lastName=GUO, prefix=null, suffix=null, authorComment=null, 
nameInitials=null, affiliation=null, department=null, xref=null, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480548913393703, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480548703678488, language=CN, stringName=郭勇, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=国防科技大学 计算机学院,湖南 长沙 410073, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])]), Author(id=1251480549035028528, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, orderNo=6, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1251480549156663352, tenantId=1146029695717560320, journalId=1251234078029037663, 
articleId=1251480537668465256, authorId=1251480549035028528, language=EN, stringName=Jie YU, firstName=Jie, middleName=null, lastName=YU, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1251480549248938049, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, authorId=1251480549035028528, language=CN, stringName=余杰, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=国防科技大学 计算机学院,湖南 长沙 410073, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])])], keywords=[Keyword(id=1251480549395738695, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, orderNo=1, keyword=large language models), Keyword(id=1251480549471236172, tenantId=1146029695717560320, 
journalId=1251234078029037663, articleId=1251480537668465256, language=EN, orderNo=2, keyword=resource constraints), Keyword(id=1251480549542539345, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, orderNo=3, keyword=model inference), Keyword(id=1251480549626425432, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, orderNo=4, keyword=GPU affinities of operators), Keyword(id=1251480549739671644, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, orderNo=5, keyword=operator-aware tensor offloading approach), Keyword(id=1251480549848723554, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, orderNo=1, keyword=大语言模型), Keyword(id=1251480549932609639, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, orderNo=2, keyword=资源受限), Keyword(id=1251480550024884332, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, orderNo=3, keyword=模型推理), Keyword(id=1251480551585165427, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, orderNo=4, keyword=算子GPU亲和度), Keyword(id=1251480551677440121, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, orderNo=5, keyword=算子感知张量卸载方法)], refs=[Reference(id=1251480554466652478, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[1], rfOrder=0, authorNames=ZHAO W X, ZHOU K, LI J Y, journalName=null, refType=null, unstructuredReference=
ZHAO W X,
ZHOU K,
LI J Y,
et al.A survey of large language models[EB/OL].(2025-03-11)[
2025-05-01].
https://arxiv.org/abs/2303.18223., articleTitle=A survey of large language models, refAbstract=null), Reference(id=1251480556064682311, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[2], rfOrder=1, authorNames=YUAN Z H, SHANG Y Z, ZHOU Y, journalName=null, refType=null, unstructuredReference=
YUAN Z H,
SHANG Y Z,
ZHOU Y,
et al.LLM inference unveiled:survey and roofline model insights[EB/OL].(2024-05-01)[
2025-05-01].
https://arxiv.org/abs/2402.16363., articleTitle=LLM inference unveiled:survey and roofline model insights, refAbstract=null), Reference(id=1251480556169539917, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[3], rfOrder=2, authorNames=MIAO X P, OLIARO G, ZHANG Z H, journalName=null, refType=null, unstructuredReference=
MIAO X P,
OLIARO G,
ZHANG Z H,
et al.Towards efficient generative large language model serving:a survey from algorithms to systems[EB/OL].(2023-12-23)[
2025-05-01].
https://arxiv.org/abs/2312.15234., articleTitle=Towards efficient generative large language model serving:a survey from algorithms to systems, refAbstract=null), Reference(id=1251480556270203223, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[4], rfOrder=3, authorNames=VASWANI A, SHAZEER N M, PARMAR N, journalName=null, refType=null, unstructuredReference=
VASWANI A,
SHAZEER N M,
PARMAR N,
et al. Attention is all you need[C]//Proceedings of 31st Conference on Neural Information Processing Systems,
2017., articleTitle=Attention is all you need, refAbstract=null), Reference(id=1251480556349895014, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[5], rfOrder=4, authorNames=BROWN T B, MANN B, RYDER N, journalName=null, refType=null, unstructuredReference=
BROWN T B,
MANN B,
RYDER N,
et al.Language models are few-shot learners[C]//Proceedings of 34th Conference on Neural Information Processing Systems,
2020., articleTitle=Language models are few-shot learners, refAbstract=null), Reference(id=1251480556458946925, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[6], rfOrder=5, authorNames=OUYANG L, WU J, JIANG X, journalName=null, refType=null, unstructuredReference=
OUYANG L,
WU J,
JIANG X,
et al.Training language models to follow instructions with human feedback[C]//Proceedings of 36th Conference on Neural Information Processing Systems,
2022., articleTitle=Training language models to follow instructions with human feedback, refAbstract=null), Reference(id=1251480556588970361, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[7], rfOrder=6, authorNames=OpenAI, journalName=null, refType=null, unstructuredReference=OpenAI.GPT-4 technical report[EB/OL].(2024-03-04)[
2025-05-01].
https://arxiv.org/abs/2303.08774., articleTitle=GPT-4 technical report, refAbstract=null), Reference(id=1251480556693827972, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[8], rfOrder=7, authorNames=TOUVRON H, LAVRIL T, IZACARD G, journalName=null, refType=null, unstructuredReference=
TOUVRON H,
LAVRIL T,
IZACARD G,
et al.LLAMA:open and efficient foundation language models[EB/OL]. (2023-02-27)[
2025-05-01].
https://arxiv.org/abs/2302.13971., articleTitle=LLAMA:open and efficient foundation language models, refAbstract=null), Reference(id=1251480556794491280, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[9], rfOrder=8, authorNames=TOUVRON H, MARTIN L, STONE K, journalName=null, refType=null, unstructuredReference=
TOUVRON H,
MARTIN L,
STONE K,
et al.LLAMA 2:open foundation and fine-tuned chat models[EB/OL].(2023-07-19)[
2025-05-01].
https://arxiv.org/abs/2307.09288., articleTitle=LLAMA 2:open foundation and fine-tuned chat models, refAbstract=null), Reference(id=1251480556899348890, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[10], rfOrder=9, authorNames=HU S D, TU Y G, HAN X, journalName=null, refType=null, unstructuredReference=
HU S D,
TU Y G,
HAN X,
et al.MiniCPM:unveiling the potential of small language models with scalable training strategies[EB/OL].(2024-06-03)[
2025-05-01].
https://arxiv.org/abs/2404.06395., articleTitle=MiniCPM:unveiling the potential of small language models with scalable training strategies, refAbstract=null), Reference(id=1251480557000012192, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[11], rfOrder=10, authorNames=YAO Y, YU T Y, ZHANG A, journalName=null, refType=null, unstructuredReference=
YAO Y,
YU T Y,
ZHANG A,
et al.MiniCPM-V:a GPT-4V level mllm on your phone[EB/OL].(2024-08-03)[
2025-05-01].
https://arxiv.org/abs/2408.01800., articleTitle=MiniCPM-V:a GPT-4V level mllm on your phone, refAbstract=null), Reference(id=1251480557100675493, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[12], rfOrder=11, authorNames=GUO D Y, ZHU Q H, YANG D J, journalName=null, refType=null, unstructuredReference=
GUO D Y,
ZHU Q H,
YANG D J,
et al.DeepSeek-Coder:when the large language model meets programming-the rise of code intelligence[EB/OL].(2024-01-26)[
2025-05-01].
https://arxiv.org/abs/2401.14196., articleTitle=DeepSeek-Coder:when the large language model meets programming-the rise of code intelligence, refAbstract=null), Reference(id=1251480557184561581, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[13], rfOrder=12, authorNames=LU H Y, LIU W, ZHANG B, journalName=null, refType=null, unstructuredReference=
LU H Y,
LIU W,
ZHANG B,
et al.DeepSeek-VL:towards real-world vision-language understanding[EB/OL].(2024-03-11)[
2025-05-01].
https://arxiv.org/abs/2403.05525., articleTitle=DeepSeek-VL:towards real-world vision-language understanding, refAbstract=null), Reference(id=1251480557293613494, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[14], rfOrder=13, authorNames=DeepSeek-AI, journalName=null, refType=null, unstructuredReference=DeepSeek-AI. DeepSeek-V2: a strong, economical, and efficient mixture-of-experts language model[EB/OL].(2024-06-19)[
2025-05-01].
https://arxiv.org/abs/2405.04434., articleTitle=DeepSeek-V2: a strong, economical, and efficient mixture-of-experts language model, refAbstract=null), Reference(id=1251480557394276800, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[15], rfOrder=14, authorNames=CHEN M, TWOREK J, JUN H, journalName=null, refType=null, unstructuredReference=
CHEN M,
TWOREK J,
JUN H,
et al.Evaluating large language models trained on code[EB/OL].(2021-07-14)[
2025-05-01].
https://arxiv.org/abs/2107.03374., articleTitle=Evaluating large language models trained on code, refAbstract=null), Reference(id=1251480557528494538, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[16], rfOrder=15, authorNames=LIAN L, LI B Y, YALA A, journalName=null, refType=null, unstructuredReference=
LIAN L,
LI B Y,
YALA A,
et al.LLM-grounded diffusion:enhancing prompt understanding of text-to-image diffusion models with large language models[EB/OL].(2024-03-04)[
2025-05-01].
https://arxiv.org/abs/2305.13655., articleTitle=LLM-grounded diffusion:enhancing prompt understanding of text-to-image diffusion models with large language models, refAbstract=null), Reference(id=1251480557608186323, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=5792, pageEnd=5793, url=null, language=null, rfNumber=[17], rfOrder=16, authorNames=DONG X L, MOON S, XU Y E, journalName=null, refType=null, unstructuredReference=
DONG X L,
MOON S,
XU Y E,
et al.Towards next generation intelligent assistants leveraging LLM techniques[C]//Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining,
2023:5792-5793., articleTitle=Towards next generation intelligent assistants leveraging LLM techniques, refAbstract=null), Reference(id=1251480557717238233, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[18], rfOrder=17, authorNames=XU D L, ZHANG H, YANG L M, journalName=null, refType=null, unstructuredReference=
XU D L,
ZHANG H,
YANG L M,
et al.Empowering 1000 tokens/second on-device LLM prefilling with mllm-NPU[EB/OL].(2024-07-08)[
2025-05-01].
https://arxiv.org/html/2407.05858v1., articleTitle=Empowering 1000 tokens/second on-device LLM prefilling with mllm-NPU, refAbstract=null), Reference(id=1251480557805318624, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=590, pageEnd=606, url=null, language=null, rfNumber=[19], rfOrder=18, authorNames=SONG Y X, MI Z Y, XIE H T, journalName=null, refType=null, unstructuredReference=
SONG Y X,
MI Z Y,
XIE H T,
et al.PowerInfer:fast large language model serving with a consumer-grade GPU[C]//Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles,
2024:590-606., articleTitle=PowerInfer:fast large language model serving with a consumer-grade GPU, refAbstract=null), Reference(id=1251480558019228138, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=12562, pageEnd=12584, url=null, language=null, rfNumber=[20], rfOrder=19, authorNames=ALIZADEH K, MIRZADEH S I, BELENKO D, journalName=null, refType=null, unstructuredReference=
ALIZADEH K,
MIRZADEH S I,
BELENKO D,
et al.LLM in a flash:efficient large language model inference with limited memory[C]//Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1:Long Papers),
2024:12562-12584., articleTitle=LLM in a flash:efficient large language model inference with limited memory, refAbstract=null), Reference(id=1251480558170223093, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=2704, pageEnd=2713, url=null, language=null, rfNumber=[21], rfOrder=20, authorNames=JACOB B, KLIGYS S, CHEN B, journalName=null, refType=null, unstructuredReference=
JACOB B,
KLIGYS S,
CHEN B,
et al.Quantization and training of neural networks for efficient integer-arithmetic-only inference[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition,
2018:2704-2713., articleTitle=Quantization and training of neural networks for efficient integer-arithmetic-only inference, refAbstract=null), Reference(id=1251480558321218046, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=1325, pageEnd=1334, url=null, language=null, rfNumber=[22], rfOrder=21, authorNames=NAGEL M, BAALEN MV, BLANKEVOORT T, journalName=null, refType=null, unstructuredReference=
NAGEL M,
BAALEN MV,
BLANKEVOORT T,
et al.Data-free quantization through weight equalization and bias correction[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV),
2019:1325-1334., articleTitle=Data-free quantization through weight equalization and bias correction, refAbstract=null), Reference(id=1251480558438658567, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[23], rfOrder=22, authorNames=ZHAO R, HU Y W, DOTZEL J, journalName=null, refType=null, unstructuredReference=
ZHAO R,
HU Y W,
DOTZEL J,
et al.Improving neural network quantization without retraining using outlier channel splitting[C]//Proceedings of the 36th International Conference on Machine Learning,
2019., articleTitle=Improving neural network quantization without retraining using outlier channel splitting, refAbstract=null), Reference(id=1251480558660956693, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[24], rfOrder=23, authorNames=NVIDIA, journalName=null, refType=null, unstructuredReference=NVIDIA.NVIDIA TensorRT-LLM:a TensorRT toolbox for optimized large language model inference[EB/OL].[
2025-05-01].
https://github.com/NVIDIA/TensorRT-LLM., articleTitle=NVIDIA TensorRT-LLM:a TensorRT toolbox for optimized large language model inference, refAbstract=null), Reference(id=1251480558782591517, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[25], rfOrder=24, authorNames=LIU Z C, WANG J, DAO T, journalName=null, refType=null, unstructuredReference=
LIU Z C,
WANG J,
DAO T,
et al.Deja Vu:contextual sparsity for efficient LLMs at inference time[C]//Proceedings of the 40th International Conference on Machine Learning,
2023., articleTitle=Deja Vu:contextual sparsity for efficient LLMs at inference time, refAbstract=null), Reference(id=1251480558845506085, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2024, volume=582, issue=null, pageStart=127468, pageEnd=null, url=null, language=null, rfNumber=[26], rfOrder=25, authorNames=FARINA M, AHMAD U, TAHA A, journalName=Neurocomputing, refType=null, unstructuredReference=
FARINA M,
AHMAD U,
TAHA A,
et al. Sparsity in transformers:a systematic literature review[J].
Neurocomputing,
2024,
582:127468., articleTitle=Sparsity in transformers:a systematic literature review, refAbstract=null), Reference(id=1251480558946169387, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[27], rfOrder=26, authorNames=LI L J, DONG P J, TANG Z H, journalName=null, refType=null, unstructuredReference=
LI L J,
DONG P J,
TANG Z H,
et al.Discovering sparsity allocation for layer-wise pruning of large language models[C]//Proceedings of 38th Conference on Neural Information Processing Systems,
2024., articleTitle=Discovering sparsity allocation for layer-wise pruning of large language models, refAbstract=null), Reference(id=1251480560460313135, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=2813, pageEnd=2819, url=null, language=null, rfNumber=[28], rfOrder=27, authorNames=YI C X, JIAN S L, TAN Y S, journalName=null, refType=null, unstructuredReference=
YI C X,
JIAN S L,
TAN Y S,
et al.HMO:host memory optimization for model inference acceleration on edge devices[C]//Proceedings of the IEEE International Conference on Systems, Man, and Cybernetics (SMC),
2024:2813-2819., articleTitle=HMO:host memory optimization for model inference acceleration on edge devices, refAbstract=null), Reference(id=1251480560548393529, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[29], rfOrder=28, authorNames=GERGANOV G, journalName=null, refType=null, unstructuredReference=
GERGANOV G. Ggml-org/llama.cpp: LLM inference in C/C++[EB/OL].[
2025-05-01].
https://github.com/ggerganov/Llama.cpp., articleTitle=Ggml-org/llama.cpp: LLM inference in C/C++, refAbstract=null), Reference(id=1251480560653251133, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025-05-01, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[30], rfOrder=29, authorNames=LEE W, LEE J, SEO J, journalName=null, refType=null, unstructuredReference=
LEE W,
LEE J,
SEO J,
et al.InfiniGen:efficient generative inference of large language models with dynamic KV cache management[EB/OL].(2024-06-28)[
2025-05-01].
https://arxiv.org/abs/2406.19707., articleTitle=InfiniGen:efficient generative inference of large language models with dynamic KV cache management, refAbstract=null), Reference(id=1251480560762303043, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[31], rfOrder=30, authorNames=SHENG Y, ZHENG L M, YUAN B H, journalName=null, refType=null, unstructuredReference=
SHENG Y,
ZHENG L M,
YUAN B H,
et al.FlexGen:high-throughput generative inference of large language models with a single GPU[C]//Proceedings of the 40th International Conference on Machine Learning,
2023., articleTitle=FlexGen:high-throughput generative inference of large language models with a single GPU, refAbstract=null), Reference(id=1251480560883937869, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[32], rfOrder=31, authorNames=AMINABADI R Y, RAJBHANDARI S, AWAN A A, journalName=null, refType=null, unstructuredReference=
AMINABADI R Y,
RAJBHANDARI S,
AWAN A A,
et al. DeepSpeed-inference: enabling efficient inference of transformer models at unprecedented scale[C]//Proceedings of the SC22:International Conference for High Performance Computing,Networking,Storage and Analysis,
2022., articleTitle=DeepSpeed-inference: enabling efficient inference of transformer models at unprecedented scale, refAbstract=null), Reference(id=1251480560988795475, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2023, volume=null, issue=null, pageStart=611, pageEnd=626, url=null, language=null, rfNumber=[33], rfOrder=32, authorNames=KWON W, LI Z H, ZHUANG S Y, journalName=null, refType=null, unstructuredReference=
KWON W,
LI Z H,
ZHUANG S Y,
et al.Efficient memory management for large language model serving with PagedAttention[C]//Proceedings of the 29th Symposium on Operating Systems Principles,
2023:611-626., articleTitle=Efficient memory management for large language model serving with PagedAttention, refAbstract=null), Reference(id=1251480561110430296, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=null, pageStart=1250, pageEnd=1255, url=null, language=null, rfNumber=[34], rfOrder=33, authorNames=YI C X, JIAN S L, TAN Y S, journalName=null, refType=null, unstructuredReference=
YI C X,
JIAN S L,
TAN Y S,
et al.MACA:memory-aware convolution accelerating for CNN inference on edge devices[C]//Proceedings of the 27th International Conference on Computer Supported Cooperative Work in Design (CSCWD),
2024:1250-1255., articleTitle=MACA:memory-aware convolution accelerating for CNN inference on edge devices, refAbstract=null), Reference(id=1251480561232065121, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025, volume=62, issue=3, pageStart=545, pageEnd=562, url=null, language=null, rfNumber=[35], rfOrder=34, authorNames=葛旭冉, 欧洋, 王博, journalName=计算机研究与发展, refType=null, unstructuredReference=葛旭冉, 欧洋, 王博,
等.大语言模型推理中的存储优化技术综述[J].
计算机研究与发展,
2025,
62(3):545-562., articleTitle=大语言模型推理中的存储优化技术综述, refAbstract=null), Reference(id=1251480561324339816, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025, volume=62, issue=3, pageStart=545, pageEnd=562, url=null, language=null, rfNumber=[35], rfOrder=35, authorNames=GE X R, OU Y, WANG B, journalName=Journal of Computer Research and Development, refType=null, unstructuredReference=
GE X R,
OU Y,
WANG B,
et al.Survey of storage optimization techniques in large language model inference[J].
Journal of Computer Research and Development,
2025,
62(3):545-562.(in Chinese), articleTitle=Survey of storage optimization techniques in large language model inference, refAbstract=null), Reference(id=1251480561462751855, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[36], rfOrder=36, authorNames=梁绪宁, 王思琪, 杨海龙, journalName=计算机工程, refType=null, unstructuredReference=梁绪宁, 王思琪, 杨海龙,
等.基于自适应张量交换和重算的大模型推理优化[J/OL].
计算机工程,
2025.(2025-04-14)[2025-05-01].
https://link.cnki.net/doi/10.19678/j.issn.1000-3428.0070644., articleTitle=基于自适应张量交换和重算的大模型推理优化, refAbstract=null), Reference(id=1251480561580192371, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, doi=null, pmid=null, pmcid=null, year=2025, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[36], rfOrder=37, authorNames=LIANG X N, WANG S Q, YANG H L, journalName=Computer Engineering, refType=null, unstructuredReference=
LIANG X N,
WANG S Q,
YANG H L,
et al.Adaptive tensor swapping and re-computation for efficient large language model inference[J/OL].
Computer Engineering,
2025.(2025-04-14)[2025-05-01].
https://link.cnki.net/doi/10.19678/j.issn.1000-3428.0070644.(in Chinese), articleTitle=Adaptive tensor swapping and re-computation for efficient large language model inference, refAbstract=null)], funds=null, companyList=[AuthorCompany(id=1251480545616671657, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, xref=null, ext=[AuthorCompanyExt(id=1251480545625060266, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Computer Science and Technology, National University of Defense Technology, Changsha 410073, China), AuthorCompanyExt(id=1251480545633448875, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, companyId=1251480545616671657, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=国防科技大学 计算机学院,湖南 长沙 410073)])], figs=[ArticleFig(id=1251480551820046463, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.1, caption=
Model memory usage, figureFileSmall=5VTf3kfg1pmufPGH7QpfbQ==, figureFileBig=mrok1ikjnN1rLjLrdUWi6Q==, tableContent=null), ArticleFig(id=1251480551937486982, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图1, caption=
模型内存占用情况, figureFileSmall=5VTf3kfg1pmufPGH7QpfbQ==, figureFileBig=mrok1ikjnN1rLjLrdUWi6Q==, tableContent=null), ArticleFig(id=1251480552163979412, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.2, caption=
Latency of varying mul-mat operations, figureFileSmall=n9VgU88rpxSb/IxIe7aXEA==, figureFileBig=kMsAOUHJkQkrZqGrucIbQA==, tableContent=null), ArticleFig(id=1251480552252059804, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图2, caption=
不同矩阵乘法算子操作的延迟, figureFileSmall=n9VgU88rpxSb/IxIe7aXEA==, figureFileBig=kMsAOUHJkQkrZqGrucIbQA==, tableContent=null), ArticleFig(id=1251480552348528804, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.3, caption=
GPU affinities of operators, figureFileSmall=ouCbvSt18xZPHArVQhGs9A==, figureFileBig=D8Q4jMu9efb4CkbusYfrjQ==, tableContent=null), ArticleFig(id=1251480552461775019, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图3, caption=
算子的GPU亲和度, figureFileSmall=ouCbvSt18xZPHArVQhGs9A==, figureFileBig=D8Q4jMu9efb4CkbusYfrjQ==, tableContent=null), ArticleFig(id=1251480552545661107, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.4, caption=
System overview of OALlama.cpp, figureFileSmall=Zb2IisUQWZTXj9Ufp6c0pA==, figureFileBig=uQVfgkdFJf6JwMUKtD1qyw==, tableContent=null), ArticleFig(id=1251480552642130105, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图4, caption=
OALlama.cpp系统概览, figureFileSmall=Zb2IisUQWZTXj9Ufp6c0pA==, figureFileBig=uQVfgkdFJf6JwMUKtD1qyw==, tableContent=null), ArticleFig(id=1251480552763764927, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.5, caption=
An example of operator flow, figureFileSmall=wNwlGTKcoYRVE0p2WZ270Q==, figureFileBig=zDnPq9133g+uS4HRTiqWUA==, tableContent=null), ArticleFig(id=1251480552872816837, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图5, caption=
算子流示例, figureFileSmall=wNwlGTKcoYRVE0p2WZ270Q==, figureFileBig=zDnPq9133g+uS4HRTiqWUA==, tableContent=null), ArticleFig(id=1251480552960897228, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Alg.1, caption=
Smart operator scheduling algorithm, figureFileSmall=wPJHrH8q1NEnLGLpCn8Ltg==, figureFileBig=BOonUsrDGb/59LoYS1kFbQ==, tableContent=null), ArticleFig(id=1251480553061560530, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=算法1, caption=
智能算子调度算法, figureFileSmall=wPJHrH8q1NEnLGLpCn8Ltg==, figureFileBig=BOonUsrDGb/59LoYS1kFbQ==, tableContent=null), ArticleFig(id=1251480553170612442, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.6, caption=
Mapping between target operator and backend device, figureFileSmall=Ev7WaLTKAYSgMWW2FEWIdQ==, figureFileBig=Wy8m5hwByWuUffQaHs2uZA==, tableContent=null), ArticleFig(id=1251480553296441571, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图6, caption=
目标算子-后端设备映射关系, figureFileSmall=Ev7WaLTKAYSgMWW2FEWIdQ==, figureFileBig=Wy8m5hwByWuUffQaHs2uZA==, tableContent=null), ArticleFig(id=1251480553409687788, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.7, caption=
End-to-end inference performance of varying models, figureFileSmall=WO6aRsDYkEH+XWGeqaqz9A==, figureFileBig=hNkuNBoCl5nIsbxVEWr9ew==, tableContent=null), ArticleFig(id=1251480553510351091, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图7, caption=
不同模型端到端推理性能, figureFileSmall=WO6aRsDYkEH+XWGeqaqz9A==, figureFileBig=hNkuNBoCl5nIsbxVEWr9ew==, tableContent=null), ArticleFig(id=1251480553619403006, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.8, caption=
Inference performance of prefill stage with varying prompt length, figureFileSmall=tP+yEzl1JTcSMgMUcQhs6Q==, figureFileBig=ArBzqK/xBU4OTH9Pclh84w==, tableContent=null), ArticleFig(id=1251480553703289095, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图8, caption=
不同提示长度下预填充阶段的推理性能, figureFileSmall=tP+yEzl1JTcSMgMUcQhs6Q==, figureFileBig=ArBzqK/xBU4OTH9Pclh84w==, tableContent=null), ArticleFig(id=1251480553803952399, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.9, caption=
Inference performance of decode stage with varying output length, figureFileSmall=wWX/bEA13JWVXyrhtWwnOQ==, figureFileBig=4rumhHcSu8WMl9DOZ6IdyA==, tableContent=null), ArticleFig(id=1251480553917198615, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图9, caption=
不同输出长度下解码阶段的推理性能, figureFileSmall=wWX/bEA13JWVXyrhtWwnOQ==, figureFileBig=4rumhHcSu8WMl9DOZ6IdyA==, tableContent=null), ArticleFig(id=1251480554030444832, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Fig.10, caption=
Inference performance of LlaMA3-8B under varying resource constraints scenarios, figureFileSmall=DYz13z3uhSEzo5+EIC7Vtg==, figureFileBig=AAhSy3fSpKi3bUT5gL3gWA==, tableContent=null), ArticleFig(id=1251480554135302436, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=图10, caption=
不同资源受限场景中LlaMA3-8B的推理性能, figureFileSmall=DYz13z3uhSEzo5+EIC7Vtg==, figureFileBig=AAhSy3fSpKi3bUT5gL3gWA==, tableContent=null), ArticleFig(id=1251480554256937263, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=EN, label=Tab.1, caption=
Operator latency of different models in inference scenarios
, figureFileSmall=null, figureFileBig=null, tableContent=
| Operator | MiniCPM3-4B (G) | MiniCPM3-4B (C) | LlaMA3-8B (G) | LlaMA3-8B (C) | QWQ-32B (G) | QWQ-32B (C) |
|---|---|---|---|---|---|---|
| mul-mat | 11 | 13902 | 8 | 55922 | 13 | 110469 |
| add | 4 | 246 | 3 | 1020 | 5 | 964 |
| Mul | 4 | 354 | 3 | 517 | 5 | 817 |
| Norm | 4 | 289 | 3 | 459 | 5 | 557 |
| Softmax | 4 | 273 | 3 | 274 | 6 | 322 |
), ArticleFig(id=1251480554345017651, tenantId=1146029695717560320, journalId=1251234078029037663, articleId=1251480537668465256, language=CN, label=表1, caption=
不同模型推理场景下算子延迟
, figureFileSmall=null, figureFileBig=null, tableContent=
| 算子 | MiniCPM3-4B (G) | MiniCPM3-4B (C) | LlaMA3-8B (G) | LlaMA3-8B (C) | QWQ-32B (G) | QWQ-32B (C) |
|---|---|---|---|---|---|---|
| mul-mat | 11 | 13902 | 8 | 55922 | 13 | 110469 |
| add | 4 | 246 | 3 | 1020 | 5 | 964 |
| Mul | 4 | 354 | 3 | 517 | 5 | 817 |
| Norm | 4 | 289 | 3 | 459 | 5 | 557 |
| Softmax | 4 | 273 | 3 | 274 | 6 | 322 |
)], attaches=null, journal=Journal(id=1251231494090305632, delFlag=0, nameCn=国防科技大学学报, nameEn=Journal of National Niversity of Defense Technology, nameHistory1=null, nameHistory2=null, issn=1001-2486, eissn=, cn=43-1067/T, coden=null, periodic=双月刊, language=CN, oaType=1, ccby=null, superviseOffice=null, ownerOffice=null, pubOffice=null, editorOffice=null, officeType=null, aims=null, clcCode=null, officeProv=null, officeCity=null, officeAddr=null, officeZip=null, officeEmail=, officePhone=, editDirector=null, officeDirector=null, officeDirectorPhone=null, officeStaffNum=null, officeEmpNum=null, coverPicUrl=h+HgOUssQ5XqPoD980XNIA==, journalPrice=null, startedYear=null, abbrevIsoEn=Journal of National Niversity of Defense Technology, journalRemark=null, publicationField=null, createdTime=1776246434950, updatedTime=1776251967711, createdBy=18614031015, updatedBy=13701087609, firstLetterCn=J, firstLetterEn=J, subjectCode=Engineering, subjectName=工程, subjectCodeEn=Engineering, subjectNameEn=null, picCn=h+HgOUssQ5XqPoD980XNIA==, picEn=hJx8onaXftcX9VtGkHdjDA==, jcr=null, cjcr=null, exts=[JournalExt(id=1251254700306285546, language=CN, name=国防科技大学学报, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=, createdTime=1776251967741, updatedTime=1776251967741, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=http://journal.nudt.edu.cn/gfkjdxxb/author/login, submissionEditorUrl=http://journal.nudt.edu.cn/gfkjdxxb/editor/login, submissionReviewUrl=http://journal.nudt.edu.cn/gfkjdxxb/reviewer/login, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""}), JournalExt(id=1251254700356617195, language=EN, name=Journal of National Niversity of Defense Technology, nameHistory1=null, nameHistory2=null, 
managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=, createdTime=1776251967753, updatedTime=1776251967753, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=http://journal.nudt.edu.cn/gfkjdxxb/author/login, submissionEditorUrl=http://journal.nudt.edu.cn/gfkjdxxb/editor/login, submissionReviewUrl=http://journal.nudt.edu.cn/gfkjdxxb/reviewer/login, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""})], databaseList=null, tenantJournalId=1251234078029037663, websiteList=[Website(id=1251257283485843500, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1251234078029037663, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/gfkjdxxb/CN, language=CN, createTime=1776252583619, createBy=18614031015, updateTime=1776253414371, updateBy=18614031015, name=国防科技大学学报-中文, tplId=1146099689490845704, title=国防科技大学学报, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1251260875290653228, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=articleTextType, value=kx, createTime=1776253439972, updateTime=1776253439972, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875273876009, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=banner, value=null, createTime=1776253439968, updateTime=1776253439968, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875311624751, tenantId=1146029695717560320, journalId=null, journalGroupId=null, 
siteId=1251257283485843500, code=grayFlag, value=0, createTime=1776253439977, updateTime=1776253439977, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875261293096, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=logo, value=https://castjournals.cast.org.cn/joweb/gfkjdxxb/CN/file/pic?fileId=WpHzMFTSHy8AuOKzUbYrdw==, createTime=1776253439965, updateTime=1776253439965, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875382927921, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=minRunFlag, value=0, createTime=1776253439994, updateTime=1776253439994, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875286458923, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/gfkjdxxb/CN/file/pic, createTime=1776253439971, updateTime=1776253439971, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875320013360, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=silenceFlag, value=0, createTime=1776253439979, updateTime=1776253439979, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875278070314, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1776253439969, updateTime=1776253439969, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875299041837, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283485843500, code=themeColor, value=null, createTime=1776253439974, updateTime=1776253439974, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260875303236142, tenantId=1146029695717560320, journalId=null, journalGroupId=null, 
siteId=1251257283485843500, code=themeStyle, value=null, createTime=1776253439975, updateTime=1776253439975, creator=18614031015, updator=18614031015)]), Website(id=1251257283599089718, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1251234078029037663, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/gfkjdxxb/EN, language=EN, createTime=1776252583646, createBy=18614031015, updateTime=1776253409915, updateBy=18614031015, name=国防科技大学学报-英文, tplId=1146101810881728533, title=Journal of National Niversity of Defense Technology, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1251260846312210678, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=articleTextType, value=kx, createTime=1776253433063, updateTime=1776253433063, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846232518899, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=banner, value=null, createTime=1776253433044, updateTime=1776253433044, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846396096761, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=grayFlag, value=0, createTime=1776253433083, updateTime=1776253433083, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846219935986, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=logo, value=https://castjournals.cast.org.cn/joweb/gfkjdxxb/EN/file/pic?fileId=WpHzMFTSHy8AuOKzUbYrdw==, createTime=1776253433041, updateTime=1776253433041, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846442234107, 
tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=minRunFlag, value=0, createTime=1776253433094, updateTime=1776253433094, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846282850549, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/gfkjdxxb/EN/file/pic, createTime=1776253433056, updateTime=1776253433056, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846417068282, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=silenceFlag, value=0, createTime=1776253433088, updateTime=1776253433088, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846257684724, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_en_623/, createTime=1776253433050, updateTime=1776253433050, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846337376503, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=themeColor, value=null, createTime=1776253433070, updateTime=1776253433070, creator=18614031015, updator=18614031015), WebsiteProps(id=1251260846362542328, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1251257283599089718, code=themeStyle, value=null, createTime=1776253433075, updateTime=1776253433075, creator=18614031015, updator=18614031015)])], journalTitle=国防科技大学学报, weixinUrl=null, journalUrl=http://journal.nudt.edu.cn/, iacademicId=null, status=1, seqNo=null, journalTitleEn=Journal of National Niversity of Defense Technology, journalPhotoCn=h+HgOUssQ5XqPoD980XNIA==, journalPhotoEn=hJx8onaXftcX9VtGkHdjDA==, journalFirstLetter=J, journalRecommend=null, journalNew=null, journalCollection=null, jcrJf=null, cjcrJf=null, 
jcrJfStr=null, cjcrJfStr=null, submissionFirstDecision=null, sciSubjectClassification=null, casSubjectClassification=null, citeScore=null, totalCitationFrequency=null, icpCode=null, psCode=null, advertisingLicenseCode=null, copyrightInformation=null, country=null, option=, provinceCode=null, provinceName=null, collectFlag=false), detailUrlCn=https://castjournals.cast.org.cn/joweb/gfkjdxxb/CN/10.11887/j.issn.1001-2486.25050035, detailUrlEn=https://castjournals.cast.org.cn/joweb/gfkjdxxb/EN/10.11887/j.issn.1001-2486.25050035, pdfUrlCn=https://castjournals.cast.org.cn/joweb/gfkjdxxb/CN/PDF/10.11887/j.issn.1001-2486.25050035, pdfUrlEn=https://castjournals.cast.org.cn/joweb/gfkjdxxb/EN/PDF/10.11887/j.issn.1001-2486.25050035, aliStartDate=null, aliEndDate=null, collectionFlag=false, citedCount=null, citedUrl=null, reference=null)