Article(id=1149776907329302597, tenantId=1146029695717560320, journalId=1146123166801305609, issueId=1149776900194791454, articleNumber=null, orderNo=null, doi=10.12404/j.issn.1671-1815.2403487, pmid=null, cstr=null, oa=null, hot=null, price=null, onlineType=0, articleFormat=0, articleType=null, articleTypeStr=research-article, receivedDate=1715356800000, receivedDateStr=2024-05-11, revisedDate=1722441600000, revisedDateStr=2024-08-01, acceptedDate=null, acceptedDateStr=null, onlineDate=1752057776528, onlineDateStr=2025-07-09, pubDate=1744905600000, pubDateStr=2025-04-18, doiRegisterDate=null, doiRegisterDateStr=null, onlineIssueDate=1752057776528, onlineIssueDateStr=2025-07-09, onlineJustAcceptDate=null, onlineJustAcceptDateStr=null, onlineFirstDate=null, onlineFirstDateStr=null, sourceXml=null, magXml=null, createTime=1752057776528, creator=13701087609, updateTime=1752057776528, updator=13701087609, issue=Issue{id=1149776900194791454, tenantId=1146029695717560320, journalId=1146123166801305609, year='2025', volume='25', issue='11', pageStart='4397', pageEnd='4826', issueExtLink='null', onlineDate='null', pubDate='null', beforeIssueId=null, nextIssueId=null, price=null, status=1, issueComplete=1, articleOrder=1, issueType=-1, specialIssue=0, createTime=1752057774827, creator=13701087609, updateTime=1768456666677, updator=13701087609, preIssue=null, nextIssue=null, ext={EN=IssueExt(id=1218558837930512931, tenantId=1146029695717560320, journalId=1146123166801305609, issueId=1149776900194791454, language=EN, specialIssueTitle=, coverIllustrator=, specialIssueEditor=, specialIssueAbout=), CN=IssueExt(id=1218558837930512932, tenantId=1146029695717560320, journalId=1146123166801305609, issueId=1149776900194791454, language=CN, specialIssueTitle=, coverIllustrator=, specialIssueEditor=, specialIssueAbout=)}, issueFiles=null}, startPage=4638, endPage=4646, ext={EN=ArticleExt(id=1149776907572572230, articleId=1149776907329302597, tenantId=1146029695717560320, 
journalId=1146123166801305609, language=EN, title=Cross Modal Learning Method of Speech Face
via Single Stream Network, columnId=1156262729162810294, journalTitle=Science Technology and Engineering, columnName=Papers·Automation and Computational Technology, runingTitle=null, highlight=null, articleAbstract=
Existing methods for audio-visual cross-modal association learning often adopt a dual-stream network structure, but they still face challenges in reducing computational complexity, making models lightweight, and fusing features efficiently. To improve model performance and enhance the efficiency of cross-modal learning, a single-stream network-based approach for audio-visual cross-modal learning was proposed. Firstly, preprocessed data from both modalities were fed into a single-stream feature extraction network, where a class-information-based loss function was employed to learn and extract feature vectors from both modalities. Subsequently, attention-based feature fusion was performed on the extracted feature vectors from both modalities. Finally, a combination of the cosine similarity algorithm and cross-entropy loss was used to learn the association between the two modalities, thus completing the cross-modal association learning task. Experimental results demonstrate that the proposed method achieves promising performance in audio-visual cross-modal verification, matching, and retrieval tasks, ensuring excellent performance while keeping the network structure lightweight and flexible.
, correspAuthors=Fan-liang BU, authorNote=null, correspAuthorsNote=null, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=null, magXml=null, pdfUrl=null, pdf=null, pdfFileSize=null, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=null, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=null, mapNumber=null, authorCompany=null, fund=null, authors=null, authorsList=Fang-hao ZHONG, Fan-liang BU, Hao-ming QIN), CN=ArticleExt(id=1149776930318283417, articleId=1149776907329302597, tenantId=1146029695717560320, journalId=1146123166801305609, language=CN, title=基于单流网络的语音-人脸的跨模态学习方法, columnId=1156262729783567290, journalTitle=科学技术与工程, columnName=论文·自动化技术、计算机技术, runingTitle=null, highlight=null, articleAbstract=
现有的语音-人脸跨模态关联学习方法多采用双流网络结构,在降低计算复杂度、模型轻量化和高效特征融合方面还面临一些挑战,为了改善模型性能,提高跨模态学习的效率,提出一种基于单流网络的语音-人脸的跨模态学习方法。首先,将预处理的两种模态数据送入单流特征提取网络,利用基于类信息的损失函数学习提取两种模态的有效特征,接着对提取的两种模态特征向量进行基于注意力机制的特征融合,最后使用余弦相似度算法和交叉熵损失相结合的方法来学习两种模态的关联,从而完成跨模态关联学习任务。实验结果表明,本文提出的方法在语音-人脸跨模态验证、匹配和检索任务上均取得了良好的效果,在考虑网络结构轻量化和灵活性的同时保证了优秀的性能。
, correspAuthors=卜凡亮, authorNote=null, correspAuthorsNote=
, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=xuLX6/MKuUwbsuvfMW9Jew==, magXml=/pVz4TBCSEzh76LfSFnzTw==, pdfUrl=null, pdf=mUxc6OHff02NpcqY/hZYzA==, pdfFileSize=5863599, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=AEQd1kB3VFdi7Bj79oNSbg==, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=JOj3+h8LpIqUIIpu+bKFBw==, mapNumber=null, authorCompany=null, fund=null, authors=
, authorsList=钟方昊, 卜凡亮, 秦昊铭)}, authors=[Author(id=1218843901805379938, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, orderNo=0, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=792999805@qq.com, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1218843902040260976, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, authorId=1218843901805379938, language=EN, stringName=Fang-hao ZHONG, firstName=Fang-hao, middleName=null, lastName=ZHONG, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1218843902149312891, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, authorId=1218843901805379938, language=CN, stringName=钟方昊, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=中国人民公安大学信息网络安全学院, 北京 100038;, bio={"content":"
钟方昊(2000—),男,汉族,江西赣州人,硕士研究生。研究方向:计算机视觉、多模态学习。E-mail:792999805@qq.com。
"}, bioImg=null, bioContent=
钟方昊(2000—),男,汉族,江西赣州人,硕士研究生。研究方向:计算机视觉、多模态学习。E-mail:792999805@qq.com。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1218843901700522325, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, xref=null, ext=[AuthorCompanyExt(id=1218843901708910934, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China), AuthorCompanyExt(id=1218843901713105240, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国人民公安大学信息网络安全学院, 北京 100038;)])]), Author(id=1218843902229004677, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, orderNo=1, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=bufanliang@sina.com, emailSecond=null, emailThird=null, correspondingAuthor=1, authorType=1, ext={EN=AuthorExt(id=1218843902317085072, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, authorId=1218843902229004677, language=EN, stringName=Fan-liang BU, firstName=Fan-liang, middleName=null, lastName=BU, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
*, address=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1218843902413554069, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, authorId=1218843902229004677, language=CN, stringName=卜凡亮, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
*, address=中国人民公安大学信息网络安全学院, 北京 100038;, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1218843901700522325, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, xref=null, ext=[AuthorCompanyExt(id=1218843901708910934, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China), AuthorCompanyExt(id=1218843901713105240, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国人民公安大学信息网络安全学院, 北京 100038;)])]), Author(id=1218843902539383200, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, orderNo=2, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=null, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1218843902673600939, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, authorId=1218843902539383200, language=EN, stringName=Hao-ming QIN, firstName=Hao-ming, middleName=null, lastName=QIN, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1218843902761681332, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, 
authorId=1218843902539383200, language=CN, stringName=秦昊铭, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=中国人民公安大学信息网络安全学院, 北京 100038;, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1218843901700522325, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, xref=null, ext=[AuthorCompanyExt(id=1218843901708910934, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China), AuthorCompanyExt(id=1218843901713105240, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国人民公安大学信息网络安全学院, 北京 100038;)])])], keywords=[Keyword(id=1218843902967202243, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, orderNo=1, keyword=association learning), Keyword(id=1218843903072059848, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, orderNo=2, keyword=voice-face cross-modal), Keyword(id=1218843903151751631, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, orderNo=3, keyword=single-stream network), Keyword(id=1218843903269192149, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, orderNo=4, keyword=feature fusion), Keyword(id=1218843903353078236, tenantId=1146029695717560320, journalId=1146123166801305609, 
articleId=1149776907329302597, language=CN, orderNo=1, keyword=关联学习), Keyword(id=1218843903462130151, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, orderNo=2, keyword=语音-人脸跨模态), Keyword(id=1218843903566987757, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, orderNo=3, keyword=单流网络), Keyword(id=1218843903663456759, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, orderNo=4, keyword=特征融合)], refs=[Reference(id=1218843908092642149, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2003, volume=13, issue=19, pageStart=1709, pageEnd=1714, url=null, language=null, rfNumber=[1], rfOrder=0, authorNames=Kamachi M, Hill H, Lander K, journalName=Current Biology, refType=null, unstructuredReference=
Kamachi M,
Hill H,
Lander K, et al. Putting the face to the voice: matching identity across modality[J].
Current Biology,
2003,
13(19): 1709-1714., articleTitle=Putting the face to the voice: matching identity across modality, refAbstract=null), Reference(id=1218843908214276976, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2016, volume=78, issue=3, pageStart=868, pageEnd=879, url=null, language=null, rfNumber=[2], rfOrder=1, authorNames=Smith H M J, Dunn A K, Baguley T, journalName=Attention, Perception & Psychophysics, refType=null, unstructuredReference=
Smith H M J,
Dunn A K,
Baguley T, et al. Matching novel face and voice identity using static and dynamic facial images[J].
Attention, Perception & Psychophysics,
2016,
78(3): 868-879., articleTitle=Matching novel face and voice identity using static and dynamic facial images, refAbstract=null), Reference(id=1218843908356883326, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=1990, volume=55, issue=null, pageStart=241, pageEnd=261, url=null, language=null, rfNumber=[3], rfOrder=2, authorNames=Teager H M, Teager S M, journalName=Speech Production and Speech Modelling, refType=null, unstructuredReference=
Teager H M,
Teager S M. Evidence for nonlinear sound production mechanisms in the vocal tract[J].
Speech Production and Speech Modelling,
1990,
55: 241-261., articleTitle=Evidence for nonlinear sound production mechanisms in the vocal tract, refAbstract=null), Reference(id=1218843908533044113, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2022, volume=22, issue=36, pageStart=16116, pageEnd=16122, url=null, language=null, rfNumber=[4], rfOrder=3, authorNames=郭睿华, 宋俊鹏, 王文旭, journalName=科学技术与工程, refType=null, unstructuredReference=郭睿华, 宋俊鹏, 王文旭, 等. 基于视触数据融合的多模态细分类系统[J].
科学技术与工程,
2022,
22(36): 16116-16122., articleTitle=基于视触数据融合的多模态细分类系统, refAbstract=null), Reference(id=1218843908675650463, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2022, volume=22, issue=36, pageStart=16116, pageEnd=16122, url=null, language=null, rfNumber=[4], rfOrder=4, authorNames=Guo Ruihua, Song Junpeng, Wang Wenxu, journalName=Science Technology and Engineering, refType=null, unstructuredReference=
Guo Ruihua,
Song Junpeng,
Wang Wenxu, et al. Subdivision system based on multimodal visual and tactile data fusion[J].
Science Technology and Engineering,
2022,
22(36): 16116-16122., articleTitle=Subdivision system based on multimodal visual and tactile data fusion, refAbstract=null), Reference(id=1218843908847616938, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[5], rfOrder=5, authorNames=Tao R, Das R K, Li H, journalName=Arxiv Preprint Arxiv: 2008.03894, refType=null, unstructuredReference=
Tao R,
Das R K,
Li H. Audio-visual speaker recognition with a cross-modal discriminative network[J].
Arxiv Preprint Arxiv: 2008.03894,
2020., articleTitle=Audio-visual speaker recognition with a cross-modal discriminative network, refAbstract=null), Reference(id=1218843909065720766, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2019, volume=22, issue=4, pageStart=934, pageEnd=948, url=null, language=null, rfNumber=[6], rfOrder=6, authorNames=Liu Y, Kılıç V, Guan J, journalName=IEEE Transactions on Multimedia, refType=null, unstructuredReference=
Liu Y,
Kılıç V,
Guan J, et al. Audio-visual particle flow smc-phd filtering for multi-speaker tracking[J].
IEEE Transactions on Multimedia,
2019,
22(4): 934-948., articleTitle=Audio-visual particle flow smc-phd filtering for multi-speaker tracking, refAbstract=null), Reference(id=1218843909204132814, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2021, volume=32, issue=1, pageStart=423, pageEnd=436, url=null, language=null, rfNumber=[7], rfOrder=7, authorNames=Kong C, Chen B, Yang W, journalName=IEEE Transactions on Circuits and Systems for Video Technology, refType=null, unstructuredReference=
Kong C,
Chen B,
Yang W, et al. Appearance matters, so does audio: revealing the hidden face via cross-modality transfer[J].
IEEE Transactions on Circuits and Systems for Video Technology,
2021,
32(1): 423-436., articleTitle=Appearance matters, so does audio: revealing the hidden face via cross-modality transfer, refAbstract=null), Reference(id=1218843909388682212, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2024, volume=24, issue=7, pageStart=2804, pageEnd=2812, url=null, language=null, rfNumber=[8], rfOrder=8, authorNames=李俊屿, 卜凡亮, 谭林, journalName=科学技术与工程, refType=null, unstructuredReference=李俊屿, 卜凡亮, 谭林, 等. 基于多模态共享网络的自监督语音-人脸跨模态关联学习方法[J].
科学技术与工程,
2024,
24(7): 2804-2812., articleTitle=基于多模态共享网络的自监督语音-人脸跨模态关联学习方法, refAbstract=null), Reference(id=1218843909501928429, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2024, volume=24, issue=7, pageStart=2804, pageEnd=2812, url=null, language=null, rfNumber=[8], rfOrder=9, authorNames=Li Junyu, Bu Fanliang, Tan Lin, journalName=Science Technology and Engineering, refType=null, unstructuredReference=
Li Junyu,
Bu Fanliang,
Tan Lin, et al. Self-supervised voice-face cross-modal association learning method via multi-modal shared network[J].
Science Technology and Engineering,
2024,
24(7): 2804-2812., articleTitle=Self-supervised voice-face cross-modal association learning method via multi-modal shared network, refAbstract=null), Reference(id=1218843909661312000, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=8427, pageEnd=8436, url=null, language=null, rfNumber=[9], rfOrder=10, authorNames=Nagrani A, Albanie S, Zisserman A, journalName=Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, refType=null, unstructuredReference=
Nagrani A,
Albanie S,
Zisserman A. Seeing voices and hearing faces: cross-modal biometric matching[C]//
Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. New York: IEEE,
2018: 8427-8436., articleTitle=Seeing voices and hearing faces: cross-modal biometric matching, refAbstract=null), Reference(id=1218843909824888853, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[10], rfOrder=11, authorNames=Wen Y, Ismail M A, Liu W, journalName=Arxiv Preprint Arxiv: 1807.04836, refType=null, unstructuredReference=
Wen Y,
Ismail M A,
Liu W, et al. Disjoint mapping network for cross-modal matching of voices and faces[J].
Arxiv Preprint Arxiv: 1807.04836,
2018., articleTitle=Disjoint mapping network for cross-modal matching of voices and faces, refAbstract=null), Reference(id=1218843909963300897, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=71, pageEnd=88, url=null, language=null, rfNumber=[11], rfOrder=12, authorNames=Nagrani A, Albanie S, Zisserman A, journalName=Proceedings of the European Conference on Computer Vision (ECCV). Munich, refType=null, unstructuredReference=
Nagrani A,
Albanie S,
Zisserman A. Learnable pins: cross-modal embeddings for person identity[C]//
Proceedings of the European Conference on Computer Vision (ECCV). Munich, Germany: Springer,
2018: 71-88., articleTitle=Learnable pins: cross-modal embeddings for person identity, refAbstract=null), Reference(id=1218843910168821807, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=276, pageEnd=292, url=null, language=null, rfNumber=[12], rfOrder=13, authorNames=Kim C, Shin H V, Oh T H, journalName=Asian Conference on Computer Vision. Berlin:Springer, refType=null, unstructuredReference=
Kim C,
Shin H V,
Oh T H, et al. On learning associations of faces and voices[C]//
Asian Conference on Computer Vision. Berlin:Springer,
2019: 276-292., articleTitle=On learning associations of faces and voices, refAbstract=null), Reference(id=1218843910265290810, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=1011, pageEnd=1019, url=null, language=null, rfNumber=[13], rfOrder=14, authorNames=Horiguchi S, Kanda N, Nagamatsu K, journalName=Proceedings of the 26th ACM International Conference on Multimedia. Seoul, refType=null, unstructuredReference=
Horiguchi S,
Kanda N,
Nagamatsu K. Face-voice matching using cross-modal embeddings[C]//
Proceedings of the 26th ACM International Conference on Multimedia. Seoul, Korea: ACM,
2018: 1011-1019., articleTitle=Face-voice matching using cross-modal embeddings, refAbstract=null), Reference(id=1218843910420480075, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=1, pageEnd=7, url=null, language=null, rfNumber=[14], rfOrder=15, authorNames=Nawaz S, Janjua M K, Gallo I, journalName=2019 Digital Image Computing:Techniques and Applications (DICTA), refType=null, unstructuredReference=
Nawaz S,
Janjua M K,
Gallo I, et al. Deep latent space learning for cross-modal mapping of audio and visual signals[C]//
2019 Digital Image Computing:Techniques and Applications (DICTA). New York: IEEE,
2019: 1-7., articleTitle=Deep latent space learning for cross-modal mapping of audio and visual signals, refAbstract=null), Reference(id=1218843910680526950, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2016, volume=23, issue=10, pageStart=1499, pageEnd=1503, url=null, language=null, rfNumber=[15], rfOrder=16, authorNames=Zhang K, Zhang Z, Li Z, journalName=IEEE Signal Processing Letters, refType=null, unstructuredReference=
Zhang K,
Zhang Z,
Li Z, et al. Joint face detection and alignment using multitask cascaded convolutional networks[J].
IEEE Signal Processing Letters,
2016,
23(10): 1499-1503., articleTitle=Joint face detection and alignment using multitask cascaded convolutional networks, refAbstract=null), Reference(id=1218843910776995954, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2011, volume=null, issue=9, pageStart=78, pageEnd=79, url=null, language=null, rfNumber=[16], rfOrder=17, authorNames=刘琦, journalName=网络安全技术与应用, refType=null, unstructuredReference=刘琦. 语音信号短时能量及短时幅值对比分析[J].
网络安全技术与应用,
2011(9): 78-79., articleTitle=语音信号短时能量及短时幅值对比分析, refAbstract=null), Reference(id=1218843910919602308, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2011, volume=null, issue=9, pageStart=78, pageEnd=79, url=null, language=null, rfNumber=[16], rfOrder=18, authorNames=Liu Qi, journalName=Journal of Network Security Technology and Applications, refType=null, unstructuredReference=
Liu Qi. Comparative analysis of short-time energy and short-time amplitude of speech signal[J].
Journal of Network Security Technology and Applications,
2011(9): 78-79., articleTitle=Comparative analysis of short-time energy and short-time amplitude of speech signal, refAbstract=null), Reference(id=1218843911041237137, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=2616, pageEnd=2620, url=null, language=null, rfNumber=[17], rfOrder=19, authorNames=Nagrani A, Chung J S, Zisserman A, journalName=Arxiv Preprint Arxiv: 1706. 08612, refType=null, unstructuredReference=
Nagrani A,
Chung J S,
Zisserman A. Voxceleb: a large-scale speaker identification dataset[J].
Arxiv Preprint Arxiv: 1706.08612,
2017: 2616-2620., articleTitle=Voxceleb: a large-scale speaker identification dataset, refAbstract=null), Reference(id=1218843911209009317, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[18], rfOrder=20, authorNames=Szegedy C, Ioffe S, Vanhoucke V, journalName=Arxiv.1602.07261, refType=null, unstructuredReference=
Szegedy C,
Ioffe S,
Vanhoucke V, et al. Inception-v4, inception-ResNet and the impact of residual connections on learning[J].
Arxiv.1602.07261,
2016., articleTitle=Inception-v4, inception-ResNet and the impact of residual connections on learning, refAbstract=null), Reference(id=1218843911326449845, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=7057, pageEnd=7061, url=null, language=null, rfNumber=[19], rfOrder=21, authorNames=Saeed M S, Khan M H, Nawaz S, journalName=ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Saeed M S,
Khan M H,
Nawaz S, et al. Fusion and orthogonal projection for improved face-voice association[C]//
ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). New York: IEEE,
2022: 7057-7061., articleTitle=Fusion and orthogonal projection for improved face-voice association, refAbstract=null), Reference(id=1218843911473250506, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=499, pageEnd=515, url=null, language=null, rfNumber=[20], rfOrder=22, authorNames=Wen Y, Zhang K, Li Z, journalName=Computer Vision-ECCV 2016: 14th European Conference, refType=null, unstructuredReference=
Wen Y,
Zhang K,
Li Z, et al. A discriminative feature learning approach for deep face recognition[C]//
Computer Vision-ECCV 2016: 14th European Conference. Amsterdam: Springer International Publishing,
2016: 499-515., articleTitle=A discriminative feature learning approach for deep face recognition, refAbstract=null), Reference(id=1218843911573913810, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[21], rfOrder=23, authorNames=Arevalo J, Solorio T, Montes-y-Gómez M, journalName=Arxiv Preprint Arxiv: 1702.01992, refType=null, unstructuredReference=
Arevalo J,
Solorio T,
Montes-y-Gómez M, et al. Gated multimodal units for information fusion[J].
Arxiv Preprint Arxiv: 1702.01992,
2017., articleTitle=Gated multimodal units for information fusion, refAbstract=null), Reference(id=1218843911733297379, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=2252, pageEnd=2256, url=null, language=null, rfNumber=[22], rfOrder=24, authorNames=Chen Z, Wang S, Qian Y, journalName=Interspeech 2020, refType=null, unstructuredReference=
Chen Z,
Wang S,
Qian Y. Multi-modality matters: a performance leap on VoxCeleb[C]//
Interspeech 2020. Slough: ISCA,
2020: 2252-2256., articleTitle=Multi-modality matters: a performance leap on VoxCeleb, refAbstract=null), Reference(id=1218843911959789817, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[23], rfOrder=25, authorNames=Nagrani A, Chung J S, Zisserman A, journalName=Arxiv Preprint, refType=null, unstructuredReference=
Nagrani A,
Chung J S,
Zisserman A. Voxceleb: a large-scale speaker identification dataset[J].
Arxiv Preprint,
2017: Arxiv: 1706. 08612., articleTitle=Voxceleb: a large-scale speaker identification dataset, refAbstract=null), Reference(id=1218843912060453124, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[24], rfOrder=26, authorNames=Kingma D P, journalName=arXiv preprint Arxiv: 1412.6980, 2014, refType=null, unstructuredReference=
Kingma D P. Adam: a method for stochastic optimization[J].
arXiv preprint Arxiv: 1412.6980, 2014., articleTitle=Adam: a method for stochastic optimization, refAbstract=null), Reference(id=1218843912194670866, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=1682, pageEnd=1691, url=null, language=null, rfNumber=[25], rfOrder=27, authorNames=Nawaz S, Saeed M S, Morerio P, journalName=Computer Vision and Pattern Recognition, refType=null, unstructuredReference=
Nawaz S,
Saeed M S,
Morerio P, et al. Cross-modal speaker verification and recognition: a multilingual perspective[C]//
Computer Vision and Pattern Recognition. New York: IEEE,
2021: 1682-1691., articleTitle=Cross-modal speaker verification and recognition: a multilingual perspective, refAbstract=null), Reference(id=1218843912370831655, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=6194, pageEnd=6198, url=null, language=null, rfNumber=[26], rfOrder=28, authorNames=Sarı L, Singh K, Zhou J, journalName=2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Sarı L,
Singh K,
Zhou J, et al. A multi-view approach to audio-visual speaker verification[C]//
2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). New York: IEEE,
2021: 6194-6198., articleTitle=A multi-view approach to audio-visual speaker verification, refAbstract=null), Reference(id=1218843912479883574, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2021, volume=24, issue=null, pageStart=338, pageEnd=351, url=null, language=null, rfNumber=[27], rfOrder=29, authorNames=Zheng A, Hu M, Jiang B, journalName=IEEE Transactions on Multimedia, refType=null, unstructuredReference=
Zheng A,
Hu M,
Jiang B, et al. Adversarial-metric learning for audio-visual cross-modal matching[J].
IEEE Transactions on Multimedia,
2021,
24: 338-351., articleTitle=Adversarial-metric learning for audio-visual cross-modal matching, refAbstract=null), Reference(id=1218843912584741185, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[28], rfOrder=30, authorNames=Xiong C Y, Zhang D Y, Liu Tao, journalName=2019: arXiv: 1911.09338, refType=null, unstructuredReference=
Xiong C Y,
Zhang D Y,
Liu Tao, et al. Voice-face cross-modal matching and retrieval: a benchmark.
2019: arXiv: 1911.09338., articleTitle=Voice-face cross-modal matching and retrieval: a benchmark, refAbstract=null), Reference(id=1218843912702181710, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2023, volume=60, issue=11, pageStart=2638, pageEnd=2649, url=null, language=null, rfNumber=[29], rfOrder=31, authorNames=朱明航, 柳欣, 于镇宁, journalName=计算机研究与发展, refType=null, unstructuredReference=朱明航, 柳欣, 于镇宁, 等. 基于双向伪标签自监督学习的跨人脸-语音匹配方法[J].
计算机研究与发展,
2023,
60(11): 2638-2649., articleTitle=基于双向伪标签自监督学习的跨人脸-语音匹配方法, refAbstract=null), Reference(id=1218843912823816534, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2023, volume=60, issue=11, pageStart=2638, pageEnd=2649, url=null, language=null, rfNumber=[29], rfOrder=32, authorNames=Zhu Minghang, Liu Xin, Yu Zhenning, journalName=Journal of Computer Research and Development, refType=null, unstructuredReference=
Zhu Minghang,
Liu Xin,
Yu Zhenning, et al. Cross face-voice matching method
via bi-pseudo label based self-supervised learning[J].
Journal of Computer Research and Development,
2023,
60(11): 2638-2649., articleTitle=Cross face-voice matching method
via bi-pseudo label based self-supervised learning, refAbstract=null), Reference(id=1218843912941257060, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[30], rfOrder=33, authorNames=Zhu B, Xu K, Wang C, journalName=arXiv preprint arXiv: 2204.14057, refType=null, unstructuredReference=
Zhu B,
Xu K,
Wang C, et al. Unsupervised voice-face representation learning by cross-modal prototype contrast[J].
arXiv preprint arXiv: 2204.14057,
2022., articleTitle=Unsupervised voice-face representation learning by cross-modal prototype contrast, refAbstract=null), Reference(id=1218843913100640621, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2022, volume=null, issue=null, pageStart=527, pageEnd=535, url=null, language=null, rfNumber=[31], rfOrder=34, authorNames=Chen G, Zhang D, Liu T, journalName=Proceedings of the 2022 International Conference on Multimedia Retrieval, refType=null, unstructuredReference=
Chen G,
Zhang D,
Liu T, et al. Self-lifting: a novel framework for unsupervised voice-face association learning[C]//
Proceedings of the 2022 International Conference on Multimedia Retrieval. New York: IEEE,
2022: 527-535., articleTitle=Self-lifting: a novel framework for unsupervised voice-face association learning, refAbstract=null), Reference(id=1218843913218081143, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=1881, pageEnd=1884, url=null, language=null, rfNumber=[32], rfOrder=35, authorNames=Wang R, Liu X, Cheung Y, journalName=Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, refType=null, unstructuredReference=
Wang R,
Liu X,
Cheung Y, et al. Learning discriminative joint embeddings for efficient face and voice association[C]//
Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval. New York: ACM,
2020: 1881-1884., articleTitle=Learning discriminative joint embeddings for efficient face and voice association, refAbstract=null)], funds=[Fund(id=1218843907635462982, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, awardId=2023SYL08, language=CN, fundingSource=中国人民公安大学安全防范工程双一流创新研究专项(2023SYL08), fundOrder=null, country=null)], companyList=[AuthorCompany(id=1218843901700522325, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, xref=null, ext=[AuthorCompanyExt(id=1218843901708910934, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=School of Information Network Security, Peoples Public Security University of China, Beijing 100038, China), AuthorCompanyExt(id=1218843901713105240, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, companyId=1218843901700522325, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国人民公安大学信息网络安全学院, 北京 100038;)])], figs=[ArticleFig(id=1218843903965446665, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.1, caption=
Overall framework of cross-modal association learning method for speech-face based on single-stream network, figureFileSmall=EEtVW/AAuoMNaSJUsaIEtg==, figureFileBig=fjnSLt6HkBLIyTHGZifBeg==, tableContent=null), ArticleFig(id=1218843904066109971, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图1, caption=
基于单流网络的语音-人脸跨模态关联学习方法的总体框架 ei和bi分别表示语音和人脸的特征嵌入;Ii表示特征融合后的混合向量
, figureFileSmall=EEtVW/AAuoMNaSJUsaIEtg==, figureFileBig=fjnSLt6HkBLIyTHGZifBeg==, tableContent=null), ArticleFig(id=1218843904208716322, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.2, caption=
Framework of InceptionResNet-V1, figureFileSmall=sq4a9HBT8iNuDBMjg9V88Q==, figureFileBig=WRB0P9a4E7AKyQl86QGOFw==, tableContent=null), ArticleFig(id=1218843904351322671, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图2, caption=
InceptionResNet-V1结构, figureFileSmall=sq4a9HBT8iNuDBMjg9V88Q==, figureFileBig=WRB0P9a4E7AKyQl86QGOFw==, tableContent=null), ArticleFig(id=1218843904439403061, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.3, caption=
Framework of InceptionResNet-A and Reduction-A, figureFileSmall=VSrlL8/wmvATheUimmqTRg==, figureFileBig=mm2Ge1RDQqXfno8caUlnnA==, tableContent=null), ArticleFig(id=1218843904607175230, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图3, caption=
InceptionResNet-A 和Reduction-A结构图, figureFileSmall=VSrlL8/wmvATheUimmqTRg==, figureFileBig=mm2Ge1RDQqXfno8caUlnnA==, tableContent=null), ArticleFig(id=1218843904745587271, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.4, caption=
Framework of InceptionResNet-B and Reduction-B, figureFileSmall=AdpCG5ygvhaz+41jCtYLeQ==, figureFileBig=QGaas99Nf6l2Sz+gNhfe1Q==, tableContent=null), ArticleFig(id=1218843904892387922, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图4, caption=
InceptionResNet-B和Reduction-B结构图, figureFileSmall=AdpCG5ygvhaz+41jCtYLeQ==, figureFileBig=QGaas99Nf6l2Sz+gNhfe1Q==, tableContent=null), ArticleFig(id=1218843905026605663, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.5, caption=
Structure diagram of attention-based feature fusion module, figureFileSmall=YJkF9GKF5up28RbHifP/Qw==, figureFileBig=vXEhmiH5zuTVXvl034zmIw==, tableContent=null), ArticleFig(id=1218843905139851883, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图5, caption=
基于注意力的特征融合模块结构图, figureFileSmall=YJkF9GKF5up28RbHifP/Qw==, figureFileBig=vXEhmiH5zuTVXvl034zmIw==, tableContent=null), ArticleFig(id=1218843905286652541, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.6, caption=
Comparison of cross-modal 1∶N matching results, figureFileSmall=jTglxOAJoYOTQmybUovd0Q==, figureFileBig=VXWzV5VEpgWMe3kROzY2Dw==, tableContent=null), ArticleFig(id=1218843905391510154, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图6, caption=
跨模态1∶N匹配结果对比, figureFileSmall=jTglxOAJoYOTQmybUovd0Q==, figureFileBig=VXWzV5VEpgWMe3kROzY2Dw==, tableContent=null), ArticleFig(id=1218843905517339288, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Fig.7, caption=
Visualized results of cross-modal retrieval, figureFileSmall=+9qxh948OQar6knacqAA6A==, figureFileBig=i8OSbdLEzmg9HL8lhok61A==, tableContent=null), ArticleFig(id=1218843905638974112, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=图7, caption=
跨模态检索可视化结果, figureFileSmall=+9qxh948OQar6knacqAA6A==, figureFileBig=i8OSbdLEzmg9HL8lhok61A==, tableContent=null), ArticleFig(id=1218843905773191852, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Table 1, caption=
Experimental platform and environment
, figureFileSmall=null, figureFileBig=null, tableContent=
| 配置类型 | 参数 |
| CPU | Intel CORE i7 |
| GPU | NVIDIA GeForce RTX 4060 Laptop GPU |
| 操作系统 | Windows11 |
| 编程语言 | Python3.9 |
| 深度学习框架 | Pytorch、tensorflow |
| 开发工具 | PyCharm |
), ArticleFig(id=1218843905861272249, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=表1, caption=
实验平台与环境
, figureFileSmall=null, figureFileBig=null, tableContent=
| 配置类型 | 参数 |
| CPU | Intel CORE i7 |
| GPU | NVIDIA GeForce RTX 4060 Laptop GPU |
| 操作系统 | Windows11 |
| 编程语言 | Python3.9 |
| 深度学习框架 | Pytorch、tensorflow |
| 开发工具 | PyCharm |
), ArticleFig(id=1218843905970324165, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Table 2, caption=
Results of ungrouped cross-modal validation experiments
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法 | EER/% | AUC/% |
| DIM Net[10] | 24.9 | 82.5 |
| Learnable Pins[11] | 29.6 | 78.5 |
| MAV-Celeb[25] | 29.0 | 78.9 |
| Deep Latent Space[14] | 29.5 | 78.8 |
| Multi-view Approach[26] | 28.0 | — |
| Adversarial-Metric Learning[27] | — | 80.6 |
| 本文方法 | 25.6 | 82.5 |
), ArticleFig(id=1218843906104541902, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=表2, caption=
未分组跨模态验证实验结果
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法 | EER/% | AUC/% |
| DIM Net[10] | 24.9 | 82.5 |
| Learnable Pins[11] | 29.6 | 78.5 |
| MAV-Celeb[25] | 29.0 | 78.9 |
| Deep Latent Space[14] | 29.5 | 78.8 |
| Multi-view Approach[26] | 28.0 | — |
| Adversarial-Metric Learning[27] | — | 80.6 |
| 本文方法 | 25.6 | 82.5 |
), ArticleFig(id=1218843906226176729, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Table 3, caption=
Results of cross-modal validation under constraint conditions experiments
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法 | AUC/% |
| 性别 | 国籍 | 年龄 | GNA |
| DIMNet-I[10] | 71.0 | 81.1 | 77.7 | 62.8 |
| DIMNet-IG[10] | 71.2 | 81.9 | 78.0 | 62.8 |
| Learnable Pins[11] | 61.1 | 77.2 | 74.9 | 58.8 |
| Deep Latent Space[14] | 62.4 | 53.1 | 73.5 | 51.4 |
| 本文方法 | 68.8 | 70.8 | 76.9 | 58.2 |
), ArticleFig(id=1218843906343617252, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=表3, caption=
约束条件跨模态验证实验结果
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法 | AUC/% |
| 性别 | 国籍 | 年龄 | GNA |
| DIMNet-I[10] | 71.0 | 81.1 | 77.7 | 62.8 |
| DIMNet-IG[10] | 71.2 | 81.9 | 78.0 | 62.8 |
| Learnable Pins[11] | 61.1 | 77.2 | 74.9 | 58.8 |
| Deep Latent Space[14] | 62.4 | 53.1 | 73.5 | 51.4 |
| 本文方法 | 68.8 | 70.8 | 76.9 | 58.2 |
), ArticleFig(id=1218843906473640688, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Table 4, caption=
Results of cross-modal 1∶N matching experiments
, figureFileSmall=null, figureFileBig=null, tableContent=
| 样本数 | V-F/% |
| SVHF[9] | Learnable Pins[11] | Single Stream Net[14] | Dim net[10] | 本文 方法 |
| 2 | 82 | 84 | 78 | 84 | 83 |
| 4 | 61 | 54 | 56 | 65 | 64 |
| 6 | 49 | 42 | 42 | 52 | 51 |
| 8 | 43 | 36 | 36 | 44 | 42 |
| 10 | 36 | 30 | 30 | 38 | 37 |
), ArticleFig(id=1218843906649801469, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=表4, caption=
跨模态1∶N匹配实验结果
, figureFileSmall=null, figureFileBig=null, tableContent=
| 样本数 | V-F/% |
| SVHF[9] | Learnable Pins[11] | Single Stream Net[14] | Dim net[10] | 本文 方法 |
| 2 | 82 | 84 | 78 | 84 | 83 |
| 4 | 61 | 54 | 56 | 65 | 64 |
| 6 | 49 | 42 | 42 | 52 | 51 |
| 8 | 43 | 36 | 36 | 44 | 42 |
| 10 | 36 | 30 | 30 | 38 | 37 |
), ArticleFig(id=1218843906842739466, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Table 5, caption=
Results of cross-modal retrieval experiment
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法 | mAP/% |
| V-F | F-V |
| VFMR[28] | 4.70 | 5.47 |
| DIMNet[10] | 6.22 | 6.65 |
| Bi-Pcm-FST[29] | 6.36 | 6.04 |
| Deep Latent Space[14] | 6.87 | 7.57 |
| 本文方法 | 6.92 | 7.55 |
), ArticleFig(id=1218843906960179991, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=表5, caption=
跨模态检索实验结果
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法 | mAP/% |
| V-F | F-V |
| VFMR[28] | 4.70 | 5.47 |
| DIMNet[10] | 6.22 | 6.65 |
| Bi-Pcm-FST[29] | 6.36 | 6.04 |
| Deep Latent Space[14] | 6.87 | 7.57 |
| 本文方法 | 6.92 | 7.55 |
), ArticleFig(id=1218843907090203427, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=EN, label=Table 6, caption=
Results of ablation experiment
, figureFileSmall=null, figureFileBig=null, tableContent=
| 消融实验条目 | 未分组跨模态验证实验 |
| EER/% | AUC/% |
| 单流网络+FOP损失模块 | 28.5 | 79.8 |
| 单流网络+特征融合模块+CE损失 | 27.5 | 81.6 |
| 单流网络+特征融合模块+余弦相似度损失 | 30.5 | 77.5 |
| 完整模型 | 25.6 | 82.5 |
), ArticleFig(id=1218843907237004076, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1149776907329302597, language=CN, label=表6, caption=
消融实验结果
, figureFileSmall=null, figureFileBig=null, tableContent=
| 消融实验条目 | 未分组跨模态验证实验 |
| EER/% | AUC/% |
| 单流网络+FOP损失模块 | 28.5 | 79.8 |
| 单流网络+特征融合模块+CE损失 | 27.5 | 81.6 |
| 单流网络+特征融合模块+余弦相似度损失 | 30.5 | 77.5 |
| 完整模型 | 25.6 | 82.5 |
)], attaches=null, journal=Journal(id=1146119176004939786, delFlag=0, nameCn=科学技术与工程, nameEn=Science Technology and Engineering, nameHistory1=null, nameHistory2=null, issn=1671-1815, eissn=, cn=11-4688/T, coden=null, periodic=4, language=CN, oaType=是, ccby=null, superviseOffice=null, ownerOffice=null, pubOffice=null, editorOffice=null, officeType=null, aims=null, clcCode=null, officeProv=null, officeCity=null, officeAddr=null, officeZip=null, officeEmail=null, officePhone=null, editDirector=null, officeDirector=null, officeDirectorPhone=null, officeStaffNum=null, officeEmpNum=null, coverPicUrl=UKU/O7GSka5polgCTkbIIw==, journalPrice=null, startedYear=null, abbrevIsoEn=Sci Technol Eng, journalRemark=null, publicationField=null, createdTime=null, updatedTime=1754445529766, createdBy=null, updatedBy=13701087609, firstLetterCn=S, firstLetterEn=S, subjectCode=Natural Sciences, subjectName=自然科学, subjectCodeEn=Natural Sciences, subjectNameEn=null, picCn=UKU/O7GSka5polgCTkbIIw==, picEn=5hwlULoNwcbj3xUmVi9MAQ==, jcr=null, cjcr=null, exts=[JournalExt(id=1159791870395564357, language=CN, name=科学技术与工程, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=null, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=http://www.stae.com.cn/jsygc/home, createdTime=1754445529793, updatedTime=1754445529793, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=http://www.stae.com.cn/jsygc/site/menus/20090429150146001, submissionAuthorUrl=http://www.stae.com.cn/jsygc/author/login, submissionEditorUrl=http://www.stae.com.cn/jsygc/editor/login, submissionReviewUrl=http://www.stae.com.cn/jsygc/reviewer/login, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""}), JournalExt(id=1159791870441701702, language=EN, name=Science Technology and Engineering, nameHistory1=null, nameHistory2=null, 
managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=null, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=http://www.stae.com.cn/jsygc/home, createdTime=1754445529804, updatedTime=1754445529804, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=http://www.stae.com.cn/jsygc/author/login, submissionEditorUrl=http://www.stae.com.cn/jsygc/editor/login, submissionReviewUrl=http://www.stae.com.cn/jsygc/reviewer/login, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""})], databaseList=null, tenantJournalId=1146123166801305609, websiteList=[Website(id=1148243202391400884, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1146123166801305609, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/kxjsygc/CN, language=CN, createTime=1751692112777, createBy=18614031015, updateTime=1753520965431, updateBy=18614031015, name=科学技术与工程-中文站点, tplId=1146099689490845704, title=科学技术与工程, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1148622798802673703, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=articleTextType, value=kx, createTime=1751782615614, updateTime=1751782615614, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798781702180, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=banner, value=null, createTime=1751782615609, updateTime=1751782615609, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798769119267, tenantId=1146029695717560320, journalId=null, 
journalGroupId=null, siteId=1148243202391400884, code=logo, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic?fileId=j86gbwi+p0Idkyl5SzIlmQ==, createTime=1751782615606, updateTime=1751782615606, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798794285094, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic, createTime=1751782615612, updateTime=1751782615612, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798790090789, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1751782615611, updateTime=1751782615611, creator=18614031015, updator=18614031015)]), Website(id=1155914124811976731, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1146123166801305609, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/kxjsygc/EN, language=EN, createTime=1753521003206, createBy=18614031015, updateTime=1753521003206, updateBy=18614031015, name=科学技术与工程-英文站点, tplId=1146101810881728533, title=Science Technology and Engineering, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1155914371227308235, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=articleTextType, value=kx, createTime=1753521061952, updateTime=1753521061952, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371210531016, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=banner, value=null, 
createTime=1753521061947, updateTime=1753521061947, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371202142407, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=logo, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic?fileId=j86gbwi+p0Idkyl5SzIlmQ==, createTime=1753521061945, updateTime=1753521061945, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371223113930, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic, createTime=1753521061950, updateTime=1753521061950, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371218919625, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1753521061949, updateTime=1753521061949, creator=18614031015, updator=18614031015)])], journalTitle=科学技术与工程, weixinUrl=null, journalUrl=null, iacademicId=null, status=0, seqNo=null, journalTitleEn=Science Technology and Engineering, journalPhotoCn=UKU/O7GSka5polgCTkbIIw==, journalPhotoEn=5hwlULoNwcbj3xUmVi9MAQ==, journalFirstLetter=S, journalRecommend=null, journalNew=null, journalCollection=null, jcrJf=null, cjcrJf=null, jcrJfStr=null, cjcrJfStr=null, submissionFirstDecision=null, sciSubjectClassification=null, casSubjectClassification=null, citeScore=null, totalCitationFrequency=null, icpCode=null, psCode=null, advertisingLicenseCode=null, copyrightInformation=null, country=null, option=null, provinceCode=null, provinceName=null, collectFlag=false), detailUrlCn=https://castjournals.cast.org.cn/joweb/kxjsygc/CN/10.12404/j.issn.1671-1815.2403487, detailUrlEn=https://castjournals.cast.org.cn/joweb/kxjsygc/EN/10.12404/j.issn.1671-1815.2403487, 
pdfUrlCn=https://castjournals.cast.org.cn/joweb/kxjsygc/CN/PDF/10.12404/j.issn.1671-1815.2403487, pdfUrlEn=https://castjournals.cast.org.cn/joweb/kxjsygc/EN/PDF/10.12404/j.issn.1671-1815.2403487, aliStartDate=null, aliEndDate=null, collectionFlag=false, citedCount=null, citedUrl=null, reference=null)