Article(id=1261262688311755499, tenantId=1146029695717560320, journalId=1146123166801305609, issueId=1261262687258985194, articleNumber=null, orderNo=null, doi=10.12404/j.issn.1671-1815.2404954, pmid=null, cstr=null, oa=null, hot=null, price=null, onlineType=0, articleFormat=0, articleType=null, articleTypeStr=research-article, receivedDate=1719849600000, receivedDateStr=2024-07-02, revisedDate=1744300800000, revisedDateStr=2025-04-11, acceptedDate=null, acceptedDateStr=null, onlineDate=1778638058021, onlineDateStr=2026-05-13, pubDate=1752768000000, pubDateStr=2025-07-18, doiRegisterDate=null, doiRegisterDateStr=null, onlineIssueDate=1778638058021, onlineIssueDateStr=2026-05-13, onlineJustAcceptDate=null, onlineJustAcceptDateStr=null, onlineFirstDate=null, onlineFirstDateStr=null, sourceXml=null, magXml=null, createTime=1778638058021, creator=13701087609, updateTime=1778638058021, updator=13701087609, issue=Issue{id=1261262687258985194, tenantId=1146029695717560320, journalId=1146123166801305609, year='2025', volume='25', issue='20', pageStart='8317', pageEnd='8759', issueExtLink='null', onlineDate='null', pubDate='null', beforeIssueId=null, nextIssueId=null, price=null, status=1, issueComplete=1, articleOrder=1, issueType=-1, specialIssue=null, createTime=1778638057769, creator=13701087609, updateTime=1778753106634, updator=13701087609, preIssue=null, nextIssue=null, ext={EN=IssueExt(id=1261745237240722095, tenantId=1146029695717560320, journalId=1146123166801305609, issueId=1261262687258985194, language=EN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=), CN=IssueExt(id=1261745237240722096, tenantId=1146029695717560320, journalId=1146123166801305609, issueId=1261262687258985194, language=CN, specialIssueTitle=, coverIllustrator=null, specialIssueEditor=, specialIssueAbout=)}, issueFiles=null}, startPage=8331, endPage=8346, ext={EN=ArticleExt(id=1261262689075118829, articleId=1261262688311755499, tenantId=1146029695717560320, 
journalId=1146123166801305609, language=EN, title=Review of Speech Enhancement Methods Based on Deep Learning, columnId=1156262731956212064, journalTitle=Science Technology and Engineering, columnName=Surveies·Automation and Computational Technology, runingTitle=null, highlight=null, articleAbstract=
With the emergence of deep learning technologies, speech enhancement methods based on deep learning have seen widespread application and generally surpass traditional approaches in performance. The fundamental framework of noise reduction signal processing in speech enhancement was outlined and progressively delved into the latest advancements in deep learning-driven speech enhancement models. A comprehensive organization of deep learning-based speech enhancement algorithms was provided, detailing the principles, characteristics, evaluation metrics, and representative studies of various neural network-based methods. The advantages and limitations of these approaches were thoroughly assessed. Finally, in light of the current developmental landscape, the core challenges encountered in the speech enhancement process were analyzed, and future developmental trajectories were discussed and predicted.
, correspAuthors=Jia-qi FENG, authorNote=null, correspAuthorsNote=null, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=null, magXml=null, pdfUrl=null, pdf=null, pdfFileSize=null, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=null, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=null, mapNumber=null, authorCompany=null, fund=null, authors=null, authorsList=Hua-peng WANG, Jia-qi FENG), CN=ArticleExt(id=1261262692195681024, articleId=1261262688311755499, tenantId=1146029695717560320, journalId=1146123166801305609, language=CN, title=基于深度学习的语音增强方法综述, columnId=1156262732098818403, journalTitle=科学技术与工程, columnName=综述·自动化技术、计算机技术, runingTitle=null, highlight=null, articleAbstract=
随着深度学习技术的兴起,基于深度学习的语音增强方法日益广泛应用,性能普遍优于传统方法。概述语音增强中降噪信号处理的基本框架,逐步分析深度学习驱动的语音增强模型的最新进展。对基于深度学习的语音增强算法进行全面整理,详细介绍不同神经网络的语音增强方法的原理、特点、评价指标及代表性研究,综合评估这些方法的优势与不足。最后,结合当前发展状况,分析语音增强过程中面临的核心挑战,并对未来发展路径进行讨论与预测。
, correspAuthors=冯嘉琪, authorNote=null, correspAuthorsNote=
, copyrightStatement=null, copyrightOwner=null, extLink=null, articleAbsUrl=null, sourceXml=ab2VFPycpY9t/sAQ7l7BYA==, magXml=ajtMW4aeiXU7f0gkbqgYJQ==, pdfUrl=null, pdf=ALQSwsrhpc1+ndk16PhZ4g==, pdfFileSize=7702740, pdfExtLink=null, richHtmlUrl=null, mobilePdfUrl=null, reviewReport=null, pdfFirstPage=null, abstractGraph=WTiZhWcZGiyd31Wj0OtHFA==, abstractGraphContent=null, abstractVideo=null, citation=null, cebUrl=null, magXmlContent=/mn7b5ror+SdjYrv/40Yww==, mapNumber=null, authorCompany=null, fund=null, authors=
, authorsList=王华朋, 冯嘉琪)}, authors=[Author(id=1261377027215601807, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, orderNo=0, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=huapeng.wang@hotmail.com, emailSecond=null, emailThird=null, correspondingAuthor=0, authorType=1, ext={EN=AuthorExt(id=1261377027794415764, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, authorId=1261377027215601807, language=EN, stringName=Hua-peng WANG, firstName=Hua-peng, middleName=null, lastName=WANG, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=College of Public Security Information Technology and Intelligence, Criminal Investigation Police University of China, Shenyang 110854, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1261377030080311457, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, authorId=1261377027215601807, language=CN, stringName=王华朋, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=null, address=中国刑事警察学院公安信息技术与情报学院, 沈阳 110854, bio={"content":"
王华朋(1979—),男,汉族,山东菏泽人,博士,教授。研究方向:说话人识别、深度学习、人工智能。E-mail:huapeng.wang@hotmail.com。
"}, bioImg=null, bioContent=
王华朋(1979—),男,汉族,山东菏泽人,博士,教授。研究方向:说话人识别、深度学习、人工智能。E-mail:huapeng.wang@hotmail.com。
, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1261377026125082750, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, xref=null, ext=[AuthorCompanyExt(id=1261377026376740993, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, companyId=1261377026125082750, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Public Security Information Technology and Intelligence, Criminal Investigation Police University of China, Shenyang 110854, China), AuthorCompanyExt(id=1261377026410295426, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, companyId=1261377026125082750, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国刑事警察学院公安信息技术与情报学院, 沈阳 110854)])]), Author(id=1261377030722039978, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, orderNo=1, firstName=null, middleName=null, lastName=null, nameCn=null, orcid=null, stid=null, country=null, authorPic=null, dead=0, email=18240668287@163.com, emailSecond=null, emailThird=null, correspondingAuthor=1, authorType=1, ext={EN=AuthorExt(id=1261377032009691319, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, authorId=1261377030722039978, language=EN, stringName=Jia-qi FENG, firstName=Jia-qi, middleName=null, lastName=FENG, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
*, address=College of Public Security Information Technology and Intelligence, Criminal Investigation Police University of China, Shenyang 110854, China, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null), CN=AuthorExt(id=1261377032403955901, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, authorId=1261377030722039978, language=CN, stringName=冯嘉琪, firstName=null, middleName=null, lastName=null, prefix=null, suffix=null, authorComment=null, nameInitials=null, affiliation=null, department=null, xref=
*, address=中国刑事警察学院公安信息技术与情报学院, 沈阳 110854, bio=null, bioImg=null, bioContent=null, aboutCorrespAuthor=null)}, companyList=[AuthorCompany(id=1261377026125082750, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, xref=null, ext=[AuthorCompanyExt(id=1261377026376740993, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, companyId=1261377026125082750, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Public Security Information Technology and Intelligence, Criminal Investigation Police University of China, Shenyang 110854, China), AuthorCompanyExt(id=1261377026410295426, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, companyId=1261377026125082750, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国刑事警察学院公安信息技术与情报学院, 沈阳 110854)])])], keywords=[Keyword(id=1261377035469992143, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, orderNo=1, keyword=speech enhancement), Keyword(id=1261377036107526357, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, orderNo=2, keyword=deep learning), Keyword(id=1261377037281931490, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, orderNo=3, keyword=speech denoising), Keyword(id=1261377039429415145, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, orderNo=4, keyword=neural network), Keyword(id=1261377041220382968, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, orderNo=1, keyword=语音增强), Keyword(id=1261377043720188165, tenantId=1146029695717560320, journalId=1146123166801305609, 
articleId=1261262688311755499, language=CN, orderNo=2, keyword=深度学习), Keyword(id=1261377044848455951, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, orderNo=3, keyword=语音降噪), Keyword(id=1261377045813145884, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, orderNo=4, keyword=神经网络)], refs=[Reference(id=1261377092982288923, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2006, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[1], rfOrder=0, authorNames=Benesty J, Makino S, Chen J, journalName=Speech enhancement, refType=null, unstructuredReference=
Benesty J,
Makino S,
Chen J.
Speech enhancement[M]. Berlin: Springer,
2006., articleTitle=null, refAbstract=null), Reference(id=1261377093431079459, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=24, issue=33, pageStart=14278, pageEnd=14286, url=null, language=null, rfNumber=[2], rfOrder=1, authorNames=潘卫军, 王梓璇, 蒋培元, journalName=科学技术与工程, refType=null, unstructuredReference=潘卫军, 王梓璇, 蒋培元,
等. 面向管制语音识别系统的性能评价方法[J].
科学技术与工程,
2024,
24(33): 14278-14286., articleTitle=面向管制语音识别系统的性能评价方法, refAbstract=null), Reference(id=1261377093737263660, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=24, issue=33, pageStart=14278, pageEnd=14286, url=null, language=null, rfNumber=[2], rfOrder=2, authorNames=Pan Weijun, Wang Zixuan, Jiang Peiyuan, journalName=Science Technology and Engineering, refType=null, unstructuredReference=
Pan Weijun,
Wang Zixuan,
Jiang Peiyuan,
et al. Performance evaluation methods for ATC speech recognition systems[J].
Science Technology and Engineering,
2024,
24(33): 14278-14286., articleTitle=Performance evaluation methods for ATC speech recognition systems, refAbstract=null), Reference(id=1261377094072807988, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=1979, volume=27, issue=2, pageStart=113, pageEnd=120, url=null, language=null, rfNumber=[3], rfOrder=3, authorNames=Boll S, journalName=IEEE Transactions on Acoustics, Speech, and Signal Processing, refType=null, unstructuredReference=
Boll S. Suppression of acoustic noise in speech using spectral subtraction[J].
IEEE Transactions on Acoustics, Speech, and Signal Processing,
1979,
27(2): 113-120., articleTitle=Suppression of acoustic noise in speech using spectral subtraction, refAbstract=null), Reference(id=1261377094320271931, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=1996, volume=35, issue=20, pageStart=3930, pageEnd=3936, url=null, language=null, rfNumber=[4], rfOrder=4, authorNames=Zalevsky Z, Mendlovic D, journalName=Applied Optics, refType=null, unstructuredReference=
Zalevsky Z,
Mendlovic D. Fractional wiener filter[J].
Applied Optics,
1996,
35(20): 3930-3936., articleTitle=Fractional wiener filter, refAbstract=null), Reference(id=1261377094529987138, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=1985, volume=33, issue=2, pageStart=443, pageEnd=445, url=null, language=null, rfNumber=[5], rfOrder=5, authorNames=Ephraim Y, Malah D, journalName=IEEE Transactions on Acoustics, Speech, and Signal Processing, refType=null, unstructuredReference=
Ephraim Y,
Malah D. Speech enhancement using a minimum mean-square error log-spectral amplitude estimator[J].
IEEE Transactions on Acoustics, Speech, and Signal Processing,
1985,
33(2): 443-445., articleTitle=Speech enhancement using a minimum mean-square error log-spectral amplitude estimator, refAbstract=null), Reference(id=1261377094710342214, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2001, volume=null, issue=null, pageStart=665, pageEnd=668, url=null, language=null, rfNumber=[6], rfOrder=6, authorNames=Chennoukh S, Gerrits A, Miet G, journalName=IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings, refType=null, unstructuredReference=
Chennoukh S,
Gerrits A,
Miet G,
et al. Speech enhancement via frequency bandwidth extension using line spectral frequencies[C]//
IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings. Salt Lake City: IEEE,
2001: 665-668., articleTitle=Speech enhancement via frequency bandwidth extension using line spectral frequencies, refAbstract=null), Reference(id=1261377096392258123, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=24, issue=27, pageStart=11763, pageEnd=11773, url=null, language=null, rfNumber=[7], rfOrder=7, authorNames=万玫汐, 王华朋, 闫道申, journalName=科学技术与工程, refType=null, unstructuredReference=万玫汐, 王华朋, 闫道申,
等. 基于改进ecapa-tdnn的法庭自动说话人识别[J].
科学技术与工程,
2024,
24(27): 11763-11773., articleTitle=基于改进ecapa-tdnn的法庭自动说话人识别, refAbstract=null), Reference(id=1261377096610361938, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=24, issue=27, pageStart=11763, pageEnd=11773, url=null, language=null, rfNumber=[7], rfOrder=8, authorNames=Wan Meixi, Wang Huapeng, Yan Daoshen, journalName=Science Technology and Engineering, refType=null, unstructuredReference=
Wan Meixi,
Wang Huapeng,
Yan Daoshen,
et al. Forensic automatic speaker recognition based on enhanced ECAPA-TDNN[J].
Science Technology and Engineering,
2024,
24(27): 11763-11773., articleTitle=Forensic automatic speaker recognition based on enhanced ECAPA-TDNN, refAbstract=null), Reference(id=1261377097528914520, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=851, pageEnd=855, url=null, language=null, rfNumber=[8], rfOrder=9, authorNames=Takeuchi D, Yatabe K, Koizumi Y, journalName=ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Takeuchi D,
Yatabe K,
Koizumi Y,
et al. Real-Time speech enhancement using equilibriated RNN[C]//
ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Barcelona: IEEE,
2020: 851-855., articleTitle=Real-Time speech enhancement using equilibriated RNN, refAbstract=null), Reference(id=1261377097960927841, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2015, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[9], rfOrder=10, authorNames=Weninger F, Erdogan H, Watanabe S, journalName=Speech enhancement with LSTM recurrent neural networks and its application to noise-robust ASR, refType=null, unstructuredReference=
Weninger F,
Erdogan H,
Watanabe S,
et al.
Speech enhancement with LSTM recurrent neural networks and its application to noise-robust ASR[M]. Cham: Springer International Publishing,
2015., articleTitle=null, refAbstract=null), Reference(id=1261377098195808874, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[10], rfOrder=11, authorNames=Pascual S, Bonafonte A, Serrà J, journalName=arXiv Preprint, refType=null, unstructuredReference=
Pascual S,
Bonafonte A,
Serrà J. SEGAN: speech enhancement generative adversarial network[J].
arXiv Preprint, 2017: arXiv: 1703.09452., articleTitle=SEGAN: speech enhancement generative adversarial network, refAbstract=null), Reference(id=1261377098518770287, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[11], rfOrder=12, authorNames=Meng Z, Jinyu L, Gong Y, journalName=arXiv Preprint, refType=null, unstructuredReference=
Meng Z,
Jinyu L,
Gong Y,
et al. Cycle-Consistent speech enhancement[J].
arXiv Preprint, arXiv: 1809.02253., articleTitle=Cycle-Consistent speech enhancement, refAbstract=null), Reference(id=1261377098791400056, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2021, volume=null, issue=null, pageStart=7098, pageEnd=7102, url=null, language=null, rfNumber=[12], rfOrder=13, authorNames=Wang K, He B, Zhu W P, journalName=ICASSP 2021—2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Wang K,
He B,
Zhu W P. TSTNN: two-stage transformer based neural network for speech enhancement in the time domain[C]//
ICASSP 2021—2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Toronto: IEEE,
2021: 7098-7102., articleTitle=TSTNN: two-stage transformer based neural network for speech enhancement in the time domain, refAbstract=null), Reference(id=1261377099189858943, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2014, volume=null, issue=null, pageStart=91, pageEnd=96, url=null, language=null, rfNumber=[13], rfOrder=14, authorNames=Rakshit H, Ullah M A, journalName=null, refType=null, unstructuredReference=
Rakshit H,
Ullah M A. A comparative study on window functions for designing efficient FIR filter[C]//9th International Forum on Strategic Technology (IFOST). Cox’s Bazar: IEEE,
2014: 91-96., articleTitle=A comparative study on window functions for designing efficient FIR filter, refAbstract=null), Reference(id=1261377100867580551, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[14], rfOrder=15, authorNames=Ren M, Liao R, Urtasun R, journalName=arXiv Preprint, refType=null, unstructuredReference=
Ren M,
Liao R,
Urtasun R,
et al. Normalizing the normalizers: comparing and extending network normalization schemes[J].
arXiv Preprint, 2017: arXiv: 1611.04520., articleTitle=Normalizing the normalizers: comparing and extending network normalization schemes, refAbstract=null), Reference(id=1261377101018575502, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2014, volume=null, issue=null, pageStart=2843, pageEnd=2846, url=null, language=null, rfNumber=[15], rfOrder=16, authorNames=Kang T G, Kwon K, Shin J W, journalName=Interspeech, refType=null, unstructuredReference=
Kang T G,
Kwon K,
Shin J W,
et al. NMF-based speech enhancement incorporating deep neural network[C]//
Interspeech. New York: ISCA,
2014: 2843-2846., articleTitle=NMF-based speech enhancement incorporating deep neural network, refAbstract=null), Reference(id=1261377101194736276, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2014, volume=null, issue=null, pageStart=1, pageEnd=5, url=null, language=null, rfNumber=[16], rfOrder=17, authorNames=Chauhan P M, Desai N P, journalName=International Conference on Green Computing Communication and Electrical Engineering (ICGCCEE), refType=null, unstructuredReference=
Chauhan P M,
Desai N P. Mel frequency cepstral coefficients (MFCC) based speaker identification in noisy environment using wiener filter[C]//
International Conference on Green Computing Communication and Electrical Engineering (ICGCCEE). Coimbatore: IEEE,
2014: 1-5., articleTitle=Mel frequency cepstral coefficients (MFCC) based speaker identification in noisy environment using wiener filter, refAbstract=null), Reference(id=1261377101387674264, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2007, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[17], rfOrder=18, authorNames=Philipos C L, journalName=Speech enhancement: theory and practice, refType=null, unstructuredReference=
Philipos C L.
Speech enhancement: theory and practice[M]. 1st ed. Boca Raton: CRC Press,
2007., articleTitle=null, refAbstract=null), Reference(id=1261377101689664157, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2008, volume=null, issue=null, pageStart=1589, pageEnd=1592, url=null, language=null, rfNumber=[18], rfOrder=19, authorNames=Yang S, Wang D L, journalName=2008 IEEE International Conference on Acoustics, Speech and Signal Processing. Las Vegas, refType=null, unstructuredReference=
Yang S,
Wang D L. Robust speaker identification using auditory features and computational auditory scene analysis[C]//
2008 IEEE International Conference on Acoustics, Speech and Signal Processing. Las Vegas, NV: IEEE,
2008: 1589-1592., articleTitle=Robust speaker identification using auditory features and computational auditory scene analysis, refAbstract=null), Reference(id=1261377101924545189, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2012, volume=29, issue=6, pageStart=82, pageEnd=97, url=null, language=null, rfNumber=[19], rfOrder=20, authorNames=Hinton G, Deng L, Yu D, journalName=IEEE Signal Processing Magazine, refType=null, unstructuredReference=
Hinton G,
Deng L,
Yu D,
et al. Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups[J].
IEEE Signal Processing Magazine,
2012,
29(6): 82-97., articleTitle=Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups, refAbstract=null), Reference(id=1261377102083928748, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[20], rfOrder=21, authorNames=Melve O K, journalName=Speech enhancement with deep neural networks, refType=null, unstructuredReference=
Melve O K.
Speech enhancement with deep neural networks[D]. Oslo: Norwegian University of Science and Technology,
2016., articleTitle=null, refAbstract=null), Reference(id=1261377102201369262, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2019, volume=13, issue=2, pageStart=206, pageEnd=219, url=null, language=null, rfNumber=[21], rfOrder=22, authorNames=Purwins H, Li B, Virtanen T, journalName=IEEE Journal of Selected Topics in Signal Processing, refType=null, unstructuredReference=
Purwins H,
Li B,
Virtanen T,
et al. Deep learning for audio signal processing[J].
IEEE Journal of Selected Topics in Signal Processing,
2019,
13(2): 206-219., articleTitle=Deep learning for audio signal processing, refAbstract=null), Reference(id=1261377102356558516, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[22], rfOrder=23, authorNames=Neekhara P, Hussain S, Pandey P, journalName=arXiv Preprint, refType=null, unstructuredReference=
Neekhara P,
Hussain S,
Pandey P,
et al. Universal adversarial perturbations for speech recognition systems[J].
arXiv Preprint, 2019: arXiv: 1905.03828., articleTitle=Universal adversarial perturbations for speech recognition systems, refAbstract=null), Reference(id=1261377102507553465, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2023, volume=31, issue=null, pageStart=1360, pageEnd=1370, url=null, language=null, rfNumber=[23], rfOrder=24, authorNames=Pandey A, Wang D, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Pandey A,
Wang D. Attentive training: a new training framework for speech enhancement[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2023,
31: 1360-1370., articleTitle=Attentive training: a new training framework for speech enhancement, refAbstract=null), Reference(id=1261377102624993981, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[24], rfOrder=25, authorNames=Gulati A, Qin J, Chiu C C, journalName=arXiv Preprint, refType=null, unstructuredReference=
Gulati A,
Qin J,
Chiu C C,
et al. Conformer: convolution-augmented transformer for speech recognition[J].
arXiv Preprint, 2020: DOI:
10.48550/arXiv.2005.08100., articleTitle=Conformer: convolution-augmented transformer for speech recognition, refAbstract=null), Reference(id=1261377102734045888, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[25], rfOrder=26, authorNames=Tagliasacchi M, Li Y, Misiunas K, journalName=arXiv Preprint, refType=null, unstructuredReference=
Tagliasacchi M,
Li Y,
Misiunas K,
et al. SEANet: a multi-modal speech enhancement network[J].
arXiv Preprint, 2020: DOI:
10.48550/arXiv.2009.02095., articleTitle=SEANet: a multi-modal speech enhancement network, refAbstract=null), Reference(id=1261377102947955395, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[26], rfOrder=27, authorNames=Chen W, Xing X, Xu X, journalName=arXiv Preprint, refType=null, unstructuredReference=
Chen W,
Xing X,
Xu X,
et al. SpeechFormer: a hierarchical efficient framework incorporating the characteristics of speech[J].
arXiv Preprint, 2022: DOI:
10.48550/arXiv.2203.03812., articleTitle=SpeechFormer: a hierarchical efficient framework incorporating the characteristics of speech, refAbstract=null), Reference(id=1261377103153476297, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=2, pageStart=406, pageEnd=416, url=null, language=null, rfNumber=[27], rfOrder=28, authorNames=张天骐, 罗庆予, 张慧芝, journalName=信号处理, refType=null, unstructuredReference=张天骐, 罗庆予, 张慧芝,
等. 复谱映射下融合高效Transformer的语音增强方法[J].
信号处理,
2024(2): 406-416., articleTitle=复谱映射下融合高效Transformer的语音增强方法, refAbstract=null), Reference(id=1261377103308665549, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=2, pageStart=406, pageEnd=416, url=null, language=null, rfNumber=[27], rfOrder=29, authorNames=Zhang Tianqi, Luo Qingyu, Zhang Huizhi, journalName=Journal of Signal Processing, refType=null, unstructuredReference=
Zhang Tianqi,
Luo Qingyu,
Zhang Huizhi,
et al. Speech enhancement method based on complex spectrum mapping with efficient Transformer[J].
Journal of Signal Processing,
2024(2): 406-416., articleTitle=Speech enhancement method based on complex spectrum mapping with efficient Transformer, refAbstract=null), Reference(id=1261377103522575059, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=5409, pageEnd=5413, url=null, language=null, rfNumber=[28], rfOrder=30, authorNames=Odelowo B O, Anderson D V, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, refType=null, unstructuredReference=
Odelowo B O,
Anderson D V. A study of training targets for deep neural network-based speech enhancement using noise prediction[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, AB: IEEE,
2018: 5409-5413., articleTitle=A study of training targets for deep neural network-based speech enhancement using noise prediction, refAbstract=null), Reference(id=1261377103619044055, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=1, pageEnd=8, url=null, language=null, rfNumber=[29], rfOrder=31, authorNames=Nossier S A, Wall J, Moniri M, journalName=International Joint Conference on Neural Networks (IJCNN), refType=null, unstructuredReference=
Nossier S A,
Wall J,
Moniri M,
et al. Mapping and masking targets comparison using different deep learning based speech enhancement architectures[C]//
International Joint Conference on Neural Networks (IJCNN). Glasgow: IEEE,
2020: 1-8., articleTitle=Mapping and masking targets comparison using different deep learning based speech enhancement architectures, refAbstract=null), Reference(id=1261377104852169435, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[30], rfOrder=32, authorNames=Germain F G, Chen Q, Koltun V, journalName=arXiv Preprint, refType=null, unstructuredReference=
Germain F G,
Chen Q,
Koltun V. Speech denoising with deep feature losses[J].
arXiv Preprint, 2018: arXiv: 1806.10522., articleTitle=Speech denoising with deep feature losses, refAbstract=null), Reference(id=1261377104969609951, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[31], rfOrder=33, authorNames=Rethage D, Pons J, Serra X, journalName=arXiv Preprint, refType=null, unstructuredReference=
Rethage D,
Pons J,
Serra X. A wavenet for speech denoising[J].
arXiv Preprint, 2018: arXiv: 1706.07162., articleTitle=A wavenet for speech denoising, refAbstract=null), Reference(id=1261377105162547939, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2020, volume=27, issue=null, pageStart=1700, pageEnd=1704, url=null, language=null, rfNumber=[32], rfOrder=34, authorNames=Phan H, McLoughlin I V, Pham L, journalName=IEEE Signal Processing Letters, refType=null, unstructuredReference=
Phan H,
McLoughlin I V,
Pham L,
et al. Improving GANs for speech enhancement[J].
IEEE Signal Processing Letters,
2020,
27: 1700-1704., articleTitle=Improving GANs for speech enhancement, refAbstract=null), Reference(id=1261377105330320103, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2016, volume=24, issue=5, pageStart=967, pageEnd=977, url=null, language=null, rfNumber=[33], rfOrder=35, authorNames=Zhang X L, Wang D, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Zhang X L,
Wang D. A deep ensemble learning method for monaural speech separation[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2016,
24(5): 967-977., articleTitle=A deep ensemble learning method for monaural speech separation, refAbstract=null), Reference(id=1261377105716196075, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=1947, volume=44, issue=2, pageStart=105, pageEnd=129, url=null, language=null, rfNumber=[34], rfOrder=36, authorNames=Miller G A, journalName=Psychological Bulletin, refType=null, unstructuredReference=
Miller G A. The masking of speech[J].
Psychological Bulletin,
1947,
44(2): 105-129., articleTitle=The masking of speech, refAbstract=null), Reference(id=1261377105867191024, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2014, volume=22, issue=12, pageStart=1849, pageEnd=1858, url=null, language=null, rfNumber=[35], rfOrder=37, authorNames=Wang Y X, Narayanan A, Wang D L, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Wang Y X,
Narayanan A,
Wang D L. On training targets for supervised speech separation[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2014,
22(12): 1849-1858., articleTitle=On training targets for supervised speech separation, refAbstract=null), Reference(id=1261377106043351798, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2005, volume=null, issue=null, pageStart=181, pageEnd=null, url=null, language=null, rfNumber=[36], rfOrder=38, authorNames=Wang D, journalName=On ideal binary mask as the computational goal of auditory scene analysis, refType=null, unstructuredReference=
Wang D.
On ideal binary mask as the computational goal of auditory scene analysis[M]. Boston: Kluwer Academic Publishers,
2005: 181-197., articleTitle=null, refAbstract=null), Reference(id=1261377106127237881, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2015, volume=null, issue=null, pageStart=708, pageEnd=712, url=null, language=null, rfNumber=[37], rfOrder=39, authorNames=Erdogan H, Hershey J R, Watanabe S, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Erdogan H,
Hershey J R,
Watanabe S,
et al. Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). South Brisbane: IEEE,
2015: 708-712., articleTitle=Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks, refAbstract=null), Reference(id=1261377106261455614, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2016, volume=24, issue=3, pageStart=483, pageEnd=null, url=null, language=null, rfNumber=[38], rfOrder=40, authorNames=Williamson D S, Wang Y, Wang D, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Williamson D S,
Wang Y,
Wang D. Complex ratio masking for monaural speech separation[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2016,
24(3): 483., articleTitle=Complex ratio masking for monaural speech separation, refAbstract=null), Reference(id=1261377106374701826, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=26, issue=10, pageStart=1702, pageEnd=1726, url=null, language=null, rfNumber=[39], rfOrder=41, authorNames=Wang D, Chen J, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Wang D,
Chen J. Supervised speech separation based on deep learning: an overview[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2018,
26(10): 1702-1726., articleTitle=Supervised speech separation based on deep learning: an overview, refAbstract=null), Reference(id=1261377106550862598, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=5049, pageEnd=5052, url=null, language=null, rfNumber=[40], rfOrder=42, authorNames=Karjol P, Ajay Kumar M, Ghosh P K, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, refType=null, unstructuredReference=
Karjol P,
Ajay Kumar M,
Ghosh P K. Speech enhancement using multiple deep neural networks[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, AB: IEEE,
2018: 5049-5052., articleTitle=Speech enhancement using multiple deep neural networks, refAbstract=null), Reference(id=1261377106735411979, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=5074, pageEnd=5078, url=null, language=null, rfNumber=[41], rfOrder=43, authorNames=Zhao Y, Xu B, Giri R, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, refType=null, unstructuredReference=
Zhao Y,
Xu B,
Giri R,
et al. Perceptually guided speech enhancement using deep neural networks[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, AB: IEEE,
2018: 5074-5078., articleTitle=Perceptually guided speech enhancement using deep neural networks, refAbstract=null), Reference(id=1261377106840269584, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[42], rfOrder=44, authorNames=Kawanaka M, Koizumi Y, Miyazaki R, journalName=arXiv Preprint, refType=null, unstructuredReference=
Kawanaka M,
Koizumi Y,
Miyazaki R,
et al. Stable training of DNN for speech enhancement based on perceptually-motivated black-box cost function[J].
arXiv Preprint, 2020: arXiv:2002.05879., articleTitle=Stable training of DNN for speech enhancement based on perceptually-motivated black-box cost function, refAbstract=null), Reference(id=1261377106928349972, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[43], rfOrder=45, authorNames=Fujimura T, Koizumi Y, Yatabe K, journalName=arXiv Preprint, refType=null, unstructuredReference=
Fujimura T,
Koizumi Y,
Yatabe K,
et al. Noisy-target training: a training strategy for DNN-based speech enhancement without clean speech[J].
arXiv Preprint, 2021: arXiv: 2101.08625., articleTitle=Noisy-target training: a training strategy for DNN-based speech enhancement without clean speech, refAbstract=null), Reference(id=1261377107049984792, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[44], rfOrder=46, authorNames=Furnon N, Serizel R, Illina I, journalName=arXiv Preprint, refType=null, unstructuredReference=
Furnon N,
Serizel R,
Illina I,
et al. DNN-based mask estimation for distributed speech enhancement in spatially unconstrained microphone arrays[J].
arXiv Preprint, 2020: arXiv: 2011.01714., articleTitle=DNN-based mask estimation for distributed speech enhancement in spatially unconstrained microphone arrays, refAbstract=null), Reference(id=1261377107289060125, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[45], rfOrder=47, authorNames=Xu Y, Du J, Huang Z, journalName=arXiv Preprint, refType=null, unstructuredReference=
Xu Y,
Du J,
Huang Z,
et al. Multi-objective learning and mask-based post-processing for deep neural network based speech enhancement[J].
arXiv Preprint, 2017: DOI:
10.48550/arXiv.1703.07172., articleTitle=Multi-objective learning and mask-based post-processing for deep neural network based speech enhancement, refAbstract=null), Reference(id=1261377107414889248, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2015, volume=null, issue=null, pageStart=5421, pageEnd=5425, url=null, language=null, rfNumber=[46], rfOrder=48, authorNames=Arisoy E, Sethy A, Ramabhadran B, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Arisoy E,
Sethy A,
Ramabhadran B,
et al. Bidirectional recurrent neural network language models for automatic speech recognition[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). South Brisbane: IEEE,
2015: 5421-5425., articleTitle=Bidirectional recurrent neural network language models for automatic speech recognition, refAbstract=null), Reference(id=1261377107519746851, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=3229, pageEnd=3233, url=null, language=null, rfNumber=[47], rfOrder=49, authorNames=Tan K, Wang D, journalName=Interspeech, refType=null, unstructuredReference=
Tan K,
Wang D. A convolutional recurrent neural network for real-time speech enhancement[C]//
Interspeech. Hyderabad: ISCA,
2018: 3229-3233., articleTitle=A convolutional recurrent neural network for real-time speech enhancement, refAbstract=null), Reference(id=1261377107695907623, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=156, issue=null, pageStart=103008, pageEnd=null, url=null, language=null, rfNumber=[48], rfOrder=50, authorNames=Wahab F E, Ye Z, Saleem N, journalName=Speech Communication, refType=null, unstructuredReference=
Wahab F E,
Ye Z,
Saleem N,
et al. Compact deep neural networks for real-time speech enhancement on resource-limited devices[J].
Speech Communication,
2024,
156: 103008., articleTitle=Compact deep neural networks for real-time speech enhancement on resource-limited devices, refAbstract=null), Reference(id=1261377107800765226, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2014, volume=null, issue=null, pageStart=631, pageEnd=635, url=null, language=null, rfNumber=[49], rfOrder=51, authorNames=Geiger J T, Zhang Z, Weninger F, journalName=Interspeech, refType=null, unstructuredReference=
Geiger J T,
Zhang Z,
Weninger F,
et al. Robust speech recognition using long short-term memory recurrent neural networks for hybrid acoustic modelling[C]//
Interspeech. New York: ISCA,
2014: 631-635., articleTitle=Robust speech recognition using long short-term memory recurrent neural networks for hybrid acoustic modelling, refAbstract=null), Reference(id=1261377107939177264, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2015, volume=null, issue=null, pageStart=91, pageEnd=null, url=null, language=null, rfNumber=[50], rfOrder=52, authorNames=Weninger F, Erdogan H, Watanabe S, journalName=Speech enhancement with LSTM recurrent neural networks and its application to noise-robust ASR, refType=null, unstructuredReference=
Weninger F,
Erdogan H,
Watanabe S,
et al.
Speech enhancement with LSTM recurrent neural networks and its application to noise-robust ASR[M]. Cham: Springer International Publishing,
2015: 91-99., articleTitle=null, refAbstract=null), Reference(id=1261377108073394996, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=136, pageEnd=140, url=null, language=null, rfNumber=[51], rfOrder=53, authorNames=Sun L, Du J, Dai L R, journalName=Hands-free Speech Communications and Microphone Arrays (HSCMA). San Francisco, refType=null, unstructuredReference=
Sun L,
Du J,
Dai L R,
et al. Multiple-target deep learning for LSTM-RNN based speech enhancement[C]//
Hands-free Speech Communications and Microphone Arrays (HSCMA). San Francisco, CA: IEEE,
2017: 136-140., articleTitle=Multiple-target deep learning for LSTM-RNN based speech enhancement, refAbstract=null), Reference(id=1261377108236972855, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2021, volume=172, issue=null, pageStart=107647, pageEnd=null, url=null, language=null, rfNumber=[52], rfOrder=54, authorNames=Wang Z, Zhang T, Shao Y, journalName=Applied Acoustics, refType=null, unstructuredReference=
Wang Z,
Zhang T,
Shao Y,
et al. LSTM-convolutional-BLSTM encoder-decoder network for minimum mean-square error approach to speech enhancement[J].
Applied Acoustics,
2021,
172:107647., articleTitle=LSTM-convolutional-BLSTM encoder-decoder network for minimum mean-square error approach to speech enhancement, refAbstract=null), Reference(id=1261377108371190586, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2022, volume=10, issue=null, pageStart=30069, pageEnd=30079, url=null, language=null, rfNumber=[53], rfOrder=55, authorNames=Oruh J, Viriri S, Adegun A, journalName=IEEE Access, refType=null, unstructuredReference=
Oruh J,
Viriri S,
Adegun A. Long short-term memory recurrent neural network for automatic speech recognition[J].
IEEE Access,
2022,
10: 30069-30079., articleTitle=Long short-term memory recurrent neural network for automatic speech recognition, refAbstract=null), Reference(id=1261377108526379841, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2005, volume=18, issue=5/6, pageStart=602, pageEnd=610, url=null, language=null, rfNumber=[54], rfOrder=56, authorNames=Graves A, Schmidhuber J, journalName=Neural Networks, refType=null, unstructuredReference=
Graves A,
Schmidhuber J. Framewise phoneme classification with bidirectional LSTM and other neural network architectures[J].
Neural Networks,
2005,
18(5/6): 602-610., articleTitle=Framewise phoneme classification with bidirectional LSTM and other neural network architectures, refAbstract=null), Reference(id=1261377108715123523, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[55], rfOrder=57, authorNames=Zhang Y, Chen G, Yu D, journalName=arXiv Preprint, refType=null, unstructuredReference=
Zhang Y,
Chen G,
Yu D,
et al. Highway long short-term memory RNNs for distant speech recognition[J].
arXiv Preprint, 2016: arXiv: 1510.08983., articleTitle=Highway long short-term memory RNNs for distant speech recognition, refAbstract=null), Reference(id=1261377108874507081, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2017, volume=null, issue=null, pageStart=5340, pageEnd=5344, url=null, language=null, rfNumber=[56], rfOrder=58, authorNames=Xue S, Yan Z, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). New Orleans, refType=null, unstructuredReference=
Xue S,
Yan Z. Improving latency-controlled BLSTM acoustic models for online speech recognition[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). New Orleans, LA: IEEE,
2017: 5340-5344., articleTitle=Improving latency-controlled BLSTM acoustic models for online speech recognition, refAbstract=null), Reference(id=1261377109004530507, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2019, volume=27, issue=7, pageStart=1179, pageEnd=null, url=null, language=null, rfNumber=[57], rfOrder=59, authorNames=Pandey A, Wang D, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Pandey A,
Wang D. A New framework for CNN-based speech enhancement in the time domain[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2019,
27(7): 1179., articleTitle=A New framework for CNN-based speech enhancement in the time domain, refAbstract=null), Reference(id=1261377109138748242, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2016, volume=null, issue=null, pageStart=3768, pageEnd=3772, url=null, language=null, rfNumber=[58], rfOrder=60, authorNames=Fu S W, Tsao Y, Lu X, journalName=Interspeech 2016, refType=null, unstructuredReference=
Fu S W,
Tsao Y,
Lu X. SNR-aware convolutional neural network modeling for speech enhancement[C]//
Interspeech 2016. New York: ISCA,
2016: 3768-3772., articleTitle=SNR-aware convolutional neural network modeling for speech enhancement, refAbstract=null), Reference(id=1261377109323297624, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=5756, pageEnd=5760, url=null, language=null, rfNumber=[59], rfOrder=61, authorNames=Ouyang Z, Yu H, Zhu W P, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Ouyang Z,
Yu H,
Zhu W P,
et al. A fully convolutional neural network for complex spectrogram processing in speech enhancement[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Brighton: IEEE,
2019: 5756-5760., articleTitle=A fully convolutional neural network for complex spectrogram processing in speech enhancement, refAbstract=null), Reference(id=1261377109482681181, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=6875, pageEnd=6879, url=null, language=null, rfNumber=[60], rfOrder=62, authorNames=Pandey A, Wang D, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Pandey A,
Wang D. Temporal convolutional neural network for real-time speech enhancement in the time domain[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Brighton: IEEE,
2019: 6875-6879., articleTitle=Temporal convolutional neural network for real-time speech enhancement in the time domain, refAbstract=null), Reference(id=1261377109608510305, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=11, pageStart=3383, pageEnd=3389, url=null, language=null, rfNumber=[61], rfOrder=63, authorNames=许春冬, 王磊, 胡菁兰, journalName=计算机工程与设计, refType=null, unstructuredReference=许春冬, 王磊, 胡菁兰,
等. 结合残差与双注意力机制的U-net语音增强方法[J].
计算机工程与设计,
2024(11): 3383-3389., articleTitle=结合残差与双注意力机制的U-net语音增强方法, refAbstract=null), Reference(id=1261377109742728035, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=11, pageStart=3383, pageEnd=3389, url=null, language=null, rfNumber=[61], rfOrder=64, authorNames=Xu Chundong, Wang Lei, Hu Jinglan, journalName=Computer Engineering and Design, refType=null, unstructuredReference=
Xu Chundong,
Wang Lei,
Hu Jinglan,
et al. U-Net speech enhancement method combining residual and dual attention mechanism[J].
Computer Engineering and Design,
2024(11): 3383-3389., articleTitle=U-Net speech enhancement method combining residual and dual attention mechanism, refAbstract=null), Reference(id=1261377109860168553, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2022, volume=22, issue=5, pageStart=1950, pageEnd=1957, url=null, language=null, rfNumber=[62], rfOrder=65, authorNames=徐浩森, 姜囡, 齐志坤, journalName=科学技术与工程, refType=null, unstructuredReference=徐浩森, 姜囡, 齐志坤. 基于注意力机制的卷积循环网络语音降噪[J].
科学技术与工程,
2022,
22(5): 1950-1957., articleTitle=基于注意力机制的卷积循环网络语音降噪, refAbstract=null), Reference(id=1261377109960831853, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2022, volume=22, issue=5, pageStart=1950, pageEnd=1957, url=null, language=null, rfNumber=[62], rfOrder=66, authorNames=Xu Haosen, Jiang Nan, Qi Zhikun, journalName=Science Technology and Engineering, refType=null, unstructuredReference=
Xu Haosen,
Jiang Nan,
Qi Zhikun. Speech denoising based on attention mechanism using convolution loop network[J].
Science Technology and Engineering,
2022,
22(5): 1950-1957., articleTitle=Speech denoising based on attention mechanism using convolution loop network, refAbstract=null), Reference(id=1261377110149575538, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2019, volume=null, issue=null, pageStart=106, pageEnd=110, url=null, language=null, rfNumber=[63], rfOrder=67, authorNames=Baby D, Verhulst S, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Baby D,
Verhulst S. SERGAN: speech enhancement using relativistic generative adversarial networks with gradient penalty[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Brighton: IEEE,
2019: 106-110., articleTitle=SERGAN: speech enhancement using relativistic generative adversarial networks with gradient penalty, refAbstract=null), Reference(id=1261377110342513527, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=5414, pageEnd=5418, url=null, language=null, rfNumber=[64], rfOrder=68, authorNames=Pandey A, Wang D, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, refType=null, unstructuredReference=
Pandey A,
Wang D. On adversarial training and loss functions for speech enhancement[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, AB: IEEE,
2018: 5414-5418., articleTitle=On adversarial training and loss functions for speech enhancement, refAbstract=null), Reference(id=1261377110493508475, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2018, volume=null, issue=null, pageStart=5039, pageEnd=5043, url=null, language=null, rfNumber=[65], rfOrder=69, authorNames=Soni M H, Shah N, Patil H A, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, refType=null, unstructuredReference=
Soni M H,
Shah N,
Patil H A. Time-frequency masking-based speech enhancement using generative adversarial network[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Calgary, AB: IEEE,
2018: 5039-5043., articleTitle=Time-frequency masking-based speech enhancement using generative adversarial network, refAbstract=null), Reference(id=1261377110686446465, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2020, volume=118, issue=null, pageStart=1, pageEnd=9, url=null, language=null, rfNumber=[66], rfOrder=70, authorNames=Yang F, Wang Z, Li J, journalName=Speech Communication, refType=null, unstructuredReference=
Yang F,
Wang Z,
Li J,
et al. Improving generative adversarial networks for speech enhancement through regularization of latent representations[J].
Speech Communication,
2020,
118: 1-9., articleTitle=Improving generative adversarial networks for speech enhancement through regularization of latent representations, refAbstract=null), Reference(id=1261377110858412935, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[67], rfOrder=71, authorNames=Fu S W, Liao C F, Tsao Y, journalName=arXiv Preprint, refType=null, unstructuredReference=
Fu S W,
Liao C F,
Tsao Y,
et al. MetricGAN: generative adversarial networks based black-box metric scores optimization for speech enhancement[J].
arXiv Preprint, 2019: DOI:
10.48550/arXiv.1905.04874., articleTitle=MetricGAN: generative adversarial networks based black-box metric scores optimization for speech enhancement, refAbstract=null), Reference(id=1261377111030379405, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=32, issue=null, pageStart=2477, pageEnd=2493, url=null, language=null, rfNumber=[68], rfOrder=72, authorNames=Abdulatif S, Cao R, Yang B, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Abdulatif S,
Cao R,
Yang B. CMGAN: Conformer-based metric-GAN for monaural speech enhancement[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2024,
32: 2477-2493., articleTitle=CMGAN: Conformer-based metric-GAN for monaural speech enhancement, refAbstract=null), Reference(id=1261377111131042705, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=32, issue=null, pageStart=2418, pageEnd=2431, url=null, language=null, rfNumber=[69], rfOrder=73, authorNames=Borgström B J, Brandstein M S, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Borgström B J,
Brandstein M S. A multiscale autoencoder (MSAE) framework for end-to-end neural network speech enhancement[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2024,
32: 2418-2431., articleTitle=A multiscale autoencoder (MSAE) framework for end-to-end neural network speech enhancement, refAbstract=null), Reference(id=1261377111290426262, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2020, volume=null, issue=null, pageStart=6649, pageEnd=6653, url=null, language=null, rfNumber=[70], rfOrder=74, authorNames=Kim J, El-Khamy M, Lee J, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), refType=null, unstructuredReference=
Kim J,
El-Khamy M,
Lee J. T-GSA: Transformer with gaussian-weighted self-attention for speech enhancement[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Barcelona: IEEE,
2020: 6649-6653., articleTitle=T-GSA: Transformer with gaussian-weighted self-attention for speech enhancement, refAbstract=null), Reference(id=1261377111437226906, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=null, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[71], rfOrder=75, authorNames=Yu G, Li A, Zheng C, journalName=arXiv Preprint, refType=null, unstructuredReference=
Yu G,
Li A,
Zheng C,
et al. Dual-branch attention-in-attention Transformer for single-channel speech enhancement[J].
arXiv Preprint, 2022: arXiv: 2110.06467., articleTitle=Dual-branch attention-in-attention Transformer for single-channel speech enhancement, refAbstract=null), Reference(id=1261377111592416158, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2022, volume=14, issue=3, pageStart=1152, pageEnd=1158, url=null, language=null, rfNumber=[72], rfOrder=76, authorNames=Yu W, Zhou J, Wang H, journalName=Cognitive Computation, refType=null, unstructuredReference=
Yu W,
Zhou J,
Wang H,
et al. SETransformer: speech enhancement transformer[J].
Cognitive Computation,
2022,
14(3): 1152-1158., articleTitle=SETransformer: speech enhancement transformer, refAbstract=null), Reference(id=1261377111760188320, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2023, volume=11, issue=null, pageStart=66979, pageEnd=66994, url=null, language=null, rfNumber=[73], rfOrder=77, authorNames=Saleem N, Gunawan T S, Kartiwi M, journalName=IEEE Access, refType=null, unstructuredReference=
Saleem N,
Gunawan T S,
Kartiwi M,
et al. NSE-CATNet: deep neural speech enhancement using convolutional attention Transformer network[J].
IEEE Access,
2023,
11: 66979-66994., articleTitle=NSE-CATNet: deep neural speech enhancement using convolutional attention Transformer network, refAbstract=null), Reference(id=1261377111919571875, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2022, volume=30, issue=null, pageStart=2629, pageEnd=2644, url=null, language=null, rfNumber=[74], rfOrder=78, authorNames=Yu G, Li A, Wang H, journalName=IEEE/ACM Transactions on Audio, Speech, and Language Processing, refType=null, unstructuredReference=
Yu G,
Li A,
Wang H,
et al. DBT-Net: dual-branch federative magnitude and phase estimation with attention-in-attention Transformer for monaural speech enhancement[J].
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
2022,
30: 2629-2644., articleTitle=DBT-Net: dual-branch federative magnitude and phase estimation with attention-in-attention Transformer for monaural speech enhancement, refAbstract=null), Reference(id=1261377112062178214, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=224, issue=null, pageStart=110131, pageEnd=null, url=null, language=null, rfNumber=[75], rfOrder=79, authorNames=Saleem N, Bourouis S, Elmannai H, journalName=Applied Acoustics, refType=null, unstructuredReference=
Saleem N,
Bourouis S,
Elmannai H,
et al. DPHT-ANet: dual-path high-order transformer-style fully attentional network for monaural speech enhancement[J].
Applied Acoustics,
2024,
224: 110131., articleTitle=DPHT-ANet: dual-path high-order transformer-style fully attentional network for monaural speech enhancement, refAbstract=null), Reference(id=1261377112229950376, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=11, pageStart=15, pageEnd=26, url=null, language=null, rfNumber=[76], rfOrder=80, authorNames=解元, 邹涛, 孙为军, journalName=通信学报, refType=null, unstructuredReference=解元, 邹涛, 孙为军,
等. 基于混合混响模型的多通道语音增强算法[J].
通信学报,
2024(11): 15-26., articleTitle=基于混合混响模型的多通道语音增强算法, refAbstract=null), Reference(id=1261377112347390891, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=11, pageStart=15, pageEnd=26, url=null, language=null, rfNumber=[76], rfOrder=81, authorNames=Xie Yuan, Zou Tao, Sun Weijun, journalName=Journal on Communications, refType=null, unstructuredReference=
Xie Yuan,
Zou Tao,
Sun Weijun,
et al. Multichannel speech enhancement algorithm based on hybrid reverberation model[J].
Journal on Communications,
2024(11): 15-26., articleTitle=Multichannel speech enhancement algorithm based on hybrid reverberation model, refAbstract=null), Reference(id=1261377112464831406, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=8, pageStart=326, pageEnd=335, url=null, language=null, rfNumber=[77], rfOrder=82, authorNames=芮小博, 孔欣玥, 伍洲, journalName=仪器仪表学报, refType=null, unstructuredReference=芮小博, 孔欣玥, 伍洲,
等. 基于谱特征自适应估计的激光相干语音探测信号增强方法[J].
仪器仪表学报,
2024(8): 326-335., articleTitle=基于谱特征自适应估计的激光相干语音探测信号增强方法, refAbstract=null), Reference(id=1261377112594854832, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2024, volume=null, issue=8, pageStart=326, pageEnd=335, url=null, language=null, rfNumber=[77], rfOrder=83, authorNames=Rui Xiaobo, Kong Xinyue, Wu Zhou, journalName=Chinese Journal of Scientific Instrument, refType=null, unstructuredReference=
Rui Xiaobo,
Kong Xinyue,
Wu Zhou,
et al. Enhancement of speech detected by laser coherent detection method based on spectral feature adaptation[J].
Chinese Journal of Scientific Instrument,
2024(8): 326-335., articleTitle=Enhancement of speech detected by laser coherent detection method based on spectral feature adaptation, refAbstract=null), Reference(id=1261377112766821298, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2001, volume=null, issue=null, pageStart=749, pageEnd=752, url=null, language=null, rfNumber=[78], rfOrder=84, authorNames=Rix A W, Beerends J G, Hollier M P, journalName=IEEE International Conference on Acoustics, Speech, and Signal Processing. Salt Lake City, refType=null, unstructuredReference=
Rix A W,
Beerends J G,
Hollier M P,
et al. Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs[C]//
IEEE International Conference on Acoustics, Speech, and Signal Processing. Salt Lake City, UT: IEEE,
2001: 749-752., articleTitle=Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs, refAbstract=null), Reference(id=1261377112871678899, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2010, volume=null, issue=null, pageStart=4214, pageEnd=4217, url=null, language=null, rfNumber=[79], rfOrder=85, authorNames=Taal C, Hendriks R, Heusdens R, journalName=IEEE International Conference on Acoustics, Speech and Signal Processing, refType=null, unstructuredReference=
Taal C,
Hendriks R,
Heusdens R,
et al. A short-time objective intelligibility measure for time-frequency weighted noisy speech[C]//
IEEE International Conference on Acoustics, Speech and Signal Processing. New York: IEEE,
2010: 4214-4217., articleTitle=A short-time objective intelligibility measure for time-frequency weighted noisy speech, refAbstract=null), Reference(id=1261377113031062452, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2006, volume=14, issue=4, pageStart=1462, pageEnd=1469, url=null, language=null, rfNumber=[80], rfOrder=86, authorNames=Vincent E, Gribonval R, Fevotte C, journalName=IEEE Transactions on Audio, Speech and Language Processing, refType=null, unstructuredReference=
Vincent E,
Gribonval R,
Fevotte C. Performance measurement in blind audio source separation[J].
IEEE Transactions on Audio, Speech and Language Processing,
2006,
14(4): 1462-1469., articleTitle=Performance measurement in blind audio source separation, refAbstract=null), Reference(id=1261377113215611829, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=2011, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[81], rfOrder=87, authorNames=Loizou P C, journalName=Speech quality assessment, refType=null, unstructuredReference=
Loizou P C.
Speech quality assessment[M]. Berlin: Springer,
2011., articleTitle=null, refAbstract=null), Reference(id=1261377113286914998, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, doi=null, pmid=null, pmcid=null, year=1988, volume=null, issue=null, pageStart=null, pageEnd=null, url=null, language=null, rfNumber=[82], rfOrder=88, authorNames=Quackenbush S R, Barnwell T P, Clements M A, journalName=Objective measures of speech quality, refType=null, unstructuredReference=
Quackenbush S R,
Barnwell T P,
Clements M A.
Objective measures of speech quality[M]. Englewood Cliffs, N.J.: Prentice Hall,
1988., articleTitle=null, refAbstract=null)], funds=[Fund(id=1261377089480045054, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, awardId=2017YFC0821000, language=CN, fundingSource=国家重点研发计划(2017YFC0821000), fundOrder=null, country=null), Fund(id=1261377090310517255, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, awardId=KF202117, language=CN, fundingSource=司法部司法鉴定重点实验室(KF202117), fundOrder=null, country=null), Fund(id=1261377092365726228, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, awardId=2024YCZD05, language=CN, fundingSource=中国刑事警察学院研究生创新能力提升项目(2024YCZD05), fundOrder=null, country=null)], companyList=[AuthorCompany(id=1261377026125082750, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, xref=null, ext=[AuthorCompanyExt(id=1261377026376740993, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, companyId=1261377026125082750, language=EN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=College of Public Security Information Technology and Intelligence, Criminal Investigation Police University of China, Shenyang 110854, China), AuthorCompanyExt(id=1261377026410295426, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, companyId=1261377026125082750, language=CN, country=null, province=null, city=null, postcode=null, companyName=null, departmentName=null, remark=中国刑事警察学院公安信息技术与情报学院, 沈阳 110854)])], figs=[ArticleFig(id=1261377055460045137, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.1, caption=
Features extraction in frequency domain, figureFileSmall=3B6UxWrBHW9dHtootxKe3g==, figureFileBig=I+wpRMhSAVCZJ2gAtSP9Jw==, tableContent=null), ArticleFig(id=1261377057024520537, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图1, caption=
频域特征提取, figureFileSmall=3B6UxWrBHW9dHtootxKe3g==, figureFileBig=I+wpRMhSAVCZJ2gAtSP9Jw==, tableContent=null), ArticleFig(id=1261377060770034021, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.2, caption=
Deep neural network (DNN) model, figureFileSmall=Md95ZKt2NowVypJtkCNhaA==, figureFileBig=i+KEm7XGCr8HbzkJUU8yBg==, tableContent=null), ArticleFig(id=1261377061764084078, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图2, caption=
深度神经网络(DNN)模型, figureFileSmall=Md95ZKt2NowVypJtkCNhaA==, figureFileBig=i+KEm7XGCr8HbzkJUU8yBg==, tableContent=null), ArticleFig(id=1261377063454388593, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.3, caption=
Recurrent neural network (RNN) model, figureFileSmall=sJZ/M+9LyM6dKg0kYGUB6w==, figureFileBig=muqHRgnGuDyTfRjbiNR07g==, tableContent=null), ArticleFig(id=1261377066646253947, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图3, caption=
循环神经网络(RNN)模型 x为输入向量;o为输出向量。输入向量${x}_{t}=[{x}_{1},{x}_{2},\dots,{x}_{T}]$为当前时间步的输入序列,其和输出向量${o}_{t}=[{o}_{1},{o}_{2},\dots,{o}_{T}]$的维度根据特征和输出需求而异;s为在特定时间步的隐藏状态,是RNN能够处理序列数据的关键;${s}_{t}$为根据前一隐藏状态${s}_{t-1}$确定的当前时间步的状态;W、U和V为权重矩阵
, figureFileSmall=sJZ/M+9LyM6dKg0kYGUB6w==, figureFileBig=muqHRgnGuDyTfRjbiNR07g==, tableContent=null), ArticleFig(id=1261377067485114753, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.4, caption=
LSTM model, figureFileSmall=cLAnrl7MHfLlaV3RK0ymmw==, figureFileBig=y+gNVrODpCpsW1esXltYLQ==, tableContent=null), ArticleFig(id=1261377069200585098, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图4, caption=
LSTM模型, figureFileSmall=cLAnrl7MHfLlaV3RK0ymmw==, figureFileBig=y+gNVrODpCpsW1esXltYLQ==, tableContent=null), ArticleFig(id=1261377070064611727, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.5, caption=
CNN model, figureFileSmall=+vcsH/LaEKLia2M++ms2Ow==, figureFileBig=LyF/NBRTwl33l0EjIuxTxg==, tableContent=null), ArticleFig(id=1261377071272571287, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图5, caption=
CNN模型, figureFileSmall=+vcsH/LaEKLia2M++ms2Ow==, figureFileBig=LyF/NBRTwl33l0EjIuxTxg==, tableContent=null), ArticleFig(id=1261377072946098592, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.6, caption=
GAN model, figureFileSmall=I4NxP5saIKoZga5HGpHYrA==, figureFileBig=RHxZWO8c+kj/+0x+puAjYA==, tableContent=null), ArticleFig(id=1261377074720289191, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图6, caption=
GAN模型, figureFileSmall=I4NxP5saIKoZga5HGpHYrA==, figureFileBig=RHxZWO8c+kj/+0x+puAjYA==, tableContent=null), ArticleFig(id=1261377075865334190, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Fig.7, caption=
Transformer encoder and decoder, figureFileSmall=lewB6+hiYLoEQi7t2YFGyQ==, figureFileBig=HxK/TE4006xYN7jptUO93Q==, tableContent=null), ArticleFig(id=1261377076754526647, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=图7, caption=
Transformer编码器和解码器 N为堆叠的层数;Output(shifted right)为输出右移;Input embedding和Output embedding分别为输入嵌入和输出嵌入;Positional encoding为位置编码;Multi-head attention为多头注意力层;Masked Multi-head attention为掩蔽的多头注意力层;Add & norm为层归一化;Feed forward为前置反馈层;Linear为线性层;Softmax函数;Output probabilities为输出概率
, figureFileSmall=lewB6+hiYLoEQi7t2YFGyQ==, figureFileBig=HxK/TE4006xYN7jptUO93Q==, tableContent=null), ArticleFig(id=1261377079644402114, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Table 1, caption=
Review of datasets
, figureFileSmall=null, figureFileBig=null, tableContent=
数据集 类型 | 数据库 | 语言 | 规模 | 采样率/ kHz |
| TIMIT | 英语 | 630位说话者 | 16.00 |
纯净 语音 | VoiceBank | 英语 | 110位说话者、44 h语音 | 48.00 |
| LibriSpeech | 英语 | 1 000 h语音 | 16.00 |
| WSJ0 | 英语 | 800 h | 16.00 |
| UrbanSound8K | — | 10种噪声、8 732条记录 | 22.05 |
噪声 | Demand | — | 16通道、6种大环境噪声 | 48.00 |
| Noise-92 | — | 15种噪声类型 | 19.98 |
| ESC 50 | — | 50种环境噪声、 2 000条记录 | 44.10 |
| CHiME3 | 英语 | 342 h语音、50 h嘈杂 环境音频 | 16.00 |
含噪 语音 | VoiceBank+ Demand | 英语 | 两个子集:28位说话者 和56位说话者 | 48.00 |
| AMI | 英语 | 100 h会议录音 | 16.00 |
| DAPS | 英语 | 20位说话者 | 44.10 |
| Aurora-2 | 英语 | 8 440条记录 | 8.00 |
| NOISEX-92 | — | 8种噪声、1.4 G | 16.00 |
), ArticleFig(id=1261377080500040142, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=表1, caption=
数据集概述
, figureFileSmall=null, figureFileBig=null, tableContent=
数据集 类型 | 数据库 | 语言 | 规模 | 采样率/ kHz |
| TIMIT | 英语 | 630位说话者 | 16.00 |
纯净 语音 | VoiceBank | 英语 | 110位说话者、44 h语音 | 48.00 |
| LibriSpeech | 英语 | 1 000 h语音 | 16.00 |
| WSJ0 | 英语 | 800 h | 16.00 |
| UrbanSound8K | — | 10种噪声、8 732条记录 | 22.05 |
噪声 | Demand | — | 16通道、6种大环境噪声 | 48.00 |
| Noise-92 | — | 15种噪声类型 | 19.98 |
| ESC 50 | — | 50种环境噪声、 2 000条记录 | 44.10 |
| CHiME3 | 英语 | 342 h语音、50 h嘈杂 环境音频 | 16.00 |
含噪 语音 | VoiceBank+ Demand | 英语 | 两个子集:28位说话者 和56位说话者 | 48.00 |
| AMI | 英语 | 100 h会议录音 | 16.00 |
| DAPS | 英语 | 20位说话者 | 44.10 |
| Aurora-2 | 英语 | 8 440条记录 | 8.00 |
| NOISEX-92 | — | 8种噪声、1.4 G | 16.00 |
), ArticleFig(id=1261377081326318036, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Table 2, caption=
Advantages and disadvantages of deep learning based speech enhancement methods
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法类型 | 优点 | 缺点 |
| DNN | 能够通过学习大量的训练数据来捕捉和模拟复杂的非线性关系;根据所需的结果进行优化调整,应用于多种数据类型 | 需要大量训练数据和高质量标签;模型复杂度高,计算和存储成本较大;可能存在延迟、梯度消失或爆炸问题,影响训练稳定性 |
| RNN | 能够捕捉语音信号的时间序列特性;与卷积层结合时可以扩展感受野 | 计算成本较高,训练困难;容易梯度消失或爆炸;对长时序依赖关系的建模效率较低 |
| LSTM | 解决了传统RNN可能梯度消失问题;可长期存储时间序列信息;适用于语音信号中较长时序依赖的任务 | 模型训练时间较长,内存占用高;延迟较大,尤其在实时应用中表现不佳;对参数初始化和超参数设置较为敏感 |
| GRU | 改进了LSTM的门控机制,减少了计算复杂度;更高效,适用于资源受限环境;能够较好解决梯度消失问题 | 在捕捉复杂长时间依赖关系上可能不如LSTM;收敛速度慢,学习效率低 |
| CNN | 在局部特征提取和模式识别中表现优异,适合处理语音的时间频率特征;自动学习特征,无须人工监督 | 不适合处理长时间序列依赖性特征;对语音输入数据的变化缺乏强鲁棒性;需要与其他模型结合才能解决时序建模问题 |
| GAN | 能够生成高质量的增强语音信号,重建信号更接近原始语音;支持多种编码器-解码器结构,灵活性高 | 对抗训练比较困难和不稳定;对超参数的选择较为敏感,调优难度高;训练成本较高,尤其在高维复杂数据中表现出瓶颈 |
| AE | 能够学习到信号的压缩表示;可以实现降维,有助于特征提取 | 传统AE可能在处理非线性和复杂数据分布方面存在局限性;复杂语音增强性能不如更高级的深度学习模型 |
| DAE | 能够学习从噪声语音中恢复干净语音的映射;在训练阶段引入噪声,提升模型鲁棒性 | 对未知类型噪声或非平稳噪声的处理能力有限 |
| VAE | 通过引入概率生成模型,实现自然和多样化的语音信号生成;适用于复杂信号建模,能够生成连续性更好的特征 | 涉及复杂的变分推断步骤,训练难度较大;对超参数敏感性高,优化和调试成本较高 |
| Transformer | 自注意力机制能够有效捕捉长距离依赖关系;较好处理非局部特征;并行化能力强,计算效率高 | 结构复杂,参数数量多,计算资源需求高;需要大量高质量训练数据;实时应用中可能存在延迟问题 |
), ArticleFig(id=1261377084300079576, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=表2, caption=
基于深度学习的语音增强方法的优缺点
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法类型 | 优点 | 缺点 |
| DNN | 能够通过学习大量的训练数据来捕捉和模拟复杂的非线性关系;根据所需的结果进行优化调整,应用于多种数据类型 | 需要大量训练数据和高质量标签;模型复杂度高,计算和存储成本较大;可能存在延迟、梯度消失或爆炸问题,影响训练稳定性 |
| RNN | 能够捕捉语音信号的时间序列特性;与卷积层结合时可以扩展感受野 | 计算成本较高,训练困难;容易梯度消失或爆炸;对长时序依赖关系的建模效率较低 |
| LSTM | 解决了传统RNN可能梯度消失问题;可长期存储时间序列信息;适用于语音信号中较长时序依赖的任务 | 模型训练时间较长,内存占用高;延迟较大,尤其在实时应用中表现不佳;对参数初始化和超参数设置较为敏感 |
| GRU | 改进了LSTM的门控机制,减少了计算复杂度;更高效,适用于资源受限环境;能够较好解决梯度消失问题 | 在捕捉复杂长时间依赖关系上可能不如LSTM;收敛速度慢,学习效率低 |
| CNN | 在局部特征提取和模式识别中表现优异,适合处理语音的时间频率特征;自动学习特征,无须人工监督 | 不适合处理长时间序列依赖性特征;对语音输入数据的变化缺乏强鲁棒性;需要与其他模型结合才能解决时序建模问题 |
| GAN | 能够生成高质量的增强语音信号,重建信号更接近原始语音;支持多种编码器-解码器结构,灵活性高 | 对抗训练比较困难和不稳定;对超参数的选择较为敏感,调优难度高;训练成本较高,尤其在高维复杂数据中表现出瓶颈 |
| AE | 能够学习到信号的压缩表示;可以实现降维,有助于特征提取 | 传统AE可能在处理非线性和复杂数据分布方面存在局限性;复杂语音增强性能不如更高级的深度学习模型 |
| DAE | 能够学习从噪声语音中恢复干净语音的映射;在训练阶段引入噪声,提升模型鲁棒性 | 对未知类型噪声或非平稳噪声的处理能力有限 |
| VAE | 通过引入概率生成模型,实现自然和多样化的语音信号生成;适用于复杂信号建模,能够生成连续性更好的特征 | 涉及复杂的变分推断步骤,训练难度较大;对超参数敏感性高,优化和调试成本较高 |
| Transformer | 自注意力机制能够有效捕捉长距离依赖关系;较好处理非局部特征;并行化能力强,计算效率高 | 结构复杂,参数数量多,计算资源需求高;需要大量高质量训练数据;实时应用中可能存在延迟问题 |
), ArticleFig(id=1261377084727898591, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Table 3, caption=
Speech enhancement evaluation measures[17,78-82]
, figureFileSmall=null, figureFileBig=null, tableContent=
| 评估方法 | 数学表达式 |
| PESQ[78] | $\mathrm{P}\mathrm{E}\mathrm{S}\mathrm{Q}=4.5-0.1{d}_{\mathrm{S}\mathrm{Y}\mathrm{M}}-0.030\mathrm{ }9{d}_{\mathrm{A}\mathrm{S}\mathrm{Y}\mathrm{M}}$ |
| STOI[79] | $\mathrm{STOI}=\frac{1}{JM}\sum _{j=1}^{J}\sum _{m=1}^{M}{d}_{j}\left(m\right)$ |
| SDR[80] | $\mathrm{SDR}=10\lg \frac{{\left\|s\right\|}^{2}}{{\left\|s-\widehat{s}\right\|}^{2}}$ |
| BSD[81] | $\mathrm{BSD}\left(k\right)=\sum _{b=1}^{{N}_{b}}{\left[{S}_{k}\left(b\right)-{\overline{S}}_{k}\left(b\right)\right]}^{2}$ |
| LLR[82] | ${d}_{\mathrm{LLR}}\left({a}_{x},{\overline{a}}_{\widehat{x}}\right)=\ln \left(\frac{{\overline{a}}_{\widehat{x}}^{\mathrm{T}}{R}_{x}{\overline{a}}_{\widehat{x}}}{{a}_{x}^{\mathrm{T}}{R}_{x}{a}_{x}}\right)$ |
| SegSNR[81] | ${\mathrm{SNR}}_{\mathrm{seg}}=\frac{10}{{M}_{\mathrm{s}}}\sum _{m=0}^{{M}_{\mathrm{s}}-1}\lg \left\{\frac{\sum _{n=Nm}^{Nm+N-1}{x}^{2}\left(n\right)}{\sum _{n=Nm}^{Nm+N-1}{\left[x\left(n\right)-\widehat{x}\left(n\right)\right]}^{2}}\right\}$ |
综合评价 指标[17] | $\begin{array}{l}{C}_{\mathrm{s}\mathrm{i}\mathrm{g}}=3.093-1.092{S}_{\mathrm{L}\mathrm{L}\mathrm{R}}+0.603{S}_{\mathrm{P}\mathrm{E}\mathrm{S}\mathrm{Q}}-0.009{S}_{\mathrm{W}\mathrm{S}\mathrm{S}}\\ {C}_{\mathrm{b}\mathrm{a}\mathrm{k}}=1.634+0.478{S}_{\mathrm{P}\mathrm{E}\mathrm{S}\mathrm{Q}}-0.007{S}_{\mathrm{W}\mathrm{S}\mathrm{S}}+0.063{S}_{\mathrm{S}\mathrm{N}{\mathrm{R}}_{\mathrm{s}\mathrm{e}\mathrm{g}}}\end{array}$ |
), ArticleFig(id=1261377085185077728, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=表3, caption=
语音增强评估指标[17,78-82]
, figureFileSmall=null, figureFileBig=null, tableContent=
| 评估方法 | 数学表达式 |
| PESQ[78] | $\mathrm{P}\mathrm{E}\mathrm{S}\mathrm{Q}=4.5-0.1{d}_{\mathrm{S}\mathrm{Y}\mathrm{M}}-0.030\mathrm{ }9{d}_{\mathrm{A}\mathrm{S}\mathrm{Y}\mathrm{M}}$ |
| STOI[79] | $\mathrm{STOI}=\frac{1}{JM}\sum _{j=1}^{J}\sum _{m=1}^{M}{d}_{j}\left(m\right)$ |
| SDR[80] | $\mathrm{SDR}=10\lg \frac{{\left\|s\right\|}^{2}}{{\left\|s-\widehat{s}\right\|}^{2}}$ |
| BSD[81] | $\mathrm{BSD}\left(k\right)=\sum _{b=1}^{{N}_{b}}{\left[{S}_{k}\left(b\right)-{\overline{S}}_{k}\left(b\right)\right]}^{2}$ |
| LLR[82] | ${d}_{\mathrm{LLR}}\left({a}_{x},{\overline{a}}_{\widehat{x}}\right)=\ln \left(\frac{{\overline{a}}_{\widehat{x}}^{\mathrm{T}}{R}_{x}{\overline{a}}_{\widehat{x}}}{{a}_{x}^{\mathrm{T}}{R}_{x}{a}_{x}}\right)$ |
| SegSNR[81] | ${\mathrm{SNR}}_{\mathrm{seg}}=\frac{10}{{M}_{\mathrm{s}}}\sum _{m=0}^{{M}_{\mathrm{s}}-1}\lg \left\{\frac{\sum _{n=Nm}^{Nm+N-1}{x}^{2}\left(n\right)}{\sum _{n=Nm}^{Nm+N-1}{\left[x\left(n\right)-\widehat{x}\left(n\right)\right]}^{2}}\right\}$ |
综合评价 指标[17] | $\begin{array}{l}{C}_{\mathrm{s}\mathrm{i}\mathrm{g}}=3.093-1.092{S}_{\mathrm{L}\mathrm{L}\mathrm{R}}+0.603{S}_{\mathrm{P}\mathrm{E}\mathrm{S}\mathrm{Q}}-0.009{S}_{\mathrm{W}\mathrm{S}\mathrm{S}}\\ {C}_{\mathrm{b}\mathrm{a}\mathrm{k}}=1.634+0.478{S}_{\mathrm{P}\mathrm{E}\mathrm{S}\mathrm{Q}}-0.007{S}_{\mathrm{W}\mathrm{S}\mathrm{S}}+0.063{S}_{\mathrm{S}\mathrm{N}{\mathrm{R}}_{\mathrm{s}\mathrm{e}\mathrm{g}}}\end{array}$ |
), ArticleFig(id=1261377087663911401, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=EN, label=Table 4, caption=
Comparison between different deep learning based speech enhancement models
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法类型 | 数据集 | 提取特征 | 训练类型 | 评估指标 |
| DNN | NOISEX、CHiME-2、Voice Bank+DEMAND、TIMIT+AURORA | MFCC、LPS、(log/power) Mag | 监督学习 | PESQ、SDR、STOI、SegSNR |
| RNN | CHiME-3、Voice Bank + DEMAND、TIMIT | LPS、MFCC、Raw signal、(log/power) Mag | 监督学习 | PESQ、STOI |
| LSTM | CHiME-3、Voice Bank + DEMAND、WSJCAM0 | LPS、MFCC、Raw signal、(log/power) Mag | 监督学习 | PESQ、STOI |
| GRU | CHiME-3、Voice Bank + DEMAND、TIMIT | LPS、MFCC、Raw signal | 监督学习 | PESQ、STOI |
| CNN | CHiME-4、Aurora-4 | Raw signal、(log/power) Mag | 监督学习 | PESQ、STOI、WER、SDR、CSIG、CBAK、COVL、SegSNR |
| GAN | TIMIT + NOISEX + SSN、Voice Bank + DEMAND | MFCC、Raw signal | 无监督学习 | PESQ、STOI、CSIG、CBAK、COVL、SSNR、WER、SDR |
| AE | CHiME-2、WSJCAM0 | MFCC、Log Mel | 无监督学习 | PESQ、STOI |
| DAE | CHiME-2 | MFCC、(Log) Mel | 无监督学习 | PESQ、WER |
| VAE | Reverberant、ATR | MFCC、Log Mel | 无监督学习 | PESQ、STOI、SNR、MMSE |
| Transformer | QUT-NOISE-TIMIT、VoiceBank + DEMAND、TIMIT + Musan + Noise-92 | (log/power) Mag、Raw signal | 监督学习 | SDR、PESQ、CSIG、CBAK、COVL、SegSNR |
), ArticleFig(id=1261377088456634867, tenantId=1146029695717560320, journalId=1146123166801305609, articleId=1261262688311755499, language=CN, label=表4, caption=
不同基于深度学习的语音增强方法的比较
, figureFileSmall=null, figureFileBig=null, tableContent=
| 方法类型 | 数据集 | 提取特征 | 训练类型 | 评估指标 |
| DNN | NOISEX、CHiME-2、Voice Bank+DEMAND、TIMIT+AURORA | MFCC、LPS、(log/power) Mag | 监督学习 | PESQ、SDR、STOI、SegSNR |
| RNN | CHiME-3、Voice Bank + DEMAND、TIMIT | LPS、MFCC、Raw signal、(log/power) Mag | 监督学习 | PESQ、STOI |
| LSTM | CHiME-3、Voice Bank + DEMAND、WSJCAM0 | LPS、MFCC、Raw signal、(log/power) Mag | 监督学习 | PESQ、STOI |
| GRU | CHiME-3、Voice Bank + DEMAND、TIMIT | LPS、MFCC、Raw signal | 监督学习 | PESQ、STOI |
| CNN | CHiME-4、Aurora-4 | Raw signal、(log/power) Mag | 监督学习 | PESQ、STOI、WER、SDR、CSIG、CBAK、COVL、SegSNR |
| GAN | TIMIT + NOISEX + SSN、Voice Bank + DEMAND | MFCC、Raw signal | 无监督学习 | PESQ、STOI、CSIG、CBAK、COVL、SSNR、WER、SDR |
| AE | CHiME-2、WSJCAM0 | MFCC、Log Mel | 无监督学习 | PESQ、STOI |
| DAE | CHiME-2 | MFCC、(Log) Mel | 无监督学习 | PESQ、WER |
| VAE | Reverberant、ATR | MFCC、Log Mel | 无监督学习 | PESQ、STOI、SNR、MMSE |
| Transformer | QUT-NOISE-TIMIT、VoiceBank + DEMAND、TIMIT + Musan + Noise-92 | (log/power) Mag、Raw signal | 监督学习 | SDR、PESQ、CSIG、CBAK、COVL、SegSNR |
)], attaches=null, journal=Journal(id=1146119176004939786, delFlag=0, nameCn=科学技术与工程, nameEn=Science Technology and Engineering, nameHistory1=null, nameHistory2=null, issn=1671-1815, eissn=, cn=11-4688/T, coden=null, periodic=4, language=CN, oaType=是, ccby=null, superviseOffice=null, ownerOffice=null, pubOffice=null, editorOffice=null, officeType=null, aims=null, clcCode=null, officeProv=null, officeCity=null, officeAddr=null, officeZip=null, officeEmail=null, officePhone=null, editDirector=null, officeDirector=null, officeDirectorPhone=null, officeStaffNum=null, officeEmpNum=null, coverPicUrl=UKU/O7GSka5polgCTkbIIw==, journalPrice=null, startedYear=null, abbrevIsoEn=Sci Technol Eng, journalRemark=null, publicationField=null, createdTime=null, updatedTime=1754445529766, createdBy=null, updatedBy=13701087609, firstLetterCn=S, firstLetterEn=S, subjectCode=Natural Sciences, subjectName=自然科学, subjectCodeEn=Natural Sciences, subjectNameEn=null, picCn=UKU/O7GSka5polgCTkbIIw==, picEn=5hwlULoNwcbj3xUmVi9MAQ==, jcr=null, cjcr=null, exts=[JournalExt(id=1159791870395564357, language=CN, name=科学技术与工程, nameHistory1=null, nameHistory2=null, managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=null, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=http://www.stae.com.cn/jsygc/home, createdTime=1754445529793, updatedTime=1754445529793, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=http://www.stae.com.cn/jsygc/site/menus/20090429150146001, submissionAuthorUrl=http://www.stae.com.cn/jsygc/author/login, submissionEditorUrl=http://www.stae.com.cn/jsygc/editor/login, submissionReviewUrl=http://www.stae.com.cn/jsygc/reviewer/login, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""}), JournalExt(id=1159791870441701702, language=EN, name=Science Technology and Engineering, nameHistory1=null, nameHistory2=null, 
managedBy=, sponsoredBy=, publishedBy=, editorOffice=, officeProv=null, officeCity=null, officeAddr=, officeZip=, editDirector=null, officeDirector=null, officePhone=null, coverPicUrl=null, journalRemark=, submitArticleUrl=null, websiteUrl=http://www.stae.com.cn/jsygc/home, createdTime=1754445529804, updatedTime=1754445529804, createdBy=13701087609, updatedBy=13701087609, submissionGuidelinesUrl=, submissionAuthorUrl=http://www.stae.com.cn/jsygc/author/login, submissionEditorUrl=http://www.stae.com.cn/jsygc/editor/login, submissionReviewUrl=http://www.stae.com.cn/jsygc/reviewer/login, submissionCeEditorUrl=, submissionAeEditorUrl=, option={"copyright":""})], databaseList=null, tenantJournalId=1146123166801305609, websiteList=[Website(id=1148243202391400884, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1146123166801305609, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/kxjsygc/CN, language=CN, createTime=1751692112777, createBy=18614031015, updateTime=1753520965431, updateBy=18614031015, name=科学技术与工程-中文站点, tplId=1146099689490845704, title=科学技术与工程, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1148622798802673703, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=articleTextType, value=kx, createTime=1751782615614, updateTime=1751782615614, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798781702180, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=banner, value=null, createTime=1751782615609, updateTime=1751782615609, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798769119267, tenantId=1146029695717560320, journalId=null, 
journalGroupId=null, siteId=1148243202391400884, code=logo, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic?fileId=j86gbwi+p0Idkyl5SzIlmQ==, createTime=1751782615606, updateTime=1751782615606, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798794285094, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic, createTime=1751782615612, updateTime=1751782615612, creator=18614031015, updator=18614031015), WebsiteProps(id=1148622798790090789, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1148243202391400884, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1751782615611, updateTime=1751782615611, creator=18614031015, updator=18614031015)]), Website(id=1155914124811976731, webName=null, webTitle=null, webDomain=null, webCopyrigh=null, webIpcNo=null, seoTitle=null, seoKeywords=null, seoDescription=null, tenantJournalId=null, journalId=1146123166801305609, journalNameCn=null, journalNameEn=null, grayFlag=null, tenantId=1146029695717560320, platformId=null, journalGroupId=null, journalGroupNameCn=null, journalGroupNameEn=null, type=1, domain=https://castjournals.cast.org.cn/joweb/kxjsygc/EN, language=EN, createTime=1753521003206, createBy=18614031015, updateTime=1753521003206, updateBy=18614031015, name=科学技术与工程-英文站点, tplId=1146101810881728533, title=Science Technology and Engineering, delFlag=0, indexPage=/home, props=[WebsiteProps(id=1155914371227308235, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=articleTextType, value=kx, createTime=1753521061952, updateTime=1753521061952, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371210531016, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=banner, value=null, 
createTime=1753521061947, updateTime=1753521061947, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371202142407, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=logo, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic?fileId=j86gbwi+p0Idkyl5SzIlmQ==, createTime=1753521061945, updateTime=1753521061945, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371223113930, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=picServerUrl, value=https://castjournals.cast.org.cn/joweb/kjdb/CN/file/pic, createTime=1753521061950, updateTime=1753521061950, creator=18614031015, updator=18614031015), WebsiteProps(id=1155914371218919625, tenantId=1146029695717560320, journalId=null, journalGroupId=null, siteId=1155914124811976731, code=staticResourcePath, value=https://castjournals.cast.org.cn/joweb/cast_kjdb_cn_619/, createTime=1753521061949, updateTime=1753521061949, creator=18614031015, updator=18614031015)])], journalTitle=科学技术与工程, weixinUrl=null, journalUrl=null, iacademicId=null, status=0, seqNo=null, journalTitleEn=Science Technology and Engineering, journalPhotoCn=UKU/O7GSka5polgCTkbIIw==, journalPhotoEn=5hwlULoNwcbj3xUmVi9MAQ==, journalFirstLetter=S, journalRecommend=null, journalNew=null, journalCollection=null, jcrJf=null, cjcrJf=null, jcrJfStr=null, cjcrJfStr=null, submissionFirstDecision=null, sciSubjectClassification=null, casSubjectClassification=null, citeScore=null, totalCitationFrequency=null, icpCode=null, psCode=null, advertisingLicenseCode=null, copyrightInformation=null, country=null, option=null, provinceCode=null, provinceName=null, collectFlag=false), detailUrlCn=https://castjournals.cast.org.cn/joweb/kxjsygc/CN/10.12404/j.issn.1671-1815.2404954, detailUrlEn=https://castjournals.cast.org.cn/joweb/kxjsygc/EN/10.12404/j.issn.1671-1815.2404954, 
pdfUrlCn=https://castjournals.cast.org.cn/joweb/kxjsygc/CN/PDF/10.12404/j.issn.1671-1815.2404954, pdfUrlEn=https://castjournals.cast.org.cn/joweb/kxjsygc/EN/PDF/10.12404/j.issn.1671-1815.2404954, aliStartDate=null, aliEndDate=null, collectionFlag=false, citedCount=null, citedUrl=null, reference=null)