# Copyright (c) Microsoft
# Copyright (c) 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gradio as gr
import wespeaker

# NOTE(review): the original HTML/CSS content of these templates was lost in
# extraction; the placeholder structure ({:.1f} score, {} error message) and
# the user-visible text are preserved from the surviving fragments.
STYLE = """
"""

OUTPUT_OK = (STYLE + """
The speakers are
{:.1f}%
similar
Welcome, human!
(You must get at least 70% to be considered the same person)
""")

OUTPUT_FAIL = (STYLE + """
The speakers are
{:.1f}%
similar
Warning! stranger!
(You must get at least 70% to be considered the same person)
""")

OUTPUT_ERROR = (STYLE + """
Input Error
{}!
""")

# Load one model per supported language, once at startup.
cn_model = wespeaker.load_model("chinese")
en_model = wespeaker.load_model("english")


def speaker_verification(audio_path1, audio_path2, lang='CN'):
    """Compare two recordings and report whether they are the same speaker.

    Args:
        audio_path1: filesystem path of the first recording (or None).
        audio_path2: filesystem path of the second recording (or None).
        lang: 'CN' to use the Chinese model, 'EN' for the English model.

    Returns:
        An HTML string: OUTPUT_OK when cosine similarity >= 0.70,
        OUTPUT_FAIL below that, or OUTPUT_ERROR for invalid input.
    """
    if audio_path1 is None or audio_path2 is None:
        return OUTPUT_ERROR.format('Please enter two audios')

    # BUG FIX: the original assigned cn_model for 'EN' and en_model for
    # 'CN' — the two branches were swapped.
    if lang == 'EN':
        model = en_model
    elif lang == 'CN':
        model = cn_model
    else:
        return OUTPUT_ERROR.format('Please select a language')

    # BUG FIX: the original called module-level wespeaker.compute_similarity,
    # which ignored the language-specific `model` selected above. Similarity
    # is a method of the loaded model object.
    cos_score = model.compute_similarity(audio_path1, audio_path2)

    # 0.70 is the acceptance threshold advertised in the output templates.
    if cos_score >= 0.70:
        return OUTPUT_OK.format(cos_score * 100)
    return OUTPUT_FAIL.format(cos_score * 100)


# Two microphone recordings plus a language selector feed the verifier.
inputs = [
    gr.inputs.Audio(source="microphone",
                    type="filepath",
                    optional=True,
                    label='Speaker#1'),
    gr.inputs.Audio(source="microphone",
                    type="filepath",
                    optional=True,
                    label='Speaker#2'),
    gr.Radio(['EN', 'CN'], label='Language'),
]

output = gr.outputs.HTML(label="")

# NOTE(review): the surrounding HTML markup of description/article was lost
# in extraction; the visible text is preserved.
description = (
    "WeSpeaker Demo ! Try it with your own voice ! "
    "Note: We recommend that the audio length be greater than 5s ! "
    "CN: cnceleb_resnet34.onnx. EN: voxceleb_resnet34.onnx")

article = (
    "Github: Learn more about WeSpeaker")

examples = [
    ['examples/BAC009S0764W0228.wav', 'examples/BAC009S0764W0328.wav', 'CN'],
    ['examples/BAC009S0913W0133.wav', 'examples/BAC009S0764W0228.wav', 'CN'],
    ['examples/00001_spk1.wav', 'examples/00003_spk2.wav', 'EN'],
    ['examples/00010_spk2.wav', 'examples/00024_spk1.wav', 'EN'],
    ['examples/00001_spk1.wav', 'examples/00024_spk1.wav', 'EN'],
    ['examples/00010_spk2.wav', 'examples/00003_spk2.wav', 'EN'],
]

interface = gr.Interface(
    fn=speaker_verification,
    inputs=inputs,
    outputs=output,
    title="Speaker Verification in WeSpeaker : 基于 WeSpeaker 的说话人确认",
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
)

interface.launch(enable_queue=True)