|
import json
import os

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
from transformers import pipeline
|
|
|
|
|
# Two halves of an HTML/CSS/JS page fragment that renders an audio player plus
# a clickable, word-by-word transcript.  The fragment is completed by
# concatenating:  html_seeker + <JSON document> + html_seeker2
# so that the JSON ends up as the value of the JavaScript variable INLINE_JSON.
#
# What the embedded script does:
#   * render() creates one <span class="success"> per entry of ret['chunks'];
#     clicking a span seeks the player to that entry's timestamp[0] and plays.
#   * highlight_word() runs on every animation frame and puts the "active"
#     class on the chunk whose [timestamp[0], timestamp[1]] contains the
#     current playback time.
#   * The space bar is intercepted to pause the player.
#
# The <img onload=...> element exists only to re-create the <script> element
# after the markup has been inserted, which is what makes the script execute.
# It needs a src that actually loads: with an empty src no "load" event is
# fired, so a 1x1 transparent GIF data URI is used.
html_seeker = '''<style>
html, body {
    margin: 0;
    padding: 0;
    min-width: 900px;
}
#header {
    /*position: fixed;*/
    top: 0;
    left: 0;
    height: 50px;
    min-width: 900px;
    line-height: 50px;
    width: 100%;
    background-color: #999;
    box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
    font-family: Helvetica, sans-serif;
}
#header, #header a {
    color: white;
}

.home {
    margin: 0;
    font-weight: bold;
    text-transform: lowercase;
    width: 100px;
}
h4.home {
    margin: 0;
    background: #666;
    padding-left: 25px;
    padding-right: 30px;
    margin-right: 20px;
    float: left;
    text-decoration: none;
}
.home:hover a {
    background: #555;
}
#audio {
    margin-left: 10px;
    width: 500px;
    display: inline-block;
}
#transcript {
    margin: 0 15px;
    margin-bottom: 5em;
    white-space: pre-wrap;
    line-height: 2em;
    max-width: 600px;
    color: #999;
    clear: both;
    margin-top: 75px;
    /*direction: rtl;*/
}
.success {
    color: black;
}
.success:hover {
    text-decoration: underline;
}
.active {
    color: magenta;
    background-color: yellow;
}
#preloader {
    visibility: hidden;
}
</style><div id="header">
<audio id="audio" src="17.mp3" controls="true" ></audio>
</div>

<div id="transcript" dir="auto"></div>
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" onload="
var oldScript = document.querySelector('script#huihiuh6');
var newScript = document.createElement('script');
Array.from(oldScript.attributes)
  .forEach( attr => newScript.setAttribute(attr.name, attr.value) );
newScript.appendChild(document.createTextNode(oldScript.innerHTML));
oldScript.parentNode.replaceChild(newScript, oldScript);
">

<script id="huihiuh6">
function myFunction543rr(){
    console.log('loaded00000000000000002');
}
var $a = document.getElementById("audio");
$a.src=document.querySelector('audio').src;
console.log($a);
window.onkeydown = function(ev) {
    if(ev.keyCode == 32) {
        ev.preventDefault();
        $a.pause();
    }
}
var $trans = document.getElementById("transcript");
var wds = [];
var cur_wd;

function highlight_word() {
    var t = $a.currentTime;
    // XXX: O(N); use binary search
    var hits = wds.filter(function(x) {
        return (t - x['timestamp']['0']) > 0.01 && (x['timestamp']['1'] - t) > 0.01;
    }, wds);
    var next_wd = hits[hits.length - 1];

    if(cur_wd != next_wd) {
        var active = document.querySelectorAll('.active');
        for(var i = 0; i < active.length; i++) {
            active[i].classList.remove('active');
        }
        if(next_wd && next_wd.$div) {
            next_wd.$div.classList.add('active');
            //render_phones(next_wd);
        }
    }
    cur_wd = next_wd;
    //highlight_phone(t);

    window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);

$trans.innerHTML = "Loading...";

function render(ret) {
    wds = ret['chunks'] || [];
    transcript = ret['text'];

    $trans.innerHTML = '';

    var currentOffset = 0;

    wds.forEach(function(wd) {
        var $wd = document.createElement('span');
        var txt = wd['text'];
        var $wdText = document.createTextNode(txt);
        $wd.appendChild($wdText);
        wd.$div = $wd;
        $wd.className = 'success';

        $wd.onclick = function() {
            console.log(wd['timestamp']['0']);
            $a.currentTime = wd['timestamp']['0'];
            $a.play();
        };
        $trans.appendChild($wd);
        $trans.appendChild(document.createTextNode(' '));
    });
}

function update() {
    if(INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
    }
}

var INLINE_JSON='''

# Second half of the fragment: terminates the INLINE_JSON assignment started
# at the end of html_seeker, triggers the first render and closes the script.
html_seeker2 = ''';
update();
</script>'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def predict_fa(speech, model):
    """Transcribe *speech* with the selected model and build the UI outputs.

    Args:
        speech: Audio input, passed through unchanged to the selected
            recognizer callable.
        model: Identifier of the model to use. Supported values are
            ``"SLPL/Sharif-wav2vec2"``, ``"ghofrani/common8"`` and
            ``"voidful/wav2vec2-xlsr-multilingual-56"``.

    Returns:
        A three-item list: the plain transcript (``result["text"]``), the
        complete recognizer result serialized as JSON, and the HTML player
        fragment (``html_seeker`` + JSON + ``html_seeker2``) with the result
        inlined for the embedded script.

    Raises:
        ValueError: If *model* is not one of the supported identifiers.
    """
    # NOTE(review): model0 / model1 / model2 are module-level callables that
    # are not defined in the visible part of this file -- presumably ASR
    # pipelines created with transformers.pipeline; confirm they exist.
    if model == "SLPL/Sharif-wav2vec2":
        recognizer = model2
    elif model == "ghofrani/common8":
        recognizer = model1
    elif model == "voidful/wav2vec2-xlsr-multilingual-56":
        recognizer = model0
    else:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(f"Unsupported model: {model!r}")

    result = recognizer(speech, return_timestamps="word")

    # Serialize once and reuse the text for both the JSON and HTML outputs.
    payload = json.dumps(result)

    # The JSON is spliced into a <script> element. Escaping "</" as "<\/"
    # keeps it valid JSON/JavaScript while preventing recognized text such as
    # "</script>" from terminating the script element early.
    inline_payload = payload.replace("</", "<\\/")

    return [result["text"], payload, html_seeker + inline_payload + html_seeker2]
|
|
|
|
|
def convert_to_wav(filename):
    """Convert an audio file to WAV and return the path of the new file.

    The WAV file is written next to the source, using the source name with a
    ``.wav`` extension. If that path already exists, ``"(1)"`` is appended to
    the stem repeatedly until an unused name is found, so existing files are
    never overwritten.

    Args:
        filename: Path of the audio file to convert. Its extension (without
            the dot) is passed to pydub as the input format.

    Returns:
        The path of the exported WAV file.
    """
    stem, extension = os.path.splitext(filename)
    audio = AudioSegment.from_file(filename, format=extension.replace(".", ""))

    new_filename = stem + ".wav"
    while os.path.exists(new_filename):
        new_filename = os.path.splitext(new_filename)[0] + "(1)" + ".wav"

    # Announce the conversion before doing it, so the message is a progress
    # note and is also visible when the export itself fails.
    print(f"Converting {filename} to {new_filename}...")
    audio.export(new_filename, format="wav")
    return new_filename
|
def g_rec(audio_File, language):
    """Transcribe an audio file with the recognizer's Google backend.

    Args:
        audio_File: Path of the audio file; it is opened with
            ``sr.AudioFile`` and read completely.
        language: Language code forwarded to ``recognize_google``.

    Returns:
        ``"Text: <transcript>"`` on success, or ``"Exception: <detail>"`` when
        recognition raises. Errors while opening or reading the file are not
        caught and propagate to the caller.
    """
    recognizer = sr.Recognizer()
    print(audio_File)

    with sr.AudioFile(audio_File) as source:
        audio = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(audio, language=language)
    except Exception as e:
        # Deliberately broad: the result is shown in the UI, so any
        # recognition failure is reported as text instead of raised.
        # Some exceptions stringify to an empty message; fall back to the
        # exception class name so the user never sees a bare "Exception: ".
        return "Exception: " + (str(e) or type(e).__name__)
    else:
        return "Text: " + transcript
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: one tab per recognition backend, each with its own Submit button.
with gr.Blocks() as demo:
    gr.Markdown("multilingual Speech Recognition")

    # Google backend: uploaded audio + language code -> g_rec -> text.
    with gr.Tab("google"):
        gr.Markdown("set your speech language")
        inputs_speech1 = [
            gr.Audio(sources=["upload"], type="filepath"),
            gr.Dropdown(
                choices=[
                    "af-ZA", "am-ET", "ar-AE", "ar-BH", "ar-DZ", "ar-EG",
                    "ar-IL", "ar-IQ", "ar-JO", "ar-KW", "ar-LB", "ar-MA",
                    "ar-MR", "ar-OM", "ar-PS", "ar-QA", "ar-SA", "ar-TN",
                    "ar-YE", "az-AZ", "bg-BG", "bn-BD", "bn-IN", "bs-BA",
                    "ca-ES", "cs-CZ", "da-DK", "de-AT", "de-CH", "de-DE",
                    "el-GR", "en-AU", "en-CA", "en-GB", "en-GH", "en-HK",
                    "en-IE", "en-IN", "en-KE", "en-NG", "en-NZ", "en-PH",
                    "en-PK", "en-SG", "en-TZ", "en-US", "en-ZA", "es-AR",
                    "es-BO", "es-CL", "es-CO", "es-CR", "es-DO", "es-EC",
                    "es-ES", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA",
                    "es-PE", "es-PR", "es-PY", "es-SV", "es-US", "es-UY",
                    "es-VE", "et-EE", "eu-ES", "fa-IR", "fi-FI", "fil-PH",
                    "fr-BE", "fr-CA", "fr-CH", "fr-FR", "gl-ES", "gu-IN",
                    "hi-IN", "hr-HR", "hu-HU", "hy-AM", "id-ID", "is-IS",
                    "it-CH", "it-IT", "iw-IL", "ja-JP", "jv-ID", "ka-GE",
                    "kk-KZ", "km-KH", "kn-IN", "ko-KR", "lo-LA", "lt-LT",
                    "lv-LV", "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY",
                    "my-MM", "ne-NP", "nl-BE", "nl-NL", "no-NO", "pa-Guru-IN",
                    "pl-PL", "pt-BR", "pt-PT", "ro-RO", "ru-RU", "si-LK",
                    "sk-SK", "sl-SI", "sq-AL", "sr-RS", "su-ID", "sv-SE",
                    "sw-KE", "sw-TZ", "ta-IN", "ta-LK", "ta-MY", "ta-SG",
                    "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-IN", "ur-PK",
                    "uz-UZ", "vi-VN", "yue-Hant-HK", "zh (cmn-Hans-CN)",
                    "zh-TW (cmn-Hant-TW)", "zu-ZA",
                ],
                value="fa-IR",
                label="language code",
            ),
        ]
        output_transcribe1 = gr.Textbox(label="output")
        transcribe_audio1_go = gr.Button("Submit")

    # wav2vec2 backend: uploaded audio + model id -> predict_fa ->
    # [transcript, JSON, HTML player].  These components were referenced by
    # the click wiring below but never created, which raised NameError while
    # the UI was being built.  The model choices are the identifiers that
    # predict_fa accepts.
    with gr.Tab("wav2vec2"):
        gr.Markdown("choose a model")
        inputs_speech_fa = gr.Audio(sources=["upload"], type="filepath")
        inputs_model_fa = gr.Radio(
            choices=[
                "SLPL/Sharif-wav2vec2",
                "ghofrani/common8",
                "voidful/wav2vec2-xlsr-multilingual-56",
            ],
            value="SLPL/Sharif-wav2vec2",
            label="model",
        )
        output_transcribe1_fa = gr.Textbox(label="text")
        output_transcribe1_fa1 = gr.Textbox(label="json")
        output_transcribe1_fa2 = gr.HTML()
        transcribe_audio1_fa = gr.Button("Submit")

    transcribe_audio1_fa.click(
        fn=predict_fa,
        inputs=[inputs_speech_fa, inputs_model_fa],
        outputs=[output_transcribe1_fa, output_transcribe1_fa1, output_transcribe1_fa2],
    )

    transcribe_audio1_go.click(
        fn=g_rec,
        inputs=inputs_speech1,
        outputs=output_transcribe1,
    )


if __name__ == "__main__":
    demo.launch()
|
|