Persian_Automatic_Speech_Recognition-asr

Running

App Files Files Community

Persian_Automatic_Speech_Recognition-asr / app.py

karim23657

Update app.py

414670d verified 3 months ago

raw

history blame

8.58 kB

	import gradio as gr
	from transformers import pipeline
	from pydub import AudioSegment
	import os
	import speech_recognition as sr


	# HTML/CSS/JS template for the clickable, word-highlighting transcript player.
	# The final page is built as ``html_seeker + json.dumps(result) + html_seeker2``:
	# this first half deliberately ends at ``var INLINE_JSON=`` so the serialized
	# result becomes a JavaScript literal. The script reads ``chunks`` (each with
	# ``text`` and a two-item ``timestamp``) from that object.
	# The 1x1 GIF's ``onload`` handler replaces the <script> element with a fresh
	# copy -- presumably because a script injected as markup is not executed.
	html_seeker='''<style>
	html, body {
	margin: 0;
	padding: 0;
	min-width: 900px;
	}
	#header {
	/*position: fixed;*/
	top: 0;
	left: 0;
	height: 50px;
	min-width: 900px;
	line-height: 50px;
	width: 100%;
	background-color: #999;
	box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
	font-family: Helvetica, sans-serif;
	}
	#header, #header a {
	color: white;
	}

	.home {
	margin: 0;
	font-weight: bold;
	text-transform: lowercase;
	width: 100px;
	}
	h4.home {
	margin: 0;
	background: #666;
	padding-left: 25px;
	padding-right: 30px;
	margin-right: 20px;
	float: left;
	text-decoration: none;
	}
	.home:hover a {
	background: #555;
	}
	#audio {
	margin-left: 10px;
	width: 500px;
	display: inline-block;
	}
	#transcript {
	margin: 0 15px;
	margin-bottom: 5em;
	white-space: pre-wrap;
	line-height: 2em;
	max-width: 600px;
	color: #999;
	clear: both;
	margin-top: 75px;
	/*direction: rtl;*/
	}
	.success {
	color: black;

	}
	.success:hover {
	text-decoration: underline;
	}
	.active {
	color: magenta;
	background-color: yellow;
	}
	#preloader {
	visibility: hidden;
	}


	</style><div id="header">

	<audio id="audio" src="17.mp3" controls="true" ></audio>
	</div>
	</div>


	<div id="transcript" dir="auto"></div>
	<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" onload="
	var oldScript = document.querySelector('script#huihiuh6');
	var newScript = document.createElement('script');
	Array.from(oldScript.attributes)
	.forEach( attr => newScript.setAttribute(attr.name, attr.value) );
	newScript.appendChild(document.createTextNode(oldScript.innerHTML));
	oldScript.parentNode.replaceChild(newScript, oldScript);
	">

	<script id="huihiuh6">
	function myFunction543rr(){
	console.log('loaded00000000000000002');
	}
	var $a = document.getElementById("audio");
	$a.src=document.querySelector('audio').src;
	console.log($a);
	window.onkeydown = function(ev) {
	if(ev.keyCode == 32) {
	ev.preventDefault();
	$a.pause();
	}
	}
	var $trans = document.getElementById("transcript");
	var wds = [];
	var cur_wd;

	function highlight_word() {
	var t = $a.currentTime;
	// XXX: O(N); use binary search
	var hits = wds.filter(function(x) {
	return (t - x['timestamp']['0']) > 0.01 && (x['timestamp']['1'] - t) > 0.01;
	}, wds);
	var next_wd = hits[hits.length - 1];

	if(cur_wd != next_wd) {
	var active = document.querySelectorAll('.active');
	for(var i = 0; i < active.length; i++) {
	active[i].classList.remove('active');
	}
	if(next_wd && next_wd.$div) {
	next_wd.$div.classList.add('active');
	//render_phones(next_wd);
	}
	}
	cur_wd = next_wd;
	//highlight_phone(t);

	window.requestAnimationFrame(highlight_word);
	}
	window.requestAnimationFrame(highlight_word);

	$trans.innerHTML = "Loading...";

	function render(ret) {
	wds = ret['chunks'] || [];
	transcript = ret['text'];

	$trans.innerHTML = '';

	var currentOffset = 0;

	wds.forEach(function(wd) {


	var $wd = document.createElement('span');
	var txt = wd['text'];
	var $wdText = document.createTextNode(txt);
	$wd.appendChild($wdText);
	wd.$div = $wd;
	$wd.className = 'success';

	$wd.onclick = function() {
	console.log(wd['timestamp']['0']);
	$a.currentTime = wd['timestamp']['0'];
	$a.play();
	};
	$trans.appendChild($wd);
	$trans.appendChild(document.createTextNode(' '));
	});


	}





	function update() {
	if(INLINE_JSON) {
	// We want this to work from file:/// domains, so we provide a
	// mechanism for inlining the alignment data.
	render(INLINE_JSON);
	}
	}

	var INLINE_JSON='''
	# Second half of the template: terminates the ``INLINE_JSON`` assignment,
	# triggers the first render and closes the <script> element.
	html_seeker2=''';
	update();
	</script>'''

	# model_name = "voidful/wav2vec2-xlsr-multilingual-56"
	# model0 = pipeline(task="automatic-speech-recognition",
	# model=model_name)


	# model_name = "SLPL/Sharif-wav2vec2"
	# model2 = pipeline(task="automatic-speech-recognition",
	# model=model_name)
	# model_name = "ghofrani/common8"
	# model1 = pipeline(task="automatic-speech-recognition",
	# model=model_name)

	import json
	def predict_fa(speech, model):
	    """Transcribe ``speech`` with the selected model and build the player page.

	    Args:
	        speech: Audio input passed unchanged to the selected pipeline.
	        model: Model identifier; one of ``"SLPL/Sharif-wav2vec2"``,
	            ``"ghofrani/common8"`` or
	            ``"voidful/wav2vec2-xlsr-multilingual-56"``.

	    Returns:
	        A three-item list: the plain transcript (``result['text']``), the
	        whole result serialized as JSON, and the HTML player page with that
	        JSON inlined between ``html_seeker`` and ``html_seeker2``.

	    Raises:
	        ValueError: If ``model`` is not one of the identifiers above.
	    """
	    # NOTE(review): model0/model1/model2 are only created in commented-out
	    # code in this file; they must exist before this function is called.
	    if model == "SLPL/Sharif-wav2vec2":
	        result = model2(speech, return_timestamps="word")
	    elif model == "ghofrani/common8":
	        result = model1(speech, return_timestamps="word")
	    elif model == "voidful/wav2vec2-xlsr-multilingual-56":
	        result = model0(speech, return_timestamps="word")
	    else:
	        # Previously fell through and failed with an UnboundLocalError.
	        raise ValueError(f"Unsupported model: {model!r}")

	    # Serialize once; the same payload feeds the textbox and the HTML page.
	    payload = json.dumps(result)
	    return [result['text'], payload, html_seeker + payload + html_seeker2]


	def convert_to_wav(filename):
	    """Convert an audio file to WAV and return the path of the new file.

	    The input format is taken from the file extension (dot removed). The
	    output path is the input path with a ``.wav`` extension; ``(1)`` is
	    appended to the stem for as long as the candidate path already exists,
	    so an existing file is never overwritten.
	    """
	    stem, extension = os.path.splitext(filename)
	    segment = AudioSegment.from_file(filename, format=extension.replace(".", ""))

	    target = stem + ".wav"
	    while os.path.exists(target):
	        target = os.path.splitext(target)[0] + "(1)" + ".wav"

	    segment.export(target, format="wav")
	    print(f"Converting {filename} to {target}...")
	    return target
	def g_rec(audio_File, language):
	    """Transcribe an audio file with ``Recognizer.recognize_google``.

	    Args:
	        audio_File: Path of the audio file to transcribe. ``None`` or an
	            empty string (nothing uploaded) is reported as an error.
	        language: Language code forwarded to ``recognize_google``.

	    Returns:
	        ``"Text: <transcript>"`` on success, otherwise
	        ``"Exception: <message>"``. Failures never propagate to the caller.
	    """
	    print(audio_File)
	    if not audio_File:
	        return "Exception: no audio file provided"

	    # NOTE(review): automatic conversion through convert_to_wav() is disabled
	    # in this file; presumably non-WAV uploads may be rejected by
	    # sr.AudioFile -- confirm before re-enabling.
	    try:
	        recognizer = sr.Recognizer()
	        # Opening and reading the file sit inside the try block so that
	        # unreadable or unsupported files are reported the same way as
	        # recognition failures instead of crashing the handler.
	        with sr.AudioFile(audio_File) as source:
	            audio = recognizer.record(source)
	        transcript = recognizer.recognize_google(audio, language=language)
	    except Exception as e:  # UI boundary: turn any failure into a message
	        return "Exception: " + str(e)
	    return "Text: " + transcript
	# Export file as .wav

	#predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate

	#predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
	# Gradio UI. Only the "google" tab is active; it sends the uploaded file and
	# the chosen language code to g_rec() and shows the returned string.
	with gr.Blocks() as demo:
	    gr.Markdown("multilingual Speech Recognition")

	    # with gr.Tab("Persian models"):
	    #     inputs_speech_fa =gr.Audio(sources=["upload"], type="filepath", optional=True,label="Upload your audio:")
	    #     inputs_model_fa =gr.inputs.Radio(label="Language", choices=["ghofrani/common8","SLPL/Sharif-wav2vec2","voidful/wav2vec2-xlsr-multilingual-56"])
	    #     output_transcribe1_fa = gr.Textbox(label="Transcribed text:")
	    #     output_transcribe1_fa1 = gr.Textbox(label="Transcribed text with timestamps:")
	    #     output_transcribe1_fa2 =gr.HTML(label="")
	    #     transcribe_audio1_fa= gr.Button("Submit")
	    with gr.Tab("google"):
	        gr.Markdown("set your speech language")
	        inputs_speech1 = [
	            gr.Audio(sources=["upload"], type="filepath"),
	            gr.Dropdown(
	                choices=["af-ZA","am-ET","ar-AE","ar-BH","ar-DZ","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA","ar-MR","ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-YE","az-AZ","bg-BG","bn-BD","bn-IN","bs-BA","ca-ES","cs-CZ","da-DK","de-AT","de-CH","de-DE","el-GR","en-AU","en-CA","en-GB","en-GH","en-HK","en-IE","en-IN","en-KE","en-NG","en-NZ","en-PH","en-PK","en-SG","en-TZ","en-US","en-ZA","es-AR","es-BO","es-CL","es-CO","es-CR","es-DO","es-EC","es-ES","es-GT","es-HN","es-MX","es-NI","es-PA","es-PE","es-PR","es-PY","es-SV","es-US","es-UY","es-VE","et-EE","eu-ES","fa-IR","fi-FI","fil-PH","fr-BE","fr-CA","fr-CH","fr-FR","gl-ES","gu-IN","hi-IN","hr-HR","hu-HU","hy-AM","id-ID","is-IS","it-CH","it-IT","iw-IL","ja-JP","jv-ID","ka-GE","kk-KZ","km-KH","kn-IN","ko-KR","lo-LA","lt-LT","lv-LV","mk-MK","ml-IN","mn-MN","mr-IN","ms-MY","my-MM","ne-NP","nl-BE","nl-NL","no-NO","pa-Guru-IN","pl-PL","pt-BR","pt-PT","ro-RO","ru-RU","si-LK","sk-SK","sl-SI","sq-AL","sr-RS","su-ID","sv-SE","sw-KE","sw-TZ","ta-IN","ta-LK","ta-MY","ta-SG","te-IN","th-TH","tr-TR","uk-UA","ur-IN","ur-PK","uz-UZ","vi-VN","yue-Hant-HK","zh (cmn-Hans-CN)","zh-TW (cmn-Hant-TW)","zu-ZA"],
	                value="fa-IR",
	                label="language code",
	            ),
	        ]
	        output_transcribe1 = gr.Textbox(label="output")
	        transcribe_audio1_go = gr.Button("Submit")

	    # The "Persian models" tab is disabled above, so its components do not
	    # exist; leaving this wiring active raised NameError at import time.
	    # Re-enable it together with the tab and the model pipelines.
	    # transcribe_audio1_fa.click(fn=predict_fa,
	    #     inputs=[inputs_speech_fa ,inputs_model_fa ],
	    #     outputs=[output_transcribe1_fa ,output_transcribe1_fa1,output_transcribe1_fa2 ] )

	    transcribe_audio1_go.click(
	        fn=g_rec,
	        inputs=inputs_speech1,
	        outputs=output_transcribe1,
	    )


	if __name__ == "__main__":
	    demo.launch()