import gradio as gr
from sad_tf import *  # provides Segmenter and filter_output (speech activity detection)
from autosub import SpeechRecognizer
from autosub import GOOGLE_SPEECH_API_KEY
import soundfile as sf
import io
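# Overall flow: segment the input audio with the speech/music/noise CNN
# detector, transcribe each speech segment with autosub's Google Speech API
# recognizer, and return the concatenated transcript through a Gradio UI.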

cssfa = """ 
 textarea { direction: rtl; text-align: right; font-family: Calibri, sans-serif; font-size: 16px;} 
"""


cssen = """ 
 textarea { direction: ltr; text-align: left; font-family: Calibri, sans-serif; font-size: 16px;} 
"""

css=""

# Speech activity detector: CNN speech/music/noise model, run on CPU in VAD mode
seg = Segmenter(ffmpeg_path="ffmpeg", model_path="keras_speech_music_noise_cnn.hdf5", device="cpu", vad_type="vad")



def process_segment(args):
    """Transcribe one (start, stop) speech segment of the 16 kHz waveform."""
    segment, wav, recognizer = args
    start, stop = segment
    # Slice the segment out of the PCM buffer (16,000 samples per second) and encode it as FLAC
    pp = pcm_to_flac(wav[int(start * 16000):int(stop * 16000)])
    tr_beamsearch_lm = recognizer(pp)
    return start, stop, tr_beamsearch_lm


def pcm_to_flac(pcm_data, sample_rate=16000):
    """Encode raw PCM samples as FLAC bytes in memory."""
    buffer = io.BytesIO()
    sf.write(buffer, pcm_data, sample_rate, format='FLAC')
    return buffer.getvalue()

    
def transcribe_audio(audio_file, lan):
    """Segment the uploaded audio file and transcribe each speech segment."""
    # Note: this per-language CSS choice only sets a local variable; the
    # interface CSS below is fixed at construction time.
    if lan == "en":
        css = cssen
    else:
        css = cssfa

    recognizer = SpeechRecognizer(language=lan, rate=16000, api_key=GOOGLE_SPEECH_API_KEY, proxies=None)
    text = ""

    # Run segmentation, drop tiny segments, and split overly long ones
    isig, wav = seg(audio_file)
    isig = filter_output(isig, max_silence=0.5, ignore_small_speech_segments=0.1, max_speech_len=15, split_speech_bigger_than=20)
    isig = [(a, b) for x, a, b, _, _ in isig]  # keep only (start, stop) times
    print(isig)

    results = []
    for segment in isig:
        results.append(process_segment((segment, wav, recognizer)))

    for start, stop, tr_beamsearch_lm in results:
        try:
            text += ' ' + tr_beamsearch_lm + '\r\n'
            print(start)
            print(stop)
            print(text)
        except Exception:
            # Skip segments whose transcription is missing (e.g. the recognizer returned None)
            pass

    return text

# Define the Gradio interface
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Radio(choices=["fa", "en", "ar"], label="Language")
    ],
    outputs=gr.Textbox(label="Transcription", elem_id="output-text", interactive=True),
    title="Persian Audio Transcription",
    description="Upload an audio file or record audio to get the transcription.",
    css=css
)

# Launch the Gradio app
interface.launch()
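
# Note: gr.Interface.launch() also accepts options such as share=True (public
# link) or server_port=<port> (fixed port) if the defaults are not suitable,
# e.g. interface.launch(share=True, server_port=7860).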