Spaces:

Harveenchadha
/

hindi-speech-recognition-vakyansh-wav2vec2

Runtime error

App Files Files Community

Harveenchadha commited on Feb 6, 2023

Commit

5b6d82c

•

1 Parent(s): 07d59d1

Refactored Code

Browse files

Files changed (1) hide show

app.py +18 -51

app.py CHANGED Viewed

@@ -2,97 +2,64 @@ import soundfile as sf
 import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
 import gradio as gr
-import scipy.signal as sps
 import sox
 import subprocess
-def convert(inputfile, outfile):
-    sox_tfm = sox.Transformer()
-    sox_tfm.set_output_format(
-        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
-    )
-    #print(this is not done)
-    sox_tfm.build(inputfile, outfile)
-def read_file(wav):
-    sample_rate, signal = wav
-    signal = signal.mean(-1)
-    number_of_samples = round(len(signal) * float(16000) / sample_rate)
-    resampled_signal = sps.resample(signal, number_of_samples)
-    return resampled_signal
 def resampler(input_file_path, output_file_path):
-    #output_file_path = output_folder_path + input_file_path.split('/')[-1]
     command = (
         f"ffmpeg -hide_banner -loglevel panic -i {input_file_path} -ar 16000 -ac 1 -bits_per_raw_sample 16 -vn "
         f"{output_file_path}"
     )
     subprocess.call(command, shell=True)
-def parse_transcription_with_lm(wav_file):
-    input_values = read_file_and_process(wav_file)
-    # with torch.no_grad():
-    #     logits = model(**input_values).logits[0].cpu().numpy()
-    # print(logits)
-    # int_result = processor_with_LM.decode(logits = logits, output_word_offsets=False,
-    #                                      beam_width=128
-    #                                      )
-    # print(int_result)
-    # transcription =  int_result.text.replace('<s>','')
-    with torch.no_grad():
-        logits = model(**input_values).logits
     result = processor_with_LM.batch_decode(logits.cpu().numpy())
     text = result.text
     transcription = text[0].replace('<s>','')
     return transcription
-def read_file_and_process(wav_file):
-    filename = wav_file.split('.')[0]
-    resampler(wav_file, filename + "16k.wav")
-    speech, _ = sf.read(filename + "16k.wav")
-    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
-    return inputs
 def parse(wav_file, applyLM):
     if applyLM:
         return parse_transcription_with_lm(wav_file)
     else:
         return parse_transcription(wav_file)
-def parse_transcription(wav_file):
-    input_values = read_file_and_process(wav_file)
-    with torch.no_grad():
-        logits = model(**input_values).logits
-    #logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
-    return transcription
 model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
 processor = Wav2Vec2Processor.from_pretrained(model_id)
 processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
 model = Wav2Vec2ForCTC.from_pretrained(model_id)
 input_ = gr.Audio(source="microphone", type="filepath")
-#input_ = gr.inputs.Audio(source="microphone", type="numpy")
 txtbox = gr.Textbox(
             label="Output from model will appear here:",
             lines=5
         )
 chkbox = gr.Checkbox(label="Apply LM", value=False)
 gr.Interface(parse, inputs = [input_, chkbox],  outputs=txtbox,
              streaming=True, interactive=True,
              analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False);

 import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
 import gradio as gr
 import sox
 import subprocess
+def read_file_and_process(wav_file):
+    filename = wav_file.split('.')[0]
+    filename_16k = filename + "16k.wav"
+    resampler(wav_file, filename_16k)
+    speech, _ = sf.read(filename_16k)
+    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
+    return inputs
 def resampler(input_file_path, output_file_path):
     command = (
         f"ffmpeg -hide_banner -loglevel panic -i {input_file_path} -ar 16000 -ac 1 -bits_per_raw_sample 16 -vn "
         f"{output_file_path}"
     )
     subprocess.call(command, shell=True)
+def parse_transcription_with_lm(logits):
     result = processor_with_LM.batch_decode(logits.cpu().numpy())
     text = result.text
     transcription = text[0].replace('<s>','')
     return transcription
+def parse_transcription(logits):
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
+    return transcription
 def parse(wav_file, applyLM):
+    input_values = read_file_and_process(wav_file)
+    with torch.no_grad():
+        logits = model(**input_values).logits
     if applyLM:
         return parse_transcription_with_lm(wav_file)
     else:
         return parse_transcription(wav_file)
 model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
 processor = Wav2Vec2Processor.from_pretrained(model_id)
 processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
 model = Wav2Vec2ForCTC.from_pretrained(model_id)
 input_ = gr.Audio(source="microphone", type="filepath")
 txtbox = gr.Textbox(
             label="Output from model will appear here:",
             lines=5
         )
 chkbox = gr.Checkbox(label="Apply LM", value=False)
 gr.Interface(parse, inputs = [input_, chkbox],  outputs=txtbox,
              streaming=True, interactive=True,
              analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False);