Spaces:

aware-ai
/

german-asr

Runtime error

App Files Files Community

flozi00 committed on Jun 29, 2022

Commit

2302e12

•

1 Parent(s): 7f06476

Update app.py (#2)

Browse files

- Update app.py (36e1b68705d8256a83d756334b956b74ec0e6bb1)
- Update requirements.txt (8d8b21e49c1b2d0b7007e37e4d294fbeb1846351)

Files changed (2) hide show

app.py +83 -43
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,49 +1,89 @@
 from transformers import pipeline
 import gradio as gr
-from pyctcdecode import BeamSearchDecoderCTC
-#lmID = "aware-ai/german-lowercase-wiki-5gram"
-#decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
-p = pipeline("automatic-speech-recognition", model="aware-ai/wav2vec2-xls-r-1b-5gram-german")
-ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
-def transcribe(audio):
-    transcribed = p(audio, chunk_length_s=16, stride_length_s=(4, 0))["text"]
-    return transcribed
-def punctuate(transcribed):
-    punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
-    return punctuated
-def get_asr_interface():
-    return gr.Interface(
-        fn=transcribe,
-        inputs=[
-            gr.inputs.Audio(source="microphone", type="filepath")
-        ],
-        outputs=[
-            "textbox",
-        ])
-def get_punctuation_interface():
-    return gr.Interface(
-        fn=punctuate,
-        inputs=[
-            "textbox",
-        ],
-        outputs=[
-            "textbox",
-        ])
-interfaces = [
-    get_asr_interface(),
-    get_punctuation_interface(),
-]
-names = [
-    "ASR",
-    "GRAMMAR",
-]
-gr.TabbedInterface(interfaces, names).launch(server_name = "0.0.0.0", enable_queue=False)

 from transformers import pipeline
+import torch
 import gradio as gr
+import subprocess
+import numpy as np
+import time
+# Speech-recognition pipeline backed by the "aware-ai/wav2vec2-base-german" checkpoint.
+p = pipeline(
+    "automatic-speech-recognition",
+    model="aware-ai/wav2vec2-base-german",
+)
+# Silero VAD model and its helper bundle, fetched through torch.hub (ONNX variant,
+# reusing any cached copy because force_reload is False).
+model, utils = torch.hub.load(
+    repo_or_dir="snakers4/silero-vad",
+    model="silero_vad",
+    force_reload=False,
+    onnx=True,
+)
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
+    """
+    Decode an in-memory audio payload by piping it through the ffmpeg CLI.
+
+    The payload is written to ffmpeg's stdin ("pipe:0"); ffmpeg is asked to
+    emit single-channel ("-ac 1") raw 32-bit little-endian float samples
+    ("-f f32le") at ``sampling_rate`` Hz on stdout ("pipe:1"), with its
+    banner and logging suppressed.
+
+    Returns:
+        A 1-D float32 numpy array built from ffmpeg's stdout bytes.
+
+    Raises:
+        ValueError: if the ffmpeg executable cannot be found, or if ffmpeg
+            produced no samples for the payload.
+    """
+    command = [
+        "ffmpeg",
+        "-i", "pipe:0",
+        "-ac", "1",
+        "-ar", f"{sampling_rate}",
+        "-f", "f32le",
+        "-hide_banner",
+        "-loglevel", "quiet",
+        "pipe:1",
+    ]
+    try:
+        with subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as process:
+            # communicate() feeds the payload, waits for exit and returns (stdout, stderr).
+            stdout_bytes, _ = process.communicate(bpayload)
+    except FileNotFoundError as error:
+        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
+    samples = np.frombuffer(stdout_bytes, np.float32)
+    # No decoded samples means ffmpeg could not make sense of the payload.
+    if samples.shape[0] == 0:
+        raise ValueError("Malformed soundfile")
+    return samples
+# Keep the first and third helpers from the Silero bundle; every other entry is discarded into `_`.
+get_speech_timestamps, _, read_audio, *_ = utils
+def is_speech(wav, sr):
+    """
+    Tell whether the VAD helper finds any speech in ``wav``.
+
+    ``wav`` and ``sr`` are forwarded to ``get_speech_timestamps`` together with
+    the module-level ``model``; the result is True when at least one timestamp
+    entry comes back.
+    """
+    segments = get_speech_timestamps(wav, model, sampling_rate=sr)
+    return len(segments) > 0
+def transcribe(audio, state=None):
+    """
+    Streaming callback: buffer chunks that contain speech, transcribe on silence.
+
+    Args:
+        audio: Path of the audio chunk to process; the file is read in binary
+            mode and decoded with ``ffmpeg_read`` at 16 kHz.
+        state: Dict carried between calls with the keys "text" (accumulated
+            transcript), "temp_text" (text of the utterance being flushed) and
+            "audio" (buffered samples, or "" when nothing is buffered).
+            ``None`` starts from an empty state.
+
+    Returns:
+        A tuple ``(display_text, state)`` where ``display_text`` is
+        ``'<text> ( <temp_text> )'`` and ``state`` is the updated dict.
+    """
+    if state is None:
+        # Build the dict per call: a dict used as the parameter default would be
+        # one shared object mutated by every caller that omits ``state``.
+        state = {"text": "", "temp_text": "", "audio": ""}
+    with open(audio, "rb") as f:
+        payload = f.read()
+    _sr = 16000
+    # Bind the decoded samples to the name the rest of the function reads; the
+    # previous code assigned them to ``audio`` and then used an undefined ``wav_data``.
+    wav_data = ffmpeg_read(payload, sampling_rate=_sr)
+    # "" is the empty-buffer marker. Check the type rather than `is ""` (identity
+    # of a literal is an interpreter detail) or `== ""` (elementwise on an ndarray).
+    has_buffered_audio = not isinstance(state["audio"], str)
+    if is_speech(wav_data, _sr):
+        if has_buffered_audio:
+            state["audio"] = np.concatenate((state["audio"], wav_data))
+        else:
+            state["audio"] = wav_data
+    else:
+        # Silence: transcribe whatever was buffered, append it to the transcript
+        # and reset the buffer for the next utterance.
+        if has_buffered_audio:
+            state["temp_text"] = p(state["audio"])["text"] + "\n"
+        state["text"] += state["temp_text"]
+        state["temp_text"] = ""
+        state["audio"] = ""
+    # NOTE(review): fixed half-second pause kept from the original; presumably it
+    # throttles the live stream - confirm it is still wanted.
+    time.sleep(0.5)
+    return f'{state["text"]} ( {state["temp_text"]} )', state
+# Live interface: streamed microphone chunks arrive as file paths and are passed to
+# `transcribe` together with the "state" value, which is also returned as an output.
+gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(source="microphone", type="filepath", streaming=True),
+        "state",
+    ],
+    outputs=[
+        gr.Textbox(),
+        "state",
+    ],
+    live=True,
+).launch(server_name="0.0.0.0")

requirements.txt CHANGED Viewed

@@ -5,4 +5,6 @@ sentencepiece
 librosa
 torchaudio
 pyctcdecode
-https://github.com/kpu/kenlm/archive/master.zip

 librosa
 torchaudio
 pyctcdecode
+https://github.com/kpu/kenlm/archive/master.zip
+onnx
+onnxruntime