Spaces:

HakimHa
/

wanderJoy

Runtime error

HakimHa committed on Jul 20, 2023

Commit

b2370b0

1 Parent(s): 1491160

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from PIL import Image
 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 import soundfile as sf
 import torch
 model_name_or_path = "microsoft/DialoGPT-large"
@@ -16,7 +17,7 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
-p = pipeline("automatic-speech-recognition")
 # Function to handle text input
 def handle_text(text):
@@ -31,16 +32,30 @@ def handle_image(img):
     return "This image seems nice!"
 # Function to handle audio input
-def handle_audio(audio):
-    # Gradio's Audio component returns a tuple of (sample_rate, audio_data)
-    text = p(audio)["text"]
-    return text
 def chatbot(text, img, audio):
     text_output = handle_text(text) if text is not None else ''
     img_output = handle_image(img) if img is not None else ''
-    audio_output = handle_text(audio) if audio is not None else ''
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)

 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 import soundfile as sf
 import torch
+from speech_recognition import AudioFile, Recognizer
 model_name_or_path = "microsoft/DialoGPT-large"
     trust_remote_code=True,
 )
 # Function to handle text input
 def handle_text(text):
     return "This image seems nice!"
 # Function to handle audio input
+def stt(audio: object, language: str) -> str:
+    """Converts speech to text.
+    Args:
+        audio: record of user speech
+    Returns:
+        text (str): recognized speech of user
+    """
+    # Create a Recognizer object
+    r = Recognizer()
+    # Open the audio file
+    with AudioFile(audio) as source:
+        # Listen for the data (load audio to memory)
+        audio_data = r.record(source)
+        # Transcribe the audio using Google's speech-to-text API
+        text = r.recognize_google(audio_data, language=language)
+    return text
 def chatbot(text, img, audio):
     # Build one reply per supplied input; an input that is None contributes
     # an empty string and is dropped from the final answer below.
     text_output = handle_text(text) if text is not None else ''
     img_output = handle_image(img) if img is not None else ''
     # Audio is transcribed by stt() first and the transcript is then handled
     # like typed text.
     # NOTE(review): stt is called with the audio only; no language is passed.
+    audio_output = handle_text(stt(audio)) if audio is not None else ''
     # Keep only the non-empty replies and return them one per line.
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)