HakimHa committed on
Commit
b2370b0
·
1 Parent(s): 1491160

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -6
app.py CHANGED
@@ -3,6 +3,7 @@ from PIL import Image
3
  from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
4
  import soundfile as sf
5
  import torch
 
6
 
7
  model_name_or_path = "microsoft/DialoGPT-large"
8
 
@@ -16,7 +17,7 @@ model = AutoModelForCausalLM.from_pretrained(
16
  trust_remote_code=True,
17
  )
18
 
19
- p = pipeline("automatic-speech-recognition")
20
 
21
  # Function to handle text input
22
  def handle_text(text):
@@ -31,16 +32,30 @@ def handle_image(img):
31
  return "This image seems nice!"
32
 
33
  # Function to handle audio input
34
- def handle_audio(audio):
35
- # Gradio's Audio component returns a tuple of (sample_rate, audio_data)
36
- text = p(audio)["text"]
37
- return text
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def chatbot(text, img, audio):
41
  text_output = handle_text(text) if text is not None else ''
42
  img_output = handle_image(img) if img is not None else ''
43
- audio_output = handle_text(audio) if audio is not None else ''
44
 
45
  outputs = [o for o in [text_output, img_output, audio_output] if o]
46
  return "\n".join(outputs)
 
3
  from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
4
  import soundfile as sf
5
  import torch
6
+ from speech_recognition import AudioFile, Recognizer
7
 
8
  model_name_or_path = "microsoft/DialoGPT-large"
9
 
 
17
  trust_remote_code=True,
18
  )
19
 
20
+
21
 
22
  # Function to handle text input
23
  def handle_text(text):
 
32
  return "This image seems nice!"
33
 
34
  # Function to handle audio input
35
def stt(audio: object, language: str = "en-US") -> str:
    """Convert recorded speech to text.

    Args:
        audio: Recording of the user's speech. It is handed unchanged to
            ``AudioFile``, so it is assumed to be a file path or file-like
            object that ``AudioFile`` accepts -- TODO confirm against the
            caller's audio input type.
        language: Language tag forwarded to ``Recognizer.recognize_google``.
            Defaults to ``"en-US"`` so callers that pass only ``audio``
            keep working instead of raising ``TypeError``.

    Returns:
        The recognized speech of the user.
    """
    recognizer = Recognizer()
    # Load the whole recording into memory before transcribing.
    with AudioFile(audio) as source:
        audio_data = recognizer.record(source)
    # Transcribe the audio using Google's speech-to-text API.
    return recognizer.recognize_google(audio_data, language=language)
54
 
55
  def chatbot(text, img, audio):
56
  text_output = handle_text(text) if text is not None else ''
57
  img_output = handle_image(img) if img is not None else ''
58
+ audio_output = handle_text(stt(audio)) if audio is not None else ''
59
 
60
  outputs = [o for o in [text_output, img_output, audio_output] if o]
61
  return "\n".join(outputs)