HakimHa committed on
Commit
1491160
·
1 Parent(s): 1b1c058

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -11
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  from PIL import Image
3
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCausalLM, AutoTokenizer
4
  import soundfile as sf
5
  import torch
6
 
@@ -16,9 +16,7 @@ model = AutoModelForCausalLM.from_pretrained(
16
  trust_remote_code=True,
17
  )
18
 
19
- # Load pre-trained model and processor for Wav2Vec2
20
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
21
- wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
22
 
23
  # Function to handle text input
24
  def handle_text(text):
@@ -35,18 +33,14 @@ def handle_image(img):
35
  # Function to handle audio input
36
  def handle_audio(audio):
37
  # Gradio's Audio component returns a tuple of (sample_rate, audio_data)
38
- sample_rate, audio_data = audio
39
- input_values = wav2vec2_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_values
40
- logits = wav2vec2_model(input_values).logits
41
- predicted_ids = torch.argmax(logits, dim=-1)
42
- transcriptions = wav2vec2_processor.decode(predicted_ids[0])
43
- return handle_text(transcriptions)
44
 
45
 
46
  def chatbot(text, img, audio):
47
  text_output = handle_text(text) if text is not None else ''
48
  img_output = handle_image(img) if img is not None else ''
49
- audio_output = handle_audio(audio) if audio is not None else ''
50
 
51
  outputs = [o for o in [text_output, img_output, audio_output] if o]
52
  return "\n".join(outputs)
 
1
  import gradio as gr
2
  from PIL import Image
3
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
4
  import soundfile as sf
5
  import torch
6
 
 
16
  trust_remote_code=True,
17
  )
18
 
19
+ p = pipeline("automatic-speech-recognition")
 
 
20
 
21
  # Function to handle text input
22
  def handle_text(text):
 
33
  # Function to handle audio input
34
  def handle_audio(audio):
35
  # Gradio's Audio component returns a tuple of (sample_rate, audio_data)
36
+ text = p(audio)["text"]
37
+ return text
 
 
 
 
38
 
39
 
40
  def chatbot(text, img, audio):
41
  text_output = handle_text(text) if text is not None else ''
42
  img_output = handle_image(img) if img is not None else ''
43
+ audio_output = handle_text(audio) if audio is not None else ''
44
 
45
  outputs = [o for o in [text_output, img_output, audio_output] if o]
46
  return "\n".join(outputs)