Baghdad99 committed on
Commit 5dbc4ea
1 Parent(s): 563f027

Update app.py

Files changed (1)
  1. app.py +10 -24
app.py CHANGED
@@ -1,15 +1,12 @@
+import torch # Add this line
 import gradio as gr
-from transformers import pipeline, AutoTokenizer
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline, AutoTokenizer
 import numpy as np
-from pydub import AudioSegment
 import librosa
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-
-# Load the model and processor
-model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
-processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
-

+# Load the models and processors
+asr_model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
+asr_processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")

@@ -18,27 +15,17 @@ def translate_speech(audio_input):
     audio_data, sample_rate = librosa.load(audio_input, sr=None)

     # Prepare the input dictionary
-    input_dict = processor(audio_data, return_tensors="pt", padding=True)
+    input_dict = asr_processor(audio_data, return_tensors="pt", padding=True)

-    # Use the model to get the logits
-    logits = model(input_dict.input_values.to("cpu")).logits
+    # Use the ASR model to get the logits
+    logits = asr_model(input_dict.input_values.to("cpu")).logits

     # Get the predicted IDs
     pred_ids = torch.argmax(logits, dim=-1)[0]

     # Decode the predicted IDs to get the transcription
-    transcription = processor.decode(pred_ids)
-
-    # Use the speech recognition pipeline to transcribe the audio
-    output = pipe(audio_data)
-
-    # Check if the output contains 'text'
-    if 'text' in output:
-        transcription = output["text"]
-        print(f"Transcription: {transcription}") # Print the transcription
-    else:
-        print("The output does not contain 'text'")
-        return
+    transcription = asr_processor.decode(pred_ids)
+    print(f"Transcription: {transcription}") # Print the transcription

     # Use the translation pipeline to translate the transcription
     translated_text = translator(transcription, return_tensors="pt")
@@ -71,7 +58,6 @@ def translate_speech(audio_input):

     return 16000, synthesised_speech

-
# Define the Gradio interface
iface = gr.Interface(
    fn=translate_speech,
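
For context, here is a minimal standalone sketch of the pipeline this commit wires together (ASR, then translation, then TTS), runnable outside Gradio. The model names come from the diff; everything else is an assumption not shown in the commit: the 16 kHz resampling and sampling_rate argument (XLSR/Wav2Vec2 checkpoints are trained on 16 kHz audio, while the committed code loads at the native rate with sr=None), and the output handling for the translation and TTS pipelines.

import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

asr_model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
asr_processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")

def hausa_speech_to_english_speech(audio_path):
    # Assumption: resample to 16 kHz, the rate Wav2Vec2/XLSR models expect;
    # the committed code keeps the file's native rate (sr=None)
    audio_data, _ = librosa.load(audio_path, sr=16000)

    # Transcribe the Hausa speech with the CTC model
    inputs = asr_processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = asr_model(inputs.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]
    transcription = asr_processor.decode(pred_ids)

    # Translate to English; text2text-generation returns [{"generated_text": ...}],
    # unlike the committed call, which requests raw token ids via return_tensors="pt"
    english_text = translator(transcription)[0]["generated_text"]

    # Synthesise English speech; the text-to-speech pipeline returns a dict
    # with "audio" (a numpy array) and "sampling_rate"
    speech = tts(english_text)
    return speech["sampling_rate"], speech["audio"]

If the TTS model's output rate is in fact 16 kHz, as the commit's "return 16000, synthesised_speech" suggests, returning speech["sampling_rate"] instead of a hard-coded value keeps the Gradio audio output in sync either way.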