englissi committed on
Commit
bb16e26
•
1 Parent(s): 1c30f5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -1,27 +1,29 @@
1
  import gradio as gr
2
- import os
3
- import time # To add a short delay
4
-
5
- # Force install torch (though this should ideally be handled in requirements.txt)
6
- os.system("pip install torch")
7
- time.sleep(10) # Adding a short delay to ensure installation completes
8
-
9
  import torch
10
- from transformers import pipeline
11
-
12
- # Initialize the ASR pipeline for Bulgarian
13
- # Load model directly
14
  from transformers import AutoProcessor, AutoModelForCTC
 
15
 
 
16
  processor = AutoProcessor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bulgarian")
17
  model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bulgarian")
18
 
19
 
20
  # ASR 변환 함수 (speech-to-text conversion)
21
  def asr_generate(audio):
22
- transcription = asr_pipeline(audio)["text"]
 
 
 
 
 
 
 
 
 
 
23
  return transcription
24
 
 
25
  # Gradio 인터페이스 생성 (create the Gradio interface)
26
  iface = gr.Interface(
27
  fn=asr_generate,
@@ -33,4 +35,4 @@ iface = gr.Interface(
33
 
34
  # μΈν„°νŽ˜μ΄μŠ€ μ‹€ν–‰
35
  if __name__ == "__main__":
36
- iface.launch()
 
1
  import gradio as gr
 
 
 
 
 
 
 
2
  import torch
 
 
 
 
3
  from transformers import AutoProcessor, AutoModelForCTC
4
+ import soundfile as sf # To handle audio input
5
 
6
+ # Load the processor and model directly for Bulgarian ASR
7
  processor = AutoProcessor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bulgarian")
8
  model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bulgarian")
9
 
10
 
11
  # ASR 변환 함수 (speech-to-text conversion)
12
  def asr_generate(audio):
13
+ # Load and process the audio file
14
+ speech, _ = sf.read(audio)
15
+ inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
16
+
17
+ with torch.no_grad():
18
+ logits = model(**inputs).logits
19
+
20
+ # Get predicted IDs and decode the text
21
+ predicted_ids = torch.argmax(logits, dim=-1)
22
+ transcription = processor.batch_decode(predicted_ids)[0]
23
+
24
  return transcription
25
 
26
+
27
  # Gradio 인터페이스 생성 (create the Gradio interface)
28
  iface = gr.Interface(
29
  fn=asr_generate,
 
35
 
36
  # μΈν„°νŽ˜μ΄μŠ€ μ‹€ν–‰
37
  if __name__ == "__main__":
38
+ iface.launch()