versae committed on
Commit
ea81981
1 Parent(s): c3e0f72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -35
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
 
5
  import gradio as gr
6
  import pytube as pt
 
7
  from transformers import pipeline
8
  from huggingface_hub import model_info
9
 
@@ -12,33 +13,39 @@ lang = "fi"
12
 
13
  share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
14
  auth_token = os.environ.get("AUTH_TOKEN") or True
15
- device = 0 if torch.cuda.is_available() else "cpu"
16
- pipe = pipeline(
17
- task="automatic-speech-recognition",
18
- model=MODEL_NAME,
19
- chunk_length_s=30,
20
- device=device,
21
- token=auth_token,
22
- )
23
-
24
- pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
25
-
26
- def transcribe(microphone, file_upload):
27
- warn_output = ""
28
- if (microphone is not None) and (file_upload is not None):
29
- warn_output = (
30
- "WARNING: You've uploaded an audio file and used the microphone. "
31
- "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
32
- )
33
-
34
- elif (microphone is None) and (file_upload is None):
35
- return "ERROR: You have to either use the microphone or upload an audio file"
36
-
37
- file = microphone if microphone is not None else file_upload
38
-
39
- text = pipe(file)["text"]
40
-
41
- return warn_output + text
 
 
 
 
 
 
42
 
43
 
44
  def _return_yt_html_embed(yt_url):
@@ -50,13 +57,13 @@ def _return_yt_html_embed(yt_url):
50
  return HTML_str
51
 
52
 
53
- def yt_transcribe(yt_url):
54
  yt = pt.YouTube(yt_url)
55
  html_embed_str = _return_yt_html_embed(yt_url)
56
  stream = yt.streams.filter(only_audio=True)[0]
57
  stream.download(filename="audio.mp3")
58
 
59
- text = pipe("audio.mp3")["text"]
60
 
61
  return html_embed_str, text
62
 
@@ -66,11 +73,10 @@ demo = gr.Blocks()
66
  mf_transcribe = gr.Interface(
67
  fn=transcribe,
68
  inputs=[
69
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
70
- gr.inputs.Audio(source="upload", type="filepath", optional=True),
71
  ],
72
  outputs="text",
73
- layout="horizontal",
74
  theme="huggingface",
75
  title="Whisper Demo: Transcribe Audio",
76
  description=(
@@ -83,10 +89,12 @@ mf_transcribe = gr.Interface(
83
 
84
  yt_transcribe = gr.Interface(
85
  fn=yt_transcribe,
86
- inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
 
 
 
87
  examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
88
  outputs=["html", "text"],
89
- layout="horizontal",
90
  theme="huggingface",
91
  title="Whisper Demo: Transcribe YouTube",
92
  description=(
@@ -100,4 +108,4 @@ yt_transcribe = gr.Interface(
100
  with demo:
101
  gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
102
 
103
- demo.launch(enable_queue=True, share=True)
 
4
 
5
  import gradio as gr
6
  import pytube as pt
7
+ import spaces
8
  from transformers import pipeline
9
  from huggingface_hub import model_info
10
 
 
13
 
14
  share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
15
  auth_token = os.environ.get("AUTH_TOKEN") or True
16
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
17
+ print(f"Using device: {device}")
18
+
19
@spaces.GPU(duration=120)
def pipe(file, return_timestamps=False):
    """Run Whisper speech recognition on an audio file.

    A fresh transformers pipeline is constructed on every call so that all
    heavy model work happens inside the ``spaces.GPU`` execution context
    (ZeroGPU allocates the device per decorated call).

    Args:
        file: Path to the audio file to transcribe.
        return_timestamps: When True, request timestamped chunks from the
            model instead of plain text.

    Returns:
        The raw pipeline output dict: ``{"text": ...}`` or, with
        timestamps, ``{"chunks": [...]}``.
    """
    # MODEL_NAME, device, auth_token and lang are module-level globals
    # defined earlier in this file.
    recognizer = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=30,
        device=device,
        token=auth_token,
    )
    # Force decoding in the configured language; suppress timestamp tokens
    # unless the caller asked for them.
    prompt_ids = recognizer.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    recognizer.model.config.forced_decoder_ids = prompt_ids
    return recognizer(file, return_timestamps=return_timestamps)
35
+
36
def transcribe(file, return_timestamps=False):
    """Transcribe an audio file, optionally prefixing each chunk with times.

    Args:
        file: Path to the audio file to transcribe.
        return_timestamps: When True, return one line per chunk formatted
            as ``[HH:MM:SS -> HH:MM:SS] text``; otherwise plain text.

    Returns:
        The transcription as a single string.
    """
    # NOTE(review): relies on `import time` at the top of the file — confirm.
    if not return_timestamps:
        return pipe(file)["text"]

    def _hms(seconds):
        # Open-ended chunks carry a None timestamp; render a placeholder.
        if seconds is None:
            return "??:??:??"
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    chunks = pipe(file, return_timestamps=True)["chunks"]
    lines = [
        f"[{_hms(chunk['timestamp'][0])} -> {_hms(chunk['timestamp'][1])}] {chunk['text']}"
        for chunk in chunks
    ]
    return "\n".join(lines)
49
 
50
 
51
  def _return_yt_html_embed(yt_url):
 
57
  return HTML_str
58
 
59
 
60
def yt_transcribe(yt_url, return_timestamps=False):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        yt_url: URL of the YouTube video.
        return_timestamps: Forwarded to ``transcribe``.

    Returns:
        Tuple of (HTML embed snippet for the video, transcription text).
    """
    video = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    # Grab the first audio-only stream and save it to a fixed local path.
    audio_stream = video.streams.filter(only_audio=True)[0]
    audio_stream.download(filename="audio.mp3")
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
69
 
 
73
  mf_transcribe = gr.Interface(
74
  fn=transcribe,
75
  inputs=[
76
+ gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
77
+ # gr.components.Checkbox(label="Return timestamps"),
78
  ],
79
  outputs="text",
 
80
  theme="huggingface",
81
  title="Whisper Demo: Transcribe Audio",
82
  description=(
 
89
 
90
  yt_transcribe = gr.Interface(
91
  fn=yt_transcribe,
92
+ inputs=[
93
+ gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
94
+ # gr.components.Checkbox(label="Return timestamps"),
95
+ ],
96
  examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
97
  outputs=["html", "text"],
 
98
  theme="huggingface",
99
  title="Whisper Demo: Transcribe YouTube",
100
  description=(
 
108
# Assemble the two-tab UI and start the server.
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

# BUG FIX: the queue must be enabled BEFORE launching. `launch()` blocks
# the main thread and returns server handles, so chaining `.queue()` after
# it (`demo.launch(...).queue()`) never enables request queuing.
# NOTE(review): the module computes a `share` env flag earlier but True is
# hard-coded here — confirm that is intentional.
demo.queue().launch(share=True)