Jonathan Li committed on
Commit
5de9db4
·
1 Parent(s): 1ebc0dd

Fix url problem

Browse files
Files changed (1) hide show
  1. app.py +17 -4
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import requests
 
3
  from transformers import AutoTokenizer, pipeline
4
  from youtube_transcript_api._transcripts import TranscriptListFetcher
5
 
@@ -12,6 +13,14 @@ tokenizer = AutoTokenizer.from_pretrained("./checkpoint-6000")
12
  max_size = 512
13
  classes = [False, True]
14
 
 
 
 
 
 
 
 
 
15
  def process(obj):
16
  o = obj["events"]
17
  new_l = []
@@ -65,8 +74,12 @@ def get_transcript(video_id, session):
65
  p = process(obj.json())
66
  return p
67
 
68
- def transcript(video_id):
69
- return " ".join(l["w"].strip() for l in get_transcript(video_id, requests.Session()))
 
 
 
 
70
 
71
  def inference(transcript):
72
  tokens = tokenizer(transcript.split(" "))["input_ids"]
@@ -119,9 +132,9 @@ def predict(transcript):
119
  with gr.Blocks() as demo:
120
  with gr.Row():
121
  with gr.Column():
122
- inp = gr.Textbox(label="Video ID or URL", placeholder="Video id", lines=1, max_lines=1)
123
  btn = gr.Button("Fetch Transcript")
124
- gr.Examples(["xsLJZyih3Ac"], [inp])
125
  text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
126
  btn.click(fn=transcript, inputs=inp, outputs=text)
127
  with gr.Column():
 
1
  import gradio as gr
2
  import requests
3
+ import re
4
  from transformers import AutoTokenizer, pipeline
5
  from youtube_transcript_api._transcripts import TranscriptListFetcher
6
 
 
13
  max_size = 512
14
  classes = [False, True]
15
 
16
# Matches YouTube watch/embed/short links and captures the 11-character video
# id. The subdomain class is written with upper-case letters only, so the
# pattern must be compiled case-insensitively; otherwise common lowercase
# hosts such as "www.youtube.com" or "m.youtube.com" never match.
pattern = re.compile(
    r"(?:https?:\/\/)?(?:[0-9A-Z-]+\.)?(?:youtube|youtu|youtube-nocookie)\.(?:com|be)\/(?:watch\?v=|watch\?.+&v=|embed\/|v\/|.+\?v=)?([^&=\n%\?]{11})",
    re.IGNORECASE,
)


def video_id(url):
    """Extract the 11-character YouTube video id from a URL.

    Args:
        url: A YouTube link, with or without scheme and subdomain
            (e.g. ``"youtu.be/<id>"`` or
            ``"https://www.youtube.com/watch?v=<id>"``). Surrounding
            whitespace is ignored.

    Returns:
        The video id string, or ``None`` when ``url`` is not a string or
        does not look like a YouTube link.
    """
    if not isinstance(url, str):
        return None
    # The pattern is anchored at the start of the string, so strip pasted
    # whitespace before matching.
    found = pattern.match(url.strip())
    return found.group(1) if found else None
23
+
24
  def process(obj):
25
  o = obj["events"]
26
  new_l = []
 
74
  p = process(obj.json())
75
  return p
76
 
77
def transcript(url):
    """Fetch the transcript for a YouTube URL as a single string.

    Args:
        url: The YouTube link entered by the user.

    Returns:
        The stripped ``"w"`` field of every transcript entry, joined with
        single spaces, or an ``"ERROR: ..."`` message when no video id
        can be extracted from ``url``.
    """
    i = video_id(url)
    if not i:
        return "ERROR: Failed to load transcript (is the link a valid YouTube URL?)..."
    # Use the session as a context manager so its pooled connections are
    # released once the transcript has been fetched.
    with requests.Session() as session:
        # NOTE(review): assumes each entry from get_transcript is a mapping
        # with a string under "w" — confirm against process().
        return " ".join(l["w"].strip() for l in get_transcript(i, session))
83
 
84
  def inference(transcript):
85
  tokens = tokenizer(transcript.split(" "))["input_ids"]
 
132
  with gr.Blocks() as demo:
133
  with gr.Row():
134
  with gr.Column():
135
+ inp = gr.Textbox(label="Video URL", placeholder="Video URL", lines=1, max_lines=1)
136
  btn = gr.Button("Fetch Transcript")
137
+ gr.Examples(["youtu.be/xsLJZyih3Ac"], [inp])
138
  text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
139
  btn.click(fn=transcript, inputs=inp, outputs=text)
140
  with gr.Column():