tomriddle committed on
Commit
6c4f574
1 Parent(s): 4af4e17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -56
app.py CHANGED
@@ -1,71 +1,22 @@
1
  import pathlib
2
- from faster_whisper import WhisperModel
3
  import yt_dlp
4
  import uuid
5
  import os
6
  import gradio as gr
7
  from tqdm import tqdm
8
 
9
# List of all supported video sites: https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md
def download_convert_video_to_audio(
    yt_dlp,
    video_url: str,
    destination_path: pathlib.Path,
) -> None:
    """Download *video_url* with yt-dlp and extract its audio track to MP3.

    Args:
        yt_dlp: The yt-dlp module (passed in, so it can be stubbed in tests).
        video_url: URL of the video on any yt-dlp-supported site.
        destination_path: Output path stem; yt-dlp appends the extension,
            so the final file is ``f"{destination_path}.mp3"``.

    Raises:
        Exception: Re-raises whatever yt-dlp raises during download.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {  # Extract audio using ffmpeg
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
            }
        ],
        "outtmpl": f"{destination_path}.%(ext)s",
    }
    try:
        print(f"Downloading video from {video_url}")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download(video_url)
        print(f"Downloaded video from {video_url} to {destination_path}")
    except Exception:
        # Bare re-raise preserves the original traceback; the previous
        # ``raise (e)`` re-raised the caught object, adding a redundant frame.
        raise
32
 
33
def segment_to_dict(segment):
    """Convert a faster-whisper Segment namedtuple into a plain dict.

    The nested ``words`` entries (namedtuples themselves) are converted to
    dicts as well; a ``None`` words field is passed through unchanged.
    """
    result = segment._asdict()
    words = result["words"]
    if words is not None:
        result["words"] = [w._asdict() for w in words]
    return result
38
-
39
def download_video(video_url: str):
    """Download the audio of *video_url* to a randomly-named MP3 in the CWD."""
    destination = f"{uuid.uuid4().hex}"
    download_convert_video_to_audio(yt_dlp, video_url, destination)
41
-
42
def transcribe_video(video_url: str, word_timestamps: bool = True, model_size: str = "tiny"):
    """Transcribe the audio track of a video URL with faster-whisper.

    Downloads the audio as a temporary MP3 (random hex name), runs the
    Whisper model on CPU with int8 quantization, removes the MP3, and
    returns the transcription segments as a list of dicts.
    """
    print(word_timestamps)
    print("loading model")
    # CPU/int8 keeps the footprint small; on GPU use device="cuda",
    # compute_type="float16" instead.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    print("getting hex")
    audio_id = uuid.uuid4().hex
    print("doing download")
    download_convert_video_to_audio(yt_dlp, video_url, f"{audio_id}")
    segments, info = model.transcribe(
        f"{audio_id}.mp3", beam_size=5, word_timestamps=word_timestamps
    )
    segments = [segment_to_dict(seg) for seg in segments]
    total_duration = round(info.duration, 2)  # Same precision as the Whisper timestamps.
    print(info)
    os.remove(f"{audio_id}.mp3")
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
    print(segments)
    return segments
59
 
60
- # print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
61
-
62
- # for segment in segments:
63
- # print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
64
 
65
# Gradio UI: video URL plus transcription options in, plain text out.
demo = gr.Interface(
    fn=transcribe_video,
    inputs=[
        gr.Textbox(label="Video URL"),
        gr.Checkbox(label="Word Timestamps", info="Do you want word timestamps in the response?"),
        gr.Dropdown(label="Model", value="tiny", choices=["tiny", "base", "small"]),
    ],
    outputs="text",
)

demo.launch()
 
1
  import pathlib
 
2
  import yt_dlp
3
  import uuid
4
  import os
5
  import gradio as gr
6
  from tqdm import tqdm
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def transcribe_video(d_id_key: str, elv_key: str, full_text: str):
    """Placeholder endpoint for the talking-photo pipeline.

    Args:
        d_id_key: D-ID API key (not used yet).
        elv_key: Elevenlabs API key (not used yet).
        full_text: Text the generated photo should speak (not used yet).

    Returns:
        A work-in-progress marker string; the actual D-ID/Elevenlabs
        integration is not implemented.
    """
    print("Test")
    return "Work In Progress"
14
 
 
 
 
 
15
 
16
# Gradio UI: two API keys (masked inputs) plus the text to speak; text out.
demo = gr.Interface(
    fn=transcribe_video,
    inputs=[
        gr.Textbox(label="D-Id API Key", placeholder="Paste your D-Id", type='password'),
        gr.Textbox(label="Elevenlabs API Keys", placeholder="Paste Elevenlabs", type='password'),
        gr.Textbox(
            lines=4,
            label=" Please input the text you wish to generate in order to make the photo speak.",
            placeholder="English Text here",
        ),
    ],
    outputs="text",
)

demo.launch()