Spaces:
Running
Running
import logging | |
import os | |
import tempfile | |
from typing import Iterator | |
from pysrt import SubRipFile, SubRipItem, SubRipTime | |
from pytubefix import YouTube | |
from transcriber import TranscribeResult | |
logger = logging.getLogger(__name__) | |
def download_youtube_audio(video_id: str) -> str: | |
""" | |
Download audio from YouTube video. | |
Args: | |
video_id (str): YouTube video ID. | |
Returns: | |
str: Path to the downloaded audio file. | |
""" | |
urls = "https://www.youtube.com/watch?v={}".format(video_id) | |
try: | |
# https://github.com/JuanBindez/pytubefix/issues/242#issuecomment-2369067929 | |
vid = YouTube(urls, "MWEB") | |
if vid.title is None: | |
return None | |
audio_download = vid.streams.get_audio_only() | |
audio_download.download( | |
mp3=True, | |
filename=video_id, | |
output_path=tempfile.gettempdir(), | |
skip_existing=True, | |
) | |
audio_file = tempfile.gettempdir() + "/" + video_id + ".mp3" | |
return audio_file | |
except Exception as e: | |
print(e) | |
return None | |
def to_srt(results: Iterator["TranscribeResult"]) -> str: | |
""" | |
Convert the list of TranscribeResult objects into a SRT file | |
""" | |
srt = SubRipFile() | |
for i, t in enumerate(results): | |
start = SubRipTime(seconds=t.start_time) | |
end = SubRipTime(seconds=t.end_time) | |
item = SubRipItem(index=i, start=start, end=end, text=t.text) | |
srt.append(item) | |
temp_file = tempfile.gettempdir() + "/output.srt" | |
srt.save(temp_file) | |
with open(temp_file, "r", encoding="utf-8") as f: | |
srt_text = f.read() | |
os.remove(temp_file) | |
return srt_text | |