Spaces:
Build error
Build error
import shutil | |
import sys | |
import time | |
from pathlib import Path | |
import anvil.server | |
import anvil.media | |
from whisper.utils import write_srt, write_vtt | |
from yt_dlp import YoutubeDL | |
from yt_dlp.utils import DownloadError | |
import os | |
import tempfile | |
import json | |
import argparse | |
import whisper | |
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE | |
import ffmpeg | |
from utils.subs import bake_subs, get_srt | |
from utils.utils import get_args | |
# Working directory at start-up; the generators' error handlers chdir back to it.
original_dir = os.getcwd()
# Root folder under which one working directory per video is created.
output_dir = Path('output')

# Runtime options come from the project's own argument helper.
args = get_args()
preload_model: bool = args.get("preload")
# An explicit "model" entry wins; otherwise fall back to the WHISPER_MODEL
# environment variable and finally to "large".
model_size: str = args.get("model", os.environ.get("WHISPER_MODEL", "large"))

if preload_model:
    # Load the whisper model once at import time so later calls can reuse it.
    print("Preloading model")
    model = whisper.load_model(model_size)
def download_generator(url, translate_action=True, source_language='Autodetect', corrected_subtitles=None):
    """Download a video, create subtitles for it and burn them into the video.

    This is a generator: it yields one dict per pipeline stage. Every dict
    carries a "message" key; depending on the stage it also carries "meta",
    "video", "audio", "whisper_result", "srt_path", "vtt_path" or "sub_video".

    :param url: URL of the video, handed to yt-dlp
    :param translate_action: Bool - forwarded to transcribe() and bake_subs()
    :param source_language: String - language forwarded to transcribe(), default is Autodetect
    :param corrected_subtitles: String - SRT text; when given it is written to the
        SRT file and the whisper step is skipped
    :raises Exception: failures while downloading, transcribing or baking are
        re-raised after their message has been yielded. A failed availability
        check only yields the error message and ends the generator.
    """
    # Step 1 : check if video is available
    yield {"message": f"Checking {url} for videos"}
    try:
        meta = check_download(url)
        yield {"message": f"Found video with {meta['duration']} seconds duration from {meta['extractor']}", "meta": meta}
        tempdir = output_dir / f"{meta['id']}"
    except Exception as e:
        yield {"message": f"{e}"}
        return

    # Step 2 : Download video and extract audio
    try:
        cached_video = tempdir / f"{meta['id']}.{meta['ext']}"
        cached_audio = tempdir / f"{meta['id']}.mp3"
        # Reuse an earlier download when both the video and its mp3 are present.
        if tempdir.is_dir() and cached_video.is_file() and cached_audio.is_file():
            yield {"message": "Using cached files"}
            video = str(cached_video.resolve())
            audio = str(cached_audio.resolve())
        else:
            yield {"message": f"Starting download with URL {url}, this may take a while"}
            meta, video, audio = download(url, tempdir)
            yield {"message": "Downloaded video and extracted audio", "video": video, "audio": audio, "meta": meta}
    except Exception as e:
        os.chdir(original_dir)
        yield {"message": f"{e}"}
        raise

    srt_path = tempdir / f"{meta['id']}.srt"
    vtt_path = tempdir / f"{meta['id']}.vtt"

    if not corrected_subtitles:
        # Step 3 : Transcribe with whisper
        yield {"message": f"[PLEASE WAIT] Starting whisper transcribe with {meta['id']}.mp3"}
        try:
            whisper_result = transcribe(audio, translate_action, source_language)
            with open(srt_path, "w", encoding="utf-8") as srt:
                write_srt(whisper_result["segments"], file=srt)
            with open(vtt_path, "w", encoding="utf-8") as vtt:
                write_vtt(whisper_result["segments"], file=vtt)
            # Read back with the same encoding the files were written with;
            # the platform default is not UTF-8 everywhere.
            whisper_result["srt"] = srt_path.read_text(encoding="utf-8")
            whisper_result["vtt"] = vtt_path.read_text(encoding="utf-8")
            yield {"message": "Transcribe successful", "whisper_result": whisper_result, "meta": meta, "srt_path": srt_path, "vtt_path": vtt_path}
        except Exception as e:
            os.chdir(original_dir)
            yield {"message": f"{e}"}
            raise
    else:
        # Step 3.5 : use corrected subtitles instead of running whisper
        yield {"message": "Using corrected subtitles"}
        with open(srt_path, "w", encoding="utf-8") as srt:
            srt.write(corrected_subtitles)
        yield {"message": "Transcribe successful", "srt_path": srt_path, "meta": meta}

    # Step 4 : Bake subtitles into video with ffmpeg
    yield {"message": "[PLEASE WAIT] baking subtitles into video"}
    try:
        print('Stating to bake subtitles')
        subbed_video_path = tempdir / f"{meta['id']}_translated.mp4"
        fontsdir = Path('fonts')
        bake_subs(video, subbed_video_path.absolute(), srt_path.absolute(), fontsdir, translate_action)
        yield {"message": "Subtitled video ready!", "sub_video": str(subbed_video_path.absolute()), "meta": meta, "vtt_path": vtt_path}
    except ffmpeg.Error as e:
        # Only ffmpeg.Error carries captured process output, and either
        # stream may be None when it was not captured.
        print('stdout:', (e.stdout or b'').decode('utf8', errors='replace'))
        print('stderr:', (e.stderr or b'').decode('utf8', errors='replace'))
        os.chdir(original_dir)
        yield {"message": f"{e}"}
        raise
    except Exception as e:
        # Generic exceptions have no stdout/stderr attributes; touching them
        # here would hide the real error behind an AttributeError.
        os.chdir(original_dir)
        print('error', file=sys.stderr)
        # Report the failure to the consumer before propagating it.
        yield {"message": f"{e}"}
        raise
def user_uploaded_video_generator(video, translate_action=True, source_language='Autodetect', corrected_subtitles=None):
    """Transcribe and translate a locally uploaded video, streaming progress.

    This is a generator yielding dicts that always contain a "message" key;
    some also contain "video" and "audio". The video is copied into
    output_dir/<video stem>/, its audio is extracted to an mp3 next to it,
    whisper runs once to transcribe and once to translate, and the
    transcription is written as <video stem>.vtt in that folder.

    :param video: path of the uploaded video file
    :param translate_action: currently unused by this function
    :param source_language: currently unused; the language is always auto-detected
    :param corrected_subtitles: currently unused by this function
    """
    video_name = Path(video).stem

    # Create the working directory and copy the upload into it.
    tempdir = output_dir / video_name
    tempdir.mkdir(parents=True, exist_ok=True)
    video_path = tempdir / Path(video).name
    shutil.copy2(video, video_path)
    yield {"message": f"Extracting audio from {video_name}", "video": video_path}

    output_audio = tempdir / f"{video_name}.mp3"
    # The folder is reused across runs (exist_ok=True), so allow ffmpeg to
    # overwrite a previous mp3 instead of stopping to ask.
    ffmpeg.input(str(video_path)).output(filename=str(output_audio)).run(overwrite_output=True)
    yield {"message": f"Got audio from {video_name}", "video": video, "audio": output_audio}

    # Run whisper on the audio with language auto-detection.
    try:
        # Pass a plain string path, as download_generator does; whisper is
        # not given a Path object here.
        audio_file = str(output_audio)
        print(f"Starting whisper transcribe with {output_audio}")
        transcribe_whisper_result = transcribe(audio_file, translate_action=False, language='Autodetect', override_model_size=model_size)
        detected_language = LANGUAGES[transcribe_whisper_result["language"]]
        # The second pass translates from the detected language into English.
        yield {"message": f"Finished transcription in {detected_language}, starting translation to English"}
        # NOTE(review): the translated segments are computed but not written
        # anywhere yet; only the original-language VTT is saved below.
        translate_whisper_result = transcribe(audio_file, translate_action=True, language=detected_language, override_model_size=model_size)
        yield {"message": "Finished translation to English, preparing subtitle files"}
        with open(tempdir / f"{video_name}.vtt", "w", encoding="utf-8") as vtt:
            write_vtt(transcribe_whisper_result['segments'], file=vtt)
    except Exception as e:
        print(f"Could not transcribe file: {e}")
        # Tell the consumer as well, so the failure is not only on stdout.
        yield {"message": f"Could not transcribe file: {e}"}
        return
def caption_generator(social_media_url, uid, language="Autodetect", model_size=model_size):
    """Download the audio of a URL and build WebVTT captions for it.

    The audio is downloaded into a temporary directory and transcribed in its
    own language. When that language is not English, a second whisper pass
    translates it and an English caption is added as well.

    :param social_media_url: URL handed to download_audio()
    :param uid: identifier used for the downloaded file name and the caption names
    :param language: String - language passed to transcribe(), default is Autodetect
    :param model_size: whisper model size passed to transcribe() as override
    :return: tuple ('success', captions) where captions is a list of dicts with
        the keys "language_tag" and "vtt_file" (an anvil.BlobMedia)
    :raises Exception: download and transcription errors are printed and re-raised
    """
    with tempfile.TemporaryDirectory() as tempdir:
        tempdir = Path(tempdir)

        try:
            print(f"Starting audio only download with URL {social_media_url}, this may take a while")
            meta, audio = download_audio(social_media_url, tempdir, id=uid)
            print("Downloaded video and extracted audio")
        except Exception as e:
            print(f"Could not download file: {e}")
            raise

        # Run whisper on the audio with language unless auto
        try:
            print(f"Starting whisper transcribe with {uid}.mp3")
            transcribe_whisper_result = transcribe(audio, translate_action=False, language=language, override_model_size=model_size)
            language_code = transcribe_whisper_result["language"]
            # Full language name (e.g. "english"), used as input for the translation pass.
            detected_language = LANGUAGES[language_code]
            print("Transcribe successful!, writing files")

            vtt_path = tempdir / f"{language_code}.vtt"
            with open(vtt_path.resolve(), "w", encoding="utf-8") as vtt:
                write_vtt(transcribe_whisper_result["segments"], file=vtt)
            whisper_result_captions = [
                {
                    "language_tag": language_code,
                    "vtt_file": anvil.BlobMedia(content_type="text/plain", content=vtt_path.read_bytes(),
                                                name=f"{uid}.{language_code}.vtt"),
                },
            ]

            # Compare the language *code*: the full name never equals "en", which
            # used to trigger a pointless translation of English audio and a
            # duplicate "en" caption entry.
            if language_code != "en":
                print("Transcribe successful! Starting translation to English")
                translate_whisper_result = transcribe(audio, translate_action=True, language=detected_language, override_model_size=model_size)
                en_vtt_path = tempdir / "en.vtt"
                with open(en_vtt_path.resolve(), "w", encoding="utf-8") as en_vtt:
                    write_vtt(translate_whisper_result["segments"], file=en_vtt)
                print("Finished translation to English, preparing subtitle files")
                whisper_result_captions.append(
                    {
                        "language_tag": "en",
                        "vtt_file": anvil.BlobMedia(content_type="text/plain", content=en_vtt_path.read_bytes(), name=f"{uid}.en.vtt"),
                    }
                )
        except Exception as e:
            print(f"Could not transcribe file: {e}")
            raise

        print(f"Finished processing {uid} file, returning results")
        print(whisper_result_captions)
        # The caption bytes were read above, so nothing depends on the
        # temporary directory once it is removed on exit.
        return 'success', whisper_result_captions
# Run whisper with translation task enabled (and save to different srt file) | |
# Call anvil background task with both files, and both the plain texts | |
def progress_hook(d):
    """Print yt-dlp download progress and return the status messages.

    The hook is registered under "progress_hooks", where yt-dlp simply calls
    it. The work therefore happens eagerly in the call itself: a generator
    function (the previous form) would never execute its body unless someone
    iterated the result. An iterator over the messages is still returned so
    code that iterates the result keeps working.

    :param d: status dict from yt-dlp; "status", "downloaded_bytes",
        "total_bytes" / "total_bytes_estimate", "_percent_str" and "filename"
        are read when present
    :return: iterator over the status messages produced for this update
    """
    messages = []
    status = d.get('status')

    if status == 'downloading':
        percent_str = d.get('_percent_str', '')
        # The exact total is not always known; fall back to the estimate.
        total = d.get('total_bytes') or d.get('total_bytes_estimate')
        downloaded = d.get('downloaded_bytes')
        if total and downloaded is not None:
            print("downloading " + str(round(float(downloaded) / float(total) * 100, 1)) + "%")
        else:
            print(f"downloading {percent_str}".rstrip())
        messages.append(f"{percent_str} downloaded")

    if status == 'finished':
        filename = d['filename']
        print(filename)
        messages.append(f"Downloaded {filename}")

    return iter(messages)
def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=False, keepVideo=True, filename="%(id)s.%(ext)s"):
    """Download a video with yt-dlp and extract its audio track as mp3.

    :param url: URL to download
    :param tempdir: target directory (str or Path)
    :param format: yt-dlp format selector
    :param verbose: Bool - passed to yt-dlp as "verbose"
    :param keepVideo: Bool - keep the video file next to the extracted mp3
    :param filename: yt-dlp output template, relative to tempdir. The returned
        paths are built from the video id, so they only match the files on
        disk for the default template.
    :return: tuple (meta, video_path, audio_path); video_path is None when
        keepVideo is False, the paths are absolute strings
    :raises DownloadError: propagated unchanged from yt-dlp
    """
    # Accept plain strings too; the result paths are built with Path arithmetic.
    tempdir = Path(tempdir)
    ydl_opts = {
        "format": format,
        "keepvideo": keepVideo,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        "skip_download": False,
        "outtmpl": f"{tempdir}/{filename}",
        "noplaylist": True,
        "verbose": verbose,
        "quiet": False,
        "progress_hooks": [progress_hook],
    }
    # The context manager closes the downloader once the work is done.
    with YoutubeDL(ydl_opts) as ydl:
        meta = ydl.extract_info(url, download=True)

    audio = tempdir / f"{meta['id']}.mp3"
    if not keepVideo:
        return meta, None, str(audio.resolve())
    video = tempdir / f"{meta['id']}.{meta['ext']}"
    return meta, str(video.resolve()), str(audio.resolve())
def download_audio(url, tempdir, format="bestaudio/best", verbose=False, id=None):
    """Download only the audio of a URL with yt-dlp and convert it to mp3.

    :param url: URL to download
    :param tempdir: target directory (str or Path)
    :param format: yt-dlp format selector
    :param verbose: Bool - passed to yt-dlp as "verbose"
    :param id: base name for the output file; when omitted the id reported by
        yt-dlp is used instead of the literal text "None"
    :return: tuple (meta, audio_path) with audio_path as absolute string
    :raises DownloadError: propagated unchanged from yt-dlp
    """
    # Accept plain strings too; the result path is built with Path arithmetic.
    tempdir = Path(tempdir)
    # Without an explicit id let yt-dlp substitute the media id itself.
    stem_template = "%(id)s" if id is None else f"{id}"
    filename = f"{stem_template}.%(ext)s"
    ydl_opts = {
        "format": format,
        "keepvideo": False,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        "skip_download": False,
        "outtmpl": f"{tempdir}/{filename}",
        "noplaylist": True,
        "verbose": verbose,
        "quiet": False,
        "progress_hooks": [progress_hook],
    }
    # The context manager closes the downloader once the work is done.
    with YoutubeDL(ydl_opts) as ydl:
        meta = ydl.extract_info(url, download=True)

    stem = meta['id'] if id is None else id
    audio = tempdir / f"{stem}.mp3"
    return meta, str(audio.resolve())
def check_download(url):
    """Look up the metadata of a URL without downloading any media.

    :param url: URL to inspect with yt-dlp
    :return: the info dict returned by YoutubeDL.extract_info
    :raises DownloadError: propagated unchanged from yt-dlp
    """
    probe = YoutubeDL({
        "format": "bestvideo[ext=mp4]+bestaudio/best",
        "skip_download": True,
        "verbose": False,
    })
    # download=False asks yt-dlp for the metadata only.
    return probe.extract_info(url, download=False)
def transcribe(audio, translate_action=True, language='Autodetect', override_model_size=''):
    """
    Transcribe audio file with whisper
    :param audio: - The audio file to transcribe
    :param translate_action: Bool - Whether to translate to English or keep original language
    :param language: String - Language spoken in the audio, as a name or a two-letter code; default is Autodetect
    :param override_model_size: String - Model size to use instead of the configured one; empty means use the configured size
    :return: the whisper result dict, extended with a "requested_language" key
    """
    task = "translate" if translate_action else "transcribe"
    model_size_to_load = override_model_size if override_model_size else model_size
    print(f'Starting {task} with whisper size {model_size_to_load} on {audio}')

    # Reuse the preloaded model whenever the effective size matches it. The
    # comparison must use the resolved size: comparing against the raw
    # override ('' by default) forced a reload on every call.
    if preload_model and model_size_to_load == model_size:
        active_model = model
    else:
        # Load into a local so a one-off size override does not replace the
        # preloaded global model.
        active_model = whisper.load_model(model_size_to_load)

    props = {
        "task": task,
    }
    if language != 'Autodetect':
        # Names longer than two characters are mapped to their code; shorter
        # values are passed through as they are.
        props["language"] = TO_LANGUAGE_CODE[language.lower()] if len(language) > 2 else language

    output = active_model.transcribe(audio, verbose=True, **props)
    output['requested_language'] = language.lower()
    print(f'Finished transcribe from {LANGUAGES[output["language"]].capitalize()}', output["text"])
    return output