vidtranslator / download.py
altryne's picture
Ok fixed the gradio part
805009b unverified
raw
history blame
12.6 kB
import shutil
import sys
import time
from pathlib import Path
import anvil.server
import anvil.media
from whisper.utils import write_srt, write_vtt
from yt_dlp import YoutubeDL
from yt_dlp.utils import DownloadError
import os
import tempfile
import json
import argparse
import whisper
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
import ffmpeg
from utils.subs import bake_subs, get_srt
from utils.utils import get_args
# Destination root for every per-video working folder.
output_dir = Path('output')
# Working directory at import time; the error handlers below chdir back here.
original_dir = os.getcwd()

# Runtime options: the "model" entry from get_args() wins, then the
# WHISPER_MODEL environment variable, then "large".
args = get_args()
preload_model: bool = args.get("preload")
model_size: str = args.get("model", os.environ.get("WHISPER_MODEL", "large"))

# Optionally load the whisper model once at import time so transcribe()
# does not have to load it per call.
if preload_model:
    print("Preloading model")
    model = whisper.load_model(model_size)
def download_generator(url, translate_action=True, source_language='Autodetect', corrected_subtitles=None):
    """Download a video, subtitle it, and stream progress updates.

    This is a generator. Every step yields a dict with a ``"message"`` key
    plus whatever that step produced (``"meta"``, ``"video"``, ``"audio"``,
    ``"whisper_result"``, ``"srt_path"``, ``"vtt_path"``, ``"sub_video"``).

    :param url: URL passed to ``check_download`` / ``download``.
    :param translate_action: forwarded to ``transcribe`` and ``bake_subs``.
    :param source_language: forwarded to ``transcribe`` as the language.
    :param corrected_subtitles: optional SRT text. When truthy it is written
        to the SRT file and the whisper step is skipped.
    :raises Exception: failures while downloading, transcribing or baking are
        reported as a message and then re-raised. A failed availability check
        only yields the error message and ends the generator.
    """
    # Step 1 : check if video is available
    yield {"message": f"Checking {url} for videos"}
    try:
        meta = check_download(url)
        yield {"message": f"Found video with {meta['duration']} seconds duration from {meta['extractor']}", "meta": meta}
        tempdir = output_dir / f"{meta['id']}"
    except Exception as e:
        yield {"message": f"{e}"}
        return

    # Step 2 : Download video and extract audio
    try:
        cached_video = tempdir / f"{meta['id']}.{meta['ext']}"
        cached_audio = tempdir / f"{meta['id']}.mp3"
        # Reuse earlier results when both the video and the mp3 already exist.
        if tempdir.is_dir() and cached_video.is_file() and cached_audio.is_file():
            yield {"message": f"Using cached files"}
            video = str(cached_video.resolve())
            audio = str(cached_audio.resolve())
        else:
            yield {"message": f"Starting download with URL {url}, this may take a while"}
            meta, video, audio = download(url, tempdir)
        # Emitted for both branches so the consumer always receives the paths.
        yield {"message": f"Downloaded video and extracted audio", "video": video, "audio": audio, "meta": meta}
    except Exception as e:
        os.chdir(original_dir)
        yield {"message": f"{e}"}
        raise

    srt_path = tempdir / f"{meta['id']}.srt"
    vtt_path = tempdir / f"{meta['id']}.vtt"

    if not corrected_subtitles:
        ### Step 3 : Transcribe with whisper
        yield {"message": f"[PLEASE WAIT] Starting whisper transcribe with {meta['id']}.mp3"}
        try:
            whisper_result = transcribe(audio, translate_action, source_language)
            with open(srt_path, "w", encoding="utf-8") as srt:
                write_srt(whisper_result["segments"], file=srt)
            with open(vtt_path, "w", encoding="utf-8") as vtt:
                write_vtt(whisper_result["segments"], file=vtt)
            # Read back with the same encoding the files were written in;
            # the platform default is not UTF-8 everywhere.
            whisper_result["srt"] = srt_path.read_text(encoding="utf-8")
            whisper_result["vtt"] = vtt_path.read_text(encoding="utf-8")
            yield {"message": f"Transcribe successful", "whisper_result": whisper_result, "meta": meta, "srt_path": srt_path, "vtt_path": vtt_path}
        except Exception as e:
            os.chdir(original_dir)
            yield {"message": f"{e}"}
            raise
    else:
        ### step 3.5 : use corrected subtitles
        yield {"message": f"Using corrected subtitles"}
        with open(srt_path, "w", encoding="utf-8") as srt:
            srt.write(corrected_subtitles)
        yield {"message": f"Transcribe successful", "srt_path": srt_path, "meta": meta}
        # NOTE(review): no VTT file is written on this path, so the vtt_path
        # reported in step 4 only exists if an earlier run created it.

    ### Step 4 : Bake subtitles into video with ffmpeg
    yield {"message": f"[PLEASE WAIT] baking subtitles into video"}
    try:
        print('Stating to bake subtitles')
        subbed_video_path = tempdir / f"{meta['id']}_translated.mp4"
        fontsdir = Path('fonts')
        bake_subs(video, subbed_video_path.absolute(), srt_path.absolute(), fontsdir, translate_action)
        yield {"message": f"Subtitled video ready!", "sub_video": str(subbed_video_path.absolute()), "meta": meta, "vtt_path": vtt_path}
    except ffmpeg.Error as e:
        # stdout/stderr are None when ffmpeg ran without output capture.
        print('stdout:', (e.stdout or b'').decode('utf8'))
        print('stderr:', (e.stderr or b'').decode('utf8'))
        raise
    except Exception as e:
        # Ordinary exceptions carry no stdout/stderr; touching them here
        # would replace the real error with an AttributeError.
        os.chdir(original_dir)
        print('error', file=sys.stderr)
        yield {"message": f"{e}"}
        raise
def user_uploaded_video_generator(video, translate_action=True, source_language='Autodetect', corrected_subtitles=None):
    """Process a locally uploaded video: copy it, extract audio, run whisper.

    This is a generator yielding progress dicts with a ``"message"`` key and,
    for some steps, ``"video"`` / ``"audio"`` paths.

    :param video: path of the uploaded video file.
    :param translate_action: currently unused by this function.
    :param source_language: currently unused; the language is auto-detected.
    :param corrected_subtitles: currently unused by this function.

    Transcription errors are printed and end the generator without raising.
    """
    video_name = Path(video).stem
    # create the per-video working folder
    tempdir = output_dir / video_name
    tempdir.mkdir(parents=True, exist_ok=True)

    # copy video with shutil.copy2
    video_path = tempdir / Path(video).name
    shutil.copy2(video, video_path)
    yield {"message": f"Extracting audio from {video_name}", "video": video_path}

    output_audio = tempdir / f"{video_name}.mp3"
    # overwrite_output: the folder is reused between runs, and without it
    # ffmpeg stops to ask (or fails) when the mp3 already exists.
    ffmpeg.input(video_path).output(filename=output_audio).run(overwrite_output=True)
    yield {"message": f"Got audio from {video_name}", "video": video, "audio": output_audio}

    # Run whisper on the audio with language unless auto
    try:
        # whisper only loads audio from str paths, so do not hand it a Path.
        audio_file = str(output_audio)
        print(f"Starting whisper transcribe with {output_audio}")
        transcribe_whisper_result = transcribe(audio_file, translate_action=False, language='Autodetect', override_model_size=model_size)
        yield {"message": f"Finished transcription, starting translation to {transcribe_whisper_result['language']}"}

        detected_language = LANGUAGES[transcribe_whisper_result["language"]]
        translate_whisper_result = transcribe(audio_file, translate_action=True, language=detected_language, override_model_size=model_size)
        yield {"message": f"Finished translation to English, preparing subtitle files"}

        with open(tempdir / f"{video_name}.vtt", "w", encoding="utf-8") as vtt:
            write_vtt(transcribe_whisper_result['segments'], file=vtt)
        # TODO: translate_whisper_result is not persisted yet; write the
        # English VTT/SRT files and yield their paths.
    except Exception as e:
        print(f"Could not transcribe file: {e}")
        return
def caption_generator(social_media_url,uid, language="Autodetect", model_size=model_size):
    """Download the audio of a post and build VTT captions for it.

    The audio is transcribed in its own language. When that language is not
    English, it is additionally translated to English.

    :param social_media_url: URL passed to ``download_audio``.
    :param uid: identifier used for the downloaded file name and for the
        names of the returned caption media.
    :param language: language passed to ``transcribe`` for the first pass.
    :param model_size: whisper model size passed to ``transcribe``.
    :return: tuple ``('success', captions)`` where ``captions`` is a list of
        dicts with ``"language_tag"`` and ``"vtt_file"`` (an
        ``anvil.BlobMedia`` holding the VTT bytes).
    :raises Exception: download and transcription errors are printed and
        re-raised.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        tempdir = Path(tempdir)

        try:
            print(f"Starting audio only download with URL {social_media_url}, this may take a while")
            meta, audio = download_audio(social_media_url, tempdir, id=uid)
            print(f"Downloaded video and extracted audio")
        except Exception as e:
            print(f"Could not download file: {e}")
            raise

        # Run whisper on the audio with language unless auto
        try:
            print(f"Starting whisper transcribe with {uid}.mp3")
            transcribe_whisper_result = transcribe(audio, translate_action=False, language=language, override_model_size=model_size)
            language_code = transcribe_whisper_result["language"]
            # Full language name, which is what transcribe() is given below.
            detected_language = LANGUAGES[language_code]
            print(f"Transcribe successful!, writing files")

            vtt_path = tempdir / f"{language_code}.vtt"
            with open(vtt_path.resolve(), "w", encoding="utf-8") as vtt:
                write_vtt(transcribe_whisper_result["segments"], file=vtt)

            whisper_result_captions = [
                {
                    "language_tag": language_code,
                    "vtt_file": anvil.BlobMedia(content_type="text/plain", content=vtt_path.read_bytes(),
                                                name=f"{uid}.{language_code}.vtt")
                },
            ]

            # Compare the language *code*: LANGUAGES[...] yields the full
            # name ("english"), which never equals "en", so English audio
            # used to be translated to English again.
            if language_code != "en":
                print(f"Transcribe successful! Starting translation to English")
                translate_whisper_result = transcribe(audio, translate_action=True, language=detected_language, override_model_size=model_size)
                en_vtt_path = tempdir / f"en.vtt"
                with open(en_vtt_path.resolve(), "w", encoding="utf-8") as en_vtt:
                    write_vtt(translate_whisper_result["segments"], file=en_vtt)
                print(f"Finished translation to English, preparing subtitle files")
                whisper_result_captions.append(
                    {
                        "language_tag": "en",
                        "vtt_file": anvil.BlobMedia(content_type="text/plain", content=en_vtt_path.read_bytes(), name=f"{uid}.en.vtt")
                    }
                )
        except Exception as e:
            print(f"Could not transcribe file: {e}")
            raise

        print(f"Finished processing {uid} file, returning results")
        print(whisper_result_captions)
        # The caption bytes are already in memory, so the temporary
        # directory can be removed on exit.
        return 'success', whisper_result_captions
# Run whisper with translation task enabled (and save to different srt file)
# Call anvil background task with both files, and both the plain texts
def progress_hook(d):
    """Report yt-dlp download progress.

    yt-dlp calls progress hooks as plain callbacks and ignores their return
    value, so the printing happens eagerly here rather than in a generator
    body that would never run. The status messages are still returned as an
    iterator for callers that iterate over the result.

    :param d: progress dict supplied by yt-dlp. Reads ``status`` and,
        depending on it, ``downloaded_bytes``, ``total_bytes`` /
        ``total_bytes_estimate``, ``_percent_str`` and ``filename``.
    :return: iterator over the status messages for this event (may be empty).
    """
    messages = []
    status = d.get('status')

    if status == 'downloading':
        downloaded = d.get('downloaded_bytes')
        # total_bytes is optional; fall back to the estimate when present.
        total = d.get('total_bytes') or d.get('total_bytes_estimate')
        if downloaded is not None and total:
            print("downloading " + str(round(float(downloaded) / float(total) * 100, 1)) + "%")
        else:
            print("downloading (total size unknown)")
        messages.append(f"{d.get('_percent_str', 'unknown %')} downloaded")

    if status == 'finished':
        filename = d['filename']
        print(filename)
        messages.append(f"Downloaded {filename}")

    return iter(messages)
def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=False, keepVideo=True, filename="%(id)s.%(ext)s"):
    """Download a video with yt-dlp and extract its audio track as mp3.

    :param url: URL to download.
    :param tempdir: destination directory (``str`` or ``Path``).
    :param format: yt-dlp format selector.
    :param verbose: forwarded to yt-dlp's ``verbose`` option.
    :param keepVideo: keep the video file next to the extracted mp3.
    :param filename: yt-dlp output template, relative to ``tempdir``.
    :return: ``(meta, video_path, audio_path)``. Paths are resolved strings
        and ``video_path`` is ``None`` when ``keepVideo`` is false.
    :raises DownloadError: propagated unchanged from yt-dlp.

    The returned paths are built from ``meta['id']``, so they only match the
    files on disk when ``filename`` keeps the default ``%(id)s`` stem.
    """
    tempdir = Path(tempdir)  # accept plain strings as well as Path objects
    ydl_opts = {
        "format": format,
        "keepvideo": keepVideo,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        "skip_download": False,
        "outtmpl": f"{tempdir}/{filename}",
        "noplaylist": True,
        "verbose": verbose,
        "quiet": False,
        "progress_hooks": [progress_hook],
    }
    # Context manager form so the downloader is closed after use.
    with YoutubeDL(ydl_opts) as ydl:
        meta = ydl.extract_info(url, download=True)

    audio = tempdir / f"{meta['id']}.mp3"
    if not keepVideo:
        return meta, None, str(audio.resolve())
    video = tempdir / f"{meta['id']}.{meta['ext']}"
    return meta, str(video.resolve()), str(audio.resolve())
def download_audio(url, tempdir, format="bestaudio/best", verbose=False, id=None):
    """Download only the audio of a URL with yt-dlp and convert it to mp3.

    :param url: URL to download.
    :param tempdir: destination directory (``str`` or ``Path``).
    :param format: yt-dlp format selector.
    :param verbose: forwarded to yt-dlp's ``verbose`` option.
    :param id: file name stem for the download. When ``None`` the
        extractor's own video id is used instead of a literal ``None.mp3``.
    :return: ``(meta, audio_path)`` with ``audio_path`` a resolved string.
    :raises DownloadError: propagated unchanged from yt-dlp.
    """
    tempdir = Path(tempdir)  # accept plain strings as well as Path objects
    stem_template = "%(id)s" if id is None else f"{id}"
    filename = f"{stem_template}.%(ext)s"
    ydl_opts = {
        "format": format,
        "keepvideo": False,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        "skip_download": False,
        "outtmpl": f"{tempdir}/{filename}",
        "noplaylist": True,
        "verbose": verbose,
        "quiet": False,
        "progress_hooks": [progress_hook],
    }
    # Context manager form so the downloader is closed after use.
    with YoutubeDL(ydl_opts) as ydl:
        meta = ydl.extract_info(url, download=True)

    stem = meta['id'] if id is None else id
    audio = tempdir / f"{stem}.mp3"
    return meta, str(audio.resolve())
def check_download(url):
    """Fetch yt-dlp metadata for ``url`` without downloading anything.

    :param url: URL to probe.
    :return: the info dict returned by ``YoutubeDL.extract_info``.
    :raises DownloadError: propagated unchanged from yt-dlp.
    """
    probe_options = dict(
        format="bestvideo[ext=mp4]+bestaudio/best",
        skip_download=True,
        verbose=False,
    )
    return YoutubeDL(probe_options).extract_info(url, download=False)
def transcribe(audio, translate_action=True, language='Autodetect', override_model_size=''):
    """
    Transcribe (or translate) an audio file with whisper.

    :param audio: path of the audio file (``str`` or ``os.PathLike``), or any
        other input whisper's ``transcribe`` accepts
    :param translate_action: Bool - translate to English when true, otherwise
        keep the original language
    :param language: String - source language as a language name or a code of
        at most two characters; ``'Autodetect'`` lets whisper detect it
    :param override_model_size: String - whisper model size to use instead of
        the module-level ``model_size``; empty means "use the default"
    :return: whisper's result dict with an added ``'requested_language'`` key
    """
    task = "translate" if translate_action else "transcribe"
    model_size_to_load = override_model_size if override_model_size else model_size
    print(f'Starting {task} with whisper size {model_size_to_load} on {audio}')

    # whisper only loads audio from str paths; a Path would be treated as
    # already-decoded audio data and fail.
    if isinstance(audio, os.PathLike):
        audio = os.fspath(audio)

    # Reuse the preloaded model whenever the size actually requested matches
    # it. Other sizes are loaded into a local, so the preloaded model is
    # never replaced by a different size.
    if preload_model and model_size_to_load == model_size:
        active_model = model
    else:
        active_model = whisper.load_model(model_size_to_load)

    props = {
        "task": task,
    }
    if language != 'Autodetect':
        # Names longer than two characters are mapped to their code.
        props["language"] = TO_LANGUAGE_CODE[language.lower()] if len(language) > 2 else language

    output = active_model.transcribe(audio, verbose=True, **props)
    output['requested_language'] = language.lower()
    print(f'Finished transcribe from {LANGUAGES[output["language"]].capitalize()}', output["text"])
    return output