Raphael
isolate requests in their directory
02cf79a unverified
raw
history blame
7.25 kB
import logging
import math
import os
import shutil
import tempfile
import time
from datasets import load_dataset
import gradio as gr
import moviepy.editor as mp
import numpy as np
import pysrt
import torch
from transformers import pipeline
import yt_dlp
# Ask the Hugging Face Hub client to use the hf_transfer download backend.
# NOTE(review): this runs after the Hugging Face libraries were imported;
# confirm the variable is still honoured when it is set this late.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO, force=True)
LOG = logging.getLogger(__name__)

# Length, in seconds, of every audio clip cut from the video.
CLIP_SECONDS = 20
# Number of subtitle slices each clip's text is divided into.
SLICES = 4
# Seconds of screen time allotted to one subtitle slice.
SLICE_DURATION = CLIP_SECONDS / SLICES
# Upper bound on the number of clips that get transcribed
# (45 clips x 20 s per clip = 15 minutes of audio).
MAX_CHUNKS = 45

# Keyword arguments for the three `transformers.pipeline` calls.
asr_kwargs = dict(
    task="automatic-speech-recognition",
    model="openai/whisper-medium.en",
)
translator_kwargs = dict(
    task="translation_en_to_fr",
    model="Helsinki-NLP/opus-mt-en-fr",
)
summarizer_kwargs = dict(
    task="summarization",
    model="facebook/bart-large-cnn",
)
# Place every pipeline on the first CUDA device when one is available;
# otherwise the pipelines are built without an explicit device.
if torch.cuda.is_available():
    LOG.info("GPU available")
    asr_kwargs.update(device='cuda:0')
    translator_kwargs.update(device='cuda:0')
    summarizer_kwargs.update(device='cuda:0')

# All three models should fit together on a single T4 GPU
# The pipelines are created once, at import time, and shared by all requests.
LOG.info("Fetching ASR model from the Hub if not already there")
asr = pipeline(**asr_kwargs)

LOG.info("Fetching translation model from the Hub if not already there")
translator = pipeline(**translator_kwargs)

LOG.info("Fetching summarization model from the Hub if not already there")
summarizer = pipeline(**summarizer_kwargs)
def demo(url: str, translate: bool):
    """Download a video and produce its summary and subtitle file.

    Args:
        url: Address of the video to download.
        translate: Forwarded to the processing steps; when true the subtitles
            and the summary go through the translation pipeline.

    Returns:
        A tuple ``(summary, srt_file, [video_path, srt_file])`` matching the
        three outputs of the Gradio interface.

    Raises:
        Exception: Whatever the download or processing steps raise; the
            request's temporary directory is removed before re-raising.
    """
    # Every request works in its own directory so that concurrent requests
    # cannot overwrite each other's files.
    # TODO: on success this directory is kept (the returned paths point into
    # it) and nothing removes it later, so disk usage grows with each request.
    basedir = tempfile.mkdtemp()
    LOG.info("Base directory %s", basedir)
    try:
        video_path, video = download(url, os.path.join(basedir, 'video.mp4'))
        try:
            audio_clips(video, basedir)
            srt_file, summary = process_video(basedir, video.duration, translate)
        finally:
            # Release the readers held by the clip, and drop the intermediate
            # audio clips: they are only needed while transcribing.
            video.close()
            shutil.rmtree(audiodir(basedir), ignore_errors=True)
    except Exception:
        # Nothing is handed back on failure, so the whole directory is garbage.
        shutil.rmtree(basedir, ignore_errors=True)
        raise
    return summary, srt_file, [video_path, srt_file]
def download(url, dst):
    """Download the video at ``url`` to ``dst`` and open it as a clip.

    Args:
        url: Address handed to yt-dlp.
        dst: File path the video is written to.

    Returns:
        A tuple ``(dst, clip)`` where ``clip`` is the ``VideoFileClip``
        opened on ``dst``.
    """
    LOG.info("Downloading provided url %s", url)
    ydl_options = dict(
        skip_download=False,
        overwrites=True,
        format='mp4',
        outtmpl={'default': dst},
    )
    with yt_dlp.YoutubeDL(ydl_options) as downloader:
        downloader.download([url])
    clip = mp.VideoFileClip(dst)
    return dst, clip
def audiodir(basedir):
    """Return the path of the ``audio`` subdirectory of ``basedir``."""
    clips_subdir = 'audio'
    return os.path.join(basedir, clips_subdir)
def audio_clips(video: mp.VideoFileClip, basedir: str):
    """Cut the video's audio track into fixed-length clips on disk.

    The clips are written as ``audio_<index>.ogg`` files, resampled to
    16 kHz, inside the audio directory of ``basedir``. Any previous content
    of that directory is removed first.

    Args:
        video: Clip whose audio track is split.
        basedir: Working directory of the current request.

    Raises:
        ValueError: If the video has no audio track.
    """
    LOG.info("Building audio clips")
    audio = video.audio
    if audio is None:
        # Without this check the failure would be an opaque AttributeError.
        raise ValueError("The video has no audio track to transcribe")

    clips_dir = audiodir(basedir)
    shutil.rmtree(clips_dir, ignore_errors=True)
    os.makedirs(clips_dir, exist_ok=True)

    end = audio.duration
    starts = range(0, int(end), CLIP_SECONDS)
    # Zero-pad the index to a common width, presumably so the files sort in
    # clip order when the folder is loaded. Deriving the width from the clip
    # count also works for very short or zero-length audio.
    digits = len(str(len(starts)))
    for idx, start in enumerate(starts):
        # The last clip stops at the end of the track.
        sub_end = min(start + CLIP_SECONDS, end)
        sub_clip = audio.subclip(t_start=start, t_end=sub_end)
        audio_file = os.path.join(clips_dir, f"audio_{idx:0{digits}d}.ogg")
        sub_clip.write_audiofile(audio_file, fps=16000)
def process_video(basedir: str, duration, translate: bool):
    """Run transcription, subtitle generation and summarization in turn.

    Args:
        basedir: Working directory of the current request.
        duration: Video duration, forwarded to the transcription step.
        translate: Whether subtitles and summary should be translated.

    Returns:
        A tuple ``(srt_file, summary)``.
    """
    texts = transcription(audiodir(basedir), duration)
    # Subtitles are built from the (optionally translated) transcriptions,
    # while the summary is always computed from the original transcriptions.
    srt_path = build_srt_clips(translation(texts, translate), basedir)
    return srt_path, summarize(texts, translate)
def transcription(audio_dir: str, duration):
    """Transcribe the audio clips stored in ``audio_dir``.

    Args:
        audio_dir: Directory holding the audio clips.
        duration: Video duration in seconds, used to estimate how many clips
            to transcribe (capped at ``MAX_CHUNKS``).

    Returns:
        A list with one transcribed text per processed clip.
    """
    LOG.info("Audio transcription")
    # The estimate may exceed the real clip count by one; slicing past the
    # end of the dataset is harmless, so it does not need to be exact.
    chunks = min(int(duration / CLIP_SECONDS + 1), MAX_CHUNKS)
    LOG.debug("Loading audio clips dataset")
    dataset = load_dataset("audiofolder", data_dir=audio_dir)['train']
    # Select the rows before the column so that only the clips that are
    # actually kept get decoded.
    samples = dataset[0:chunks]['audio']

    start = time.time()
    results = []
    # Split positions rather than the samples themselves: this keeps the
    # samples as plain Python objects and lets empty groups (fewer than five
    # clips) be skipped instead of being sent to the model.
    for i, positions in enumerate(np.array_split(np.arange(len(samples)), 5)):
        if positions.size == 0:
            continue
        batch = samples[positions[0]:positions[-1] + 1]
        LOG.info("ASR batch %d / 5, samples %d", i + 1, len(batch))
        results.extend(asr(batch, max_new_tokens=10000))
    transcriptions = [result['text'] for result in results]
    elapsed = time.time() - start
    LOG.info("Transcription done, elapsed %.2f seconds", elapsed)
    return transcriptions
def translation(transcriptions, translate):
    """Translate a list of texts, or hand it back untouched.

    Args:
        transcriptions: Texts to translate.
        translate: When false, ``transcriptions`` is returned as is.

    Returns:
        The translated texts when ``translate`` is true, otherwise the very
        list that was passed in.
    """
    if not translate:
        return transcriptions

    LOG.info("Performing translation")
    started = time.time()
    outputs = translator(transcriptions)
    translations = [output['translation_text'] for output in outputs]
    LOG.info("Translation done, elapsed %.2f seconds", time.time() - started)
    return translations
def summarize(transcriptions, translate):
    """Summarize the whole transcript, optionally translating the result.

    Args:
        transcriptions: Transcribed texts, joined with spaces before being
            summarized.
        translate: Whether the summary goes through the translation step.

    Returns:
        The summary text, or an empty string when there is nothing to
        summarize.
    """
    LOG.info("Generating video summary")
    whole_text = ' '.join(transcriptions).strip()
    if not whole_text:
        # Nothing was transcribed; do not feed an empty prompt to the model.
        return ''
    # Long transcripts can exceed the model's maximum input length, so ask
    # the pipeline to truncate the input instead of failing.
    summary = summarizer(whole_text, truncation=True)
    return translation([summary[0]['summary_text']], translate)[0]
def subs_to_timed_segments(subtitles: list[str]):
    """Split each subtitle text into timed slices.

    Every text is cut into ``SLICES`` groups of words, and the groups are
    laid out back to back, each lasting ``SLICE_DURATION`` seconds.

    Args:
        subtitles: One text per audio clip, in clip order.

    Returns:
        A list of dicts with keys ``text``, ``start`` and ``end`` (times in
        seconds).
    """
    LOG.info("Building srt segments")
    segments = []
    for sub in subtitles:
        for words in np.array_split(sub.split(' '), SLICES):
            # The position of the slice in the overall sequence fixes its
            # time window.
            position = len(segments)
            segments.append({
                'text': ' '.join(words).strip(),
                'start': position * SLICE_DURATION,
                'end': (position + 1) * SLICE_DURATION,
            })
    return segments
def _append_cue(subtitles, start_ms, end_ms, text):
    """Append a numbered cue to ``subtitles``; blank text is skipped."""
    if not text:
        return
    cue = pysrt.SubRipItem()
    # Cues are numbered sequentially from 1; without this every cue would
    # keep the library's default index.
    cue.index = len(subtitles) + 1
    cue.start = pysrt.SubRipTime(0, 0, 0, start_ms)
    cue.end = pysrt.SubRipTime(0, 0, 0, end_ms)
    cue.text = text
    subtitles.append(cue)


def build_srt_clips(subs, basedir):
    """Write the subtitles to ``video.srt`` inside ``basedir``.

    Args:
        subs: One subtitle text per audio clip, in clip order.
        basedir: Working directory of the current request.

    Returns:
        The path of the saved SRT file.
    """
    LOG.info("Generating subtitles")
    segments = subs_to_timed_segments(subs)
    LOG.info("Building srt clips")
    max_text_len = 30
    subtitles = pysrt.SubRipFile()
    for position, segment in enumerate(segments):
        start = segment['start'] * 1000
        if position == 0:
            # The very first cue starts 3 seconds late (kept from the
            # original behaviour; the reason is not documented).
            start += 3000
        end = segment['end'] * 1000
        text = segment['text'].strip()
        if len(text) < max_text_len:
            _append_cue(subtitles, start, end, text)
            continue
        # Just split in two, should be ok in most cases
        words = text.split()
        half = len(words) // 2
        chkpt = (start + end) / 2
        _append_cue(subtitles, start, chkpt, ' '.join(words[:half]))
        _append_cue(subtitles, chkpt, end, ' '.join(words[half:]))
    srt_path = os.path.join(basedir, 'video.srt')
    subtitles.save(srt_path, encoding='utf-8')
    LOG.info("Subtitles saved in srt file %s", srt_path)
    return srt_path
# Input widgets, in the order of `demo`'s parameters.
_url_input = gr.Text(value="https://youtu.be/tiZFewofSLM", label="English video url")
_translate_input = gr.Checkbox(value=True, label='Translate to French')

# Output widgets, in the order of the values returned by `demo`.
_demo_outputs = [
    gr.Text(label="Video summary"),
    gr.File(label="SRT file"),
    gr.Video(label="Video with subtitles"),
]

iface = gr.Interface(
    fn=demo,
    inputs=[_url_input, _translate_input],
    outputs=_demo_outputs,
)
iface.launch()