# DAI_Project / app.py
# Hugging Face file-viewer header (not part of the program):
#   uploaded by ChiBenevisamPas; commit "Remove TS" (0dbfe10, verified); 7.66 kB
import gradio as gr
import whisper
import os
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from docx import Document # For Word output
from fpdf import FPDF # For PDF output
from pptx import Presentation # For PowerPoint output
import subprocess # To use ffmpeg for embedding subtitles
import shlex # For better command-line argument handling
# Load the Whisper model
# Loaded once at import time into the module-level name `model`, which
# transcribe_video() uses for every request.
model = whisper.load_model("tiny") # Smaller model for faster transcription
# Load M2M100 translation model for different languages
def load_translation_model(target_language, source_language="en"):
    """Load the M2M100 tokenizer and model configured for one language pair.

    Args:
        target_language: Code of the language to translate into. Must be one
            of the supported codes: "fa", "es" or "fr".
        source_language: Code of the language the input text is written in.
            Defaults to "en", the value that used to be hard-coded.

    Returns:
        A ``(tokenizer, translation_model)`` tuple. The tokenizer has its
        ``src_lang`` and ``tgt_lang`` attributes set for the requested pair.

    Raises:
        ValueError: If ``target_language`` is not one of the supported codes.
    """
    lang_codes = {
        "fa": "fa",  # Persian (Farsi)
        "es": "es",  # Spanish
        "fr": "fr",  # French
    }
    target_lang_code = lang_codes.get(target_language)
    # Validate first so an unsupported language fails before any model is loaded.
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")

    # Load M2M100 model and tokenizer from the same checkpoint.
    checkpoint = "facebook/m2m100_418M"
    tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
    translation_model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer.src_lang = source_language
    tokenizer.tgt_lang = target_lang_code
    return tokenizer, translation_model
def translate_text(text, tokenizer, model):
    """Translate ``text`` into the tokenizer's configured target language.

    Args:
        text: The text to translate.
        tokenizer: Tokenizer exposing ``tgt_lang``, ``get_lang_id()`` and
            ``decode()``, and callable on the input text.
        model: Generation model exposing ``generate()``.

    Returns:
        The decoded translation of the first generated sequence. Empty or
        whitespace-only strings are returned unchanged without calling the
        model.

    Raises:
        RuntimeError: If tokenization, generation or decoding fails. The
            original exception is attached as ``__cause__``.
    """
    # Nothing to translate: avoid running generation on an empty segment.
    if isinstance(text, str) and not text.strip():
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Force the first generated token to be the target-language tag.
        target_lang_id = tokenizer.get_lang_id(tokenizer.tgt_lang)
        translated = model.generate(**inputs, forced_bos_token_id=target_lang_id)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"Error during translation: {e}") from e
# Helper function to format timestamps in SRT format (hh:mm:ss,ms)
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the fractional part instead loses a millisecond to
    floating-point error (for example 1.001 would render as ``,000``).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Work in whole milliseconds so every field derives from one integer.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
# Corrected write_srt function
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription segments to ``output_file`` in SRT format.

    Each segment becomes one numbered cue (starting at 1) with a
    ``start --> end`` timestamp line followed by the stripped segment text
    and a blank line.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment
            provides ``'start'``, ``'end'`` and ``'text'``.
        output_file: Path of the SRT file to create or overwrite.
        tokenizer: Tokenizer passed to ``translate_text`` when translating.
        translation_model: When truthy, every segment text is translated
            with ``translate_text`` before being written.
    """
    # Write UTF-8 explicitly: translated text (e.g. Persian) cannot be
    # encoded by every platform-default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        for index, segment in enumerate(transcription['segments'], start=1):
            text = segment['text']
            if translation_model:
                text = translate_text(text, tokenizer, translation_model)

            start_time = format_timestamp(segment['start'])
            end_time = format_timestamp(segment['end'])

            f.write(f"{index}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Uses ffmpeg to burn subtitles into the video (hardsub).

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT subtitle file to render onto the video.
        output_video: Path of the re-encoded output video. An existing file
            at this path is overwritten.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exceeds the 300 second
            timeout, or exits with a non-zero status (the message then
            contains ffmpeg's stderr).
    """
    # Pass an argument list rather than a shell-style string that is split
    # again: paths containing quotes or spaces then reach ffmpeg intact.
    # NOTE(review): the subtitle path is still embedded in the filter string
    # inside single quotes, so a path containing "'" would presumably break
    # the filter expression - confirm against ffmpeg's filter escaping rules.
    command = [
        "ffmpeg",
        "-y",  # overwrite without prompting; a prompt would block until the timeout
        "-i", str(video_file),
        "-vf", f"subtitles='{srt_file}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        str(output_video),
    ]
    print(f"Running command: {command}")  # Debug statement
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    print(f"ffmpeg output: {process.stdout}")  # Debug statement
    # Checked outside the try block so this error is not wrapped a second time.
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
def write_word(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a Word document, one numbered paragraph per segment.

    Timestamps are not included. When ``translation_model`` is truthy, each
    segment text is passed through ``translate_text`` (with ``tokenizer``)
    before being added to the document.
    """
    document = Document()
    for number, segment in enumerate(transcription['segments'], start=1):
        segment_text = segment['text']
        if translation_model:
            segment_text = translate_text(segment_text, tokenizer, translation_model)
        paragraph = f"{number}. {segment_text.strip()}"
        document.add_paragraph(paragraph)
    document.save(output_file)
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a PDF, one numbered entry per segment.

    Timestamps are not included. When ``translation_model`` is truthy, each
    segment text is passed through ``translate_text`` (with ``tokenizer``)
    before being written.
    """
    document = FPDF()
    document.set_auto_page_break(auto=True, margin=15)
    document.add_page()
    # NOTE(review): "Arial" is presumably one of FPDF's built-in core fonts,
    # which may not cover non-Latin scripts such as Persian - confirm and
    # register a Unicode TTF font if translated output fails to render.
    document.set_font("Arial", size=12)

    for number, segment in enumerate(transcription['segments'], start=1):
        segment_text = segment['text']
        if translation_model:
            segment_text = translate_text(segment_text, tokenizer, translation_model)
        entry = f"{number}. {segment_text.strip()}"
        document.multi_cell(0, 10, entry)

    document.output(output_file)
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a PowerPoint file, one slide per segment.

    Each slide's title holds the numbered segment text; timestamps are not
    included. When ``translation_model`` is truthy, each segment text is
    passed through ``translate_text`` (with ``tokenizer``) first.
    """
    presentation = Presentation()
    # NOTE(review): layout index 5 is presumably the "Title Only" layout of
    # the default template (not a blank one); the code below relies on the
    # slide having a title placeholder - confirm.
    layout = presentation.slide_layouts[5]
    for number, segment in enumerate(transcription['segments'], start=1):
        segment_text = segment['text']
        if translation_model:
            segment_text = translate_text(segment_text, tokenizer, translation_model)
        slide = presentation.slides.add_slide(layout)
        slide.shapes.title.text = f"{number}. {segment_text.strip()}"
    presentation.save(output_file)
def transcribe_video(video_file, language, target_language, output_format):
    """Transcribe a video and return the path of the requested output file.

    Args:
        video_file: The uploaded video, either an object with a ``name``
            attribute holding the file path or the path itself as a string.
        language: Language code spoken in the video (passed to Whisper).
        target_language: Language code the subtitles/document should be in.
        output_format: One of "SRT", "Video with Hardsub", "Word", "PDF"
            or "PowerPoint".

    Returns:
        Path of the generated file, written next to the input video.

    Raises:
        ValueError: If no file was provided or ``output_format`` is unknown.
        RuntimeError: If loading the translation model or burning the
            subtitles into the video fails.
    """
    supported_formats = ("SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint")
    # Fail fast, before the expensive transcription, on unusable input.
    if video_file is None:
        raise ValueError("No video file was uploaded.")
    if output_format not in supported_formats:
        raise ValueError(f"Unsupported output format: {output_format}")

    # Accept both file-like upload objects (with .name) and plain path strings.
    video_path = getattr(video_file, "name", video_file)
    video_name = os.path.splitext(video_path)[0]

    # For English subtitles of a non-English video, let Whisper translate
    # directly; otherwise a plain transcription would be returned in the
    # video's own language.
    task = "translate" if (target_language == "en" and language != "en") else "transcribe"
    result = model.transcribe(video_path, language=language, task=task)

    # Load the translation model only when the text still has to be
    # translated, i.e. the target is neither English (handled above or
    # already English) nor the language the transcript is already in.
    tokenizer, translation_model = None, None
    if target_language not in ("en", language):
        try:
            tokenizer, translation_model = load_translation_model(target_language)
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e
        # The transcript is in the video's language, which is not always English.
        tokenizer.src_lang = language

    # The SRT file is only needed for the two subtitle outputs; writing it
    # unconditionally would translate every segment twice for documents.
    if output_format in ("SRT", "Video with Hardsub"):
        srt_file = f"{video_name}.srt"
        write_srt(result, srt_file, tokenizer, translation_model)
        if output_format == "SRT":
            return srt_file

        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    # Remaining formats are documents: pick the file extension and writer.
    if output_format == "Word":
        extension, writer = ".docx", write_word
    elif output_format == "PDF":
        extension, writer = ".pdf", write_pdf
    else:  # "PowerPoint" - the only supported format left
        extension, writer = ".pptx", write_ppt

    document_file = f"{video_name}{extension}"
    writer(result, document_file, tokenizer, translation_model)
    return document_file
# Gradio interface
# Wires transcribe_video() to a web form: the four inputs below map, in
# order, to its (video_file, language, target_language, output_format)
# parameters, and the returned path is offered as a file download.
iface = gr.Interface(
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description=(
        "Upload a video file to generate subtitles in SRT format, "
        "download the video with hardsubbed subtitles, or generate "
        "Word, PDF, or PowerPoint documents using Whisper and M2M100 "
        "for translation."
    ),
    fn=transcribe_video,
    inputs=[
        gr.File(label="Upload Video"),
        gr.Dropdown(
            label="Select Video Language",
            choices=["en", "es", "fr", "de", "it", "pt"],
            value="en",
        ),
        gr.Dropdown(
            label="Select Subtitle Language",
            choices=["en", "fa", "es", "fr"],
            value="fa",
        ),
        gr.Radio(
            label="Output Format",
            choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
            value="Video with Hardsub",
        ),
    ],
    outputs=gr.File(label="Download Subtitles, Video, or Document"),
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()