import gradio as gr
import whisper
import os
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from docx import Document  # For Word output
from fpdf import FPDF  # For PDF output
from pptx import Presentation  # For PowerPoint output
import subprocess  # To run ffmpeg for embedding subtitles
import shlex  # For safer command-line argument handling
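# Assumed dependencies (usual PyPI package names, not pinned or verified here):
#   pip install gradio openai-whisper transformers torch python-docx fpdf python-pptx
# The ffmpeg binary must also be installed and on PATH for the hardsub output option.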
# Load the Whisper model
model = whisper.load_model("tiny")  # Smaller model for faster transcription
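# Other Whisper checkpoints ("base", "small", "medium", "large") trade speed for accuracy.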
# Load the M2M100 translation model for the selected subtitle language
def load_translation_model(target_language):
    lang_codes = {
        "fa": "fa",  # Persian (Farsi)
        "es": "es",  # Spanish
        "fr": "fr",  # French
    }
    target_lang_code = lang_codes.get(target_language)
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")

    # Load M2M100 model and tokenizer
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer.src_lang = "en"
    tokenizer.tgt_lang = target_lang_code
    return tokenizer, translation_model
def translate_text(text, tokenizer, model):
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang))
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}")
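# Minimal usage sketch (assumes the facebook/m2m100_418M weights can be fetched on first use):
#   tokenizer, m2m = load_translation_model("fa")
#   translate_text("Hello, world!", tokenizer, m2m)  # -> Persian translation of the input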
# Helper function to format timestamps in SRT format (hh:mm:ss,ms)
def format_timestamp(seconds):
    milliseconds = int((seconds % 1) * 1000)
    seconds = int(seconds)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
# Write the transcription (optionally translated) as an SRT subtitle file
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    # utf-8 is required for non-Latin subtitle languages such as Persian
    with open(output_file, "w", encoding="utf-8") as f:
        for i, segment in enumerate(transcription['segments']):
            start = segment['start']
            end = segment['end']
            text = segment['text']
            if translation_model:
                text = translate_text(text, tokenizer, translation_model)
            start_time = format_timestamp(start)
            end_time = format_timestamp(end)
            f.write(f"{i + 1}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Uses ffmpeg to burn subtitles into the video (hardsub)."""
    command = f'ffmpeg -i "{video_file}" -vf "subtitles=\'{srt_file}\'" -c:v libx264 -crf 23 -preset medium "{output_video}"'
    try:
        print(f"Running command: {command}")  # Debug statement
        process = subprocess.run(shlex.split(command), capture_output=True, text=True, timeout=300)
        print(f"ffmpeg output: {process.stdout}")  # Debug statement
        if process.returncode != 0:
            raise RuntimeError(f"ffmpeg error: {process.stderr}")
    except subprocess.TimeoutExpired:
        raise RuntimeError("ffmpeg process timed out.")
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}")
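# Sketch of the same call with an argument list instead of a shell-style string, which
# sidesteps shlex quoting for unusual filenames (same flags, not verified here):
#   subprocess.run(["ffmpeg", "-i", video_file, "-vf", f"subtitles={srt_file}",
#                   "-c:v", "libx264", "-crf", "23", "-preset", "medium", output_video],
#                  capture_output=True, text=True, timeout=300)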
def write_word(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a Word document from the transcription without timestamps."""
    doc = Document()
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        doc.add_paragraph(f"{i + 1}. {text.strip()}")  # No timestamps
    doc.save(output_file)
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a PDF document from the transcription without timestamps."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # Note: FPDF's built-in Arial font only covers Latin-1; non-Latin output
    # (e.g. Persian) would need a Unicode TTF registered via pdf.add_font().
    pdf.set_font("Arial", size=12)
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        pdf.multi_cell(0, 10, f"{i + 1}. {text.strip()}")  # No timestamps
    pdf.output(output_file)
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a PowerPoint presentation from the transcription without timestamps."""
    ppt = Presentation()
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        slide = ppt.slides.add_slide(ppt.slide_layouts[5])  # Title-only layout
        title = slide.shapes.title
        title.text = f"{i + 1}. {text.strip()}"  # No timestamps
    ppt.save(output_file)
def transcribe_video(video_file, language, target_language, output_format):
    # Transcribe the video with Whisper
    result = model.transcribe(video_file.name, language=language)
    video_name = os.path.splitext(video_file.name)[0]

    # Load the translation model for the selected subtitle language
    if target_language != "en":
        try:
            tokenizer, translation_model = load_translation_model(target_language)
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}")
    else:
        tokenizer, translation_model = None, None

    # Save the SRT file
    srt_file = f"{video_name}.srt"
    write_srt(result, srt_file, tokenizer, translation_model)

    # Produce the output selected by the user
    if output_format == "SRT":
        return srt_file
    elif output_format == "Video with Hardsub":
        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_file.name, srt_file, output_video)
            return output_video
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}")
    elif output_format == "Word":
        word_file = f"{video_name}.docx"
        write_word(result, word_file, tokenizer, translation_model)
        return word_file
    elif output_format == "PDF":
        pdf_file = f"{video_name}.pdf"
        write_pdf(result, pdf_file, tokenizer, translation_model)
        return pdf_file
    elif output_format == "PowerPoint":
        ppt_file = f"{video_name}.pptx"
        write_ppt(result, ppt_file, tokenizer, translation_model)
        return ppt_file
# Gradio interface
iface = gr.Interface(
    fn=transcribe_video,
    inputs=[
        gr.File(label="Upload Video"),
        gr.Dropdown(label="Select Video Language", choices=["en", "es", "fr", "de", "it", "pt"], value="en"),
        gr.Dropdown(label="Select Subtitle Language", choices=["en", "fa", "es", "fr"], value="fa"),
        gr.Radio(label="Output Format", choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"], value="Video with Hardsub")
    ],
    outputs=gr.File(label="Download Subtitles, Video, or Document"),
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description="Upload a video to generate SRT subtitles, a hardsubbed video, or a Word, PDF, or PowerPoint document. Transcription uses Whisper; subtitle translation uses M2M100."
)

if __name__ == "__main__":
    iface.launch()