"""Gradio app that subtitles a video.

The video is transcribed with Whisper, optionally translated with M2M100, and
exported as an SRT file, a video with burned-in subtitles, or a Word / PDF /
PowerPoint document.
"""

import functools
import os
import shlex  # Used to render the ffmpeg command readably in debug output
import subprocess  # To use ffmpeg for embedding subtitles

import gradio as gr
import whisper
from docx import Document  # For Word output
from fpdf import FPDF  # For PDF output
from pptx import Presentation  # For PowerPoint output
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the Whisper model
model = whisper.load_model("tiny")  # Smaller model for faster transcription

_M2M100_MODEL_NAME = "facebook/m2m100_418M"

# Subtitle languages offered by the UI, mapped to M2M100 language codes.
_TARGET_LANG_CODES = {
    "en": "en",  # English
    "fa": "fa",  # Persian (Farsi)
    "es": "es",  # Spanish
    "fr": "fr",  # French
}

# Video languages offered by the UI plus the extra subtitle language; these
# are passed to the tokenizer as the source language.
_SOURCE_LANG_CODES = frozenset({"en", "es", "fr", "de", "it", "pt", "fa"})


@functools.lru_cache(maxsize=1)
def _load_m2m100_model():
    """Load the M2M100 weights once and reuse them for every request."""
    return M2M100ForConditionalGeneration.from_pretrained(_M2M100_MODEL_NAME)


def load_translation_model(target_language, source_language="en"):
    """Return a ``(tokenizer, model)`` pair configured for one translation.

    Args:
        target_language: Language code to translate into.
        source_language: Language code of the input text. Defaults to "en",
            which matches the previous hard-coded behaviour.

    Raises:
        ValueError: If either language code is not supported here.
    """
    target_lang_code = _TARGET_LANG_CODES.get(target_language)
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")
    if source_language not in _SOURCE_LANG_CODES:
        raise ValueError(f"Translation from {source_language} not supported")

    # A fresh tokenizer per call: it carries the mutable src/tgt language
    # settings, so it must not be shared between requests. The model itself
    # is loaded once and reused.
    tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_MODEL_NAME)
    tokenizer.src_lang = source_language
    tokenizer.tgt_lang = target_lang_code
    return tokenizer, _load_m2m100_model()


def translate_text(text, tokenizer, model):
    """Translate ``text`` into ``tokenizer.tgt_lang`` and return the result.

    Blank input is returned unchanged without invoking the model.

    Raises:
        RuntimeError: If tokenization, generation, or decoding fails.
    """
    if not text.strip():
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang),
        )
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}") from e


def format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp (hh:mm:ss,ms).

    The value is rounded to the nearest millisecond before being split into
    fields, so float noise (e.g. 1.001 stored as 1.000999...) does not drop a
    millisecond. Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"


def _segment_text(segment, tokenizer=None, translation_model=None):
    """Return the segment's text, translated if a model is given, stripped."""
    text = segment["text"]
    if translation_model is not None:
        text = translate_text(text, tokenizer, translation_model)
    return text.strip()


def _translate_segments(transcription, tokenizer, translation_model):
    """Return a copy of ``transcription`` with every segment's text translated."""
    translated = dict(transcription)
    translated["segments"] = [
        {**segment, "text": translate_text(segment["text"], tokenizer, translation_model)}
        for segment in transcription["segments"]
    ]
    return translated


def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription's segments to ``output_file`` in SRT format.

    If ``translation_model`` is given, each segment is translated first.
    """
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent translated (e.g. Persian) text.
    with open(output_file, "w", encoding="utf-8") as f:
        for index, segment in enumerate(transcription["segments"], start=1):
            text = _segment_text(segment, tokenizer, translation_model)
            start_time = format_timestamp(segment["start"])
            end_time = format_timestamp(segment["end"])
            f.write(f"{index}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text}\n\n")


def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Uses ffmpeg to burn subtitles into the video (hardsub).

    Raises:
        RuntimeError: If ffmpeg cannot be started, times out after 300
            seconds, or exits with a non-zero status.
    """
    # Pass an argument list instead of building and re-splitting a shell
    # string, so paths containing quotes or backslashes reach ffmpeg intact.
    # NOTE(review): the path is still embedded in the filter expression inside
    # single quotes; a path containing "'" would need filter-level escaping.
    command = [
        "ffmpeg",
        "-y",  # Overwrite output left over from a previous run
        "-i", video_file,
        "-vf", f"subtitles='{srt_file}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        output_video,
    ]
    printable = " ".join(shlex.quote(str(arg)) for arg in command)
    print(f"Running command: {printable}")  # Debug statement
    try:
        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",  # Undecodable bytes in ffmpeg output must not crash us
            timeout=300,
        )
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except OSError as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    print(f"ffmpeg output: {process.stdout}")  # Debug statement
    # Checked outside the try block so the message is not wrapped a second time.
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")


def write_word(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a Word document from the transcription without timestamps."""
    doc = Document()
    for index, segment in enumerate(transcription["segments"], start=1):
        text = _segment_text(segment, tokenizer, translation_model)
        doc.add_paragraph(f"{index}. {text}")  # No timestamps
    doc.save(output_file)


def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a PDF document from the transcription without timestamps.

    Raises:
        ValueError: If a segment contains characters outside Latin-1, which
            the built-in PDF font used here cannot encode.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for index, segment in enumerate(transcription["segments"], start=1):
        line = f"{index}. {_segment_text(segment, tokenizer, translation_model)}"
        try:
            # Fail early with a clear message instead of an opaque encoding
            # error from inside the PDF library.
            line.encode("latin-1")
        except UnicodeEncodeError as e:
            raise ValueError(
                "PDF output only supports Latin-1 text with the built-in font; "
                "choose SRT, Word, or PowerPoint output for this language."
            ) from e
        pdf.multi_cell(0, 10, line)  # No timestamps
    pdf.output(output_file)


def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a PowerPoint presentation from the transcription without timestamps."""
    ppt = Presentation()
    for index, segment in enumerate(transcription["segments"], start=1):
        text = _segment_text(segment, tokenizer, translation_model)
        # Layout 5 must provide a title placeholder, since the text goes there.
        slide = ppt.slides.add_slide(ppt.slide_layouts[5])
        slide.shapes.title.text = f"{index}. {text}"  # No timestamps
    ppt.save(output_file)


# Document output formats: UI label -> (file extension, writer function).
_DOCUMENT_WRITERS = {
    "Word": (".docx", write_word),
    "PDF": (".pdf", write_pdf),
    "PowerPoint": (".pptx", write_ppt),
}


def transcribe_video(video_file, language, target_language, output_format):
    """Transcribe a video and return the path of the requested output file.

    Args:
        video_file: Uploaded file, either an object with a ``name`` attribute
            holding the path, or the path itself.
        language: Language code of the speech in the video.
        target_language: Language code for the subtitles. Translation is
            performed only when it differs from ``language``.
        output_format: One of "SRT", "Video with Hardsub", "Word", "PDF",
            "PowerPoint".

    Raises:
        ValueError: If no file was given or ``output_format`` is unknown.
        RuntimeError: If loading the translation model or running ffmpeg fails.
    """
    if video_file is None:
        raise ValueError("No video file was provided")
    # Validate before the expensive transcription/translation work.
    if output_format not in ("SRT", "Video with Hardsub", *_DOCUMENT_WRITERS):
        raise ValueError(f"Unsupported output format: {output_format}")

    video_path = getattr(video_file, "name", video_file)

    # Transcribe the video with Whisper
    result = model.transcribe(video_path, language=language)
    video_name = os.path.splitext(video_path)[0]

    # Translate once, up front, so the SRT and the document writers share the
    # same translated segments instead of each re-running the model.
    if target_language != language:
        try:
            tokenizer, translation_model = load_translation_model(
                target_language, source_language=language
            )
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e
        result = _translate_segments(result, tokenizer, translation_model)

    # Save the SRT file
    srt_file = f"{video_name}.srt"
    write_srt(result, srt_file)

    # Output based on user's selection
    if output_format == "SRT":
        return srt_file

    if output_format == "Video with Hardsub":
        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    extension, writer = _DOCUMENT_WRITERS[output_format]
    document_file = f"{video_name}{extension}"
    writer(result, document_file)
    return document_file


# Gradio interface
iface = gr.Interface(
    fn=transcribe_video,
    inputs=[
        gr.File(label="Upload Video"),
        gr.Dropdown(
            label="Select Video Language",
            choices=["en", "es", "fr", "de", "it", "pt"],
            value="en",
        ),
        gr.Dropdown(
            label="Select Subtitle Language",
            choices=["en", "fa", "es", "fr"],
            value="fa",
        ),
        gr.Radio(
            label="Output Format",
            choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
            value="Video with Hardsub",
        ),
    ],
    outputs=gr.File(label="Download Subtitles, Video, or Document"),
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description="Upload a video file to generate subtitles in SRT format, download the video with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents using Whisper and M2M100 for translation.",
)

if __name__ == "__main__":
    iface.launch()