"""Gradio app that subtitles a video with Whisper and exports the result.

The pipeline is: transcribe the uploaded video with Whisper, optionally
translate every segment with M2M100, then export the segments as an SRT
file, a video with burned-in subtitles (via ffmpeg), a Word document, a PDF
or a PowerPoint deck.
"""

import functools
import os
import shlex  # noqa: F401  (no longer used here; kept so the file's import set is unchanged)
import subprocess

import gradio as gr
import whisper
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from fpdf import FPDF
from pptx import Presentation
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the Whisper model (smaller model for faster transcription)
model = whisper.load_model("tiny")

# Hugging Face checkpoint used for every translation direction.
M2M100_CHECKPOINT = "facebook/m2m100_418M"

# UI language code -> M2M100 language code for the supported targets.
SUPPORTED_TARGET_LANGUAGES = {
    "en": "en",  # English
    "fa": "fa",  # Persian (Farsi)
    "es": "es",  # Spanish
    "fr": "fr",  # French
}

# Languages that must be laid out right-to-left in the exported documents.
RTL_LANGUAGES = {"fa"}

# TrueType font registered for PDF output.
PDF_FONT_PATH = "/home/user/app/B-NAZANIN.TTF"

# Index of the slide layout used for every PowerPoint slide.
PPT_SLIDE_LAYOUT_INDEX = 5


@functools.lru_cache(maxsize=1)
def _load_m2m100_model():
    """Load the M2M100 weights once and reuse them for later requests."""
    return M2M100ForConditionalGeneration.from_pretrained(M2M100_CHECKPOINT)


# Load M2M100 translation model for different languages
def load_translation_model(target_language, source_language="en"):
    """Return a ``(tokenizer, model)`` pair configured for one direction.

    Args:
        target_language: Language code to translate into; must be a key of
            ``SUPPORTED_TARGET_LANGUAGES``.
        source_language: M2M100 code of the language being translated from.
            Defaults to ``"en"``, the value that used to be hard-coded.

    Returns:
        A fresh tokenizer with ``src_lang``/``tgt_lang`` set, and the shared
        cached model.

    Raises:
        ValueError: If ``target_language`` is not supported.
    """
    target_lang_code = SUPPORTED_TARGET_LANGUAGES.get(target_language)
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")

    # The tokenizer carries per-request language state, so it is created per
    # call; only the stateless model weights are cached.
    tokenizer = M2M100Tokenizer.from_pretrained(M2M100_CHECKPOINT)
    tokenizer.src_lang = source_language
    tokenizer.tgt_lang = target_lang_code
    return tokenizer, _load_m2m100_model()


def translate_text(text, tokenizer, model):
    """Translate ``text`` into the tokenizer's target language.

    Args:
        text: Text to translate.
        tokenizer: Tokenizer returned by ``load_translation_model``.
        model: Model returned by ``load_translation_model``.

    Returns:
        The translated string. Blank input is returned unchanged.

    Raises:
        RuntimeError: If tokenization, generation or decoding fails.
    """
    if not text.strip():
        # Nothing to translate; skip a pointless generation pass.
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang),
        )
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}") from e


def _segment_texts(transcription, tokenizer=None, translation_model=None):
    """Yield the stripped (and optionally translated) text of each segment."""
    for segment in transcription["segments"]:
        text = segment["text"]
        if translation_model is not None:
            text = translate_text(text, tokenizer, translation_model)
        yield text.strip()


def _translate_segments(transcription, tokenizer, translation_model):
    """Return a copy of ``transcription`` whose segment texts are translated."""
    translated = dict(transcription)
    translated["segments"] = [
        {**segment, "text": translate_text(segment["text"], tokenizer, translation_model)}
        for segment in transcription["segments"]
    ]
    return translated


# Helper function to format timestamps in SRT format
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first, so float noise such as
    ``1.001 % 1 == 0.000999...`` cannot drop a millisecond. Negative input is
    clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"


def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription's segments to ``output_file`` in SRT format.

    Args:
        transcription: Dict with a ``"segments"`` list whose items have
            ``"start"``, ``"end"`` and ``"text"`` keys.
        output_file: Path of the SRT file to create.
        tokenizer: Optional tokenizer used together with ``translation_model``.
        translation_model: When given, each segment is translated first.
    """
    # Explicit UTF-8 so non-Latin subtitles do not depend on the locale.
    with open(output_file, "w", encoding="utf-8") as f:
        for i, segment in enumerate(transcription["segments"]):
            text = segment["text"]
            if translation_model is not None:
                text = translate_text(text, tokenizer, translation_model)
            start_time = format_timestamp(segment["start"])
            end_time = format_timestamp(segment["end"])
            f.write(f"{i + 1}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")


def _escape_ffmpeg_filter_value(value):
    """Escape ``value`` for use as an option value inside an ffmpeg ``-vf`` graph.

    ffmpeg parses the string twice: once as a filter option value (``\\``,
    ``:`` and ``'`` are special) and once as a filtergraph description
    (``\\``, ``'``, ``[``, ``]``, ``,`` and ``;`` are special). Backslashes
    are escaped first in each pass so earlier escapes are preserved.
    """
    for special in ("\\", ":", "'"):
        value = value.replace(special, "\\" + special)
    for special in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(special, "\\" + special)
    return value


# Embedding subtitles into video (hardsub)
def embed_hardsub_in_video(video_file, srt_file, output_video, timeout=300):
    """Burn ``srt_file`` into ``video_file`` with ffmpeg, writing ``output_video``.

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT subtitles to render onto the frames.
        output_video: Path of the re-encoded video; overwritten if present.
        timeout: Maximum number of seconds ffmpeg may run.

    Raises:
        RuntimeError: If ffmpeg cannot be started, times out, or exits with a
            non-zero status (its stderr is included in the message).
    """
    # An argument list avoids shell-style re-parsing of the paths; only the
    # filtergraph needs its own escaping.
    command = [
        "ffmpeg",
        "-y",  # overwrite instead of prompting when the output already exists
        "-i", video_file,
        "-vf", f"subtitles={_escape_ffmpeg_filter_value(srt_file)}",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        output_video,
    ]
    try:
        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",  # ffmpeg's log may contain undecodable bytes
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except (OSError, ValueError) as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")


def _set_paragraph_rtl(paragraph):
    """Mark a python-docx paragraph and its runs as right-to-left."""
    paragraph_properties = paragraph._p.get_or_add_pPr()
    bidi = OxmlElement("w:bidi")
    bidi.set(qn("w:val"), "1")
    paragraph_properties.append(bidi)
    for run in paragraph.runs:
        run.font.rtl = True


# Helper function to write Word documents
def write_word(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Write one numbered paragraph per segment to a ``.docx`` file.

    Args:
        transcription: Dict with a ``"segments"`` list of ``"text"`` items.
        output_file: Path of the document to create.
        tokenizer: Optional tokenizer used together with ``translation_model``.
        translation_model: When given, each segment is translated first.
        target_language: Language of the written text; paragraphs are marked
            right-to-left when it is in ``RTL_LANGUAGES``.
    """
    doc = Document()
    rtl = target_language in RTL_LANGUAGES
    for number, text in enumerate(_segment_texts(transcription, tokenizer, translation_model), start=1):
        para = doc.add_paragraph(f"{number}. {text}")
        if rtl:
            _set_paragraph_rtl(para)
    doc.save(output_file)


# Helper function to reverse text for RTL
def reverse_text_for_rtl(text):
    """Reverse the characters of every whitespace-separated word in ``text``.

    Word order is kept and runs of whitespace collapse to single spaces.
    NOTE(review): presumably a workaround for FPDF drawing glyphs
    left-to-right -- confirm the rendered output reads correctly.
    """
    return ' '.join([word[::-1] for word in text.split()])


# Helper function to write PDF documents
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Write one numbered line per segment to a PDF file.

    Args:
        transcription: Dict with a ``"segments"`` list of ``"text"`` items.
        output_file: Path of the PDF to create.
        tokenizer: Optional tokenizer used together with ``translation_model``.
        translation_model: When given, each segment is translated first.
        target_language: Language of the written text. Only languages in
            ``RTL_LANGUAGES`` are word-reversed and right-aligned; everything
            else is written as-is, left-aligned.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.add_font('B-NAZANIN', '', PDF_FONT_PATH, uni=True)
    pdf.set_font('B-NAZANIN', size=12)

    rtl = target_language in RTL_LANGUAGES
    align = 'R' if rtl else 'L'
    for number, text in enumerate(_segment_texts(transcription, tokenizer, translation_model), start=1):
        if rtl:
            text = reverse_text_for_rtl(text)
        pdf.multi_cell(0, 10, f"{number}. {text}", align=align)

    pdf.output(output_file)


# Helper function to write PowerPoint slides
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write one slide per segment, with the numbered text as the slide title.

    Args:
        transcription: Dict with a ``"segments"`` list of ``"text"`` items.
        output_file: Path of the ``.pptx`` file to create.
        tokenizer: Optional tokenizer used together with ``translation_model``.
        translation_model: When given, each segment is translated first.
    """
    ppt = Presentation()
    layout = ppt.slide_layouts[PPT_SLIDE_LAYOUT_INDEX]
    for number, text in enumerate(_segment_texts(transcription, tokenizer, translation_model), start=1):
        slide = ppt.slides.add_slide(layout)
        slide.shapes.title.text = f"{number}. {text}"
    ppt.save(output_file)


# Transcribing video and generating output
def transcribe_video(video_file, language, target_language, output_format):
    """Transcribe an uploaded video and export it in the requested format.

    Args:
        video_file: The upload from Gradio: either an object exposing the
            path as ``.name`` or a plain path string.
        language: Language code of the audio, passed to Whisper.
        target_language: Language code the subtitles should be written in.
        output_format: One of ``"SRT"``, ``"Video with Hardsub"``, ``"Word"``,
            ``"PDF"`` or ``"PowerPoint"``.

    Returns:
        Path of the generated file. An ``.srt`` file is always written next
        to the upload as well.

    Raises:
        ValueError: If no file was uploaded or the format is unknown.
        RuntimeError: If loading the translation model, translating, or
            embedding the subtitles fails.
    """
    if video_file is None:
        raise ValueError("No video file was uploaded.")
    video_path = video_file if isinstance(video_file, str) else video_file.name

    result = model.transcribe(video_path, language=language)
    # Fall back to the language Whisper reports when none was selected.
    source_language = language or result.get("language") or "en"
    video_name = os.path.splitext(video_path)[0]

    # Translate only when the requested language differs from the audio's,
    # and do it once so every writer below reuses the same text.
    if target_language != source_language:
        try:
            tokenizer, translation_model = load_translation_model(target_language, source_language)
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e
        result = _translate_segments(result, tokenizer, translation_model)

    srt_file = f"{video_name}.srt"
    write_srt(result, srt_file)

    if output_format == "SRT":
        return srt_file

    if output_format == "Video with Hardsub":
        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    if output_format == "Word":
        word_file = f"{video_name}.docx"
        write_word(result, word_file, target_language=target_language)
        return word_file

    if output_format == "PDF":
        pdf_file = f"{video_name}.pdf"
        write_pdf(result, pdf_file, target_language=target_language)
        return pdf_file

    if output_format == "PowerPoint":
        ppt_file = f"{video_name}.pptx"
        write_ppt(result, ppt_file)
        return ppt_file

    raise ValueError(f"Unsupported output format: {output_format}")


# Gradio interface with better UI
iface = gr.Interface(
    fn=transcribe_video,
    inputs=[
        gr.File(label="Upload Video File"),
        gr.Dropdown(
            label="Select Original Video Language",
            choices=["en", "es", "fr", "de", "it", "pt"],
            value="en",
        ),
        gr.Dropdown(
            label="Select Subtitle Translation Language",
            choices=["en", "fa", "es", "fr"],
            value="fa",
        ),
        gr.Radio(
            label="Choose Output Format",
            choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
            value="Video with Hardsub",
        ),
    ],
    outputs=gr.File(label="Download File"),
    title="Video Subtitle Generator with Translation & Multi-Format Output",
    description=(
        "This tool allows you to generate subtitles from a video file using Whisper, "
        "translate the subtitles into multiple languages using M2M100, and export them "
        "in various formats including SRT, hardcoded subtitles in video, Word, PDF, or PowerPoint."
    ),
    theme="compact",
    live=False  # No live interaction needed
)

if __name__ == "__main__":
    iface.launch()