File size: 7,661 Bytes
3d2a173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0dbfe10
3d2a173
 
 
 
 
 
 
0dbfe10
3d2a173
 
 
0dbfe10
3d2a173
 
 
 
 
 
 
 
 
 
 
0dbfe10
3d2a173
 
 
 
0dbfe10
3d2a173
 
 
 
 
 
 
 
 
 
0dbfe10
3d2a173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import gradio as gr
import whisper
import os
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from docx import Document  # For Word output
from fpdf import FPDF  # For PDF output
from pptx import Presentation  # For PowerPoint output
import subprocess  # To use ffmpeg for embedding subtitles
import shlex  # For better command-line argument handling

# Load the Whisper model once, at import time; transcribe_video() below reuses
# this single module-level instance for every request.
model = whisper.load_model("tiny")  # Smaller model for faster transcription

# Load M2M100 translation model for different languages.
# The same checkpoint is used for every target language, so the weights are
# kept here after the first load instead of being re-read on each request.
_M2M100_CHECKPOINT = "facebook/m2m100_418M"
_m2m100_model_cache = {}


def load_translation_model(target_language, source_language="en"):
    """Return an M2M100 tokenizer/model pair configured for one language pair.

    Args:
        target_language: Code of the language to translate into. Supported
            values are "fa" (Persian), "es" (Spanish) and "fr" (French).
        source_language: Code of the language the input text is written in.
            Defaults to "en", the value that was previously hard-coded.

    Returns:
        A ``(tokenizer, translation_model)`` tuple. A new tokenizer is built
        on every call (its ``src_lang``/``tgt_lang`` are set here, so callers
        may adjust them freely); the model is loaded once and then shared.

    Raises:
        ValueError: If ``target_language`` is not a supported code.
    """
    lang_codes = {
        "fa": "fa",  # Persian (Farsi)
        "es": "es",  # Spanish
        "fr": "fr",  # French
    }
    target_lang_code = lang_codes.get(target_language)
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")

    # Tokenizer: per-call instance, because it carries the language pair.
    tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT)
    tokenizer.src_lang = source_language
    tokenizer.tgt_lang = target_lang_code

    # Model: load the weights only the first time they are needed.
    translation_model = _m2m100_model_cache.get(_M2M100_CHECKPOINT)
    if translation_model is None:
        translation_model = M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT)
        _m2m100_model_cache[_M2M100_CHECKPOINT] = translation_model

    return tokenizer, translation_model

def translate_text(text, tokenizer, model):
    """Translate one piece of text with an already configured tokenizer/model.

    Args:
        text: The text to translate.
        tokenizer: Callable tokenizer that also provides ``tgt_lang``,
            ``get_lang_id()`` and ``decode()``.
        model: Object providing ``generate()``.

    Returns:
        The decoded translation of the first generated sequence. Empty or
        whitespace-only input is returned unchanged without calling the
        tokenizer or the model.

    Raises:
        RuntimeError: If tokenizing, generating or decoding fails. The
            original exception is attached as ``__cause__``.
    """
    # Nothing to translate; avoid asking the model to generate from nothing.
    if not text or not text.strip():
        return text

    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Force the first generated token to be the target-language marker.
        translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang))
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}") from e

# Helper function to format timestamps in SRT format (hh:mm:ss,ms)
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    Args:
        seconds: Offset in seconds (int or float).

    Returns:
        The timestamp string, rounded to the nearest millisecond. Negative
        offsets are clamped to ``00:00:00,000``.
    """
    # Convert to whole milliseconds first: rounding here (rather than
    # truncating the fractional part) keeps values such as 1.001 or 2.3 from
    # losing a millisecond to floating-point representation error.
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

# Write the transcription segments as an SRT subtitle file
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription to ``output_file`` in SRT format.

    Args:
        transcription: Mapping with a ``'segments'`` sequence; each segment
            provides ``'start'``, ``'end'`` and ``'text'``.
        output_file: Path of the SRT file to create (overwritten if present).
        tokenizer: Tokenizer handed to ``translate_text``; only used when
            ``translation_model`` is given.
        translation_model: When provided, every segment's text is translated
            before being written.
    """
    # Always write UTF-8: the locale default encoding cannot be relied on to
    # represent translated (e.g. Persian) subtitle text.
    with open(output_file, "w", encoding="utf-8") as f:
        # SRT cue numbers start at 1.
        for index, segment in enumerate(transcription['segments'], start=1):
            text = segment['text']

            if translation_model:
                text = translate_text(text, tokenizer, translation_model)

            start_time = format_timestamp(segment['start'])
            end_time = format_timestamp(segment['end'])

            # One cue: number, time range, text, then a blank separator line.
            f.write(f"{index}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")

def _escape_ffmpeg_filter_value(value):
    """Escape ``value`` for use as an option value inside an ffmpeg ``-vf`` graph."""
    value = str(value)
    # ffmpeg unescapes the string twice: once while splitting the filtergraph
    # and once while parsing the filter's own option list. Apply the
    # option-level escaping first, then the filtergraph-level escaping on top.
    for special_chars in ("\\':", "\\',[];"):
        value = "".join(f"\\{ch}" if ch in special_chars else ch for ch in value)
    return value


def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Uses ffmpeg to burn subtitles into the video (hardsub).

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT file to render onto the video.
        output_video: Path of the video to create; an existing file at this
            path is overwritten.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exceeds the 300 second
            timeout, or exits with a non-zero status (the message then
            contains ffmpeg's stderr).
    """
    # Pass an argument list instead of building and re-splitting a command
    # string, so paths containing quotes, spaces or backslashes stay intact.
    command = [
        "ffmpeg",
        "-y",  # do not stop to ask before overwriting a previous result
        "-i", str(video_file),
        "-vf", f"subtitles=filename={_escape_ffmpeg_filter_value(srt_file)}",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        str(output_video),
    ]

    printable_command = " ".join(shlex.quote(part) for part in command)
    print(f"Running command: {printable_command}")  # Debug statement

    # Only the process launch is guarded here, so that the non-zero-exit error
    # raised below is not caught and wrapped a second time.
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    print(f"ffmpeg output: {process.stdout}")  # Debug statement
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")

def write_word(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a Word document, one numbered paragraph per segment.

    Timestamps are not included. When ``translation_model`` is given, each
    segment's text is translated (using ``tokenizer``) before it is added.
    """
    document = Document()
    for number, entry in enumerate(transcription['segments'], start=1):
        line = entry['text']
        if translation_model:
            line = translate_text(line, tokenizer, translation_model)
        # Paragraph format: "<n>. <text>"
        document.add_paragraph(f"{number}. {line.strip()}")
    document.save(output_file)

def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a PDF, one numbered text cell per segment.

    Timestamps are not included. When ``translation_model`` is given, each
    segment's text is translated (using ``tokenizer``) before it is written.
    """
    document = FPDF()
    document.set_auto_page_break(auto=True, margin=15)
    document.add_page()
    # NOTE(review): "Arial" here is presumably FPDF's built-in Latin-only
    # font; non-Latin translations (e.g. Persian) may fail to render or
    # raise an encoding error. Confirm, and register a Unicode font if so.
    document.set_font("Arial", size=12)

    numbered_segments = enumerate(transcription['segments'], start=1)
    for number, entry in numbered_segments:
        body = entry['text']
        if translation_model:
            body = translate_text(body, tokenizer, translation_model)
        # Cell format: "<n>. <text>", full page width, 10 units line height.
        document.multi_cell(0, 10, f"{number}. {body.strip()}")

    document.output(output_file)

def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a PowerPoint deck, one slide per segment.

    Each slide's title holds the numbered segment text; timestamps are not
    included. When ``translation_model`` is given, each segment's text is
    translated (using ``tokenizer``) before it is placed on its slide.
    """
    deck = Presentation()

    for number, entry in enumerate(transcription['segments'], start=1):
        caption = entry['text']
        if translation_model:
            caption = translate_text(caption, tokenizer, translation_model)

        # Layout index 5 is used for every slide; the assignment below relies
        # on that layout providing a title placeholder.
        new_slide = deck.slides.add_slide(deck.slide_layouts[5])
        new_slide.shapes.title.text = f"{number}. {caption.strip()}"

    deck.save(output_file)

def transcribe_video(video_file, language, target_language, output_format):
    """Transcribe a video and return the path of the requested output file.

    Args:
        video_file: The uploaded video, either as a path (``str`` or
            ``os.PathLike``) or as an object whose ``.name`` is the path.
        language: Language code spoken in the video; passed to Whisper.
        target_language: Language code for the generated text. Translation
            is skipped when it is "en" or equal to ``language``.
        output_format: One of "SRT", "Video with Hardsub", "Word", "PDF" or
            "PowerPoint".

    Returns:
        Path of the generated file. It is written next to the input video.

    Raises:
        ValueError: If no video was provided or ``output_format`` is unknown.
        RuntimeError: If the translation model cannot be loaded or the
            subtitles cannot be burned into the video.
    """
    supported_formats = ("SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint")
    if video_file is None:
        raise ValueError("No video file was provided")
    # Fail before the expensive transcription instead of silently returning
    # None at the very end.
    if output_format not in supported_formats:
        raise ValueError(f"Unsupported output format: {output_format}")

    # Accept a plain path as well as a file-like wrapper exposing ``.name``.
    if isinstance(video_file, (str, os.PathLike)):
        video_path = os.fspath(video_file)
    else:
        video_path = video_file.name

    # Transcribe the video with Whisper
    result = model.transcribe(video_path, language=language)
    video_name = os.path.splitext(video_path)[0]

    # Load the translation model for the selected subtitle language.
    # NOTE(review): a target of "en" never translates, so a non-English video
    # with "en" selected keeps its original-language text.
    if target_language != "en" and target_language != language:
        try:
            tokenizer, translation_model = load_translation_model(target_language)
            # The loader configures English as the source language; the
            # transcript is in the video's spoken language, so use that.
            if language:
                tokenizer.src_lang = language
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e
    else:
        tokenizer, translation_model = None, None

    # The SRT file is only needed for the two subtitle outputs; skipping it
    # otherwise avoids translating every segment twice.
    if output_format in ("SRT", "Video with Hardsub"):
        srt_file = f"{video_name}.srt"
        write_srt(result, srt_file, tokenizer, translation_model)
        if output_format == "SRT":
            return srt_file

        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    if output_format == "Word":
        word_file = f"{video_name}.docx"
        write_word(result, word_file, tokenizer, translation_model)
        return word_file

    if output_format == "PDF":
        pdf_file = f"{video_name}.pdf"
        write_pdf(result, pdf_file, tokenizer, translation_model)
        return pdf_file

    # Only "PowerPoint" remains after the validation above.
    ppt_file = f"{video_name}.pptx"
    write_ppt(result, ppt_file, tokenizer, translation_model)
    return ppt_file

# Gradio interface: a single form whose submission calls transcribe_video.
iface = gr.Interface(
    fn=transcribe_video,
    # Listed in the same order as transcribe_video's parameters:
    # video_file, language, target_language, output_format.
    inputs=[
        gr.File(label="Upload Video"),
        gr.Dropdown(label="Select Video Language", choices=["en", "es", "fr", "de", "it", "pt"], value="en"),
        # Apart from "en" (no translation), these choices match the codes
        # accepted by load_translation_model.
        gr.Dropdown(label="Select Subtitle Language", choices=["en", "fa", "es", "fr"], value="fa"),
        # These strings are compared literally inside transcribe_video.
        gr.Radio(label="Output Format", choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"], value="Video with Hardsub")
    ],
    # transcribe_video returns the path of the generated file.
    outputs=gr.File(label="Download Subtitles, Video, or Document"),
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description="Upload a video file to generate subtitles in SRT format, download the video with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents using Whisper and M2M100 for translation."
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()