ChiBenevisamPas committed on
Commit
3d2a173
1 Parent(s): db36c57

Upload files

Browse files
Files changed (2) hide show
  1. app.py +191 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import whisper
3
+ import os
4
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
+ from docx import Document # For Word output
6
+ from fpdf import FPDF # For PDF output
7
+ from pptx import Presentation # For PowerPoint output
8
+ import subprocess # To use ffmpeg for embedding subtitles
9
+ import shlex # For better command-line argument handling
10
+
11
# Load the Whisper model once at import time; transcribe_video reuses this
# module-level instance for every call instead of loading it per request.
model = whisper.load_model("tiny")  # Smaller model for faster transcription
13
+
14
+ # Load M2M100 translation model for different languages
15
def load_translation_model(target_language, source_language="en"):
    """Load the M2M100 tokenizer and model configured for one language pair.

    Args:
        target_language: Code of the language to translate into. Only
            "fa" (Persian), "es" (Spanish) and "fr" (French) are accepted.
        source_language: Code of the language the input text is written in.
            Defaults to "en", the value that used to be hard-coded here, so
            existing single-argument calls behave exactly as before.

    Returns:
        A ``(tokenizer, translation_model)`` tuple. The tokenizer's
        ``src_lang`` and ``tgt_lang`` attributes are already set.

    Raises:
        ValueError: If ``target_language`` is not a supported code or
            ``source_language`` is empty.
    """
    supported_targets = {"fa", "es", "fr"}

    # Validate before touching the checkpoint: loading it is the expensive
    # part, so bad input should fail without paying for it.
    if target_language not in supported_targets:
        raise ValueError(f"Translation model for {target_language} not supported")
    if not source_language:
        raise ValueError("A source language code is required for translation")

    checkpoint = "facebook/m2m100_418M"
    tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
    translation_model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)

    tokenizer.src_lang = source_language
    tokenizer.tgt_lang = target_language

    return tokenizer, translation_model
33
+
34
def translate_text(text, tokenizer, model):
    """Translate ``text`` into the tokenizer's configured target language.

    Args:
        text: Source text to translate.
        tokenizer: Tokenizer that is called on the text and that exposes
            ``tgt_lang``, ``get_lang_id`` and ``decode``.
        model: Object whose ``generate`` method produces the output token ids.

    Returns:
        The decoded translation with special tokens removed. Empty or
        whitespace-only text is returned unchanged without calling the model.

    Raises:
        RuntimeError: If tokenizing, generating or decoding fails. The
            original exception is chained as ``__cause__``.
    """
    # Nothing to translate; skip the model call for blank segments.
    if not text or not text.strip():
        return text

    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Forcing the first generated token to the target-language id is how
        # the output language is selected for this model.
        target_id = tokenizer.get_lang_id(tokenizer.tgt_lang)
        translated = model.generate(**inputs, forced_bos_token_id=target_id)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}") from e
41
+
42
+ # Helper function to format timestamps in SRT format (hh:mm:ss,ms)
43
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp (``hh:mm:ss,mmm``).

    Args:
        seconds: Offset in seconds; may be fractional. Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Round to whole milliseconds first. Truncating the fractional part
    # separately loses a millisecond to float error (1.001 -> ",000") and
    # cannot carry into the seconds field (59.9996 must become 00:01:00,000).
    total_ms = max(0, int(round(seconds * 1000)))

    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
50
+
51
+ # Corrected write_srt function
52
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription's segments to ``output_file`` in SRT format.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment
            provides ``'start'``, ``'end'`` and ``'text'``.
        output_file: Path of the subtitle file to create or overwrite.
        tokenizer: Tokenizer passed to ``translate_text`` when translating.
        translation_model: When truthy, every segment's text is translated
            with ``translate_text`` before being written.
    """
    # Always write UTF-8: translated subtitles (e.g. Persian) are non-ASCII,
    # and the platform default encoding may be unable to represent them.
    with open(output_file, "w", encoding="utf-8") as f:
        for index, segment in enumerate(transcription['segments'], start=1):
            start = segment['start']
            end = segment['end']
            text = segment['text']

            if translation_model:
                text = translate_text(text, tokenizer, translation_model)

            start_time = format_timestamp(start)
            end_time = format_timestamp(end)

            # One cue: sequence number, time range, text, then a blank line.
            f.write(f"{index}\n{start_time} --> {end_time}\n{text.strip()}\n\n")
68
+
69
def _escape_ffmpeg_filter_value(value):
    """Escape ``value`` for use as an option value inside an ffmpeg filtergraph."""
    # First level: the filter's option parser treats \ ' and : as special.
    # The backslash is handled first so later escapes are not doubled.
    for char in ("\\", "'", ":"):
        value = value.replace(char, "\\" + char)
    # Second level: the filtergraph parser treats \ ' , ; [ ] as special.
    for char in ("\\", "'", ",", ";", "[", "]"):
        value = value.replace(char, "\\" + char)
    return value


def embed_hardsub_in_video(video_file, srt_file, output_video, timeout=300):
    """Uses ffmpeg to burn subtitles into the video (hardsub).

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT subtitle file to render onto the frames.
        output_video: Path of the re-encoded video to write.
        timeout: Seconds to wait for ffmpeg before giving up (default 300).

    Raises:
        RuntimeError: If ffmpeg cannot be started, times out, or exits with a
            non-zero status (the message then carries ffmpeg's stderr).
    """
    # Build the argument vector directly rather than formatting a string and
    # re-splitting it, so quotes or spaces in paths cannot change the command.
    command = [
        "ffmpeg",
        "-y",  # overwrite an existing output instead of prompting
        "-i", video_file,
        "-vf", f"subtitles={_escape_ffmpeg_filter_value(srt_file)}",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        output_video,
    ]

    print(f"Running command: {shlex.join(command)}")  # Debug statement
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except Exception as e:
        # Kept broad so callers still see RuntimeError for any launch failure
        # (e.g. ffmpeg missing from PATH), as they did before.
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    print(f"ffmpeg output: {process.stdout}")  # Debug statement
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
83
+
84
def write_word(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a Word document, one numbered paragraph per segment.

    Each paragraph reads ``N. [start - end] text`` with SRT-style timestamps.
    When ``translation_model`` is truthy the segment text is passed through
    ``translate_text`` (using ``tokenizer``) first.
    """
    document = Document()

    for number, entry in enumerate(transcription['segments'], start=1):
        begin, finish, content = entry['start'], entry['end'], entry['text']

        if translation_model:
            content = translate_text(content, tokenizer, translation_model)

        time_range = f"[{format_timestamp(begin)} - {format_timestamp(finish)}]"
        document.add_paragraph(f"{number}. {time_range} {content.strip()}")

    document.save(output_file)
97
+
98
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None, font_path=None):
    """Creates a PDF document from the transcription.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment
            provides ``'start'``, ``'end'`` and ``'text'``.
        output_file: Path of the PDF to write.
        tokenizer: Tokenizer passed to ``translate_text`` when translating.
        translation_model: When truthy, every segment's text is translated
            with ``translate_text`` before being written.
        font_path: Optional path to a Unicode TrueType font. Supply it when
            the text is not representable in Latin-1 (e.g. Persian). Without
            it the built-in "Arial" font is used, and characters outside
            Latin-1 are replaced by ``?``.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    if font_path:
        pdf.add_font("Subtitles", "", font_path, uni=True)
        pdf.set_font("Subtitles", size=12)
    else:
        pdf.set_font("Arial", size=12)

    for i, segment in enumerate(transcription['segments']):
        start = segment['start']
        end = segment['end']
        text = segment['text']

        if translation_model:
            text = translate_text(text, tokenizer, translation_model)

        line = f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}"
        if not font_path:
            # The built-in font is Latin-1 only; without this the whole export
            # fails with UnicodeEncodeError on the first non-Latin character.
            line = line.encode("latin-1", "replace").decode("latin-1")

        pdf.multi_cell(0, 10, line)

    pdf.output(output_file)
116
+
117
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a PowerPoint deck with one slide per segment.

    Each slide's title is set to ``N. [start - end] text``. When
    ``translation_model`` is truthy the segment text is passed through
    ``translate_text`` (using ``tokenizer``) first.
    """
    deck = Presentation()

    for number, entry in enumerate(transcription['segments'], start=1):
        begin, finish, content = entry['start'], entry['end'], entry['text']

        if translation_model:
            content = translate_text(content, tokenizer, translation_model)

        # Layout 5 must provide a title placeholder, because the title is
        # filled in below (presumably "Title Only" in the default template,
        # not a blank layout -- TODO confirm).
        layout = deck.slide_layouts[5]
        new_slide = deck.slides.add_slide(layout)
        heading = new_slide.shapes.title
        heading.text = f"{number}. [{format_timestamp(begin)} - {format_timestamp(finish)}] {content.strip()}"

    deck.save(output_file)
134
+
135
def transcribe_video(video_file, language, target_language, output_format):
    """Transcribe a video and return the path of the requested output file.

    Args:
        video_file: Uploaded video, either an object exposing its path as
            ``.name`` or a plain path string.
        language: Code of the language spoken in the video.
        target_language: Code of the language the output should be in.
        output_format: One of "SRT", "Video with Hardsub", "Word", "PDF" or
            "PowerPoint".

    Returns:
        Path of the generated file. Outputs are written next to the input
        video and share its base name.

    Raises:
        ValueError: If no video was supplied or ``output_format`` is unknown.
        RuntimeError: If the translation model cannot be loaded, translation
            fails, or ffmpeg fails to burn the subtitles in.
    """
    supported_formats = ("SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint")

    # Fail fast, before the slow transcription, on input that cannot succeed.
    if video_file is None:
        raise ValueError("No video file was uploaded")
    if output_format not in supported_formats:
        raise ValueError(f"Unsupported output format: {output_format}")

    video_path = getattr(video_file, "name", video_file)
    video_name = os.path.splitext(video_path)[0]

    # The translation model loaded below is configured with English as its
    # source language. For a non-English video that needs another language,
    # let Whisper translate the speech to English so that downstream
    # translation receives the language it expects.
    same_language = target_language == language
    needs_english = language != "en" and not same_language
    task = "translate" if needs_english else "transcribe"
    result = model.transcribe(video_path, language=language, task=task)

    if not same_language and target_language != "en":
        try:
            tokenizer, translation_model = load_translation_model(target_language)
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e

        # Translate each segment exactly once, in place, so the writers can
        # be called without a model and never repeat the work.
        for segment in result["segments"]:
            segment["text"] = translate_text(segment["text"], tokenizer, translation_model)

    if output_format in ("SRT", "Video with Hardsub"):
        # The SRT file is only produced for the two formats that use it.
        srt_file = f"{video_name}.srt"
        write_srt(result, srt_file)
        if output_format == "SRT":
            return srt_file

        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    document_writers = {
        "Word": (".docx", write_word),
        "PDF": (".pdf", write_pdf),
        "PowerPoint": (".pptx", write_ppt),
    }
    extension, writer = document_writers[output_format]
    document_file = f"{video_name}{extension}"
    writer(result, document_file)
    return document_file
175
+
176
+ # Gradio interface
177
# Input widgets, in the positional order transcribe_video expects its arguments.
_video_input = gr.File(label="Upload Video")
_video_language_input = gr.Dropdown(
    label="Select Video Language",
    choices=["en", "es", "fr", "de", "it", "pt"],
    value="en",
)
_subtitle_language_input = gr.Dropdown(
    label="Select Subtitle Language",
    choices=["en", "fa", "es", "fr"],
    value="fa",
)
_output_format_input = gr.Radio(
    label="Output Format",
    choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
    value="Video with Hardsub",
)

# Single downloadable result: transcribe_video returns a file path.
_result_output = gr.File(label="Download Subtitles, Video, or Document")

iface = gr.Interface(
    fn=transcribe_video,
    inputs=[
        _video_input,
        _video_language_input,
        _subtitle_language_input,
        _output_format_input,
    ],
    outputs=_result_output,
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description="Upload a video file to generate subtitles in SRT format, download the video with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents using Whisper and M2M100 for translation.",
)

# Start the web UI only when the file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.30.0
2
+ gradio>=3.16.0
3
+ ffmpeg-python
4
+ python-docx
5
+ fpdf
6
+ python-pptx
7
+ sentencepiece # Required for M2M100 and MarianMT translation models
8
+ librosa # Required for audio processing
9
+ git+https://github.com/openai/whisper.git