ChiBenevisamPas committed on
Commit
daf618a
·
verified ·
1 Parent(s): 1e62147

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -52
app.py CHANGED
@@ -2,14 +2,15 @@ import gradio as gr
2
  import whisper
3
  import os
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
- from docx import Document # For Word output
6
- from fpdf import FPDF # For PDF output
7
- from pptx import Presentation # For PowerPoint output
8
- import subprocess # To use ffmpeg for embedding subtitles
9
- import shlex # For better command-line argument handling
 
10
 
11
- # Load the Whisper model
12
- model = whisper.load_model("tiny") # Smaller model for faster transcription
13
 
14
  # Load M2M100 translation model for different languages
15
  def load_translation_model(target_language):
@@ -17,12 +18,20 @@ def load_translation_model(target_language):
17
  "fa": "fa", # Persian (Farsi)
18
  "es": "es", # Spanish
19
  "fr": "fr", # French
 
 
 
 
 
 
 
 
 
20
  }
21
  target_lang_code = lang_codes.get(target_language)
22
  if not target_lang_code:
23
  raise ValueError(f"Translation model for {target_language} not supported")
24
 
25
- # Load M2M100 model and tokenizer
26
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
27
  translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
28
 
@@ -39,7 +48,7 @@ def translate_text(text, tokenizer, model):
39
  except Exception as e:
40
  raise RuntimeError(f"Error during translation: {e}")
41
 
42
- # Helper function to format timestamps in SRT format (hh:mm:ss,ms)
43
  def format_timestamp(seconds):
44
  milliseconds = int((seconds % 1) * 1000)
45
  seconds = int(seconds)
@@ -66,73 +75,81 @@ def write_srt(transcription, output_file, tokenizer=None, translation_model=None
66
  f.write(f"{start_time} --> {end_time}\n")
67
  f.write(f"{text.strip()}\n\n")
68
 
 
69
  def embed_hardsub_in_video(video_file, srt_file, output_video):
70
- """Uses ffmpeg to burn subtitles into the video (hardsub)."""
71
  command = f'ffmpeg -i "{video_file}" -vf "subtitles=\'{srt_file}\'" -c:v libx264 -crf 23 -preset medium "{output_video}"'
72
-
73
  try:
74
- print(f"Running command: {command}") # Debug statement
75
  process = subprocess.run(shlex.split(command), capture_output=True, text=True, timeout=300)
76
- print(f"ffmpeg output: {process.stdout}") # Debug statement
77
  if process.returncode != 0:
78
- raise RuntimeError(f"ffmpeg error: {process.stderr}") # Print the error
79
  except subprocess.TimeoutExpired:
80
  raise RuntimeError("ffmpeg process timed out.")
81
  except Exception as e:
82
  raise RuntimeError(f"Error running ffmpeg: {e}")
83
 
84
- def write_word(transcription, output_file, tokenizer=None, translation_model=None):
85
- """Creates a Word document from the transcription without timestamps."""
86
  doc = Document()
 
87
  for i, segment in enumerate(transcription['segments']):
88
  text = segment['text']
89
-
90
  if translation_model:
91
  text = translate_text(text, tokenizer, translation_model)
92
-
93
- doc.add_paragraph(f"{i + 1}. {text.strip()}")
 
94
  doc.save(output_file)
95
 
 
 
 
 
 
96
  def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
97
- """Creates a PDF document from the transcription without timestamps."""
98
  pdf = FPDF()
99
- pdf.set_auto_page_break(auto=True, margin=15)
100
  pdf.add_page()
101
- pdf.set_font("Arial", size=12)
102
-
 
103
  for i, segment in enumerate(transcription['segments']):
104
  text = segment['text']
105
-
106
  if translation_model:
107
  text = translate_text(text, tokenizer, translation_model)
108
-
109
- pdf.multi_cell(0, 10, f"{i + 1}. {text.strip()}")
110
-
111
  pdf.output(output_file)
112
 
 
113
  def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
114
- """Creates a PowerPoint presentation from the transcription without timestamps."""
115
  ppt = Presentation()
116
-
117
  for i, segment in enumerate(transcription['segments']):
118
  text = segment['text']
119
-
120
  if translation_model:
121
  text = translate_text(text, tokenizer, translation_model)
122
-
123
- slide = ppt.slides.add_slide(ppt.slide_layouts[5]) # Blank slide
124
  title = slide.shapes.title
125
  title.text = f"{i + 1}. {text.strip()}"
126
-
127
  ppt.save(output_file)
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- def transcribe_video(video_file, language, target_language, output_format):
131
- # Transcribe the video with Whisper
132
- result = model.transcribe(video_file.name, language=language)
133
- video_name = os.path.splitext(video_file.name)[0]
134
-
135
- # Load the translation model for the selected subtitle language
136
  if target_language != "en":
137
  try:
138
  tokenizer, translation_model = load_translation_model(target_language)
@@ -141,23 +158,21 @@ def transcribe_video(video_file, language, target_language, output_format):
141
  else:
142
  tokenizer, translation_model = None, None
143
 
144
- # Save the SRT file
145
  srt_file = f"{video_name}.srt"
146
  write_srt(result, srt_file, tokenizer, translation_model)
147
 
148
- # Output based on user's selection
149
  if output_format == "SRT":
150
  return srt_file
151
  elif output_format == "Video with Hardsub":
152
  output_video = f"{video_name}_with_subtitles.mp4"
153
  try:
154
- embed_hardsub_in_video(video_file.name, srt_file, output_video)
155
  return output_video
156
  except Exception as e:
157
  raise RuntimeError(f"Error embedding subtitles in video: {e}")
158
  elif output_format == "Word":
159
  word_file = f"{video_name}.docx"
160
- write_word(result, word_file, tokenizer, translation_model)
161
  return word_file
162
  elif output_format == "PDF":
163
  pdf_file = f"{video_name}.pdf"
@@ -168,19 +183,28 @@ def transcribe_video(video_file, language, target_language, output_format):
168
  write_ppt(result, ppt_file, tokenizer, translation_model)
169
  return ppt_file
170
 
171
- # Gradio interface
172
  iface = gr.Interface(
173
  fn=transcribe_video,
174
  inputs=[
175
- gr.File(label="Upload Video"),
176
- gr.Dropdown(label="Select Video Language", choices=["en", "es", "fr", "de", "it", "pt"], value="en"),
177
- gr.Dropdown(label="Select Subtitle Language", choices=["en", "fa", "es", "fr"], value="fa"),
178
- gr.Radio(label="Output Format", choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"], value="Video with Hardsub")
 
179
  ],
180
- outputs=gr.File(label="Download Subtitles, Video, or Document"),
181
- title="Video Subtitle Generator with Hardsub and Document Formats",
182
- description="Upload a video file to generate subtitles in SRT format, download the video with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents using Whisper and M2M100 for translation."
 
 
 
 
 
 
183
  )
184
 
185
  if __name__ == "__main__":
186
- iface.launch()
 
 
 
2
  import whisper
3
  import os
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
+ from docx import Document
6
+ from fpdf import FPDF
7
+ from pptx import Presentation
8
+ import subprocess
9
+ import shlex
10
+ import yt_dlp
11
 
12
+ # Load the Whisper model (smaller model for faster transcription)
13
+ model = whisper.load_model("tiny")
14
 
15
  # Load M2M100 translation model for different languages
16
  def load_translation_model(target_language):
 
18
  "fa": "fa", # Persian (Farsi)
19
  "es": "es", # Spanish
20
  "fr": "fr", # French
21
+ "de": "de", # German
22
+ "it": "it", # Italian
23
+ "pt": "pt", # Portuguese
24
+ "ar": "ar", # Arabic
25
+ "zh": "zh", # Chinese
26
+ "hi": "hi", # Hindi
27
+ "ja": "ja", # Japanese
28
+ "ko": "ko", # Korean
29
+ "ru": "ru", # Russian
30
  }
31
  target_lang_code = lang_codes.get(target_language)
32
  if not target_lang_code:
33
  raise ValueError(f"Translation model for {target_language} not supported")
34
 
 
35
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
36
  translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
37
 
 
48
  except Exception as e:
49
  raise RuntimeError(f"Error during translation: {e}")
50
 
51
+ # Helper function to format timestamps in SRT format
52
  def format_timestamp(seconds):
53
  milliseconds = int((seconds % 1) * 1000)
54
  seconds = int(seconds)
 
75
  f.write(f"{start_time} --> {end_time}\n")
76
  f.write(f"{text.strip()}\n\n")
77
 
78
# Embedding subtitles into video (hardsub)
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Burn the subtitles in ``srt_file`` into ``video_file`` using ffmpeg.

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT subtitle file to render onto the frames.
        output_video: Path of the re-encoded video to write. An existing
            file at this path is overwritten.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exceeds the 300 second
            timeout, or exits with a non-zero status (the message then
            carries ffmpeg's stderr).
    """
    # The subtitle path is parsed twice by ffmpeg: once as a filter option
    # value and once as part of the filtergraph description. Escape for the
    # option level first (backslash must go first so the escapes added for
    # ':' and "'" are not doubled), then make the value safe inside the
    # single-quoted filtergraph string.
    subtitle_path = str(srt_file)
    for special in ("\\", ":", "'"):
        subtitle_path = subtitle_path.replace(special, "\\" + special)
    subtitle_path = subtitle_path.replace("'", "'\\''")

    # Pass an argument list instead of building a shell-style string and
    # re-splitting it, so quotes or spaces in the paths cannot change how
    # the command line is tokenised.
    command = [
        "ffmpeg",
        "-y",  # overwrite a previous result instead of prompting/refusing
        "-i", str(video_file),
        "-vf", f"subtitles='{subtitle_path}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        str(output_video),
    ]

    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except (OSError, ValueError) as e:
        # e.g. the ffmpeg executable is missing or its output is undecodable
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    # Checked outside the try block so this error is not caught and wrapped
    # a second time by the handlers above.
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
89
 
90
# Helper function to write Word documents
def write_word(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Write the transcription segments to a Word document, one per paragraph.

    Each paragraph is "<n>. <text>" with n counting from 1. Timestamps are
    not included.

    Args:
        transcription: Mapping with a 'segments' list; each segment provides
            a 'text' entry.
        output_file: Path of the .docx file to save.
        tokenizer: Tokenizer handed to ``translate_text`` when translating.
        translation_model: When truthy, every segment is translated with
            ``translate_text`` before being written.
        target_language: Language code of the written text. "fa" and "ar"
            produce right-to-left paragraphs.
    """
    # Local import: OxmlElement is needed only for the RTL markup below.
    from docx.oxml import OxmlElement

    doc = Document()
    rtl = target_language in ("fa", "ar")
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        para = doc.add_paragraph(f"{i + 1}. {text.strip()}")
        if rtl:
            # ParagraphFormat exposes no right-to-left property, so assigning
            # one has no effect on the saved file. Mark the paragraph as
            # bidirectional in its XML properties and flag each run as RTL.
            para._p.get_or_add_pPr().append(OxmlElement('w:bidi'))
            for run in para.runs:
                run.font.rtl = True
    doc.save(output_file)
102
 
103
# Helper function to reverse text for RTL
def reverse_text_for_rtl(text):
    """Reverse the characters of right-to-left words, leaving other words as is.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace collapse and leading/trailing whitespace is dropped.
    A word is reversed only when it contains at least one character whose
    Unicode bidirectional class is right-to-left ("R" or "AL"); Latin words
    and numbers are returned unchanged so left-to-right text is not garbled.
    Word order is kept, and no Arabic letter shaping is performed.

    Args:
        text: The string to process.

    Returns:
        The processed string.
    """
    # Local import keeps this helper self-contained.
    import unicodedata

    def _has_rtl_char(word):
        return any(unicodedata.bidirectional(ch) in ("R", "AL") for ch in word)

    return ' '.join(
        word[::-1] if _has_rtl_char(word) else word
        for word in text.split()
    )
106
+
107
# Helper function to write PDF documents
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription segments to a PDF, one right-aligned cell each.

    Every segment becomes "<n>. <text>" with n counting from 1. The text is
    optionally translated with ``translate_text`` and is always passed
    through ``reverse_text_for_rtl`` before being written.

    Args:
        transcription: Mapping with a 'segments' list; each segment provides
            a 'text' entry.
        output_file: Path of the PDF file to write.
        tokenizer: Tokenizer handed to ``translate_text`` when translating.
        translation_model: When truthy, segments are translated first.
    """
    document = FPDF()
    document.add_page()

    # The font file location is hard-coded; it is registered as a Unicode font.
    nazanin_ttf = "/home/user/app/B-NAZANIN.TTF"
    document.add_font('B-NAZANIN', '', nazanin_ttf, uni=True)
    document.set_font('B-NAZANIN', size=12)

    for number, segment in enumerate(transcription['segments'], start=1):
        line = segment['text']
        if translation_model:
            line = translate_text(line, tokenizer, translation_model)
        line = reverse_text_for_rtl(line).strip()
        document.multi_cell(0, 10, f"{number}. {line}", align='R')

    document.output(output_file)
121
 
122
# Helper function to write PowerPoint slides
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the transcription to a PowerPoint file, one slide per segment.

    Each slide uses layout index 5 of the default presentation and carries
    "<n>. <text>" (n counting from 1) in its title placeholder.

    Args:
        transcription: Mapping with a 'segments' list; each segment provides
            a 'text' entry.
        output_file: Path of the .pptx file to save.
        tokenizer: Tokenizer handed to ``translate_text`` when translating.
        translation_model: When truthy, segments are translated first.
    """
    deck = Presentation()
    for number, segment in enumerate(transcription['segments'], start=1):
        caption = segment['text']
        if translation_model:
            caption = translate_text(caption, tokenizer, translation_model)
        new_slide = deck.slides.add_slide(deck.slide_layouts[5])
        new_slide.shapes.title.text = f"{number}. {caption.strip()}"
    deck.save(output_file)
133
 
134
# Function to download YouTube video
def download_youtube_video(url):
    """Download the video at ``url`` as mp4 and return the local file path.

    Every call downloads into its own freshly created temporary directory.
    A fixed file name in the working directory would be shared by all
    requests: parallel downloads would collide, and because yt-dlp does not
    overwrite an existing video file by default, a later URL would return
    the file left behind by an earlier one.

    Args:
        url: Address of the video to download.

    Returns:
        Path of the downloaded file, named 'downloaded_video.mp4' inside the
        new temporary directory. The directory is not removed here.
    """
    # Local import: only this function needs tempfile.
    import tempfile

    download_dir = tempfile.mkdtemp(prefix="yt_download_")
    output_path = os.path.join(download_dir, 'downloaded_video.mp4')
    ydl_opts = {
        'format': 'mp4',
        # outtmpl is a template string, so a literal '%' must be doubled.
        'outtmpl': output_path.replace('%', '%%'),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return output_path
143
+
144
+ # Transcribing video and generating output
145
+ def transcribe_video(video_file, video_url, language, target_language, output_format):
146
+ if video_url:
147
+ video_file_path = download_youtube_video(video_url)
148
+ else:
149
+ video_file_path = video_file.name
150
 
151
+ result = model.transcribe(video_file_path, language=language)
152
+ video_name = os.path.splitext(video_file_path)[0]
 
 
 
 
153
  if target_language != "en":
154
  try:
155
  tokenizer, translation_model = load_translation_model(target_language)
 
158
  else:
159
  tokenizer, translation_model = None, None
160
 
 
161
  srt_file = f"{video_name}.srt"
162
  write_srt(result, srt_file, tokenizer, translation_model)
163
 
 
164
  if output_format == "SRT":
165
  return srt_file
166
  elif output_format == "Video with Hardsub":
167
  output_video = f"{video_name}_with_subtitles.mp4"
168
  try:
169
+ embed_hardsub_in_video(video_file_path, srt_file, output_video)
170
  return output_video
171
  except Exception as e:
172
  raise RuntimeError(f"Error embedding subtitles in video: {e}")
173
  elif output_format == "Word":
174
  word_file = f"{video_name}.docx"
175
+ write_word(result, word_file, tokenizer, translation_model, target_language)
176
  return word_file
177
  elif output_format == "PDF":
178
  pdf_file = f"{video_name}.pdf"
 
183
  write_ppt(result, ppt_file, tokenizer, translation_model)
184
  return ppt_file
185
 
186
# Gradio interface with YouTube URL
# Input components, in the positional order transcribe_video expects:
# uploaded file, YouTube URL, source language, subtitle language, output format.
interface_inputs = [
    # The file may be left empty when a YouTube link is supplied instead.
    gr.File(label="Upload Video File (or leave empty for YouTube link)"),
    gr.Textbox(
        label="YouTube Video URL (optional)",
        placeholder="https://www.youtube.com/watch?v=...",
    ),
    gr.Dropdown(
        label="Select Original Video Language",
        choices=["en", "es", "fr", "de", "it", "pt"],
        value="en",
    ),
    gr.Dropdown(
        label="Select Subtitle Translation Language",
        choices=["en", "fa", "es", "de", "fr", "it", "pt"],
        value="fa",
    ),
    gr.Radio(
        label="Choose Output Format",
        choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
        value="Video with Hardsub",
    ),
]

interface_description = (
    "This tool allows you to generate subtitles from a video file or YouTube link using Whisper, "
    "translate the subtitles into multiple languages using M2M100, and export them "
    "in various formats including SRT, hardcoded subtitles in video, Word, PDF, or PowerPoint."
)

iface = gr.Interface(
    fn=transcribe_video,
    inputs=interface_inputs,
    outputs=gr.File(label="Download File"),
    title="Video Subtitle Generator with Translation & Multi-Format Output (Supports YouTube)",
    description=interface_description,
    theme="compact",
    live=False,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
209
+
210
+