import gradio as gr
import whisper
import os
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from docx import Document  # For Word output
from fpdf import FPDF  # For PDF output
from pptx import Presentation  # For PowerPoint output
import subprocess  # To run ffmpeg for embedding subtitles
import shlex  # For safer command-line argument handling
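# Assumed dependencies (usual PyPI package names, not pinned or verified here):
#   pip install gradio openai-whisper transformers torch python-docx fpdf python-pptx
# The ffmpeg binary must also be installed and on PATH for the hardsub output option.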
# Load the Whisper model
model = whisper.load_model("tiny")  # Smaller model for faster transcription
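# Other Whisper checkpoints ("base", "small", "medium", "large") trade speed for accuracy.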
# Load the M2M100 translation model for the selected subtitle language
def load_translation_model(target_language):
    lang_codes = {
        "fa": "fa",  # Persian (Farsi)
        "es": "es",  # Spanish
        "fr": "fr",  # French
    }
    target_lang_code = lang_codes.get(target_language)
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")

    # Load M2M100 model and tokenizer
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer.src_lang = "en"
    tokenizer.tgt_lang = target_lang_code
    return tokenizer, translation_model
def translate_text(text, tokenizer, model):
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang))
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}")
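# Minimal usage sketch (assumes the facebook/m2m100_418M weights can be fetched on first use):
#   tokenizer, m2m = load_translation_model("fa")
#   translate_text("Hello, world!", tokenizer, m2m)  # -> Persian translation of the input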
# Helper function to format timestamps in SRT format (hh:mm:ss,ms)
def format_timestamp(seconds):
    milliseconds = int((seconds % 1) * 1000)
    seconds = int(seconds)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
# Write the transcription (optionally translated) as an SRT subtitle file
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    # utf-8 is required for non-Latin subtitle languages such as Persian
    with open(output_file, "w", encoding="utf-8") as f:
        for i, segment in enumerate(transcription['segments']):
            start = segment['start']
            end = segment['end']
            text = segment['text']
            if translation_model:
                text = translate_text(text, tokenizer, translation_model)
            start_time = format_timestamp(start)
            end_time = format_timestamp(end)
            f.write(f"{i + 1}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Uses ffmpeg to burn subtitles into the video (hardsub)."""
    command = f'ffmpeg -i "{video_file}" -vf "subtitles=\'{srt_file}\'" -c:v libx264 -crf 23 -preset medium "{output_video}"'
    try:
        print(f"Running command: {command}")  # Debug statement
        process = subprocess.run(shlex.split(command), capture_output=True, text=True, timeout=300)
        print(f"ffmpeg output: {process.stdout}")  # Debug statement
        if process.returncode != 0:
            raise RuntimeError(f"ffmpeg error: {process.stderr}")
    except subprocess.TimeoutExpired:
        raise RuntimeError("ffmpeg process timed out.")
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}")
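# Sketch of the same call with an argument list instead of a shell-style string, which
# sidesteps shlex quoting for unusual filenames (same flags, not verified here):
#   subprocess.run(["ffmpeg", "-i", video_file, "-vf", f"subtitles={srt_file}",
#                   "-c:v", "libx264", "-crf", "23", "-preset", "medium", output_video],
#                  capture_output=True, text=True, timeout=300)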
def write_word(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a Word document from the transcription without timestamps."""
    doc = Document()
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        doc.add_paragraph(f"{i + 1}. {text.strip()}")  # No timestamps
    doc.save(output_file)
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a PDF document from the transcription without timestamps."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # Note: FPDF's built-in Arial font only covers Latin-1; non-Latin output
    # (e.g. Persian) would need a Unicode TTF registered via pdf.add_font().
    pdf.set_font("Arial", size=12)
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        pdf.multi_cell(0, 10, f"{i + 1}. {text.strip()}")  # No timestamps
    pdf.output(output_file)
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Creates a PowerPoint presentation from the transcription without timestamps."""
    ppt = Presentation()
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        slide = ppt.slides.add_slide(ppt.slide_layouts[5])  # Title-only layout
        title = slide.shapes.title
        title.text = f"{i + 1}. {text.strip()}"  # No timestamps
    ppt.save(output_file)
def transcribe_video(video_file, language, target_language, output_format):
    # Transcribe the video with Whisper
    result = model.transcribe(video_file.name, language=language)
    video_name = os.path.splitext(video_file.name)[0]

    # Load the translation model for the selected subtitle language
    if target_language != "en":
        try:
            tokenizer, translation_model = load_translation_model(target_language)
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}")
    else:
        tokenizer, translation_model = None, None

    # Save the SRT file
    srt_file = f"{video_name}.srt"
    write_srt(result, srt_file, tokenizer, translation_model)

    # Produce the output selected by the user
    if output_format == "SRT":
        return srt_file
    elif output_format == "Video with Hardsub":
        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_file.name, srt_file, output_video)
            return output_video
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}")
    elif output_format == "Word":
        word_file = f"{video_name}.docx"
        write_word(result, word_file, tokenizer, translation_model)
        return word_file
    elif output_format == "PDF":
        pdf_file = f"{video_name}.pdf"
        write_pdf(result, pdf_file, tokenizer, translation_model)
        return pdf_file
    elif output_format == "PowerPoint":
        ppt_file = f"{video_name}.pptx"
        write_ppt(result, ppt_file, tokenizer, translation_model)
        return ppt_file
# Gradio interface
iface = gr.Interface(
    fn=transcribe_video,
    inputs=[
        gr.File(label="Upload Video"),
        gr.Dropdown(label="Select Video Language", choices=["en", "es", "fr", "de", "it", "pt"], value="en"),
        gr.Dropdown(label="Select Subtitle Language", choices=["en", "fa", "es", "fr"], value="fa"),
        gr.Radio(label="Output Format", choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"], value="Video with Hardsub")
    ],
    outputs=gr.File(label="Download Subtitles, Video, or Document"),
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description="Upload a video to generate SRT subtitles, a hardsubbed video, or a Word, PDF, or PowerPoint document. Transcription uses Whisper; subtitle translation uses M2M100."
)

if __name__ == "__main__":
    iface.launch()