Scribe_Transcription_Demo_2

Running

App Files Files Community

Scribe_Transcription_Demo_2 / app.py

MrSimple07

Update app.py

db03031 verified 23 days ago

raw

history blame contribute delete

6.68 kB

	import os
	import gradio as gr
	import requests
	import json
	from moviepy import VideoFileClip
	import uuid
	import time

	# Default ElevenLabs API key, read once at import time from the environment.
	# Evaluates to None when the ELEVENLABS_API_KEY variable is not set.
	ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")

	def extract_audio(video_path, output_format="mp3"):
	    """Extract the audio track of a video into a new file in the working directory.

	    Args:
	        video_path: Path of the video file to read. Falsy values are rejected.
	        output_format: Extension used for the generated audio file name
	            (the UI in this file offers "mp3" and "wav").

	    Returns:
	        A ``(audio_path, status_message)`` tuple. ``audio_path`` is ``None``
	        when no video was given, the video has no audio track, or the
	        extraction raised; ``status_message`` then describes the problem.
	    """
	    if not video_path:
	        return None, "No video provided"

	    # A short random suffix keeps separate requests from overwriting each other.
	    output_path = f"extracted_audio_{uuid.uuid4().hex[:8]}.{output_format}"

	    video = None
	    try:
	        video = VideoFileClip(video_path)
	        if video.audio is None:
	            # Report this explicitly instead of surfacing an AttributeError text.
	            return None, "Error extracting audio: video has no audio track"
	        video.audio.write_audiofile(output_path, logger=None)
	        return output_path, "Audio extracted successfully"
	    except Exception as e:
	        # Boundary of the UI handler: turn any failure into a status message.
	        return None, f"Error extracting audio: {str(e)}"
	    finally:
	        # Release the clip on every path, not only after a successful write.
	        if video is not None:
	            try:
	                video.close()
	            except Exception:
	                # Closing is best-effort; the result has already been decided.
	                pass

	def save_transcription(transcription):
	    """Write the text of a transcription result to a uniquely named .txt file.

	    Args:
	        transcription: Result dict. If it has an "error" key, that value is
	            returned as the status and nothing is written. Otherwise its
	            "text" value is written to the file.

	    Returns:
	        A ``(file_name, status_message)`` tuple; ``file_name`` is ``None``
	        when there was nothing to save or the file could not be written.
	    """
	    # Guard against None / non-dict input instead of failing on the `in` test.
	    if not isinstance(transcription, dict):
	        return None, "No transcription result to save"
	    if "error" in transcription:
	        return None, transcription["error"]

	    transcript_filename = f"transcription_{uuid.uuid4().hex[:8]}.txt"

	    # A missing or None "text" falls back to the placeholder rather than
	    # making f.write() raise after the file has already been created.
	    text = transcription.get("text")
	    if text is None:
	        text = "No text found"

	    try:
	        with open(transcript_filename, "w", encoding="utf-8") as f:
	            f.write(str(text))
	    except (OSError, ValueError) as e:
	        # OSError: file system problems; ValueError: text that cannot be encoded.
	        return None, f"Error saving transcription: {str(e)}"

	    return transcript_filename, "Transcription saved as text file"

	def process_video_file(video_file, output_format, api_key, model_id):
	    """Run the extract -> transcribe -> save pipeline for an uploaded video.

	    Returns a 4-tuple in the order the UI outputs expect:
	    ``(audio_path, extraction_status, transcript_file, transcript_status)``.
	    """
	    if video_file is None:
	        return None, "Please upload a video file", None, "No video provided"

	    extracted_path, extraction_status = extract_audio(video_file, output_format)

	    # Continue only when extraction reported a path that really exists on disk.
	    extraction_ok = bool(extracted_path) and os.path.exists(extracted_path)
	    if not extraction_ok:
	        return None, extraction_status, None, "Audio extraction failed, cannot transcribe"

	    result = transcribe_audio(extracted_path, api_key, model_id)
	    saved_file, saved_status = save_transcription(result)
	    return extracted_path, extraction_status, saved_file, saved_status

	def process_video_url(video_url, output_format, api_key, model_id):
	    """Download a video from a URL, then extract, transcribe and save its audio.

	    Args:
	        video_url: URL of the video. ``None``, empty and whitespace-only
	            values are rejected.
	        output_format: Audio file extension passed to ``extract_audio``.
	        api_key: API key passed to ``transcribe_audio``.
	        model_id: Model identifier passed to ``transcribe_audio``.

	    Returns:
	        A 4-tuple ``(audio_path, extraction_status, transcript_file,
	        transcript_status)``; the path entries are ``None`` on failure.
	    """
	    # `not video_url` also covers None, on which .strip() would raise.
	    if not video_url or not video_url.strip():
	        return None, "Please enter a video URL", None, "No URL provided"

	    # NOTE(review): download_video_from_url is not defined in the visible part
	    # of this file; it is assumed to return (video_path, error) - confirm.
	    video_path, error = download_video_from_url(video_url.strip())
	    if error:
	        return None, error, None, "Video download failed, cannot transcribe"

	    audio_path, message = extract_audio(video_path, output_format)

	    # The downloaded video is only needed for extraction; removing it is
	    # best-effort, so file-system errors are ignored (and nothing broader).
	    if video_path and os.path.exists(video_path):
	        try:
	            os.remove(video_path)
	        except OSError:
	            pass

	    if audio_path and os.path.exists(audio_path):
	        transcription = transcribe_audio(audio_path, api_key, model_id)
	        transcript_file, transcript_message = save_transcription(transcription)
	        return audio_path, message, transcript_file, transcript_message

	    return None, message, None, "Audio extraction failed, cannot transcribe"


	def _probe_audio_duration(audio_path):
	    """Return the duration of an audio file in seconds, or 0 if it cannot be read."""
	    try:
	        # moviepy is already a dependency of this file (used for extraction).
	        from moviepy import AudioFileClip

	        clip = AudioFileClip(audio_path)
	        try:
	            return float(clip.duration or 0)
	        finally:
	            clip.close()
	    except Exception:
	        # Fall through to the optional librosa probe below.
	        pass

	    try:
	        import librosa

	        return float(librosa.get_duration(filename=audio_path))
	    except Exception:
	        # Duration is only used for statistics; 0 means "unknown".
	        return 0


	def transcribe_audio(audio_path, api_key, model_id="elevenlabs_1"):
	    """Send an audio file to the ElevenLabs speech-to-text endpoint.

	    Args:
	        audio_path: Path of the audio file to upload.
	        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
	        model_id: Model identifier sent as the ``model_id`` form field.
	            NOTE(review): the UI in this file only offers "scribe_v1"; the
	            default here looks stale - confirm before relying on it.

	    Returns:
	        On failure, a dict with a single "error" key holding a message.
	        On success, a dict with the transcribed "text", timing and size
	        statistics, the detected language fields and the raw API response.
	    """
	    import mimetypes

	    start_time = time.time()

	    if not api_key:
	        return {"error": "Please provide an API key"}
	    if not audio_path:
	        return {"error": "No audio file provided"}

	    url = "https://api.elevenlabs.io/v1/speech-to-text"
	    headers = {
	        "xi-api-key": api_key,
	        "Accept": "application/json",
	    }
	    # Derive the MIME type from the file name so wav output is not sent as
	    # audio/mpeg; keep audio/mpeg as the fallback for unknown extensions.
	    content_type = mimetypes.guess_type(audio_path)[0] or "audio/mpeg"

	    # Opening the file is kept apart from the request so that a missing or
	    # unreadable file yields an error result instead of an uncaught OSError.
	    try:
	        audio_file = open(audio_path, "rb")
	    except OSError as e:
	        return {"error": f"Could not read audio file: {e}"}

	    with audio_file:
	        try:
	            response = requests.post(
	                url,
	                headers=headers,
	                files={"file": (os.path.basename(audio_path), audio_file, content_type)},
	                data={"model_id": model_id},
	                # (connect, read) seconds: never hang forever on a stalled
	                # connection, but leave room for long transcriptions.
	                timeout=(10, 600),
	            )

	            # Map the most common API failures to friendlier messages.
	            if response.status_code == 401:
	                return {"error": "Unauthorized. Please check your API key."}
	            if response.status_code == 422:
	                return {"error": "Unprocessable Entity. Check file format or API usage."}

	            response.raise_for_status()
	        except requests.exceptions.RequestException as e:
	            return {"error": f"API request failed: {str(e)}"}

	    # Parsed outside the request handler so a malformed body is reported as
	    # a parse failure; the JSON errors raised by requests are ValueErrors.
	    try:
	        result = response.json()
	    except ValueError:
	        return {"error": "Failed to parse API response"}
	    if not isinstance(result, dict):
	        return {"error": "Failed to parse API response"}

	    processing_time = time.time() - start_time
	    file_size = os.path.getsize(audio_path) / (1024 * 1024)
	    audio_duration = _probe_audio_duration(audio_path)

	    # Ratios are only meaningful when both measurements are positive.
	    has_duration = audio_duration > 0
	    real_time_factor = processing_time / audio_duration if has_duration else None
	    processing_speed = (
	        audio_duration / processing_time
	        if has_duration and processing_time > 0
	        else None
	    )

	    return {
	        "service": "ElevenLabs",
	        "text": result.get("text", ""),
	        "processing_time": processing_time,
	        "file_size_mb": file_size,
	        "audio_duration": audio_duration,
	        "real_time_factor": real_time_factor,
	        "processing_speed": processing_speed,
	        "raw_response": result,
	        "language_code": result.get("language_code"),
	        "language_probability": result.get("language_probability"),
	    }

	# Gradio UI: shared credential/model controls on top, then one tab that runs
	# the upload -> audio extraction -> transcription pipeline.
	with gr.Blocks(title="Video to Audio to Transcription") as app:
	    gr.Markdown("# Video => Audio => Transcription")

	    # NOTE(review): the key read from the environment is used as the initial
	    # value of this textbox - confirm that exposing it this way is intended.
	    api_key = gr.Textbox(
	        label="ElevenLabs API Key",
	        placeholder="Enter your ElevenLabs API key",
	        type="password",
	        value=ELEVENLABS_API_KEY,
	    )

	    model_id = gr.Dropdown(
	        label="Transcription Model",
	        choices=["scribe_v1"],
	        value="scribe_v1",
	    )

	    with gr.Tabs():
	        with gr.TabItem("Upload Video"):
	            with gr.Row():
	                # Left column: inputs.
	                with gr.Column():
	                    video_input = gr.Video(label="Upload Video")
	                    format_choice_file = gr.Radio(
	                        choices=["mp3", "wav"],
	                        value="mp3",
	                        label="Output Format",
	                    )
	                    extract_button_file = gr.Button("Extract Audio & Transcribe")

	                # Right column: results, in the order process_video_file returns them.
	                with gr.Column():
	                    audio_output_file = gr.Audio(label="Extracted Audio", type="filepath")
	                    status_output_file = gr.Textbox(label="Audio Extraction Status")
	                    transcript_file_output = gr.File(label="Transcription Text File")
	                    transcript_status_output = gr.Textbox(label="Transcription Status")

	            extract_button_file.click(
	                fn=process_video_file,
	                inputs=[
	                    video_input,
	                    format_choice_file,
	                    api_key,
	                    model_id,
	                ],
	                outputs=[
	                    audio_output_file,
	                    status_output_file,
	                    transcript_file_output,
	                    transcript_status_output,
	                ],
	            )

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    app.launch()