|
import os |
|
import gradio as gr |
|
import requests |
|
import json |
|
from moviepy import VideoFileClip |
|
import uuid |
|
import time |
|
|
|
# Optional API key taken from the environment once at import time; it is
# None when the variable is unset and is used to pre-fill the key textbox.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
|
|
|
def extract_audio(video_path, output_format="mp3"):
    """Extract the audio track of a video into a new file in the working directory.

    Args:
        video_path: Path of the video file to read. Falsy values are rejected.
        output_format: Extension given to the generated audio file.

    Returns:
        A ``(audio_path, message)`` tuple. ``audio_path`` is ``None`` when
        nothing could be extracted; ``message`` is a human-readable status.
    """
    if not video_path:
        return None, "No video provided"

    # A random suffix keeps successive extractions from overwriting each other.
    output_path = f"extracted_audio_{uuid.uuid4().hex[:8]}.{output_format}"

    video = None
    try:
        video = VideoFileClip(video_path)
        if video.audio is None:
            # Report a silent video explicitly instead of letting the
            # attribute access below fail with an opaque NoneType message.
            return None, "Error extracting audio: video has no audio track"
        video.audio.write_audiofile(output_path, logger=None)
        return output_path, "Audio extracted successfully"
    except Exception as e:
        # Best-effort removal of a partially written file; the extraction
        # error is what gets reported to the caller.
        try:
            os.remove(output_path)
        except OSError:
            pass
        return None, f"Error extracting audio: {e}"
    finally:
        # Always release the clip, including when writing failed.
        if video is not None:
            video.close()
|
|
|
def save_transcription(transcription):
    """Persist a transcription result as a UTF-8 text file.

    Args:
        transcription: Mapping produced by the transcription step. When it
            contains an ``"error"`` key, that message is passed through and
            no file is written.

    Returns:
        A ``(file_name, message)`` tuple; ``file_name`` is ``None`` when the
        transcription carried an error or the file could not be written.
    """
    if "error" in transcription:
        return None, transcription["error"]

    target_name = "transcription_" + uuid.uuid4().hex[:8] + ".txt"

    try:
        with open(target_name, "w", encoding="utf-8") as handle:
            handle.write(transcription.get("text", "No text found"))
    except Exception as exc:
        return None, f"Error saving transcription: {exc}"
    else:
        return target_name, "Transcription saved as text file"
|
|
|
def process_video_file(video_file, output_format, api_key, model_id):
    """Run the extract-then-transcribe pipeline on an uploaded video.

    Args:
        video_file: Path of the uploaded video, or ``None`` when nothing
            was uploaded.
        output_format: Extension for the extracted audio file.
        api_key: API key forwarded to ``transcribe_audio``.
        model_id: Model identifier forwarded to ``transcribe_audio``.

    Returns:
        A 4-tuple ``(audio_path, extraction_status, transcript_file,
        transcription_status)``; the path entries are ``None`` for steps
        that did not produce a file.
    """
    if video_file is None:
        return None, "Please upload a video file", None, "No video provided"

    extracted_path, extraction_status = extract_audio(video_file, output_format)

    # Transcription only makes sense when the audio file really exists on disk.
    extraction_ok = bool(extracted_path) and os.path.exists(extracted_path)
    if not extraction_ok:
        return None, extraction_status, None, "Audio extraction failed, cannot transcribe"

    transcript_path, transcript_status = save_transcription(
        transcribe_audio(extracted_path, api_key, model_id)
    )
    return extracted_path, extraction_status, transcript_path, transcript_status
|
|
|
def process_video_url(video_url, output_format, api_key, model_id):
    """Download a video from a URL, extract its audio and transcribe it.

    Args:
        video_url: URL of the video. ``None``, empty and whitespace-only
            values are rejected.
        output_format: Extension for the extracted audio file.
        api_key: API key forwarded to ``transcribe_audio``.
        model_id: Model identifier forwarded to ``transcribe_audio``.

    Returns:
        A 4-tuple ``(audio_path, extraction_status, transcript_file,
        transcription_status)``; the path entries are ``None`` for steps
        that did not produce a file.
    """
    # `not video_url` also covers None, which has no .strip().
    if not video_url or not video_url.strip():
        return None, "Please enter a video URL", None, "No URL provided"

    # NOTE(review): download_video_from_url is not defined in the visible
    # part of this file -- confirm it exists, otherwise this raises NameError.
    video_path, error = download_video_from_url(video_url)
    if error:
        return None, error, None, "Video download failed, cannot transcribe"

    audio_path, message = extract_audio(video_path, output_format)

    # The downloaded video is only an intermediate; remove it once the audio
    # has been extracted (or extraction has failed).
    if video_path and os.path.exists(video_path):
        try:
            os.remove(video_path)
        except OSError:
            # Best-effort cleanup: a leftover download must not fail the request.
            pass

    if audio_path and os.path.exists(audio_path):
        transcription = transcribe_audio(audio_path, api_key, model_id)
        transcript_file, transcript_message = save_transcription(transcription)
        return audio_path, message, transcript_file, transcript_message

    return None, message, None, "Audio extraction failed, cannot transcribe"
|
|
|
|
|
def transcribe_audio(audio_path, api_key, model_id="elevenlabs_1"): |
|
|
|
start_time = time.time() |
|
|
|
if not api_key: |
|
return {"error": "Please provide an API key"} |
|
|
|
url = "https://api.elevenlabs.io/v1/speech-to-text" |
|
headers = { |
|
"xi-api-key": api_key, |
|
"Accept": "application/json" |
|
} |
|
|
|
try: |
|
with open(audio_path, "rb") as f: |
|
files = { |
|
"file": (os.path.basename(audio_path), f, "audio/mpeg"), |
|
"model_id": (None, model_id) |
|
} |
|
|
|
|
|
response = requests.post( |
|
url, |
|
headers=headers, |
|
files=files |
|
) |
|
|
|
|
|
if response.status_code == 401: |
|
return {"error": "Unauthorized. Please check your API key."} |
|
if response.status_code == 422: |
|
return {"error": "Unprocessable Entity. Check file format or API usage."} |
|
|
|
response.raise_for_status() |
|
result = response.json() |
|
except requests.exceptions.RequestException as e: |
|
return {"error": f"API request failed: {str(e)}"} |
|
except json.JSONDecodeError: |
|
return {"error": "Failed to parse API response"} |
|
|
|
end_time = time.time() |
|
processing_time = end_time - start_time |
|
|
|
file_size = os.path.getsize(audio_path) / (1024 * 1024) |
|
|
|
try: |
|
audio_data, sample_rate = sf.read(audio_path) |
|
audio_duration = len(audio_data) / sample_rate |
|
except: |
|
try: |
|
import librosa |
|
audio_duration = librosa.get_duration(filename=audio_path) |
|
except: |
|
audio_duration = 0 |
|
|
|
|
|
text = result.get('text', '') |
|
|
|
return { |
|
"service": "ElevenLabs", |
|
"text": text, |
|
"processing_time": processing_time, |
|
"file_size_mb": file_size, |
|
"audio_duration": audio_duration, |
|
"real_time_factor": processing_time / audio_duration if audio_duration > 0 else None, |
|
"processing_speed": audio_duration / processing_time if audio_duration > 0 else None, |
|
"raw_response": result, |
|
"language_code": result.get('language_code'), |
|
"language_probability": result.get('language_probability') |
|
} |
|
|
|
# Gradio UI: one "Upload Video" tab wired to process_video_file.
with gr.Blocks(title="Video to Audio to Transcription") as app:
    gr.Markdown("# Video => Audio => Transcription")

    # Credentials and model selection, passed to the handler on every click.
    api_key = gr.Textbox(
        label="ElevenLabs API Key",
        placeholder="Enter your ElevenLabs API key",
        type="password",
        value=ELEVENLABS_API_KEY,
    )
    model_id = gr.Dropdown(
        label="Transcription Model",
        choices=["scribe_v1"],
        value="scribe_v1",
    )

    with gr.Tabs():
        with gr.TabItem("Upload Video"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column():
                    video_input = gr.Video(label="Upload Video")
                    format_choice_file = gr.Radio(
                        choices=["mp3", "wav"],
                        value="mp3",
                        label="Output Format",
                    )
                    extract_button_file = gr.Button("Extract Audio & Transcribe")

                # Right column: results, in the order process_video_file returns them.
                with gr.Column():
                    audio_output_file = gr.Audio(label="Extracted Audio", type="filepath")
                    status_output_file = gr.Textbox(label="Audio Extraction Status")
                    transcript_file_output = gr.File(label="Transcription Text File")
                    transcript_status_output = gr.Textbox(label="Transcription Status")

            extract_button_file.click(
                fn=process_video_file,
                inputs=[video_input, format_choice_file, api_key, model_id],
                outputs=[
                    audio_output_file,
                    status_output_file,
                    transcript_file_output,
                    transcript_status_output,
                ],
            )


# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    app.launch()