Spaces:
Running
Running
File size: 9,470 Bytes
c70d81d b2fc243 c70d81d 9d8380b e502065 c70d81d ea370c7 40a56d1 c70d81d b72617b e502065 240e2c3 40a56d1 240e2c3 c70d81d ea370c7 c70d81d ea370c7 b72617b c70d81d b72617b c70d81d b72617b c70d81d c4f7d64 c70d81d 9d8380b a80039f c70d81d b72617b c70d81d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import gradio as gr
import yt_dlp
from dotenv import load_dotenv
import os
import google.generativeai as genai
import re
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import time
import spaces
# Load variables from a .env file into the process environment.
load_dotenv()

# Fallback Gemini key, used when the user leaves the API-key field blank.
default_gemini_api_key = os.environ.get('gemini_api_key')

# Device handed to the transformers pipeline: CUDA index 0 when a GPU is
# available, otherwise the string "cpu".
device = "cpu" if not torch.cuda.is_available() else 0
def load_pipeline(model_name):
    """Build a speech-recognition pipeline for the given checkpoint.

    Args:
        model_name: Model identifier forwarded to ``transformers.pipeline``
            as its ``model`` argument.

    Returns:
        The pipeline object, configured for 30-second chunks and placed on
        the module-level ``device``.
    """
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": model_name,
        "chunk_length_s": 30,
        "device": device,
    }
    return pipeline(**asr_options)
def configure_genai(api_key, model_variant):
    """Configure the Gemini client and return a model handle.

    Args:
        api_key: API key passed to ``genai.configure``.
        model_variant: Model name passed to ``genai.GenerativeModel``.

    Returns:
        The ``genai.GenerativeModel`` instance for ``model_variant``.
    """
    # configure() sets the key on the genai module itself, so it must run
    # before the model object is created.
    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel(model_variant)
    return gemini_model
def extract_youtube_id(youtube_url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognises IDs that follow ``v=`` (watch URLs) or a ``/`` (short links,
    ``/embed/``, ``/shorts/`` and similar).

    Args:
        youtube_url: The URL to inspect. ``None``, empty strings and
            non-string values are treated as "no ID".

    Returns:
        The video ID as a string, or ``None`` when no ID is found.
    """
    if not isinstance(youtube_url, str) or not youtube_url:
        return None

    # The negative lookahead requires the candidate to be *exactly* 11 ID
    # characters long. Without it, the first 11 characters of any longer
    # path segment (e.g. "/feed/subscriptions") were returned as an ID.
    youtube_id_match = re.search(
        r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])',
        youtube_url,
    )
    if youtube_id_match is None:
        return None
    return youtube_id_match.group(1)
def download_youtube_audio(youtube_url, output_filename):
    """Download the audio track of a YouTube video via yt-dlp.

    The download requests the best available audio and runs the
    ``FFmpegExtractAudio`` post-processor with MP3 / 192 settings.

    Args:
        youtube_url: URL of the video to download.
        output_filename: Value used as yt-dlp's ``outtmpl``.

    Returns:
        ``output_filename`` unchanged (the caller appends the extension).

    Raises:
        gr.Error: If yt-dlp raises any exception during the download.
    """
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [mp3_extractor],
        'outtmpl': output_filename,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as downloader:
            downloader.download([youtube_url])
        print(f"Downloaded audio from YouTube URL: {youtube_url}")
        return output_filename
    except Exception as err:
        # Surface the failure in the Gradio UI instead of a raw traceback.
        print(f"Error downloading YouTube audio: {err!s}")
        raise gr.Error(f"Failed to download YouTube audio: {err!s}")
def summarize_transcription(transcription, model, gemini_prompt):
    """Ask the generative model to summarise a transcription.

    Args:
        transcription: Text to be summarised.
        model: Object exposing ``generate_content(prompt)`` whose result
            has a ``text`` attribute.
        gemini_prompt: Instruction placed before the transcription,
            separated from it by a colon and a blank line.

    Returns:
        The model's response text, or an error message string if the call
        raises (the error is reported, not propagated).
    """
    try:
        response = model.generate_content(f"{gemini_prompt}:\n\n{transcription}")
        summary = response.text
    except Exception as err:
        # Best-effort: the caller still gets a string to display.
        failure = f"Error summarizing transcription: {str(err)}"
        print(failure)
        return failure
    return summary
@spaces.GPU(duration=180)
def process_audio(audio_file, language, whisper_model):
    """Transcribe an audio file with a Whisper pipeline.

    Args:
        audio_file: Path of the audio file to read.
        language: Language code passed to generation; when falsy, no
            language is passed.
        whisper_model: Checkpoint to load when ``device`` is 0. On any other
            device ``"openai/whisper-tiny"`` is loaded instead.

    Returns:
        A ``(transcription, processing_time)`` tuple: the recognised text
        and the elapsed wall-clock seconds rounded to two decimals.
    """
    print("Starting transcription...")
    started_at = time.time()

    # The requested checkpoint is only used when running on GPU index 0.
    model_name = whisper_model if device == 0 else "openai/whisper-tiny"
    pipe = load_pipeline(model_name)

    # Decode the file to a waveform at the rate the feature extractor expects.
    sampling_rate = pipe.feature_extractor.sampling_rate
    with open(audio_file, "rb") as audio_handle:
        raw_bytes = audio_handle.read()
    waveform = ffmpeg_read(raw_bytes, sampling_rate)
    inputs = {"array": waveform, "sampling_rate": sampling_rate}

    generate_kwargs = {"task": "transcribe"}
    if language:
        print(f"Using language: {language}")
        generate_kwargs["language"] = language
    else:
        print("No language defined, using default language")

    result = pipe(
        inputs,
        batch_size=8,
        generate_kwargs=generate_kwargs,
        return_timestamps=True,
    )
    transcription = result["text"]

    processing_time = round(time.time() - started_at, 2)
    return transcription, processing_time
def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_prompt, gemini_model_variant, language, progress=gr.Progress()):
    """Transcribe audio and summarise the result (Gradio click handler).

    The audio comes from ``youtube_url`` when it is given, otherwise from the
    uploaded ``audio_file``. The transcription is summarised with Gemini and
    both texts are written to ``transcription_output.txt`` and
    ``summary_output.txt`` in the working directory.

    Args:
        youtube_url: URL of a YouTube video; takes priority when truthy.
        audio_file: Uploaded file, either an object with a ``name``
            attribute holding its path or a plain path.
        whisper_model: Whisper checkpoint name forwarded to ``process_audio``.
        gemini_api_key: User-supplied key; when falsy, the module-level
            ``default_gemini_api_key`` is used.
        gemini_prompt: Instruction placed before the transcription.
        gemini_model_variant: Gemini model name.
        language: Language code for transcription, or a falsy value.
        progress: Gradio progress tracker.

    Returns:
        ``(transcription, summary, transcription_file, summary_file,
        processing_time)`` in the order of the click handler's outputs.

    Raises:
        gr.Error: If no input is provided or any step fails.
    """
    # Only a file this function downloaded is removed afterwards; uploaded
    # files are left in place.
    downloaded_audio = None
    try:
        progress(0, desc="Initializing")
        if not gemini_api_key:
            gemini_api_key = default_gemini_api_key
        model = configure_genai(gemini_api_key, gemini_model_variant)

        if youtube_url:
            progress(0.1, desc="Extracting YouTube ID")
            youtube_id = extract_youtube_id(youtube_url)
            output_filename = f"{youtube_id}" if youtube_id else "unknown"
            progress(0.2, desc="Downloading YouTube audio")
            audio_file = download_youtube_audio(youtube_url, output_filename)
            # The download helper returns the name without extension.
            audio_file = f"{audio_file}.mp3"
            downloaded_audio = audio_file
            print(f"Audio file downloaded: {audio_file}")
        elif audio_file is not None:
            progress(0.2, desc="Reading audio file")
            # Accept both file objects (with .name) and plain path strings.
            audio_file = str(getattr(audio_file, "name", audio_file))
            print(f"Audio file read: {audio_file}")
        else:
            raise gr.Error("Please provide a YouTube URL or upload an audio file.")

        progress(0.4, desc="Starting transcription")
        try:
            transcription, processing_time = process_audio(audio_file, language, whisper_model)
        finally:
            # Runs even when transcription fails, so downloads do not pile up.
            # The previous check appended ".mp3" a second time and therefore
            # never matched the real file.
            progress(0.6, desc="Cleaning up")
            if downloaded_audio and os.path.exists(downloaded_audio):
                os.remove(downloaded_audio)
                print(f"Deleted audio file: {downloaded_audio}")

        progress(0.7, desc="Summarizing transcription")
        summary = summarize_transcription(transcription, model, gemini_prompt)

        progress(0.8, desc="Preparing output")
        transcription_message = f"{transcription}" if transcription else ""
        summary_message = f"{summary}" if summary else ""

        progress(0.9, desc="Saving output to file")
        print("Saving transcription and summary to file...")
        # Written to separate files so each can be offered as a download.
        transcription_file = "transcription_output.txt"
        summary_file = "summary_output.txt"
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(transcription_message)
        with open(summary_file, "w", encoding="utf-8") as f:
            f.write(summary_message)

        progress(1, desc="Complete")
        print("Transcription and summarization complete.")
        return transcription_message, summary_message, transcription_file, summary_file, processing_time
    except gr.Error:
        # Already user-facing; re-raise with the original traceback intact.
        raise
    except Exception as e:
        print(f"Error during transcription or summarization: {str(e)}")
        raise gr.Error(f"Transcription or summarization failed: {str(e)}") from e
def toggle_input(choice):
    """Show the input widget matching the selected input type.

    Args:
        choice: Selected radio value; ``"YouTube URL"`` selects the URL box,
            anything else selects the file upload.

    Returns:
        Two ``gr.update`` objects, for the URL textbox and the file upload
        in that order. The widget being hidden also has its value cleared.
    """
    use_youtube = choice == "YouTube URL"
    if use_youtube:
        url_update = gr.update(visible=True)
        file_update = gr.update(visible=False, value=None)
    else:
        url_update = gr.update(visible=False, value=None)
        file_update = gr.update(visible=True)
    return url_update, file_update
def toggle_language(choice):
    """Show or hide the language dropdown.

    Args:
        choice: State of the "Define Language" checkbox.

    Returns:
        A ``gr.update`` that shows the dropdown preset to ``"id"`` when the
        box is checked, or hides it and resets its value to ``""`` otherwise.
    """
    # Truthiness test instead of comparing against True with ``==``.
    if choice:
        return gr.update(visible=True, value="id")
    return gr.update(visible=False, value="")
# Gradio UI definition: input controls, output widgets and event wiring.
# NOTE(review): the nesting of the layout context managers below was
# reconstructed from a flattened listing — confirm against the deployed layout.
with gr.Blocks(theme='NoCrypt/miku') as demo:
    gr.Label('Youtube Summarizer WebUI created with ❤️ by Ryusui', show_label=False)
    with gr.Accordion("Input"):
        with gr.Column():
            # The radio is created non-interactive with "Audio File" preselected.
            input_type = gr.Radio(["YouTube URL", "Audio File"], label="Input Type", value="Audio File", info="Please consider using the audio file if you face any issues with the YouTube URL. Currently youtube is banning HuggingFace IP Addresses.", interactive=False)
            with gr.Row():
                # Only one of these two is visible at a time (see toggle_input).
                youtube_url = gr.Textbox(label="YouTube URL", visible=False, info="Input the full URL of the YouTube video you want to transcribe and summarize. Example: https://www.youtube.com/watch?v=VIDEO_ID")
                audio_file = gr.File(label="Upload Audio File", visible=True, file_types=['.wav', '.flac', '.mp3'])
            whisper_model = gr.Dropdown(["openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium", "openai/whisper-large-v3"], label="Whisper Model", value="openai/whisper-large-v3", info="Tiny is the fastest model, but it's not the best quality. large-v3 is the best quality, but it's the slowest model.")
            gemini_model_variant = gr.Dropdown(["gemini-1.5-flash", "gemini-1.5-pro"], label="Gemini Model Variant", value="gemini-1.5-pro", info="Gemini-1.5-flash is the fastest model, but it's not the best quality. Gemini-1.5-pro is the best quality, but it's slower")
            define_language = gr.Checkbox(label="Define Language", value=False, info="If you want to define the language, check this box")
            # Hidden until the checkbox above is ticked (see toggle_language).
            language = gr.Dropdown(["id","en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"], label="Language", value=None, info="Select the language for transcription", visible=False)
            gemini_api_key = gr.Textbox(label="Gemini API Key (Optional)", placeholder="Enter your Gemini API key or leave blank to use default", info="If you facing error on transcription, please try to use your own API key")
            gemini_prompt = gr.Textbox(label="Gemini Prompt", value="Buatkan resume dari transkrip ini")
            transcribe_button = gr.Button("Transcribe and Summarize")
    with gr.Accordion("Output"):
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription Output")
            summary_output = gr.Textbox(label="Summary Output")
            transcription_file = gr.File(label="Download Transcription")
            summary_file = gr.File(label="Download Summary")
            processing_time = gr.Textbox(label="Transcription Processing Time (seconds)")
    # Event wiring: visibility toggles and the main transcription handler.
    input_type.change(fn=toggle_input, inputs=input_type, outputs=[youtube_url, audio_file])
    define_language.change(fn=toggle_language, inputs=define_language, outputs=[language])
    # The input order must match the positional parameters of transcribe();
    # the output order must match its return tuple.
    transcribe_button.click(
        fn=transcribe,
        inputs=[
            youtube_url,
            audio_file,
            whisper_model,
            gemini_api_key,
            gemini_prompt,
            gemini_model_variant,
            language,
        ],
        outputs=[transcription_output, summary_output, transcription_file, summary_file, processing_time]
    )
# Start the Gradio server for the interface defined above.
# (A stray trailing "|" after launch() made this line a syntax error.)
print("Launching Gradio interface...")
demo.launch()