Spaces:

ardha27
/

Youtube-AI-Summarizer

Running

File size: 9,470 Bytes

import gradio as gr
import yt_dlp
from dotenv import load_dotenv
import os
import google.generativeai as genai
import re
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import time
import spaces

load_dotenv()
default_gemini_api_key = os.getenv('gemini_api_key')

device = 0 if torch.cuda.is_available() else "cpu"

def load_pipeline(model_name):
    return pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=device,
    )

def configure_genai(api_key, model_variant):
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(model_variant)

def extract_youtube_id(youtube_url):
    # Extract the YouTube video ID from various URL formats
    youtube_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', youtube_url)
    if youtube_id_match:
        return youtube_id_match.group(1)
    return None

def download_youtube_audio(youtube_url, output_filename):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': output_filename,
    }
    
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])
        
        print(f"Downloaded audio from YouTube URL: {youtube_url}")
        return output_filename
    except Exception as e:
        print(f"Error downloading YouTube audio: {str(e)}")
        raise gr.Error(f"Failed to download YouTube audio: {str(e)}")

def summarize_transcription(transcription, model, gemini_prompt):
    try:
        prompt = f"{gemini_prompt}:\n\n{transcription}"
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error summarizing transcription: {str(e)}")
        return f"Error summarizing transcription: {str(e)}"
    
@spaces.GPU(duration=180)
def process_audio(audio_file, language, whisper_model):
    print("Starting transcription...")
    start_time = time.time()

    if device == 0:
        pipe = load_pipeline(whisper_model)
    else:
        pipe = load_pipeline("openai/whisper-tiny")

    with open(audio_file, "rb") as f:
        inputs = f.read()

    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

    if language:
        print(f"Using language: {language}")
        transcription = pipe(inputs, batch_size=8, generate_kwargs={"task": "transcribe", "language": language}, return_timestamps=True)["text"]
    else:
        print("No language defined, using default language")
        transcription = pipe(inputs, batch_size=8, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
    
    end_time = time.time()
    processing_time = round(end_time - start_time, 2)
    return transcription, processing_time

def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_prompt, gemini_model_variant, language, progress=gr.Progress()):
    try:
        progress(0, desc="Initializing")
        if not gemini_api_key:
            gemini_api_key = default_gemini_api_key
        model = configure_genai(gemini_api_key, gemini_model_variant)

        if youtube_url:
            progress(0.1, desc="Extracting YouTube ID")
            youtube_id = extract_youtube_id(youtube_url)
            if youtube_id:
                output_filename = f"{youtube_id}"
            else:
                output_filename = f"unknown"
            progress(0.2, desc="Downloading YouTube audio")
            audio_file = download_youtube_audio(youtube_url, output_filename)
            audio_file = f"{audio_file}.mp3"
            print(f"Audio file downloaded: {audio_file}")
        else:
            progress(0.2, desc="Reading audio file")
            audio_file = f"{audio_file.name}"
            print(f"Audio file read: {audio_file}")
        
        progress(0.4, desc="Starting transcription")
        transcription, processing_time = process_audio(audio_file, language, whisper_model)
        
        progress(0.6, desc="Cleaning up")
        # Delete the audio file after transcription
        if os.path.exists(f"{audio_file}.mp3"):
            os.remove(f"{audio_file}.mp3")
            print(f"Deleted audio file: {audio_file}.mp3")
        
        progress(0.7, desc="Summarizing transcription")
        # Summarize the transcription
        summary = summarize_transcription(transcription, model, gemini_prompt)
        
        progress(0.8, desc="Preparing output")
        # Prepare the transcription and summary message
        transcription_message = f"{transcription}" if transcription else ""
        
        summary_message = f"{summary}" if summary else ""
        
        progress(0.9, desc="Saving output to file")
        print("Saving transcription and summary to file...")
        # Save transcription and summary to separate text files
        transcription_file = "transcription_output.txt"
        summary_file = "summary_output.txt"
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(transcription_message)
        with open(summary_file, "w", encoding="utf-8") as f:
            f.write(summary_message)
        
        progress(1, desc="Complete")
        print("Transcription and summarization complete.")
        return transcription_message, summary_message, transcription_file, summary_file, processing_time
    except gr.Error as e:
        # Re-raise Gradio errors
        raise e
    except Exception as e:
        print(f"Error during transcription or summarization: {str(e)}")
        raise gr.Error(f"Transcription or summarization failed: {str(e)}")

def toggle_input(choice):
        if choice == "YouTube URL":
            return gr.update(visible=True), gr.update(visible=False, value=None)
        else:
            return gr.update(visible=False, value=None), gr.update(visible=True)
        
def toggle_language(choice):
    if choice == True:
        return gr.update(visible=True, value="id")
    else:
        return gr.update(visible=False, value="")

with gr.Blocks(theme='NoCrypt/miku') as demo:
    gr.Label('Youtube Summarizer WebUI created with ❤️ by Ryusui', show_label=False)
    
    with gr.Accordion("Input"):
        with gr.Column():
            input_type = gr.Radio(["YouTube URL", "Audio File"], label="Input Type", value="Audio File", info="Please consider using the audio file if you face any issues with the YouTube URL. Currently youtube is banning HuggingFace IP Addresses.", interactive=False)
            with gr.Row():
                youtube_url = gr.Textbox(label="YouTube URL", visible=False, info="Input the full URL of the YouTube video you want to transcribe and summarize. Example: https://www.youtube.com/watch?v=VIDEO_ID")
                audio_file = gr.File(label="Upload Audio File", visible=True, file_types=['.wav', '.flac', '.mp3'])
                whisper_model = gr.Dropdown(["openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium", "openai/whisper-large-v3"], label="Whisper Model", value="openai/whisper-large-v3", info="Tiny is the fastest model, but it's not the best quality. large-v3 is the best quality, but it's the slowest model.")
                gemini_model_variant = gr.Dropdown(["gemini-1.5-flash", "gemini-1.5-pro"], label="Gemini Model Variant", value="gemini-1.5-pro", info="Gemini-1.5-flash is the fastest model, but it's not the best quality. Gemini-1.5-pro is the best quality, but it's slower")
            define_language = gr.Checkbox(label="Define Language", value=False, info="If you want to define the language, check this box")
            language = gr.Dropdown(["id","en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"], label="Language", value=None, info="Select the language for transcription", visible=False)
            gemini_api_key = gr.Textbox(label="Gemini API Key (Optional)", placeholder="Enter your Gemini API key or leave blank to use default", info="If you facing error on transcription, please try to use your own API key")
            gemini_prompt = gr.Textbox(label="Gemini Prompt", value="Buatkan resume dari transkrip ini")
            transcribe_button = gr.Button("Transcribe and Summarize")

    with gr.Accordion("Output"):
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription Output")
            summary_output = gr.Textbox(label="Summary Output")
        transcription_file = gr.File(label="Download Transcription")
        summary_file = gr.File(label="Download Summary")
        processing_time = gr.Textbox(label="Transcription Processing Time (seconds)")
    
    input_type.change(fn=toggle_input, inputs=input_type, outputs=[youtube_url, audio_file])
    define_language.change(fn=toggle_language, inputs=define_language, outputs=[language])

    transcribe_button.click(
        fn=transcribe,
        inputs=[
            youtube_url,
            audio_file,
            whisper_model,
            gemini_api_key,
            gemini_prompt,
            gemini_model_variant,
            language,
        ],
        outputs=[transcription_output, summary_output, transcription_file, summary_file, processing_time]
    )

print("Launching Gradio interface...")
demo.launch()