Spaces:

AnalysisWithMSR
/

SEO

Sleeping

App Files Files Community

AnalysisWithMSR committed on Dec 15, 2024

Commit

6aaea8c

verified ·

1 Parent(s): 318ea39

Update app.py

Browse files

Files changed (1) hide show

app.py +208 -5

app.py CHANGED Viewed

@@ -1,8 +1,211 @@
 import os
-# Checking yt-dlp version
-os.system('yt-dlp --version')
-# Checking youtube-transcript-api (this needs to be done within Python)
-# Checking ffmpeg version
-os.system('ffmpeg -version')

+import googleapiclient.discovery
+import re
+import yt_dlp
+import whisper
+from pydub import AudioSegment
+import tempfile
+from transformers import pipeline
+from youtube_transcript_api import YouTubeTranscriptApi
+import torch
+import openai
+import json
+from urllib.parse import urlparse, parse_qs
 import os
+import gradio as gr
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised shapes are watch URLs (``?v=<id>``), ``youtu.be/<id>`` short
    links, and ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and
    ``/v/<id>`` paths on ``youtube.com`` or any of its subdomains. A URL
    without a scheme (``youtube.com/watch?v=...``) is accepted.

    Args:
        url: The URL text supplied by the user.

    Returns:
        The video ID string, or ``None`` when the input is empty, is not a
        YouTube URL, or carries no video ID.
    """
    if not isinstance(url, str) or not url.strip():
        print("Invalid YouTube URL.")
        return None

    candidate = url.strip()
    # urlparse() only fills in the host when a scheme is present.
    if not candidate.lower().startswith(("http://", "https://")):
        candidate = "https://" + candidate

    try:
        parsed_url = urlparse(candidate)
        host = (parsed_url.hostname or "").lower()
    except ValueError as e:
        print(f"Error parsing URL: {e}")
        return None

    segments = [part for part in parsed_url.path.split("/") if part]

    if host == "youtu.be":
        video_id = segments[0] if segments else None
    elif host == "youtube.com" or host.endswith(".youtube.com"):
        # Exact/suffix host match: a substring test would also accept
        # look-alike hosts such as "notyoutube.com".
        video_id = parse_qs(parsed_url.query).get("v", [None])[0]
        path_styles = ("shorts", "embed", "live", "v")
        if not video_id and len(segments) >= 2 and segments[0] in path_styles:
            video_id = segments[1]
    else:
        print("Invalid YouTube URL.")
        return None

    # Normalise an empty ID to None so callers get one "not found" value.
    return video_id or None
def _iso8601_duration_to_minutes(duration):
    """Convert a duration such as ``PT1H2M3S`` or ``P1DT2H`` to minutes.

    Returns ``None`` when the text does not match the expected layout.
    """
    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration,
    )
    if match is None:
        return None
    days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return days * 24 * 60 + hours * 60 + minutes + seconds / 60


def get_video_duration(video_id, api_key):
    """Fetch a video's duration, in minutes, from the YouTube Data API.

    Args:
        video_id: ID of the video to look up.
        api_key: Developer key passed to the API client.

    Returns:
        The duration in minutes as a float, or ``None`` when the API returns
        no item for the ID, the duration text cannot be parsed, or the
        request fails (the reason is printed).
    """
    try:
        youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
        response = youtube.videos().list(part="contentDetails", id=video_id).execute()

        items = response.get("items")
        if not items:
            print("No video details found.")
            return None

        duration = items[0]["contentDetails"]["duration"]
        total_minutes = _iso8601_duration_to_minutes(duration)
        if total_minutes is None:
            # Report a malformed value explicitly instead of failing on a
            # None regex match.
            print(f"Error fetching video duration: unrecognized duration format {duration!r}")
            return None
        return total_minutes
    except Exception as e:
        # Boundary with a remote service: report and signal failure with None.
        print(f"Error fetching video duration: {e}")
        return None
def download_and_transcribe_with_whisper(youtube_url):
    """Download a video's audio with yt-dlp and transcribe it with Whisper.

    The audio is written to a temporary directory that is removed when the
    function returns.

    Args:
        youtube_url: URL of the video to transcribe.

    Returns:
        The transcript text, or ``None`` when the download, audio extraction
        or transcription fails (the reason is printed).
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                'format': 'bestaudio/best',
                # Let yt-dlp pick the real extension instead of forcing a
                # ".mp3" name onto whatever container was downloaded.
                'outtmpl': os.path.join(temp_dir, "audio.%(ext)s"),
                # A watch URL carrying "&list=" must not pull the playlist.
                'noplaylist': True,
                # Audio extraction is configured through a post-processor in
                # the Python API.
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                }],
            }
            # Download audio using yt-dlp (converted to wav by the post-processor)
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])

            wav_file = os.path.join(temp_dir, "audio.wav")
            if not os.path.exists(wav_file):
                print("Error during transcription: audio extraction produced no file.")
                return None

            # Run Whisper transcription
            model = whisper.load_model("large")
            result = model.transcribe(wav_file)
            return result['text']
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None
def get_transcript_from_youtube_api(video_id, video_length):
    """Return caption text for a video via ``YouTubeTranscriptApi``.

    The first manually created transcript in the listing is preferred. When
    there is none and ``video_length`` exceeds 15 (minutes, as supplied by
    the caller), the generated English transcript is used instead.

    Returns:
        The segment texts joined by single spaces, or ``None`` when nothing
        suitable is found or any step raises (the reason is printed).
    """
    def join_segments(entries):
        # Flatten the fetched segments into a single line of text.
        return " ".join(entry['text'] for entry in entries)

    try:
        listing = YouTubeTranscriptApi.list_transcripts(video_id)

        manual = next((item for item in listing if not item.is_generated), None)
        if manual is not None:
            return join_segments(manual.fetch())

        if video_length > 15:
            generated = listing.find_generated_transcript(['en'])
            if generated:
                return join_segments(generated.fetch())

        print("Manual transcript not available, and video is too short for auto-transcript.")
        return None
    except Exception as e:
        print(f"Error fetching transcript: {e}")
        return None
def get_transcript(youtube_url, api_key):
    """Obtain a transcript for a YouTube URL.

    Captions from the transcript API are tried first; if none come back the
    audio is downloaded and transcribed with Whisper.

    Returns:
        The transcript text, or ``None`` when the URL yields no video ID,
        the duration lookup fails, or the Whisper step returns ``None``.
    """
    video_id = extract_video_id(youtube_url)
    if not video_id:
        print("Invalid or unsupported YouTube URL.")
        return None

    video_length = get_video_duration(video_id, api_key)
    if video_length is None:
        print("Error fetching video duration.")
        return None

    print(f"Video length: {video_length:.2f} minutes.")
    captions = get_transcript_from_youtube_api(video_id, video_length)
    if not captions:
        # No usable captions: fall back to speech-to-text.
        print("Using Whisper for transcription.")
        captions = download_and_transcribe_with_whisper(youtube_url)
    return captions
def _get_summarizer():
    """Build the BART summarization pipeline once and reuse it afterwards."""
    if not hasattr(_get_summarizer, "_instance"):
        _get_summarizer._instance = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=0 if torch.cuda.is_available() else -1,
        )
    return _get_summarizer._instance


def summarize_text_huggingface(text):
    """Summarize text with the ``facebook/bart-large-cnn`` pipeline.

    The text is cut into overlapping windows of 1024 characters (100
    characters of overlap), each window is summarized separately, and the
    partial summaries are joined with single spaces.

    Args:
        text: The text to summarize.

    Returns:
        The combined summary, or ``""`` when ``text`` is empty or blank.
    """
    if not text or not text.strip():
        # Nothing to summarize; skip loading the model.
        return ""

    max_input_length = 1024
    chunk_overlap = 100
    stride = max_input_length - chunk_overlap

    text_chunks = []
    for start in range(0, len(text), stride):
        text_chunks.append(text[start:start + max_input_length])
        # Stop once a window reaches the end of the text. The next window
        # would lie entirely inside this one and be summarized twice.
        if start + max_input_length >= len(text):
            break

    summarizer = _get_summarizer()
    summaries = [
        # truncation=True guards against a window that exceeds the model's
        # token limit even though it is within the character limit.
        summarizer(chunk, max_length=100, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in text_chunks
    ]
    return " ".join(summaries)
def _parse_json_reply(reply):
    """Parse the model reply as JSON, tolerating code fences or extra prose."""
    if not isinstance(reply, str):
        raise ValueError("Model response contained no text.")
    try:
        return json.loads(reply)
    except ValueError:
        pass
    # Fall back to the outermost {...} span, which drops Markdown fences and
    # any sentences placed around the JSON object.
    start = reply.find("{")
    end = reply.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("No JSON object found in model response.")
    return json.loads(reply[start:end + 1])


def generate_optimized_content(api_key, summarized_transcript):
    """Ask an OpenAI chat model for SEO metadata based on a summary.

    The prompt requests keywords, a title, a description and tags in JSON
    form; the reply is parsed and returned.

    Args:
        api_key: OpenAI API key.
        summarized_transcript: Summary text embedded in the prompt.

    Returns:
        The parsed JSON reply, or ``None`` when the request or the parsing
        fails (the reason is printed).
    """
    prompt = f"""
    Analyze the following summarized YouTube video transcript and:
    1. Extract the top 10 keywords.
    2. Generate an optimized title (less than 65 characters).
    3. Create an engaging description.
    4. Generate related tags for the video.
    Summarized Transcript:
    {summarized_transcript}
    Provide the results in the following JSON format:
    {{
        "keywords": ["keyword1", "keyword2", ..., "keyword10"],
        "title": "Generated Title",
        "description": "Generated Description",
        "tags": ["tag1", "tag2", ..., "tag10"]
    }}
    """
    messages = [{"role": "system", "content": "You are an SEO expert."},
                {"role": "user", "content": prompt}]
    try:
        if hasattr(openai, "OpenAI"):
            # openai >= 1.0: client object with attribute-style responses.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=messages,
            )
            response_content = response.choices[0].message.content
        else:
            # openai < 1.0: module-level key with dict-style responses.
            openai.api_key = api_key
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
            )
            response_content = response['choices'][0]['message']['content']
        return _parse_json_reply(response_content)
    except Exception as e:
        print(f"Error generating content: {e}")
        return None
def process_youtube_url(youtube_url, youtube_api_key, openai_api_key):
    """Run the pipeline: transcript, then summary, then SEO content.

    Returns:
        The generated content as an indented JSON string, or a
        human-readable error message when a stage produces nothing.
    """
    video_text = get_transcript(youtube_url, youtube_api_key)
    if video_text:
        condensed = summarize_text_huggingface(video_text)
        seo_payload = generate_optimized_content(openai_api_key, condensed)
        return (
            json.dumps(seo_payload, indent=4)
            if seo_payload
            else "Error generating optimized content."
        )
    return "Could not fetch the transcript. Please try another video."
+# Gradio Interface
def gradio_interface(youtube_url, youtube_api_key, openai_api_key):
    """Gradio callback: forward the three inputs to ``process_youtube_url``."""
    result_text = process_youtube_url(
        youtube_url=youtube_url,
        youtube_api_key=youtube_api_key,
        openai_api_key=openai_api_key,
    )
    return result_text
# Creating the Gradio interface.
# Live mode is deliberately not enabled: it re-runs the callback on every
# input change, and each run here performs API calls, a possible audio
# download and model inference. The pipeline runs when the user submits.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="YouTube URL"),
        gr.Textbox(label="YouTube API Key", type="password"),
        gr.Textbox(label="OpenAI API Key", type="password"),
    ],
    outputs=gr.Textbox(label="Optimized Content"),
)

if __name__ == "__main__":
    iface.launch()