AnalysisWithMSR committed
Commit 9823a49 · verified · 1 Parent(s): adbbdc0

Update app.py

Files changed (1)
  1. app.py +87 -57
app.py CHANGED
@@ -1,21 +1,25 @@
-import os
-import gradio as gr
+import googleapiclient.discovery
+import re
 import yt_dlp
 import whisper
 from pydub import AudioSegment
+import tempfile
 from transformers import pipeline
 from youtube_transcript_api import YouTubeTranscriptApi
-from urllib.parse import urlparse, parse_qs
+import torch
 import openai
 import json
-import tempfile
-import re
-import torch
-from googleapiclient.discovery import build  # Add the import for Google API client
+from urllib.parse import urlparse, parse_qs
+import os
+import gradio as gr

+# Ensure your API keys are set as environment variables
+youtube_api_key = os.getenv("YOUTUBE_API_KEY")
+openai_api_key = os.getenv("OPENAI_API_KEY")
+openai.api_key = openai_api_key

-# Function to extract YouTube video ID
 def extract_video_id(url):
+    """Extracts the video ID from a YouTube URL."""
     try:
         parsed_url = urlparse(url)
         if "youtube.com" in parsed_url.netloc:
@@ -23,15 +27,15 @@ def extract_video_id(url):
             return query_params.get('v', [None])[0]
         elif "youtu.be" in parsed_url.netloc:
             return parsed_url.path.strip("/")
-        return None
-    except Exception:
+        else:
+            return None
+    except Exception as e:
         return None

-
-# Function to get video duration
 def get_video_duration(video_id, api_key):
+    """Fetches the video duration in minutes."""
     try:
-        youtube = build("youtube", "v3", developerKey=api_key)
+        youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
         request = youtube.videos().list(part="contentDetails", id=video_id)
         response = request.execute()
         if response["items"]:
@@ -41,39 +45,79 @@ def get_video_duration(video_id, api_key):
             minutes = int(match.group(2)) if match.group(2) else 0
             seconds = int(match.group(3)) if match.group(3) else 0
             return hours * 60 + minutes + seconds / 60
+        else:
+            return None
+    except Exception as e:
         return None
-    except Exception:
-        return None
-

-# Download and transcribe with Whisper
 def download_and_transcribe_with_whisper(youtube_url):
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_audio_file = os.path.join(temp_dir, "audio.mp3")
-
+
             ydl_opts = {
                 'format': 'bestaudio/best',
                 'outtmpl': temp_audio_file,
                 'extractaudio': True,
+                'audioquality': 1,
             }

+            # Download audio using yt-dlp
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                 ydl.download([youtube_url])

+            # Convert to wav for Whisper
             audio = AudioSegment.from_file(temp_audio_file)
             wav_file = os.path.join(temp_dir, "audio.wav")
             audio.export(wav_file, format="wav")

+            # Run Whisper transcription
             model = whisper.load_model("large")
             result = model.transcribe(wav_file)
-            return result['text']
-    except Exception:
+            transcript = result['text']
+            return transcript
+
+    except Exception as e:
         return None

+def get_transcript_from_youtube_api(video_id, video_length):
+    """Fetches transcript using YouTube API if available."""
+    try:
+        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+
+        for transcript in transcript_list:
+            if not transcript.is_generated:
+                segments = transcript.fetch()
+                return " ".join(segment['text'] for segment in segments)
+
+        if video_length > 15:
+            auto_transcript = transcript_list.find_generated_transcript(['en'])
+            if auto_transcript:
+                segments = auto_transcript.fetch()
+                return " ".join(segment['text'] for segment in segments)
+
+        return None
+
+    except Exception as e:
+        return None
+
+def get_transcript(youtube_url):
+    """Gets transcript from YouTube API or Whisper if unavailable."""
+    video_id = extract_video_id(youtube_url)
+    if not video_id:
+        return "Invalid or unsupported YouTube URL."
+
+    video_length = get_video_duration(video_id, youtube_api_key)
+    if video_length is not None:
+        transcript = get_transcript_from_youtube_api(video_id, video_length)
+        if transcript:
+            return transcript
+        return download_and_transcribe_with_whisper(youtube_url)
+    else:
+        return "Error fetching video duration."

-# Function to summarize using Hugging Face
 def summarize_text_huggingface(text):
+    """Summarizes text using a Hugging Face summarization model."""
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
     max_input_length = 1024
     chunk_overlap = 100
@@ -87,10 +131,7 @@ def summarize_text_huggingface(text):
     ]
     return " ".join(summaries)

-
-# Function to generate optimized content with OpenAI
-def generate_optimized_content(api_key, summarized_transcript):
-    openai.api_key = api_key
+def generate_optimized_content(summarized_transcript):
     prompt = f"""
     Analyze the following summarized YouTube video transcript and:
     1. Extract the top 10 keywords.
@@ -109,6 +150,7 @@ def generate_optimized_content(api_key, summarized_transcript):
         "tags": ["tag1", "tag2", ..., "tag10"]
     }}
     """
+
     try:
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
@@ -118,39 +160,27 @@ def generate_optimized_content(api_key, summarized_transcript):
             ]
         )
         response_content = response['choices'][0]['message']['content']
-        return json.loads(response_content)
-    except Exception:
-        return None
-
+        content = json.loads(response_content)
+        return content
+    except Exception as e:
+        return {"error": str(e)}

-# Main Gradio function
-def process_video(youtube_url, youtube_api_key, openai_api_key):
-    video_id = extract_video_id(youtube_url)
-    if not video_id:
-        return "Invalid YouTube URL.", "", ""
-
-    video_length = get_video_duration(video_id, youtube_api_key)
-    if not video_length:
-        return "Error fetching video duration.", "", ""
-
-    transcript = download_and_transcribe_with_whisper(youtube_url)
+def process_video(youtube_url):
+    transcript = get_transcript(youtube_url)
     if not transcript:
-        return "Error fetching transcript.", "", ""
+        return "Could not fetch the transcript. Please try another video."

     summary = summarize_text_huggingface(transcript)
-    optimized_content = generate_optimized_content(openai_api_key, summary)
-
-    return summary, json.dumps(optimized_content, indent=4), transcript
-
-
-# Gradio Interface
-youtube_api_key = os.getenv("YOUTUBE_API_KEY")
-openai_api_key = os.getenv("OPENAI_API_KEY")
-
-gr.Interface(
-    fn=lambda youtube_url: process_video(youtube_url, youtube_api_key, openai_api_key),
-    inputs="text",
-    outputs=["text", "text", "text"],
-    title="YouTube Transcript Summarizer",
-    description="Enter a YouTube URL to extract, summarize, and optimize content.",
-).launch()
+    optimized_content = generate_optimized_content(summary)
+    return optimized_content
+
+iface = gr.Interface(
+    fn=process_video,
+    inputs=gr.Textbox(label="Enter a YouTube video URL"),
+    outputs=gr.JSON(label="Optimized Content"),
+    title="YouTube Video Optimization Tool",
+    description="Enter a YouTube URL to generate optimized titles, descriptions, and tags."
+)
+
+if __name__ == "__main__":
+    iface.launch()
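
For anyone who wants to exercise the refactored flow outside of the Gradio UI, a minimal sketch follows; it is not part of the commit. It assumes YOUTUBE_API_KEY and OPENAI_API_KEY are already exported in the environment before the import, since app.py reads them at module import time, and the video URL is only a placeholder. Because launching is now guarded by the __main__ check, importing app no longer starts the interface.

# smoke_test.py -- hypothetical helper, not part of this commit.
# Assumes YOUTUBE_API_KEY and OPENAI_API_KEY are set in the environment
# before this import, since app.py reads them at import time.
from app import process_video

# Placeholder URL; substitute any public video.
result = process_video("https://www.youtube.com/watch?v=VIDEO_ID")

# process_video returns either an error string or the dict produced by
# generate_optimized_content ({"error": ...} if the OpenAI call fails).
print(result)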