AnalysisWithMSR committed · verified
Commit c277c70 · 1 Parent(s): 0e34a60

Create app.py

Files changed (1)
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
import os
import gradio as gr
import yt_dlp
import whisper
from pydub import AudioSegment
from transformers import pipeline
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
import openai
import json
import tempfile
import re
import torch
from googleapiclient.discovery import build  # Google API client, used for the YouTube Data API


# Function to extract YouTube video ID
def extract_video_id(url):
    try:
        parsed_url = urlparse(url)
        if "youtube.com" in parsed_url.netloc:
            query_params = parse_qs(parsed_url.query)
            return query_params.get('v', [None])[0]
        elif "youtu.be" in parsed_url.netloc:
            return parsed_url.path.strip("/")
        return None
    except Exception:
        return None

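# Illustrative sanity check (VIDEO_ID is a placeholder, not taken from the original code):
#   extract_video_id("https://www.youtube.com/watch?v=VIDEO_ID")  -> "VIDEO_ID"
#   extract_video_id("https://youtu.be/VIDEO_ID")                 -> "VIDEO_ID"
#   extract_video_id("https://example.com/watch?v=VIDEO_ID")      -> None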

# Function to get the video duration (in minutes) via the YouTube Data API
def get_video_duration(video_id, api_key):
    try:
        youtube = build("youtube", "v3", developerKey=api_key)
        request = youtube.videos().list(part="contentDetails", id=video_id)
        response = request.execute()
        if response["items"]:
            # contentDetails.duration is an ISO 8601 string such as "PT1H2M3S"
            duration = response["items"][0]["contentDetails"]["duration"]
            match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration)
            hours = int(match.group(1)) if match.group(1) else 0
            minutes = int(match.group(2)) if match.group(2) else 0
            seconds = int(match.group(3)) if match.group(3) else 0
            return hours * 60 + minutes + seconds / 60
        return None
    except Exception:
        return None

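# Worked example (illustrative): a duration of "PT1H2M30S" parses to hours=1, minutes=2,
# seconds=30, so the function returns 1 * 60 + 2 + 30 / 60 = 62.5 minutes.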

# Download the audio with yt-dlp and transcribe it with Whisper
def download_and_transcribe_with_whisper(youtube_url):
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_audio_file = os.path.join(temp_dir, "audio.mp3")

            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': temp_audio_file,
                'extractaudio': True,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])

            # Convert the download to WAV so Whisper receives a predictable input format
            audio = AudioSegment.from_file(temp_audio_file)
            wav_file = os.path.join(temp_dir, "audio.wav")
            audio.export(wav_file, format="wav")

            model = whisper.load_model("large")
            result = model.transcribe(wav_file)
            return result['text']
    except Exception:
        return None

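# Note: with 'format': 'bestaudio/best' the file saved as audio.mp3 may actually be another
# container (e.g. webm/m4a); pydub/ffmpeg generally still decodes it. A sketch of forcing a
# real MP3 via yt-dlp's FFmpegExtractAudio postprocessor (not part of the original options):
#   ydl_opts['postprocessors'] = [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}]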

# Summarize the transcript with a Hugging Face summarization pipeline
def summarize_text_huggingface(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
    max_input_length = 1024
    chunk_overlap = 100
    # Split the text into overlapping character chunks so each piece fits the model input
    text_chunks = [
        text[i:i + max_input_length]
        for i in range(0, len(text), max_input_length - chunk_overlap)
    ]
    summaries = [
        summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in text_chunks
    ]
    return " ".join(summaries)

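# Chunking example (illustrative): with max_input_length=1024 and chunk_overlap=100, chunks
# start at characters 0, 924, 1848, ... so consecutive chunks share 100 characters of context.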

# Function to generate optimized content with OpenAI
def generate_optimized_content(api_key, summarized_transcript):
    openai.api_key = api_key
    prompt = f"""
    Analyze the following summarized YouTube video transcript and:
    1. Extract the top 10 keywords.
    2. Generate an optimized title (less than 65 characters).
    3. Create an engaging description.
    4. Generate related tags for the video.

    Summarized Transcript:
    {summarized_transcript}

    Provide the results in the following JSON format:
    {{
        "keywords": ["keyword1", "keyword2", ..., "keyword10"],
        "title": "Generated Title",
        "description": "Generated Description",
        "tags": ["tag1", "tag2", ..., "tag10"]
    }}
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an SEO expert."},
                {"role": "user", "content": prompt}
            ]
        )
        response_content = response['choices'][0]['message']['content']
        return json.loads(response_content)
    except Exception:
        return None

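# Note: openai.ChatCompletion.create targets the pre-1.0 openai-python SDK; with openai>=1.0
# the equivalent call is OpenAI().chat.completions.create(...).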

# Main Gradio function: validate the URL, then transcribe, summarize, and optimize
def process_video(youtube_url, youtube_api_key, openai_api_key):
    video_id = extract_video_id(youtube_url)
    if not video_id:
        return "Invalid YouTube URL.", "", ""

    # The duration lookup doubles as a check that the video and API key are valid;
    # the value itself is not used further below.
    video_length = get_video_duration(video_id, youtube_api_key)
    if not video_length:
        return "Error fetching video duration.", "", ""

    transcript = download_and_transcribe_with_whisper(youtube_url)
    if not transcript:
        return "Error fetching transcript.", "", ""

    summary = summarize_text_huggingface(transcript)
    optimized_content = generate_optimized_content(openai_api_key, summary)

    return summary, json.dumps(optimized_content, indent=4), transcript


# Gradio Interface
youtube_api_key = os.getenv("YOUTUBE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

gr.Interface(
    fn=lambda youtube_url: process_video(youtube_url, youtube_api_key, openai_api_key),
    inputs="text",
    outputs=["text", "text", "text"],
    title="YouTube Transcript Summarizer",
    description="Enter a YouTube URL to extract, summarize, and optimize content.",
).launch()
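
# Running locally (assumed setup, not part of the commit): export YOUTUBE_API_KEY and
# OPENAI_API_KEY in the environment, make sure ffmpeg is installed for yt-dlp/pydub/Whisper,
# then `python app.py` launches the Gradio UI on the default port.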