AnalysisWithMSR committed
Commit 6aaea8c · verified · 1 Parent(s): 318ea39

Update app.py

Files changed (1)
  1. app.py +208 -5
app.py CHANGED
@@ -1,8 +1,211 @@
  import os

- # Checking yt-dlp version
- os.system('yt-dlp --version')

- # Checking youtube-transcript-api (this needs to be done within Python)
- # Checking ffmpeg version
- os.system('ffmpeg -version')
+ import googleapiclient.discovery
+ import re
+ import yt_dlp
+ import whisper
+ from pydub import AudioSegment
+ import tempfile
+ from transformers import pipeline
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import torch
+ import openai
+ import json
+ from urllib.parse import urlparse, parse_qs
  import os
+ import gradio as gr

+ def extract_video_id(url):
+     """Extracts the video ID from a YouTube URL."""
+     try:
+         parsed_url = urlparse(url)
+         if "youtube.com" in parsed_url.netloc:
+             query_params = parse_qs(parsed_url.query)
+             return query_params.get('v', [None])[0]
+         elif "youtu.be" in parsed_url.netloc:
+             return parsed_url.path.strip("/")
+         else:
+             print("Invalid YouTube URL.")
+             return None
+     except Exception as e:
+         print(f"Error parsing URL: {e}")
+         return None

+ def get_video_duration(video_id, api_key):
+     """Fetches the video duration in minutes."""
+     try:
+         youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
+         request = youtube.videos().list(part="contentDetails", id=video_id)
+         response = request.execute()
+         if response["items"]:
+             duration = response["items"][0]["contentDetails"]["duration"]
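+             # The API reports duration in ISO 8601 form (e.g. "PT1H2M30S"); parse out hours, minutes and seconds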
+             match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration)
+             hours = int(match.group(1)) if match.group(1) else 0
+             minutes = int(match.group(2)) if match.group(2) else 0
+             seconds = int(match.group(3)) if match.group(3) else 0
+             return hours * 60 + minutes + seconds / 60
+         else:
+             print("No video details found.")
+             return None
+     except Exception as e:
+         print(f"Error fetching video duration: {e}")
+         return None
+
+ def download_and_transcribe_with_whisper(youtube_url):
+     try:
+         with tempfile.TemporaryDirectory() as temp_dir:
+             temp_audio_file = os.path.join(temp_dir, "audio.mp3")
+
+             ydl_opts = {
+                 'format': 'bestaudio/best',
+                 'outtmpl': temp_audio_file,
+                 'extractaudio': True,
+                 'audioquality': 1,
+             }
+
+             # Download audio using yt-dlp
+             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                 ydl.download([youtube_url])
+
+             # Convert to wav for Whisper
+             audio = AudioSegment.from_file(temp_audio_file)
+             wav_file = os.path.join(temp_dir, "audio.wav")
+             audio.export(wav_file, format="wav")
+
+             # Run Whisper transcription
+             model = whisper.load_model("large")
+             result = model.transcribe(wav_file)
+             transcript = result['text']
+             return transcript
+
+     except Exception as e:
+         print(f"Error during transcription: {e}")
+         return None
+
+ def get_transcript_from_youtube_api(video_id, video_length):
+     """Fetches transcript using YouTube API if available."""
+     try:
+         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+
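+         # Prefer a manually created transcript; fall back to the auto-generated
+         # English transcript only when the video is longer than 15 minutes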
+         for transcript in transcript_list:
+             if not transcript.is_generated:
+                 segments = transcript.fetch()
+                 return " ".join(segment['text'] for segment in segments)
+
+         if video_length > 15:
+             auto_transcript = transcript_list.find_generated_transcript(['en'])
+             if auto_transcript:
+                 segments = auto_transcript.fetch()
+                 return " ".join(segment['text'] for segment in segments)
+
+         print("Manual transcript not available, and video is too short for auto-transcript.")
+         return None
+
+     except Exception as e:
+         print(f"Error fetching transcript: {e}")
+         return None
+
+ def get_transcript(youtube_url, api_key):
+     """Gets transcript from YouTube API or Whisper if unavailable."""
+     video_id = extract_video_id(youtube_url)
+     if not video_id:
+         print("Invalid or unsupported YouTube URL.")
+         return None
+
+     video_length = get_video_duration(video_id, api_key)
+     if video_length is not None:
+         print(f"Video length: {video_length:.2f} minutes.")
+         transcript = get_transcript_from_youtube_api(video_id, video_length)
+         if transcript:
+             return transcript
+         print("Using Whisper for transcription.")
+         return download_and_transcribe_with_whisper(youtube_url)
+     else:
+         print("Error fetching video duration.")
+         return None
+
+ def summarize_text_huggingface(text):
+     """Summarizes text using a Hugging Face summarization model."""
+     summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
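+     # The summarizer only accepts limited input, so split the text into
+     # overlapping chunks (measured in characters here) and summarize each one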
+     max_input_length = 1024
+     chunk_overlap = 100
+     text_chunks = [
+         text[i:i + max_input_length]
+         for i in range(0, len(text), max_input_length - chunk_overlap)
+     ]
+     summaries = [
+         summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
+         for chunk in text_chunks
+     ]
+     return " ".join(summaries)
+
+ def generate_optimized_content(api_key, summarized_transcript):
+     openai.api_key = api_key
+
+     prompt = f"""
+     Analyze the following summarized YouTube video transcript and:
+     1. Extract the top 10 keywords.
+     2. Generate an optimized title (less than 65 characters).
+     3. Create an engaging description.
+     4. Generate related tags for the video.
+
+     Summarized Transcript:
+     {summarized_transcript}
+
+     Provide the results in the following JSON format:
+     {{
+         "keywords": ["keyword1", "keyword2", ..., "keyword10"],
+         "title": "Generated Title",
+         "description": "Generated Description",
+         "tags": ["tag1", "tag2", ..., "tag10"]
+     }}
+     """
+
+     try:
+         # Use the updated OpenAI API format for chat completions
+         response = openai.ChatCompletion.create(
+             model="gpt-3.5-turbo",
+             messages=[{"role": "system", "content": "You are an SEO expert."},
+                       {"role": "user", "content": prompt}]
+         )
+         # Extract and parse the response
+         response_content = response['choices'][0]['message']['content']
+         content = json.loads(response_content)
+         return content
+
+     except Exception as e:
+         print(f"Error generating content: {e}")
+         return None
+
+
+ def process_youtube_url(youtube_url, youtube_api_key, openai_api_key):
+     transcript = get_transcript(youtube_url, youtube_api_key)
+     if not transcript:
+         return "Could not fetch the transcript. Please try another video."
+
+     summary = summarize_text_huggingface(transcript)
+
+     optimized_content = generate_optimized_content(openai_api_key, summary)
+     if optimized_content:
+         return json.dumps(optimized_content, indent=4)
+     else:
+         return "Error generating optimized content."
+
+
+ # Gradio Interface
+ def gradio_interface(youtube_url, youtube_api_key, openai_api_key):
+     return process_youtube_url(youtube_url, youtube_api_key, openai_api_key)
+
+
+ # Creating the Gradio interface
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.Textbox(label="YouTube URL"),
+         gr.Textbox(label="YouTube API Key", type="password"),
+         gr.Textbox(label="OpenAI API Key", type="password")
+     ],
+     outputs=gr.Textbox(label="Optimized Content"),
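+     # live=True re-runs the interface whenever an input changes, without a submit button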
+     live=True
+ )
+
+ if __name__ == "__main__":
+     iface.launch()