Ramakrushna
commited on
Commit
•
2c566d7
1
Parent(s):
e7654f2
commit new files
Browse files- README.md +38 -3
- falsk_app.py +55 -0
- groq_backend.py +246 -0
- requirements.txt +0 -0
README.md
CHANGED
@@ -1,3 +1,38 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# youtube2blog
|
2 |
+
|
3 |
+
![screenshot](https://github.com/S4mpl3r/youtube2blog/blob/main/assets/screenshot.jpg)
|
4 |
+
|
5 |
+
youtube2blog offers a seamless solution for transforming any YouTube video, or audio file, into a comprehensive blog post. This tool leverages the power of [Groq](https://groq.com) and [Deepgram](https://deepgram.com) to provide a streamlined content creation process.
|
6 |
+
|
7 |
+
## Features
|
8 |
+
- **Video-to-Blog conversion:** Easily convert any YouTube video into a blog post with just the video's URL.
|
9 |
+
- **Keyword Extraction:** Extract the top 10 keywords from a YouTube video.
|
10 |
+
- **Transcription:** Obtain a full transcript of the video for further analysis or content creation.
|
11 |
+
|
12 |
+
This tool is not limited to YouTube videos; users can also input their own audio files to generate blog posts, extract keywords, and transcribe content.
|
13 |
+
|
14 |
+
## Installation
|
15 |
+
To use this tool, you need Groq and Deepgram API keys. Groq is currently free, and Deepgram provides a $200 credit, which is more than enough to run this tool.
|
16 |
+
To install, do the following:
|
17 |
+
1. Clone the repository:
|
18 |
+
```bash
|
19 |
+
git clone https://github.com/S4mpl3r/youtube2blog.git
|
20 |
+
```
|
21 |
+
2. Create a Python virtual environment and activate it (optional, but highly recommended).
|
22 |
+
3. Create a .env file in the project root and populate it with your API keys:
|
23 |
+
```bash
|
24 |
+
GROQ_API_KEY=<YOUR_KEY>
|
25 |
+
DEEPGRAM_API_KEY=<YOUR_KEY>
|
26 |
+
```
|
27 |
+
4. Install the required packages:
|
28 |
+
```bash
|
29 |
+
python -m pip install -r requirements.txt
|
30 |
+
```
|
31 |
+
5. Change into the youtube2blog directory and run the tool:
|
32 |
+
```bash
|
33 |
+
cd youtube2blog/
|
34 |
+
python youtube2blog.py
|
35 |
+
```
|
36 |
+
|
37 |
+
## License
|
38 |
+
MIT
|
falsk_app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import groq_backend
|
2 |
+
from flask import Flask, request, jsonify, g
|
3 |
+
from typing import Dict, Union
|
4 |
+
import logging
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
# Flask application object plus a module-level logger named after this module.
app = Flask(__name__)
logger = logging.getLogger(__name__)

# Fallback values for requests that omit the matching JSON key.
app.config.update(
    DEFAULT_CHUNK_SIZE=25 * 60 * 1000,  # 25 minutes expressed in milliseconds
    TEMP_DIR="temp",
)
|
14 |
+
|
15 |
+
# Optional scheme, optional "www.", then youtube.com or youtu.be followed by
# a non-empty path. Other subdomains (e.g. "m.") are not accepted.
YOUTUBE_URL_PATTERN = r'^(https?://)?(www\.)?(youtube\.com|youtu\.be)/.+$'


def validate_youtube_url(url: str) -> bool:
    """Return whether ``url`` matches ``YOUTUBE_URL_PATTERN``.

    Args:
        url: Candidate URL. Values that are not strings (for example a number
            or ``None`` taken from a JSON payload) are treated as invalid
            rather than raising ``TypeError`` from ``re.match``.

    Returns:
        ``True`` if the value is a string matching the pattern, else ``False``.
    """
    if not isinstance(url, str):
        return False
    return re.match(YOUTUBE_URL_PATTERN, url) is not None
|
19 |
+
|
20 |
+
|
21 |
+
@app.route('/transcribe', methods=['POST'])
def transcribe() -> Union[Dict, str]:
    """Transcribe a YouTube video described by a JSON request body.

    Expected JSON object keys:
        yt_url (required): URL accepted by ``validate_youtube_url``.
        chunk_size (optional): positive integer, defaults to
            ``app.config['DEFAULT_CHUNK_SIZE']``.
        temp_dir (optional): relative directory below the working directory,
            defaults to ``app.config['TEMP_DIR']``.

    Returns:
        A ``(response, status)`` pair: ``{'transcript': ...}`` with 200 on
        success, or ``{'error': ...}`` with 400 for invalid input / a
        ``ValueError`` from the backend, and 500 for any other failure.
    """
    import os  # local import: this module has no top-level ``import os``

    def _bad_request(message: str):
        """Log ``message`` and build the matching 400 response."""
        logger.error(message)
        return jsonify({'error': message}), 400

    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so every malformed request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return _bad_request("Request body must be a JSON object.")

    yt_url = data.get('yt_url')
    chunk_size = data.get('chunk_size', app.config['DEFAULT_CHUNK_SIZE'])
    temp_dir = data.get('temp_dir', app.config['TEMP_DIR'])

    if not yt_url:
        return _bad_request("YouTube URL is required.")

    if not isinstance(yt_url, str) or not validate_youtube_url(yt_url):
        return _bad_request(f"Invalid YouTube URL: {yt_url}")

    # A zero or negative chunk size would keep the backend's chunking loop
    # from ever reaching the end of the audio. bool is excluded explicitly
    # because it is a subclass of int.
    if isinstance(chunk_size, bool) or not isinstance(chunk_size, int) or chunk_size <= 0:
        return _bad_request("chunk_size must be a positive integer.")

    # The backend deletes temp_dir recursively once it is done, so a
    # client-supplied path must not be able to point outside (or at) the
    # working directory.
    normalized_dir = (
        os.path.normpath(temp_dir) if isinstance(temp_dir, str) and temp_dir else None
    )
    if (
        normalized_dir is None
        or os.path.isabs(normalized_dir)
        or os.path.splitdrive(normalized_dir)[0]
        or normalized_dir == os.curdir
        or normalized_dir.split(os.sep, 1)[0] == os.pardir
    ):
        return _bad_request(
            "temp_dir must be a relative path inside the working directory."
        )

    try:
        logger.info(
            "Transcribing %s with chunk size %s and temp dir %s",
            yt_url, chunk_size, normalized_dir,
        )
        transcript = groq_backend.generate_youtube_transcript_with_groq(
            yt_url, chunk_size, normalized_dir
        )
        return jsonify({'transcript': transcript}), 200
    except ValueError as e:
        error = f"Failed to transcribe video: {str(e)}"
        logger.exception(error)
        return jsonify({'error': error}), 400
    except Exception:
        # Details go to the log only; the client gets a generic message.
        error = "Unexpected error occurred during transcription"
        logger.exception(error)
        return jsonify({'error': error}), 500
|
52 |
+
|
53 |
+
|
54 |
+
# Start Flask's built-in server only when this file is executed as a script.
if __name__ == '__main__':
    app.run()
|
groq_backend.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
import logging
|
4 |
+
import traceback
|
5 |
+
import re
|
6 |
+
from typing import List, Optional
|
7 |
+
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
from groq import Groq
|
10 |
+
import yt_dlp
|
11 |
+
from pydub import AudioSegment
|
12 |
+
|
13 |
+
|
14 |
+
# Configure the root logger so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)

# Load variables from a local ".env" file into the process environment
# (presumably where the Groq API key is read from -- confirm against README).
load_dotenv('.env')
|
16 |
+
|
17 |
+
def transcribe_with_groq(file_path: str) -> str:
    """Send one audio file to Groq's transcription endpoint and return the text.

    The file is read fully into memory and uploaded under its base name,
    using the "whisper-large-v3" model. Callers in this module pass MP3
    chunk files.

    Args:
        file_path (str): Path of the audio file to transcribe.

    Returns:
        str: The ``text`` attribute of the transcription response.

    Raises:
        Exception: Any error raised while reading the file or calling the
            API is logged (message plus traceback) and re-raised unchanged.
    """
    groq_client = Groq()
    upload_name = os.path.basename(file_path)

    try:
        with open(file_path, "rb") as audio_handle:
            audio_bytes = audio_handle.read()
        response = groq_client.audio.transcriptions.create(
            file=(upload_name, audio_bytes),
            model="whisper-large-v3",
        )
        text = response.text
        logging.info(text)
    except Exception as exc:
        logging.error(f"Error during transcription: {str(exc)}")
        logging.error(traceback.format_exc())
        raise
    return text
|
46 |
+
|
47 |
+
def yt_dlp_download(yt_url: str, output_path: Optional[str] = None) -> str:
    """Download a video's audio with yt-dlp and convert it to MP3.

    yt-dlp is configured to fetch the best available audio stream and to run
    the ``FFmpegExtractAudio`` post-processor (MP3, quality "192"). The output
    file is named after the video title.

    Args:
        yt_url (str): URL of the video to download.
        output_path (Optional[str]): Directory to write into. Defaults to the
            current working directory when ``None``.

    Returns:
        str: Path of the resulting ``.mp3`` file, built from yt-dlp's prepared
        filename with its extension replaced. The path is absolute only if
        ``output_path`` is (the default is).

    Raises:
        yt_dlp.utils.DownloadError: If yt-dlp fails to download the audio.
        Exception: Any other error; it is logged with a traceback and
            re-raised unchanged.
    """
    if output_path is None:
        output_path = os.getcwd()

    ydl_opts = {
        'format': 'bestaudio/best',
        # A watch URL that also carries a playlist id would otherwise make
        # extract_info return a playlist, and the filename derived below
        # would not correspond to a downloaded file.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(yt_url, download=True)
            file_name = ydl.prepare_filename(result)
        # prepare_filename reports the pre-conversion extension; the
        # post-processor leaves an .mp3 next to it. splitext only touches the
        # final path component, unlike a plain split on ".".
        mp3_file_path = os.path.splitext(file_name)[0] + '.mp3'
        logging.info(f"yt_dlp_download saved YouTube video to file path: {mp3_file_path}")
        return mp3_file_path
    except yt_dlp.utils.DownloadError as e:
        logging.error(f"yt_dlp_download failed to download audio from URL {yt_url}: {e}")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred with yt_dlp_download: {e}")
        logging.error(traceback.format_exc())
        raise
|
87 |
+
|
88 |
+
|
89 |
+
def create_audio_chunks(audio_file: str, chunk_size: int, temp_dir: str) -> List[str]:
    """Split an audio file into consecutive MP3 chunks of ``chunk_size`` ms.

    The audio is loaded with ``AudioSegment.from_file`` and sliced by
    millisecond offsets (``audio[start:end]``). Each slice is exported to
    ``temp_dir`` as ``"<index>_<original base name>.mp3"``, with indices
    starting at 0 in playback order. The last chunk may be shorter.

    Args:
        audio_file (str): Path of the audio file to split.
        chunk_size (int): Length of each chunk in milliseconds; must be > 0.
        temp_dir (str): Directory for the chunk files; created if missing.

    Returns:
        List[str]: Paths of the exported chunk files, in order.

    Raises:
        ValueError: If ``chunk_size`` is not positive (checked before any
            file system access, since the slicing loop could never finish).
        Exception: Whatever ``AudioSegment.from_file`` raises when the file
            cannot be loaded; it is logged and re-raised unchanged.
        RuntimeError: If exporting a chunk fails; the original error is
            chained as ``__cause__``.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive number of milliseconds, got {chunk_size!r}"
        )

    os.makedirs(temp_dir, exist_ok=True)
    file_name = os.path.splitext(os.path.basename(audio_file))[0]

    try:
        audio = AudioSegment.from_file(audio_file)
    except Exception as e:
        logging.error(f"create_audio_chunks failed to load audio file {audio_file}: {e}")
        logging.error(traceback.format_exc())
        # Propagate instead of returning []: an empty list would make the
        # caller treat an unreadable file as a successful, empty transcript.
        raise

    total_length = len(audio)
    chunk_files = []
    start = 0
    counter = 0

    while start < total_length:
        chunk = audio[start:start + chunk_size]
        chunk_file_path = os.path.join(temp_dir, f"{counter}_{file_name}.mp3")
        try:
            chunk.export(chunk_file_path, format="mp3")  # Using .mp3 because it's cheaper
        except Exception as e:
            error_message = f"create_audio_chunks failed to export chunk {counter}: {e}"
            logging.error(error_message)
            logging.error(traceback.format_exc())
            # Raising the bare string would itself fail with TypeError.
            raise RuntimeError(error_message) from e
        chunk_files.append(chunk_file_path)
        start += chunk_size
        counter += 1
    return chunk_files
|
139 |
+
|
140 |
+
def generate_youtube_transcript_with_groq(yt_url: str, chunk_size: int, temp_dir: str) -> str:
    """Download a YouTube video's audio, transcribe it in chunks, and clean up.

    Steps:
        1. Validate ``yt_url`` against the YouTube URL pattern and check that
           ``chunk_size`` is positive (both before any download happens).
        2. Download the audio via ``yt_dlp_download``.
        3. Split it into chunks via ``create_audio_chunks``.
        4. Transcribe each chunk via ``transcribe_with_groq``.
        5. Join the chunk transcripts with single spaces.
        6. Remove ``temp_dir`` and the downloaded file. This runs whether or
           not steps 3-5 succeeded.

    Args:
        yt_url (str): URL of the YouTube video to transcribe.
        chunk_size (int): Chunk duration in milliseconds; must be > 0.
        temp_dir (str): Directory for temporary chunk files. It is removed
            recursively at the end, so it must not contain unrelated data.

    Returns:
        str: The space-joined transcripts of all chunks.

    Raises:
        ValueError: If the URL does not match the expected format or
            ``chunk_size`` is not positive.
        Exception: Whatever ``yt_dlp_download`` raises (logged, re-raised).
        RuntimeError: If chunking or transcription fails; the original error
            is chained as ``__cause__``.

    Cleanup failures are logged but not raised, so they neither hide the
    primary error nor discard a transcript that was already produced.
    """
    youtube_url_pattern = r'^(https?://)?(www\.)?(youtube\.com|youtu\.be)/.+$'
    if not isinstance(yt_url, str) or not re.match(youtube_url_pattern, yt_url):
        logging.error(f"Invalid YouTube URL: {yt_url}")
        raise ValueError("Invalid YouTube URL provided.")

    # Fail before downloading: a non-positive size can never cover the audio.
    if chunk_size <= 0:
        logging.error(f"Invalid chunk size: {chunk_size}")
        raise ValueError("chunk_size must be a positive number of milliseconds.")

    try:
        file_path = yt_dlp_download(yt_url)
    except Exception as e:
        logging.error(f"generate_youtube_transcript_with_groq failed to download YouTube video from URL {yt_url}: {e}")
        logging.error(traceback.format_exc())
        raise

    try:
        try:
            chunk_files = create_audio_chunks(file_path, chunk_size, temp_dir)
        except Exception as e:
            error_message = f"generate_youtube_transcript_with_groq failed to create audio chunks from file {file_path}: {e}"
            logging.error(error_message)
            logging.error(traceback.format_exc())
            # Raising the bare string would itself fail with TypeError.
            raise RuntimeError(error_message) from e

        transcripts = []
        for file_name in chunk_files:
            try:
                logging.info(f"Transcribing {file_name}")
                transcripts.append(transcribe_with_groq(file_name))
            except Exception as e:
                error_message = f"generate_youtube_transcript_with_groq failed to transcribe file {file_name}: {e}"
                logging.error(error_message)
                logging.error(traceback.format_exc())
                raise RuntimeError(error_message) from e

        return " ".join(transcripts)
    finally:
        # Best-effort cleanup of the chunk directory and the downloaded file.
        try:
            if os.path.isdir(temp_dir):
                shutil.rmtree(temp_dir)
        except Exception as e:
            logging.error(f"generate_youtube_transcript_with_groq failed to remove temporary directory {temp_dir}: {e}")
            logging.error(traceback.format_exc())

        try:
            if os.path.exists(file_path):
                os.remove(file_path)
        except Exception as e:
            logging.error(f"generate_youtube_transcript_with_groq failed to remove downloaded file {file_path}: {e}")
            logging.error(traceback.format_exc())
|
216 |
+
|
217 |
+
def save_transcript_to_file(transcript: str, folder_path: str, file_name: str) -> None:
    """Write ``transcript`` to ``<folder_path>/<file_name>.txt`` as UTF-8.

    Args:
        transcript (str): The transcript text to save.
        folder_path (str): Target folder; created (with parents) if missing.
        file_name (str): File name without the ``.txt`` extension. An
            existing file of the same name is overwritten.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, file_name + '.txt')

    # Explicit encoding: transcripts can contain non-ASCII text, which the
    # platform default encoding may be unable to represent.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(transcript)

    logging.info(f"Transcript saved to {file_path}")
|
235 |
+
|
236 |
+
# Script entry point: transcribe one hard-coded video and store the result.
if __name__ == "__main__":
    yt_url = "https://www.youtube.com/watch?v=ZUOYyXg7ewo&ab_channel=TalkTottenham"
    chunk_size = 25*60000  # 25-minute chunks, in milliseconds
    temp_directory = "temp_chunks"
    folder_to_save = "transcripts"
    file_name_to_save = "youtube_transcript"

    transcript = generate_youtube_transcript_with_groq(yt_url, chunk_size, temp_directory)

    # Persist the transcript, then report where it went.
    save_transcript_to_file(transcript, folder_to_save, file_name_to_save)
    print(f"Transcript saved in {folder_to_save}/{file_name_to_save}.txt")
|
requirements.txt
ADDED
Binary file (1.23 kB). View file
|
|