Commit
•
e79a00d
1
Parent(s):
efdc76b
AMIT's python files (#1)
Browse files- AMIT's python files (b574243264ffd3f6d6db3118d41c43994633e2bd)
Co-authored-by: Amit Jadhav <Amitjadhav01@users.noreply.huggingface.co>
- data_loader.py +54 -0
- summarize_transcription.py +54 -0
- transcribe_audio.py +44 -0
data_loader.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import subprocess
|
4 |
+
import time
|
5 |
+
|
6 |
+
def download_audio(youtube_url):
    """Download the audio track of a YouTube video as WAV into ``audio_files``.

    Runs the external ``yt-dlp`` executable and blocks until it exits.

    Args:
        youtube_url: URL of the video whose audio should be downloaded.

    Raises:
        RuntimeError: If ``yt-dlp`` is not installed or exits with a
            non-zero status. The failure details are logged first.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('audio_files', exist_ok=True)

    command = [
        'yt-dlp',
        '-x',  # Extract audio
        '--audio-format', 'wav',  # Convert to WAV format
        '-o', 'audio_files/%(title)s.%(ext)s',  # Output format
        '--',  # End of options: a URL starting with '-' is not parsed as a flag
        youtube_url,
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as err:
        logging.error('Error downloading audio: yt-dlp executable not found')
        raise RuntimeError('Failed to download audio') from err

    if result.returncode != 0:
        logging.error(f'Error downloading audio: {result.stderr}')
        raise RuntimeError('Failed to download audio')

    # subprocess.run() returns only after yt-dlp has exited, so no extra
    # waiting is needed before the caller looks for the output file.
|
29 |
+
|
30 |
+
def get_audio_filename():
    """Return the most recently modified ``.wav`` file in ``audio_files``.

    Returns:
        The path ``audio_files/<name>`` of the newest ``.wav`` file by
        modification time, or ``None`` if the directory does not exist or
        contains no ``.wav`` files.
    """
    try:
        entries = os.listdir('audio_files')
    except FileNotFoundError:
        # Nothing has been downloaded yet; treat it like an empty directory.
        return None

    wav_paths = [
        os.path.join('audio_files', name)
        for name in entries
        if name.endswith('.wav')
    ]
    if not wav_paths:
        return None

    # max() picks the newest file; an ascending sort followed by taking the
    # first match would return the oldest one instead.
    return max(wav_paths, key=os.path.getmtime)
|
40 |
+
|
41 |
+
if __name__ == "__main__":
    # Interactive entry point: ask for a URL, download it, report the result.
    youtube_url = input("Enter the YouTube URL: ")
    try:
        download_audio(youtube_url)

        # Look up the downloaded file (can be reused later for transcription).
        audio_file = get_audio_filename()
        if not audio_file:
            logging.error('No audio file found after download.')
            raise Exception('No audio file found.')
        print(f"Audio file downloaded: {audio_file}")
    except Exception as e:
        # Top-level boundary: report any failure instead of a traceback.
        logging.error(f'An error occurred: {e}')
|
summarize_transcription.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from langchain_openai import OpenAI
|
4 |
+
from langchain_core.prompts import PromptTemplate
|
5 |
+
from langchain_core.runnables import RunnableSequence
|
6 |
+
import time
|
7 |
+
|
8 |
+
def summarize_text(text):
    """Summarize ``text`` with a LangChain prompt piped into an OpenAI LLM.

    The chain is invoked up to three times. An error whose message contains
    'insufficient_quota' triggers a retry after an exponentially growing
    delay (1s, then 2s); any other error, or a quota error on the final
    attempt, is logged and re-raised.

    Args:
        text: The text to summarize.

    Returns:
        Whatever the summarization chain's ``invoke`` call returns.
    """
    prompt_template = PromptTemplate(
        input_variables=["text"],
        template="Please summarize the following text:\n\n{text}",
    )

    llm = OpenAI(temperature=0.7)  # Adjust the temperature for creativity
    summarization_chain = RunnableSequence(prompt_template | llm)

    max_retries = 3
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return summarization_chain.invoke({"text": text})
        except Exception as e:
            quota_error = 'insufficient_quota' in str(e)
            if not quota_error or attempt >= final_attempt:
                logging.error(f'An error occurred: {e}')
                raise e
            delay = 2 ** attempt  # Exponential backoff
            print(f'Quota exceeded. Retrying in {delay} seconds...')
            time.sleep(delay)
        attempt += 1
|
30 |
+
|
31 |
+
if __name__ == "__main__":
    # Ensure the blogs folder exists; exist_ok avoids a check-then-create race.
    os.makedirs('blogs', exist_ok=True)

    # Get the transcription file path from the user
    transcription_file_path = input("Enter the path to the transcription file: ")

    try:
        # Read explicitly as UTF-8 rather than the platform default encoding,
        # matching how the transcription script writes its output.
        with open(transcription_file_path, 'r', encoding='utf-8') as file:
            transcription_text = file.read()

        # Summarize the transcription text
        summary = summarize_text(transcription_text)

        # Build "<stem>_summary.txt" from the extension-less base name so that
        # only the real extension is dropped and the output name always
        # differs from the input name.
        stem = os.path.splitext(os.path.basename(transcription_file_path))[0]
        summary_file_path = os.path.join('blogs', f'{stem}_summary.txt')
        with open(summary_file_path, 'w', encoding='utf-8') as summary_file:
            summary_file.write(summary)

        print(f"Summary saved to: {summary_file_path}")
    except Exception as e:
        # Top-level boundary: report any failure instead of a traceback.
        logging.error(f'An error occurred while processing the transcription file: {e}')
|
transcribe_audio.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from whisper import load_model
|
4 |
+
|
5 |
+
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with the Whisper "base" model.

    Args:
        audio_file: Path to the audio file, passed to the model's
            ``transcribe`` method.

    Returns:
        The ``'text'`` entry of the transcription result.

    Raises:
        Exception: With the message 'Failed to transcribe audio' if the
            transcription step fails; the underlying error is logged first.
    """
    # Model loading happens outside the try block, so its errors propagate
    # unchanged. Change "base" to pick a different model size.
    whisper_model = load_model("base")

    try:
        output = whisper_model.transcribe(audio_file)
        text = output['text']
    except Exception as err:
        logging.error(f'Error transcribing audio: {err}')
        raise Exception('Failed to transcribe audio')
    return text
|
17 |
+
|
18 |
+
def save_transcription(transcription, title):
    """Write a transcription to ``transcriptions/<title>.txt`` as UTF-8.

    Creates the ``transcriptions`` directory when needed and prints the
    path of the written file.

    Args:
        transcription: The text to save.
        title: Base name, without extension, for the output file.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('transcriptions', exist_ok=True)

    transcription_file = os.path.join('transcriptions', f'{title}.txt')
    with open(transcription_file, 'w', encoding='utf-8') as f:
        f.write(transcription)
    print(f'Transcription saved to: {transcription_file}')
|
29 |
+
|
30 |
+
if __name__ == "__main__":
    # Interactive entry point: ask for an audio path, transcribe it, save it.
    audio_file = input("Enter the path to the audio file: ")

    # The title is the file name with its directory and extension removed.
    file_name = os.path.basename(audio_file)
    title, _extension = os.path.splitext(file_name)

    try:
        transcription = transcribe_audio(audio_file)
        print("Transcription:", transcription)

        # Persist the transcription under the derived title.
        save_transcription(transcription, title)
    except Exception as e:
        # Top-level boundary: report any failure instead of a traceback.
        logging.error(f'An error occurred: {e}')
|