Ramakrushna Amitjadhav01 committed on
Commit
e79a00d
1 Parent(s): efdc76b

AMIT's python files (#1)

Browse files

- AMIT's python files (b574243264ffd3f6d6db3118d41c43994633e2bd)


Co-authored-by: Amit Jadhav <Amitjadhav01@users.noreply.huggingface.co>

Files changed (3) hide show
  1. data_loader.py +54 -0
  2. summarize_transcription.py +54 -0
  3. transcribe_audio.py +44 -0
data_loader.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import subprocess
4
+ import time
5
+
6
def download_audio(youtube_url):
    """Download the audio track of a YouTube video as a WAV file.

    Runs ``yt-dlp`` as a subprocess with audio extraction (``-x``) and WAV
    conversion, using the output template ``audio_files/%(title)s.%(ext)s``
    relative to the current working directory. The ``audio_files`` directory
    is created if it does not exist.

    Args:
        youtube_url: URL of the video to download.

    Raises:
        Exception: If ``yt-dlp`` exits with a non-zero status; its stderr is
            logged before raising.
        OSError: Propagated from ``subprocess.run`` when the ``yt-dlp``
            executable cannot be started (e.g. it is not installed).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('audio_files', exist_ok=True)

    command = [
        'yt-dlp',
        '-x',  # Extract audio
        '--audio-format', 'wav',  # Convert to WAV format
        '-o', 'audio_files/%(title)s.%(ext)s',  # Output template
        '--',  # End of options: a URL/ID starting with '-' is not parsed as a flag
        youtube_url,
    ]

    # subprocess.run blocks until yt-dlp has exited, so no extra wait is
    # needed afterwards for the output file to appear.
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        logging.error('Error downloading audio: %s', result.stderr)
        raise Exception('Failed to download audio')
29
+
30
def get_audio_filename():
    """Return the path of the most recently modified WAV file in ``audio_files``.

    Only entries whose name ends with ``.wav`` are considered; the one with
    the greatest modification time wins.

    Returns:
        The path ``audio_files/<name>.wav`` of the newest WAV file, or
        ``None`` when the directory does not exist or holds no ``.wav`` file.
    """
    try:
        names = os.listdir('audio_files')
    except FileNotFoundError:
        # Nothing has been downloaded yet; report "no file" instead of crashing.
        return None

    wav_paths = [
        os.path.join('audio_files', name)
        for name in names
        if name.endswith('.wav')
    ]
    if not wav_paths:
        return None

    # max() by mtime picks the newest file; an ascending sort followed by
    # "first match" would return the oldest one instead.
    return max(wav_paths, key=os.path.getmtime)
40
+
41
if __name__ == "__main__":
    # Interactive entry point: ask for a URL, download its audio, then report
    # which file get_audio_filename() finds in the audio_files directory.
    youtube_url = input("Enter the YouTube URL: ")
    try:
        download_audio(youtube_url)

        # Look up the downloaded file (can be reused later for transcription).
        audio_file = get_audio_filename()
        if not audio_file:
            logging.error('No audio file found after download.')
            raise Exception('No audio file found.')
        print(f"Audio file downloaded: {audio_file}")
    except Exception as exc:
        # Top-level boundary: log the failure instead of showing a traceback.
        logging.error(f'An error occurred: {exc}')
summarize_transcription.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from langchain_openai import OpenAI
4
+ from langchain_core.prompts import PromptTemplate
5
+ from langchain_core.runnables import RunnableSequence
6
+ import time
7
+
8
def summarize_text(text):
    """Summarize ``text`` with an OpenAI model through LangChain.

    A prompt asking for a summary is piped into the model and invoked with
    the given text. A call that fails with an error whose message contains
    ``insufficient_quota`` is retried with exponential backoff (1 s, then
    2 s), for at most three attempts in total. Any other error, or a failure
    on the final attempt, is logged and re-raised.

    Args:
        text: The text to summarize.

    Returns:
        The value returned by the chain's ``invoke`` call (presumably the
        summary as a string -- confirm against the installed LangChain).

    Raises:
        Exception: Whatever the chain raised, re-raised unchanged.
    """
    prompt_template = PromptTemplate(
        input_variables=["text"],
        template="Please summarize the following text:\n\n{text}",
    )

    llm = OpenAI(temperature=0.7)  # Adjust the temperature for creativity
    # ``prompt | llm`` already yields a runnable sequence; wrapping it in
    # RunnableSequence(...) again is redundant (a single-step constructor
    # call may even be rejected by some langchain_core versions).
    summarization_chain = prompt_template | llm

    max_retries = 3
    for attempt in range(max_retries):
        try:
            return summarization_chain.invoke({"text": text})
        except Exception as e:
            is_last_attempt = attempt == max_retries - 1
            if 'insufficient_quota' not in str(e) or is_last_attempt:
                logging.error(f'An error occurred: {e}')
                # Bare raise keeps the original traceback intact.
                raise
            delay = 2 ** attempt  # Exponential backoff
            print(f'Quota exceeded. Retrying in {delay} seconds...')
            time.sleep(delay)
30
+
31
if __name__ == "__main__":
    # Script entry point: read a transcription file, summarize it, and write
    # the summary to blogs/<transcription name>_summary.txt.

    # Ensure the blogs folder exists (exist_ok avoids a check-then-create race).
    os.makedirs('blogs', exist_ok=True)

    # Get the transcription file path from the user
    transcription_file_path = input("Enter the path to the transcription file: ")

    try:
        # transcribe_audio.py writes its transcriptions as UTF-8, so read with
        # the same encoding instead of the platform default.
        with open(transcription_file_path, 'r', encoding='utf-8') as file:
            transcription_text = file.read()

        # Summarize the transcription text
        summary = summarize_text(transcription_text)

        # Build the output name from the file stem. A plain
        # str.replace('.txt', ...) leaves the name unchanged when there is no
        # '.txt' and rewrites every '.txt' occurring inside the name.
        stem = os.path.splitext(os.path.basename(transcription_file_path))[0]
        summary_file_path = os.path.join('blogs', f'{stem}_summary.txt')
        with open(summary_file_path, 'w', encoding='utf-8') as summary_file:
            summary_file.write(summary)

        print(f"Summary saved to: {summary_file_path}")
    except Exception as e:
        # Top-level boundary: log the failure instead of showing a traceback.
        logging.error(f'An error occurred while processing the transcription file: {e}')
transcribe_audio.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from whisper import load_model
4
+
5
def transcribe_audio(audio_file, model_size="base"):
    """Transcribe an audio file to text using Whisper.

    Args:
        audio_file: Audio input handed to the model's ``transcribe`` method
            (the script passes a file path).
        model_size: Name of the Whisper model passed to ``load_model``.
            Defaults to ``"base"``.

    Returns:
        The ``'text'`` entry of the transcription result.

    Raises:
        Exception: If transcription fails. The underlying error is logged and
            attached as ``__cause__``. Errors raised while loading the model
            happen outside the ``try`` and propagate unchanged.
    """
    # Load Whisper model
    model = load_model(model_size)

    # Perform transcription
    try:
        result = model.transcribe(audio_file)
        return result['text']
    except Exception as e:
        logging.error(f'Error transcribing audio: {e}')
        # Chain the cause so the root error is not lost to callers.
        raise Exception('Failed to transcribe audio') from e
17
+
18
def save_transcription(transcription, title):
    """Save a transcription to ``transcriptions/<title>.txt`` as UTF-8.

    The ``transcriptions`` directory is created if needed, and an existing
    file with the same title is overwritten.

    Args:
        transcription: The text to write.
        title: Base name (without extension) of the output file.

    Returns:
        The path of the written transcription file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('transcriptions', exist_ok=True)

    # Save the transcription to a text file
    transcription_file = os.path.join('transcriptions', f'{title}.txt')
    with open(transcription_file, 'w', encoding='utf-8') as f:
        f.write(transcription)
    print(f'Transcription saved to: {transcription_file}')
    return transcription_file
29
+
30
if __name__ == "__main__":
    # Interactive entry point: transcribe one audio file and store the text
    # under a title derived from the audio file's name.
    audio_file = input("Enter the path to the audio file: ")

    # The title is the file name with its directory and extension removed.
    file_name = os.path.basename(audio_file)
    title, _extension = os.path.splitext(file_name)

    try:
        transcription = transcribe_audio(audio_file)
        print("Transcription:", transcription)

        # Persist the transcription next to any earlier ones.
        save_transcription(transcription, title)
    except Exception as exc:
        # Top-level boundary: log the failure instead of showing a traceback.
        logging.error(f'An error occurred: {exc}')