uzi007 committed on
Commit 11bc390
1 Parent(s): 1e23530

Added Functions & Updated Requirements File

Files changed (3):
  1. media_download.py +36 -9
  2. requirements.txt +2 -1
  3. summarizer.py +56 -7
media_download.py CHANGED
@@ -372,24 +372,51 @@ class YoutubeDownloader(MediaDownloader):
             video_path (str): Path to the input video file.
 
         Returns:
-            bool: True if extraction is successful, False otherwise.
+            str: Path of extracted audio.
         """
         try:
-            # Determine the file format (MP4 or WebM) based on the file extension
+            # Path for Extracted Audio File
             filename, extension = os.path.splitext(video_path)
-
-            # Extracted audio path
             audio_path = filename + '.mp3'
 
-            # Choose the appropriate codec for the output audio format (MP3)
+            # Choosing the Appropriate Codec for the Output Audio Format (MP3)
             audio_codec = "libmp3lame" if extension.lower() in (".mp4", ".webm") else "mp3"
 
-            # Run the ffmpeg command to extract audio
-            subprocess.run(["ffmpeg", "-i", video_path, "-vn", "-acodec",
-                            audio_codec, audio_path, '-loglevel', 'quiet'], check=True)
-
+            # Extracting Audio using FFMPEG Command
+            command = ["ffmpeg", "-i", video_path, "-vn", "-acodec",
+                       audio_codec, audio_path, '-loglevel', 'quiet']
+            subprocess.run(command, check=True)
 
             return audio_path
 
+        except subprocess.CalledProcessError as e:
+            print(f"Error: {e}")
+
+    @staticmethod
+    def burn_subtitles(video_file_path, subtitle_file_path):
+        '''
+        Burns the subtitles onto the video
+
+        Args:
+            video_file_path (str): Path to the input video file.
+            subtitle_file_path (str): Path to the subtitle file.
+
+        Returns:
+            str: Path of output video with subtitles.
+        '''
+        try:
+            # Getting Output File Path
+            video_filename, video_extension = os.path.splitext(video_file_path)
+            subtitle_filename, subtitle_extension = os.path.splitext(subtitle_file_path)
+            output_file_path = video_filename + subtitle_extension.replace('.', '_') + video_extension
+
+            # Burning the Subtitles onto Video using FFMPEG Command
+            command = ['ffmpeg', '-i', video_file_path,
+                       '-vf', f'subtitles={subtitle_file_path}',
+                       output_file_path, '-loglevel', 'quiet']
+            subprocess.run(command, check=True)
+
+            return output_file_path
+
         except subprocess.CalledProcessError as e:
             print(f"Error: {e}")
requirements.txt CHANGED
@@ -1,7 +1,6 @@
 fastapi
 faster-whisper
 langchain
-nltk
 openai
 pandas
 pytube
@@ -12,3 +11,5 @@ uvicorn
 wordcloud
 youtube-transcript-api
 git+https://github.com/suno-ai/bark.git
+--extra-index-url https://download.pytorch.org/whl/cu113
+torch
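
For context: pip honors in-file options, so a plain "pip install -r requirements.txt" will consult the PyTorch CUDA 11.3 wheel index in addition to PyPI when resolving torch. One caveat visible in this same commit: nltk is removed here, but summarizer.py below still imports from nltk.corpus, so nltk must still be present in the environment (for example as a dependency of another listed package) for the new create_word_count function to run.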
summarizer.py CHANGED
@@ -10,6 +10,10 @@ from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
 import os
 from langchain.docstore.document import Document
+import re
+from collections import Counter
+# import nltk
+from nltk.corpus import stopwords
 
 os.environ["OPENAI_API_KEY"] = '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'
 
@@ -118,7 +122,7 @@ class Extract_Summary:
         prompt_template = """
         Extract Key Informtion from the text below. This key information can include People Names & their Role/rank, Locations, Organization,Nationalities,Religions,
         Events such as Historical, social, sporting and naturally occurring events, Products , Address & email, URL, Date & Time, Provide the list of Key information each
-        should be labeled with thier crossponding category.if key information related to category is not present, dont add that category in Response.
+        should be labeled with thier crossponding category.if key information related to category is not present,add "Not mentioned" in the response.
         {text}
 
         """
@@ -133,7 +137,7 @@ class Extract_Summary:
         map_prompts = """
         Extract Key Informtion from the text below. This key information can include People Names & their Role/rank, Locations, Organization,Nationalities,Religions,
         Events such as Historical, social, sporting and naturally occurring events, Products , Address & email, URL, Date & Time, Provide the list of Key information each
-        should be labeled with thier crossponding category.if key information related to category is not present, dont add that category in Response.
+        should be labeled with thier crossponding category.if key information related to category is not present, add Not mentioned in the response.
         {text}
 
         """
@@ -141,7 +145,7 @@ class Extract_Summary:
         Below Text contains Key Information that was extracted from text. You job is to combine the Key Information and Return the results.This key information can include People Names & their Role/rank,
         Locations, Organization,Nationalities,Religions,Events such as Historical, social, sporting and naturally occurring events, Products ,
         Address & email, URL, Date & Time, Provide the list of Key information each should be labeled with thier crossponding category.
-        if key information related to category is not present, dont add that category in Response.
+        if key information related to category is not present, add Not mentioned in the response.
         {text}
 
         """
 
@@ -253,11 +257,56 @@ class Extract_Summary:
         # display(Markdown(f"Text: {docs}"))
         # display(Markdown(f"Summary Response: {output}"))
         return output
+
+
+    def parse_key_information(self,text):
+
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+        # Initialize the dictionary to store information
+        info_dict = {}
+
+        current_category = None
+
+        # Iterate through each line and process the information
+        for line in lines:
+            if re.match(r'^[A-Z][\w\s&/-]*:', line):
+                current_category = line.rstrip(':')
+                info_dict[current_category] = []
+            else:
+                if line != '- Not mentioned':
+                    info_dict[current_category].append(line.replace('- ', ''))
+
+        # Remove categories with no entries
+        info_dict = {category: entries for category, entries in info_dict.items() if entries}
+
+        return info_dict
+
 
-    def create_wordcloud(self, output):
-        wc = WordCloud(stopwords=STOPWORDS, height=500, width=300)
-        wc.generate(output)
-        wc.to_file('WordCloud.png')
+    # def create_wordcloud(self, output):
+    #     wc = WordCloud(stopwords=STOPWORDS, height=500, width=300)
+    #     wc.generate(output)
+    #     wc.to_file('WordCloud.png')
+
+
+    def create_word_count(text):
+        # Split the text into words, convert them to lowercase
+        words = text.split()
+        words = [word.lower() for word in words]
+
+        # Get a list of English stop words
+        stop_words = set(stopwords.words('english'))
+
+        # Filter out stop words from the list of words
+        filtered_words = [word for word in words if word not in stop_words]
+
+        # Count the frequencies of each word
+        word_counts = Counter(filtered_words)
+
+        # Convert the Counter object to a dictionary
+        word_count_dict = dict(word_counts)
+
+        return word_count_dict
+
 
 
 class AudioBookNarration:
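
A usage sketch for the two new helpers (not part of the commit): summary is an assumed Extract_Summary instance, and nltk's stopword corpus must already be downloaded via nltk.download('stopwords').

    # Hypothetical usage of the parse/count helpers added above.
    model_output = "Locations:\n- Paris\n- Not mentioned\nURL:\n- Not mentioned"
    info = summary.parse_key_information(model_output)
    # -> {'Locations': ['Paris']}; 'URL' is dropped since only 'Not mentioned' followed it
    counts = Extract_Summary.create_word_count("the quick brown fox and the lazy dog")
    # -> {'quick': 1, 'brown': 1, 'fox': 1, 'lazy': 1, 'dog': 1}

Note that create_word_count is defined without a self parameter, so it only works when called through the class as above; called on an instance, the instance itself would be passed as text.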