Spaces:
Runtime error
Runtime error
KarthickAdopleAI
committed on
Commit
•
28482bd
1
Parent(s):
4bfb1ad
Update app.py
Browse files
app.py
CHANGED
@@ -14,6 +14,8 @@ from pytube import YouTube
|
|
14 |
import requests
|
15 |
import logging
|
16 |
import os
|
|
|
|
|
17 |
nltk.download('punkt')
|
18 |
nltk.download('stopwords')
|
19 |
|
@@ -27,7 +29,6 @@ class VideoAnalytics:
|
|
27 |
def __init__(self):
|
28 |
"""
|
29 |
Initialize the VideoAnalytics object.
|
30 |
-
|
31 |
Args:
|
32 |
hf_token (str): Hugging Face API token.
|
33 |
"""
|
@@ -39,16 +40,7 @@ class VideoAnalytics:
|
|
39 |
# Initialize transcribed text variable
|
40 |
self.transcribed_text = ""
|
41 |
|
42 |
-
|
43 |
-
self.API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
|
44 |
-
|
45 |
-
|
46 |
-
hf_token = os.getenv('HF_TOKEN')
|
47 |
-
# Placeholder for Hugging Face API token
|
48 |
-
self.hf_token = hf_token # Replace this with the actual Hugging Face API token
|
49 |
-
|
50 |
-
# Set headers for API requests with Hugging Face token
|
51 |
-
self.headers = {"Authorization": f"Bearer {self.hf_token}"}
|
52 |
|
53 |
# Initialize english text variable
|
54 |
self.english_text = ""
|
@@ -61,13 +53,86 @@ class VideoAnalytics:
|
|
61 |
# Configure logging settings
|
62 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def transcribe_video(self, vid: str) -> str:
|
65 |
"""
|
66 |
Transcribe the audio of the video.
|
67 |
-
|
68 |
Args:
|
69 |
vid (str): Path to the video file.
|
70 |
-
|
71 |
Returns:
|
72 |
str: Transcribed text.
|
73 |
"""
|
@@ -78,28 +143,23 @@ class VideoAnalytics:
|
|
78 |
|
79 |
# Write audio to a temporary file
|
80 |
audio.write_audiofile("output_audio.mp3")
|
81 |
-
audio_file = open("output_audio.mp3", "rb")
|
82 |
|
83 |
-
# Define a helper function to query the Hugging Face model
|
84 |
-
def query(data):
|
85 |
-
response = requests.post(self.API_URL, headers=self.headers, data=data)
|
86 |
-
return response.json()
|
87 |
|
88 |
-
#
|
89 |
-
|
90 |
|
91 |
-
|
92 |
# Update the transcribed_text attribute with the transcription result
|
93 |
-
self.transcribed_text =
|
94 |
# Update the translation text into english_text
|
95 |
self.english_text = self.translation()
|
96 |
# Return the transcribed text
|
97 |
-
return
|
98 |
|
99 |
except Exception as e:
|
100 |
logging.error(f"Error transcribing video: {e}")
|
101 |
return ""
|
102 |
-
|
103 |
def generate_video_summary(self) -> str:
|
104 |
"""
|
105 |
Generate a summary of the transcribed video.
|
@@ -365,10 +425,30 @@ class VideoAnalytics:
|
|
365 |
# Log any errors that occur during initialization of YouTube object
|
366 |
logging.error(f"Error downloading video: {e}")
|
367 |
return ""
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
|
373 |
def main(self, video: str = None, input_path: str = None) -> tuple:
|
374 |
"""
|
@@ -386,7 +466,7 @@ class VideoAnalytics:
|
|
386 |
video_ = VideoFileClip(input_path)
|
387 |
duration = video_.duration
|
388 |
video_.close()
|
389 |
-
if round(duration) <= 600:
|
390 |
text = self.transcribe_video(input_path)
|
391 |
else:
|
392 |
return "Video Duration Above 10 Minutes,Try Below 10 Minutes Video","","",None,None,None
|
@@ -394,7 +474,7 @@ class VideoAnalytics:
|
|
394 |
video_ = VideoFileClip(video)
|
395 |
duration = video_.duration
|
396 |
video_.close()
|
397 |
-
if round(duration) <= 600:
|
398 |
text = self.transcribe_video(video)
|
399 |
input_path = video
|
400 |
else:
|
|
|
14 |
import requests
|
15 |
import logging
|
16 |
import os
|
17 |
+
from pydub import AudioSegment
|
18 |
+
import speech_recognition as sr
|
19 |
nltk.download('punkt')
|
20 |
nltk.download('stopwords')
|
21 |
|
|
|
29 |
def __init__(self):
|
30 |
"""
|
31 |
Initialize the VideoAnalytics object.
|
|
|
32 |
Args:
|
33 |
hf_token (str): Hugging Face API token.
|
34 |
"""
|
|
|
40 |
# Initialize transcribed text variable
|
41 |
self.transcribed_text = ""
|
42 |
|
43 |
+
self.r = sr.Recognizer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
# Initialize english text variable
|
46 |
self.english_text = ""
|
|
|
53 |
# Configure logging settings
|
54 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
55 |
|
56 |
+
def mp3_to_wav(self, mp3_file: str, wav_file: str) -> str:
    """
    Re-encode an MP3 audio file as WAV.

    Args:
        mp3_file (str): Location of the MP3 file to read.
        wav_file (str): Destination path for the WAV output.

    Returns:
        str: ``wav_file``, the path the WAV data was exported to.

    Raises:
        Exception: Any error raised while loading or exporting the audio
            is logged and then re-raised to the caller.
    """
    try:
        # Decode the MP3 and write it straight back out in WAV format
        AudioSegment.from_mp3(mp3_file).export(wav_file, format="wav")

        logging.info(f"MP3 file '{mp3_file}' converted to WAV successfully: {wav_file}")
        return wav_file
    except Exception as err:
        # Record the failure, then hand the same exception to the caller
        logging.error(f"Error occurred while converting MP3 to WAV: {err}")
        raise err
|
84 |
+
|
85 |
+
# Function to recognize speech in the audio file
|
86 |
+
def transcribe_audio(self, path: str) -> str:
    """
    Transcribe speech from a single audio file.

    The file is read in full with the recognizer stored on the instance
    (``self.r``) and submitted to Google speech recognition.

    Args:
        path (str): Path to the audio file to open with ``sr.AudioFile``.

    Returns:
        str: The recognized text, or an empty string when the audio could
        not be understood or the recognition request failed.
    """
    try:
        with sr.AudioFile(path) as source:
            # Use the recognizer created in __init__. The previous bare
            # ``r`` was not defined in this scope, so every call raised
            # NameError and no chunk was ever transcribed.
            audio_listened = self.r.record(source)
            text = self.r.recognize_google(audio_listened)
        return text
    except sr.UnknownValueError as e:
        # The recognizer could not make out any speech in this audio
        logging.error(f"Speech recognition could not understand audio: {e}")
        return ""
    except sr.RequestError as e:
        # The recognition service could not be reached or rejected the request
        logging.error(f"Could not request results from Google Speech Recognition service: {e}")
        return ""
|
99 |
+
|
100 |
+
# Function to split the audio file into chunks on silence and apply speech recognition
|
101 |
+
def get_large_audio_transcription_on_silence(self, path: str) -> str:
    """
    Split an audio file into chunks on silence and transcribe each chunk.

    Each chunk is exported as ``chunk<i>.wav`` inside the ``audio-chunks``
    folder (created if missing) and passed to ``self.transcribe_audio``.
    Every non-empty result is capitalized, terminated with ". " and
    appended to the overall transcript.

    Args:
        path (str): Path to the audio file to split and transcribe.

    Returns:
        str: The concatenated transcript of all chunks, or an empty string
        if any step raised an exception (the error is logged).
    """
    try:
        # Imported here because only ``AudioSegment`` is among the pydub
        # imports visible at the top of the file; without this the call
        # below fails with NameError and the method always returns "".
        from pydub.silence import split_on_silence

        sound = AudioSegment.from_file(path)
        # Silence threshold is set relative to the clip's own loudness
        chunks = split_on_silence(
            sound,
            min_silence_len=500,
            silence_thresh=sound.dBFS - 14,
            keep_silence=500,
        )

        folder_name = "audio-chunks"
        # exist_ok avoids the separate isdir() check and its race window
        os.makedirs(folder_name, exist_ok=True)

        transcript_parts = []
        for i, audio_chunk in enumerate(chunks, start=1):
            chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
            audio_chunk.export(chunk_filename, format="wav")

            text = self.transcribe_audio(chunk_filename)
            if not text:
                logging.warning(f"No speech recognized in {chunk_filename}")
                continue

            text = f"{text.capitalize()}. "
            logging.info(f"Transcribed {chunk_filename}: {text}")
            transcript_parts.append(text)

        return "".join(transcript_parts)
    except Exception as e:
        # Best-effort: report the failure and return an empty transcript
        logging.error(f"Error processing audio: {e}")
        return ""
|
130 |
+
|
131 |
def transcribe_video(self, vid: str) -> str:
|
132 |
"""
|
133 |
Transcribe the audio of the video.
|
|
|
134 |
Args:
|
135 |
vid (str): Path to the video file.
|
|
|
136 |
Returns:
|
137 |
str: Transcribed text.
|
138 |
"""
|
|
|
143 |
|
144 |
# Write audio to a temporary file
|
145 |
audio.write_audiofile("output_audio.mp3")
|
|
|
146 |
|
|
|
|
|
|
|
|
|
147 |
|
148 |
+
# Replace 'input.mp3' and 'output.wav' with your file paths
|
149 |
+
audio_filename = self.mp3_to_wav("output_audio.mp3", 'output.wav')
|
150 |
|
151 |
+
text = self.get_large_audio_transcription_on_silence(audio_filename)
|
152 |
# Update the transcribed_text attribute with the transcription result
|
153 |
+
self.transcribed_text = text
|
154 |
# Update the translation text into english_text
|
155 |
self.english_text = self.translation()
|
156 |
# Return the transcribed text
|
157 |
+
return text
|
158 |
|
159 |
except Exception as e:
|
160 |
logging.error(f"Error transcribing video: {e}")
|
161 |
return ""
|
162 |
+
|
163 |
def generate_video_summary(self) -> str:
|
164 |
"""
|
165 |
Generate a summary of the transcribed video.
|
|
|
425 |
# Log any errors that occur during initialization of YouTube object
|
426 |
logging.error(f"Error downloading video: {e}")
|
427 |
return ""
|
428 |
+
|
429 |
+
def save_audio_with_gtts(self, text: str, filename: str) -> str:
    """
    Synthesize English speech for ``text`` with gTTS and write it to disk.

    Args:
        text (str): The text to turn into speech.
        filename (str): Path (including file name) for the saved audio.

    Returns:
        str: ``filename``, the path the audio was saved to.

    Raises:
        Exception: Any error raised while building or saving the audio is
            logged and then re-raised to the caller.
    """
    try:
        # Build the speech object and save it in one step
        gTTS(text=text, lang='en').save(filename)
        logging.info(f"Audio file saved successfully: {filename}")
        return filename
    except Exception as err:
        # Record the failure, then hand the same exception to the caller
        logging.error(f"Error occurred while saving audio: {err}")
        raise err
|
452 |
|
453 |
def main(self, video: str = None, input_path: str = None) -> tuple:
|
454 |
"""
|
|
|
466 |
video_ = VideoFileClip(input_path)
|
467 |
duration = video_.duration
|
468 |
video_.close()
|
469 |
+
if round(duration) <= 6*600:
|
470 |
text = self.transcribe_video(input_path)
|
471 |
else:
|
472 |
return "Video Duration Above 10 Minutes,Try Below 10 Minutes Video","","",None,None,None
|
|
|
474 |
video_ = VideoFileClip(video)
|
475 |
duration = video_.duration
|
476 |
video_.close()
|
477 |
+
if round(duration) <= 6*600:
|
478 |
text = self.transcribe_video(video)
|
479 |
input_path = video
|
480 |
else:
|