Keane Moraes committed
Commit aec1dec (1 parent: e9d1d9f)
multithreading primary implementation works

Files changed:
- app.py +21 -8
- transcription.py +57 -32
app.py CHANGED

@@ -42,7 +42,7 @@ data_transcription = {"title":"", "text":""}
 embeddings = []
 text_chunks_lib = dict()
 user_input = None
-title_entry =
+title_entry = ""

 tldr = ""
 summary = ""
@@ -65,6 +65,8 @@ st.write('It provides a summary, transcription, key insights, a mind map and a Q
 bar = st.progress(0)

 def generate_word_embeddings():
+    global data
+
     if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
         for i, segment in enumerate(segments):
             bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
@@ -85,12 +87,20 @@ def generate_word_embeddings():


 def generate_text_chunks_lib():
-
-
+
+    global title_entry, text_chunks_lib
+    global keywords
+    global tldr
+    global summary
+    global takeaways
+    global input_accepted
+    global data_transcription

     # For each body of text, create text chunks of a certain token size required for the transformer
+    text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
+    input_accepted = True
     title_entry = text_df['title'][0]
-    print(title_entry)
+    print("\n\nFIRST TITLE_ENTRY", title_entry)
     for i in range(0, len(text_df)):
         nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
         # For each chunk of sentences (within the token max)
@@ -106,6 +116,7 @@ def generate_text_chunks_lib():
     keywords = key_engine.get_keywords(text_chunks_lib)


+
 # =========== SIDEBAR FOR GENERATION ===========
 with st.sidebar:
     youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
@@ -171,6 +182,7 @@ with st.sidebar:
         # Generate embeddings
         thread1 = Thread(target=generate_word_embeddings)
         thread1.start()
+
         # Generate text chunks
         thread2 = Thread(target=generate_text_chunks_lib)
         thread2.start()
@@ -181,20 +193,21 @@ with st.sidebar:

         # Generate the summary
         if gen_summary == 'Yes':
+            print("\n\nTITLE ENTRY: ", title_entry)
             se = TextSummarizer(title_entry)
             text_transcription = data_transcription['text']
             with st.spinner("Generating summary and TLDR..."):
+                print("\n\nTEXT_CHNK_SUMMARY\n\n", text_chunks_lib)
                 summary = se.generate_full_summary(text_chunks_lib)
                 summary_list = summary.split("\n\n")
                 tldr = se.generate_short_summary(summary_list)

         # Generate key takeaways
         kt = KeyTakeaways()
         with st.spinner("Generating key takeaways ... "):
             takeaways = kt.generate_key_takeaways(text_chunks_lib)
-
-
-        bar.progress(100)
+        is_completed_analysis = True
+        bar.progress(100)

 if is_completed_analysis:
     st.header("Key Takeaways")
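Note on the app.py changes: the worker threads and the main script share results through module-level globals, and the summary branch reads title_entry and text_chunks_lib shortly after thread2.start(), so it only sees finished values if the threads are joined (or otherwise known to be done) before those reads. Below is a minimal standalone sketch of the same pattern with an explicit join(); the assigned values are illustrative stand-ins, not code from the app:

from threading import Thread

# Module-level state shared with the worker, as in app.py
title_entry = ""
text_chunks_lib = dict()

def generate_text_chunks_lib():
    global title_entry, text_chunks_lib
    title_entry = "example title"  # stands in for text_df['title'][0]
    text_chunks_lib = {title_entry: ["chunk one", "chunk two"]}

thread2 = Thread(target=generate_text_chunks_lib)
thread2.start()
thread2.join()  # without this, the main script can read title_entry before it is set
print(title_entry, text_chunks_lib)

Separately, calling Streamlit APIs from a worker thread (as generate_word_embeddings does via bar.progress) usually triggers missing-ScriptRunContext warnings unless the context is attached to the thread, e.g. with streamlit.runtime.scriptrunner.add_script_run_ctx in recent Streamlit versions; the diff does not show that wiring, so treat this as a caveat rather than a description of the app's behavior.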
transcription.py CHANGED

@@ -25,6 +25,7 @@ from nltk import tokenize
 # For other stuff
 import os, re
 import time, math
+from threading import Thread

 # USEFUL CONSTANTS

@@ -53,7 +54,7 @@ class DownloadAudio:
         """Returns the title of the youtube video"""
         return self.yt["title"]

-    def download(self, pathname:str) ->
+    def download(self, pathname:str) -> list:
         """
         Download the audio from the youtube video and saves it to multiple .wav files
         in the specified folder. Returns a list of the paths to the .wav files.
@@ -93,30 +94,31 @@ class DownloadAudio:
         # If the total duration is less than the duration of each segment,
         # then just return the original file
         if total_byte_size < MAX_FILE_SIZE_BYTES:
-            return FINAL_WAV_PATH
+            return [FINAL_WAV_PATH]
+
+        # Get the size of the wav file
+        channels = audio.channels
+        sample_width = audio.sample_width
+        duration_in_sec = math.ceil(len(audio) / 1000)
+        sample_rate = audio.frame_rate
+        bit_rate = sample_width * 8
+        wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8
+
+        # Get the length of each chunk in milliseconds and make the chunks
+        chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES) / wav_file_size) # in sec
+        chunk_length_ms = chunk_length_in_sec * 1000
+        chunks = make_chunks(audio, chunk_length_ms)
+
+        # Export all of the individual chunks as wav files
+        chunk_names = []
+        for i, chunk in enumerate(chunks):
+            print(f"exporting chunk {i}")
+            chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
+            output_chunk_path = f"{pathname}/{chunk_name}"
+            chunk_names.append(output_chunk_path)
+            chunk.export(f"{output_chunk_path}", format="wav")

-        return
+        return chunk_names


 class VideoTranscription:
@@ -150,18 +152,40 @@ class VideoTranscription:
         audio_file = DownloadAudio(self.datalink)

         # Get the names of the stored wav files
-
-        print(
+        file_names = audio_file.download(FOLDER_NAME)
+        print("FILE NAMES", file_names)
+        text_transcriptions = [""] * len(file_names)
+
+        def perform_transcription(file_name, i):
+            print("transcribing", file_name, " for ", i)
+            chunk_segments, _ = self.model.transcribe(file_name, beam_size=5)
+            for chunk_segment in chunk_segments:
+                text_transcriptions[i] += chunk_segment.text.replace("$", "\$")
+
+        # Initialize the threads
+        threads = []
+        for i, file_name in enumerate(file_names):
+            threads.append(Thread(target=perform_transcription, args=(file_name, i)))
+
+        # Start the threads
+        for thread in threads:
+            thread.start()
+
+        # Wait for the threads to finish
+        for thread in threads:
+            thread.join()
+
         # Get the transcription of each audio chunk
-        text_transcriptions = ""
         # for file_name in file_names:
             # Get the transcription
-        chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
-        for chunk_segment in chunk_segments:
-
+        # chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
+        # for chunk_segment in chunk_segments:
+            # text_transcriptions += chunk_segment.text.replace("$", "\$")
+
+        final_text_transcription = " ".join(text_transcriptions)

         # Tokenize each sentence of the transcription.
-        sentences = tokenize.sent_tokenize(
+        sentences = tokenize.sent_tokenize(final_text_transcription)
         segments = []
         for i, sentence in enumerate(sentences):
             segment = {
@@ -171,9 +195,10 @@ class VideoTranscription:
             }
             segments.append(segment)

+
         final_transcription = {
             "title": audio_file.get_yt_title(),
-            "text":
+            "text": final_text_transcription,
             "segments": segments
         }

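Two notes on the transcription.py changes.

The chunk-length computation in download() leans on WAV size scaling linearly with duration: wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8 bytes, so the longest chunk that still fits under MAX_FILE_SIZE_BYTES is duration_in_sec * MAX_FILE_SIZE_BYTES / wav_file_size seconds. A worked example with illustrative numbers (not taken from the repo):

import math

MAX_FILE_SIZE_BYTES = 25_000_000  # illustrative cap, not the repo's constant
sample_rate = 44_100              # Hz
sample_width = 2                  # bytes per sample (16-bit)
channels = 2
duration_in_sec = 600             # a 10-minute recording

bit_rate = sample_width * 8                                                # 16 bits per sample
wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8  # ~105.8 MB

chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES) / wav_file_size)
print(chunk_length_in_sec)  # 142 s per chunk -> make_chunks yields 5 chunks
# Caveat: math.ceil rounds up, so a chunk can land marginally over the cap
# (142 s * 176,400 B/s ~= 25.05 MB); flooring would keep every chunk under it.

The threading in get_transcription gives each thread its own slot in text_transcriptions, so no two threads write the same element, and the " ".join(...) after the join() loop reassembles the text in chunk order no matter which thread finishes first. A stripped-down sketch of that fan-out/fan-in shape, where transcribe_stub is a hypothetical stand-in for the real self.model.transcribe call:

from threading import Thread

file_names = ["chunk_0.wav", "chunk_1.wav", "chunk_2.wav"]
text_transcriptions = [""] * len(file_names)  # one slot per audio chunk

def transcribe_stub(file_name):
    # Hypothetical stand-in for the model call; returns fake text
    return f"text of {file_name}"

def perform_transcription(file_name, i):
    # Each thread writes only to its own index, so the list needs no lock
    text_transcriptions[i] += transcribe_stub(file_name)

threads = [Thread(target=perform_transcription, args=(f, i))
           for i, f in enumerate(file_names)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # fan-in: wait for every worker before reading the shared list

print(" ".join(text_transcriptions))  # index order preserves chunk order

Whether the threads actually overlap depends on the inference library releasing the GIL during transcription; if self.model is a faster-whisper model (whose transcribe(path, beam_size=5) signature this matches), most of the work happens in native CTranslate2 code, so some overlap is plausible, but that is a property of the library rather than something this diff guarantees.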