Update functions.py

functions.py  +18 -17
@@ -97,7 +97,7 @@ prompt = ChatPromptTemplate.from_messages(messages)
 
 ###################### Functions #######################################################################################
 
-@st.
+@st.cache_resource
 def load_models():
 
     '''Load and cache all the models to be used'''
@@ -116,20 +116,20 @@ def load_models():
 
     return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert
 
-@st.
+@st.cache_resource
 def load_asr_model(asr_model_name):
     asr_model = whisper.load_model(asr_model_name)
 
     return asr_model
 
-@st.
+@st.cache_data
 def load_whisper_api(audio):
     file = open(audio, "rb")
     transcript = openai.Audio.translate("whisper-1", file)
 
     return transcript
 
-@st.
+@st.cache_data
 def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
 
     '''Process text for Semantic Search'''
@@ -144,7 +144,7 @@ def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
 
     return docsearch
 
-@st.
+@st.cache_data
 def chunk_and_preprocess_text(text,thresh=500):
 
     """Chunk text longer than n tokens for summarization"""
@@ -170,7 +170,7 @@ def chunk_and_preprocess_text(text,thresh=500):
 
     return chunks
 
-@st.
+@st.cache_resource
 def gen_embeddings(embedding_model):
 
     '''Generate embeddings for given model'''
@@ -187,7 +187,7 @@ def gen_embeddings(embedding_model):
 
     return embeddings
 
-@st.
+@st.cache_data
 def embed_text(query,title,embedding_model,_docsearch):
 
     '''Embed text and generate semantic search scores'''
@@ -212,12 +212,12 @@ def embed_text(query,title,embedding_model,_docsearch):
 
     return answer
 
-@st.
+@st.cache_resource
 def get_spacy():
     nlp = en_core_web_lg.load()
     return nlp
 
-@st.
+@st.cache_data
 def inference(link, upload, _asr_model):
     '''Convert Youtube video or Audio upload to text'''
 
@@ -307,7 +307,7 @@ def inference(link, upload, _asr_model):
         return results['text'], yt.title
 
 
-@st.
+@st.cache_data
 def sentiment_pipe(earnings_text):
     '''Determine the sentiment of the text'''
 
@@ -316,7 +316,7 @@ def sentiment_pipe(earnings_text):
 
     return earnings_sentiment, earnings_sentences
 
-@st.
+@st.cache_data
 def summarize_text(text_to_summarize,max_len,min_len):
     '''Summarize text with HF model'''
 
@@ -329,7 +329,7 @@ def summarize_text(text_to_summarize,max_len,min_len):
 
     return summarized_text
 
-@st.
+@st.cache_data
 def clean_text(text):
     '''Clean all text'''
 
@@ -341,7 +341,7 @@ def clean_text(text):
 
     return text
 
-@st.
+@st.cache_data
 def chunk_long_text(text,threshold,window_size=3,stride=2):
     '''Preprocess text and chunk for sentiment analysis'''
 
@@ -378,7 +378,7 @@ def summary_downloader(raw_text):
     href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
     st.markdown(href,unsafe_allow_html=True)
 
-@st.
+@st.cache_data
 def get_all_entities_per_sentence(text):
     doc = nlp(''.join(text))
 
@@ -401,12 +401,12 @@ def get_all_entities_per_sentence(text):
 
     return entities_all_sentences
 
-@st.
+@st.cache_data
 def get_all_entities(text):
     all_entities_per_sentence = get_all_entities_per_sentence(text)
     return list(itertools.chain.from_iterable(all_entities_per_sentence))
 
-@st.
+@st.cache_data
 def get_and_compare_entities(article_content,summary_output):
 
     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
@@ -454,7 +454,7 @@ def get_and_compare_entities(article_content,summary_output):
 
     return matched_entities, unmatched_entities
 
-@st.
+@st.cache_data
 def highlight_entities(article_content,summary_output):
 
     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
@@ -506,6 +506,7 @@ def fin_ext(text):
 
 ## Knowledge Graphs code
 
+@st.cache_data
 def extract_relations_from_model_output(text):
     relations = []
     relation, subject, relation, object_ = '', '', '', ''
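The split the diff applies follows the line Streamlit's caching API draws: `st.cache_resource` goes on the functions that construct heavyweight, unserializable objects (`load_models`, `load_asr_model`, `gen_embeddings`, `get_spacy`), so all reruns and sessions share one live instance, while `st.cache_data` goes on the functions that return serializable values such as transcripts, chunks, scores, and entity lists, which Streamlit pickles and hands back as a fresh copy on each call. A minimal sketch of that behavioral difference, with illustrative names (`load_pipeline`, `score_texts`) that are not from functions.py:

    import streamlit as st
    from transformers import pipeline

    @st.cache_resource            # one shared object, never pickled
    def load_pipeline(model_name):
        # Heavy, unserializable resource: every rerun and every session
        # gets back the exact same pipeline instance.
        return pipeline('sentiment-analysis', model=model_name)

    @st.cache_data                # cached by value, copied per caller
    def score_texts(texts, model_name):
        # Serializable output: Streamlit pickles the result, so a caller
        # mutating the returned list cannot corrupt the cached copy.
        clf = load_pipeline(model_name)
        return clf(list(texts))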
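One signature detail the diff leaves intact: `embed_text(query,title,embedding_model,_docsearch)` and `inference(link, upload, _asr_model)` each prefix a parameter with an underscore. For both caching decorators that prefix tells Streamlit to leave the argument out of the cache key, which is what lets unhashable objects such as a vector store or a loaded whisper model be passed into a cached function at all. A short sketch of the convention, with a hypothetical `transcribe_cached` wrapper and an illustrative file path:

    import streamlit as st
    import whisper

    @st.cache_data
    def transcribe_cached(audio_path, _asr_model):
        # audio_path is hashed into the cache key; _asr_model is skipped
        # because of the leading underscore. Two different models called
        # with the same path would share one cache entry, so the unhashed
        # argument must stay stable across calls.
        return _asr_model.transcribe(audio_path)['text']

    model = whisper.load_model('base')                    # openai-whisper checkpoint
    text = transcribe_cached('earnings_call.mp3', model)  # path is illustrative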