amirhoseinsedaghati committed
Commit: 2a97daa
Parent(s): 627c527
Upload pages files
pages/Analyze_Text.py
ADDED
@@ -0,0 +1,174 @@
import streamlit as st
from streamlit.components.v1 import html
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud.wordcloud import WordCloud
from configs.db_configs import add_one_item
from configs.html_features import set_image, HTML_WRAPPER

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
from torch.nn.functional import softmax

from spacy import displacy
import spacy
nlp = spacy.load('en_core_web_sm')

from collections import Counter
import neattext as nt
import neattext.functions as nfx
from textblob import TextBlob


def get_tokens_analysis(text):
    doc_obj = nlp(text)
    tokens_stats = [(token.text, token.shape_, token.pos_, token.tag_, token.lemma_, token.is_alpha, token.is_stop) for token in doc_obj]
    tokens_stats_df = pd.DataFrame(tokens_stats, columns=['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop'])
    return tokens_stats_df


def get_entities_tokens(text):
    doc_obj = nlp(text)

    html = displacy.render(doc_obj, style='ent')
    html = html.replace('\n\n', '\n')
    entities_tokens_html = HTML_WRAPPER.format(html)
    return entities_tokens_html


def get_word_stats(text):
    text_frame_obj = nt.TextFrame(text)
    word_stats = text_frame_obj.word_stats()
    word_length_freq = text_frame_obj.word_length_freq()
    word_length_df = pd.DataFrame(word_length_freq.items(), columns=['word length', 'frequency'])
    word_length_df['word length'] = word_length_df['word length'].astype(str)
    word_length_df['word length'] = 'length ' + word_length_df['word length']
    custom_color = px.colors.sequential.Blues_r
    figure = px.pie(word_length_df, names='word length', values='frequency', title='Word Percentage Frequency by length', width=400, height=400, color_discrete_sequence=custom_color)
    return word_stats, figure


def plot_top_keywords_frequencies(text, n_top_keywords):
    preprocessed_text = nfx.remove_stopwords(text)
    blob = TextBlob(preprocessed_text)
    words = blob.words
    top_keywords = Counter(words).most_common(n_top_keywords)
    top_keywords_df = pd.DataFrame(top_keywords, columns=['words', 'frequency'])
    figure = px.bar(top_keywords_df, x='words', y='frequency', color='frequency', title=f'the frequency of {n_top_keywords} top keywords', width=400, height=400, color_continuous_scale='Blues')
    return figure


def get_sentence_stats(text):
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    noun_phrases = list(blob.noun_phrases)
    sentence_stats = {
        'Number of Sentences': len(sentences),
        'Number of Noun Phrases': len(noun_phrases)
    }
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df


def plot_tokens_pos(tokens_stats_df):
    pos_df = tokens_stats_df['Part-of-Speech'].value_counts().to_frame().reset_index()
    pos_df.columns = ['Part-of-Speech', 'Frequency']
    figure = px.bar(pos_df, x='Part-of-Speech', y='Frequency', color='Frequency', title='The Frequency of Tokens Part of speech', width=400, height=400, color_continuous_scale='Blues')
    return figure


def get_sentiment_analysis_res(text):
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_model')
    inputs = tokenizer(text, return_tensors='pt')
    model = AutoModelForSequenceClassification.from_pretrained('stevhliu/my_awesome_model')
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    model.config.id2label = {0: 'Negative', 1: 'Positive'}
    label = model.config.id2label[predicted_class_id]
    score = float(softmax(logits, dim=1)[0][predicted_class_id])
    sentiment_df = pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
    return sentiment_df


def plot_word_frequency(text):
    wc = WordCloud(width=600, height=500).generate(text)
    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    return fig

def main():
    st.title('Text Analyzer')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass

    text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
    if st.button('Analyze it'):
        if text != '':
            with st.expander('Original Text'):
                st.write(text)
            add_one_item(text, 'Text Analyzer')

            with st.expander('Text Analysis'):
                tokens_stats_df = get_tokens_analysis(text)
                st.dataframe(tokens_stats_df)

            with st.expander('Text Entities'):
                entities_tokens_html = get_entities_tokens(text)
                html(entities_tokens_html, height=300, scrolling=True)

            col11, col12 = st.columns(2)
            with col11:
                with st.expander('Word Statistics'):
                    word_stats_json, figure = get_word_stats(text)
                    st.json(word_stats_json)
                    st.plotly_chart(figure)

            with col12:
                with st.expander(f'The Frequency of {n_top_keywords} Top Keywords'):
                    figure = plot_top_keywords_frequencies(text, n_top_keywords)
                    st.plotly_chart(figure)

            col21, col22 = st.columns(2)
            with col21:
                with st.expander('Sentence Statistics'):
                    sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
                    st.dataframe(sentence_stats_df)
                    st.write('Sentences:\n', sentences)
                    st.write('Noun Phrases:\n', noun_phrases)

            with col22:
                with st.expander('The Frequency of Tokens Part of speech'):
                    figure = plot_tokens_pos(tokens_stats_df)
                    st.plotly_chart(figure)

            col31, col32 = st.columns(2)
            with col31:
                with st.expander('Sentiment Analysis'):
                    sentiment_df = get_sentiment_analysis_res(text)
                    st.dataframe(sentiment_df)

            with col32:
                with st.expander('Word Frequency'):
                    fig = plot_word_frequency(text)
                    st.pyplot(fig)

        else:
            st.error('Please enter a non-empty text.')


if __name__ == '__main__':
    main()
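Analyze_Text.py (like the other pages below) imports set_image and HTML_WRAPPER from configs.html_features, a module that is not part of this commit. The following is a minimal sketch of what it might contain, inferred only from the call sites html(set_image(url), ...) and HTML_WRAPPER.format(html); the markup and styling are assumptions, not the actual implementation.

# configs/html_features.py -- hypothetical sketch, not the module shipped with this commit.
# set_image(url) must return an HTML snippet for streamlit.components.v1.html, and
# HTML_WRAPPER must contain a single '{}' placeholder for the rendered displaCy markup.

# A scrollable, bordered container for the entity visualization.
HTML_WRAPPER = """
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem;
            padding: 1rem; background-color: white;">
    {}
</div>
"""


def set_image(url):
    # Center the banner image inside the 500x500 component frame used by the pages.
    return f"""
    <div style="display: flex; justify-content: center;">
        <img src="{url}" alt="banner" style="max-width: 100%; max-height: 480px;">
    </div>
    """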
pages/Find_Topic.py
ADDED
@@ -0,0 +1,51 @@
import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import linalg
import regex as re
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image


def preprocess_text(text):
    # Build a term-count vector for the input text and factorize it with SVD.
    vectorizer = CountVectorizer(stop_words='english')
    vector = vectorizer.fit_transform([text]).todense()
    vocab = np.array(vectorizer.get_feature_names_out())
    U, s, Vh = linalg.svd(vector, full_matrices=False)
    return vocab, U, s, Vh


def show_topics(text, num_top_words):
    vocab, U, s, Vh = preprocess_text(text)
    pattern = r'\d+'  # digits are stripped from the returned keywords
    # indices of the num_top_words largest entries of the given singular vector
    top_words = lambda Vh: [vocab[i] for i in np.argsort(Vh)[:-num_top_words-1:-1]]
    topic_words = top_words(Vh[0])
    topic_words = ' '.join(topic_words)
    return ' '.join([re.sub(pattern, '', word) for word in topic_words.split()])


def main():
    st.title('Topic Modeling by Top Keywords')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass
    text = st.text_area('Find Topic', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    num_top_words = st.sidebar.slider('Number of Top Keywords', min_value=5, max_value=20, step=1, value=10)
    if st.button('Find Topic'):
        if text != '':
            with st.expander('Original Text'):
                st.write(text)
            add_one_item(text, 'Topic Modeling')

            with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
                topic_words = show_topics(text, num_top_words)
                st.write(topic_words)

if __name__ == '__main__':
    main()
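For reference, the decomposition used by preprocess_text and show_topics can be exercised outside Streamlit. The sketch below repeats the same steps on an illustrative sample text; the sample string, variable names, and printed output are assumptions for demonstration only, not part of this commit.

# Standalone sketch of the pipeline used in show_topics above.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import linalg

sample = (
    "Machine learning models learn patterns from data. "
    "Topic modeling finds the words that best describe a text. "
    "Singular value decomposition factorizes the term-count matrix."
)

vectorizer = CountVectorizer(stop_words='english')
counts = vectorizer.fit_transform([sample]).todense()   # 1 x V term-count matrix
vocab = np.array(vectorizer.get_feature_names_out())

U, s, Vh = linalg.svd(counts, full_matrices=False)       # Vh has shape 1 x V

num_top_words = 5
# indices of the largest entries of the first right singular vector
top_idx = np.argsort(Vh[0])[:-num_top_words - 1:-1]
print(' '.join(vocab[top_idx]))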
pages/Summarize_Text.py
ADDED
@@ -0,0 +1,57 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from configs.download_files import FileDownloader
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image

def summarize_text(text):
    prefix = 'summarize: '
    text = prefix + text
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_billsum_model')
    input_ids = tokenizer(text=text, return_tensors='pt')['input_ids']
    model = AutoModelForSeq2SeqLM.from_pretrained('stevhliu/my_awesome_billsum_model')

    if len(input_ids[0]) < 200:
        # short inputs: allow up to 100 new tokens for the summary
        output_ids = model.generate(input_ids, max_new_tokens=100, do_sample=False)
        summarized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return summarized_text

    else:
        # longer inputs: cap the summary at roughly half the input length
        output_ids = model.generate(input_ids, max_new_tokens=round(len(input_ids[0]) * 1/2), do_sample=False)
        summarized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return summarized_text


def main():
    st.title('Text Summarizer')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass
    text = st.text_area('Text Summarizer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')

    if st.button('Summarize it'):
        if text != "":
            with st.expander('Original Text'):
                st.write(text)
            add_one_item(text, "Text Summarizer")

            with st.expander('Summarized Text'):
                summarized_text = summarize_text(text)
                st.write(summarized_text)

            with st.expander('Download Summarized Text'):
                FileDownloader(summarized_text, 'txt').download()

        else:
            st.error('Please enter a non-empty text.')


if __name__ == '__main__':
    main()
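The download expander relies on configs.download_files.FileDownloader, which is also outside this commit. Below is a plausible sketch consistent with the call FileDownloader(summarized_text, 'txt').download(); the base64 link approach, class internals, and file naming are assumptions rather than the real module.

# configs/download_files.py -- hypothetical sketch, not the module shipped with this commit.
import base64
import time
import streamlit as st


class FileDownloader:
    def __init__(self, data, file_ext='txt'):
        self.data = data
        self.file_ext = file_ext

    def download(self):
        # Encode the text and expose it as an HTML download link in the current expander.
        b64 = base64.b64encode(self.data.encode()).decode()
        filename = f'result_{int(time.time())}.{self.file_ext}'
        href = (
            f'<a href="data:text/plain;base64,{b64}" '
            f'download="{filename}">Download the file</a>'
        )
        st.markdown(href, unsafe_allow_html=True)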
pages/Translate_Text.py
ADDED
@@ -0,0 +1,54 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from configs.download_files import FileDownloader
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image

def translate_text_to_text(text, source_lang, target_lang):
    prefix = f'translate {source_lang} to {target_lang}: '
    text = prefix + text
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_opus_books_model')
    input_ids = tokenizer(text, return_tensors='pt').input_ids
    model = AutoModelForSeq2SeqLM.from_pretrained('stevhliu/my_awesome_opus_books_model')
    output_ids = model.generate(input_ids, max_new_tokens=len(input_ids[0]) * 3, do_sample=False)
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return translated_text


def main():
    st.title('Text Translator')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass
    languages = ['English', 'French']
    source_lang = st.sidebar.selectbox('Source Language', languages)
    target_lang = st.sidebar.selectbox('Target Language', languages, index=1)
    text = st.text_area('Text Translator', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    if st.button('translate it'):
        if text != '':
            if (source_lang == 'English' and target_lang == 'English') or (source_lang == 'French' and target_lang == 'French'):
                st.error('Expected different values for source and target languages, but got the same values!')

            else:
                with st.expander('Original Text'):
                    st.write(text)
                add_one_item(text, 'Text Translator')

                with st.expander('Translated Text'):
                    translated_text = translate_text_to_text(text, source_lang, target_lang)
                    st.write(translated_text)

                with st.expander('Download Translated Text'):
                    FileDownloader(translated_text, 'txt').download()
        else:
            st.error('Please enter a non-empty text.')


if __name__ == '__main__':
    main()
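Every page calls configs.db_configs.add_one_item(text, '<page name>') before rendering its results, so each submitted text is logged together with the page that processed it. That module is not part of this commit either; the sketch below shows one way it could be implemented, assuming a local SQLite store (the database file, table, and column names are illustrative assumptions).

# configs/db_configs.py -- hypothetical sketch, not the module shipped with this commit.
import sqlite3
from datetime import datetime

conn = sqlite3.connect('data.db', check_same_thread=False)
cursor = conn.cursor()


def create_table():
    # Create the history table on first use.
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS history (text TEXT, app_name TEXT, created_at TEXT)'
    )
    conn.commit()


def add_one_item(text, app_name):
    # Store the submitted text, the page that handled it, and a timestamp.
    create_table()
    cursor.execute(
        'INSERT INTO history (text, app_name, created_at) VALUES (?, ?, ?)',
        (text, app_name, datetime.now().isoformat(timespec='seconds')),
    )
    conn.commit()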
pages/__pycache__/text_analysis.cpython-310.pyc
ADDED
Binary file (785 Bytes)

pages/__pycache__/text_summarization.cpython-310.pyc
ADDED
Binary file (680 Bytes)