amirhoseinsedaghati committed on
Commit
2a97daa
1 Parent(s): 627c527

Upload pages files

pages/Analyze_Text.py ADDED
@@ -0,0 +1,174 @@
+ import streamlit as st
+ from streamlit.components.v1 import html
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import plotly.express as px
+ from wordcloud import WordCloud
+ from configs.db_configs import add_one_item
+ from configs.html_features import set_image, HTML_WRAPPER
+
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ import torch
+ from torch.nn.functional import softmax
+
+ from spacy import displacy
+ import spacy
+ nlp = spacy.load('en_core_web_sm')
+
+ from collections import Counter
+ import neattext as nt
+ import neattext.functions as nfx
+ from textblob import TextBlob
+
+
+ def get_tokens_analysis(text):
+     doc_obj = nlp(text)
+     tokens_stats = [(token.text, token.shape_, token.pos_, token.tag_, token.lemma_, token.is_alpha, token.is_stop) for token in doc_obj]
+     tokens_stats_df = pd.DataFrame(tokens_stats, columns=['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop'])
+     return tokens_stats_df
+
+
+ def get_entities_tokens(text):
+     doc_obj = nlp(text)
+
+     ent_html = displacy.render(doc_obj, style='ent')  # local name, so the imported html() component is not shadowed
+     ent_html = ent_html.replace('\n\n', '\n')
+     entities_tokens_html = HTML_WRAPPER.format(ent_html)
+     return entities_tokens_html
+
+
+ def get_word_stats(text):
+     text_frame_obj = nt.TextFrame(text)
+     word_stats = text_frame_obj.word_stats()
+     word_length_freq = text_frame_obj.word_length_freq()
+     word_length_df = pd.DataFrame(word_length_freq.items(), columns=['word length', 'frequency'])
+     word_length_df['word length'] = word_length_df['word length'].astype(str)
+     word_length_df['word length'] = 'length ' + word_length_df['word length']
+     custom_color = px.colors.sequential.Blues_r
+     figure = px.pie(word_length_df, names='word length', values='frequency', title='Word Percentage Frequency by Length', width=400, height=400, color_discrete_sequence=custom_color)
+     return word_stats, figure
+
+
+ def plot_top_keywords_frequencies(text, n_top_keywords):
+     preprocessed_text = nfx.remove_stopwords(text)
+     blob = TextBlob(preprocessed_text)
+     words = blob.words
+     top_keywords = Counter(words).most_common(n_top_keywords)
+     top_keywords_df = pd.DataFrame(top_keywords, columns=['words', 'frequency'])
+     figure = px.bar(top_keywords_df, x='words', y='frequency', color='frequency', title=f'The Frequency of {n_top_keywords} Top Keywords', width=400, height=400, color_continuous_scale='Blues')
+     return figure
+
+
+ def get_sentence_stats(text):
+     blob = TextBlob(text)
+     sentences = [str(sentence) for sentence in blob.sentences]
+     noun_phrases = list(blob.noun_phrases)
+     sentence_stats = {
+         'Number of Sentences': len(sentences),
+         'Number of Noun Phrases': len(noun_phrases)
+     }
+     sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
+     return sentences, noun_phrases, sentence_stats_df
+
+
+ def plot_tokens_pos(tokens_stats_df):
+     pos_df = tokens_stats_df['Part-of-Speech'].value_counts().to_frame().reset_index()
+     pos_df.columns = ['Part-of-Speech', 'Frequency']
+     figure = px.bar(pos_df, x='Part-of-Speech', y='Frequency', color='Frequency', title='The Frequency of Token Parts of Speech', width=400, height=400, color_continuous_scale='Blues')
+     return figure
+
+
+ def get_sentiment_analysis_res(text):
+     tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_model')
+     inputs = tokenizer(text, return_tensors='pt')
+     model = AutoModelForSequenceClassification.from_pretrained('stevhliu/my_awesome_model')
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     predicted_class_id = logits.argmax().item()
+     model.config.id2label = {0: 'Negative', 1: 'Positive'}  # the checkpoint ships generic label names, so map them explicitly
+     label = model.config.id2label[predicted_class_id]
+     score = float(softmax(logits, dim=1)[0][predicted_class_id])
+     sentiment_df = pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
+     return sentiment_df
+
+
+ def plot_word_frequency(text):
+     wc = WordCloud(width=600, height=500).generate(text)
+     fig = plt.figure()
+     plt.imshow(wc, interpolation='bilinear')
+     plt.axis('off')
+     return fig
+
+ def main():
+     st.title('Text Analyzer')
+     im1, im2, im3 = st.columns([1, 5.3, 1])
+     with im1:
+         pass
+     with im2:
+         url = "https://i.postimg.cc/jdF1hPng/combined.png"
+         html(set_image(url), height=500, width=500)
+     with im3:
+         pass
+
+     text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
+     n_top_keywords = st.sidebar.slider('n Top Keywords', 5, 15, 5, 1)
+     if st.button('Analyze it'):
+         if text != '':
+             with st.expander('Original Text'):
+                 st.write(text)
+                 add_one_item(text, 'Text Analyzer')
+
+             with st.expander('Text Analysis'):
+                 tokens_stats_df = get_tokens_analysis(text)
+                 st.dataframe(tokens_stats_df)
+
+             with st.expander('Text Entities'):
+                 entities_tokens_html = get_entities_tokens(text)
+                 html(entities_tokens_html, height=300, scrolling=True)
+
+             col11, col12 = st.columns(2)
+             with col11:
+                 with st.expander('Word Statistics'):
+                     word_stats_json, figure = get_word_stats(text)
+                     st.json(word_stats_json)
+                     st.plotly_chart(figure)
+
+             with col12:
+                 with st.expander(f'The Frequency of {n_top_keywords} Top Keywords'):
+                     figure = plot_top_keywords_frequencies(text, n_top_keywords)
+                     st.plotly_chart(figure)
+
+             col21, col22 = st.columns(2)
+             with col21:
+                 with st.expander('Sentence Statistics'):
+                     sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
+                     st.dataframe(sentence_stats_df)
+                     st.write('Sentences:\n', sentences)
+                     st.write('Noun Phrases:\n', noun_phrases)
+
+             with col22:
+                 with st.expander('The Frequency of Token Parts of Speech'):
+                     figure = plot_tokens_pos(tokens_stats_df)
+                     st.plotly_chart(figure)
+
+             col31, col32 = st.columns(2)
+             with col31:
+                 with st.expander('Sentiment Analysis'):
+                     sentiment_df = get_sentiment_analysis_res(text)
+                     st.dataframe(sentiment_df)
+
+             with col32:
+                 with st.expander('Word Frequency'):
+                     fig = plot_word_frequency(text)
+                     st.pyplot(fig)
+
+         else:
+             st.error('Please enter a non-empty text.')
+
+
+ if __name__ == '__main__':
+     main()
+
+
pages/Find_Topic.py ADDED
@@ -0,0 +1,51 @@
+ import streamlit as st
+ import numpy as np
+ from sklearn.feature_extraction.text import CountVectorizer
+ from scipy import linalg
+ import regex as re
+ from configs.db_configs import add_one_item
+ from streamlit.components.v1 import html
+ from configs.html_features import set_image
+
+
+ def preprocess_text(text):
+     vectorizer = CountVectorizer(stop_words='english')
+     vector = vectorizer.fit_transform([text]).toarray()  # a plain ndarray for the SVD, rather than np.matrix from .todense()
+     vocab = np.array(vectorizer.get_feature_names_out())
+     U, s, Vh = linalg.svd(vector, full_matrices=False)
+     return vocab, U, s, Vh
+
+
+ def show_topics(text, num_top_words):
+     vocab, U, s, Vh = preprocess_text(text)
+     pattern = r'\d+'  # raw string avoids an invalid escape sequence
+     top_words = lambda row: [vocab[i] for i in np.argsort(row)[:-num_top_words-1:-1]]
+     topic_words = top_words(Vh[0])
+     topic_words = ' '.join(topic_words)
+     return ' '.join([re.sub(pattern, '', word) for word in topic_words.split()])
+
+
+ def main():
+     st.title('Topic Modeling by Top Keywords')
+     im1, im2, im3 = st.columns([1, 5.3, 1])
+     with im1:
+         pass
+     with im2:
+         url = "https://i.postimg.cc/jdF1hPng/combined.png"
+         html(set_image(url), height=500, width=500)
+     with im3:
+         pass
+     text = st.text_area('Find Topic', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
+     num_top_words = st.sidebar.slider('Number of Top Keywords', min_value=5, max_value=20, step=1, value=10)
+     if st.button('Find Topic'):
+         if text != '':
+             with st.expander('Original Text'):
+                 st.write(text)
+                 add_one_item(text, 'Topic Modeling')
+
+             with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
+                 topic_words = show_topics(text, num_top_words)
+                 st.write(topic_words)
+
+ if __name__ == '__main__':
+     main()
pages/Summarize_Text.py ADDED
@@ -0,0 +1,57 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from configs.download_files import FileDownloader
+ from configs.db_configs import add_one_item
+ from streamlit.components.v1 import html
+ from configs.html_features import set_image
+
+ def summarize_text(text):
+     prefix = 'summarize: '
+     text = prefix + text
+     tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_billsum_model')
+     input_ids = tokenizer(text=text, return_tensors='pt')['input_ids']
+     model = AutoModelForSeq2SeqLM.from_pretrained('stevhliu/my_awesome_billsum_model')
+
+     if len(input_ids[0]) < 200:
+         output_ids = model.generate(input_ids, max_new_tokens=100, do_sample=False)
+         summarized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+         return summarized_text
+
+     else:  # inputs of 200 tokens or more
+         output_ids = model.generate(input_ids, max_new_tokens=round(len(input_ids[0]) * 1/2), do_sample=False)
+         summarized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+         return summarized_text
+
+
+ def main():
+     st.title('Text Summarizer')
+     im1, im2, im3 = st.columns([1, 5.3, 1])
+     with im1:
+         pass
+     with im2:
+         url = "https://i.postimg.cc/jdF1hPng/combined.png"
+         html(set_image(url), height=500, width=500)
+     with im3:
+         pass
+     text = st.text_area('Text Summarizer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
+
+     if st.button('Summarize it'):
+         if text != "":
+             with st.expander('Original Text'):
+                 st.write(text)
+                 add_one_item(text, "Text Summarizer")
+
+             with st.expander('Summarized Text'):
+                 summarized_text = summarize_text(text)
+                 st.write(summarized_text)
+
+             with st.expander('Download Summarized Text'):
+                 FileDownloader(summarized_text, 'txt').download()
+
+         else:
+             st.error('Please enter a non-empty text.')
+
+
+ if __name__ == '__main__':
+     main()
+
pages/Translate_Text.py ADDED
@@ -0,0 +1,54 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from configs.download_files import FileDownloader
+ from configs.db_configs import add_one_item
+ from streamlit.components.v1 import html
+ from configs.html_features import set_image
+
+ def translate_text_to_text(text, source_lang, target_lang):
+     prefix = f'translate {source_lang} to {target_lang}: '
+     text = prefix + text
+     tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_opus_books_model')
+     input_ids = tokenizer(text, return_tensors='pt').input_ids
+     model = AutoModelForSeq2SeqLM.from_pretrained('stevhliu/my_awesome_opus_books_model')
+     output_ids = model.generate(input_ids, max_new_tokens=len(input_ids[0]) * 3, do_sample=False)
+     translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return translated_text
+
+
+ def main():
+     st.title('Text Translator')
+     im1, im2, im3 = st.columns([1, 5.3, 1])
+     with im1:
+         pass
+     with im2:
+         url = "https://i.postimg.cc/jdF1hPng/combined.png"
+         html(set_image(url), height=500, width=500)
+     with im3:
+         pass
+     languages = ['English', 'French']
+     source_lang = st.sidebar.selectbox('Source Language', languages)
+     target_lang = st.sidebar.selectbox('Target Language', languages, index=1)
+     text = st.text_area('Text Translator', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
+     if st.button('Translate it'):
+         if text != '':
+             if source_lang == target_lang:  # only two languages are offered, so a direct equality check suffices
+                 st.error('Expected different values for the source and target languages, but got the same value!')
+
+             else:
+                 with st.expander('Original Text'):
+                     st.write(text)
+                     add_one_item(text, 'Text Translator')
+
+                 with st.expander('Translated Text'):
+                     translated_text = translate_text_to_text(text, source_lang, target_lang)
+                     st.write(translated_text)
+
+                 with st.expander('Download Translated Text'):
+                     FileDownloader(translated_text, 'txt').download()
+         else:
+             st.error('Please enter a non-empty text.')
+
+
+ if __name__ == '__main__':
+     main()
pages/__pycache__/text_analysis.cpython-310.pyc ADDED
Binary file (785 Bytes).
 
pages/__pycache__/text_summarization.cpython-310.pyc ADDED
Binary file (680 Bytes).