Upload pages files
Browse files
@@ -0,0 +1,174 @@
1 |
import streamlit as st
2 |
from streamlit.components.v1 import html
3 |
import pandas as pd
4 |
import matplotlib.pyplot as plt
5 |
import plotly.express as px
6 |
from wordcloud.wordcloud import WordCloud
7 |
from configs.db_configs import add_one_item
8 |
from configs.html_features import set_image, HTML_WRAPPER
9 |
10 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
11 |
12 |
import torch
13 |
from torch.nn.functional import softmax
14 |
15 |
from spacy import displacy
16 |
import spacy
17 |
nlp = spacy.load('en_core_web_sm')
18 |
19 |
from collections import Counter
20 |
import neattext as nt
21 |
import neattext.functions as nfx
22 |
from textblob import TextBlob
23 |
24 |
25 |
def get_tokens_analysis(text):
    """Tabulate per-token attributes of ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and one
    row is emitted per token.

    Args:
        text: Raw input text to analyse.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Token``, ``Shape``,
        ``Part-of-Speech``, ``Part-of-Speech Tag``, ``Root`` (the token's
        lemma), ``IsAlpha`` and ``IsStop``.
    """
    column_names = ['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop']
    rows = []
    for tok in nlp(text):
        rows.append((tok.text, tok.shape_, tok.pos_, tok.tag_, tok.lemma_, tok.is_alpha, tok.is_stop))
    return pd.DataFrame(rows, columns=column_names)
30 |
31 |
32 |
def get_entities_tokens(text):
    """Render the entities spaCy finds in ``text`` as wrapped HTML.

    Args:
        text: Raw input text to analyse.

    Returns:
        The displaCy ``ent``-style markup, with doubled newlines collapsed to
        single ones, substituted into the project's ``HTML_WRAPPER`` template.
    """
    parsed = nlp(text)
    # Use a local name that does not shadow the imported `html` component.
    markup = displacy.render(parsed, style='ent').replace('\n\n', '\n')
    return HTML_WRAPPER.format(markup)
39 |
40 |
41 |
def get_word_stats(text):
    """Compute word statistics and a word-length pie chart for ``text``.

    Args:
        text: Raw input text to analyse.

    Returns:
        A ``(word_stats, figure)`` tuple: the result of neattext's
        ``TextFrame.word_stats()`` and a 400x400 Plotly pie chart of how often
        each word length occurs.
    """
    frame = nt.TextFrame(text)
    stats = frame.word_stats()
    length_freq = frame.word_length_freq()

    length_table = pd.DataFrame(length_freq.items(), columns=['word length', 'frequency'])
    # Turn the numeric lengths into categorical labels such as "length 3".
    length_table['word length'] = 'length ' + length_table['word length'].astype(str)

    pie = px.pie(
        length_table,
        names='word length',
        values='frequency',
        title='Word Percentage Frequency by length',
        width=400,
        height=400,
        color_discrete_sequence=px.colors.sequential.Blues_r,
    )
    return stats, pie
51 |
52 |
53 |
def plot_top_keywords_frequencies(text, n_top_keywords):
    """Build a bar chart of the most frequent non-stopword words in ``text``.

    Args:
        text: Raw input text to analyse.
        n_top_keywords: How many of the most common words to plot.

    Returns:
        A 400x400 Plotly bar chart with one bar per keyword, coloured by its
        frequency.
    """
    # Stopwords are stripped first so they cannot dominate the ranking.
    keywords = TextBlob(nfx.remove_stopwords(text)).words
    ranked = Counter(keywords).most_common(n_top_keywords)
    ranking_table = pd.DataFrame(ranked, columns=['words', 'frequency'])
    return px.bar(
        ranking_table,
        x='words',
        y='frequency',
        color='frequency',
        title=f'the frequency of {n_top_keywords} top keywords',
        width=400,
        height=400,
        color_continuous_scale='Blues',
    )
61 |
62 |
63 |
def get_sentence_stats(text):
    """Split ``text`` into sentences and noun phrases and count both.

    Args:
        text: Raw input text to analyse.

    Returns:
        A ``(sentences, noun_phrases, sentence_stats_df)`` tuple: the
        sentences as plain strings, the noun phrases as a list, and a
        one-row ``pandas.DataFrame`` with the columns ``Number of Sentences``
        and ``Number of Noun Phrases``.
    """
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    noun_phrases = list(blob.noun_phrases)
    sentence_stats = {
        'Number of Sentences': len(sentences),
        'Number of Noun Phrases': len(noun_phrases),
    }
    # index=[0] is required because every value in the dict is a scalar.
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df
73 |
74 |
75 |
def plot_tokens_pos(tokens_stats_df):
    """Build a bar chart of how often each part-of-speech tag occurs.

    Args:
        tokens_stats_df: DataFrame with a ``Part-of-Speech`` column holding
            one tag per token.

    Returns:
        A 400x400 Plotly bar chart with one bar per part-of-speech value,
        coloured by its frequency.
    """
    pos_counts = (
        tokens_stats_df['Part-of-Speech']
        .value_counts()
        .rename_axis('Part-of-Speech')
        .reset_index(name='Frequency')
    )
    return px.bar(
        pos_counts,
        x='Part-of-Speech',
        y='Frequency',
        color='Frequency',
        title='The Frequency of Tokens Part of speech',
        width=400,
        height=400,
        color_continuous_scale='Blues',
    )
80 |
81 |
82 |
def get_sentiment_analysis_res(text):
    """Classify ``text`` as Negative or Positive with a fine-tuned model.

    Args:
        text: Raw input text to classify.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Text Polarity``
        (``'Negative'`` or ``'Positive'``) and ``Belonging Probability`` (the
        softmax probability of the predicted class).
    """
    checkpoint = 'stevhliu/my_awesome_model'
    # Kept local so the loaded model's config is not mutated.
    id2label = {0: 'Negative', 1: 'Positive'}

    # NOTE: tokenizer and model are loaded on every call.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # truncation=True clips the input to the tokenizer's configured maximum
    # length; without it an over-long text makes the forward pass fail.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits

    # One input sequence, so row 0 holds the class probabilities.
    probabilities = softmax(logits, dim=1)[0]
    predicted_class_id = int(probabilities.argmax())
    label = id2label[predicted_class_id]
    score = float(probabilities[predicted_class_id])
    return pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
95 |
96 |
97 |
def plot_word_frequency(text):
    """Draw a word cloud of ``text`` on a Matplotlib figure.

    Args:
        text: Raw input text from which the word cloud is generated.

    Returns:
        The Matplotlib figure containing the rendered word cloud.
    """
    cloud = WordCloud(width=600, height=500).generate(text)
    # Draw on an explicit Axes instead of pyplot's implicit "current figure",
    # so the image cannot land on a figure created elsewhere.
    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    # Pixel-coordinate ticks carry no meaning for a word cloud.
    ax.axis('off')
    return fig
103 |
104 |
# Streamlit page for the 'Text Analyzer' view: shows the title and banner
# image, reads the text and the keyword count, and on 'Analyze it' runs each
# analysis helper inside its own expander.
# NOTE(review): indentation is lost and some statements appear to be missing
# from this view (the bodies of `with im1:` / `with im3:`, the calls that
# display each computed result, and probably an `else:` before the final
# st.error) - confirm against the original file.
def main():
105 |
st.title('Text Analyzer')
106 |
# Three columns; the wide middle one (weight 5.3) carries the banner.
im1, im2, im3 = st.columns([1, 5.3, 1])
107 |
with im1:
108 |
109 |
with im2:
110 |
url = "https://i.postimg.cc/jdF1hPng/combined.png"
111 |
html(set_image(url), height=500, width=500)
112 |
with im3:
113 |
114 |
115 |
text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
116 |
# Sidebar slider, positional args: min 5, max 15, default 5, step 1.
n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
117 |
if st.button('Analyze it'):
118 |
if text != '':
119 |
with st.expander('Original Text'):
120 |
121 |
# Presumably records the submitted text under the 'Text Analyzer' label - verify in configs.db_configs.
add_one_item(text, 'Text Analyzer')
122 |
123 |
with st.expander('Text Analysis'):
124 |
# Token table; reused further down by plot_tokens_pos.
tokens_stats_df = get_tokens_analysis(text)
125 |
126 |
127 |
with st.expander('Text Entities'):
128 |
entities_tokens_html = get_entities_tokens(text)
129 |
html(entities_tokens_html, height=300, scrolling=True)
130 |
131 |
col11, col12 = st.columns(2)
132 |
with col11:
133 |
with st.expander('Word Statistics'):
134 |
word_stats_json, figure = get_word_stats(text)
135 |
136 |
137 |
138 |
with col12:
139 |
with st.expander(f'The Frequency of {n_top_keywords} Top Keywords'):
140 |
figure = plot_top_keywords_frequencies(text, n_top_keywords)
141 |
142 |
143 |
col21, col22 = st.columns(2)
144 |
with col21:
145 |
with st.expander('Sentence Statistics'):
146 |
sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
147 |
148 |
st.write('Sentences:\n', sentences)
149 |
st.write('Noun Phrases:\n', noun_phrases)
150 |
151 |
with col22:
152 |
with st.expander('The Frequency of Tokens Part of speech'):
153 |
figure = plot_tokens_pos(tokens_stats_df)
154 |
155 |
156 |
col31, col32 = st.columns(2)
157 |
with col31:
158 |
with st.expander('Sentiment Analysis'):
159 |
sentiment_df = get_sentiment_analysis_res(text)
160 |
161 |
162 |
with col32:
163 |
with st.expander('Word Frequency'):
164 |
fig = plot_word_frequency(text)
165 |
166 |
167 |
168 |
# NOTE(review): presumably the empty-input branch of `if text != ''` - the pairing `else:` is not visible here.
st.error('Please enter a non-empty text.')
169 |
170 |
171 |
if __name__ == '__main__':
172 |
173 |
174 |
@@ -0,0 +1,51 @@
1 |
import streamlit as st
2 |
import numpy as np
3 |
from sklearn.feature_extraction.text import CountVectorizer
4 |
from scipy import linalg
5 |
import regex as re
6 |
from configs.db_configs import add_one_item
7 |
from streamlit.components.v1 import html
8 |
from configs.html_features import set_image
9 |
10 |
11 |
def preprocess_text(text):
    """Vectorise ``text`` as word counts and factorise the count matrix.

    The text is treated as a single document, counted with a
    ``CountVectorizer`` that uses the English stop-word list, and the
    resulting count matrix is decomposed with a reduced SVD.

    Args:
        text: Raw input text.

    Returns:
        A ``(vocab, U, s, Vh)`` tuple: the vocabulary as a NumPy array
        followed by the three results of ``scipy.linalg.svd``.
    """
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform([text]).todense()
    vocabulary = np.array(count_vectorizer.get_feature_names_out())
    left_vectors, singular_values, right_vectors = linalg.svd(counts, full_matrices=False)
    return vocabulary, left_vectors, singular_values, right_vectors
17 |
18 |
19 |
def show_topics(text, num_top_words):
    """Return the highest-weighted vocabulary words of ``text`` as one string.

    Words are ranked by their weight in the first right singular vector
    returned by ``preprocess_text``. Digits are stripped from each selected
    word, and words that consisted only of digits are dropped.

    Args:
        text: Raw input text.
        num_top_words: Maximum number of words to return.

    Returns:
        The selected words, highest weight first, joined by single spaces.
    """
    vocab, _, _, Vh = preprocess_text(text)
    weights = Vh[0]
    # A singular vector is only defined up to sign. The count matrix is
    # non-negative, so orient the vector to be non-negative too; otherwise a
    # flipped sign would rank the rarest words first.
    if weights.sum() < 0:
        weights = -weights
    top_indices = np.argsort(weights)[:-num_top_words - 1:-1]
    stripped = (re.sub(r'\d+', '', vocab[i]) for i in top_indices)
    # Skip words that became empty so the result has no doubled spaces.
    return ' '.join(word for word in stripped if word)
26 |
27 |
28 |
# Streamlit page for the 'Topic Modeling by Top Keywords' view: shows the
# title and banner image, reads the text and the keyword count, and on
# 'Find Topic' computes the top keywords with show_topics.
# NOTE(review): indentation is lost and some statements appear to be missing
# from this view (the bodies of `with im1:` / `with im3:` and the calls that
# display the text and the result) - confirm against the original file.
def main():
29 |
st.title('Topic Modeling by Top Keywords')
30 |
# Three columns; the wide middle one (weight 5.3) carries the banner.
im1, im2, im3 = st.columns([1, 5.3, 1])
31 |
with im1:
32 |
33 |
with im2:
34 |
url = "https://i.postimg.cc/jdF1hPng/combined.png"
35 |
html(set_image(url), height=500, width=500)
36 |
with im3:
37 |
38 |
text = st.text_area('Find Topic', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
39 |
num_top_words = st.sidebar.slider('Number of Top Keywords', min_value=5, max_value=20, step=1, value=10)
40 |
if st.button('Find Topic'):
41 |
if text != '':
42 |
with st.expander('Original Text'):
43 |
44 |
# Presumably records the submitted text under the 'Topic Modeling' label - verify in configs.db_configs.
add_one_item(text, 'Topic Modeling')
45 |
46 |
with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
47 |
topic_words = show_topics(text, num_top_words)
48 |
49 |
50 |
if __name__ == '__main__':
51 |
@@ -0,0 +1,57 @@
1 |
import streamlit as st
2 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3 |
from configs.download_files import FileDownloader
4 |
from configs.db_configs import add_one_item
5 |
from streamlit.components.v1 import html
6 |
from configs.html_features import set_image
7 |
8 |
def summarize_text(text):
    """Summarise ``text`` with a fine-tuned sequence-to-sequence model.

    The input is prefixed with ``'summarize: '`` before tokenisation. Inputs
    shorter than 200 tokens may generate up to 100 new tokens; longer inputs
    may generate up to half their token count.

    Args:
        text: Raw input text to summarise.

    Returns:
        The generated summary as a string, with special tokens removed.
    """
    checkpoint = 'stevhliu/my_awesome_billsum_model'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    input_ids = tokenizer(text='summarize: ' + text, return_tensors='pt')['input_ids']
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    n_input_tokens = len(input_ids[0])
    # A plain else covers exactly 200 tokens, which the former
    # `< 200` / `> 200` pair skipped, making the function return None.
    if n_input_tokens < 200:
        max_new_tokens = 100
    else:
        max_new_tokens = round(n_input_tokens / 2)

    output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
24 |
25 |
26 |
# Streamlit page for the 'Text Summarizer' view: shows the title and banner
# image, reads the text, and on 'Summarize it' summarises it and offers the
# summary as a txt download.
# NOTE(review): indentation is lost and some statements appear to be missing
# from this view (the bodies of `with im1:` / `with im3:`, the calls that
# display the text and the summary, and probably an `else:` before the final
# st.error) - confirm against the original file.
def main():
27 |
st.title('Text Summarizer')
28 |
# Three columns; the wide middle one (weight 5.3) carries the banner.
im1, im2, im3 = st.columns([1, 5.3, 1])
29 |
with im1:
30 |
31 |
with im2:
32 |
url = "https://i.postimg.cc/jdF1hPng/combined.png"
33 |
html(set_image(url), height=500, width=500)
34 |
with im3:
35 |
36 |
text = st.text_area('Text Summarizer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
37 |
38 |
if st.button('Summarize it'):
39 |
if text != "":
40 |
with st.expander('Original Text'):
41 |
42 |
# Presumably records the submitted text under the 'Text Summarizer' label - verify in configs.db_configs.
add_one_item(text, "Text Summarizer")
43 |
44 |
with st.expander('Summarized Text'):
45 |
summarized_text = summarize_text(text)
46 |
47 |
48 |
with st.expander('Download Summarized Text'):
49 |
FileDownloader(summarized_text, 'txt').download()
50 |
51 |
52 |
# NOTE(review): presumably the empty-input branch of `if text != ""` - the pairing `else:` is not visible here.
st.error('Please enter a non-empty text.')
53 |
54 |
55 |
if __name__ == '__main__':
56 |
57 |
@@ -0,0 +1,54 @@
1 |
import streamlit as st
2 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3 |
from configs.download_files import FileDownloader
4 |
from configs.db_configs import add_one_item
5 |
from streamlit.components.v1 import html
6 |
from configs.html_features import set_image
7 |
8 |
def translate_text_to_text(text, source_lang, target_lang):
    """Translate ``text`` with a fine-tuned sequence-to-sequence model.

    The model is prompted with
    ``'translate <source_lang> to <target_lang>: '`` followed by the text,
    and may generate up to three times as many new tokens as the prompt has.

    Args:
        text: Raw input text to translate.
        source_lang: Name of the language ``text`` is written in.
        target_lang: Name of the language to translate into.

    Returns:
        The generated translation as a string, with special tokens removed.
    """
    checkpoint = 'stevhliu/my_awesome_opus_books_model'
    prompt = f'translate {source_lang} to {target_lang}: ' + text

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    token_budget = len(prompt_ids[0]) * 3
    generated_ids = model.generate(prompt_ids, max_new_tokens=token_budget, do_sample=False)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
17 |
18 |
19 |
# Streamlit page for the 'Text Translator' view: shows the title and banner
# image, reads the source/target languages and the text, and on
# 'translate it' translates it and offers the result as a txt download.
# NOTE(review): indentation is lost and some statements appear to be missing
# from this view (the bodies of `with im1:` / `with im3:`, the calls that
# display the text and the translation, and probably `else:` lines after the
# two st.error calls) - confirm against the original file.
def main():
20 |
st.title('Text Translator')
21 |
# Three columns; the wide middle one (weight 5.3) carries the banner.
im1, im2, im3 = st.columns([1, 5.3, 1])
22 |
with im1:
23 |
24 |
with im2:
25 |
url = "https://i.postimg.cc/jdF1hPng/combined.png"
26 |
html(set_image(url), height=500, width=500)
27 |
with im3:
28 |
29 |
languages = ['English', 'French']
30 |
source_lang = st.sidebar.selectbox('Source Language', languages)
31 |
# index=1 preselects 'French', so the two selectors start out different.
target_lang = st.sidebar.selectbox('Target Language', languages, index=1)
32 |
text = st.text_area('Text Translator', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
33 |
if st.button('translate it'):
34 |
if text != '':
35 |
# NOTE(review): for the two listed languages this is equivalent to `source_lang == target_lang`.
if (source_lang == 'English' and target_lang == 'English') or (source_lang == 'French' and target_lang == 'French'):
36 |
st.error('Expected different values for source and target languages, but got the same values!')
37 |
38 |
39 |
with st.expander('Original Text'):
40 |
41 |
# Presumably records the submitted text under the 'Text Translator' label - verify in configs.db_configs.
add_one_item(text, 'Text Translator')
42 |
43 |
with st.expander('Translated Text'):
44 |
translated_text = translate_text_to_text(text, source_lang, target_lang)
45 |
46 |
47 |
with st.expander('Download Translated Text'):
48 |
FileDownloader(translated_text, 'txt').download()
49 |
50 |
# NOTE(review): presumably the empty-input branch of `if text != ''` - the pairing `else:` is not visible here.
st.error('Please enter a non-empty text.')
51 |
52 |
53 |
if __name__ == '__main__':
54 |
Binary file (785 Bytes). View file
Binary file (680 Bytes). View file