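"""A Streamlit page that runs a set of lightweight NLP analyses on user-provided
text: token statistics, named entities, word and sentence statistics, top-keyword
and part-of-speech frequencies, sentiment analysis, and a word cloud."""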
import streamlit as st
from streamlit.components.v1 import html
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from configs.db_configs import add_one_item
from configs.html_features import set_image, HTML_WRAPPER
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from spacy import displacy
import spacy
from collections import Counter
import neattext as nt
import neattext.functions as nfx
from textblob import TextBlob

nlp = spacy.load('en_core_web_sm')

def get_tokens_analysis(text):
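    """Return a DataFrame of per-token spaCy attributes (shape, POS, tag, lemma, alpha/stop flags)."""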
    doc_obj = nlp(text)
    tokens_stats = [(token.text, token.shape_, token.pos_, token.tag_, token.lemma_, token.is_alpha, token.is_stop) for token in doc_obj]
    tokens_stats_df = pd.DataFrame(tokens_stats, columns=['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop'])
    return tokens_stats_df

def get_entities_tokens(text):
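    """Render named entities with displacy and wrap the markup for embedding in Streamlit."""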
    doc_obj = nlp(text)
    # Use a distinct name so the imported streamlit `html` component is not shadowed.
    rendered_html = displacy.render(doc_obj, style='ent')
    rendered_html = rendered_html.replace('\n\n', '\n')
    entities_tokens_html = HTML_WRAPPER.format(rendered_html)
    return entities_tokens_html

def get_word_stats(text):
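    """Compute neattext word statistics and a pie chart of word-length frequencies."""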
    text_frame_obj = nt.TextFrame(text)
    word_stats = text_frame_obj.word_stats()
    word_length_freq = text_frame_obj.word_length_freq()
    word_length_df = pd.DataFrame(word_length_freq.items(), columns=['word length', 'frequency'])
    word_length_df['word length'] = 'length ' + word_length_df['word length'].astype(str)
    custom_color = px.colors.sequential.Blues_r
    figure = px.pie(word_length_df, names='word length', values='frequency', title='Word Percentage Frequency by Length', width=400, height=400, color_discrete_sequence=custom_color)
    return word_stats, figure

def plot_top_keywords_frequencies(text, n_top_keywords):
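    """Plot a bar chart of the n most frequent keywords after stopword removal."""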
    preprocessed_text = nfx.remove_stopwords(text)
    blob = TextBlob(preprocessed_text)
    words = blob.words
    top_keywords = Counter(words).most_common(n_top_keywords)
    top_keywords_df = pd.DataFrame(top_keywords, columns=['words', 'frequency'])
    figure = px.bar(top_keywords_df, x='words', y='frequency', color='frequency', title=f'The Frequency of the {n_top_keywords} Top Keywords', width=400, height=400, color_continuous_scale='Blues')
    return figure

def get_sentence_stats(text):
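    """Extract sentences and noun phrases with TextBlob and summarize their counts."""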
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    noun_phrases = list(blob.noun_phrases)
    sentence_stats = {
        'Number of Sentences': len(sentences),
        'Number of Noun Phrases': len(noun_phrases)
    }
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df

def plot_tokens_pos(tokens_stats_df):
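    """Plot a bar chart of part-of-speech frequencies from the token-statistics DataFrame."""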
    pos_df = tokens_stats_df['Part-of-Speech'].value_counts().to_frame().reset_index()
    pos_df.columns = ['Part-of-Speech', 'Frequency']
    figure = px.bar(pos_df, x='Part-of-Speech', y='Frequency', color='Frequency', title='The Frequency of Tokens by Part of Speech', width=400, height=400, color_continuous_scale='Blues')
    return figure

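
# A minimal caching sketch: it assumes Streamlit >= 1.18, where st.cache_resource
# is available, so the Hugging Face checkpoint is downloaded and loaded once per
# session instead of on every button press.
@st.cache_resource
def load_sentiment_model():
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_model')
    model = AutoModelForSequenceClassification.from_pretrained('stevhliu/my_awesome_model')
    return tokenizer, model
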
def get_sentiment_analysis_res(text):
    """Classify the text's polarity with the stevhliu/my_awesome_model checkpoint."""
    tokenizer, model = load_sentiment_model()
    # Truncate so inputs longer than the model's maximum length do not raise an error.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    # The checkpoint is a binary classifier; give its ids human-readable labels.
    model.config.id2label = {0: 'Negative', 1: 'Positive'}
    label = model.config.id2label[predicted_class_id]
    score = float(softmax(logits, dim=1)[0][predicted_class_id])
    sentiment_df = pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
    return sentiment_df

def plot_word_frequency(text):
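    """Render the input text as a word-cloud matplotlib figure."""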
    wc = WordCloud(width=600, height=500).generate(text)
    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    return fig

def main():
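    """Lay out the page and run every analysis when the user submits text."""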
    st.title('Text Analyzer')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass

    text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)

    if st.button('Analyze it'):
        if text.strip() != '':
            with st.expander('Original Text'):
                st.write(text)
            # Persist the submitted text through the app's database helper.
            add_one_item(text, 'Text Analyzer')

            with st.expander('Text Analysis'):
                tokens_stats_df = get_tokens_analysis(text)
                st.dataframe(tokens_stats_df)
            with st.expander('Text Entities'):
                entities_tokens_html = get_entities_tokens(text)
                html(entities_tokens_html, height=300, scrolling=True)

            col11, col12 = st.columns(2)
            with col11:
                with st.expander('Word Statistics'):
                    word_stats_json, figure = get_word_stats(text)
                    st.json(word_stats_json)
                    st.plotly_chart(figure)
            with col12:
                with st.expander(f'The Frequency of the {n_top_keywords} Top Keywords'):
                    figure = plot_top_keywords_frequencies(text, n_top_keywords)
                    st.plotly_chart(figure)

            col21, col22 = st.columns(2)
            with col21:
                with st.expander('Sentence Statistics'):
                    sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
                    st.dataframe(sentence_stats_df)
                    st.write('Sentences:\n', sentences)
                    st.write('Noun Phrases:\n', noun_phrases)
            with col22:
                with st.expander('The Frequency of Tokens by Part of Speech'):
                    figure = plot_tokens_pos(tokens_stats_df)
                    st.plotly_chart(figure)

            col31, col32 = st.columns(2)
            with col31:
                with st.expander('Sentiment Analysis'):
                    sentiment_df = get_sentiment_analysis_res(text)
                    st.dataframe(sentiment_df)
            with col32:
                with st.expander('Word Frequency'):
                    fig = plot_word_frequency(text)
                    st.pyplot(fig)
        else:
            st.error('Please enter a non-empty text.')

if __name__ == '__main__':
    main()