amirhoseinsedaghati committed
Commit: 2a97daa
Parent(s): 627c527
Upload pages files
pages/Analyze_Text.py
ADDED
@@ -0,0 +1,174 @@
import streamlit as st
from streamlit.components.v1 import html
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud.wordcloud import WordCloud
from configs.db_configs import add_one_item
from configs.html_features import set_image, HTML_WRAPPER

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
from torch.nn.functional import softmax

from spacy import displacy
import spacy
nlp = spacy.load('en_core_web_sm')

from collections import Counter
import neattext as nt
import neattext.functions as nfx
from textblob import TextBlob


def get_tokens_analysis(text):
    doc_obj = nlp(text)
    tokens_stats = [(token.text, token.shape_, token.pos_, token.tag_, token.lemma_, token.is_alpha, token.is_stop) for token in doc_obj]
    tokens_stats_df = pd.DataFrame(tokens_stats, columns=['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop'])
    return tokens_stats_df


def get_entities_tokens(text):
    doc_obj = nlp(text)

    html = displacy.render(doc_obj, style='ent')
    html = html.replace('\n\n', '\n')
    entities_tokens_html = HTML_WRAPPER.format(html)
    return entities_tokens_html


def get_word_stats(text):
    text_frame_obj = nt.TextFrame(text)
    word_stats = text_frame_obj.word_stats()
    word_length_freq = text_frame_obj.word_length_freq()
    word_length_df = pd.DataFrame(word_length_freq.items(), columns=['word length', 'frequency'])
    word_length_df['word length'] = word_length_df['word length'].astype(str)
    word_length_df['word length'] = 'length ' + word_length_df['word length']
    custom_color = px.colors.sequential.Blues_r
    figure = px.pie(word_length_df, names='word length', values='frequency', title='Word Percentage Frequency by length', width=400, height=400, color_discrete_sequence=custom_color)
    return word_stats, figure


def plot_top_keywords_frequencies(text, n_top_keywords):
    preprocessed_text = nfx.remove_stopwords(text)
    blob = TextBlob(preprocessed_text)
    words = blob.words
    top_keywords = Counter(words).most_common(n_top_keywords)
    top_keywords_df = pd.DataFrame(top_keywords, columns=['words', 'frequency'])
    figure = px.bar(top_keywords_df, x='words', y='frequency', color='frequency', title=f'the frequency of {n_top_keywords} top keywords', width=400, height=400, color_continuous_scale='Blues')
    return figure


def get_sentence_stats(text):
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    noun_phrases = list(blob.noun_phrases)
    sentence_stats = {
        'Number of Sentences': len(sentences),
        'Number of Noun Phrases': len(noun_phrases)
    }
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df


def plot_tokens_pos(tokens_stats_df):
    pos_df = tokens_stats_df['Part-of-Speech'].value_counts().to_frame().reset_index()
    pos_df.columns = ['Part-of-Speech', 'Frequency']
    figure = px.bar(pos_df, x='Part-of-Speech', y='Frequency', color='Frequency', title='The Frequency of Tokens Part of speech', width=400, height=400, color_continuous_scale='Blues')
    return figure


def get_sentiment_analysis_res(text):
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_model')
    inputs = tokenizer(text, return_tensors='pt')
    model = AutoModelForSequenceClassification.from_pretrained('stevhliu/my_awesome_model')
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    model.config.id2label = {0: 'Negative', 1: 'Positive'}
    label = model.config.id2label[predicted_class_id]
    score = float(softmax(logits, dim=1)[0][predicted_class_id])
    sentiment_df = pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
    return sentiment_df


def plot_word_frequency(text):
    wc = WordCloud(width=600, height=500).generate(text)
    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    return fig

def main():
    st.title('Text Analyzer')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass

    text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
    if st.button('Analyze it'):
        if text != '':
            with st.expander('Original Text'):
                st.write(text)
            add_one_item(text, 'Text Analyzer')

            with st.expander('Text Analysis'):
                tokens_stats_df = get_tokens_analysis(text)
                st.dataframe(tokens_stats_df)

            with st.expander('Text Entities'):
                entities_tokens_html = get_entities_tokens(text)
                html(entities_tokens_html, height=300, scrolling=True)

            col11, col12 = st.columns(2)
            with col11:
                with st.expander('Word Statistics'):
                    word_stats_json, figure = get_word_stats(text)
                    st.json(word_stats_json)
                    st.plotly_chart(figure)

            with col12:
                with st.expander(f'The Frequency of {n_top_keywords} Top Keywords'):
                    figure = plot_top_keywords_frequencies(text, n_top_keywords)
                    st.plotly_chart(figure)

            col21, col22 = st.columns(2)
            with col21:
                with st.expander('Sentence Statistics'):
                    sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
                    st.dataframe(sentence_stats_df)
                    st.write('Sentences:\n', sentences)
                    st.write('Noun Phrases:\n', noun_phrases)

            with col22:
                with st.expander('The Frequency of Tokens Part of speech'):
                    figure = plot_tokens_pos(tokens_stats_df)
                    st.plotly_chart(figure)

            col31, col32 = st.columns(2)
            with col31:
                with st.expander('Sentiment Analysis'):
                    sentiment_df = get_sentiment_analysis_res(text)
                    st.dataframe(sentiment_df)

            with col32:
                with st.expander('Word Frequency'):
                    fig = plot_word_frequency(text)
                    st.pyplot(fig)

        else:
            st.error('Please enter a non-empty text.')


if __name__ == '__main__':
    main()
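Analyze_Text.py (like the other pages below) imports set_image and HTML_WRAPPER from configs.html_features, a module that is not part of this commit. The following is a minimal sketch of what it might contain, inferred only from the call sites html(set_image(url), ...) and HTML_WRAPPER.format(html); the markup and styling are assumptions, not the actual implementation.

# configs/html_features.py -- hypothetical sketch, not the module shipped with this commit.
# set_image(url) must return an HTML snippet for streamlit.components.v1.html, and
# HTML_WRAPPER must contain a single '{}' placeholder for the rendered displaCy markup.

# A scrollable, bordered container for the entity visualization.
HTML_WRAPPER = """
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem;
            padding: 1rem; background-color: white;">
    {}
</div>
"""


def set_image(url):
    # Center the banner image inside the 500x500 component frame used by the pages.
    return f"""
    <div style="display: flex; justify-content: center;">
        <img src="{url}" alt="banner" style="max-width: 100%; max-height: 480px;">
    </div>
    """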
pages/Find_Topic.py
ADDED
@@ -0,0 +1,51 @@
import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import linalg
import regex as re
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image


def preprocess_text(text):
    # Build a term-count vector for the input text and factorize it with SVD.
    vectorizer = CountVectorizer(stop_words='english')
    vector = vectorizer.fit_transform([text]).todense()
    vocab = np.array(vectorizer.get_feature_names_out())
    U, s, Vh = linalg.svd(vector, full_matrices=False)
    return vocab, U, s, Vh


def show_topics(text, num_top_words):
    vocab, U, s, Vh = preprocess_text(text)
    pattern = r'\d+'  # digits are stripped from the returned keywords
    # indices of the num_top_words largest entries of the given singular vector
    top_words = lambda Vh: [vocab[i] for i in np.argsort(Vh)[:-num_top_words-1:-1]]
    topic_words = top_words(Vh[0])
    topic_words = ' '.join(topic_words)
    return ' '.join([re.sub(pattern, '', word) for word in topic_words.split()])


def main():
    st.title('Topic Modeling by Top Keywords')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass
    text = st.text_area('Find Topic', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    num_top_words = st.sidebar.slider('Number of Top Keywords', min_value=5, max_value=20, step=1, value=10)
    if st.button('Find Topic'):
        if text != '':
            with st.expander('Original Text'):
                st.write(text)
            add_one_item(text, 'Topic Modeling')

            with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
                topic_words = show_topics(text, num_top_words)
                st.write(topic_words)

if __name__ == '__main__':
    main()
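For reference, the decomposition used by preprocess_text and show_topics can be exercised outside Streamlit. The sketch below repeats the same steps on an illustrative sample text; the sample string, variable names, and printed output are assumptions for demonstration only, not part of this commit.

# Standalone sketch of the pipeline used in show_topics above.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import linalg

sample = (
    "Machine learning models learn patterns from data. "
    "Topic modeling finds the words that best describe a text. "
    "Singular value decomposition factorizes the term-count matrix."
)

vectorizer = CountVectorizer(stop_words='english')
counts = vectorizer.fit_transform([sample]).todense()   # 1 x V term-count matrix
vocab = np.array(vectorizer.get_feature_names_out())

U, s, Vh = linalg.svd(counts, full_matrices=False)       # Vh has shape 1 x V

num_top_words = 5
# indices of the largest entries of the first right singular vector
top_idx = np.argsort(Vh[0])[:-num_top_words - 1:-1]
print(' '.join(vocab[top_idx]))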
pages/Summarize_Text.py
ADDED
@@ -0,0 +1,57 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from configs.download_files import FileDownloader
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image

def summarize_text(text):
    prefix = 'summarize: '
    text = prefix + text
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_billsum_model')
    input_ids = tokenizer(text=text, return_tensors='pt')['input_ids']
    model = AutoModelForSeq2SeqLM.from_pretrained('stevhliu/my_awesome_billsum_model')

    if len(input_ids[0]) < 200:
        # short inputs: allow up to 100 new tokens for the summary
        output_ids = model.generate(input_ids, max_new_tokens=100, do_sample=False)
        summarized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return summarized_text

    else:
        # longer inputs: cap the summary at roughly half the input length
        output_ids = model.generate(input_ids, max_new_tokens=round(len(input_ids[0]) * 1/2), do_sample=False)
        summarized_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return summarized_text


def main():
    st.title('Text Summarizer')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass
    text = st.text_area('Text Summarizer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')

    if st.button('Summarize it'):
        if text != "":
            with st.expander('Original Text'):
                st.write(text)
            add_one_item(text, "Text Summarizer")

            with st.expander('Summarized Text'):
                summarized_text = summarize_text(text)
                st.write(summarized_text)

            with st.expander('Download Summarized Text'):
                FileDownloader(summarized_text, 'txt').download()

        else:
            st.error('Please enter a non-empty text.')


if __name__ == '__main__':
    main()
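The download expander relies on configs.download_files.FileDownloader, which is also outside this commit. Below is a plausible sketch consistent with the call FileDownloader(summarized_text, 'txt').download(); the base64 link approach, class internals, and file naming are assumptions rather than the real module.

# configs/download_files.py -- hypothetical sketch, not the module shipped with this commit.
import base64
import time
import streamlit as st


class FileDownloader:
    def __init__(self, data, file_ext='txt'):
        self.data = data
        self.file_ext = file_ext

    def download(self):
        # Encode the text and expose it as an HTML download link in the current expander.
        b64 = base64.b64encode(self.data.encode()).decode()
        filename = f'result_{int(time.time())}.{self.file_ext}'
        href = (
            f'<a href="data:text/plain;base64,{b64}" '
            f'download="{filename}">Download the file</a>'
        )
        st.markdown(href, unsafe_allow_html=True)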
pages/Translate_Text.py
ADDED
@@ -0,0 +1,54 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from configs.download_files import FileDownloader
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image

def translate_text_to_text(text, source_lang, target_lang):
    prefix = f'translate {source_lang} to {target_lang}: '
    text = prefix + text
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_opus_books_model')
    input_ids = tokenizer(text, return_tensors='pt').input_ids
    model = AutoModelForSeq2SeqLM.from_pretrained('stevhliu/my_awesome_opus_books_model')
    output_ids = model.generate(input_ids, max_new_tokens=len(input_ids[0]) * 3, do_sample=False)
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return translated_text


def main():
    st.title('Text Translator')
    im1, im2, im3 = st.columns([1, 5.3, 1])
    with im1:
        pass
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)
    with im3:
        pass
    languages = ['English', 'French']
    source_lang = st.sidebar.selectbox('Source Language', languages)
    target_lang = st.sidebar.selectbox('Target Language', languages, index=1)
    text = st.text_area('Text Translator', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    if st.button('translate it'):
        if text != '':
            if (source_lang == 'English' and target_lang == 'English') or (source_lang == 'French' and target_lang == 'French'):
                st.error('Expected different values for source and target languages, but got the same values!')

            else:
                with st.expander('Original Text'):
                    st.write(text)
                add_one_item(text, 'Text Translator')

                with st.expander('Translated Text'):
                    translated_text = translate_text_to_text(text, source_lang, target_lang)
                    st.write(translated_text)

                with st.expander('Download Translated Text'):
                    FileDownloader(translated_text, 'txt').download()
        else:
            st.error('Please enter a non-empty text.')


if __name__ == '__main__':
    main()
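Every page calls configs.db_configs.add_one_item(text, '<page name>') before rendering its results, so each submitted text is logged together with the page that processed it. That module is not part of this commit either; the sketch below shows one way it could be implemented, assuming a local SQLite store (the database file, table, and column names are illustrative assumptions).

# configs/db_configs.py -- hypothetical sketch, not the module shipped with this commit.
import sqlite3
from datetime import datetime

conn = sqlite3.connect('data.db', check_same_thread=False)
cursor = conn.cursor()


def create_table():
    # Create the history table on first use.
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS history (text TEXT, app_name TEXT, created_at TEXT)'
    )
    conn.commit()


def add_one_item(text, app_name):
    # Store the submitted text, the page that handled it, and a timestamp.
    create_table()
    cursor.execute(
        'INSERT INTO history (text, app_name, created_at) VALUES (?, ?, ?)',
        (text, app_name, datetime.now().isoformat(timespec='seconds')),
    )
    conn.commit()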
pages/__pycache__/text_analysis.cpython-310.pyc
ADDED
Binary file (785 Bytes)

pages/__pycache__/text_summarization.cpython-310.pyc
ADDED
Binary file (680 Bytes)