EE21's picture
Update app.py
71f1303
raw
history blame
5.73 kB
import streamlit as st
from datasets import load_dataset
import PyPDF2
from extractive_summarization import summarize_with_textrank, summarize_with_lsa
from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
from keyword_extraction import extract_keywords
from keyphrase_extraction import extract_sentences_with_obligations
#from blanc import BlancHelp
# Load in ToS
dataset = load_dataset("EE21/ToS-Summaries")
# Extract titles or identifiers for the ToS
tos_titles = ["None"] + [f"Document {i}" for i in range(len(dataset['train']))]
# Set page to wide mode
st.set_page_config(layout="wide")
# Function to handle file upload and return its content
def load_pdf(file):
pdf_reader = PyPDF2.PdfReader(file)
pdf_text = ""
for page_num in range(len(pdf_reader.pages)):
pdf_text += pdf_reader.pages[page_num].extract_text() or ""
return pdf_text
# Main app
def main():
st.title("Terms of Service Summarizer")
# Layout: 3 columns
col1, col2, col3 = st.columns([1, 3, 2], gap="large")
# Left column: Radio buttons for summarizer choice
with col1:
radio_options = ["Abstractive (T5)", "Abstractive (LED)", 'Abstractive (Fine-tuned BART)', "Abstractive (BART-large-CNN)", 'Extractive (TextRank)',
"Extractive (Latent Semantic Analysis)", 'Keyphrase Extraction (RAKE)', 'Keyword Extraction (RAKE)']
help_text = "Abstractive: Abstractive summarization generates a summary that may contain words not present in the original text. " \
"It uses a fine-tuned model on BART-large-CNN.<br>" \
"Extractive: Extractive summarization selects and extracts sentences or phrases directly from the original text to create a summary using the TextRank algorithm.<br>" \
"Keyword Extraction: Keyword extraction identifies and extracts important keywords or terms from the text using the Rake algorithm. " \
"These keywords can be used for various purposes such as content analysis and SEO.<br>" \
"Keyphrase Extraction: Keyphrase extraction is similar to keyword extraction but focuses on identifying multi-word phrases or expressions that are significant in the text using the Rake algorithm."
radio_selection = st.radio("Choose type of summarizer:", radio_options, help=help_text)
# Middle column: Text input and File uploader
with col2:
user_input = st.text_area("Enter your text here:")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
# Dropdown for selecting the document
tos_selection_index = st.selectbox("Select a Terms of Service Document", range(len(tos_titles)), format_func=lambda x: tos_titles[x])
if st.button("Summarize"):
if uploaded_file and user_input and tos_selection_index:
st.warning("Please provide either text input or a PDF file, not both.")
return
elif uploaded_file:
# Extract text from PDF
file_content = load_pdf(uploaded_file)
st.write("PDF uploaded successfully.")
elif user_input:
file_content = user_input
elif tos_selection != "None":
tos_selection_index = tos_titles.index(tos_selection) - 1 # Adjust for the added 'None' option
file_content = dataset['train'][tos_selection_index]['plain_text']
else:
st.warning("Please upload a PDF, enter some text, or select a document to summarize.")
return
# Perform extractive summarization
if radio_selection == "Extractive (TextRank)":
summary = summarize_with_textrank(file_content)
st.session_state.summary = summary
# Perform extractive summarization
if radio_selection == "Extractive (Latent Semantic Analysis)":
summary = summarize_with_lsa(file_content)
st.session_state.summary = summary
# Perform extractive summarization
if radio_selection == "Abstractive (Fine-tuned BART)":
summary = summarize_with_bart_ft(file_content)
st.session_state.summary = summary
# Perform extractive summarization
if radio_selection == "Abstractive (BART-large-CNN)":
summary = summarize_with_bart_cnn(file_content)
st.session_state.summary = summary
# Perform extractive summarization
if radio_selection == "Abstractive (T5)":
summary = summarize_with_t5(file_content)
st.session_state.summary = summary
# Perform extractive summarization
if radio_selection == "Abstractive (LED)":
summary = summarize_with_led(file_content)
st.session_state.summary = summary
# Perform Keyword Extraction
if radio_selection == "Keyword Extraction (RAKE)":
summary = extract_keywords(file_content)
st.session_state.summary = summary
# Perform Keyphrase Extraction
if radio_selection == "Keyphrase Extraction (RAKE)":
summary = extract_sentences_with_obligations(file_content)
st.session_state.summary = summary
# Right column: Displaying text after pressing 'Summarize'
with col3:
st.write("Summary:")
if 'summary' in st.session_state:
st.write(st.session_state.summary)
if __name__ == "__main__":
main()