Spaces:

EE21
/

ToS-Summarization

Sleeping

App Files Files Community

ToS-Summarization / app.py

EE21

Update app.py

6349b5a about 1 year ago

raw

history blame

7.56 kB

	import streamlit as st
	import re
	from rouge import Rouge
	from datasets import load_dataset
	import PyPDF2
	from extractive_summarization import summarize_with_textrank, summarize_with_lsa
	from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
	from keyword_extraction import extract_keywords
	from keyphrase_extraction import extract_sentences_with_obligations
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	from PIL import Image
	import io
	#from blanc import BlancHelp

	# Fetch the ToS corpus that backs the document dropdown and the reference
	# summaries used for ROUGE scoring.
	dataset = load_dataset("EE21/ToS-Summaries")


	# One display label per row of the train split; the label only encodes the
	# row position ("Document 0", "Document 1", ...).
	tos_titles = list(map("Document {}".format, range(len(dataset['train']))))


	# Use the full browser width for the three-column layout built in main().
	st.set_page_config(layout="wide")

	# Read an uploaded PDF and hand back its text content
	def load_pdf(file):
	    """Return the text of every page of the PDF *file*, joined in page order.

	    A page whose text extraction yields a falsy value (None or "")
	    contributes an empty string, so the result is always a str.
	    """
	    reader = PyPDF2.PdfReader(file)
	    page_texts = [page.extract_text() or "" for page in reader.pages]
	    return "".join(page_texts)

	# Main app
	def _summarizer_for(choice):
	    """Return the callable behind the radio label *choice*, or None if unknown."""
	    # Built inside the function so the imported summarizers are only looked
	    # up when a summary is actually requested.
	    summarizers = {
	        "Abstractive (T5)": summarize_with_t5,
	        "Abstractive (LED)": summarize_with_led,
	        "Abstractive (Fine-tuned BART)": summarize_with_bart_ft,
	        "Abstractive (BART-large-CNN)": summarize_with_bart_cnn,
	        "Extractive (TextRank)": summarize_with_textrank,
	        "Extractive (Latent Semantic Analysis)": summarize_with_lsa,
	        "Keyphrase Extraction (RAKE)": extract_sentences_with_obligations,
	        "Keyword Extraction (RAKE)": extract_keywords,
	    }
	    return summarizers.get(choice)


	def _summary_as_text(summary):
	    """Flatten *summary* into a single string for the word cloud and ROUGE."""
	    if summary is None:
	        return ""
	    if isinstance(summary, str):
	        return summary
	    # NOTE(review): the extraction helpers live in other modules; if they
	    # return a collection of phrases rather than a string, join them so the
	    # text-only consumers below still work. Confirm their return types.
	    if isinstance(summary, (list, tuple, set)):
	        return " ".join(str(item) for item in summary)
	    return str(summary)


	def main():
	    """Render the summarizer page and handle one Streamlit script run.

	    Layout is three columns: the summarizer choice (left), the input
	    widgets and the "Summarize" button (middle), and the latest summary
	    with its word cloud and ROUGE scores (right).

	    The summary is kept in ``st.session_state.summary``. The reference
	    summary it should be scored against is kept next to it in
	    ``st.session_state.reference_summary`` (None when the text came from a
	    PDF or the text area), so that reruns triggered by other widgets never
	    score a stored summary against a different document.
	    """
	    st.title("Terms of Service Summarizer")

	    # Layout: 3 columns
	    col1, col2, col3 = st.columns([1, 3, 2], gap="large")

	    # Left column: Radio buttons for summarizer choice
	    with col1:
	        radio_options = [
	            "Abstractive (T5)",
	            "Abstractive (LED)",
	            'Abstractive (Fine-tuned BART)',
	            "Abstractive (BART-large-CNN)",
	            'Extractive (TextRank)',
	            "Extractive (Latent Semantic Analysis)",
	            'Keyphrase Extraction (RAKE)',
	            'Keyword Extraction (RAKE)',
	        ]

	        help_text = (
	            "Abstractive: Abstractive summarization generates a summary that may contain words not present in the original text. "
	            "It uses a fine-tuned model on BART-large-CNN.<br>"
	            "Extractive: Extractive summarization selects and extracts sentences or phrases directly from the original text to create a summary using the TextRank algorithm.<br>"
	            "Keyword Extraction: Keyword extraction identifies and extracts important keywords or terms from the text using the Rake algorithm. "
	            "These keywords can be used for various purposes such as content analysis and SEO.<br>"
	            "Keyphrase Extraction: Keyphrase extraction is similar to keyword extraction but focuses on identifying multi-word phrases or expressions that are significant in the text using the Rake algorithm."
	        )

	        radio_selection = st.radio("Choose type of summarizer:", radio_options, help=help_text)

	    # Middle column: Text input and File uploader
	    with col2:
	        user_input = st.text_area("Enter your text here:")
	        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

	        # Dropdown for selecting the document
	        tos_selection_index = st.selectbox(
	            "Select a Terms of Service Document",
	            range(len(tos_titles)),
	            format_func=lambda x: tos_titles[x],
	        )

	        if st.button("Summarize"):
	            # Only a summary of a dataset document has a reference to score against.
	            reference_summary = None

	            # The dropdown index must not take part in this check: index 0
	            # is falsy, which used to disable the warning for the default
	            # document.
	            if uploaded_file and user_input:
	                st.warning("Please provide either text input or a PDF file, not both.")
	                return
	            elif uploaded_file:
	                # Extract text from PDF
	                file_content = load_pdf(uploaded_file)
	                st.write("PDF uploaded successfully.")
	            elif user_input:
	                file_content = user_input
	            elif tos_selection_index is not None:
	                # Fetch the row once and take both the text and its reference.
	                document = dataset['train'][tos_selection_index]
	                file_content = document['plain_text']
	                if 'summary' in document:
	                    reference_summary = document['summary']
	            else:
	                st.warning("Please upload a PDF, enter some text, or select a document to summarize.")
	                return

	            # A PDF without extractable text (e.g. scanned pages) yields "";
	            # stop here instead of passing empty input to the summarizers.
	            if not file_content or (isinstance(file_content, str) and not file_content.strip()):
	                st.warning("No text could be extracted from the input to summarize.")
	                return

	            summarize = _summarizer_for(radio_selection)
	            if summarize is not None:
	                st.session_state.summary = summarize(file_content)
	                st.session_state.reference_summary = reference_summary

	    # Right column: Displaying text after pressing 'Summarize'
	    with col3:
	        st.write("Summary:")
	        if 'summary' in st.session_state:
	            summary = st.session_state.summary
	            st.write(summary)

	            summary_text = _summary_as_text(summary)
	            if summary_text.strip():
	                # Generate and display word cloud
	                try:
	                    wordcloud = WordCloud(
	                        width=800, height=400, background_color='white', max_words=20
	                    ).generate(summary_text)
	                except ValueError:
	                    # generate() raises ValueError when no words are left to
	                    # plot; skip the cloud rather than crash the page.
	                    wordcloud = None

	                if wordcloud is not None:
	                    # Convert to a PIL image, then to PNG bytes for st.image
	                    image = wordcloud.to_image()
	                    buf = io.BytesIO()
	                    image.save(buf, format='PNG')
	                    st.image(buf.getvalue(), caption='Word Cloud of Summary', use_column_width=True)

	                # Score against the reference stored with this summary, not
	                # against whatever the widgets currently select.
	                reference_summary = None
	                if 'reference_summary' in st.session_state:
	                    reference_summary = st.session_state.reference_summary

	                if reference_summary:
	                    # Calculate ROUGE scores
	                    scores = Rouge().get_scores(summary_text, reference_summary)

	                    # Display ROUGE scores as styled text, one metric per column
	                    metrics = [("ROUGE-1", 'rouge-1'), ("ROUGE-2", 'rouge-2'), ("ROUGE-L", 'rouge-l')]
	                    score_columns = st.columns(3)
	                    for score_column, (label, key) in zip(score_columns, metrics):
	                        with score_column:
	                            st.markdown(
	                                f"<p style='text-align: center; color: black; border: 1px solid #cccccc; padding: 5px; border-radius: 4px;'>{label}: {scores[0][key]['f']:.4f}</p>",
	                                unsafe_allow_html=True,
	                            )


	# Run the app only when executed as a script (e.g. `streamlit run app.py`).
	if __name__ == "__main__":
	    main()