Spaces:

CodeHima
/

TOSRoberta

Sleeping

App Files Files Community

TOSRoberta / app.py

CodeHima

Update app.py

ba8276b verified 8 months ago

raw

history blame contribute delete

4.3 kB

	import importlib.util
	import subprocess
	import sys

	import pandas as pd
	import streamlit as st
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	# Ensure the spaCy model is downloaded (presumably required by the utils
	# imports that follow -- confirm against utils/text_processing.py).
	# Streamlit re-executes this script on every user interaction, so only
	# spawn the download when the model package is not importable yet.
	if importlib.util.find_spec("en_core_web_sm") is None:
	    # sys.executable is the interpreter running this app; a bare "python"
	    # on PATH may belong to a different environment or not exist at all.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=False,  # best effort, as before: a failed download does not abort startup
	    )
	    # Make the import system notice the freshly installed package.
	    importlib.invalidate_caches()

	from utils.text_processing import extract_text_from_pdf, split_into_clauses
	from utils.model_utils import predict_unfairness

	# Configure the page: title, favicon and a wide layout.
	st.set_page_config(page_title="Terms of Service Analyzer", page_icon="📜", layout="wide")

	# Load model and tokenizer from Hugging Face
	@st.cache_resource
	def load_model(model_name="CodeHima/Tos-Roberta"):
	    """Load the sequence-classification model and its matching tokenizer.

	    The result is wrapped in ``st.cache_resource`` so Streamlit reruns of the
	    script reuse the already loaded objects.

	    Args:
	        model_name: Hugging Face model identifier (or local path) passed to
	            both ``from_pretrained`` calls. Defaults to the Tos-Roberta
	            checkpoint this app was written for.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    model = AutoModelForSequenceClassification.from_pretrained(model_name)
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    return model, tokenizer

	model, tokenizer = load_model()

	st.title("📜 Terms of Service Analyzer")

	# Two input routes: an uploaded PDF/text file, or text pasted into the box.
	uploaded_file = st.file_uploader("Choose a PDF or text file", type=["pdf", "txt"])
	text_input = st.text_area("Or paste your Terms of Service here")

	# Whitespace-only pasted text counts as "no input".
	has_pasted_text = bool(text_input and text_input.strip())

	if uploaded_file is not None or has_pasted_text:
	    # Progress bar plus a status line that are both cleared once analysis is done.
	    progress_bar = st.progress(0)
	    status_text = st.empty()

	    # An uploaded file takes precedence over pasted text.
	    if uploaded_file is not None:
	        status_text.text("Reading file...")
	        progress_bar.progress(10)
	        if uploaded_file.type == "application/pdf":
	            text = extract_text_from_pdf(uploaded_file)
	        else:
	            # Substitute undecodable bytes instead of crashing on uploads
	            # that are not valid UTF-8.
	            text = uploaded_file.getvalue().decode("utf-8", errors="replace")
	    else:
	        text = text_input

	    status_text.text("Splitting into clauses...")
	    progress_bar.progress(30)
	    clauses = split_into_clauses(text)
	    total_clauses = len(clauses)

	    if total_clauses == 0:
	        # Nothing to classify: without this guard the summary below would
	        # fail on a DataFrame with no "label" column and divide by zero.
	        progress_bar.empty()
	        status_text.empty()
	        st.warning("No clauses could be extracted from the provided input.")
	    else:
	        results = []
	        for position, clause in enumerate(clauses, start=1):
	            status_text.text(f"Analyzing clause {position} of {total_clauses}...")
	            # Clause analysis occupies the 30-90% stretch of the progress bar.
	            progress = min(30 + int(position / total_clauses * 60), 90)
	            progress_bar.progress(progress)
	            label, probabilities = predict_unfairness(clause, model, tokenizer)
	            results.append({
	                "clause": clause,
	                "label": label,
	                "probabilities": probabilities,
	            })

	        status_text.text("Preparing results...")
	        progress_bar.progress(100)

	        df = pd.DataFrame(results)

	        # Calculate summary; plain ints keep the metric widgets and the
	        # percentage formatting independent of NumPy scalar types.
	        label_counts = df["label"].value_counts()
	        clearly_fair = int(label_counts.get("clearly_fair", 0))
	        potentially_unfair = int(label_counts.get("potentially_unfair", 0))
	        clearly_unfair = int(label_counts.get("clearly_unfair", 0))

	        # Clear the progress bar and status text
	        progress_bar.empty()
	        status_text.empty()

	        # Display summary
	        st.header("Summary")
	        col1, col2, col3 = st.columns(3)
	        col1.metric("Clearly Fair", clearly_fair, f"{clearly_fair / total_clauses:.1%}")
	        col2.metric("Potentially Unfair", potentially_unfair, f"{potentially_unfair / total_clauses:.1%}")
	        col3.metric("Clearly Unfair", clearly_unfair, f"{clearly_unfair / total_clauses:.1%}")

	        # Recommendation: any clearly unfair clause, or more than 30%
	        # potentially unfair ones, triggers the strongest warning.
	        if clearly_unfair > 0 or potentially_unfair / total_clauses > 0.3:
	            st.warning("⚠️ Exercise caution! This ToS contains unfair or potentially unfair clauses.")
	        elif potentially_unfair > 0:
	            st.info("ℹ️ Proceed with awareness. This ToS contains some potentially unfair clauses.")
	        else:
	            st.success("✅ This ToS appears to be fair. Always read carefully nonetheless.")

	        # Display results, one colored box per clause.
	        st.header("Detailed Analysis")
	        for result in results:
	            label = result["label"]
	            headline = f"{label.replace('_', ' ').title()}: {result['clause']}"
	            if label == "clearly_fair":
	                st.success(headline)
	            elif label == "potentially_unfair":
	                st.warning(headline)
	            else:
	                st.error(headline)

	            # NOTE(review): assumes probabilities are ordered
	            # (clearly fair, potentially unfair, clearly unfair) -- confirm
	            # against predict_unfairness.
	            scores = result["probabilities"]
	            st.write(f"Probabilities: Clearly Fair: {scores[0]:.2f}, "
	                     f"Potentially Unfair: {scores[1]:.2f}, "
	                     f"Clearly Unfair: {scores[2]:.2f}")
	            st.divider()
	else:
	    st.info("Please upload a file or paste your Terms of Service to begin analysis.")