# app.py (copied from the Hugging Face file viewer; page links: raw, history, blame)
# Last commit: "Update app.py" (6a2ea24, verified) by simpsonjj
# File size: 4.52 kB
import os
import streamlit as st
import torch
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Set up Streamlit app: render the page title.
st.title("Document Scoring App for Various Risk Factors Categories")
# Hugging Face model directories.
# Maps each risk-factor category name to the model identifier that is passed to
# `from_pretrained`. Every identifier has the form
# "mgmtprofessor/<category>_risk_factors", so the mapping is derived from the
# ordered list of category names (insertion order is the scoring order).
model_directories = {
    category_name: f"mgmtprofessor/{category_name}_risk_factors"
    for category_name in (
        "finance",
        "accounting",
        "technology",
        "international",
        "operations",
        "marketing",
        "management",
        "legal",
    )
}
# Check if CUDA is available.
# NOTE(review): this flag is not read anywhere else in the visible source;
# confirm whether GPU inference was intended.
use_cuda = torch.cuda.is_available()
# Function to load a model from Hugging Face
def load_model(category):
    """Load the classifier and tokenizer registered for ``category``.

    The category is looked up in ``model_directories``; the tokenizer is
    loaded first, then the sequence-classification model, both via
    ``from_pretrained``. Problems are reported in the app with ``st.error``
    instead of being raised to the caller.

    Args:
        category: Key into ``model_directories``.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when the
        category has no registered model or anything raised while loading.
    """
    nothing_loaded = (None, None)
    try:
        repo_id = model_directories.get(category)
        if not repo_id:
            st.error(f"No Hugging Face model found for {category}")
            return nothing_loaded
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    except Exception as exc:
        # UI boundary: surface the failure in the page and let the caller skip
        # this category.
        st.error(f"Failed to load model for {category}: {exc}")
        return nothing_loaded
    return model, tokenizer
# Function to score a document and return the prediction and probability for class '1'
def score_document(model, tokenizer, text_data):
    """Classify text and report the predicted class and P(class 1).

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``logits`` tensor with one row per text and one column per
            class. If the object has a ``device`` attribute, the tokenized
            tensors are moved to that device before the forward pass.
        tokenizer: Callable invoked as ``tokenizer(texts, return_tensors="pt",
            padding=True, truncation=True)`` that returns a mapping of
            tensors.
        text_data: A single string, or a list of strings to score as a batch.

    Returns:
        ``(prediction, probability_class_1)``. For one text (a string or a
        one-element list) these are a plain ``int`` and ``float``. For several
        texts they are two lists with one entry per text, in input order.

    Raises:
        IndexError: If the logits have fewer than two class columns, because
            the probability of class index 1 is always read.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Tokenize the input
    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)

    # Keep the inputs on the same device as the model so the forward pass does
    # not fail with a device mismatch when the model lives on a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            key: value.to(device) if hasattr(value, "to") else value
            for key, value in inputs.items()
        }

    # Perform the prediction without building an autograd graph
    with torch.no_grad():
        outputs = model(**inputs)

    # Get probabilities (softmax over the class dimension)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Get the prediction (class with highest probability)
    predictions = torch.argmax(probabilities, dim=-1)
    # Get the probability associated with class '1'
    probability_class_1 = probabilities[:, 1]

    # `.item()` only works on one-element tensors; a batch of several texts is
    # returned as lists instead of raising.
    if predictions.numel() == 1:
        return predictions.item(), probability_class_1.item()
    return predictions.tolist(), probability_class_1.tolist()
# Let the user upload a file
doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])

# Track the start time (used for the elapsed / remaining time estimate below)
start_time = time.time()

# Make predictions when a file is uploaded
if doc_file is not None:
    # Read the content of the uploaded .txt file. "utf-8-sig" decodes plain
    # UTF-8 and also strips a leading byte-order mark so it is not fed to the
    # tokenizer as text.
    raw_bytes = doc_file.read()
    try:
        text_data = raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Best effort: score the readable part instead of crashing the app.
        st.warning(
            "The uploaded file is not valid UTF-8; undecodable bytes were "
            "replaced before scoring."
        )
        text_data = raw_bytes.decode("utf-8-sig", errors="replace")

    # One dict per successfully scored category; the DataFrame is built once
    # after the loop instead of concatenating onto an empty frame each time.
    result_rows = []

    # Progress bar
    progress_bar = st.progress(0)
    total_categories = len(model_directories)

    for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
        # Load the pre-trained model for the current category
        model, tokenizer = load_model(category)

        # Skip the category if model loading fails (load_model already
        # reported the error in the page)
        if model is not None:
            # Score the document
            prediction, probability = score_document(model, tokenizer, text_data)
            result_rows.append(
                {
                    "Category": category,
                    "Prediction": prediction,
                    "Probability": probability,
                }
            )

        # Update the progress bar (also for skipped categories, so it always
        # reaches 100%)
        progress_bar.progress((i + 1) / total_categories)

        # Estimate remaining time by extrapolating the average time per
        # category processed so far
        elapsed_time = time.time() - start_time
        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")

    # Passing `columns` keeps the CSV header even when no category was scored
    result_df = pd.DataFrame(result_rows, columns=["Category", "Prediction", "Probability"])

    # Save results to CSV
    csv = result_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download results as CSV",
        data=csv,
        file_name="document_scoring_results.csv",
        mime="text/csv",
    )

    # Display completion message
    st.success("Document scoring complete!")

st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")