# Listing metadata: file size 4,521 bytes; revision 6a2ea24.
import os
import streamlit as st
import torch
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Page heading rendered at the top of the Streamlit app
st.title("Document Scoring App for Various Risk Factors Categories")

# Map each risk-factor category to its Hugging Face model identifier.
# Every identifier follows the pattern "mgmtprofessor/<category>_risk_factors";
# the insertion order below is the order in which categories are iterated.
model_directories = {
    category: f"mgmtprofessor/{category}_risk_factors"
    for category in (
        "finance",
        "accounting",
        "technology",
        "international",
        "operations",
        "marketing",
        "management",
        "legal",
    )
}

# Record whether PyTorch can see a CUDA device
use_cuda = torch.cuda.is_available()
# Fetch the classifier and tokenizer registered for a category
def load_model(category):
    """Load the sequence-classification model and tokenizer for *category*.

    The category is looked up in ``model_directories``; the resulting
    identifier is passed to ``from_pretrained`` for both the tokenizer and
    the model.

    Returns:
        ``(model, tokenizer)`` on success. ``(None, None)`` when the category
        has no entry in ``model_directories`` or when anything raises while
        loading; in both cases the problem is reported via ``st.error``.
    """
    try:
        repo_id = model_directories.get(category)

        # Guard clause: unknown category (or empty identifier)
        if not repo_id:
            st.error(f"No Hugging Face model found for {category}")
            return None, None

        # Tokenizer first, then the classifier weights
        loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
        loaded_model = AutoModelForSequenceClassification.from_pretrained(repo_id)
        return loaded_model, loaded_tokenizer
    except Exception as exc:
        # Surface the failure in the UI instead of aborting the whole run
        st.error(f"Failed to load model for {category}: {exc}")
        return None, None
# Function to score one or more documents and return the prediction and
# probability for class '1'
def score_document(model, tokenizer, text_data, max_length=None):
    """Classify document text and report the probability of class 1.

    Args:
        model: Callable classifier. It is invoked as ``model(**inputs)`` and
            must return an object exposing a 2-D ``logits`` tensor with at
            least two label columns (column 1 is read below).
        tokenizer: Callable tokenizer. It is invoked with the list of texts
            and ``return_tensors="pt"``, ``padding=True``,
            ``truncation=True``.
        text_data: A single document (``str``) or a list of documents.
        max_length: Optional truncation length forwarded to the tokenizer.
            When ``None`` (default) no ``max_length`` argument is passed, so
            the tokenizer call is the same as before this parameter existed.

    Returns:
        For exactly one document: ``(prediction, probability)`` where
        ``prediction`` is the index of the highest-probability class (int)
        and ``probability`` is the softmax probability of class 1 (float).
        For a list with several documents: ``(predictions, probabilities)``
        as two lists, one entry per document, in input order.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Only forward max_length when the caller asked for one
    tokenizer_kwargs = {"return_tensors": "pt", "padding": True, "truncation": True}
    if max_length is not None:
        tokenizer_kwargs["max_length"] = max_length

    # Tokenize the input
    inputs = tokenizer(text_data, **tokenizer_kwargs)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the label axis turns logits into probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Predicted class = label with the highest probability, per document
    predictions = torch.argmax(probabilities, dim=-1).tolist()
    class_1_probabilities = probabilities[:, 1].tolist()

    # A single document keeps the scalar return shape; previously a batch of
    # several documents raised because .item() only works on one element.
    if len(predictions) == 1:
        return predictions[0], class_1_probabilities[0]
    return predictions, class_1_probabilities
# Let the user upload a file
doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])

# Track the start time (used for the elapsed / remaining estimates below)
start_time = time.time()

# Make predictions when a file is uploaded
if doc_file is not None:
    # Read the content of the uploaded .txt file. Uploads are not guaranteed
    # to be UTF-8, so fall back to lossy decoding instead of crashing.
    raw_bytes = doc_file.read()
    try:
        text_data = raw_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.warning(
            "The uploaded file is not valid UTF-8; "
            "undecodable bytes were replaced before scoring."
        )
        text_data = raw_bytes.decode("utf-8", errors="replace")

    # Collect one row per successfully scored category; the DataFrame is
    # built once after the loop rather than re-concatenated every iteration.
    result_rows = []

    # Progress bar
    progress_bar = st.progress(0)
    total_categories = len(model_directories)

    for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
        # Load the pre-trained model for the current category
        model, tokenizer = load_model(category)

        # Skip the category if model loading fails (load_model already
        # reported the error in the UI)
        if model is not None:
            # Score the document
            prediction, probability = score_document(model, tokenizer, text_data)
            result_rows.append({
                "Category": category,
                "Prediction": prediction,
                "Probability": probability,
            })

        # Update the progress bar for every category, scored or skipped
        progress_bar.progress((i + 1) / total_categories)

        # Estimate remaining time by extrapolating the average per-category
        # time seen so far
        elapsed_time = time.time() - start_time
        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")

    # Passing explicit columns keeps the header even when no category
    # could be scored
    result_df = pd.DataFrame(result_rows, columns=["Category", "Prediction", "Probability"])

    # Save results to CSV
    csv = result_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download results as CSV",
        data=csv,
        file_name="document_scoring_results.csv",
        mime="text/csv",
    )

    # Display completion message
    st.success("Document scoring complete!")

st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")
|