|
import os |
|
import streamlit as st |
|
import torch |
|
import pandas as pd |
|
import time |
|
from tqdm import tqdm |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
|
|
|
# Page heading rendered at the top of the Streamlit app.
st.title(
    "Document Scoring App for Various Risk Factors Categories"
)
|
|
|
|
|
# Risk-factor category -> Hugging Face model identifier.
# load_model() looks a category up here and hands the identifier to
# ``from_pretrained``; the main loop iterates the categories in this order.
model_directories = {
    "finance": "mgmtprofessor/finance_risk_factors",
    "accounting": "mgmtprofessor/accounting_risk_factors",
    "technology": "mgmtprofessor/technology_risk_factors",
    "international": "mgmtprofessor/international_risk_factors",
    "operations": "mgmtprofessor/operations_risk_factors",
    "marketing": "mgmtprofessor/marketing_risk_factors",
    "management": "mgmtprofessor/management_risk_factors",
    "legal": "mgmtprofessor/legal_risk_factors",
}
|
|
|
|
|
# True when PyTorch can see a CUDA device.
# NOTE(review): nothing in the visible code reads this flag — confirm whether
# the models were meant to be moved to the GPU.
use_cuda: bool = torch.cuda.is_available()
|
|
|
|
|
def load_model(category):
    """Load the classifier and tokenizer registered for ``category``.

    The category is looked up in ``model_directories`` to obtain a model
    identifier, which is passed to ``from_pretrained`` for both the tokenizer
    and the sequence-classification model.

    Args:
        category: Key into ``model_directories`` (for example ``"finance"``).

    Returns:
        A ``(model, tokenizer)`` tuple. ``(None, None)`` is returned when the
        category has no registered identifier or when loading raises; in both
        cases the problem is reported to the page through ``st.error``.
    """
    try:
        repo_id = model_directories.get(category)

        # Guard clause: unknown category, nothing to load.
        if not repo_id:
            st.error(f"No Hugging Face model found for {category}")
            return None, None

        loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
        loaded_model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    except Exception as e:
        # Best-effort boundary: surface the failure in the UI and let the
        # caller skip this category instead of aborting the whole run.
        st.error(f"Failed to load model for {category}: {e}")
        return None, None

    return loaded_model, loaded_tokenizer
|
|
|
|
|
def score_document(model, tokenizer, text_data):
    """Classify one or more texts and report the class-1 probability.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``logits`` tensor of shape ``(batch, num_classes)`` with at
            least two classes (column 1 is read).
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors="pt", padding=True,
            truncation=True)`` and returning a mapping of tensors.
        text_data: A single string, or a sequence of strings.

    Returns:
        A ``(prediction, probability_class_1)`` pair. For one document (a
        string or a one-element sequence) these are an ``int`` and a
        ``float``. For several documents they are two lists with one entry
        per document, in input order. An empty sequence yields two empty
        lists.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Nothing to tokenize: return empty results instead of failing downstream.
    if len(text_data) == 0:
        return [], []

    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)

    # Keep the inputs on the same device as the model when both sides expose
    # that information; otherwise leave them untouched.
    device = getattr(model, "device", None)
    if device is not None and hasattr(inputs, "to"):
        inputs = inputs.to(device)

    # Inference only, so gradient tracking is not needed.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # ``tolist`` works for any batch size, whereas ``item`` only accepts a
    # single-element tensor and therefore raised for multi-document input.
    predictions = torch.argmax(probabilities, dim=-1).tolist()
    class_1_probabilities = probabilities[:, 1].tolist()

    # Single-document calls keep returning plain scalars.
    if len(predictions) == 1:
        return predictions[0], class_1_probabilities[0]
    return predictions, class_1_probabilities
|
|
|
|
|
# --- Main page flow: upload a document, score it per category, offer a CSV ---
doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])

# Reference point for the elapsed / remaining-time readout below.
start_time = time.time()

if doc_file is not None:
    # "utf-8-sig" drops a leading byte-order mark if present, and
    # errors="replace" keeps a file with a few invalid bytes scoreable
    # instead of aborting the page with UnicodeDecodeError.
    text_data = doc_file.read().decode("utf-8-sig", errors="replace")

    # One dict per successfully scored category; the DataFrame is built once
    # after the loop rather than re-concatenated on every iteration.
    rows = []

    progress_bar = st.progress(0)
    total_categories = len(model_directories)

    for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
        model, tokenizer = load_model(category)

        # load_model returns (None, None) on failure; skip such categories.
        if model is not None:
            prediction, probability = score_document(model, tokenizer, text_data)
            rows.append({
                "Category": category,
                "Prediction": prediction,
                "Probability": probability,
            })

        progress_bar.progress((i + 1) / total_categories)

        # Linear extrapolation from the average time per finished category.
        elapsed_time = time.time() - start_time
        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")

    # Explicit columns keep the CSV header present even when no category
    # could be scored.
    result_df = pd.DataFrame(rows, columns=["Category", "Prediction", "Probability"])

    csv = result_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download results as CSV",
        data=csv,
        file_name="document_scoring_results.csv",
        mime="text/csv",
    )

    st.success("Document scoring complete!")

st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")
|
|