Spaces:

mgmtprofessor
/

risk_factors_scoring

Sleeping

File size: 4,521 Bytes

6a2ea24

import os
import streamlit as st
import torch
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set up Streamlit app: render the page title at the top of the UI
st.title("Document Scoring App for Various Risk Factors Categories")

# Hugging Face model directories
# Each category maps to the repository "mgmtprofessor/<category>_risk_factors".
# The tuple order below is the order in which the categories are iterated.
model_directories = {
    category_name: f"mgmtprofessor/{category_name}_risk_factors"
    for category_name in (
        "finance",
        "accounting",
        "technology",
        "international",
        "operations",
        "marketing",
        "management",
        "legal",
    )
}

# Check if CUDA is available
# NOTE(review): this flag is only computed here; nothing in the visible code
# reads it — confirm whether GPU placement was intended.
use_cuda = torch.cuda.is_available()

# Function to load a model from Hugging Face
def load_model(category):
    """Load the classifier and tokenizer registered for ``category``.

    The category is looked up in ``model_directories``; the tokenizer is
    loaded first, then the sequence-classification model, both via
    ``from_pretrained``.

    Args:
        category: Key into ``model_directories``.

    Returns:
        A ``(model, tokenizer)`` tuple on success. ``(None, None)`` when the
        category has no (truthy) entry or when anything raises while loading;
        in both cases the problem is reported with ``st.error``.
    """
    try:
        repo_id = model_directories.get(category)
        if not repo_id:
            # Unknown category or empty mapping entry: report and give up.
            st.error(f"No Hugging Face model found for {category}")
            return None, None

        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        classifier = AutoModelForSequenceClassification.from_pretrained(repo_id)
        return classifier, tokenizer
    except Exception as e:
        # Any failure is surfaced in the UI instead of aborting the script,
        # so the caller can skip this category.
        st.error(f"Failed to load model for {category}: {e}")
        return None, None

# Function to score a document and return the prediction and probability for class '1'
def score_document(model, tokenizer, text_data):
    """Classify one or more documents and report the class-1 probability.

    Args:
        model: Callable invoked as ``model(**inputs)`` whose result exposes a
            ``logits`` tensor with one row per document and at least two
            columns (column 1 is read as class '1').
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors="pt", padding=True, truncation=True)``
            returning a mapping of tensors.
        text_data: A single string, or a sequence of strings to score as one
            batch.

    Returns:
        For a single document (a string or a one-element sequence):
        ``(prediction, probability_class_1)`` as a Python ``int`` and
        ``float``. For a batch of several documents: two lists with one entry
        per document, in input order.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Tokenize the input
    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)

    # Keep the inputs on the same device as the model. Models without a
    # ``device`` attribute are left alone; on CPU this is a no-op.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Perform the prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Get probabilities (softmax over the class dimension)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the prediction (class with highest probability) per document
    predictions = torch.argmax(probabilities, dim=-1)

    # Get the probability associated with class '1' per document
    probability_class_1 = probabilities[:, 1]

    # ``.item()`` only works on one-element tensors, so it is reserved for the
    # single-document case; larger batches are returned as lists.
    if len(text_data) == 1:
        return predictions.item(), probability_class_1.item()
    return predictions.tolist(), probability_class_1.tolist()

# Let the user upload a file
doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])

# Decode the upload up front so an unreadable file produces a message in the
# UI instead of an unhandled traceback. ``text_data`` stays None when there is
# nothing to score.
text_data = None
if doc_file is not None:
    try:
        # "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order
        # mark, so the BOM does not end up in the scored text.
        text_data = doc_file.read().decode("utf-8-sig")
    except UnicodeDecodeError as e:
        st.error(f"Could not read the uploaded file as UTF-8 text: {e}")
    else:
        if not text_data.strip():
            st.warning("The uploaded document is empty; nothing to score.")
            text_data = None

# Track the start time (taken right before scoring so the estimate covers
# only model loading and inference)
start_time = time.time()

# Make predictions when a readable, non-empty file was uploaded
if text_data is not None:
    # One dict per successfully scored category; the DataFrame is built once
    # after the loop instead of being re-concatenated on every iteration.
    result_rows = []

    # Progress bar
    progress_bar = st.progress(0)
    total_categories = len(model_directories)

    for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
        # Load the pre-trained model for the current category
        model, tokenizer = load_model(category)

        # Skip the category if model loading fails
        if model is not None:
            # Score the document
            prediction, probability = score_document(model, tokenizer, text_data)
            result_rows.append({
                "Category": category,
                "Prediction": prediction,
                "Probability": probability,
            })

        # Drop the references before the next model is loaded so two models
        # are not kept alive at the same time.
        del model, tokenizer

        # Update the progress bar
        progress_bar.progress((i + 1) / total_categories)

        # Estimate remaining time from the average time per finished category
        elapsed_time = time.time() - start_time
        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")

    result_df = pd.DataFrame(result_rows, columns=["Category", "Prediction", "Probability"])

    if result_df.empty:
        # Every category failed to load; the individual errors were already
        # shown by load_model, so do not claim success or offer an empty file.
        st.warning("No category could be scored, so there are no results to download.")
    else:
        # Save results to CSV
        csv = result_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download results as CSV",
            data=csv,
            file_name="document_scoring_results.csv",
            mime="text/csv",
        )

        # Display completion message
        st.success("Document scoring complete!")

st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")