File size: 4,521 Bytes
6a2ea24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import streamlit as st
import torch
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Page heading displayed at the top of the Streamlit app
st.title("Document Scoring App for Various Risk Factors Categories")

# Map every risk category to its Hugging Face repository. All repositories
# share the pattern "mgmtprofessor/<category>_risk_factors"; the tuple order
# below is the insertion order of the resulting dict.
model_directories = {
    category: f"mgmtprofessor/{category}_risk_factors"
    for category in (
        "finance",
        "accounting",
        "technology",
        "international",
        "operations",
        "marketing",
        "management",
        "legal",
    )
}

# Record whether PyTorch can see a CUDA device
use_cuda = torch.cuda.is_available()

# Function to load a model from Hugging Face
def load_model(category):
    """Fetch the classifier and tokenizer registered for ``category``.

    The category is looked up in ``model_directories`` and both pieces are
    loaded with ``from_pretrained``. Problems are reported through
    ``st.error`` instead of being raised.

    Returns:
        A ``(model, tokenizer)`` pair, or ``(None, None)`` when the category
        has no entry or loading raised an exception.
    """
    try:
        repo_id = model_directories.get(category)

        # Guard clause: unknown category, nothing to load.
        if not repo_id:
            st.error(f"No Hugging Face model found for {category}")
            return None, None

        # Tokenizer is loaded before the model, same order as before.
        loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
        loaded_model = AutoModelForSequenceClassification.from_pretrained(repo_id)
        return loaded_model, loaded_tokenizer
    except Exception as e:
        st.error(f"Failed to load model for {category}: {e}")
        return None, None

# Function to score a document and return the prediction and probability for class '1'
def score_document(model, tokenizer, text_data):
    """Classify one or more texts and report the probability of class '1'.

    Args:
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose a 2-D ``logits`` tensor
            (one row per text, one column per class, at least two classes).
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors="pt", padding=True, truncation=True)``
            whose result can be unpacked with ``**``.
        text_data: A single string or a list of strings.

    Returns:
        A ``(prediction, probability)`` pair. For a single text (a string or
        a one-element list) these are a Python ``int`` (index of the most
        probable class) and a ``float`` (softmax probability of class 1).
        For several texts both members are lists with one entry per text.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Tokenize the input
    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)

    # Run inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to per-class probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Class with the highest probability for every text
    predictions = torch.argmax(probabilities, dim=1)

    # Probability associated with class '1' for every text
    probability_class_1 = probabilities[:, 1]

    # ``Tensor.item()`` only works on one-element tensors, so a batch of
    # several texts is returned as lists instead of raising.
    if predictions.numel() == 1:
        return predictions.item(), probability_class_1.item()
    return predictions.tolist(), probability_class_1.tolist()

# Let the user upload a file
doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])

# Track the start time
start_time = time.time()

# Make predictions when a file is uploaded
if doc_file is not None:
    # Read the uploaded .txt file. A file that is not valid UTF-8 is reported
    # to the user instead of aborting the app with a traceback.
    try:
        text_data = doc_file.read().decode("utf-8")
    except UnicodeDecodeError:
        st.error("Could not read the uploaded file: it is not valid UTF-8 text.")
        text_data = None

    if text_data is not None:
        # Collect one dict per scored category and build the DataFrame once
        # at the end; growing a DataFrame with pd.concat inside the loop
        # copies all previous rows on every iteration.
        result_columns = ["Category", "Prediction", "Probability"]
        result_rows = []

        # Progress bar
        progress_bar = st.progress(0)
        total_categories = len(model_directories)

        for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
            # Load the pre-trained model for the current category
            model, tokenizer = load_model(category)

            # Skip the category if model loading fails
            if model is not None:
                # Score the document
                prediction, probability = score_document(model, tokenizer, text_data)
                result_rows.append({
                    "Category": category,
                    "Prediction": prediction,
                    "Probability": probability,
                })

            # Update the progress bar
            progress_bar.progress((i + 1) / total_categories)

            # Estimate remaining time by extrapolating the average time per
            # category seen so far to the full category count
            elapsed_time = time.time() - start_time
            estimated_total_time = (elapsed_time / (i + 1)) * total_categories
            st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")

        # Passing the column list keeps the header even when no category
        # could be scored.
        result_df = pd.DataFrame(result_rows, columns=result_columns)

        # Save results to CSV
        csv = result_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download results as CSV",
            data=csv,
            file_name="document_scoring_results.csv",
            mime="text/csv",
        )

        # Display completion message
        st.success("Document scoring complete!")

st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")