Files changed (1) hide show
  1. app.py +123 -123
app.py CHANGED
@@ -1,123 +1,123 @@
1
- import os
2
- import streamlit as st
3
- import torch
4
- import pandas as pd
5
- import time
6
- from tqdm import tqdm
7
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
8
-
9
- # Set up Streamlit app
10
- st.title("Document Scoring App for Various Risk Factors Categories")
11
-
12
- # Hugging Face model directories
13
- model_directories = {
14
- 'finance': 'mgmtprofessor/finance_risk_factors',
15
- 'accounting': 'mgmtprofessor/accounting_risk_factors',
16
- 'technology': 'mgmtprofessor/technology_risk_factors',
17
- 'international': 'mgmtprofessor/international_risk_factors',
18
- 'operations': 'mgmtprofessor/operations_risk_factors',
19
- 'marketing': 'mgmtprofessor/marketing_risk_factors',
20
- 'management': 'mgmtprofessor/management_risk_factors',
21
- 'legal': 'mgmtprofessor/legal_risk_factors'
22
- }
23
-
24
- # Check if CUDA is available
25
- use_cuda = torch.cuda.is_available()
26
-
27
- # Function to load a model from Hugging Face
28
- def load_model(category):
29
- try:
30
- # Load the model from Hugging Face based on the category
31
- model_name = model_directories.get(category)
32
- if model_name:
33
- tokenizer = AutoTokenizer.from_pretrained(model_name)
34
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
35
- return model, tokenizer
36
- else:
37
- st.error(f"No Hugging Face model found for {category}")
38
- return None, None
39
- except Exception as e:
40
- st.error(f"Failed to load model for {category}: {e}")
41
- return None, None
42
-
43
- # Function to score a document and return the prediction and probability for class '1'
44
- def score_document(model, tokenizer, text_data):
45
- if isinstance(text_data, str):
46
- text_data = [text_data]
47
-
48
- # Tokenize the input
49
- inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)
50
-
51
- # Perform the prediction
52
- with torch.no_grad():
53
- outputs = model(**inputs)
54
-
55
- # Get probabilities (softmax)
56
- probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
57
-
58
- # Get the prediction (class with highest probability)
59
- predictions = torch.argmax(probabilities, dim=1)
60
-
61
- # Get the probability associated with class '1'
62
- probability_class_1 = probabilities[:, 1].item()
63
-
64
- return predictions.item(), probability_class_1
65
-
66
- # Let the user upload a file
67
- doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])
68
-
69
- # Track the start time
70
- start_time = time.time()
71
-
72
- # Make predictions when a file is uploaded
73
- if doc_file is not None:
74
- # Read the content of the uploaded .txt file
75
- text_data = doc_file.read().decode("utf-8")
76
-
77
- # Initialize an empty DataFrame for results
78
- result_df = pd.DataFrame(columns=["Category", "Prediction", "Probability"])
79
-
80
- # Progress bar
81
- progress_bar = st.progress(0)
82
- total_categories = len(model_directories)
83
-
84
- for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
85
- # Load the pre-trained model for the current category
86
- model, tokenizer = load_model(category)
87
-
88
- # Skip the category if model loading fails
89
- if model is not None:
90
- # Score the document
91
- prediction, probability = score_document(model, tokenizer, text_data)
92
-
93
- # Create a DataFrame for the current result
94
- new_row = pd.DataFrame({
95
- "Category": [category],
96
- "Prediction": [prediction],
97
- "Probability": [probability]
98
- })
99
-
100
- # Use pd.concat to append the new row to the DataFrame
101
- result_df = pd.concat([result_df, new_row], ignore_index=True)
102
-
103
- # Update the progress bar
104
- progress_bar.progress((i + 1) / total_categories)
105
-
106
- # Estimate remaining time
107
- elapsed_time = time.time() - start_time
108
- estimated_total_time = (elapsed_time / (i + 1)) * total_categories
109
- st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
110
-
111
- # Save results to CSV
112
- csv = result_df.to_csv(index=False).encode('utf-8')
113
- st.download_button(
114
- label="Download results as CSV",
115
- data=csv,
116
- file_name="document_scoring_results.csv",
117
- mime="text/csv",
118
- )
119
-
120
- # Display completion message
121
- st.success("Document scoring complete!")
122
-
123
- st.write("Note: Ensure the uploaded document is in .txt format containing text data.")
 
1
+ import os
2
+ import streamlit as st
3
+ import torch
4
+ import pandas as pd
5
+ import time
6
+ from tqdm import tqdm
7
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
8
+
9
# Set up Streamlit app
st.title("Document Scoring App for Various Risk Factors Categories")

# Hugging Face model directories: each risk category maps to the
# "mgmtprofessor/<category>_risk_factors" repository id.
model_directories = {
    category: f"mgmtprofessor/{category}_risk_factors"
    for category in (
        "finance",
        "accounting",
        "technology",
        "international",
        "operations",
        "marketing",
        "management",
        "legal",
    )
}

# Check if CUDA is available
use_cuda = torch.cuda.is_available()


# Function to load a model from Hugging Face
def load_model(category):
    """Load the classifier and tokenizer registered for *category*.

    The repository id is looked up in ``model_directories``. Problems are
    reported to the user through ``st.error`` instead of being raised.

    Args:
        category: Key into ``model_directories`` (e.g. ``"finance"``).

    Returns:
        A ``(model, tokenizer)`` tuple on success, or ``(None, None)`` when
        the category has no registered repository or loading fails.
    """
    try:
        repo_id = model_directories.get(category)

        # Guard clause: nothing registered for this category.
        if not repo_id:
            st.error(f"No Hugging Face model found for {category}")
            return None, None

        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    except Exception as err:
        # UI boundary: surface the failure in the page and let the caller
        # skip this category rather than aborting the whole run.
        st.error(f"Failed to load model for {category}: {err}")
        return None, None

    return model, tokenizer


# Function to score a document and return the prediction and probability for class '1'
def score_document(model, tokenizer, text_data):
    """Classify one or more documents and report the class-1 probability.

    Args:
        model: Sequence-classification model; called as ``model(**inputs)``
            and expected to return an object with a ``logits`` attribute
            holding at least two class columns.
        tokenizer: Callable that turns a list of strings into a mapping of
            PyTorch tensors (invoked with ``return_tensors="pt"``,
            ``padding=True``, ``truncation=True``).
        text_data: A single string, or a list of strings to score as a batch.

    Returns:
        ``(prediction, probability)``. For a single document these are an
        ``int`` class index and the ``float`` probability of class 1. For a
        list with several documents they are two parallel lists with one
        entry per document.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Tokenize the input; truncation=True asks the tokenizer to cut over-long text
    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)

    # Keep the input tensors on the same device as the model weights so a
    # model that lives on the GPU does not fail with a device mismatch.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Perform the prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Get probabilities (softmax over the class dimension)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the prediction (class with highest probability)
    predictions = torch.argmax(probabilities, dim=-1)

    # Get the probability associated with class '1'
    probability_class_1 = probabilities[:, 1]

    # Single document: scalars, exactly as before. Several documents used to
    # crash on .item(); return one entry per document instead.
    if predictions.numel() == 1:
        return predictions.item(), probability_class_1.item()
    return predictions.tolist(), probability_class_1.tolist()


# Let the user upload a file
doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])

# Track the start time
start_time = time.time()

# Make predictions when a file is uploaded
if doc_file is not None:
    # Read the content of the uploaded .txt file. A file that is not valid
    # UTF-8 is reported to the user instead of crashing the script.
    try:
        text_data = doc_file.read().decode("utf-8")
    except UnicodeDecodeError:
        st.error("The uploaded file is not valid UTF-8 text.")
        text_data = None

    if text_data is not None:
        # Collect one record per scored category; the DataFrame is built
        # once after the loop instead of re-concatenating on every pass.
        rows = []

        # Progress bar
        progress_bar = st.progress(0)
        total_categories = len(model_directories)

        for i, category in enumerate(tqdm(model_directories, desc="Scoring documents")):
            # Load the pre-trained model for the current category
            model, tokenizer = load_model(category)

            # Skip the category if model loading fails
            if model is not None:
                # Score the document
                prediction, probability = score_document(model, tokenizer, text_data)
                rows.append({
                    "Category": category,
                    "Prediction": prediction,
                    "Probability": probability,
                })

            # Update the progress bar (also for skipped categories, so the
            # bar always reaches 100%)
            progress_bar.progress((i + 1) / total_categories)

            # Estimate remaining time from the average time per category so far
            elapsed_time = time.time() - start_time
            estimated_total_time = (elapsed_time / (i + 1)) * total_categories
            st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")

        result_df = pd.DataFrame(rows, columns=["Category", "Prediction", "Probability"])

        if rows:
            # Save results to CSV
            csv = result_df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download results as CSV",
                data=csv,
                file_name="document_scoring_results.csv",
                mime="text/csv",
            )

            # Display completion message
            st.success("Document scoring complete!")
        else:
            # Every model failed to load; the individual errors were
            # already shown by load_model.
            st.warning("No category could be scored; see the errors above.")

st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")