mlkorra committed on
Commit
a20a7ca
·
verified ·
1 Parent(s): dcb2841

Add app, utils classifier

Browse files
Files changed (3) hide show
  1. app.py +18 -0
  2. assets/style.css +80 -0
  3. utils/util_classifier.py +264 -0
app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def main():
    """Configure the Streamlit page and hand control to the selected sub-page.

    Sets the global page options, registers the three pages of the app
    (Home, Classifier, Project Wiki) and runs whichever page the
    navigation widget resolves to.
    """
    st.set_page_config(page_title="ConstructAI", page_icon="🏗️", layout="wide")

    # Pages are listed in the order they are passed to st.navigation.
    pages = [
        st.Page("pages/Home.py", icon="🏠"),
        st.Page('pages/Classifier.py', title='Classifier', icon="🛠️"),
        st.Page('pages/Project_Wiki.py', title='Project Wiki', icon=":material/dashboard:"),
    ]
    st.navigation(pages).run()


# Only launch the app when this file is executed as a script.
if __name__ == "__main__":
    main()
assets/style.css ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* ==========================================================================
   Buttons (Streamlit .stButton wrapper)
   ========================================================================== */
.stButton > button {
    padding: 0.5rem 1rem;
    border: none;
    border-radius: 5px;
    color: white;
    background-color: #4CAF50;
    transition: all 0.3s;
}

/* Darker green and a 2px lift on hover */
.stButton > button:hover {
    background-color: #45a049;
    transform: translateY(-2px);
}

/* ==========================================================================
   Hero section
   ========================================================================== */
.hero-section {
    margin: 2rem 0;
    padding: 2rem;
    border-radius: 10px;
    text-align: center;
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
}

/* ==========================================================================
   Cards: feature cards and result cards share one look
   ========================================================================== */
.feature-card,
.result-card {
    margin: 1rem 0;
    padding: 1.5rem;
    border-radius: 8px;
    text-align: center;
    background: white;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

/* ==========================================================================
   Results display: confidence meter
   ========================================================================== */
.confidence-meter {
    position: relative;
    height: 20px;
    margin: 1rem 0;
    border-radius: 10px;
    background: #f0f0f0;
}

/* Filled portion of the meter; width changes are animated */
.meter-fill {
    height: 100%;
    border-radius: 10px;
    background: linear-gradient(90deg, #4CAF50, #45a049);
    transition: width 0.5s ease-in-out;
}

/* ==========================================================================
   Probability bars
   ========================================================================== */
.prob-bar {
    display: flex;
    align-items: center;
    margin: 0.5rem 0;
}

/* Track of a single bar; stretches to fill the flex row */
.bar {
    flex-grow: 1;
    height: 20px;
    margin: 0 1rem;
    border-radius: 10px;
    background: #f0f0f0;
    overflow: hidden;
}

/* Filled portion of a bar; width changes are animated */
.fill {
    height: 100%;
    background: #4CAF50;
    transition: width 0.5s ease-in-out;
}
utils/util_classifier.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ import torch
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ import joblib
6
+ import pandas as pd
7
+ from datetime import datetime
8
+ import logging
9
+
10
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TextClassificationPipeline:
    """Classify text with a BERT sequence classifier or a TF-IDF baseline.

    The backend is chosen once, at construction time, through ``method``:
    ``'bertbased'`` loads a tokenizer and a sequence-classification model,
    any other value loads the pickled TF-IDF vectorizer and baseline model.
    Both backends share one label encoder that maps class indices back to
    the original label values.
    """

    # Token limit handed to the tokenizer on the BERT path.
    _MAX_SEQUENCE_LENGTH = 512
    # Texts longer than this are shortened (plus '...') in result dicts.
    _PREVIEW_CHARS = 200

    def __init__(self, model_path='./models', method='bertbased'):
        """
        Initialize the classification pipeline
        Args:
            model_path: Directory containing ``bert-model/``,
                ``baseline-model/`` and ``label_encoder.pkl``
            method: 'bertbased' or 'baseline' (any value other than
                'bertbased' selects the baseline branch)
        Raises:
            Exception: whatever the underlying loaders raise; the error is
                logged and re-raised unchanged.
        """
        try:
            self.method = method

            if method == 'bertbased':
                logger.info("Loading BERT model...")
                # NOTE: the tokenizer always comes from the stock
                # 'bert-base-uncased' checkpoint, not from model_path.
                self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
                self.model = AutoModelForSequenceClassification.from_pretrained(f"{model_path}/bert-model")
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                self.model.to(self.device)
                self.model.eval()
                logger.info(f"BERT model loaded successfully. Using device: {self.device}")
            else:
                logger.info("Loading baseline model...")
                # NOTE(security): joblib.load unpickles arbitrary objects;
                # model_path must only ever point at trusted files.
                self.tfidf = joblib.load(f"{model_path}/baseline-model/tfidf_vectorizer.pkl")
                self.baseline_model = joblib.load(f"{model_path}/baseline-model/baseline_model.pkl")
                logger.info("Baseline model loaded successfully")

            # Load label encoder for both methods
            self.label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")

        except Exception as e:
            logger.error(f"Error initializing model: {str(e)}")
            raise

    def preprocess_text(self, text):
        """Clean and preprocess text.

        A string has its whitespace collapsed and trimmed and is then
        title-cased (first letter of every word capitalised; the original
        author's note says this is meant to match the training data
        format). A list or tuple is cleaned element by element and returned
        as a list; this matters because ``predict`` always passes a list.
        Any other value is returned unchanged.
        """
        if isinstance(text, str):
            # split() + join trims both ends and collapses whitespace runs.
            return ' '.join(text.split()).title()
        if isinstance(text, (list, tuple)):
            return [self.preprocess_text(item) for item in text]
        return text

    def preprocess(self, text):
        """
        Preprocess the input text based on method
        Args:
            text: A single string or a list of strings
        Returns:
            For 'bertbased': dict of tokenizer tensors moved to
            ``self.device``. Otherwise: the result of
            ``self.tfidf.transform`` on the cleaned text(s).
        Raises:
            Exception: any tokenizer/vectorizer error, logged and re-raised.
        """
        try:
            # Clean text first
            text = self.preprocess_text(text)

            if self.method == 'bertbased':
                # BERT preprocessing
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    padding=True,
                    max_length=self._MAX_SEQUENCE_LENGTH,
                    return_tensors='pt'
                )
                return {k: v.to(self.device) for k, v in encodings.items()}

            # Baseline preprocessing (the vectorizer expects an iterable)
            return self.tfidf.transform([text] if isinstance(text, str) else text)

        except Exception as e:
            logger.error(f"Error in preprocessing: {str(e)}")
            raise

    def _run_model(self, texts):
        """Return ``(class_indices, probability_matrix)`` for ``texts``."""
        inputs = self.preprocess(texts)

        if self.method == 'bertbased':
            with torch.no_grad():
                outputs = self.model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=-1)
                predictions = torch.argmax(probabilities, dim=-1)
            return predictions.cpu().numpy(), probabilities.cpu().numpy()

        predictions = self.baseline_model.predict(inputs)
        probabilities = self.baseline_model.predict_proba(inputs)
        return predictions, probabilities

    def _predict(self, text, return_probability, title_case):
        """Shared implementation behind ``predict`` and ``predict_old``.

        ``title_case`` controls whether label names are title-cased
        (``predict``) or returned exactly as the label encoder yields them
        (``predict_old``).
        """
        try:
            # Handle both single string and list of strings
            if isinstance(text, str):
                text = [text]

            # Nothing to classify: skip the models instead of letting the
            # tokenizer / vectorizer fail on an empty batch.
            if len(text) == 0:
                return []

            predictions, probabilities = self._run_model(text)

            # Convert numeric predictions to original labels
            predicted_labels = self.label_encoder.inverse_transform(predictions)
            if title_case:
                predicted_labels = [label.title() for label in predicted_labels]

            if not return_probability:
                return predicted_labels[0] if len(text) == 1 else predicted_labels

            # Resolve the name of every probability column once per batch
            # rather than once per class per sample.
            class_names = self.label_encoder.inverse_transform(
                list(range(probabilities.shape[1]))
            )
            if title_case:
                class_names = [name.title() for name in class_names]

            model_name = 'BERT' if self.method == 'bertbased' else 'Baseline'
            results = []
            for t, label, prob, probs in zip(text, predicted_labels,
                                             probabilities.max(axis=1),
                                             probabilities):
                results.append({
                    # Truncate long text so the payload stays small
                    'text': (t[:self._PREVIEW_CHARS] + '...'
                             if len(t) > self._PREVIEW_CHARS else t),
                    'predicted_label': label,
                    'confidence': float(prob),
                    'model_type': self.method,
                    'probabilities': {
                        name: float(p) for name, p in zip(class_names, probs)
                    },
                    'timestamp': datetime.now().isoformat(),
                    'metadata': {
                        'model_name': model_name,
                        'text_length': len(t),
                        'preprocessing_steps': ['cleaning', 'tokenization'],
                    },
                })

            return results[0] if len(text) == 1 else results

        except Exception as e:
            logger.error(f"Error in prediction: {str(e)}")
            raise

    def predict(self, text, return_probability=False):
        """
        Predict using either BERT or baseline model
        Args:
            text: Input text or list of texts
            return_probability: Whether to return probability scores
        Returns:
            Title-cased label(s) when ``return_probability`` is False;
            otherwise result dict(s) with keys ``text``,
            ``predicted_label``, ``confidence``, ``model_type``,
            ``probabilities``, ``timestamp`` and ``metadata``. A single
            item (string input or one-element list) yields a single
            label/dict rather than a list. Empty input yields ``[]``.
        Raises:
            Exception: any preprocessing/model error, logged and re-raised.
        """
        return self._predict(text, return_probability, title_case=True)

    def predict_old(self, text, return_probability=False):
        """
        Predict using either BERT or baseline model (legacy variant)

        Same as ``predict`` except that labels are returned exactly as the
        label encoder produces them, without title-casing.
        Args:
            text: Input text or list of texts
            return_probability: Whether to return probability scores
        Returns:
            Predictions with metadata (see ``predict``)
        Raises:
            Exception: any preprocessing/model error, logged and re-raised.
        """
        return self._predict(text, return_probability, title_case=False)

    def get_model_info(self):
        """Return model information

        Returns:
            dict with ``model_type``, ``model_name``, ``device``,
            ``max_sequence_length`` (None for the baseline),
            ``number_of_classes`` and ``classes``.
        """
        is_bert = self.method == 'bertbased'
        return {
            'model_type': self.method,
            'model_name': 'BERT' if is_bert else 'Baseline',
            # self.device only exists on the BERT path
            'device': str(self.device) if is_bert else 'CPU',
            'max_sequence_length': self._MAX_SEQUENCE_LENGTH if is_bert else None,
            'number_of_classes': len(self.label_encoder.classes_),
            'classes': list(self.label_encoder.classes_)
        }
232
+
233
def load_and_process_pdf(url_or_file):
    """Placeholder for loading a PDF from a URL or file and extracting text.

    No extraction is implemented yet, so the argument is ignored.

    Args:
        url_or_file: Location of the PDF (URL or file); currently unused.

    Returns:
        None, until the PDF processing code is written.
    """
    # TODO: add the PDF processing code here and return the extracted text.
    return None
245
+
246
# Example usage
if __name__ == "__main__":
    # Smoke-test the pipeline with its default arguments
    pipeline = TextClassificationPipeline()

    # One document in, one result dict out
    sample_text = "Example construction document text"
    single_result = pipeline.predict(sample_text, return_probability=True)
    print("\nSingle Prediction Result:")
    print(single_result)

    # Several documents in, a list of result dicts out
    sample_batch = ["First document", "Second document"]
    batch_results = pipeline.predict(sample_batch, return_probability=True)
    print("\nBatch Prediction Results:")
    for item in batch_results:
        print(f"\nText: {item['text']}")
        print(f"Prediction: {item['predicted_label']}")
        print(f"Confidence: {item['confidence']:.4f}")