Spaces:
Sleeping
Sleeping
File size: 15,095 Bytes
3129827 acd6fc0 3129827 d158fe3 3129827 d158fe3 3129827 78bab84 3129827 78bab84 ff8ba27 3129827 ff8ba27 3129827 ff8ba27 3129827 ff8ba27 3129827 ff8ba27 113e28e 3129827 ff8ba27 3129827 113e28e 3129827 fa54056 3129827 fa54056 3129827 8aee7cd 3129827 8aee7cd 3129827 42a3141 3129827 b64af7d e587583 3129827 a26bec3 fa54056 3129827 fa54056 962e59d 3129827 fa54056 962e59d 3129827 fa54056 3129827 113e28e 3129827 d158fe3 3129827 c3ce081 3129827 d158fe3 3129827 d158fe3 3129827 a26bec3 3129827 e27677b 3129827 113e28e 3129827 c3ce081 3129827 fa54056 c950127 113e28e c3ce081 ae3cb5c 3129827 113e28e 14cf178 3129827 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 |
import gradio as gr
import regex as re
import torch
import nltk
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
import plotly.express as px
import time
import tqdm
nltk.download('punkt_tab')
# Pick the compute device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the tokenizer and the sequence-classification model for the IEQ-BERT checkpoint
checkpoint = "ieq/IEQ-BERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Move the model weights onto the selected device
model = model.to(device)
# Define the function for preprocessing text
def prep_text(text):
    """
    Normalise raw text before it is passed to the tokenizer.

    The input is converted to ``str`` and split into sentences with NLTK's
    ``sent_tokenize``. Within each sentence every whitespace-separated word is
    lower-cased and the words are re-joined with single spaces. The sentences
    are then joined with ``'. '`` (added on top of whatever punctuation each
    sentence already ends with) and surrounding spaces are stripped.

    Args:
        text: The value to clean; anything accepted by ``str()``.
    Returns:
        str: The cleaned text, or an empty string when no sentences are found.
    """
    normalised_sentences = (
        ' '.join(str(word).strip().lower() for word in sentence.split())
        for sentence in sent_tokenize(str(text))
    )
    return '. '.join(normalised_sentences).strip(' ')
# APP INFO
def app_info():
    """Return the short message that points users to the two prediction tabs."""
    return (
        '\n'
        'Please go to either the "Single-Text-Prediction" or "Multi-Text-Prediction" tab to analyse your text.'
        '\n'
    )
# Create Gradio interface for app info
# The description mixes Markdown list items with HTML tags; the text is user-facing,
# so spelling and grammar matter here.
iface1 = gr.Interface(
    fn=app_info, inputs=None, outputs=['text'], title="General-Information",
    description='''
This app, powered by the IEQ-BERT model, is for automating the classification of text with respect
to indoor environmental quality (IEQ). IEQ refers to the quality of the indoor air, lighting,
temperature, and acoustics within a building, as well as the overall comfort and well-being of its occupants. It encompasses various
factors that can impact the health, productivity, and satisfaction of people who spend time indoors, such as office workers, students,
patients, and residents. This app assigns five labels to any given text; hence, a text may be assigned one or more labels. The five labels include
the following:
- Acoustic
- Indoor air quality (IAQ)
- No IEQ (label assigned when no IEQ is detected)
- Thermal
- Visual
Because IEQ-BERT is capable of assigning one or more labels to a text, it is possible for the returned prediction to combine labels, like
(Acoustic_No IEQ) or (No IEQ_Thermal). These multiple predictions that include "No IEQ" may suggest a lack of contextual
clarity in the text and need manual review to confirm the label.
This app has two analysis modules summarised below:
- Single-Text-Prediction - Analyses text pasted in a text box and returns IEQ prediction.
- Multi-Text-Prediction - Analyses multiple rows of texts in an uploaded CSV or Excel file and returns a downloadable CSV file with IEQ prediction for each row of text.
This app runs on a free server and may, therefore, not be suitable for analysing large CSV files.
If you need assistance with analysing large CSV files, use the contact information in the Contact section to get in touch.
<h3>Contact</h3>
<p>We would be happy to receive your feedback regarding this app. If you would also like to collaborate with us to explore some use cases for the model
powering this app, we are happy to hear from you.</p>
Dr Abdul-Manan Sadick - s.sadick@deakin.edu.au\n
Dr Giorgia Chinazzo - giorgia.chinazzo@northwestern.edu
''')
# SINGLE TEXT
# Define the prediction function
def predict_single_text(text, threshold=0.3):
    """
    Predict the IEQ labels for a single text.

    Args:
        text (str): The text to be analysed.
        threshold (float): Probability a label must exceed to be reported.
            Supplied by the threshold slider in the UI; defaults to 0.3.
    Returns:
        top_prediction (dict): Maps each label whose probability exceeds
            ``threshold`` to that probability rounded to three decimals.
        fig (plotly.graph_objs.Figure): A horizontal bar chart showing the
            likelihood of every IEQ label.
    Raises:
        gr.Error: If the text is empty after preprocessing.
    """
    # Preprocess the input text and reject empty input early
    cleaned_text = prep_text(text)
    if cleaned_text == "":
        raise gr.Error('This model needs some text input to return a prediction')

    # Tokenize the preprocessed text and move the tensors to the model's device
    tokenized_text = tokenizer(
        cleaned_text, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(device)

    # Run the model without tracking gradients
    with torch.no_grad():
        logits = model(**tokenized_text).logits

    # Sigmoid gives an independent probability per label (multi-label output)
    probabilities = torch.sigmoid(logits).squeeze().cpu().tolist()

    # Label names, in the order of the model's output positions
    label_list = [
        'Acoustic',
        'Indoor air quality',
        'No IEQ',
        'Thermal',
        'Visual'
    ]
    scored_labels = list(zip(label_list, probabilities))

    # Use the caller-supplied threshold (it was previously overwritten with a
    # hard-coded 0.3, which made the slider ineffective). Labels and scores are
    # filtered in a single pass so they can never get out of step.
    top_prediction = {
        label: round(probability, 3)
        for label, probability in scored_labels
        if probability > threshold
    }

    # Bar chart of the likelihood of every label, whether or not it passed the threshold
    df2 = pd.DataFrame(scored_labels, columns=['IEQ', 'Likelihood'])
    fig = px.bar(df2, x="Likelihood", y="IEQ", orientation="h")
    fig.update_layout(
        template='simple_white', font=dict(family="Arial", size=12, color="black"),
        autosize=True,
        width=400,
        height=400,
        xaxis_title="Likelihood of IEQ",
        yaxis_title="Indoor environmental quality (IEQ)",
    )
    fig.update_xaxes(tickangle=0, tickfont=dict(family='Arial', color='black', size=12))
    fig.update_yaxes(tickangle=0, tickfont=dict(family='Arial', color='black', size=12))
    fig.update_annotations(font_size=12)
    return top_prediction, fig
# Build the single-text tab: a text box plus a threshold slider feeding predict_single_text
threshold_input = gr.Slider(
    label="Threshold value (default=0.3)",
    minimum=0.0,
    maximum=1.0,
    step=0.01,
    value=0.3,
)
iface2 = gr.Interface(
    title="Single Text Prediction",
    fn=predict_single_text,
    inputs=[
        gr.Textbox(lines=7, label="Paste or type text here"),
        threshold_input,
    ],
    outputs=[
        gr.Label(label="Predicted Labels", show_label=True),
        gr.Plot(label="Likelihood of all labels", show_label=False),
    ],
    description="""**Threshold value:** The threshold value determines the minimum probability required
for a label to be predicted. A higher threshold value will result in fewer labels being predicted,
while a lower threshold value will result in more labels being predicted. The default threshold value is 0.3.""",
    article="**Note:** The quality of model predictions may depend on the quality of the information provided.",
)
# UPLOAD CSV
# Define the prediction function
def predict_from_csv(file, column_name, threshold=0.3, progress=gr.Progress()):
    """
    Predict the IEQ labels for every row of text in a CSV or Excel file.

    The uploaded table is extended with these columns and written to
    ``IEQ_predictions.csv``:
      - ``IEQ_predicted``: list of labels whose probability exceeds ``threshold``
      - ``prediction_scores``: the matching probabilities, rounded to three decimals
      - ``IEQ1``..``IEQ5`` / ``Score1``..``Score5``: the same labels and scores
        spread over one column each, padded with ``"-"`` where fewer labels
        were predicted

    Args:
        file: Path to the uploaded file, or a file-like object exposing the
            path as ``.name``.
        column_name (str): The name of the column containing the text to be analysed.
        threshold (float): Probability a label must exceed to be reported.
            Defaults to 0.3.
        progress (gr.Progress): A progress bar to display the analysis progress.
    Returns:
        output_csv (gr.File): A downloadable CSV file containing the predictions.
    Raises:
        gr.Error: If no file is supplied, the file type is unsupported, or
            ``column_name`` is not a column of the uploaded file.
    """
    if file is None:
        raise gr.Error("No file was uploaded. Please upload a CSV or Excel file.")

    # Accept either a plain path or an object carrying the path in ``.name``
    file_path = str(getattr(file, "name", file))
    lowered_path = file_path.lower()  # so '.CSV' / '.XLSX' are accepted as well

    # Read the CSV or Excel file
    if lowered_path.endswith('.csv'):
        df_docs = pd.read_csv(file_path)
    elif lowered_path.endswith(('.xls', '.xlsx')):
        df_docs = pd.read_excel(file_path)
    else:
        raise gr.Error("Invalid file type. Please upload a CSV or Excel file.")

    # Check if the specified column exists
    if column_name not in df_docs.columns:
        raise gr.Error(f"The column '{column_name}' does not exist in the uploaded file.")

    # Extract the text list from the specified column
    text_list = df_docs[column_name].tolist()

    # Label names, in the order of the model's output positions
    label_list = [
        'Acoustic',
        'Indoor air quality',
        'No IEQ',
        'Thermal',
        'Visual'
    ]
    n_labels = len(label_list)

    # Per-row predictions (lists) and their per-rank column layout
    labels_predicted = []
    prediction_scores = []
    label_columns = [[] for _ in range(n_labels)]   # IEQ1..IEQ5
    score_columns = [[] for _ in range(n_labels)]   # Score1..Score5

    # Preprocess text and make predictions
    for text_input in progress.tqdm(text_list, desc="Analysing data"):
        # Sleep to avoid rate limiting
        time.sleep(0.02)

        cleaned_text = prep_text(text_input)
        tokenized_text = tokenizer(
            cleaned_text, return_tensors="pt", truncation=True, max_length=512, padding=True
        ).to(device)

        with torch.no_grad():
            logits = model(**tokenized_text).logits

        # Sigmoid gives an independent probability per label (multi-label output)
        probabilities = torch.sigmoid(logits).squeeze().cpu().tolist()

        # Filter labels and scores in one pass so they always stay aligned
        selected = [
            (label, round(probability, 3))
            for label, probability in zip(label_list, probabilities)
            if probability > threshold
        ]
        predicted_labels = [label for label, _ in selected]
        prediction_score = [score for _, score in selected]

        labels_predicted.append(predicted_labels)
        prediction_scores.append(prediction_score)

        # Pad to a fixed width with "-" and distribute over the per-rank columns
        padding = ["-"] * (n_labels - len(selected))
        for column, value in zip(label_columns, predicted_labels + padding):
            column.append(value)
        for column, value in zip(score_columns, prediction_score + padding):
            column.append(value)

    # Append the predictions to the DataFrame
    df_docs['IEQ_predicted'] = labels_predicted
    df_docs['prediction_scores'] = prediction_scores
    for rank, column in enumerate(label_columns, start=1):
        df_docs[f'IEQ{rank}'] = column
    for rank, column in enumerate(score_columns, start=1):
        df_docs[f'Score{rank}'] = column

    # Save the predictions to a CSV file
    # NOTE(review): fixed filename in the working directory, so concurrent
    # requests would overwrite each other's output - confirm this is acceptable.
    df_docs.to_csv('IEQ_predictions.csv')

    # Create a downloadable CSV file
    output_csv = gr.File(value='IEQ_predictions.csv', visible=True)
    return output_csv
# Input widgets for the multi-text tab: the uploaded table, the text column name, and the threshold
file_input = gr.File(
    label="Upload CSV or Excel file here",
    show_label=True,
    file_types=[".csv", ".xls", ".xlsx"],
)
column_name_input = gr.Textbox(
    label="Enter the column name containing the text to be analyzed",
    show_label=True,
)
threshold_input = gr.Slider(
    label="Threshold value (default=0.3)",
    minimum=0.0,
    maximum=1.0,
    step=0.01,
    value=0.3,
)
# Build the multi-text tab around predict_from_csv; its output is a downloadable file
iface3 = gr.Interface(
    title="Multi-text Prediction",
    fn=predict_from_csv,
    inputs=[file_input, column_name_input, threshold_input],
    outputs=gr.File(label='Download output CSV', show_label=True),
    description='''**Threshold value:** The threshold value determines the minimum probability required
for a label to be predicted. A higher threshold value will result in fewer labels being predicted,
while a lower threshold value will result in more labels being predicted. The default threshold value is 0.3''',
    article="""**Note About Processing Large Dataset:** If you have more than 100 rows of data to process,
it may be best to use the Google Colab Notebook at the link below where you can leverage GPU for faster processing
https://colab.research.google.com/drive/15uOTDnzjQ_iD5HpuZuFX1oUoS_OvSJ5L?usp=sharing""",
)
# Combine the info tab and the two prediction tabs into a single tabbed app
demo = gr.TabbedInterface(
    title="Indoor Environmental Quality (IEQ) Text Classifier App",
    interface_list=[iface1, iface2, iface3],
    tab_names=["General-App-Info", "Single-Text-Prediction", "Multi-Text-Prediction"],
    theme='soft',
)
# Enable the request queue, then start serving the app
queued_demo = demo.queue()
queued_demo.launch()
|