Spaces:

ieq
/

IEQ-Text-Classifier-App

Sleeping

App Files Files Community

IEQ-Text-Classifier-App / app.py

sadickam

Update app.py

b64af7d verified 2 months ago

raw

history blame contribute delete

15.1 kB

	import gradio as gr
	import regex as re
	import torch
	import nltk
	import pandas as pd
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	from nltk.tokenize import sent_tokenize
	import plotly.express as px
	import time
	import tqdm
	# Download the NLTK 'punkt_tab' resource at start-up
	# (presumably needed by sent_tokenize, which prep_text calls)
	nltk.download("punkt_tab")

	# Run inference on the GPU when CUDA is available, otherwise on the CPU
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Load the tokenizer and the sequence-classification model from the checkpoint,
	# then move the model onto the selected device
	checkpoint = "ieq/IEQ-BERT"
	tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
	model = model.to(device)


	# Define the function for preprocessing text
	def prep_text(text):
	    """Lower-case *text* and normalise its whitespace, sentence by sentence.

	    The input is converted with ``str()`` and split into sentences with
	    ``sent_tokenize``. Each sentence is rebuilt from its whitespace-separated
	    words in lower case; the rebuilt sentences are joined with ``'. '`` and
	    leading/trailing spaces are stripped from the result.
	    """
	    normalised_sentences = (
	        ' '.join(word.strip().lower() for word in sentence.split())
	        for sentence in sent_tokenize(str(text))
	    )
	    return '. '.join(normalised_sentences).strip(' ')


	# APP INFO
	def app_info():
	    """Return the hint that points the user to the two prediction tabs."""
	    return """
	Please go to either the "Single-Text-Prediction" or "Multi-Text-Prediction" tab to analyse your text.
	"""


	# Create Gradio interface for app info
	# General-information tab: takes no inputs, shows the app_info() text as output,
	# and carries the app description (label list, analysis modules, contact details).
	iface1 = gr.Interface(
	    fn=app_info, inputs=None, outputs=['text'], title="General-Information",
	    description='''
	This app, powered by the IEQ-BERT model, is for automating the classification of text with respect
	to indoor environmental quality (IEQ). IEQ refers to the quality of the indoor air, lighting,
	temperature, and acoustics within a building, as well as the overall comfort and well-being of its occupants. It encompasses various
	factors that can impact the health, productivity, and satisfaction of people who spend time indoors, such as office workers, students,
	patients, and residents. This app assigns five labels to any given text; hence, a text may be assigned one or more labels. The five labels include
	the following:
	- Acoustic
	- Indoor air quality (IAQ)
	- No IEQ (label assigned when no IEQ is detected)
	- Thermal
	- Visual

	Because IEQ-BERT is capable of assigning one or more labels to a text, the returned prediction may combine labels, like
	(Acoustic_No IEQ) or (NO IEQ_Thermal). These multiple predictions that include "No IEQ" may suggest a lack of contextual
	clarity in the text and need manual review to confirm the label.

	This app has two analysis modules summarised below:
	- Single-Text-Prediction - Analyses text pasted in a text box and returns IEQ prediction.
	- Multi-Text-Prediction - Analyses multiple rows of texts in an uploaded CSV or Excel file and returns a downloadable CSV file with IEQ prediction for each row of text.

	This app runs on a free server and may, therefore, not be suitable for analysing large CSV files.
	If you need assistance with analysing large CSV files, use the contact information in the Contact section to get in touch.

	<h3>Contact</h3>
	<p>We would be happy to receive your feedback regarding this app. If you would also like to collaborate with us to explore some use cases for the model
	powering this app, we are happy to hear from you.</p>

	Dr Abdul-Manan Sadick - s.sadick@deakin.edu.au\n
	Dr Giorgia Chinazzo - giorgia.chinazzo@northwestern.edu
	''')


	# SINGLE TEXT
	# Define the prediction function
	def predict_single_text(text, threshold):
	    """
	    Predict the IEQ labels for a single text.

	    Args:
	        text (str): The text to be analysed.
	        threshold (float): Probability a label must exceed (strictly) to be
	            included in the returned prediction.

	    Returns:
	        top_prediction (dict): Maps each predicted IEQ label to its probability,
	            rounded to three decimals. Empty when no label exceeds the threshold.
	        fig (plotly.graph_objs.Figure): A horizontal bar chart showing the
	            likelihood of every IEQ label.

	    Raises:
	        gr.Error: If the text is empty after preprocessing.
	    """
	    # Preprocess the input text and reject empty input early
	    cleaned_text = prep_text(text)
	    if cleaned_text == "":
	        raise gr.Error('This model needs some text input to return a prediction')

	    # Tokenize the preprocessed text (truncated to 512 tokens) and move it to the device
	    tokenized_text = tokenizer(
	        cleaned_text, return_tensors="pt", truncation=True, max_length=512, padding=True
	    ).to(device)

	    # Run the model without tracking gradients
	    with torch.no_grad():
	        logits = model(**tokenized_text).logits

	    # Sigmoid scores every label independently, so several labels can pass the threshold
	    probabilities = torch.sigmoid(logits).squeeze().cpu().tolist()

	    # IEQ label names, in the order of the model outputs
	    label_list = [
	        'Acoustic',
	        'Indoor air quality',
	        'No IEQ',
	        'Thermal',
	        'Visual'
	    ]

	    # Honour the caller-supplied threshold (it must not be overwritten here).
	    # Labels and scores are selected in one pass so they can never get out of step.
	    top_prediction = {
	        label: round(probability, 3)
	        for label, probability in zip(label_list, probabilities)
	        if probability > threshold
	    }

	    # Dataframe with the likelihood of every label for the bar chart
	    df2 = pd.DataFrame({'IEQ': label_list, 'Likelihood': probabilities})

	    # Plot the likelihood of all labels as a horizontal bar chart
	    fig = px.bar(df2, x="Likelihood", y="IEQ", orientation="h")

	    fig.update_layout(
	        template='simple_white', font=dict(family="Arial", size=12, color="black"),
	        autosize=True,
	        width=400,
	        height=400,
	        xaxis_title="Likelihood of IEQ",
	        yaxis_title="Indoor environmental quality (IEQ)",
	    )

	    fig.update_xaxes(tickangle=0, tickfont=dict(family='Arial', color='black', size=12))
	    fig.update_yaxes(tickangle=0, tickfont=dict(family='Arial', color='black', size=12))
	    fig.update_annotations(font_size=12)

	    return top_prediction, fig

	# Create Gradio interface for single text
	# Slider that lets the user choose the probability threshold (0.0-1.0, default 0.3)
	threshold_input = gr.Slider(
	    minimum=0.0,
	    maximum=1.0,
	    value=0.3,
	    step=0.01,
	    label="Threshold value (default=0.3)",
	)

	# Single-text tab: a text box plus the threshold slider go in,
	# the predicted labels and a likelihood plot come out
	iface2 = gr.Interface(
	    fn=predict_single_text,
	    inputs=[
	        gr.Textbox(lines=7, label="Paste or type text here"),
	        threshold_input,
	    ],
	    outputs=[
	        gr.Label(label="Predicted Labels", show_label=True),
	        gr.Plot(label="Likelihood of all labels", show_label=False),
	    ],
	    title="Single Text Prediction",
	    description="""Threshold value: The threshold value determines the minimum probability required
	for a label to be predicted. A higher threshold value will result in fewer labels being predicted,
	while a lower threshold value will result in more labels being predicted. The default threshold value is 0.3.""",
	    article="Note: The quality of model predictions may depend on the quality of the information provided.",
	)


	# UPLOAD CSV
	# Define the prediction function
	def predict_from_csv(file, column_name, threshold, progress=gr.Progress()):
	    """
	    Predict the IEQ labels for every row of text in an uploaded CSV or Excel file.

	    Args:
	        file (str): The path to the uploaded CSV or Excel file. An object that
	            exposes the path as ``.name`` is accepted as well.
	        column_name (str): The name of the column containing the text to be analysed.
	        threshold (float): Probability a label must exceed (strictly) to be predicted.
	        progress (gr.Progress): A progress bar to display the analysis progress.

	    Returns:
	        output_csv (gr.File): A downloadable CSV file ('IEQ_predictions.csv')
	            containing the input rows plus the prediction columns.

	    Raises:
	        gr.Error: If no file was uploaded, the file is not a CSV/Excel file,
	            or the requested column is missing.
	    """
	    # Fail with a readable message instead of an AttributeError when nothing was uploaded
	    if file is None:
	        raise gr.Error("Please upload a CSV or Excel file.")

	    # Accept either a plain path or a file-like wrapper carrying the path in .name
	    file_path = file if isinstance(file, str) else file.name

	    # Read the CSV or Excel file; the extension check ignores case (e.g. '.CSV')
	    lowered_path = file_path.lower()
	    if lowered_path.endswith('.csv'):
	        df_docs = pd.read_csv(file_path)
	    elif lowered_path.endswith(('.xls', '.xlsx')):
	        df_docs = pd.read_excel(file_path)
	    else:
	        raise gr.Error("Invalid file type. Please upload a CSV or Excel file.")

	    # Check if the specified column exists
	    if column_name not in df_docs.columns:
	        raise gr.Error(f"The column '{column_name}' does not exist in the uploaded file.")

	    # Extract the text list from the specified column
	    text_list = df_docs[column_name].tolist()

	    # IEQ label names, in the order of the model outputs
	    label_list = [
	        'Acoustic',
	        'Indoor air quality',
	        'No IEQ',
	        'Thermal',
	        'Visual'
	    ]
	    max_labels = len(label_list)
	    placeholder = "-"  # fills the IEQn/Scoren cells a row has no prediction for

	    # Per-row lists of predicted labels and their scores
	    labels_predicted = []
	    prediction_scores = []

	    # One list per output column: IEQ1..IEQ5 and Score1..Score5
	    label_columns = [[] for _ in range(max_labels)]
	    score_columns = [[] for _ in range(max_labels)]

	    # Preprocess text and make predictions
	    for text_input in progress.tqdm(text_list, desc="Analysing data"):
	        # Short pause kept from the original implementation
	        # (described there as avoiding rate limiting)
	        time.sleep(0.02)

	        cleaned_text = prep_text(text_input)

	        # Tokenize the text (truncated to 512 tokens) and move it to the device
	        tokenized_text = tokenizer(
	            cleaned_text, return_tensors="pt", truncation=True, max_length=512, padding=True
	        ).to(device)

	        # Run the model without tracking gradients
	        with torch.no_grad():
	            logits = model(**tokenized_text).logits

	        # Sigmoid scores every label independently, so several labels can pass the threshold
	        predictions = torch.sigmoid(logits).squeeze().cpu().tolist()

	        # Select labels and scores in one pass so the two lists always stay aligned
	        selected = [
	            (label, round(score, 3))
	            for label, score in zip(label_list, predictions)
	            if score > threshold
	        ]
	        labels_predicted.append([label for label, _ in selected])
	        prediction_scores.append([score for _, score in selected])

	        # Spread the selected labels/scores over the fixed columns, padding with "-"
	        for position in range(max_labels):
	            if position < len(selected):
	                label, score = selected[position]
	            else:
	                label, score = placeholder, placeholder
	            label_columns[position].append(label)
	            score_columns[position].append(score)

	    # Append the predictions to the DataFrame (IEQ columns first, then Score columns)
	    df_docs['IEQ_predicted'] = labels_predicted
	    df_docs['prediction_scores'] = prediction_scores
	    for position, column_values in enumerate(label_columns, start=1):
	        df_docs[f'IEQ{position}'] = column_values
	    for position, column_values in enumerate(score_columns, start=1):
	        df_docs[f'Score{position}'] = column_values

	    # Save the predictions to a CSV file
	    df_docs.to_csv('IEQ_predictions.csv')

	    # Create a downloadable CSV file
	    output_csv = gr.File(value='IEQ_predictions.csv', visible=True)

	    return output_csv


	# Define the input component
	# Input components for the multi-text tab: the uploaded file, the name of the
	# text column, and a separate threshold slider (0.0-1.0, default 0.3)
	file_input = gr.File(
	    label="Upload CSV or Excel file here",
	    show_label=True,
	    file_types=[".csv", ".xls", ".xlsx"],
	)
	column_name_input = gr.Textbox(
	    label="Enter the column name containing the text to be analyzed",
	    show_label=True,
	)
	threshold_input = gr.Slider(
	    minimum=0.0,
	    maximum=1.0,
	    value=0.3,
	    step=0.01,
	    label="Threshold value (default=0.3)",
	)

	# Multi-text tab: runs predict_from_csv and offers the resulting CSV for download
	iface3 = gr.Interface(
	    fn=predict_from_csv,
	    inputs=[file_input, column_name_input, threshold_input],
	    outputs=gr.File(label='Download output CSV', show_label=True),
	    title="Multi-text Prediction",
	    description='''Threshold value: The threshold value determines the minimum probability required
	for a label to be predicted. A higher threshold value will result in fewer labels being predicted,
	while a lower threshold value will result in more labels being predicted. The default threshold value is 0.3''',
	    article="""Note About Processing Large Dataset: If you have more than 100 rows of data to process,
	it may be best to use the Google Colab Notebook at the link below where you can leverage GPU for faster processing
	https://colab.research.google.com/drive/15uOTDnzjQ_iD5HpuZuFX1oUoS_OvSJ5L?usp=sharing""",
	)

	# Create a tabbed interface
	# Combine the three interfaces into a single app with one tab each
	demo = gr.TabbedInterface(
	    interface_list=[iface1, iface2, iface3],
	    tab_names=[
	        "General-App-Info",
	        "Single-Text-Prediction",
	        "Multi-Text-Prediction",
	    ],
	    title="Indoor Environmental Quality (IEQ) Text Classifier App",
	    theme='soft',
	)

	# Enable the request queue, then launch the interface
	demo.queue()
	demo.launch()