import os
import random
import gradio as gr
import json
import numpy as np
import torch
import heapq
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader
class Preprocess:
    def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                       use_auth_token='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')
        self.max_len = tokenizer_max_len

    def clean_text(self, text):
        """Lowercase the text and strip common English/Swahili stopwords."""
        text = text.lower()
        # Set for O(1) membership tests; duplicate entries from the original
        # list removed. The original entry "lindam ama" could never match a
        # single token from split(), so it is assumed to be the two words
        # "linda" and "mama".
        stopwords = {"i", "was", "transferred", "from", "to", "nilienda",
                     "kituo", "cha", "lakini", "saa", "hii", "niko", "at",
                     "nikahudumiwa", "pole", "deliver", "na", "ni", "baada",
                     "ya", "kutumwa", "kutoka", "ndipo", "nikapewa", "hiyo",
                     "linda", "mama", "nikawa", "mgonjwa", "nikatibiwa",
                     "in", "had", "a", "visit", "gynaecologist", "ndio",
                     "karibu", "mimi", "sehemu", "hospitali", "serikali",
                     "delivered", "katika", "kaunti", "kujifungua", "huko",
                     "nilipoenda", "kwa", "bado", "naedelea", "sija",
                     "maliza", "mwisho", "nilianza", "kliniki", "yangu",
                     "nilianzia", "nilijifungua"}
        text_single = ' '.join(word for word in text.split() if word not in stopwords)
        return text_single
    def encode_fn(self, text_single):
        """
        Use the tokenizer to preprocess the text.
        Example of text_single: 'Nairobi Hospital'
        """
        # Renamed from `tokenizer` to avoid shadowing self.tokenizer: this is
        # the encoding the tokenizer returns, not the tokenizer itself.
        encoding = self.tokenizer(text_single,
                                  padding=True,
                                  truncation=True,
                                  max_length=self.max_len,
                                  return_tensors='pt')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        return input_ids, attention_mask
    def process_tokenizer(self, text_single):
        """
        Preprocess text and prepare a TensorDataset for a single new sentence.
        """
        input_ids, attention_mask = self.encode_fn(text_single)
        data = TensorDataset(input_ids, attention_mask)
        return data
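# Usage sketch for Preprocess (comment-only so nothing runs at import time;
# the sample sentence is hypothetical):
#   processor = Preprocess(tokenizer_vocab_path="Jacaranda/dhis_14000_600k_Test_Model",
#                          tokenizer_max_len=128)
#   cleaned = processor.clean_text("Nilienda Nairobi Hospital kujifungua")
#   dataset = processor.process_tokenizer(cleaned)  # TensorDataset(input_ids, attention_mask)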
class Facility_Model:
    def __init__(self, facility_model_path: str, max_len: int):
        self.max_len = max_len
        self.softmax = torch.nn.Softmax(dim=1)
        # Detect CUDA instead of hardcoding False, so the DataParallel wrap
        # and the device selection below stay consistent with each other.
        self.gpu = torch.cuda.is_available()
        self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path,
                                                                        use_auth_token='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')
        self.model.eval()  # set the PyTorch model to inference mode
        if torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)
        if self.gpu:
            # Fix all seeds for reproducible GPU inference.
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = self.model.to(self.device)
    def predict_single(self, model, pred_data):
        """
        Model inference for a new single sentence.
        """
        pred_dataloader = DataLoader(pred_data, batch_size=10, shuffle=False)
        for i, batch in enumerate(pred_dataloader):
            with torch.no_grad():
                outputs = model(input_ids=batch[0].to(self.device),
                                attention_mask=batch[1].to(self.device))
            # No labels are passed, so outputs.loss would be None; only the
            # logits are needed here.
            logits = outputs.logits
            probability = self.softmax(logits)
            probability_list = probability.detach().cpu().numpy()
        return probability_list
    def output_intent_probability(self, pred: np.ndarray) -> dict:
        """
        Convert the model output into a dictionary mapping each intent to its probability.
        """
        output_dict = {}
        # Load the relation table between label ids and intent names. Note:
        # this CSV is re-read on every call; hoist the read into __init__ if
        # inference volume grows.
        path_table = pd.read_csv('dhis_label_relation_14357.csv')
        label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()['label']
        # Map each output index (label id) to its intent name and probability.
        for intent in range(pred.shape[1]):
            output_dict[label_intent_dict[intent]] = pred[0][intent]
        return output_dict
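    # Based on the column access above, the relation CSV is assumed to look
    # like the following (the example rows are hypothetical):
    #   label,corresponding_label
    #   Kenyatta National Hospital,0
    #   Moi Teaching and Referral Hospital,1
    # where `corresponding_label` is the integer class id emitted by the
    # model and `label` is the facility/intent name returned to the caller.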
    def inference(self, prepared_data):
        """
        Make a prediction on one new sentence and return the top intent name as a string.
        """
        prob_distribution = self.predict_single(self.model, prepared_data)
        prediction_results = self.output_intent_probability(prob_distribution.astype(float))
        # Filter out predictions containing "dental", "optical", or "eye" keywords
        filtered_results = {intent: prob for intent, prob in prediction_results.items()
                            if "dental" not in intent.lower()
                            and "optical" not in intent.lower()
                            and "eye" not in intent.lower()}
        sorted_pred_intent_results = sorted(filtered_results.items(), key=lambda x: x[1], reverse=True)
        # Keep only the top-ranked result
        top_results = dict(sorted_pred_intent_results[:1])
        final_preds = ', '.join(top_results.keys())
        final_preds = final_preds.replace("'", "")
        return final_preds
jacaranda_hugging_face_model = "Jacaranda/dhis_14000_600k_Test_Model"

obj_Facility_Model = Facility_Model(facility_model_path=jacaranda_hugging_face_model,
                                    max_len=128)

processor = Preprocess(tokenizer_vocab_path=jacaranda_hugging_face_model,
                       tokenizer_max_len=128)
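# End-to-end sketch for a single sentence (comment-only so nothing runs at
# import time; the input string is hypothetical):
#   cleaned = processor.clean_text("nilijifungua at Kenyatta Hospital")
#   prepared = processor.process_tokenizer(cleaned)
#   print(obj_Facility_Model.inference(prepared))  # -> top predicted facility name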
def predict_batch_from_csv(input_file, output_file):
    # Load batch data from CSV
    batch_data = pd.read_csv(input_file)

    # Collect one prediction per row, with tqdm for progress tracking
    predictions = []
    for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
        text = row['facility_name']  # Replace 'facility_name' with the actual column holding the text data
        cleaned_text = processor.clean_text(text)
        prepared_data = processor.process_tokenizer(cleaned_text)
        prediction = obj_Facility_Model.inference(prepared_data)
        predictions.append(prediction)

    # Append the predictions as a new column and save to CSV
    output_data = pd.DataFrame({'prediction': predictions})
    pred_output_df = pd.concat([batch_data, output_data], axis=1)
    pred_output_df.to_csv(output_file, index=False)
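
# Batch usage sketch, guarded so it only runs when the file is executed
# directly. The file names are hypothetical; the input CSV must contain the
# text column read above ('facility_name').
if __name__ == "__main__":
    predict_batch_from_csv("facilities_input.csv",               # hypothetical input path
                           "facilities_with_predictions.csv")    # hypothetical output path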