Spaces:
Build error
Build error
from utils import * | |
import torch | |
import torch.nn as nn | |
from torch.utils.data import Dataset, DataLoader | |
import unicodedata | |
import re | |
# Undesirable patterns within texts: structured-abstract headings, markup
# tags and stray symbols. Every entry is replaced by the empty string.
# Insertion order is kept because it becomes the alternation order of the
# regex that multiple_replace builds from these keys.
patterns = {
    'CONCLUSIONS AND IMPLICATIONS': '',
    'BACKGROUND AND PURPOSE': '',
    'EXPERIMENTAL APPROACH': '',
    'KEY RESULTS AEA': '',
    '©': '',
    '®': '',
    'μ': '',
    '(C)': '',
    'OBJECTIVE:': '',
    'MATERIALS AND METHODS:': '',
    'SIGNIFICANCE:': '',
    'BACKGROUND:': '',
    'RESULTS:': '',
    'METHODS:': '',
    'CONCLUSIONS:': '',
    'AIM:': '',
    'STUDY DESIGN:': '',
    'CLINICAL RELEVANCE:': '',
    'CONCLUSION:': '',
    'HYPOTHESIS:': '',
    'Questions/Purposes:': '',
    'Introduction:': '',
    'PURPOSE:': '',
    'PATIENTS AND METHODS:': '',
    'FINDINGS:': '',
    'INTERPRETATIONS:': '',
    'FUNDING:': '',
    'PROGRESS:': '',
    'CONTEXT:': '',
    'MEASURES:': '',
    'DESIGN:': '',
    'BACKGROUND AND OBJECTIVES:': '',
    '<p>': '',
    '</p>': '',
    '<<ETX>>': '',
    '+/-': '',
}
# The cleaner lower-cases the text before matching, so the keys must be
# lower-case as well.
patterns = {x.lower(): y for x, y in patterns.items()}
class treat_text:
    """Callable that normalizes and cleans one piece of text.

    Steps, in order:
      1. convert the input to ``str`` and apply Unicode NFKD normalization;
      2. lower-case it and substitute every key of ``patterns`` with its
         value (via ``multiple_replace``);
      3. delete bracketed spans, single digits surrounded by spaces,
         ``<``, ``>`` and ``"- "``;
      4. collapse runs of spaces into one space;
      5. collapse ``", ,"`` / ``",,"`` into a single comma;
      6. rewrite ``%`` and ``per cent`` as ``" percent"``.

    Args:
        patterns: mapping of lower-case substrings to their replacements.
    """

    # Raw strings keep "\(", "\[" and "\d" from being read as (invalid)
    # string escapes; the regular expressions themselves are unchanged.
    # NOTE: ".+" is greedy, so on one line everything from the first opening
    # bracket to the last closing bracket is removed.
    _NOISE_RE = re.compile(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )')
    _SPACES_RE = re.compile(r'( +)')
    _COMMAS_RE = re.compile(r'(, ,)|(,,)')
    _PERCENT_RE = re.compile(r'(%)|(per cent)')

    def __init__(self, patterns):
        self.patterns = patterns

    def __call__(self, text):
        """Return the cleaned, lower-cased version of *text*."""
        text = unicodedata.normalize("NFKD", str(text))
        text = multiple_replace(self.patterns, text.lower())
        text = self._NOISE_RE.sub('', text)
        text = self._SPACES_RE.sub(' ', text)
        text = self._COMMAS_RE.sub(',', text)
        # Runs after the space collapse, so the replacement's leading space
        # can leave two consecutive spaces (kept as in the original).
        text = self._PERCENT_RE.sub(' percent', text)
        return text
# Regex multiple replace function
def multiple_replace(dict, text):
    """Replace every occurrence of a key of *dict* in *text* in one pass.

    Args:
        dict: mapping of literal substrings to their replacement strings.
            (The name shadows the builtin but is kept for compatibility
            with existing callers.)
        text: the string to process.

    Returns:
        *text* with each matched key replaced by ``dict[key]``. Keys are
        matched literally (they are regex-escaped) and tried in the
        mapping's iteration order at each position.
    """
    # An empty mapping would compile to "()", which matches the empty string
    # at every position and then fails the lookup with KeyError('').
    if not dict:
        return text
    # Building regex from dict keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict)))
    # Substitution
    return regex.sub(lambda mo: dict[mo.group(0)], text)
# Shared cleaner instance built from the lower-cased pattern table; it is
# handed to the datasets as their `treat_text` callable.
treat_text_fun = treat_text(patterns)
import sys | |
# `json` is not imported explicitly anywhere in this file; import it here so
# loading Info.json does not depend on what `from utils import *` re-exports.
import json

# Make the ML-SLRC directory importable and locate the files stored in it.
sys.path.append('ML-SLRC/')
path = 'ML-SLRC/'
model_path = path + 'model.pt'
info_path = path + 'Info.json'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model. map_location remaps the stored tensors onto the available
# device, so a checkpoint saved on a GPU can still be loaded on a CPU host.
# NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
model = torch.load(model_path, map_location=device)

# Load the meta information of the trained model.
with open(info_path, 'r') as f:
    Info = json.load(f)
import random | |
from datetime import datetime | |
# Single seed fed to every random number generator seeded below.
rand_seed = 2003

# Current date and time, used to tag the run name.
now = datetime.now()
time_stamp = now.strftime("%d_%m_%Y_HR_%H_%M_%S")

# Settings read by the few-shot training code.
config = dict(
    shots_per_class=8,
    batch_size=4,
    epochs=8,
    learning_rate=5e-05,
    weight_decay=0.85,
    rand_seed=rand_seed,
    pos_weight=3.5,
    p_incld=0.2,
    p_excld=0.01,
)

NAME = f"{config['shots_per_class']}-shots-Learner_{time_stamp}"

# Data-loading and validation settings.
num_workers = 0
val_batch = 100
p_included = 0.7
p_notincluded = 0.3
sample_valid = 300

# Seed a dedicated torch generator plus the NumPy, torch and stdlib RNGs.
gen_seed = torch.Generator().manual_seed(rand_seed)
np.random.seed(rand_seed)
torch.manual_seed(rand_seed)
random.seed(rand_seed)
def treat_data_input(data, etailment_txt):
    """Build the training loader and the full-data loader from *data*.

    Args:
        data: DataFrame with a 'text' column and a 'test' label column.
        etailment_txt: forwarded unchanged to ``SLR_DataSet`` as
            ``etailment_txt``.

    Returns:
        ``(dataload_train, dataload_remain)``: the first iterates the
        shuffled grouped rows in batches of ``config['batch_size']``, the
        second iterates every row of *data* in batches of 200.
    """
    # Shuffle within each 'test' group. NOTE(review): pandas' groupby skips
    # rows whose key is NaN by default, so unlabeled rows presumably never
    # reach the training set - confirm.
    shuffled_rows = data.groupby('test').sample(frac=1)

    # Full copy of the input; missing labels get the placeholder 'NANN'.
    every_row = data.copy()
    every_row['test'] = every_row['test'].replace({np.nan: 'NANN'})

    # Arguments that both datasets have in common.
    shared_kwargs = dict(
        input='text',
        output='test',
        tokenizer=initializer_model_scibert.tokenizer,
        LABEL_MAP=LABEL_MAP,
        treat_text=treat_text_fun,
        etailment_txt=etailment_txt,
    )
    dataset_train = SLR_DataSet(data=shuffled_rows, **shared_kwargs)
    dataset_remain = SLR_DataSet(data=every_row, **shared_kwargs)

    dataload_train = DataLoader(
        dataset_train,
        batch_size=config['batch_size'],
        drop_last=False,
        num_workers=num_workers,
    )
    dataload_remain = DataLoader(
        dataset_remain,
        batch_size=200,
        drop_last=False,
        num_workers=num_workers,
    )
    return dataload_train, dataload_remain
import gc | |
from torch.optim import Adam | |
from scipy.stats import entropy | |
def treat_train_evaluate(dataload_train, dataload_remain):
    """Fine-tune a copy of the global model and score the full data loader.

    Args:
        dataload_train: loader passed to ``fit`` as ``data_train_loader``.
        dataload_remain: loader passed to ``evaluate``.

    Returns:
        The ``logits`` element of the tuple returned by
        ``model_few.evaluate(dataload_remain)``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Release memory that may be left over from a previous call.
    gc.collect()
    torch.cuda.empty_cache()

    # Work on a copy so the module-level model is left untouched.
    model_few = deepcopy(model)
    model_few.loss_fn = nn.BCEWithLogitsLoss(
        reduction='mean',
        pos_weight=torch.FloatTensor([config['pos_weight']]),
    )
    optimizer = Adam(
        model_few.parameters(),
        lr=config['learning_rate'],
        weight_decay=config['weight_decay'],
    )

    # Use the detected device instead of a hard-coded 'cuda', which raises
    # on hosts without a GPU.
    # NOTE(review): fit/evaluate are defined elsewhere; confirm they do not
    # hard-code a device themselves.
    model_few.to(device)
    model_few.train()
    model_few.fit(
        optimizer=optimizer,
        scheduler=None,
        data_train_loader=dataload_train,
        epochs=config['epochs'],
        print_info=1,
        metrics=False,
        log=None,
        metrics_print=False,
    )

    # Only the logits are needed by the caller.
    _, _, (logits, _) = model_few.evaluate(dataload_remain)
    return logits
def treat_sort(dataload_all, logits):
    """Attach prediction scores to the frame, rank it and write it to Excel.

    Args:
        dataload_all: DataFrame that receives a new 'prediction' column
            (added in place).
        logits: tensor of raw model outputs.
            # assumes one logit per row of dataload_all, in row order - TODO confirm

    Side effects:
        Writes the rows sorted by descending 'prediction' to "output.xlsx"
        in the current working directory.
    """
    # Convert to a flat CPU NumPy array before handing the scores to pandas:
    # a tensor that lives on the GPU, is attached to the autograd graph or
    # has shape (N, 1) is not a safe column value.
    scores = torch.sigmoid(logits).detach().cpu().numpy().reshape(-1)
    dataload_all['prediction'] = scores
    ranked = dataload_all.sort_values(by=['prediction'], ascending=False).reset_index(drop=True)
    ranked.to_excel("output.xlsx")
def pipeline(data):
    """Run the whole flow for one uploaded spreadsheet.

    Reads the Excel file, builds the loaders, fine-tunes and scores the
    model, then writes the ranked rows to disk.

    Args:
        data: the uploaded file as delivered by the gradio ``File`` input;
            it is passed straight to ``pd.read_excel``.

    Returns:
        The path of the written workbook, "output.xlsx".
    """
    data = pd.read_excel(data)
    dataload_train, dataload_remain = treat_data_input(data, "its a great text")
    logits = treat_train_evaluate(dataload_train, dataload_remain)
    # The frame built inside treat_data_input is not visible here (using its
    # name raised NameError), so rank the loaded frame itself.
    # assumes the logits follow the row order of `data` - TODO confirm
    treat_sort(data, logits)
    return "output.xlsx"
import gradio as gr | |
# Minimal UI: one file upload, one file download and a button that runs the
# pipeline on the uploaded file.
with gr.Blocks() as demo:
    fil = gr.File(label="input data")
    output = gr.File(label="output data")
    greet_btn = gr.Button("Greet")
    # The value returned by pipeline (a file path) feeds the output component.
    greet_btn.click(
        fn=pipeline,
        inputs=fil,
        outputs=output,
    )

demo.launch()