from utils import *

import json
import re
import unicodedata
from copy import deepcopy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Undesirable patterns within texts
patterns = {
    'CONCLUSIONS AND IMPLICATIONS': '',
    'BACKGROUND AND PURPOSE': '',
    'EXPERIMENTAL APPROACH': '',
    'KEY RESULTS AEA': '',
    '©': '',
    '®': '',
    'μ': '',
    '(C)': '',
    'OBJECTIVE:': '',
    'MATERIALS AND METHODS:': '',
    'SIGNIFICANCE:': '',
    'BACKGROUND:': '',
    'RESULTS:': '',
    'METHODS:': '',
    'CONCLUSIONS:': '',
    'AIM:': '',
    'STUDY DESIGN:': '',
    'CLINICAL RELEVANCE:': '',
    'CONCLUSION:': '',
    'HYPOTHESIS:': '',
    'Questions/Purposes:': '',
    'Introduction:': '',
    'PURPOSE:': '',
    'PATIENTS AND METHODS:': '',
    'FINDINGS:': '',
    'INTERPRETATIONS:': '',
    'FUNDING:': '',
    'PROGRESS:': '',
    'CONTEXT:': '',
    'MEASURES:': '',
    'DESIGN:': '',
    'BACKGROUND AND OBJECTIVES:': '',
    '<p>': '',
    '</p>': '',
    '<<ETX>>': '',
    '+/-': '',
}

# Lowercase the keys so they match the lowercased text in treat_text
patterns = {x.lower(): y for x, y in patterns.items()}
# Text cleaner: strips boilerplate section headers and normalizes punctuation
class treat_text:
    def __init__(self, patterns):
        self.patterns = patterns

    def __call__(self, text):
        text = unicodedata.normalize("NFKD", str(text))
        text = multiple_replace(self.patterns, text.lower())
        # Drop bracketed asides, lone digits, angle brackets and dangling hyphens
        text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )', '', text)
        # Collapse repeated whitespace and stray commas
        text = re.sub(r'( +)', ' ', text)
        text = re.sub(r'(, ,)|(,,)', ',', text)
        # Spell out percentages
        text = re.sub(r'(%)|(per cent)', ' percent', text)
        return text
# Regex multiple-replace function
def multiple_replace(mapping, text):
    # Build a single alternation regex from the (escaped) dict keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, mapping.keys())))
    # Replace each match with its mapped value
    return regex.sub(lambda mo: mapping[mo.group(0)], text)
treat_text_fun = treat_text(patterns)
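
# Quick sanity check of the cleaner (commented out so the app stays quiet);
# the sample sentence is made up and the output shown is approximate:
#   treat_text_fun("RESULTS: accuracy improved by 5% (n = 30).")
#   # -> ' accuracy improved by 5 percent .'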
# Make the bundled model package importable
import sys
sys.path.append('ML-SLRC/')

path = 'ML-SLRC/'
model_path = path + 'model.pt'
info_path = path + 'Info.json'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model (map_location keeps this working on CPU-only machines)
model = torch.load(model_path, map_location=device)

# Load the trained model's meta information
with open(info_path, 'r') as f:
    Info = json.load(f)
import random
from datetime import datetime
rand_seed = 2003
# datetime object containing current date and time
now = datetime.now()
time_stamp = now.strftime("%d_%m_%Y_HR_%H_%M_%S")
config = {
    "shots_per_class": 8,
    "batch_size": 4,
    "epochs": 8,
    "learning_rate": 5e-05,
    "weight_decay": 0.85,
    "rand_seed": rand_seed,
    "pos_weight": 3.5,  # positive-class weight for BCEWithLogitsLoss
    "p_incld": 0.2,
    "p_excld": 0.01,
}
NAME = str(config['shots_per_class']) + '-shots-Learner_' + time_stamp
num_workers = 0
val_batch = 100
p_included = 0.7
p_notincluded = 0.3
sample_valid = 300

# Seed every RNG in play for reproducibility
gen_seed = torch.Generator().manual_seed(rand_seed)
np.random.seed(rand_seed)
torch.manual_seed(rand_seed)
random.seed(rand_seed)
def treat_data_input(data, etailment_txt):
    # Labeled rows (non-NaN 'test') grouped by class and shuffled for few-shot training
    data_train = data.groupby('test').sample(frac=1)

    # Full sheet, with unlabeled rows tagged so they survive the label mapping
    dataload_all = data.copy()
    dataload_all.test = dataload_all.test.replace({np.nan: 'NANN'})

    dataset_train = SLR_DataSet(data=data_train,
                                input='text',
                                output='test',
                                tokenizer=initializer_model_scibert.tokenizer,
                                LABEL_MAP=LABEL_MAP,
                                treat_text=treat_text_fun,
                                etailment_txt=etailment_txt)
    dataset_remain = SLR_DataSet(data=dataload_all,
                                 input='text',
                                 output='test',
                                 tokenizer=initializer_model_scibert.tokenizer,
                                 LABEL_MAP=LABEL_MAP,
                                 treat_text=treat_text_fun,
                                 etailment_txt=etailment_txt)

    dataload_train = DataLoader(dataset_train,
                                batch_size=config['batch_size'], drop_last=False,
                                num_workers=num_workers)
    dataload_remain = DataLoader(dataset_remain,
                                 batch_size=200, drop_last=False,
                                 num_workers=num_workers)

    # dataload_all is returned too, so the caller can attach predictions to it
    return dataload_train, dataload_remain, dataload_all
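
# A minimal sketch of the expected input frame, assuming the conventions used
# above ('text' holds the abstract, 'test' holds the label, NaN = unscreened);
# the sample values are hypothetical and real labels must match LABEL_MAP:
#   example = pd.DataFrame({
#       'text': ['first abstract ...', 'second abstract ...', 'third abstract ...'],
#       'test': ['included', 'excluded', np.nan],
#   })
#   train_dl, remain_dl, all_df = treat_data_input(example, "its a great text")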
import gc
from torch.optim import Adam
def treat_train_evaluate(dataload_train, dataload_remain):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Free whatever a previous run left on the GPU
    gc.collect()
    torch.cuda.empty_cache()

    # Fine-tune a fresh copy so the base meta-trained model stays untouched
    model_few = deepcopy(model)
    model_few.loss_fn = nn.BCEWithLogitsLoss(reduction='mean',
                                             pos_weight=torch.FloatTensor([config['pos_weight']]))
    optimizer = Adam(model_few.parameters(), lr=config['learning_rate'],
                     weight_decay=config['weight_decay'])
    model_few.to(device)
    model_few.train()

    trainlog = model_few.fit(optimizer=optimizer,
                             scheduler=None,
                             data_train_loader=dataload_train,
                             epochs=config['epochs'], print_info=1, metrics=False,
                             log=None, metrics_print=False)

    # Score every row of the sheet with the adapted model
    (loss, features_out, (logits, outputs)) = model_few.evaluate(dataload_remain)
    return logits
def treat_sort(dataload_all, logits):
    # Sigmoid turns logits into inclusion probabilities; move to CPU for pandas
    dataload_all['prediction'] = torch.sigmoid(logits).detach().cpu().numpy()
    dataload_all = dataload_all.sort_values(by=['prediction'], ascending=False).reset_index(drop=True)
    dataload_all.to_excel("output.xlsx")  # needs openpyxl for .xlsx output
def pipeline(data):
    # gr.File hands over a tempfile-like object; read the spreadsheet from its path
    data = pd.read_excel(data.name if hasattr(data, 'name') else data)
    dataload_train, dataload_remain, dataload_all = treat_data_input(data, "its a great text")
    logits = treat_train_evaluate(dataload_train, dataload_remain)
    treat_sort(dataload_all, logits)
    return "output.xlsx"
import gradio as gr

with gr.Blocks() as demo:
    fil = gr.File(label="input data")
    output = gr.File(label="output data")
    run_btn = gr.Button("Run")
    run_btn.click(fn=pipeline, inputs=fil, outputs=output)

demo.launch()