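"""Training and evaluation script for QARAC, an experimental NLP system aimed
at improving factual accuracy. It downloads and prepares the training corpora
(WikiQA, Avicenna, SNLI), trains the question encoder, answer encoder and
decoder, and evaluates encode-decode reconstruction, question answering,
reasoning and consistency. The task to run is selected on the command line
(see the argument parser at the bottom of the file)."""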
import os
import argparse
import numpy
import tokenizers
import transformers
import huggingface_hub
import qarac.corpora.BNCorpus
import qarac.models.QaracTrainerModel
import qarac.corpora.CombinedCorpus
import torch
import spacy
import pandas
import qarac.utils.CoreferenceResolver
import nltk.corpus
import difflib
import scipy.stats
import scipy.spatial
import seaborn
import tqdm
import gradio
import boto3
class SequenceCrossEntropyLoss(torch.nn.Module):

    def __init__(self):
        super(SequenceCrossEntropyLoss,self).__init__()
        self.crossentropy = torch.nn.CrossEntropyLoss()

    def forward(self,y_pred,y_true):
        # Flatten (batch, sequence, classes) logits and (batch, sequence) labels
        # so standard cross-entropy can be applied token by token.
        (batch_size,sequence_length,n_classes) = y_pred.shape
        predictions = y_pred.view(-1,n_classes)
        labels = y_true.view(-1)
        return self.crossentropy(predictions,labels)


class CombinedLoss(torch.nn.Module):

    def __init__(self):
        super(CombinedLoss,self).__init__()
        self.component_losses = (SequenceCrossEntropyLoss(),
                                 torch.nn.MSELoss(),
                                 SequenceCrossEntropyLoss(),
                                 torch.nn.MSELoss())

    def forward(self,y_pred,y_true):
        # Sum the component losses over the four (prediction, target) pairs.
        return sum((fn(pred,obs)
                    for (fn,pred,obs) in zip(self.component_losses,
                                             y_pred,
                                             y_true)))
def capitalise(token,i):
    return token.text_with_ws.title() if i==0 or token.tag_.startswith('NNP') else token.text_with_ws.lower()

def clean_question(doc):
    words = [capitalise(token,i) for (i,token) in enumerate(doc)]
    if words[-1]!='?':
        words.append('?')
    return ''.join(words)
def download_training_data():
    if not os.path.exists('corpora'):
        os.makedirs('corpora')
    s3 = boto3.client('s3',
                      aws_access_key_id=os.environ['AWS_KEY'],
                      aws_secret_access_key=os.environ['AWS_SECRET'])
    for obj in s3.list_objects(Bucket='qarac')['Contents']:
        filename = obj['Key']
        s3.download_file('qarac',filename,'corpora/{}'.format(filename))
def prepare_wiki_qa(filename,outfilename):
    data = pandas.read_csv(filename,sep='\t')
    data['QNum'] = data['QuestionID'].apply(lambda x: int(x[1:]))
    nlp = spacy.load('en_core_web_trf')
    predictor = qarac.utils.CoreferenceResolver.CoreferenceResolver()
    data['Resolved_answer'] = data.groupby('QNum')['Sentence'].transform(predictor)
    unique_questions = data.groupby('QNum')['Question'].first()
    cleaned_questions = pandas.Series([clean_question(doc)
                                       for doc in nlp.pipe(unique_questions)],
                                      index=unique_questions.index)
    for (i,question) in cleaned_questions.items():
        data.loc[data['QNum']==i,'Cleaned_question'] = question
    data[['Cleaned_question','Resolved_answer','Label']].to_csv(outfilename)
def prepare_training_datasets():
    wikiqa = pandas.read_csv('corpora/WikiQA.csv')
    avicenna = pandas.read_csv('corpora/Avicenna_Train.csv',encoding='iso-8859-1')
    snli = pandas.read_csv('corpora/snli_1.0_train.csv')
    question_answering = wikiqa.loc[wikiqa['Label']==1,
                                    ['Cleaned_question',
                                     'Resolved_answer']].rename(columns={'Cleaned_question':'question',
                                                                         'Resolved_answer':'answer'})
    reasoning = avicenna.loc[avicenna['Syllogistic relation']=='yes',
                             ['Premise 1',
                              'Premise 2',
                              'Conclusion']].rename(columns={'Premise 1':'proposition0',
                                                             'Premise 2':'proposition1',
                                                             'Conclusion':'conclusion'})
    consistency = snli.loc[snli['gold_label']!='-',
                           ['sentence1',
                            'sentence2']].rename(columns={'sentence1':'statement0',
                                                          'sentence2':'statement1'})
    mapping = {'entailment':1.0,
               'neutral':0.0,
               'contradiction':-1.0}
    consistency['consistency'] = snli.loc[snli['gold_label']!='-',
                                          'gold_label'].apply(lambda x: mapping[x])
    all_text = pandas.concat([wikiqa['Resolved_answer'],
                              avicenna['Premise 1'],
                              avicenna['Premise 2'],
                              reasoning['conclusion'],
                              snli['sentence1'],
                              snli['sentence2']]).to_frame(name='all_text').reset_index(drop=True)
    all_text.to_csv('corpora/all_text.csv')
    question_answering.to_csv('corpora/question_answering.csv')
    reasoning.to_csv('corpora/reasoning_train.csv')
    consistency.to_csv('corpora/consistency.csv')
def train_models(path,progress=gradio.Progress(track_tqdm=True)):
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    trainer = qarac.models.QaracTrainerModel.QaracTrainerModel('roberta-base',
                                                               tokenizer)
    trainer.cuda()
    loss_fn = CombinedLoss()
    loss_fn.cuda()
    optimizer = torch.optim.NAdam(trainer.parameters(),lr=5.0e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.9)
    training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                all_text='corpora/all_text.csv',
                                                                question_answering='corpora/question_answering.csv',
                                                                reasoning='corpora/reasoning_train.csv',
                                                                consistency='corpora/consistency.csv')
    n_batches = len(training_data)
    history = {}
    for epoch in range(10):
        print("Epoch",epoch)
        epoch_label = 'Epoch {}'.format(epoch)
        epoch_data = {}
        for (batch,(X,Y)) in enumerate(tqdm.tqdm(training_data)):
            prediction = trainer(X['all_text'],
                                 X['offset_text'],
                                 X['question'],
                                 X['answer'],
                                 X['proposition0'],
                                 X['proposition1'],
                                 X['conclusion_offset'],
                                 X['statement0'],
                                 X['statement1'])
            loss = loss_fn(prediction,Y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Record the loss every 1024 batches and at the end of the epoch.
            if batch % 1024 == 0 or batch == n_batches-1:
                epoch_data[batch] = loss.item()
        history[epoch_label] = epoch_data
        scheduler.step()
    huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
    trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
    trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
    trainer.decoder.push_to_hub('{}/qarac-roberta-decoder'.format(path))
    return history
def test_encode_decode(path):
    encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
    decoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-decoder'.format(path))
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    exclude = tokenizer.encode('<s> </s> <pad>').ids
    analyser = difflib.SequenceMatcher(lambda x: x in exclude)
    bnc = nltk.corpus.reader.bnc.BNCCorpusReader('/'.join([os.environ['HOME'],
                                                           'BNC',
                                                           'Texts']),
                                                 fileids=r'[A-K]/\w*/\w*\.xml')
    matches = []
    batch = []
    pad_token = tokenizer.token_to_id('<pad>')
    for sent in bnc.sents(strip_space=False):
        batch.append(tokenizer.encode(''.join(sent)))
        if len(batch)==32:
            maxlen = max((len(sentence) for sentence in batch))
            for sample in batch:
                sample.pad(maxlen,pad_id=pad_token)
            input_ids = torch.tensor([sample.ids for sample in batch])
            attention_mask = torch.not_equal(input_ids,pad_token)
            vectors = encoder(input_ids,
                              attention_mask)
            decoded = decoder.generate(vector=vectors)
            for (s1,s2) in zip(batch,decoded):
                analyser.set_seqs(s1.ids, s2)
                matches.append(analyser.ratio())
            batch = []
    if len(batch)!=0:
        maxlen = max((len(sentence) for sentence in batch))
        for sample in batch:
            sample.pad(maxlen,pad_id=pad_token)
        input_ids = torch.tensor([sample.ids for sample in batch])
        attention_mask = torch.not_equal(input_ids, pad_token)
        vectors = encoder(input_ids,
                          attention_mask)
        decoded = decoder.generate(vector=vectors)
        for (s1,s2) in zip(batch,decoded):
            analyser.set_seqs(s1.ids, s2)
            matches.append(analyser.ratio())
    matches = numpy.array(matches)
    print("Accuracy: mean = {0}, sd = {1}".format(matches.mean(),
                                                  matches.std()))
    (alpha,beta,loc,scale) = scipy.stats.beta.fit(matches,floc=0.0,fscale=1.0)
    print("Beta distribution parameters alpha = {0}, beta = {1}".format(alpha,beta))
    (hist,bins) = numpy.histogram(matches,bins='fd')
    with pandas.option_context('plotting.backend','matplotlib.backends.backend_svg') as options:
        axes = pandas.Series(hist,index=(bins[1:]+bins[:-1])/2).plot.bar()
        axes.get_figure().savefig('encode_decode_histogram.svg')
    percent = numpy.linspace(0.0,1.0,101)
    percentiles = numpy.quantile(matches,percent)
    with pandas.option_context('plotting.backend','matplotlib.backends.backend_svg') as options:
        axes = pandas.Series(percentiles, index=percent).plot.bar()
        axes.get_figure().savefig('encode_decode_percentile.svg')
def test_question_answering(path):
    question_encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-question-encoder'.format(path))
    answer_encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    data = pandas.read_csv('WikiQA.tsv',sep='\t')
    data['QNum'] = data['QuestionID'].apply(lambda x: int(x[1:]))
    nlp = spacy.load('en_core_web_trf')
    predictor = qarac.utils.CoreferenceResolver.CoreferenceResolver()
    data['Resolved_answer'] = data.groupby('QNum')['Sentence'].transform(predictor)
    unique_questions = data.groupby('QNum')['Question'].first()
    cleaned_questions = pandas.Series([clean_question(doc)
                                       for doc in nlp.pipe(unique_questions)],
                                      index=unique_questions.index)
    def tokenize(column):
        return tokenizer.encode_batch(column.apply(lambda x: tokenizers.TextInputSequence(x)),
                                      add_special_tokens=False)
    questions = tokenize(cleaned_questions)
    maxlen = max((len(question) for question in questions))
    pad_token = tokenizer.token_to_id('<pad>')
    for question in questions:
        question.pad(maxlen,pad_id=pad_token)
    question_ids = torch.tensor([question.ids
                                 for question in questions])
    attention_mask = torch.not_equal(question_ids,
                                     pad_token)
    q_vectors = question_encoder(question_ids,
                                 attention_mask=attention_mask).numpy()
    answers = tokenize(data['Resolved_answer'])
    maxlen = max((len(answer) for answer in answers))
    for answer in answers:
        answer.pad(maxlen,pad_id=pad_token)
    answer_ids = torch.tensor([answer.ids
                               for answer in answers])
    attention_mask = torch.not_equal(answer_ids,
                                     pad_token)
    answer_lookup = scipy.spatial.KDTree(answer_encoder(answer_ids,
                                                        attention_mask=attention_mask).numpy())
    n_correct = 0
    all_distances = 0.0
    correct_distances = 0.0
    wrong_distances = 0.0
    all_sq = 0.0
    correct_sq = 0.0
    wrong_sq = 0.0
    for (qnum,qv) in zip(cleaned_questions.index,q_vectors):
        # KDTree.query returns the distance and the row index of the nearest answer vector.
        (d,row) = answer_lookup.query(qv)
        dsq = d**2.0
        answer = data.iloc[row]
        correct = (answer['QNum']==qnum and answer['Label']==1)
        all_distances += d
        all_sq += dsq
        if correct:
            n_correct += 1
            correct_distances += d
            correct_sq += dsq
        else:
            wrong_distances += d
            wrong_sq += dsq
    N = cleaned_questions.shape[0]
    print("{0} questions, {1} possible answers, {2} correct answers".format(N,
                                                                            data.shape[0],
                                                                            n_correct))
    accuracy = n_correct/N
    baseline = N/data.shape[0]
    kappa = 1.0 - ((1.0-accuracy)/(1.0-baseline))
    print("Accuracy: {0}, Baseline: {1}, kappa: {2}".format(accuracy,baseline,kappa))
    mean_dist = all_distances/N
    mean_sq = all_sq/N
    all_sd = numpy.sqrt(mean_sq-(mean_dist**2.0))
    print("Question-answer distances")
    print("All: mean {0}, sd {1}".format(mean_dist,all_sd))
    correct_mean = correct_distances/n_correct
    correct_meansq = correct_sq/n_correct
    correct_sd = numpy.sqrt(correct_meansq - (correct_mean**2.0))
    print("Correct: mean {0}, sd {1}".format(correct_mean,correct_sd))
    wrong_mean = wrong_distances/(N-n_correct)
    wrong_meansq = wrong_sq/(N-n_correct)
    wrong_sd = numpy.sqrt(wrong_meansq - (wrong_mean**2.0))
    print("Wrong: mean {0}, sd {1}".format(wrong_mean,wrong_sd))
def test_reasoning(path):
    encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
    decoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-decoder'.format(path))
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    exclude = tokenizer.encode('<s> </s> <pad>').ids
    analyser = difflib.SequenceMatcher(lambda x: x in exclude)
    data = pandas.read_csv('corpora/Avicenna_Test.csv',encoding='iso-8859-1')
    data = data.loc[data['Syllogistic relation']=='yes']
    def tokenize(column):
        return tokenizer.encode_batch(column.apply(lambda x: tokenizers.TextInputSequence(x)),
                                      add_special_tokens=False)
    p0 = tokenize(data['Premise 1'])
    p1 = tokenize(data['Premise 2'])
    c = tokenize(data['Conclusion'])
    p0_batch = []
    p1_batch = []
    c_batch = []
    n = 0
    pad_token = tokenizer.token_to_id('<pad>')
    matches = []
    for (p0_sample,p1_sample,c_sample) in zip(p0,p1,c):
        p0_batch.append(p0_sample)
        p1_batch.append(p1_sample)
        c_batch.append(c_sample)
        n += 1
        if n==32:
            maxlen = max((len(sample) for sample in p0_batch))
            for sample in p0_batch:
                sample.pad(maxlen,pad_id=pad_token)
            p0_in = torch.tensor([sample.ids for sample in p0_batch])
            p0_attn = torch.not_equal(p0_in,
                                      pad_token)
            maxlen = max((len(sample) for sample in p1_batch))
            for sample in p1_batch:
                sample.pad(maxlen,pad_id=pad_token)
            p1_in = torch.tensor([sample.ids for sample in p1_batch])
            p1_attn = torch.not_equal(p1_in,
                                      pad_token)
            predictions = decoder.generate(vector=(encoder(p0_in,
                                                           attention_mask=p0_attn)
                                                   +encoder(p1_in,
                                                            attention_mask=p1_attn)))
            for (s1,s2) in zip(c_batch,predictions):
                analyser.set_seqs(s1.ids, s2)
                matches.append(analyser.ratio())
            n = 0
            p0_batch = []
            p1_batch = []
            c_batch = []
    if n!=0:
        maxlen = max((len(sample) for sample in p0_batch))
        for sample in p0_batch:
            sample.pad(maxlen,pad_id=pad_token)
        p0_in = torch.tensor([sample.ids for sample in p0_batch])
        p0_attn = torch.not_equal(p0_in,
                                  pad_token)
        maxlen = max((len(sample) for sample in p1_batch))
        for sample in p1_batch:
            sample.pad(maxlen,pad_id=pad_token)
        p1_in = torch.tensor([sample.ids for sample in p1_batch])
        p1_attn = torch.not_equal(p1_in,
                                  pad_token)
        predictions = decoder.generate(vector=(encoder(p0_in,
                                                       attention_mask=p0_attn)
                                               +encoder(p1_in,
                                                        attention_mask=p1_attn)))
        for (s1,s2) in zip(c_batch,predictions):
            analyser.set_seqs(s1.ids, s2)
            matches.append(analyser.ratio())
    matches = numpy.array(matches)
    print("Accuracy: mean = {0}, sd = {1}".format(matches.mean(),
                                                  matches.std()))
    (alpha,beta,loc,scale) = scipy.stats.beta.fit(matches,floc=0.0,fscale=1.0)
    print("Beta distribution parameters alpha = {0}, beta = {1}".format(alpha,beta))
    (hist,bins) = numpy.histogram(matches,bins='fd')
    with pandas.option_context('plotting.backend','matplotlib.backends.backend_svg') as options:
        axes = pandas.Series(hist,index=(bins[1:]+bins[:-1])/2).plot.bar()
        axes.get_figure().savefig('reasoning_histogram.svg')
    percent = numpy.linspace(0.0,1.0,101)
    percentiles = numpy.quantile(matches,percent)
    with pandas.option_context('plotting.backend','matplotlib.backends.backend_svg') as options:
        axes = pandas.Series(percentiles, index=percent).plot.bar()
        axes.get_figure().savefig('reasoning_percentile.svg')
def test_consistency(path):
    encoder = transformers.Transformer.from_pretrained('{}/qarac-roberta-answer-encoder'.format(path))
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    data = pandas.read_csv('corpora/snli_1.0_test.csv')
    data = data.loc[data['gold_label']!='-']
    pad_token = tokenizer.token_to_id('<pad>')
    def tokenize(column):
        return tokenizer.encode_batch(column.apply(lambda x: tokenizers.TextInputSequence(x)),
                                      add_special_tokens=False)
    s0 = tokenize(data['sentence1'])
    s1 = tokenize(data['sentence2'])
    maxlen = max((len(sentence) for sentence in s0))
    for sentence in s0:
        sentence.pad(maxlen,pad_id=pad_token)
    s0_in = torch.tensor([sentence.ids for sentence in s0])
    s0_attn = torch.not_equal(s0_in,
                              pad_token)
    maxlen = max((len(sentence) for sentence in s1))
    for sentence in s1:
        sentence.pad(maxlen,pad_id=pad_token)
    s1_in = torch.tensor([sentence.ids for sentence in s1])
    s1_attn = torch.not_equal(s1_in,
                              pad_token)
    s0_vec = encoder(s0_in,attention_mask=s0_attn)
    s1_vec = encoder(s1_in,attention_mask=s1_attn)
    # Cosine similarity along the feature dimension of the (batch, features) encodings.
    cosine = torch.nn.CosineSimilarity(dim=1,eps=1.0e-12)
    consistency = cosine(s0_vec,s1_vec).numpy()
    results = pandas.DataFrame({'label':data['gold_label'],
                                'score':consistency})
    third = 1.0/3.0
    def predicted_labels(x):
        return 'entailment' if x>third else 'contradiction' if x<-third else 'neutral'
    results['prediction'] = results['score'].apply(predicted_labels)
    # Confusion matrix with true labels as rows and predicted labels as columns.
    confusion = results.groupby('label')['prediction'].value_counts().unstack(fill_value=0)
    seaborn.heatmap(confusion).get_figure().savefig('consistency_confusion_matrix.svg')
    correct = pandas.Series({label:confusion.loc[label,label]
                             for label in confusion.index})
    print("Accuracy: {}".format(correct.sum()/data.shape[0]))
    print("Precision")
    print(correct/confusion.sum(axis='index'))
    print("Recall")
    print(correct/confusion.sum(axis='columns'))
    def stats(group):
        (alpha,beta,loc,scale) = scipy.stats.beta.fit(group)
        mean = group.mean()
        sd = group.std()
        return pandas.Series({'mean':mean,
                              'sd':sd,
                              'min':loc,
                              'max':loc+scale,
                              'alpha':alpha,
                              'beta':beta})
    print(results.groupby('label')['score'].apply(stats))
    quartiles = numpy.quantile(consistency,[0.0,0.25,0.5,0.75,1.0])
    IQR = quartiles[3]-quartiles[1]
    # Freedman-Diaconis rule: bin width = 2*IQR/n^(1/3).
    bin_width = 2.0*IQR/(data.shape[0]**(1.0/3.0))
    n_bins = int((quartiles[4]-quartiles[0])/bin_width)
    bins = numpy.linspace(quartiles[0],quartiles[4],n_bins+1)
    def hist(col):
        (result,_) = numpy.histogram(col,bins)
        return result
    histograms = pandas.DataFrame({label:hist(group)
                                   for (label,group) in results.groupby('label')['score']},
                                  index=(bins[1:]+bins[:-1])/2)
    with pandas.option_context('plotting.backend','matplotlib.backends.backend_svg') as options:
        axes = histograms.plot.bar(stacked=True)
        axes.get_figure().savefig('consistency_histograms.svg')
    percent = numpy.linspace(0.0,1.0,101)
    percentiles = pandas.DataFrame({label:numpy.quantile(group,percent)
                                    for (label,group) in results.groupby('label')['score']},
                                   index=percent)
    with pandas.option_context('plotting.backend','matplotlib.backends.backend_svg') as options:
        axes = percentiles.plot.line()
        axes.get_figure().savefig('consistency_percentiles.svg')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='QARAC',
                                     description='Experimental NLP system, aimed at improving factual accuracy')
    parser.add_argument('task')
    parser.add_argument('-f','--filename')
    parser.add_argument('-t','--training-task')
    parser.add_argument('-o','--outputfile')
    args = parser.parse_args()
    if args.task == 'prepare_wiki_qa':
        prepare_wiki_qa(args.filename,args.outputfile)
    elif args.task == 'prepare_training_datasets':
        prepare_training_datasets()
    elif args.task == 'train_models':
        train_models(args.filename)
    elif args.task == 'test_encode_decode':
        test_encode_decode(args.filename)
    elif args.task == 'test_question_answering':
        test_question_answering(args.filename)
    elif args.task == 'test_reasoning':
        test_reasoning(args.filename)
    elif args.task == 'test_consistency':
        test_consistency(args.filename)
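# Example invocations (a sketch, assuming this file is saved as scripts.py; the
# task names come from the argument parser above, and <huggingface-namespace>
# is a placeholder for the Hugging Face account or organisation to push to):
#   python scripts.py prepare_wiki_qa -f WikiQA.tsv -o corpora/WikiQA.csv
#   python scripts.py prepare_training_datasets
#   python scripts.py train_models -f <huggingface-namespace>
#   python scripts.py test_encode_decode -f <huggingface-namespace>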