PeteBleackley committed
Commit 4f8366b · 1 Parent(s): e149b0f
Components for managing training corpora
Files changed:
- qarac/corpora/CombinedCorpus.py +199 -0
- qarac/corpora/CorpusLoader.py +91 -0
- qarac/corpora/Preprocessor.py +91 -0
- qarac/models/QaracTrainerModel.py +13 -2
- qarac/utils/CoreferenceResolver.py +36 -0
qarac/corpora/CombinedCorpus.py
ADDED
@@ -0,0 +1,199 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 14:12:34 2023

@author: peter
"""

import collections
import numpy
import tensorflow
import keras
import CorpusLoader
import CorpusRepeater

class CombinedCorpus(keras.utils.Sequence):

    def __init__(self,tokenizer,**kwargs):
        """
        Creates the Combined Corpus

        Parameters
        ----------
        tokenizer : tokenizers.Tokenizer
            Tokenizer used in preparing datasets
        **kwargs : str
            Paths for tokenized datasets

        Returns
        -------
        None.

        """
        super(CombinedCorpus,self).__init__()
        self.tokenizer = tokenizer
        start_doc = tokenizer.encode('<s>')
        end_doc = tokenizer.encode('</s>')
        self.all_text = CorpusLoader.CorpusLoader(kwargs['all_text'],
                                                  start_doc,
                                                  end_doc,
                                                  ['all_text'],
                                                  {'all_text':('offset_text',
                                                               'encode_decode')})
        n_samples = len(self.all_text)
        self.n_batches = numpy.ceil(n_samples/32.0).astype(int)
        self.question_answering = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['question_answering'],
                                                                                           start_doc,
                                                                                           end_doc,
                                                                                           ['question',
                                                                                            'answer'],
                                                                                           {}),
                                                                 n_samples)
        self.reasoning = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['reasoning'],
                                                                                  start_doc,
                                                                                  end_doc,
                                                                                  ['proposition0',
                                                                                   'proposition1'],
                                                                                  {'conclusion':('conclusion_offset',
                                                                                                 'reasoning')}),
                                                       n_samples)
        self.consistency = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['consistency'],
                                                                                    start_doc,
                                                                                    end_doc,
                                                                                    ['statement0',
                                                                                     'statement1'],
                                                                                    {},
                                                                                    'consistency'),
                                                         n_samples)
        self.batches = []
        self.pad_token = self.tokenizer.token_to_id('<pad>')
        self.on_epoch_end()

    def __len__(self):
        """
        Number of batches

        Returns
        -------
        int
            Number of batches

        """
        return self.n_batches

    def __getitem__(self,n):
        """
        Retrieves a batch of data

        Parameters
        ----------
        n : int
            Index of batch to retrieve

        Returns
        -------
        tuple(dict,dict)
            Batch of data

        """
        return self.batches[n]

    def samples(self):
        """
        Iterates over samples of data

        Yields
        ------
        X : dict
            Sample of training inputs
        Y : dict
            Sample of training outputs

        """
        for sample in zip(self.all_text,
                          self.question_answering,
                          self.reasoning,
                          self.consistency):
            X = {}
            Y = {}
            for (x,y) in sample:
                X.update(x)
                Y.update(y)
            yield (X,Y)

    def on_epoch_end(self):
        """
        Regenerates batches of data

        Returns
        -------
        None.

        """
        self.batches = []
        n = 0
        X = collections.defaultdict(list)
        Y = collections.defaultdict(list)
        for (x,y) in self.samples():
            for (key,value) in x.items():
                X[key].append(value)
            for (key,value) in y.items():
                Y[key].append(value)
            n += 1
            if n == 32:
                self.batches.append(self.batch(X,Y))
                n = 0
                X.clear()
                Y.clear()
        if n != 0:
            self.batches.append(self.batch(X,Y,n))

    def batch(self,X,Y,n=32):
        """
        Creates a batch of data from samples

        Parameters
        ----------
        X : dict[str,list]
            Input samples
        Y : dict[str,list]
            Output samples
        n : int, optional
            Size of batch. The default is 32.

        Returns
        -------
        X : dict[str,tensorflow.Tensor]
            Batched input samples
        Y : dict[str,tensorflow.Tensor]
            Batched output samples

        """
        for (key,value) in X.items():
            X[key] = self.pad(value)
        for (key,value) in Y.items():
            Y[key] = tensorflow.constant(value) if key=='consistency' else self.pad(value)
        Y['question_answering'] = tensorflow.zeros((n,768))
        return (X,Y)

    def pad(self,batch):
        """
        Pads a batch of samples to uniform length

        Parameters
        ----------
        batch : list[tokenizers.Encoding]
            Samples to be padded

        Returns
        -------
        tensorflow.Tensor
            Padded data

        """
        maxlen = max(len(sample) for sample in batch)
        # Encoding.pad works in place, so pad each sample first and then collect the ids
        for sample in batch:
            sample.pad(maxlen,pad_id=self.pad_token)
        return tensorflow.constant([sample.ids for sample in batch])
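A usage illustration (not part of this commit): since CombinedCorpus is a keras.utils.Sequence, it can be passed straight to Model.fit. The dataset paths below are hypothetical placeholders, and the tokenizer is assumed to be the same pretrained RoBERTa tokenizer that Preprocessor loads.

import tokenizers
import CombinedCorpus

# Hypothetical sketch; the .dataset paths are placeholders, not files in this repo.
tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
corpus = CombinedCorpus.CombinedCorpus(tokenizer,
                                       all_text='all_text.dataset',
                                       question_answering='question_answering.dataset',
                                       reasoning='reasoning.dataset',
                                       consistency='consistency.dataset')
X, Y = corpus[0]   # each item is an (inputs, outputs) pair of dicts, already padded and batched
# trainer_model.fit(corpus, epochs=...) would then consume it as a Keras Sequence.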
qarac/corpora/CorpusLoader.py
ADDED
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 07:48:54 2023

@author: peter
"""

import datasets
import tokenizers

class CorpusLoader(object):

    def __init__(self,path,
                 start_doc,
                 end_doc,
                 text_inputs,
                 text_outputs,
                 label=None):
        """
        Creates the Corpus Loader

        Parameters
        ----------
        path : str
            Path to load dataset from
        start_doc : tokenizers.Encoding
            Encoding of the document start token
        end_doc : tokenizers.Encoding
            Encoding of the document end token
        text_inputs : list[str]
            Columns of the dataset to add to the inputs
        text_outputs : dict[str,tuple[str]]
            The columns of the dataset to add to the outputs. The key is the name
            of the column in the original dataset, the first element of the tuple
            is the name that the column prefixed with '<s>' will have in the
            inputs, and the second element of the tuple is the name that the column
            suffixed with '</s>' will have in the outputs
        label : str, optional
            A column of numerical labels to add to the outputs. The default is None.

        Returns
        -------
        None.

        """
        data = datasets.Dataset.from_file(path)
        self.n_rows = len(data)
        self.dataset = data.to_iterable_dataset()
        self.start_doc = start_doc
        self.end_doc = end_doc
        self.text_inputs = text_inputs
        self.text_outputs = text_outputs
        self.label = label

    def __len__(self):
        """
        The length of the corpus

        Returns
        -------
        int
            The number of samples

        """
        return self.n_rows

    def __iter__(self):
        """
        Generates samples in a random order

        Yields
        ------
        X : dict
            Inputs for model
        Y : dict
            Outputs for model

        """
        for row in self.dataset.shuffle():
            X = {}
            Y = {}
            for column in self.text_inputs:
                X[column] = row[column]
            for (column,(x_name,y_name)) in self.text_outputs.items():
                X[x_name] = tokenizers.Encoding.merge([self.start_doc,row[column]])
                Y[y_name] = tokenizers.Encoding.merge([row[column],self.end_doc])
            if self.label is not None:
                Y[self.label] = row[self.label]
            yield (X,Y)
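To make the text_outputs mapping concrete, here is a sketch (not part of the commit) mirroring how CombinedCorpus wires up the reasoning corpus; the dataset path is a placeholder.

import tokenizers
from CorpusLoader import CorpusLoader

tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
start_doc = tokenizer.encode('<s>')
end_doc = tokenizer.encode('</s>')

# 'reasoning.dataset' is a placeholder path to an Arrow file produced by preprocessing.
loader = CorpusLoader('reasoning.dataset',
                      start_doc,
                      end_doc,
                      ['proposition0', 'proposition1'],
                      {'conclusion': ('conclusion_offset', 'reasoning')})
for (X, Y) in loader:
    # X holds 'proposition0', 'proposition1' and the '<s>'-prefixed 'conclusion_offset';
    # Y holds the '</s>'-suffixed 'reasoning' column.
    break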
qarac/corpora/Preprocessor.py
ADDED
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 18 13:18:59 2023

@author: peter
"""

import tokenizers
import datasets
import pandas

class Preprocessor(object):

    def __init__(self,tokenizer_path='roberta-base'):
        """
        Creates the preprocessor

        Parameters
        ----------
        tokenizer_path : str, optional
            The path to the pretrained tokenizer. The default is 'roberta-base'.

        Returns
        -------
        None.

        """
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
        self.start_token = self.tokenizer.encode('<s>')
        self.end_token = self.tokenizer.encode('</s>')

    def __call__(self,data):
        """
        Tokenizes a column of data

        Parameters
        ----------
        data : pandas.Series
            Column of text data

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data

        """
        return self.tokenizer.encode_batch(data,add_special_tokens=False)


    def combine(self,*args):
        """
        Tokenises several data columns as a single batch

        Parameters
        ----------
        *args : sequence of pandas.Series
            Columns of text data to be concatenated and tokenized

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data from all columns

        """
        return self(pandas.concat(args))

    def process_labels(self,data,column):
        """
        Converts labels to numerical values for the consistency objective

        Parameters
        ----------
        data : datasets.Dataset
            Dataset for which labels need to be converted
        column : str
            The column on which to apply label conversion

        Returns
        -------
        datasets.Dataset
            The dataset with the labels converted

        """
        label_values = {'entailment':1.0,
                        'neutral':0.0,
                        'contradiction':-1.0}
        return data.align_labels_with_mapping(label_values,
                                              column)
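A short usage sketch (not part of the commit), assuming a small in-memory set of sentences; in the pipeline the input would be a text column from one of the training corpora.

import pandas
from Preprocessor import Preprocessor   # assumes qarac/corpora is on the import path

preprocessor = Preprocessor()            # loads the pretrained 'roberta-base' tokenizer
sentences = pandas.Series(['The cat sat on the mat.',
                           'A feline rested on a rug.'])
encodings = preprocessor(list(sentences))   # list[tokenizers.Encoding], no special tokens added
print(encodings[0].tokens)
# process_labels would map 'entailment'/'neutral'/'contradiction' to 1.0/0.0/-1.0
# on a datasets.Dataset holding the consistency corpus.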
qarac/models/QaracTrainerModel.py
CHANGED
@@ -53,13 +53,24 @@ class QuaracTrainerModel(keras.Model):
             'conclusion_offset': tokenized text of conclusions for reasoning
                                  objective, prefixed by '<s>'
             'statement0': tokenized statement for consistency objective
+            'statement1': tokenized statement for consistency objective
         training : Bool, optional
             Not used. The default is None.
 
         Returns
         -------
-        results :
-
+        results : dict[str,tensorflow.Tensor]
+            Fields are
+            'encode_decode': tokenized text from decoding of vectors produced by
+                             the answer encoder from 'all_text'
+            'question_answering': difference between the vector produced by the
+                                  question encoder for 'question' and the answer
+                                  encoder for 'answer'
+            'reasoning': tokenized text produced by the decoder from the sum of
+                         vectors produced by the answer encoder for 'proposition0'
+                         and 'proposition1'
+            'consistency': cosine similarity of vectors produced by the answer
+                           encoder from 'statement0' and 'statement1'
 
         """
         results = {}
qarac/utils/CoreferenceResolver.py
CHANGED
@@ -10,15 +10,51 @@ from allennlp.predictors.predictor import Predictor
 import pandas
 
 def clean(sentence):
+    """
+    Ensure sentence ends with a full stop
+
+    Parameters
+    ----------
+    sentence : str
+        Sentence to be cleaned
+
+    Returns
+    -------
+    str
+        Sentence with a full stop at the end.
+
+    """
     return sentence if sentence.strip().endswith('.') else sentence+'.'
 
 class CoreferenceResolver(object):
 
     def __init__(self):
+        """
+        Creates the Coreference resolver
+
+        Returns
+        -------
+        None.
+
+        """
         model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
         self.predictor = Predictor.from_path(model_url)
 
     def __call__(self,group):
+        """
+        Performs coreference resolution on a group of sentences
+
+        Parameters
+        ----------
+        group : pandas.Series
+            Sentences on which to perform coreference resolution
+
+        Returns
+        -------
+        pandas.Series
+            Sentences with coreferences resolved
+
+        """
         tokenized = group.apply(clean).str.split()
         line_breaks = tokenized.apply(len).cumsum()
         doc = []
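A minimal usage sketch (not part of the commit): resolving pronouns across a group of sentences held in a pandas Series. It assumes the AllenNLP SpanBERT archive at the URL hard-coded above can be downloaded.

import pandas
from qarac.utils.CoreferenceResolver import CoreferenceResolver

resolver = CoreferenceResolver()          # downloads and loads the SpanBERT coreference model
sentences = pandas.Series(['Alice went to the market',
                           'She bought apples there'])
resolved = resolver(sentences)            # pandas.Series with pronouns replaced by their referents
print(resolved.tolist())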