PeteBleackley committed
Commit 4f8366b · 1 Parent(s): e149b0f

Components for managing training corpora

qarac/corpora/CombinedCorpus.py ADDED
@@ -0,0 +1,199 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Sep 20 14:12:34 2023
+
+ @author: peter
+ """
+
+ import collections
+ import numpy
+ import tensorflow
+ import keras
+ import CorpusLoader
+ import CorpusRepeater
+
+ class CombinedCorpus(keras.utils.Sequence):
+
+     def __init__(self,tokenizer,**kwargs):
+         """
+         Creates the Combined Corpus
+
+         Parameters
+         ----------
+         tokenizer : tokenizers.Tokenizer
+             Tokenizer used in preparing datasets
+         **kwargs : str
+             Paths for tokenized datasets
+
+         Returns
+         -------
+         None.
+
+         """
+         super(CombinedCorpus,self).__init__()
+         self.tokenizer = tokenizer
+         start_doc = tokenizer.encode('<s>')
+         end_doc = tokenizer.encode('</s>')
+         self.all_text = CorpusLoader.CorpusLoader(kwargs['all_text'],
+                                                   start_doc,
+                                                   end_doc,
+                                                   ['all_text'],
+                                                   {'all_text':('offset_text',
+                                                                'encode_decode')})
+         n_samples = len(self.all_text)
+         self.n_batches = numpy.ceil(n_samples/32.0).astype(int)
+         self.question_answering = CorpusRepeater.CorpusRepeater(
+             CorpusLoader.CorpusLoader(kwargs['question_answering'],
+                                       start_doc,
+                                       end_doc,
+                                       ['question',
+                                        'answer'],
+                                       {}),
+             n_samples)
+         self.reasoning = CorpusRepeater.CorpusRepeater(
+             CorpusLoader.CorpusLoader(kwargs['reasoning'],
+                                       start_doc,
+                                       end_doc,
+                                       ['proposition0',
+                                        'proposition1'],
+                                       {'conclusion':('conclusion_offset',
+                                                      'reasoning')}),
+             n_samples)
+         self.consistency = CorpusRepeater.CorpusRepeater(
+             CorpusLoader.CorpusLoader(kwargs['consistency'],
+                                       start_doc,
+                                       end_doc,
+                                       ['statement0',
+                                        'statement1'],
+                                       {},
+                                       'consistency'),
+             n_samples)
+         self.batches = []
+         self.pad_token = self.tokenizer.token_to_id('<pad>')
+         self.on_epoch_end()
+
+     def __len__(self):
+         """
+         Number of batches
+
+         Returns
+         -------
+         int
+             Number of batches
+
+         """
+         return self.n_batches
+
+     def __getitem__(self,n):
+         """
+         Retrieves a batch of data
+
+         Parameters
+         ----------
+         n : int
+             Index of batch to retrieve
+
+         Returns
+         -------
+         tuple(dict,dict)
+             Batch of data
+
+         """
+         return self.batches[n]
+
+     def samples(self):
+         """
+         Iterates over samples of data
+
+         Yields
+         ------
+         X : dict
+             Sample of training inputs
+         Y : dict
+             Sample of training outputs
+
+         """
+         for sample in zip(self.all_text,
+                           self.question_answering,
+                           self.reasoning,
+                           self.consistency):
+             X = {}
+             Y = {}
+             for (x,y) in sample:
+                 X.update(x)
+                 Y.update(y)
+             yield (X,Y)
+
+     def on_epoch_end(self):
+         """
+         Regenerates batches of data
+
+         Returns
+         -------
+         None.
+
+         """
+         self.batches = []
+         n = 0
+         X = collections.defaultdict(list)
+         Y = collections.defaultdict(list)
+         for (x,y) in self.samples():
+             for (key,value) in x.items():
+                 X[key].append(value)
+             for (key,value) in y.items():
+                 Y[key].append(value)
+             n += 1
+             if n == 32:
+                 # Pass copies, since batch() writes tensors back into the dicts it
+                 # receives and clear() below would otherwise empty the stored batch
+                 self.batches.append(self.batch(dict(X),dict(Y)))
+                 n = 0
+                 X.clear()
+                 Y.clear()
+         if n != 0:
+             self.batches.append(self.batch(X,Y,n))
+
+     def batch(self,X,Y,n=32):
+         """
+         Creates a batch of data from samples
+
+         Parameters
+         ----------
+         X : dict[str,list]
+             Input samples
+         Y : dict[str,list]
+             Output samples
+         n : int, optional
+             Size of batch. The default is 32.
+
+         Returns
+         -------
+         X : dict[str,tensorflow.Tensor]
+             Batched input samples
+         Y : dict[str,tensorflow.Tensor]
+             Batched output samples
+
+         """
+         for (key,value) in X.items():
+             X[key] = self.pad(value)
+         for (key,value) in Y.items():
+             Y[key] = tensorflow.constant(value) if key=='consistency' else self.pad(value)
+         Y['question_answering'] = tensorflow.zeros((n,768))
+         return (X,Y)
+
+     def pad(self,batch):
+         """
+         Pads a batch of samples to uniform length
+
+         Parameters
+         ----------
+         batch : list[tokenizers.Encoding]
+             Samples to be padded
+
+         Returns
+         -------
+         tensorflow.Tensor
+             Padded data
+
+         """
+         maxlen = max(len(sample) for sample in batch)
+         # Encoding.pad works in place and returns None, so pad first, then collect ids
+         for sample in batch:
+             sample.pad(maxlen,pad_id=self.pad_token)
+         return tensorflow.constant([sample.ids for sample in batch])
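A minimal usage sketch (not part of this commit): CombinedCorpus groups samples from the four corpora into batches of 32 and exposes them through the keras.utils.Sequence interface, so it could be passed straight to a Keras fit() call. The import path, tokenizer name, and dataset file paths below are illustrative assumptions only.

    # Hypothetical usage; paths and tokenizer name are placeholders
    import tokenizers
    from qarac.corpora import CombinedCorpus

    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    corpus = CombinedCorpus.CombinedCorpus(tokenizer,
                                           all_text='corpora/all_text.dataset',
                                           question_answering='corpora/question_answering.dataset',
                                           reasoning='corpora/reasoning.dataset',
                                           consistency='corpora/consistency.dataset')
    print(len(corpus))   # number of 32-sample batches
    X, Y = corpus[0]     # dicts of batched input and output tensors
    # trainer_model.fit(corpus, epochs=10)   # assuming a compiled Keras trainer model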
qarac/corpora/CorpusLoader.py ADDED
@@ -0,0 +1,91 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Sep 20 07:48:54 2023
+
+ @author: peter
+ """
+
+ import datasets
+ import tokenizers
+
+ class CorpusLoader(object):
+
+     def __init__(self,path,
+                  start_doc,
+                  end_doc,
+                  text_inputs,
+                  text_outputs,
+                  label=None):
+         """
+         Creates the Corpus Loader
+
+         Parameters
+         ----------
+         path : str
+             Path to load the dataset from
+         start_doc : tokenizers.Encoding
+             Encoding of the document start token '<s>'
+         end_doc : tokenizers.Encoding
+             Encoding of the document end token '</s>'
+         text_inputs : list[str]
+             Columns of the dataset to add to the inputs
+         text_outputs : dict[str,tuple[str]]
+             The columns of the dataset to add to the outputs. The key is the name
+             of the column in the original dataset, the first element of the tuple
+             is the name that the column prefixed with '<s>' will have in the
+             inputs, and the second element of the tuple is the name that the column
+             suffixed with '</s>' will have in the outputs
+         label : str, optional
+             A column of numerical labels to add to the outputs. The default is None.
+
+         Returns
+         -------
+         None.
+
+         """
+         data = datasets.Dataset.from_file(path)
+         self.n_rows = len(data)
+         self.dataset = data.to_iterable_dataset()
+         self.start_doc = start_doc
+         self.end_doc = end_doc
+         self.text_inputs = text_inputs
+         self.text_outputs = text_outputs
+         self.label = label
+
+     def __len__(self):
+         """
+         The length of the corpus
+
+         Returns
+         -------
+         int
+             The number of samples
+
+         """
+         return self.n_rows
+
+     def __iter__(self):
+         """
+         Generates samples in a random order
+
+         Yields
+         ------
+         X : dict
+             Inputs for the model
+         Y : dict
+             Outputs for the model
+
+         """
+         for row in self.dataset.shuffle():
+             X = {}
+             Y = {}
+             for column in self.text_inputs:
+                 X[column] = row[column]
+             for (column,(x_name,y_name)) in self.text_outputs.items():
+                 X[x_name] = tokenizers.Encoding.merge([self.start_doc,row[column]])
+                 Y[y_name] = tokenizers.Encoding.merge([row[column],self.end_doc])
+             if self.label is not None:
+                 Y[self.label] = row[self.label]
+             yield (X,Y)
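An illustrative sketch of the text_outputs convention described in the constructor docstring (not part of this commit): each entry maps a dataset column to an offset name for the inputs and a target name for the outputs, mirroring how CombinedCorpus builds its reasoning loader. The dataset path and import path are placeholders.

    # Hypothetical usage; the dataset path is a placeholder
    import tokenizers
    from qarac.corpora import CorpusLoader

    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    start_doc = tokenizer.encode('<s>')
    end_doc = tokenizer.encode('</s>')
    loader = CorpusLoader.CorpusLoader('corpora/reasoning.dataset',
                                       start_doc,
                                       end_doc,
                                       ['proposition0', 'proposition1'],
                                       {'conclusion': ('conclusion_offset',  # '<s>' + conclusion goes to the inputs
                                                       'reasoning')})        # conclusion + '</s>' goes to the outputs
    for X, Y in loader:
        # X contains 'proposition0', 'proposition1' and 'conclusion_offset';
        # Y contains 'reasoning'
        break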
+
qarac/corpora/Preprocessor.py ADDED
@@ -0,0 +1,91 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Mon Sep 18 13:18:59 2023
+
+ @author: peter
+ """
+
+ import tokenizers
+ import datasets
+ import pandas
+
+ class Preprocessor(object):
+
+     def __init__(self,tokenizer_path='roberta-base'):
+         """
+         Creates the preprocessor
+
+         Parameters
+         ----------
+         tokenizer_path : str, optional
+             The path to the pretrained tokenizer. The default is 'roberta-base'.
+
+         Returns
+         -------
+         None.
+
+         """
+         self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
+         self.start_token = self.tokenizer.encode('<s>')
+         self.end_token = self.tokenizer.encode('</s>')
+
+     def __call__(self,data):
+         """
+         Tokenizes a column of data
+
+         Parameters
+         ----------
+         data : pandas.Series
+             Column of text data
+
+         Returns
+         -------
+         list[tokenizers.Encoding]
+             Tokenized data
+
+         """
+         return self.tokenizer.encode_batch(data,add_special_tokens=False)
+
+     def combine(self,*args):
+         """
+         Tokenises several data columns as one
+
+         Parameters
+         ----------
+         *args : sequence of pandas.Series
+             Text columns to be concatenated and tokenized together
+
+         Returns
+         -------
+         list[tokenizers.Encoding]
+             Tokenized data from the concatenated columns
+
+         """
+         return self(pandas.concat(args))
+
+     def process_labels(self,data,column):
+         """
+         Converts labels to numerical values for the consistency objective
+
+         Parameters
+         ----------
+         data : datasets.Dataset
+             Dataset for which labels need to be converted
+         column : str
+             The column on which to apply label conversion
+
+         Returns
+         -------
+         datasets.Dataset
+             The dataset with the labels converted
+
+         """
+         label_values = {'entailment':1.0,
+                         'neutral':0.0,
+                         'contradiction':-1.0}
+         return data.align_labels_with_mapping(label_values,
+                                               column)
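A brief usage sketch (not part of this commit): the Preprocessor tokenizes text columns without adding special tokens, and maps NLI-style string labels onto the numerical targets used by the consistency objective. The example data, column name, and import path are assumptions.

    # Hypothetical usage; the data and column name are placeholders
    import pandas
    from qarac.corpora.Preprocessor import Preprocessor

    preprocessor = Preprocessor()
    sentences = pandas.Series(['The cat sat on the mat.', 'Dogs bark.'])
    encodings = preprocessor(sentences)   # list[tokenizers.Encoding], no special tokens added
    # For an NLI-style datasets.Dataset, convert string labels to consistency targets:
    # nli_data = preprocessor.process_labels(nli_data, 'label')   # entailment=1.0, neutral=0.0, contradiction=-1.0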
qarac/models/QaracTrainerModel.py CHANGED
@@ -53,13 +53,24 @@ class QuaracTrainerModel(keras.Model):
          'conclusion_offset': tokenized text of conclusions for reasoning
                               objective, prefixed by '<s>'
          'statement0': tokenized statement for consistency objective
+         'statement1': tokenized statement for consistency objective
      training : Bool, optional
          Not used. The default is None.

      Returns
      -------
-     results : TYPE
-         DESCRIPTION.
+     results : dict[str,tensorflow.Tensor]
+         Fields are
+         'encode_decode': tokenised text from decoding of vectors produced by
+                          the answer encoder from 'all_text'
+         'question_answering': difference between the vector produced by the
+                               question encoder for 'question' and the answer
+                               encoder for 'answer'
+         'reasoning': tokenised text produced by the decoder from the sum of
+                      vectors produced by the answer encoder for 'proposition0'
+                      and 'proposition1'
+         'consistency': cosine similarity of vectors produced by the answer
+                        encoder from 'statement0' and 'statement1'

      """
      results = {}
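For illustration only (not part of this commit): the four outputs documented above could each be given their own loss when the trainer model is compiled, with the loss dict keyed by output name. The specific optimizer and loss choices here are assumptions, not taken from this repository.

    # Illustrative sketch; trainer_model and the loss choices are assumptions
    import keras

    trainer_model.compile(optimizer='adam',
                          loss={'encode_decode': keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                                'question_answering': keras.losses.MeanSquaredError(),  # drive the difference vector towards zero
                                'reasoning': keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                                'consistency': keras.losses.MeanSquaredError()})        # match the -1/0/+1 labels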
qarac/utils/CoreferenceResolver.py CHANGED
@@ -10,15 +10,51 @@ from allennlp.predictors.predictor import Predictor
  import pandas

  def clean(sentence):
+     """
+     Ensure sentence ends with a full stop
+
+     Parameters
+     ----------
+     sentence : str
+         Sentence to be cleaned
+
+     Returns
+     -------
+     str
+         Sentence with a full stop at the end.
+
+     """
      return sentence if sentence.strip().endswith('.') else sentence+'.'

  class CoreferenceResolver(object):

      def __init__(self):
+         """
+         Creates the Coreference resolver
+
+         Returns
+         -------
+         None.
+
+         """
          model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
          self.predictor = Predictor.from_path(model_url)

      def __call__(self,group):
+         """
+         Resolves coreferences across a group of sentences
+
+         Parameters
+         ----------
+         group : pandas.Series
+             Sentences on which to perform coreference resolution
+
+         Returns
+         -------
+         pandas.Series
+             Sentences with coreferences resolved
+
+         """
          tokenized = group.apply(clean).str.split()
          line_breaks = tokenized.apply(len).cumsum()
          doc = []
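A possible usage sketch (not part of this commit): since __call__ takes a pandas.Series of sentences and returns them with coreferences resolved, it could be applied once per document via a groupby. The dataframe, its column names, and the import path are placeholders.

    # Hypothetical usage; dataframe and column names are placeholders
    import pandas
    from qarac.utils.CoreferenceResolver import CoreferenceResolver

    resolver = CoreferenceResolver()
    df = pandas.DataFrame({'document': ['doc0', 'doc0'],
                           'sentence': ['Alice went home.', 'She was tired.']})
    # Resolve pronouns within each document separately
    resolved = df.groupby('document')['sentence'].apply(resolver)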