PeteBleackley committed
Commit 4f8366b · 1 Parent(s): e149b0f
Components for managing training corpora
Files changed:
- qarac/corpora/CombinedCorpus.py +199 -0
- qarac/corpora/CorpusLoader.py +91 -0
- qarac/corpora/Preprocessor.py +91 -0
- qarac/models/QaracTrainerModel.py +13 -2
- qarac/utils/CoreferenceResolver.py +36 -0
qarac/corpora/CombinedCorpus.py
ADDED
@@ -0,0 +1,199 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 14:12:34 2023

@author: peter
"""

import collections
import numpy
import tensorflow
import keras
import CorpusLoader
import CorpusRepeater

class CombinedCorpus(keras.utils.Sequence):

    def __init__(self,tokenizer,**kwargs):
        """
        Creates the Combined Corpus

        Parameters
        ----------
        tokenizer : tokenizers.Tokenizer
            Tokenizer used in preparing datasets
        **kwargs : str
            Paths for tokenized datasets

        Returns
        -------
        None.

        """
        super(CombinedCorpus,self).__init__()
        self.tokenizer = tokenizer
        start_doc = tokenizer.encode('<s>')
        end_doc = tokenizer.encode('</s>')
        self.all_text = CorpusLoader.CorpusLoader(kwargs['all_text'],
                                                  start_doc,
                                                  end_doc,
                                                  ['all_text'],
                                                  {'all_text':('offset_text',
                                                               'encode_decode')})
        n_samples = len(self.all_text)
        self.n_batches = numpy.ceil(n_samples/32.0).astype(int)
        self.question_answering = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['question_answering'],
                                                                                           start_doc,
                                                                                           end_doc,
                                                                                           ['question',
                                                                                            'answer'],
                                                                                           {}),
                                                                 n_samples)
        self.reasoning = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['reasoning'],
                                                                                  start_doc,
                                                                                  end_doc,
                                                                                  ['proposition0',
                                                                                   'proposition1'],
                                                                                  {'conclusion':('conclusion_offset',
                                                                                                 'reasoning')}),
                                                       n_samples)
        self.consistency = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['consistency'],
                                                                                    start_doc,
                                                                                    end_doc,
                                                                                    ['statement0',
                                                                                     'statement1'],
                                                                                    {},
                                                                                    'consistency'),
                                                         n_samples)
        self.batches = []
        self.pad_token = self.tokenizer.token_to_id('<pad>')
        self.on_epoch_end()

    def __len__(self):
        """
        Number of batches

        Returns
        -------
        int
            Number of batches

        """
        return self.n_batches

    def __getitem__(self,n):
        """
        Retrieves a batch of data

        Parameters
        ----------
        n : int
            Index of batch to retrieve

        Returns
        -------
        tuple(dict,dict)
            Batch of data

        """
        return self.batches[n]

    def samples(self):
        """
        Iterates over samples of data

        Yields
        ------
        X : dict
            Sample of training inputs
        Y : dict
            Sample of training outputs

        """
        for sample in zip(self.all_text,
                          self.question_answering,
                          self.reasoning,
                          self.consistency):
            X = {}
            Y = {}
            for (x,y) in sample:
                X.update(x)
                Y.update(y)
            yield (X,Y)

    def on_epoch_end(self):
        """
        Regenerates batches of data

        Returns
        -------
        None.

        """
        self.batches = []
        n = 0
        X = collections.defaultdict(list)
        Y = collections.defaultdict(list)
        for (x,y) in self.samples():
            for (key,value) in x.items():
                X[key].append(value)
            for (key,value) in y.items():
                Y[key].append(value)
            n += 1
            if n == 32:
                self.batches.append(self.batch(X,Y))
                n = 0
                X.clear()
                Y.clear()
        if n != 0:
            self.batches.append(self.batch(X,Y,n))

    def batch(self,X,Y,n=32):
        """
        Creates a batch of data from samples

        Parameters
        ----------
        X : dict[str,list]
            Input samples
        Y : dict[str,list]
            Output samples
        n : int, optional
            Size of batch. The default is 32.

        Returns
        -------
        X : dict[str,tensorflow.Tensor]
            Batched input samples
        Y : dict[str,tensorflow.Tensor]
            Batched output samples

        """
        for (key,value) in X.items():
            X[key] = self.pad(value)
        for (key,value) in Y.items():
            Y[key] = tensorflow.constant(value) if key=='consistency' else self.pad(value)
        Y['question_answering'] = tensorflow.zeros((n,768))
        return (X,Y)

    def pad(self,batch):
        """
        Pads a batch of samples to uniform length

        Parameters
        ----------
        batch : list[tokenizers.Encoding]
            Samples to be padded

        Returns
        -------
        tensorflow.Tensor
            Padded data

        """
        maxlen = max(len(sample) for sample in batch)
        # Encoding.pad works in place, so pad each sample first and then collect the ids
        for sample in batch:
            sample.pad(maxlen,pad_id=self.pad_token)
        return tensorflow.constant([sample.ids for sample in batch])
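A usage illustration (not part of this commit): since CombinedCorpus is a keras.utils.Sequence, it can be passed straight to Model.fit. The dataset paths below are hypothetical placeholders, and the tokenizer is assumed to be the same pretrained RoBERTa tokenizer that Preprocessor loads.

import tokenizers
import CombinedCorpus

# Hypothetical sketch; the .dataset paths are placeholders, not files in this repo.
tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
corpus = CombinedCorpus.CombinedCorpus(tokenizer,
                                       all_text='all_text.dataset',
                                       question_answering='question_answering.dataset',
                                       reasoning='reasoning.dataset',
                                       consistency='consistency.dataset')
X, Y = corpus[0]   # each item is an (inputs, outputs) pair of dicts, already padded and batched
# trainer_model.fit(corpus, epochs=...) would then consume it as a Keras Sequence.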
qarac/corpora/CorpusLoader.py
ADDED
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 07:48:54 2023

@author: peter
"""

import datasets
import tokenizers

class CorpusLoader(object):

    def __init__(self,path,
                 start_doc,
                 end_doc,
                 text_inputs,
                 text_outputs,
                 label=None):
        """
        Creates the Corpus Loader

        Parameters
        ----------
        path : str
            Path to load dataset from
        start_doc : tokenizers.Encoding
            Encoding of the document start token
        end_doc : tokenizers.Encoding
            Encoding of the document end token
        text_inputs : list[str]
            Columns of the dataset to add to the inputs
        text_outputs : dict[str,tuple[str]]
            The columns of the dataset to add to the outputs. The key is the name
            of the column in the original dataset, the first element of the tuple
            is the name that the column prefixed with '<s>' will have in the
            inputs, and the second element of the tuple is the name that the column
            suffixed with '</s>' will have in the outputs
        label : str, optional
            A column of numerical labels to add to the outputs. The default is None.

        Returns
        -------
        None.

        """
        data = datasets.Dataset.from_file(path)
        self.n_rows = len(data)
        self.dataset = data.to_iterable_dataset()
        self.start_doc = start_doc
        self.end_doc = end_doc
        self.text_inputs = text_inputs
        self.text_outputs = text_outputs
        self.label = label

    def __len__(self):
        """
        The length of the corpus

        Returns
        -------
        int
            The number of samples

        """
        return self.n_rows

    def __iter__(self):
        """
        Generates samples in a random order

        Yields
        ------
        X : dict
            Inputs for model
        Y : dict
            Outputs for model

        """
        for row in self.dataset.shuffle():
            X = {}
            Y = {}
            for column in self.text_inputs:
                X[column] = row[column]
            for (column,(x_name,y_name)) in self.text_outputs.items():
                X[x_name] = tokenizers.Encoding.merge([self.start_doc,row[column]])
                Y[y_name] = tokenizers.Encoding.merge([row[column],self.end_doc])
            if self.label is not None:
                Y[self.label] = row[self.label]
            yield (X,Y)
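To make the text_outputs mapping concrete, here is a sketch (not part of the commit) mirroring how CombinedCorpus wires up the reasoning corpus; the dataset path is a placeholder.

import tokenizers
from CorpusLoader import CorpusLoader

tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
start_doc = tokenizer.encode('<s>')
end_doc = tokenizer.encode('</s>')

# 'reasoning.dataset' is a placeholder path to an Arrow file produced by preprocessing.
loader = CorpusLoader('reasoning.dataset',
                      start_doc,
                      end_doc,
                      ['proposition0', 'proposition1'],
                      {'conclusion': ('conclusion_offset', 'reasoning')})
for (X, Y) in loader:
    # X holds 'proposition0', 'proposition1' and the '<s>'-prefixed 'conclusion_offset';
    # Y holds the '</s>'-suffixed 'reasoning' column.
    break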
qarac/corpora/Preprocessor.py
ADDED
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 18 13:18:59 2023

@author: peter
"""

import tokenizers
import datasets
import pandas

class Preprocessor(object):

    def __init__(self,tokenizer_path='roberta-base'):
        """
        Creates the preprocessor

        Parameters
        ----------
        tokenizer_path : str, optional
            The path to the pretrained tokenizer. The default is 'roberta-base'.

        Returns
        -------
        None.

        """
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
        self.start_token = self.tokenizer.encode('<s>')
        self.end_token = self.tokenizer.encode('</s>')

    def __call__(self,data):
        """
        Tokenizes a column of data

        Parameters
        ----------
        data : pandas.Series
            Column of text data

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data

        """
        return self.tokenizer.encode_batch(data,add_special_tokens=False)


    def combine(self,*args):
        """
        Tokenises several data columns as a single batch

        Parameters
        ----------
        *args : sequence of pandas.Series
            Columns of text data to be concatenated and tokenized

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data from all columns

        """
        return self(pandas.concat(args))

    def process_labels(self,data,column):
        """
        Converts labels to numerical values for the consistency objective

        Parameters
        ----------
        data : datasets.Dataset
            Dataset for which labels need to be converted
        column : str
            The column on which to apply label conversion

        Returns
        -------
        datasets.Dataset
            The dataset with the labels converted

        """
        label_values = {'entailment':1.0,
                        'neutral':0.0,
                        'contradiction':-1.0}
        return data.align_labels_with_mapping(label_values,
                                              column)
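A short usage sketch (not part of the commit), assuming a small in-memory set of sentences; in the pipeline the input would be a text column from one of the training corpora.

import pandas
from Preprocessor import Preprocessor   # assumes qarac/corpora is on the import path

preprocessor = Preprocessor()            # loads the pretrained 'roberta-base' tokenizer
sentences = pandas.Series(['The cat sat on the mat.',
                           'A feline rested on a rug.'])
encodings = preprocessor(list(sentences))   # list[tokenizers.Encoding], no special tokens added
print(encodings[0].tokens)
# process_labels would map 'entailment'/'neutral'/'contradiction' to 1.0/0.0/-1.0
# on a datasets.Dataset holding the consistency corpus.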
qarac/models/QaracTrainerModel.py
CHANGED
@@ -53,13 +53,24 @@ class QuaracTrainerModel(keras.Model):
             'conclusion_offset': tokenized text of conclusions for reasoning
                                  objective, prefixed by '<s>'
             'statement0': tokenized statement for consistency objective
+            'statement1': tokenized statement for consistency objective
         training : Bool, optional
             Not used. The default is None.
 
         Returns
         -------
-        results :
-
+        results : dict[str,tensorflow.Tensor]
+            Fields are
+            'encode_decode': tokenized text from decoding of vectors produced by
+                             the answer encoder from 'all_text'
+            'question_answering': difference between the vector produced by the
+                                  question encoder for 'question' and the answer
+                                  encoder for 'answer'
+            'reasoning': tokenized text produced by the decoder from the sum of
+                         vectors produced by the answer encoder for 'proposition0'
+                         and 'proposition1'
+            'consistency': cosine similarity of vectors produced by the answer
+                           encoder from 'statement0' and 'statement1'
 
         """
         results = {}
qarac/utils/CoreferenceResolver.py
CHANGED
@@ -10,15 +10,51 @@ from allennlp.predictors.predictor import Predictor
 import pandas
 
 def clean(sentence):
+    """
+    Ensure sentence ends with a full stop
+
+    Parameters
+    ----------
+    sentence : str
+        Sentence to be cleaned
+
+    Returns
+    -------
+    str
+        Sentence with a full stop at the end.
+
+    """
     return sentence if sentence.strip().endswith('.') else sentence+'.'
 
 class CoreferenceResolver(object):
 
     def __init__(self):
+        """
+        Creates the Coreference resolver
+
+        Returns
+        -------
+        None.
+
+        """
         model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
         self.predictor = Predictor.from_path(model_url)
 
     def __call__(self,group):
+        """
+        Performs coreference resolution on a group of sentences
+
+        Parameters
+        ----------
+        group : pandas.Series
+            Sentences on which to perform coreference resolution
+
+        Returns
+        -------
+        pandas.Series
+            Sentences with coreferences resolved
+
+        """
         tokenized = group.apply(clean).str.split()
         line_breaks = tokenized.apply(len).cumsum()
         doc = []
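A minimal usage sketch (not part of the commit): resolving pronouns across a group of sentences held in a pandas Series. It assumes the AllenNLP SpanBERT archive at the URL hard-coded above can be downloaded.

import pandas
from qarac.utils.CoreferenceResolver import CoreferenceResolver

resolver = CoreferenceResolver()          # downloads and loads the SpanBERT coreference model
sentences = pandas.Series(['Alice went to the market',
                           'She bought apples there'])
resolved = resolver(sentences)            # pandas.Series with pronouns replaced by their referents
print(resolved.tolist())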