PeteBleackley committed on
Commit
83d5adb
·
1 Parent(s): 75ef467

Code for building and training base models.

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.json
{src → qarac}/__init__.py RENAMED
File without changes
qarac/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes). View file
 
{src → qarac}/corpora/BNCorpus.py RENAMED
@@ -5,6 +5,7 @@ Created on Thu Aug 24 10:38:48 2023
5
 
6
  @author: peter
7
  """
 
8
  import numpy
9
  import numpy.random
10
  import nltk.corpus
@@ -16,12 +17,20 @@ def detokenize(sentences):
16
  class BNCorpus(object):
17
 
18
  def __init__(self,fileids=None,tokenizer=None,task=None):
19
- self.bnc = nltk.corpus.reader.bnc.BNCCorpusReader('BNC/Texts', fileids=r'[A-K]/\w*/\w*\.xml')
 
 
 
20
  self.file_ids = self.bnc.fileids() if fileids is None else fileids
21
  self.n_docs = len(self.file_ids)
22
  self.rng = numpy.random.default_rng()
23
  self.tokenizer = tokenizer
24
  self.task = task
 
 
 
 
 
25
 
26
  def __len__(self):
27
  return self.n_docs
@@ -29,8 +38,8 @@ class BNCorpus(object):
29
  def split(self,p=0.8):
30
  n = int(p*self.n_docs)
31
  self.rng.shuffle(self.file_ids)
32
- train = BNCorpus(self.fileids[:n],self.tokenizer,self.task)
33
- test = BNCorpus(self.fileids[n:],self.tokenizer,self.task)
34
  return (train,test)
35
 
36
  def __iter__(self):
@@ -40,32 +49,47 @@ class BNCorpus(object):
40
  if self.task is None:
41
  yield detokenize(doc)
42
  elif self.task=='encode':
43
- yield self.endoder_example(doc)
44
  else:
45
  yield self.decoder_example(doc)
46
 
47
  def encoder_example(self,doc):
48
- masked_sentences = []
49
- sample_weights = []
50
- for sentence in doc:
51
- cp = sentence[:]
52
- n = len(sentence)
53
- weights = numpy.zeros(n)
54
- k = self.rng.integers(n)
55
- cp[k] = '[MASK] '
56
- masked_sentences.append(cp)
57
- weights[k] = 1
58
- sample_weights.append(weights)
59
- return (self.tokenizer.encode(detokenize(masked_sentences)),
60
- self.tokenizer.encode(detokenize(doc)),
 
 
 
61
  numpy.concatenate(sample_weights))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- def decoder_sample(self,doc):
64
- x = ['START'] + doc
65
- y = doc + ['END']
66
- sample_weights = [numpy.zeros(len(sentence)) if i==0
67
- else numpy.ones(len(sentence))
68
- for (i,sentence) in enumerate(y)]
69
- return (self.tokenizer.encode(detokenize(x)),
70
- self.tokenizer.encode(detokenize(y)),
71
- numpy.concatenate(sample_weights))
 
5
 
6
  @author: peter
7
  """
8
+ import os
9
  import numpy
10
  import numpy.random
11
  import nltk.corpus
 
17
  class BNCorpus(object):
18
 
19
  def __init__(self,fileids=None,tokenizer=None,task=None):
20
+ self.bnc = nltk.corpus.reader.bnc.BNCCorpusReader('/'.join([os.environ['HOME'],
21
+ 'BNC',
22
+ 'Texts']),
23
+ fileids=r'[A-K]/\w*/\w*\.xml')
24
  self.file_ids = self.bnc.fileids() if fileids is None else fileids
25
  self.n_docs = len(self.file_ids)
26
  self.rng = numpy.random.default_rng()
27
  self.tokenizer = tokenizer
28
  self.task = task
29
+ if self.tokenizer is not None:
30
+ self.mask = self.tokenizer.token_to_id('<mask>')
31
+ self.start = self.tokenizer.token_to_id('<start>')
32
+ self.end = self.tokenizer.token_to_id('<end>')
33
+ self.pad = numpy.array([self.tokenizer.token_to_id('<pad>')])
34
 
35
  def __len__(self):
36
  return self.n_docs
 
38
  def split(self,p=0.8):
39
  n = int(p*self.n_docs)
40
  self.rng.shuffle(self.file_ids)
41
+ train = BNCorpus(self.file_ids[:n],self.tokenizer,self.task)
42
+ test = BNCorpus(self.file_ids[n:],self.tokenizer,self.task)
43
  return (train,test)
44
 
45
  def __iter__(self):
 
49
  if self.task is None:
50
  yield detokenize(doc)
51
  elif self.task=='encode':
52
+ yield self.encoder_example(doc)
53
  else:
54
  yield self.decoder_example(doc)
55
 
56
  def encoder_example(self,doc):
57
+ sentences = self.encode(doc)
58
+ masked_sentences = [sentence.copy()
59
+ for sentence in sentences]
60
+ sample_weights = [numpy.zeros_like(sentence)
61
+ for sentence in sentences]
62
+ masks = self.rng.integers([sentence.shape[0]
63
+ for sentence in sentences])
64
+ for (i,n) in enumerate(masks):
65
+ masked_sentences[i][n]=self.mask
66
+ sample_weights[i][n]=1
67
+ if sum((sentence.shape[0] for sentence in sentences))%2 ==1:
68
+ masked_sentences.append(self.pad)
69
+ sentences.append(self.pad)
70
+ sample_weights.append(numpy.zeros(1))
71
+ return (numpy.concatenate(masked_sentences),
72
+ numpy.concatenate(sentences),
73
  numpy.concatenate(sample_weights))
74
+
75
+
76
+
77
+
78
+ def decoder_example(self,doc):
79
+ sentences = self.encode(doc)
80
+ before = [numpy.array([self.start])]+sentences
81
+ sentences.append(numpy.array([self.end]))
82
+ sample_weights = numpy.ones(sum([sentence.shape[0]
83
+ for sentence in sentences]))
84
+ sample_weights[:4]=0
85
+ return (numpy.concatenate(before),
86
+ numpy.concatenate(sentences),
87
+ sample_weights)
88
+
89
+
90
+ def encode(self,doc):
91
+ return [numpy.array(self.tokenizer.encode(''.join(sentence)).ids)
92
+ for sentence in doc
93
+ if len(sentence)>0]
94
 
95
+
 
 
 
 
 
 
 
 
qarac/corpora/Batcher.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Mon Aug 28 11:25:26 2023
5
+
6
+ @author: peter
7
+ """
8
+ import keras
9
+ import tensorflow
10
+ import tqdm
11
+
12
+ class Batcher(keras.utils.Sequence):
13
+
14
+ def __init__(self,source,batch_size=32):
15
+ self.batches = None
16
+ self.source=source
17
+ self.batch_size=batch_size
18
+ self.on_epoch_end()
19
+
20
+ def __len__(self):
21
+ return len(self.batches)
22
+
23
+ def __getitem__(self, index):
24
+ return self.batches[index]
25
+
26
+ def on_epoch_end(self):
27
+ self.batches = []
28
+ n=0
29
+ X=[]
30
+ Y=[]
31
+ Z=[]
32
+ for (x,y,z) in tqdm.tqdm(self.source):
33
+ X.append(x)
34
+ Y.append(y)
35
+ Z.append(z)
36
+ n+=1
37
+ if n==self.batch_size:
38
+ self.batches.append((tensorflow.ragged.constant(X),
39
+ tensorflow.ragged.constant(Y),
40
+ tensorflow.ragged.constant(Z)))
41
+ n=0
42
+ X=[]
43
+ Y=[]
44
+ Z=[]
45
+ if n!=0:
46
+ self.batches.append((tensorflow.ragged.constant(X),
47
+ tensorflow.ragged.constant(Y),
48
+ tensorflow.ragged.constant(Z)))
49
+
50
+
qarac/corpora/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
qarac/corpora/__pycache__/BNCorpus.cpython-310.pyc ADDED
Binary file (3.91 kB). View file
 
qarac/corpora/__pycache__/Batcher.cpython-310.pyc ADDED
Binary file (1.37 kB). View file
 
qarac/corpora/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes). View file
 
{src → qarac}/models/__init__.py RENAMED
File without changes
qarac/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (135 Bytes). View file
 
qarac/models/__pycache__/qarac_base_model.cpython-310.pyc ADDED
Binary file (899 Bytes). View file
 
{src → qarac}/models/layers/HierarchicalLogits.py RENAMED
@@ -11,7 +11,8 @@ import tensorflow
11
 
12
  class LeafNode(keras.layers.Layer):
13
  def __init__(self):
14
- self.bias = self.add_weight(shape=(1,),
 
15
  initializer='random_normal',
16
  trainable=True)
17
 
@@ -55,7 +56,7 @@ class HierarchicalLogits(keras.layers.Layer):
55
  def call(self,X,training=None):
56
 
57
  y=tensorflow.tensordot(X,self.normal,1)
58
- result = self.concat([self.left(X)+y,self.right(X)]-y)
59
  return result
60
 
61
 
 
11
 
12
  class LeafNode(keras.layers.Layer):
13
  def __init__(self):
14
+ super(LeafNode,self).__init__()
15
+ self.bias = self.add_weight(
16
  initializer='random_normal',
17
  trainable=True)
18
 
 
56
  def call(self,X,training=None):
57
 
58
  y=tensorflow.tensordot(X,self.normal,1)
59
+ result = self.concat([self.left(X)+y,self.right(X)-y])
60
  return result
61
 
62
 
{src → qarac}/models/layers/HierarchicalSoftMax.py RENAMED
File without changes
{src → qarac}/models/layers/HyenaLayer.py RENAMED
@@ -9,15 +9,23 @@ Created on Tue Aug 22 09:34:14 2023
9
  import keras
10
  import keras_nlp
11
  import tensorflow
 
12
 
13
  def convolve(x,y):
14
- xT = tensorflow.transpose(x,[0,2,1])
15
- yT = tensorflow.transpose(y,[0,2,1])
16
- z = tensorflow.signal.irfft(tensorflow.signal.rfft(xT)*tensorflow.signal.rfft(yT))
17
- return tensorflow.transpose(z,[0,2,1])
18
-
19
-
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  class HyenaLayer(keras.layers.Layer):
23
  """Keras implementation of Hyena layer. Unlike in the original paper,
@@ -40,29 +48,50 @@ class HyenaLayer(keras.layers.Layer):
40
  None.
41
 
42
  """
43
-
44
  self.stages = stages
45
  self.causal = causal
46
  self.data_projection = None
47
  self.filters = None
48
- self.positional_encoding = keras_nlp.layers.SinePositionalEmbedding()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def build(self,input_shape):
51
- self.data_projection = keras.layers.TimeDistributed(keras.layers.Dense((self.stages+1,input_shape[2]),
52
- activation='linear'))
53
- self.filters = keras.layers.TimeDistributed((self.stages,input_shape[2]),
54
- activation='linear')
 
55
 
56
  def call(self,X,training=None):
57
- x = self.data_projection(X)
58
- f = self.filters(self.positional_encoding(X))
 
 
 
 
 
 
59
  if self.causal:
60
  concat = keras.layers.Concatenate()
61
  x = concat(x,tensorflow.zeros_like(x))
62
  f = concat(f,tensorflow.zeros_like(f))
63
- y = x[0]
64
  for i in range(self.stages):
65
- y = convolve(y,f[i])*x[i+1]
66
  if self.causal:
67
  for (i,n) in enumerate(X.row_lengths()):
68
  y[i] = y[i,:n]
 
9
  import keras
10
  import keras_nlp
11
  import tensorflow
12
+ import warnings
13
 
14
  def convolve(x,y):
 
 
 
 
 
 
15
 
16
+ fx = tensorflow.vectorized_map(fft, x, warn=False)
17
+ fy = tensorflow.vectorized_map(fft, y, warn=False)
18
+ fz = fx*fy
19
+ return tensorflow.vectorized_map(ifft,fz,warn=False)
20
+
21
+ @tensorflow.function
22
+ def fft(x):
23
+ return tensorflow.signal.rfft(tensorflow.transpose(x))
24
+
25
+ @tensorflow.function
26
+ def ifft(x):
27
+ return tensorflow.transpose(tensorflow.signal.irfft(x))
28
+
29
 
30
  class HyenaLayer(keras.layers.Layer):
31
  """Keras implementation of Hyena layer. Unlike in the original paper,
 
48
  None.
49
 
50
  """
51
+ super(HyenaLayer,self).__init__()
52
  self.stages = stages
53
  self.causal = causal
54
  self.data_projection = None
55
  self.filters = None
56
+
57
+ def positional_encoding(self,X):
58
+ t = tensorflow.dtypes.saturate_cast(tensorflow.ragged.range(X.row_lengths()),
59
+ tensorflow.float32)
60
+ width = X.shape[-1]//2
61
+ f =10000 **tensorflow.expand_dims(-tensorflow.range(width,
62
+ dtype=tensorflow.float32)/width,
63
+ axis=0)
64
+ phi = tensorflow.RaggedTensor.from_row_lengths(t.flat_values * f,
65
+ X.row_lengths())
66
+
67
+ return tensorflow.concat([tensorflow.sin(phi),
68
+ tensorflow.cos(phi)],
69
+ axis=-1)
70
+
71
 
72
  def build(self,input_shape):
73
+ width = input_shape[-1]
74
+ self.data_projection = self.add_weight(shape=(width,width,self.stages+1),
75
+ trainable=True)
76
+ self.filters = self.add_weight(shape=(width,width,self.stages),
77
+ trainable=True)
78
 
79
  def call(self,X,training=None):
80
+ x_flat = tensorflow.tensordot(X.flat_values,
81
+ self.data_projection,
82
+ axes=1)
83
+ f_flat = tensorflow.tensordot(self.positional_encoding(X).flat_values,
84
+ self.filters,
85
+ axes=1)
86
+ x = tensorflow.RaggedTensor.from_row_lengths(x_flat,X.row_lengths())
87
+ f = tensorflow.RaggedTensor.from_row_lengths(f_flat,X.row_lengths())
88
  if self.causal:
89
  concat = keras.layers.Concatenate()
90
  x = concat(x,tensorflow.zeros_like(x))
91
  f = concat(f,tensorflow.zeros_like(f))
92
+ y = x[:,:,:,0]
93
  for i in range(self.stages):
94
+ y = convolve(y,f[:,:,:,i])*x[:,:,:,i+1]
95
  if self.causal:
96
  for (i,n) in enumerate(X.row_lengths()):
97
  y[i] = y[i,:n]
qarac/models/layers/__init__.py ADDED
File without changes
qarac/models/layers/__pycache__/HierarchicalLogits.cpython-310.pyc ADDED
Binary file (2.43 kB). View file
 
qarac/models/layers/__pycache__/HyenaLayer.cpython-310.pyc ADDED
Binary file (3.19 kB). View file
 
qarac/models/layers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (142 Bytes). View file
 
qarac/models/qarac_base_model.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Wed Aug 23 09:50:14 2023
5
+
6
+ @author: peter
7
+ """
8
+
9
+ import keras
10
+ import qarac.models.layers.HierarchicalLogits
11
+ import qarac.models.layers.HyenaLayer
12
+
13
+ def qarac_base_model(vocab_size,width,depth,decoder=True):
14
+ print('Building','decoder' if decoder else 'encoder','model with vocab size',
15
+ vocab_size,',',depth,'layers and vector width',width)
16
+ stack = [keras.layers.Input(shape=(None,),ragged=True),
17
+ keras.layers.Embedding(vocab_size,width,name='Embedding')]
18
+ for _ in range(depth):
19
+ stack.append(qarac.models.layers.HyenaLayer.HyenaLayer(causal=decoder))
20
+ #stack.append(keras.layers.TimeDistributed(qarac.models.layers.HierarchicalLogits.HierarchicalLogits(vocab_size)))
21
+ #stack.append(keras.layers.TimeDistributed(keras.layers.Softmax()))
22
+ stack.append(keras.layers.TimeDistributed(keras.layers.Dense(vocab_size,activation='softmax')))
23
+ return keras.models.Sequential(stack)
scripts.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import argparse
4
+ import pickle
5
+ import tokenizers
6
+ import qarac.corpora.BNCorpus
7
+ import qarac.corpora.Batcher
8
+ import qarac.models.qarac_base_model
9
+ import keras
10
+
11
+
12
+
13
+
14
+ def train_base_model(task,filename):
15
+ tokenizer = tokenizers.Tokenizer.from_pretrained('xlm-roberta-base')
16
+ tokenizer.add_special_tokens(['<start>','<end>','<pad>'])
17
+ tokenizer.save('/'.join([os.environ['HOME'],
18
+ 'QARAC',
19
+ 'models',
20
+ 'tokenizer.json']))
21
+ bnc = qarac.corpora.BNCorpus.BNCorpus(tokenizer=tokenizer,
22
+ task=task)
23
+ (train,test)=bnc.split(0.01)
24
+ train_data=qarac.corpora.Batcher.Batcher(train)
25
+ model = qarac.models.qarac_base_model.qarac_base_model(tokenizer.get_vocab_size(),
26
+ 768,
27
+ 12,
28
+ task=='decode')
29
+ optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
30
+ model.compile(optimizer=optimizer,loss='sparse_categorical_crossentropy',metrics='accuracy')
31
+ model.fit(train_data,
32
+ epochs=100,
33
+ workers = 16)
34
+ test_data=qarac.corpora.Batcher.Batcher(test)
35
+ print(model.evaluate(test_data))
36
+ model.save(filename)
37
+
38
+
39
+
40
+
41
+ if __name__ == '__main__':
42
+ parser = argparse.ArgumentParser(prog='QARAC',
43
+ description='Experimental NLP system, aimed at improving factual accuracy')
44
+ parser.add_argument('task')
45
+ parser.add_argument('-f','--filename')
46
+ parser.add_argument('-t','--training-task')
47
+ args = parser.parse_args()
48
+ if args.task == 'train_base_model':
49
+ train_base_model(args.training_task,args.filename)
50
+
src/models/quarac_base_model.py DELETED
@@ -1,18 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Created on Wed Aug 23 09:50:14 2023
5
-
6
- @author: peter
7
- """
8
-
9
- import keras
10
- import layers
11
-
12
- def quarac_base_model(vocab_size,width,depth,decoder=True):
13
- stack = [keras.layers.Embedding(vocab_size,width)]
14
- for _ in range(depth):
15
- stack.append(layers.HyenaLayer(causal=decoder))
16
- stack.append(keras.layers.Timedistributed(layers.HierarchicalLogits()))
17
- stack.append(keras.layers.Timedistributed(keras.layers.Softmax()))
18
- return keras.models.Sequential(stack)