PeteBleackley committed
Commit 75ef467 · 1 Parent(s): 47a7fc3

Corpus iterator for BNC

Files changed (2)
  1. requirements.txt +3 -0
  2. src/corpora/BNCorpus.py +80 -0
requirements.txt CHANGED
@@ -1,3 +1,6 @@
  keras
  keras_nlp
  tensorflow
+ numpy
+ nltk
+ tokenizers
src/corpora/BNCorpus.py ADDED
@@ -0,0 +1,80 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Thu Aug 24 10:38:48 2023
+
+ @author: peter
+ """
+ import numpy
+ import numpy.random
+ import nltk.corpus
+
+ def detokenize(sentences):
+     """Join a list of tokenized sentences back into a single string."""
+     return ' '.join([''.join(sentence)
+                      for sentence in sentences])
+
+ class BNCorpus(object):
+     """Iterator over documents in the British National Corpus, optionally
+     yielding training examples for an encoder or decoder task."""
+
+     def __init__(self, fileids=None, tokenizer=None, task=None):
+         self.bnc = nltk.corpus.reader.bnc.BNCCorpusReader('BNC/Texts', fileids=r'[A-K]/\w*/\w*\.xml')
+         self.file_ids = self.bnc.fileids() if fileids is None else fileids
+         self.n_docs = len(self.file_ids)
+         self.rng = numpy.random.default_rng()
+         self.tokenizer = tokenizer
+         self.task = task
+
+     def __len__(self):
+         return self.n_docs
+
+     def split(self, p=0.8):
+         """Shuffle the file ids and split them into training and test
+         corpora, with proportion p going to the training set."""
+         n = int(p*self.n_docs)
+         self.rng.shuffle(self.file_ids)
+         train = BNCorpus(self.file_ids[:n], self.tokenizer, self.task)
+         test = BNCorpus(self.file_ids[n:], self.tokenizer, self.task)
+         return (train, test)
+
+     def __iter__(self):
+         self.rng.shuffle(self.file_ids)
+         for fileid in self.file_ids:
+             doc = self.bnc.sents(fileid, strip_space=False)
+             if self.task is None:
+                 yield detokenize(doc)
+             elif self.task == 'encode':
+                 yield self.encoder_example(doc)
+             else:
+                 yield self.decoder_example(doc)
+
+     def encoder_example(self, doc):
+         """Mask one random token per sentence and return
+         (masked input, original target, per-token sample weights)."""
+         masked_sentences = []
+         sample_weights = []
+         for sentence in doc:
+             cp = sentence[:]
+             n = len(sentence)
+             weights = numpy.zeros(n)
+             k = self.rng.integers(n)
+             cp[k] = '[MASK] '
+             masked_sentences.append(cp)
+             weights[k] = 1
+             sample_weights.append(weights)
+         return (self.tokenizer.encode(detokenize(masked_sentences)),
+                 self.tokenizer.encode(detokenize(doc)),
+                 numpy.concatenate(sample_weights))
+
+     def decoder_example(self, doc):
+         """Shift the document by one sentence to form (input, target) pairs,
+         with START and END markers; the unshifted first sentence gets zero weight."""
+         x = ['START'] + doc
+         y = doc + ['END']
+         sample_weights = [numpy.zeros(len(sentence)) if i == 0
+                           else numpy.ones(len(sentence))
+                           for (i, sentence) in enumerate(y)]
+         return (self.tokenizer.encode(detokenize(x)),
+                 self.tokenizer.encode(detokenize(y)),
+                 numpy.concatenate(sample_weights))
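
For reference, a minimal usage sketch, not part of this commit: it assumes the BNC XML texts are unpacked under BNC/Texts and that tokenizer.json is a trained Hugging Face tokenizers model; the file name, import path, and loop body are illustrative.

# Usage sketch only; paths and tokenizer file are assumptions.
from tokenizers import Tokenizer
from src.corpora.BNCorpus import BNCorpus

tokenizer = Tokenizer.from_file('tokenizer.json')      # hypothetical tokenizer file
corpus = BNCorpus(tokenizer=tokenizer, task='encode')
train, test = corpus.split(p=0.8)                      # 80/20 split by document

for (masked, target, weights) in train:
    pass  # one masked-token example per document; feed to the model here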