import sys
from collections import Counter
import pickle
import numpy

unk_string   = '<UNK>'
pad_string   = '<PAD>'

def read_tagged_sentences(path, max_sent_len):
   """
   Read a tagged dataset.
   Each line consists of a token and a tag separated by a tab character;
   an empty line marks the end of a sentence. Sentences with max_sent_len
   or more tokens are discarded.
   """
   sentences, words, tags = [], [], []
   with open(path) as file:
      for line in file:
         line = line.rstrip()
         if line:
            word, tag, *_ = line.split("\t")
            words.append(word)
            tags.append(tag)
         else:
            # empty line marking the end of a sentence
            if 0 < len(words) < max_sent_len:
               sentences.append((words, tags))
            words, tags = [], []
   if 0 < len(words) < max_sent_len:
      # keep the last sentence even if the file lacks a final empty line
      sentences.append((words, tags))
   return sentences
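
# Sketch of the expected input and output (the file name is a placeholder):
#
#   The<TAB>DT
#   dog<TAB>NN
#   barks<TAB>VBZ
#   <empty line>
#
# read_tagged_sentences("train.tsv", max_sent_len=100)
# -> [(['The', 'dog', 'barks'], ['DT', 'NN', 'VBZ'])]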


def read_word_embeddings(filename):
   """
   Read word embeddings from a file with one word and its space-separated
   vector components per line (GloVe-style text format).
   """
   word_embeddings = []
   if filename is not None:
      print("reading word embeddings ...", file=sys.stderr)
      with open(filename) as file:
         for line in file:
            word, *vec = line.rstrip().split(' ')
            if word != unk_string:
               word_embeddings.append((word, numpy.array(vec, dtype=numpy.float32)))
      print("done", file=sys.stderr)
   word_emb_size = len(word_embeddings[0][1]) if word_embeddings else 0
   return word_embeddings, word_emb_size
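
# Sketch of the expected embedding file format (one word followed by its
# space-separated vector components per line); the file name is a placeholder:
#
#   the 0.418 0.24968 -0.41242 ...
#
# embeddings, emb_size = read_word_embeddings("embeddings.txt")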
            

def make_dict(counter, min_freq=0, add_pad_symbol=False):
   """
   Create a dictionary which maps strings with some minimal frequency to IDs.
   ID 0 is reserved for the unknown symbol, and ID 1 for the padding symbol
   if requested. We don't use PyTorch's pack_padded_sequence, so it is OK to
   assign ID 1 rather than 0 to the padding symbol.
   Returns the string-to-ID dictionary and the inverse list of symbols.
   """
   symlist = [unk_string] + ([pad_string] if add_pad_symbol else []) + \
             [elem for elem, freq in counter.most_common() if freq >= min_freq]
   string2ID = {elem: i for i, elem in enumerate(symlist)}
   return string2ID, symlist
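
# Example: symbols below the frequency threshold are left out and therefore
# map to the unknown ID at lookup time.
# make_dict(Counter({'a': 3, 'b': 1}), min_freq=2, add_pad_symbol=True)
# -> ({'<UNK>': 0, '<PAD>': 1, 'a': 2}, ['<UNK>', '<PAD>', 'a'])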


class Data(object):
   """
   Class for reading a tagged training and development corpus or a test corpus.
   """

   # -100 is the default ignore_index of PyTorch's cross-entropy loss
   IGNORE_INDEX = -100

   def __init__(self, *args):
      """
      A single argument (a parameter file) selects tagging mode;
      otherwise the arguments are passed on to init_train().
      """
      if len(args) == 1:
         self.init_test(*args)
      else:
         self.init_train(*args)

   ### functions needed during training ###############################################

   def init_train(self, path_train, path_dev, word_trunc_len,
                  min_char_freq, max_sent_len, word_embeddings, ignore_tag):

      self.word_trunc_len = word_trunc_len  # length to which words are truncated or filled up

      # reading the datasets
      self.train_sentences = read_tagged_sentences(path_train, max_sent_len)
      self.dev_sentences   = read_tagged_sentences(path_dev, max_sent_len)
   
      ### create dictionaries which map characters or tags to IDs
      char_counter = Counter()
      tag_counter  = Counter()
      for words, tags in self.train_sentences:
         tag_counter.update(tags)
         for word in words:
            char_counter.update(word)
      self.char2ID, _ = make_dict(char_counter, min_char_freq, add_pad_symbol=True)

      if ignore_tag is not None:
         tag_counter.pop(ignore_tag, None)  # remove this special tag if present
         self.tag2ID, self.ID2tag = make_dict(tag_counter)
         # tokens with this tag are ignored during training (see IGNORE_INDEX)
         self.tag2ID[ignore_tag] = self.IGNORE_INDEX
      else:
         self.tag2ID, self.ID2tag = make_dict(tag_counter)

      ### sizes of the symbol inventories
      self.num_char_types = len(self.char2ID)
      self.num_tag_types  = len(self.ID2tag)

      self.word_embeddings, self.word_emb_size = read_word_embeddings(word_embeddings)
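
      # Hypothetical training-time construction (all values are placeholders):
      # words truncated or padded to 20 characters, characters seen fewer than
      # 2 times mapped to <UNK>, sentences with 100 or more tokens dropped:
      #
      #   data = Data("train.tsv", "dev.tsv", 20, 2, 100, "embeddings.txt", None)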
      

   def get_charIDs(self, word):
      '''
      maps a word to a sequence of character IDs
      '''

      unkID = self.char2ID[unk_string]
      padID = self.char2ID[pad_string]

      charIDs = [self.char2ID.get(c, unkID) for c in word]

      # left-pad with word_trunc_len padding symbols ...
      fwd_charIDs = [padID] * self.word_trunc_len + charIDs
      bwd_charIDs = [padID] * self.word_trunc_len + charIDs[::-1]

      # ... and keep the last word_trunc_len positions: the forward sequence
      # retains the end of a long word, the backward sequence its beginning
      fwd_charIDs = fwd_charIDs[-self.word_trunc_len:]
      bwd_charIDs = bwd_charIDs[-self.word_trunc_len:]

      return fwd_charIDs, bwd_charIDs
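
      # Worked example with word_trunc_len = 5 and hypothetical IDs
      # pad=1, c=2, a=3, t=4:
      #   get_charIDs("cat") -> fwd [1, 1, 2, 3, 4], bwd [1, 1, 4, 3, 2]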


   def words2charIDvec(self, words):
      """
      converts a list of words to a pair of int32 matrices of shape
      (len(words), word_trunc_len), one per reading direction
      """

      ### convert words to character ID sequences
      fwd_charID_seqs = []
      bwd_charID_seqs = []
      for word in words:
         fwd_charIDs, bwd_charIDs = self.get_charIDs(word)
         fwd_charID_seqs.append(fwd_charIDs)
         bwd_charID_seqs.append(bwd_charIDs)

      fwd_charID_seqs = numpy.asarray(fwd_charID_seqs, dtype='int32')
      bwd_charID_seqs = numpy.asarray(bwd_charID_seqs, dtype='int32')

      return fwd_charID_seqs, bwd_charID_seqs
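
      # Example: words2charIDvec(['the', 'cat']) returns two int32 arrays,
      # each of shape (2, word_trunc_len), one per reading direction.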


   def tags2IDs(self, tags):
      """
      takes a list of tags and converts them to IDs using the tag2ID dictionary
      """
      unkID = self.tag2ID[unk_string]
      IDs = [self.tag2ID.get(tag, unkID) for tag in tags]
      return numpy.asarray(IDs, dtype='int32')
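
      # Example with a hypothetical tag dictionary {'<UNK>': 0, 'DT': 1, 'NN': 2}:
      #   tags2IDs(['DT', 'NN', 'XYZ']) -> array([1, 2, 0], dtype=int32)
      # (unknown tags fall back to the <UNK> ID)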


   def save_parameters(self, filename):
      """ save parameters to a file """
      all_params = (self.word_trunc_len, self.char2ID, self.ID2tag)
      with open(filename, "wb") as file:
         pickle.dump(all_params, file)
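
      # Round trip sketch (the file name is a placeholder):
      # save_parameters("params.pickle") at training time; Data("params.pickle")
      # at tagging time restores word_trunc_len, char2ID and ID2tag.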


   ### functions needed during tagging ###############################################

   def init_test(self, filename):
      """ load parameters from a file """
      with open(filename, "rb") as file:
         self.word_trunc_len, self.char2ID, self.ID2tag = pickle.load(file)

   def sentences(self, filename):
      """ read data to be tagged. One token per line. An empty line ends a sentence """
      with open(filename) as f:
         words = []
         for line in f:
            line = line.rstrip()
            if line != '':
               words.append(line)
            elif len(words) > 0:
               # empty line indicates the end of a sentence
               yield words
               words = []
         if words:
            # yield the last sentence even if the file lacks a final empty line
            yield words
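
      # Example input file (one token per line; the file name is a placeholder):
      #
      #   The
      #   dog
      #   <empty line>
      #
      # list(data.sentences("input.txt")) -> [['The', 'dog']]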
   
   def single_sentences(self, sentence):
      """ wrap a single tokenized sentence in a generator, mirroring sentences() """
      yield sentence

   def IDs2tags(self, IDs):
      """ takes a list of IDs and converts them to tags using the ID2tag dictionary """
      return [self.ID2tag[int(ID)] for ID in IDs]
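
# Hypothetical end-to-end tagging sketch; 'model' is not part of this module
# and the file names are placeholders:
#
#   data = Data("params.pickle")               # restores the saved dictionaries
#   for words in data.sentences("input.txt"):
#      fwd, bwd = data.words2charIDvec(words)
#      tagIDs = model.predict(fwd, bwd)        # any tagger consuming char IDs
#      print(list(zip(words, data.IDs2tags(tagIDs))))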