nielklug committed on
Commit
6ed21b9
·
1 Parent(s): 2ba47af
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. MHGTagger/CRFTagger.py +110 -0
  2. MHGTagger/Data.py +186 -0
  3. MHGTagger/RNNTagger.py +111 -0
  4. MHGTagger/__pycache__/CRFTagger.cpython-310.pyc +0 -0
  5. MHGTagger/__pycache__/CRFTagger.cpython-37.pyc +0 -0
  6. MHGTagger/__pycache__/CRFTagger.cpython-38.pyc +0 -0
  7. MHGTagger/__pycache__/Data.cpython-37.pyc +0 -0
  8. MHGTagger/__pycache__/Data.cpython-38.pyc +0 -0
  9. MHGTagger/__pycache__/NMT.cpython-310.pyc +0 -0
  10. MHGTagger/__pycache__/NMTData.cpython-310.pyc +0 -0
  11. MHGTagger/__pycache__/RNNData.cpython-310.pyc +0 -0
  12. MHGTagger/__pycache__/RNNData.cpython-37.pyc +0 -0
  13. MHGTagger/__pycache__/RNNData.cpython-38.pyc +0 -0
  14. MHGTagger/__pycache__/RNNTagger.cpython-310.pyc +0 -0
  15. MHGTagger/__pycache__/RNNTagger.cpython-37.pyc +0 -0
  16. MHGTagger/__pycache__/RNNTagger.cpython-38.pyc +0 -0
  17. MHGTagger/__pycache__/rnn_annotate.cpython-38.pyc +0 -0
  18. MHGTagger/rnn_annotate.py +145 -0
  19. MHGTagger/tagger.hyper +0 -0
  20. MHGTagger/tagger.io +0 -0
  21. README.md +1 -1
  22. Tagset_Mappings/POS-mapping.txt +73 -0
  23. Tagset_Mappings/__pycache__/tag_mapping.cpython-38.pyc +0 -0
  24. Tagset_Mappings/feature-mapping.txt +11 -0
  25. Tagset_Mappings/tag_mapping.py +129 -0
  26. app.py +47 -0
  27. parse.py +19 -0
  28. parsing/EVALB/COLLINS.prm +66 -0
  29. parsing/EVALB/LICENSE +24 -0
  30. parsing/EVALB/Makefile +4 -0
  31. parsing/EVALB/README +300 -0
  32. parsing/EVALB/bug/bug.gld +5 -0
  33. parsing/EVALB/bug/bug.rsl-new +39 -0
  34. parsing/EVALB/bug/bug.rsl-old +45 -0
  35. parsing/EVALB/bug/bug.tst +5 -0
  36. parsing/EVALB/evalb +0 -0
  37. parsing/EVALB/evalb.c +1537 -0
  38. parsing/EVALB/new.prm +87 -0
  39. parsing/EVALB/nk.prm +92 -0
  40. parsing/EVALB/sample/sample.gld +24 -0
  41. parsing/EVALB/sample/sample.prm +65 -0
  42. parsing/EVALB/sample/sample.rsl +56 -0
  43. parsing/EVALB/sample/sample.tst +24 -0
  44. parsing/EVALB/tgrep_proc.prl +9 -0
  45. parsing/EVALB_SPMRL/Makefile +65 -0
  46. parsing/EVALB_SPMRL/README +76 -0
  47. parsing/EVALB_SPMRL/README.orig +230 -0
  48. parsing/EVALB_SPMRL/evalb.c +1724 -0
  49. parsing/EVALB_SPMRL/spmrl.prm +91 -0
  50. parsing/EVALB_SPMRL/spmrl_hebrew.prm +118 -0
MHGTagger/CRFTagger.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sys
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from .RNNTagger import RNNTagger
8
+
9
+
10
+ ### auxiliary functions ############################################
11
+
12
def logsumexp(x, dim):
    """
    Numerically stable log(sum(exp(x))) along dimension ``dim``.

    The maximum along ``dim`` is subtracted before exponentiation and
    added back afterwards; the reduced dimension is removed from the result.
    """
    peak = torch.max(x, dim=dim)[0]
    shifted = x - peak.unsqueeze(dim)
    return shifted.exp().sum(dim=dim).log() + peak
18
+
19
def lookup(T, indices):
    """
    Select the entries of T addressed by ``indices`` along the last dimension.

    T may be a vector, a matrix or a 3D tensor; for matrices and 3D tensors
    ``indices`` holds one index per row of the last dimension.
    Any other rank raises an Exception.
    """
    rank = T.dim()
    if rank == 1:
        return T[indices]
    if rank in (2, 3):
        last = rank - 1
        return T.gather(last, indices.unsqueeze(last)).squeeze(last)
    raise Exception('unexpected tensor size in function "lookup"')
29
+
30
+
31
+ ### tagger class ###############################################
32
+
33
class CRFTagger(nn.Module):
    """
    CRF tagger: an RNNTagger producing per-word tag scores, combined with
    learned tag-pair weights.  Decoding (Viterbi) and the partition function
    (forward algorithm) are restricted to the ``beam_size`` highest-scoring
    tags of each word.
    """

    def __init__(self, num_chars, num_tags, char_emb_size,
                 char_rec_size, word_rec_size, word_rnn_depth,
                 dropout_rate, word_emb_size, beam_size):
        """
        All arguments except ``beam_size`` are passed on to RNNTagger.
        ``beam_size`` is the number of candidate tags kept per word; values
        outside the open interval (0, num_tags) select all ``num_tags`` tags.
        """

        super(CRFTagger, self).__init__()

        # simple LSTMTagger which computes tag scores
        self.base_tagger = RNNTagger(num_chars, num_tags, char_emb_size,
                                     char_rec_size, word_rec_size,
                                     word_rnn_depth, dropout_rate, word_emb_size)
        self.beam_size = beam_size if 0 < beam_size < num_tags else num_tags
        # tag-pair weights, indexed as weights[previous_tag, current_tag]
        self.weights = nn.Parameter(torch.zeros(num_tags, num_tags))
        # NOTE(review): this dropout layer is not used anywhere in this class
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, fwd_charIDs, bwd_charIDs, tags=None):
        """
        Tag one sentence.

        Without ``tags`` (annotation mode) the predicted tag IDs are returned.
        With ``tags`` the tuple (predicted tag IDs, negative log-probability
        of ``tags``) is returned.  In training mode (``self.training``)
        ``tags`` must be given because the gold tags are forced into the beam.
        """

        annotation_mode = (tags is None)

        # per-word tag scores; used below as a (num_words, num_tags) matrix
        scores = self.base_tagger(fwd_charIDs, bwd_charIDs)

        # extract the highest-scoring tags for each word and their scores
        best_scores, best_tags = scores.topk(self.beam_size, dim=-1)

        if self.training:  # not done during dev evaluation
            # check whether the goldstandard tags are among the best tags
            gs_contained = (best_tags == tags.unsqueeze(1)).sum(dim=-1)

            # replace the tag with the lowest score at each position
            # by the gs tag if the gs tag is not in the list
            last_column = gs_contained * best_tags[:,-1] + (1-gs_contained) * tags
            s = lookup(scores, last_column)
            best_tags = torch.cat((best_tags[:,:-1], last_column.unsqueeze(1)), dim=1)
            best_scores = torch.cat((best_scores[:,:-1], s.unsqueeze(1)), dim=1)

        best_previous = []  # stores the backpointers of the Viterbi algorithm
        viterbi_scores = best_scores[0]
        if not annotation_mode:
            forward_scores = best_scores[0]
        for i in range(1,scores.size(0)):  # for all word positions except the first
            # lookup of the tag-pair weights
            # (rows: beam tags of the previous word, columns: beam tags of this word)
            w = self.weights[best_tags[i-1]][:,best_tags[i]]

            # Viterbi algorithm
            values = viterbi_scores.unsqueeze(1) + best_scores[i].unsqueeze(0) + w
            viterbi_scores, best_prev = torch.max(values, dim=0)
            best_previous.append(best_prev)

            # Forward algorithm
            if not annotation_mode:
                values = forward_scores.unsqueeze(1) + best_scores[i].unsqueeze(0) + w
                forward_scores = logsumexp(values, dim=0)

        # Viterbi algorithm
        # start at the best final beam entry and follow the backpointers
        _, index = torch.max(viterbi_scores, dim=0)
        best_indices = [index]
        for i in range(len(best_previous)-1, -1, -1):
            index = best_previous[i][index]
            best_indices.append(index)

        # reverse the indices and map them to tag IDs
        best_indices = torch.stack(best_indices[::-1])
        predicted_tags = lookup(best_tags, best_indices)

        if annotation_mode:
            return predicted_tags
        else:
            # loss computation
            # score of the given tag sequence: tag scores plus tag-pair weights
            basetagger_scores = lookup(scores, tags).sum()
            CRFweights = self.weights[tags[:-1], tags[1:]].sum() if tags.size(0)>1 else 0
            # the partition function only covers tag sequences inside the beam
            logZ = logsumexp(forward_scores, dim=0)  # log partition function
            logprob = basetagger_scores + CRFweights - logZ

            return predicted_tags, -logprob
110
+
MHGTagger/Data.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sys
3
+ from collections import Counter, OrderedDict
4
+ import pickle
5
+ import numpy
6
+
7
+ unk_string = '<UNK>'
8
+ pad_string = '<PAD>'
9
+
10
def read_tagged_sentences(path, max_sent_len):
    """
    Read a tagged dataset.

    Each non-empty line consists of a token and a tag separated by a tab
    character (further tab-separated columns are ignored).  An empty line
    marks the end of a sentence.

    Returns a list of (words, tags) pairs.  Only sentences with at least one
    and fewer than ``max_sent_len`` tokens are kept.
    """
    sentences, words, tags = [], [], []

    def keep_current_sentence():
        # length filter shared by the in-file and the end-of-file case
        if 0 < len(words) < max_sent_len:
            sentences.append((words, tags))

    with open(path) as file:
        for line in file:
            line = line.rstrip()
            if line:
                word, tag, *_ = line.split("\t")
                words.append(word)
                tags.append(tag)
            else:
                # empty line marking the end of a sentence
                keep_current_sentence()
                words, tags = [], []

    # the last sentence is not necessarily followed by an empty line
    keep_current_sentence()
    return sentences
29
+
30
+
31
def read_word_embeddings(filename):
    """
    Read word embeddings from a text file.

    Each line holds a word followed by its vector components, separated by
    single blanks.  The entry for the unknown-word symbol is skipped.
    Returns the list of (word, float32 vector) pairs and the vector size;
    ([], 0) if ``filename`` is None or no embedding was read.
    """
    if filename is None:
        return [], 0

    print("reading word embeddings ...", file=sys.stderr)
    vectors = []
    with open(filename) as emb_file:
        for raw_line in emb_file:
            fields = raw_line.rstrip().split(' ')
            word, components = fields[0], fields[1:]
            if word == unk_string:
                continue
            vectors.append((word, numpy.array(components, dtype=numpy.float32)))
    print("done", file=sys.stderr)

    emb_size = len(vectors[0][1]) if vectors else 0
    return vectors, emb_size
44
+
45
+
46
def make_dict(counter, min_freq=0, add_pad_symbol=False):
    """
    Build a symbol table from a frequency counter.

    The unknown symbol gets ID 0, the optional padding symbol ID 1, followed
    by all counted elements with a frequency of at least ``min_freq`` in
    order of decreasing frequency.
    We don't use pack_padded sequence, so it is OK to assign ID 1 to the
    padding symbol.
    Returns the string-to-ID dictionary and the list of symbols.
    """
    symlist = [unk_string]
    if add_pad_symbol:
        symlist.append(pad_string)
    for elem, freq in counter.most_common():
        if freq >= min_freq:
            symlist.append(elem)

    string2ID = {}
    for index, elem in enumerate(symlist):
        string2ID[elem] = index
    return string2ID, symlist
56
+
57
+
58
class Data(object):
    """
    Class for reading a tagged training and development corpus or a test corpus.

    Called with a single argument (parameter file name) it restores the
    symbol tables written by ``save_parameters``; called with the arguments
    of ``init_train`` it reads the corpora and builds the symbol tables.
    """

    # ID assigned to the tag which is excluded from training
    IGNORE_INDEX = -100

    def __init__(self, *args):
        if len(args) == 1:
            self.init_test(*args)
        else:
            self.init_train(*args)

    ### functions needed during training ###############################################

    def init_train(self, path_train, path_dev, word_trunc_len,
                   min_char_freq, max_sent_len, word_embeddings, ignore_tag):
        """
        Read the training and development data and create the symbol tables.

        ``word_embeddings`` is the name of an embedding file or None.
        ``ignore_tag`` (or None) is a tag which is mapped to IGNORE_INDEX
        instead of a regular tag ID.
        """

        self.word_trunc_len = word_trunc_len  # length to which words are truncated or filled up

        # reading the datasets
        self.train_sentences = read_tagged_sentences(path_train, max_sent_len)
        self.dev_sentences = read_tagged_sentences(path_dev, max_sent_len)

        ### create dictionaries which map characters or tags to IDs
        char_counter = Counter()
        tag_counter = Counter()
        for words, tags in self.train_sentences:
            tag_counter.update(tags)
            for word in words:
                char_counter.update(word)
        self.char2ID, _ = make_dict(char_counter, min_char_freq, add_pad_symbol=True)

        if ignore_tag is not None:
            tag_counter.pop(ignore_tag, None)  # remove this special tag if present
        self.tag2ID, self.ID2tag = make_dict(tag_counter)
        if ignore_tag is not None:
            self.tag2ID[ignore_tag] = self.IGNORE_INDEX  # empty tags will not be trained

        ### sizes of the symbol inventories
        self.num_char_types = len(self.char2ID)
        self.num_tag_types = len(self.ID2tag)

        self.word_embeddings, self.word_emb_size = read_word_embeddings(word_embeddings)

    def get_charIDs(self, word):
        """
        Map a word to two character-ID sequences of length ``word_trunc_len``:
        the characters in reading order and in reversed order.  Shorter
        words are padded on the left; of longer words only the last
        ``word_trunc_len`` IDs of each sequence are kept.
        """
        unkID = self.char2ID[unk_string]
        padID = self.char2ID[pad_string]

        charIDs = [self.char2ID.get(c, unkID) for c in word]

        # add enough padding symbols
        fwd_charIDs = [padID] * self.word_trunc_len + charIDs
        bwd_charIDs = [padID] * self.word_trunc_len + charIDs[::-1]

        # truncate
        fwd_charIDs = fwd_charIDs[-self.word_trunc_len:]
        bwd_charIDs = bwd_charIDs[-self.word_trunc_len:]

        return fwd_charIDs, bwd_charIDs

    def words2charIDvec(self, words):
        """
        Convert a list of words to two int32 arrays (forward and backward
        character IDs) with one row per word.
        """
        fwd_charID_seqs = []
        bwd_charID_seqs = []
        for word in words:
            fwd_charIDs, bwd_charIDs = self.get_charIDs(word)
            fwd_charID_seqs.append(fwd_charIDs)
            bwd_charID_seqs.append(bwd_charIDs)

        fwd_charID_seqs = numpy.asarray(fwd_charID_seqs, dtype='int32')
        bwd_charID_seqs = numpy.asarray(bwd_charID_seqs, dtype='int32')

        return fwd_charID_seqs, bwd_charID_seqs

    def tags2IDs(self, tags):
        """
        Convert a list of tags to an int32 array of IDs using the tag2ID
        dictionary; unknown tags get the ID of the unknown symbol.
        """
        unkID = self.tag2ID[unk_string]
        IDs = [self.tag2ID.get(tag, unkID) for tag in tags]
        return numpy.asarray(IDs, dtype='int32')

    def save_parameters(self, filename):
        """ Save the truncation length and the symbol tables to a file. """
        all_params = (self.word_trunc_len, self.char2ID, self.ID2tag)
        with open(filename, "wb") as file:
            pickle.dump(all_params, file)

    ### functions needed during tagging ###############################################

    def init_test(self, filename):
        """ Load the parameters written by ``save_parameters`` from a file. """
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only load parameter files from a trusted source.
        with open(filename, "rb") as file:
            self.word_trunc_len, self.char2ID, self.ID2tag = pickle.load(file)

    def sentences(self, filename):
        """
        Read data to be tagged and yield one list of tokens per sentence.
        One token per line; an empty line follows a sentence.
        """
        with open(filename) as f:
            words = []
            for line in f:
                line = line.rstrip()
                if line != '':
                    words.append(line)
                elif len(words) > 0:
                    # empty line indicates the end of a sentence
                    yield words
                    words = []
            # the last sentence is not necessarily followed by an empty line
            if words:
                yield words

    def single_sentences(self, sentence):
        """ Yield the given, already tokenized sentence as the only sentence. """
        yield sentence

    def IDs2tags(self, IDs):
        """ Convert a sequence of IDs to tags using the ID2tag list. """
        return [self.ID2tag[int(ID)] for ID in IDs]
MHGTagger/RNNTagger.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sys
3
+ import torch
4
+ from torch import nn
5
+
6
+
7
class WordRepresentation(nn.Module):
    '''
    Character-based word encoder: two LSTMs read the forward and the
    backward character sequence of each word; their final outputs are
    concatenated into the word representation.
    '''
    def __init__(self, num_chars, emb_size, rec_size, dropout_rate):
        super().__init__()

        # lookup table shared by both character LSTMs
        self.embeddings = nn.Embedding(num_chars, emb_size)

        # one LSTM per reading direction
        self.fwd_rnn = nn.LSTM(emb_size, rec_size)
        self.bwd_rnn = nn.LSTM(emb_size, rec_size)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, fwd_charIDs, bwd_charIDs):
        final_outputs = []
        directions = ((fwd_charIDs, self.fwd_rnn), (bwd_charIDs, self.bwd_rnn))
        for charIDs, rnn in directions:
            # transpose so that the character positions form the first
            # dimension, then embed and encode
            outputs, _ = rnn(self.embeddings(charIDs.t()))
            # output at the last character position
            final_outputs.append(outputs[-1])

        # forward and backward part side by side
        return torch.cat(final_outputs, -1)
38
+
39
+
40
class ResidualLSTM(nn.Module):
    ''' Stack of bidirectional LSTMs; all but the first have a residual connection '''

    def __init__(self, input_size, rec_size, num_rnns, dropout_rate):
        super().__init__()
        # first layer maps the input to 2*rec_size dimensions
        self.rnn = nn.LSTM(input_size, rec_size,
                           bidirectional=True, batch_first=True)

        # the remaining num_rnns-1 layers keep the dimensionality
        upper_layers = []
        for _ in range(num_rnns-1):
            upper_layers.append(
                nn.LSTM(2*rec_size, rec_size, bidirectional=True, batch_first=True))
        self.deep_rnns = nn.ModuleList(upper_layers)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, state):
        state = self.rnn(state)[0]
        for layer in self.deep_rnns:
            # dropout on the layer input only; the residual path is untouched
            state = state + layer(self.dropout(state))[0]
        return state
60
+
61
+
62
class RNNTagger(nn.Module):
    '''
    Tagger network: character-based word representations, a deep BiLSTM
    over the words of a sentence and a linear layer producing tag scores.
    '''

    def __init__(self, num_chars, num_tags, char_emb_size, char_rec_size,
                 word_rec_size, word_rnn_depth, dropout_rate, word_emb_size):

        super().__init__()

        # words are encoded from their characters ...
        self.word_representations = WordRepresentation(
            num_chars, char_emb_size, char_rec_size, dropout_rate)
        # ... put into sentence context ...
        self.word_rnn = ResidualLSTM(
            char_rec_size*2, word_rec_size, word_rnn_depth, dropout_rate)
        # ... and scored for each tag
        self.output_layer = nn.Linear(2*word_rec_size, num_tags)

        self.dropout = nn.Dropout(dropout_rate)

        # optional projection to word embeddings (auxiliary finetuning task)
        if word_emb_size > 0:
            self.projection_layer = nn.Linear(2*char_rec_size, word_emb_size)

    def forward(self, fwd_charIDs, bwd_charIDs, word_embedding_training=False):
        '''
        Return the tag scores of the words or, if word_embedding_training
        is set, their projections into the word embedding space.
        '''
        char_based = self.word_representations(fwd_charIDs, bwd_charIDs)

        if word_embedding_training:
            projection = getattr(self, 'projection_layer', None)
            if projection is None:
                sys.exit("Error: The embedding projection layer is undefined!")
            # auxiliary task: predict the word embedding of each word
            return projection(char_based)

        # the sentence is processed as a batch of size 1
        sentence = self.dropout(char_based).unsqueeze(0)
        contextual = self.word_rnn(sentence).squeeze(0)

        # dropout once more before the output layer
        return self.output_layer(self.dropout(contextual))
111
+
MHGTagger/__pycache__/CRFTagger.cpython-310.pyc ADDED
Binary file (2.85 kB). View file
 
MHGTagger/__pycache__/CRFTagger.cpython-37.pyc ADDED
Binary file (2.81 kB). View file
 
MHGTagger/__pycache__/CRFTagger.cpython-38.pyc ADDED
Binary file (2.84 kB). View file
 
MHGTagger/__pycache__/Data.cpython-37.pyc ADDED
Binary file (5.64 kB). View file
 
MHGTagger/__pycache__/Data.cpython-38.pyc ADDED
Binary file (5.83 kB). View file
 
MHGTagger/__pycache__/NMT.cpython-310.pyc ADDED
Binary file (9.98 kB). View file
 
MHGTagger/__pycache__/NMTData.cpython-310.pyc ADDED
Binary file (6.78 kB). View file
 
MHGTagger/__pycache__/RNNData.cpython-310.pyc ADDED
Binary file (6.2 kB). View file
 
MHGTagger/__pycache__/RNNData.cpython-37.pyc ADDED
Binary file (6.04 kB). View file
 
MHGTagger/__pycache__/RNNData.cpython-38.pyc ADDED
Binary file (6.08 kB). View file
 
MHGTagger/__pycache__/RNNTagger.cpython-310.pyc ADDED
Binary file (2.94 kB). View file
 
MHGTagger/__pycache__/RNNTagger.cpython-37.pyc ADDED
Binary file (3.13 kB). View file
 
MHGTagger/__pycache__/RNNTagger.cpython-38.pyc ADDED
Binary file (3.11 kB). View file
 
MHGTagger/__pycache__/rnn_annotate.cpython-38.pyc ADDED
Binary file (3.28 kB). View file
 
MHGTagger/rnn_annotate.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+
3
+ import sys
4
+ import pickle
5
+ import torch
6
+ from huggingface_hub import hf_hub_download
7
+
8
+ from .Data import Data
9
+ from .RNNTagger import RNNTagger
10
+ from .CRFTagger import CRFTagger
11
+
12
+
13
+ ###########################################################################
14
+ # main function
15
+ ###########################################################################
16
+
17
class Args:
    """ Plain container for the options of ``annotate`` (stands in for argparse) """

    def __init__(self, path_param, model_id, path_data, crf_beam_size, gpu, min_prob, print_probs) -> None:
        # model location
        self.path_param, self.model_id = path_param, model_id
        # input data
        self.path_data = path_data
        # processing options
        self.crf_beam_size, self.gpu = crf_beam_size, gpu
        # output options
        self.min_prob, self.print_probs = min_prob, print_probs
26
+
27
+ # if __name__ == "__main__":
28
def annotate(tokens, path_param='MHGTagger/tagger', model_id='nielklug/rnn_tagger', path_data='', crf_beam_size=10, gpu=-1, min_prob=-1.0, print_probs=True):
    """
    Tag one tokenized sentence.

    tokens:        list of word strings
    path_param:    path prefix of the parameter files (.io and .hyper)
    model_id:      Hugging Face repository containing the file tagger.rnn
    path_data:     not used by this function
    crf_beam_size: not used by this function
    gpu:           index of the GPU to use (CPU = -1)
    min_prob:      if different from -1.0, print all tags whose probability
                   exceeds the probability of the best tag times this threshold
    print_probs:   also compute the tag probabilities

    Returns the tuple (words, tags, probs).  ``probs`` is only filled for a
    model without CRF layer when ``print_probs`` is set.  If ``min_prob`` is
    not -1.0 the result is printed to stdout and the returned lists are empty.
    """
    args = Args(path_param, model_id, path_data, crf_beam_size, gpu, min_prob, print_probs)

    # Select the processing device
    if args.gpu >= 0:
        if not torch.cuda.is_available():
            print('No gpu available. Using cpu instead.', file=sys.stderr)
            args.gpu = -1
        else:
            if args.gpu >= torch.cuda.device_count():
                print('gpu '+str(args.gpu)+' not available. Using gpu 0 instead.', file=sys.stderr)
                args.gpu = 0
            torch.cuda.set_device(args.gpu)
    device = torch.device('cuda' if args.gpu >= 0 else 'cpu')

    # load parameters
    data = Data(args.path_param+'.io')  # read the symbol mapping tables

    # NOTE(security): pickle.load and torch.load can execute arbitrary code;
    # only use parameter and model files from a trusted source.
    with open(args.path_param+'.hyper', 'rb') as file:
        hyper_params = pickle.load(file)

    # CRFTagger takes the 8 hyper-parameters of RNNTagger plus the beam size
    num_crf_hyper_params = 9
    if len(hyper_params) == num_crf_hyper_params:
        model = CRFTagger(*hyper_params)
    else:
        model = RNNTagger(*hyper_params)

    model_file = hf_hub_download(repo_id=args.model_id, filename='tagger.rnn')
    model.load_state_dict(torch.load(model_file,
                                     map_location=torch.device('cpu')))

    model = model.to(device)

    if isinstance(model, CRFTagger):
        # only warn about options which were actually set
        # (-1.0 is the "not set" value of min_prob)
        for is_set, option in zip((args.min_prob != -1.0, args.print_probs),
                                  ("min_prob", "print_probs")):
            if is_set:
                print(f"Warning: Option --{option} is ignored because the model has a CRF output layer", file=sys.stderr)

    words_all = []
    tagged = []
    probs_all = []

    model.eval()
    with torch.no_grad():
        for words in data.single_sentences(tokens):

            # map words to numbers and create Torch variables
            fwd_charIDs, bwd_charIDs = data.words2charIDvec(words)
            fwd_charIDs = torch.LongTensor(fwd_charIDs).to(device)
            bwd_charIDs = torch.LongTensor(bwd_charIDs).to(device)

            # run the model
            if isinstance(model, RNNTagger):
                tagscores = model(fwd_charIDs, bwd_charIDs)
                if args.min_prob == -1.0:
                    # only the tag with the highest score for each word
                    tagIDs = tagscores.argmax(-1)
                    tags = data.IDs2tags(tagIDs.to("cpu"))
                    if not args.print_probs:
                        for word, tag in zip(words, tags):
                            words_all.append(word)
                            tagged.append(tag)
                    else:
                        # return probabilities as well
                        tagprobs = torch.nn.functional.softmax(tagscores, dim=-1)
                        # get the probabilities of the highest-scoring tags
                        probs = tagprobs[range(len(tagIDs)), tagIDs].to("cpu").tolist()
                        for word, tag, prob in zip(words, tags, probs):
                            words_all.append(word)
                            tagged.append(tag)
                            probs_all.append(round(float(prob), 4))
                else:
                    # print the best tags for each word
                    tagprobs = torch.nn.functional.softmax(tagscores, dim=-1)
                    # get the most probable tag and its probability
                    best_probs, _ = tagprobs.max(-1)
                    # get all tags with a probability above best_prob * min_prob
                    thresholds = best_probs * args.min_prob
                    greaterflags = (tagprobs > thresholds.unsqueeze(1))
                    for word, flags, probs in zip(words, greaterflags, tagprobs):
                        # get the IDs of the best tags
                        IDs = flags.nonzero()
                        # get the best tags and their probabilities
                        best_probs = probs[IDs].to("cpu")
                        best_tags = data.IDs2tags(IDs.to("cpu"))
                        # sort the tags by decreasing probability
                        sorted_list = sorted(zip(best_tags, best_probs), key=lambda x:-x[1])
                        best_tags, best_probs = zip(*sorted_list)
                        # generate the output
                        if args.print_probs:
                            # append the probabilities to the tags
                            best_tags = [f"{t} {float(p):.4f}" for t, p in zip(best_tags, best_probs)]
                        print(word, ' '.join(best_tags), sep="\t")
            elif isinstance(model, CRFTagger):
                tagIDs = model(fwd_charIDs, bwd_charIDs)
                tags = data.IDs2tags(tagIDs)
                for word, tag in zip(words, tags):
                    print(word, tag, sep='\t')
                    # also return the result instead of only printing it
                    words_all.append(word)
                    tagged.append(tag)
            else:
                sys.exit('Error')

    return (words_all, tagged, probs_all)
MHGTagger/tagger.hyper ADDED
Binary file (41 Bytes). View file
 
MHGTagger/tagger.io ADDED
Binary file (229 kB). View file
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Mhg Parsing
3
  emoji: 🌍
4
  colorFrom: gray
5
  colorTo: red
 
1
  ---
2
+ title: MHG Parsing
3
  emoji: 🌍
4
  colorFrom: gray
5
  colorTo: red
Tagset_Mappings/POS-mapping.txt ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $_ $_
2
+ ADJA ADJA
3
+ ADJD ADJD
4
+ ADJN ADJA.Pos
5
+ ADJS ADJA
6
+ APPR APPR
7
+ APPRART APPRART
8
+ AVD ADV
9
+ AVD-KO* ADV
10
+ AVG PWAV
11
+ AVW PWAV
12
+ CARDA CARD
13
+ CARDD CARD
14
+ CARDN CARD
15
+ CARDS CARD
16
+ DDA PDAT
17
+ DDART ART
18
+ DDD PDAT
19
+ DDN PDAT
20
+ DDS PDS
21
+ DGA PWAT
22
+ DGS PWS
23
+ DIA PIAT
24
+ DIART ART
25
+ DID PDAT
26
+ DIN PDAT
27
+ DIS PIS
28
+ DPOSA PPOSAT
29
+ DPOSD PPOSS
30
+ DPOSN PPOSAT
31
+ DPOSS NN
32
+ DRELS PRELS
33
+ DWA PWAT
34
+ DWD PWS
35
+ DWS PWS
36
+ FM FM
37
+ ITJ ITJ
38
+ KO* KOUS
39
+ KOKOM KOKOM
40
+ KON KON
41
+ KOUS KOUS
42
+ NA NN
43
+ NE NE
44
+ PART PART
45
+ PAVAP PROAV
46
+ PAVD PROAV
47
+ PAVG PROAV
48
+ PAVW PWAV
49
+ PG PWS
50
+ PI PIS
51
+ PPER PPER
52
+ PRF PRF
53
+ PTK ADV
54
+ PTKA PTKA
55
+ PTKANT PTKANT
56
+ PTKNEG PTKNEG
57
+ PTKVZ PTKVZ
58
+ PW PWS
59
+ VAFIN VAFIN
60
+ VAIMP VAIMP
61
+ VAINF VAINF
62
+ VAPP VAPP
63
+ VAPS ADJD.Pos
64
+ VMFIN VMFIN
65
+ VMIMP VMIMP
66
+ VMINF VMINF
67
+ VMPP VMPP
68
+ VMPS ADJD.Pos
69
+ VVFIN VVFIN
70
+ VVIMP VVIMP
71
+ VVINF VVINF
72
+ VVPP VVPP
73
+ VVPS ADJD.Pos
Tagset_Mappings/__pycache__/tag_mapping.cpython-38.pyc ADDED
Binary file (3.28 kB). View file
 
Tagset_Mappings/feature-mapping.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Masc,Fem *
2
+ Fem,Masc *
3
+ Masc,Neut *
4
+ Neut,Masc *
5
+ Fem,Neut *
6
+ Neut,Fem *
7
+ Abl Dat
8
+ Instr Dat
9
+ Akk Acc
10
+ Voc Nom
11
+ bSg Sg
Tagset_Mappings/tag_mapping.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+
3
+ """
4
+ cd schmid/MHG-Parser/Tagset-Mappings
5
+ python tag-mapping.py ../self-attentive-parser-master/data/mhg/MHG.tagged > ../self-attentive-parser-master/data/mhg/MHG_new.mapped
6
+ """
7
+
8
+ import sys
9
+ import fileinput
10
+
11
from pathlib import Path

# The mapping tables are stored next to this module.  Resolving them
# relative to the module (instead of the current working directory) makes
# the import work no matter where the program is started from.
_MAPPING_DIR = Path(__file__).resolve().parent


def _load_mapping(filename):
    """Read a two-column, whitespace-separated table into a dictionary."""
    with open(_MAPPING_DIR / filename) as file:
        return dict(line.split() for line in file if line.strip())


# maps the POS tags of the input tagset to the POS tags of the target tagset
pos_map = _load_mapping("POS-mapping.txt")

# maps single morphological features to their target names
feature_map = _load_mapping("feature-mapping.txt")
16
+
17
def map_tags(tags):
    """ Apply map_tag to every tag of a sequence and return the results as a list """
    mapped = []
    for tag in tags:
        mapped.append(map_tag(tag))
    return mapped
19
+
20
+
21
def map_tag(tag):
    """
    Map a tag of the form POS.feature1.feature2... to the target tagset.

    The POS part (only the first alternative if several are separated by
    '|') is translated with ``pos_map``; an unknown POS raises a KeyError.
    Features contributed by the mapped POS are put in front of the original
    features, each feature is translated with ``feature_map``, and the
    feature list is then reordered, padded with '*' or replaced depending
    on the POS and the number of features.  Combinations which are not
    listed below keep their features unchanged.
    """
    # The degree features of AVD tags (AVD.Comp, AVD.Sup) need no special
    # treatment here: AVD is mapped to ADV and all features of ADV are
    # dropped below.  (The original code called str.replace for this purpose
    # but discarded the result, so it never had any effect.)
    pos, *features = tag.split(".")
    pos = pos.split('|')[0]
    pos = pos_map[pos]
    pos, *features2 = pos.split(".")
    features = features2 + features
    features = [feature_map.get(f, f) for f in features]
    num = len(features)

    if pos == 'ADJA':
        if num == 5:
            features = [features[0], features[2], features[3], features[1]]
        elif num in [3, 4]:
            features = [features[0], features[2], '*', features[1]]
        elif num == 2:
            features = [features[0], '*', '*', features[1]]
        elif num == 1:
            features = [features[0], '*', '*', '*']
    elif pos in ['ADV', 'CARD']:
        features = []
    elif pos in ['ART', 'APPRART']:
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num in [0, 1]:
            features = ['*', '*', '*']
    elif pos == 'NN':
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num == 0:
            features = ['*', '*', '*']
    elif pos == 'NE':
        if num == 2:
            features.append('*')
        elif num == 1:
            features.extend(['*', '*'])
    elif pos == 'PDAT':
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num == 0:
            features = ['*', '*', '*']
    elif pos == 'PIAT':
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num == 2:
            features = [features[1], '*', features[0]]
        elif num == 0:
            features = ['*', '*', '*']
    elif pos == 'PPOSAT':
        if num in [3, 4]:
            features = [features[1], features[2], features[0]]
        elif num == 0:
            features = ['*', '*', '*']
    elif pos == 'PWAT' and num == 4:
        features = [features[1], features[2], features[0]]
    elif pos == 'PPOSS':
        features = ['*.*.*']
    elif pos == 'PDS':
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num == 1:
            features.extend(['*', '*'])
        elif num == 2:
            features = [features[1], '*', '*']
    elif pos == 'PIS':
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num == 0:
            features = ['*', '*', '*']
    elif pos == 'PWS':
        if num == 4:
            features = [features[1], features[2], features[0]]
        elif num == 0:
            features = ['*', '*', '*']
    elif pos == 'PRELS' and num == 3:
        features = [features[1], features[2], features[0]]
    elif pos == 'PPER' and num == 4:
        features = [features[3], features[1], features[2], features[0]]
    elif pos == 'PRF' and num == 3:
        features = ['*', features[0], features[1]]
    elif pos in ['VAFIN', 'VMFIN', 'VVFIN'] and num == 4:
        features = [features[3], features[2], features[1], features[0]]
    elif pos in ['VAIMP', 'VMIMP', 'VVIMP'] and num == 2:
        features = [features[1], features[0], 'Imp']
    elif pos in ['VAINF', 'VMINF', 'VVINF'] and num == 0:
        features = ['Inf']
    elif pos in ['VAPP', 'VMPP', 'VVPP'] and num == 0:
        features = ['Psp']
    return '.'.join([pos]+features)
108
+
109
+ # for i, line in enumerate(fileinput.input()):
110
+ # print(i, end="\r", file=sys.stderr)
111
+ # line = line.strip()
112
+ # if line:
113
+ # word, tag, *_ = line.split("\t")
114
+ # tag = tag.replace('APPR|DDART', 'APPRART')
115
+ # for t in tag.split("|"):
116
+ # print(word, map_tag(t), sep="\t")
117
+ # else:
118
+ # print()
119
+
120
+
121
+ # for i, line in enumerate(fileinput.input()):
122
+ # print(i, end="\r", file=sys.stderr)
123
+ # line = line.strip()
124
+ # if line:
125
+ # word, tag, *_ = line.split("\t")
126
+ # tag = tag.replace('APPR|DDART', 'APPRART')
127
+ # print(word, map_tag(tag.split('|')[0]), sep="\t")
128
+ # else:
129
+ # print()
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from parse import parse_text
3
+ import nltk
4
+ from nltk import Tree
5
+ import pandas as pd
6
+ import re
7
+ from nltk.tree.prettyprinter import TreePrettyPrinter
8
+
9
+
10
# Streamlit demo page: read MHG text, run the tagging/parsing pipeline from
# parse.py, and show the tagged tokens plus the resulting parse tree.
st.title("MHG parsing system (demo)")
text = st.text_area("""This is a simple demo of a Middle High German (MHG) parsing system using delexicalization method.\n\n
Enter some MHG text below!""")

st.text("""Example MHG sentences:
1. Swer an rehte güete wendet sîn gemüete, dem volget sælde und êre, des gît gewisse
lêre künec Artûs der guote, der mit rîters muote nâch lobe kunde strîten.
2. Uns ist in alten mæren wunders vil geseit von helden lobebæren, von grôzer arebeit,
von freuden, hôchgezîten, von weinen und von klagen, von küener recken strîten muget
ir nu wunder hœren sagen.""")

# Presumably required by nltk's word_tokenize, which parse.py uses -- confirm.
nltk.download('punkt')


if text:
    tokens, tags, probs, parse_tree = parse_text(text)

    # Create a table to show the tagged results, one row per token.
    zipped = list(zip(tokens, tags, probs))

    df = pd.DataFrame(zipped, columns=['Token', 'Tag', 'Prob.'])

    # Convert the bracket parse tree into an NLTK Tree. The regex first
    # removes every ".suffix" run (a dot followed by characters other than
    # space or ")") from the bracketed string.
    t = Tree.fromstring(re.sub(r'(\.[^ )]+)+', '', parse_tree))

    tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')

    col1 = st.columns(1)[0]
    col1.header("POS tagging result:")
    col1.table(df)

    col2 = st.columns(1)[0]
    col2.header("Parsing result:")
    # Backslash-escape characters that Markdown would otherwise interpret,
    # since Streamlit renders written strings as Markdown. The backslashes are
    # doubled so the literals are valid escapes (a bare '\_' is an invalid
    # escape sequence and triggers a warning on recent Python versions).
    col2.write(parse_tree.replace('_', '\\_').replace('$', '\\$').replace('*', '\\*'))

    # Display the graph in the Streamlit app
    col2.image(tree_svg, use_column_width=True)
47
+
parse.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from MHGTagger.rnn_annotate import annotate
3
+ from Tagset_Mappings.tag_mapping import map_tags
4
+ from parsing.src.parse import run_parse
5
+ from nltk import word_tokenize
6
+
7
def parse_text(text):
    """Tokenize, tag and parse *text* in one call.

    The text is split with ``tokenize``, the tokens are passed to
    ``annotate`` (which returns tokens, tags and probabilities), the tags are
    converted with ``map_tags``, and the first result of ``run_parse`` is
    taken as the parse tree.

    Returns:
        A tuple ``(tokens, tags, probs, parse_tree)`` where ``tags`` are the
        mapped tags.
    """
    tokens, raw_tags, probs = annotate(tokenize(text))
    tags = map_tags(raw_tags)
    parses = run_parse(tokens, tags)
    return tokens, tags, probs, parses[0]
13
+
14
def tokenize(text: str):
    """Split *text* into tokens after spacing out punctuation.

    Two substitution passes put a single space on each side of any of
    ``. , ; : ? ! "`` that is followed by whitespace; the second pass catches
    marks that only became space-followed after the first pass (for example
    the first of two adjacent marks). The padded text is then handed to
    nltk's ``word_tokenize``.
    """
    padding_patterns = (
        r'\s*([.,;:?!"])\s',
        r'\s*([.,;:?!"]) ',
    )
    for pattern in padding_patterns:
        text = re.sub(pattern, r' \1 ', text)
    return word_tokenize(text)
19
+
parsing/EVALB/COLLINS.prm ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##------------------------------------------##
2
+ ## Debug mode ##
3
+ ## 0: No debugging ##
4
+ ## 1: print data for individual sentence ##
5
+ ##------------------------------------------##
6
+ DEBUG 0
7
+
8
+ ##------------------------------------------##
9
+ ## MAX error ##
10
+ ## Number of error to stop the process. ##
11
+ ## This is useful if there could be ##
12
+ ## tokenization error. ##
13
+ ## The process will stop when this number##
14
+ ## of errors are accumulated. ##
15
+ ##------------------------------------------##
16
+ MAX_ERROR 10
17
+
18
+ ##------------------------------------------##
19
+ ## Cut-off length for statistics ##
20
+ ## At the end of evaluation, the ##
21
+ ## statistics for the sentences of length##
22
+ ## less than or equal to this number will##
23
+ ## be shown, on top of the statistics ##
24
+ ## for all the sentences ##
25
+ ##------------------------------------------##
26
+ CUTOFF_LEN 40
27
+
28
+ ##------------------------------------------##
29
+ ## unlabeled or labeled bracketing ##
30
+ ## 0: unlabeled bracketing ##
31
+ ## 1: labeled bracketing ##
32
+ ##------------------------------------------##
33
+ LABELED 1
34
+
35
+ ##------------------------------------------##
36
+ ## Delete labels ##
37
+ ## list of labels to be ignored. ##
38
+ ## If it is a pre-terminal label, delete ##
39
+ ## the word along with the brackets. ##
40
+ ## If it is a non-terminal label, just ##
41
+ ## delete the brackets (don't delete ##
42
+ ## children). ##
43
+ ##------------------------------------------##
44
+ DELETE_LABEL TOP
45
+ DELETE_LABEL -NONE-
46
+ DELETE_LABEL ,
47
+ DELETE_LABEL :
48
+ DELETE_LABEL ``
49
+ DELETE_LABEL ''
50
+ DELETE_LABEL .
51
+
52
+ ##------------------------------------------##
53
+ ## Delete labels for length calculation ##
54
+ ## list of labels to be ignored for ##
55
+ ## length calculation purpose ##
56
+ ##------------------------------------------##
57
+ DELETE_LABEL_FOR_LENGTH -NONE-
58
+
59
+ ##------------------------------------------##
60
+ ## Equivalent labels, words ##
61
+ ## the pairs are considered equivalent ##
62
+ ## This is non-directional. ##
63
+ ##------------------------------------------##
64
+ EQ_LABEL ADVP PRT
65
+
66
+ # EQ_WORD Example example
parsing/EVALB/LICENSE ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This is free and unencumbered software released into the public domain.
2
+
3
+ Anyone is free to copy, modify, publish, use, compile, sell, or
4
+ distribute this software, either in source code form or as a compiled
5
+ binary, for any purpose, commercial or non-commercial, and by any
6
+ means.
7
+
8
+ In jurisdictions that recognize copyright laws, the author or authors
9
+ of this software dedicate any and all copyright interest in the
10
+ software to the public domain. We make this dedication for the benefit
11
+ of the public at large and to the detriment of our heirs and
12
+ successors. We intend this dedication to be an overt act of
13
+ relinquishment in perpetuity of all present and future rights to this
14
+ software under copyright law.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
23
+
24
+ For more information, please refer to <http://unlicense.org/>
parsing/EVALB/Makefile ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ all: evalb
2
+
3
+ evalb: evalb.c
4
+ gcc -Wall -g -o evalb evalb.c
parsing/EVALB/README ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################
2
+ # #
3
+ # Bug fix and additional functionality for evalb #
4
+ # #
5
+ # This updated version of evalb fixes a bug in which sentences #
6
+ # were incorrectly categorized as "length mismatch" when the #
7
+ # the parse output had certain mislabeled parts-of-speech. #
8
+ # #
9
+ # The bug was the result of evalb treating one of the tags (in #
10
+ # gold or test) as a label to be deleted (see sections [6],[7] #
11
+ # for details), but not the corresponding tag in the other. #
12
+ # This most often occurs with punctuation. See the subdir #
13
+ # "bug" for an example gld and tst file demonstating the bug, #
14
+ # as well as output of evalb with and without the bug fix. #
15
+ # #
16
+ # For the present version in case of length mismatch, the nodes #
17
+ # causing the imbalance are reinserted to resolve the miscount. #
18
+ # If the lengths of gold and test truly differ, the error is #
19
+ # still reported. The parameter file "new.prm" (derived from #
20
+ # COLLINS.prm) shows how to add new potential mislabelings for #
21
+ # quotes (",``,',`). #
22
+ # #
23
+ # I have preserved DJB's revision for modern compilers except #
24
+ # for the delcaration of "exit" which is provided by stdlib. #
25
+ # #
26
+ # Other changes: #
27
+ # #
28
+ # * output of F-Measure in addition to precision and recall #
29
+ # (I did not update the documention in section [4] for this) #
30
+ # #
31
+ # * more comprehensive DEBUG output that includes bracketing #
32
+ # information as evalb is processing each sentence #
33
+ # (useful in working through this, and peraps other bugs). #
34
+ # Use either the "-D" run-time switch or set DEBUG to 2 in #
35
+ # the parameter file. #
36
+ # #
37
+ # * added DELETE_LABEL lines in new.prm for S1 nodes produced #
38
+ # by the Charniak parser and "?", "!" punctuation produced by #
39
+ # the Bikel parser. #
40
+ # #
41
+ # #
42
+ # David Ellis (Brown) #
43
+ # #
44
+ # January.2006 #
45
+ #################################################################
46
+
47
+ #################################################################
48
+ # #
49
+ # Update of evalb for modern compilers #
50
+ # #
51
+ # This is an updated version of evalb, for use with modern C #
52
+ # compilers. There are a few updates, each marked in the code: #
53
+ # #
54
+ # /* DJB: explanation of comment */ #
55
+ # #
56
+ # The updates are purely to help compilation with recent #
57
+ # versions of GCC (and other C compilers). There are *NO* other #
58
+ # changes to the algorithm itself. #
59
+ # #
60
+ # I have made these changes following recommendations from #
61
+ # users of the Corpora Mailing List, especially Peet Morris and #
62
+ # Ramon Ziai. #
63
+ # #
64
+ # David Brooks (Birmingham) #
65
+ # #
66
+ # September.2005 #
67
+ #################################################################
68
+
69
+ #################################################################
70
+ # #
71
+ # README file for evalb #
72
+ # #
73
+ # Satoshi Sekine (NYU) #
74
+ # Mike Collins (UPenn) #
75
+ # #
76
+ # October.1997 #
77
+ #################################################################
78
+
79
+ Contents of this README:
80
+
81
+ [0] COPYRIGHT
82
+ [1] INTRODUCTION
83
+ [2] INSTALLATION AND RUN
84
+ [3] OPTIONS
85
+ [4] OUTPUT FORMAT FROM THE SCORER
86
+ [5] HOW TO CREATE A GOLDFILE FROM THE TREEBANK
87
+ [6] THE PARAMETER FILE
88
+ [7] MORE DETAILS ABOUT THE SCORING ALGORITHM
89
+
90
+
91
+ [0] COPYRIGHT
92
+
93
+ The authors abandon the copyright of this program. Everyone is
94
+ permitted to copy and distribute the program or a portion of the program
95
+ with no charge and no restrictions unless it is harmful to someone.
96
+
97
+ However, the authors are delightful for the user's kindness of proper
98
+ usage and letting the authors know bugs or problems.
99
+
100
+ This software is provided "AS IS", and the authors make no warranties,
101
+ express or implied.
102
+
103
+ To legally enforce the abandonment of copyright, this package is released
104
+ under the Unlicense (see LICENSE).
105
+
106
+ [1] INTRODUCTION
107
+
108
+ Evaluation of bracketing looks simple, but in fact, there are minor
109
+ differences from system to system. This is a program to parametarize
110
+ such minor differences and to give an informative result.
111
+
112
+ "evalb" evaluates bracketing accuracy in a test-file against a gold-file.
113
+ It returns recall, precision, tagging accuracy. It uses an identical
114
+ algorithm to that used in (Collins ACL97).
115
+
116
+
117
+ [2] Installation and Run
118
+
119
+ To compile the scorer, type
120
+
121
+ > make
122
+
123
+
124
+ To run the scorer:
125
+
126
+ > evalb -p Parameter_file Gold_file Test_file
127
+
128
+
129
+ For example to use the sample files:
130
+
131
+ > evalb -p sample.prm sample.gld sample.tst
132
+
133
+
134
+
135
+ [3] OPTIONS
136
+
137
+ You can specify system parameters in the command line options.
138
+ Other options concerning to evaluation metrix should be specified
139
+ in parameter file, described later.
140
+
141
+ -p param_file parameter file
142
+ -d debug mode
143
+ -e n number of error to kill (default=10)
144
+ -h help
145
+
146
+
147
+
148
+ [4] OUTPUT FORMAT FROM THE SCORER
149
+
150
+ The scorer gives individual scores for each sentence, for
151
+ example:
152
+
153
+ Sent. Matched Bracket Cross Correct Tag
154
+ ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
155
+ ============================================================================
156
+ 1 8 0 100.00 100.00 5 5 5 0 6 5 83.33
157
+
158
+ At the end of the output the === Summary === section gives statistics
159
+ for all sentences, and for sentences <=40 words in length. The summary
160
+ contains the following information:
161
+
162
+ i) Number of sentences -- total number of sentences.
163
+
164
+ ii) Number of Error/Skip sentences -- should both be 0 if there is no
165
+ problem with the parsed/gold files.
166
+
167
+ iii) Number of valid sentences = Number of sentences - Number of Error/Skip
168
+ sentences
169
+
170
+ iv) Bracketing recall = (number of correct constituents)
171
+ ----------------------------------------
172
+ (number of constituents in the goldfile)
173
+
174
+ v) Bracketing precision = (number of correct constituents)
175
+ ----------------------------------------
176
+ (number of constituents in the parsed file)
177
+
178
+ vi) Complete match = percentaage of sentences where recall and precision are
179
+ both 100%.
180
+
181
+ vii) Average crossing = (number of constituents crossing a goldfile constituen
182
+ ----------------------------------------------------
183
+ (number of sentences)
184
+
185
+ viii) No crossing = percentage of sentences which have 0 crossing brackets.
186
+
187
+ ix) 2 or less crossing = percentage of sentences which have <=2 crossing brackets.
188
+
189
+ x) Tagging accuracy = percentage of correct POS tags (but see [5].3 for exact
190
+ details of what is counted).
191
+
192
+
193
+
194
+ [5] HOW TO CREATE A GOLDFILE FROM THE PENN TREEBANK
195
+
196
+
197
+ The gold and parsed files are in a format similar to this:
198
+
199
+ (TOP (S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .)))
200
+
201
+ To create a gold file from the treebank:
202
+
203
+ tgrep -wn '/.*/' | tgrep_proc.prl
204
+
205
+ will produce a goldfile in the required format. ("tgrep -wn '/.*/'" prints
206
+ parse trees, "tgrep_process.prl" just skips blank lines).
207
+
208
+ For example, to produce a goldfile for section 23 of the treebank:
209
+
210
+ tgrep -wn '/.*/' | tail +90895 | tgrep_process.prl | sed 2416q > sec23.gold
211
+
212
+
213
+
214
+ [6] THE PARAMETER (.prm) FILE
215
+
216
+
217
+ The .prm file sets options regarding the scoring method. COLLINS.prm gives
218
+ the same scoring behaviour as the scorer used in (Collins 97). The options
219
+ chosen were:
220
+
221
+ 1) LABELED 1
222
+
223
+ to give labelled precision/recall figures, i.e. a constituent must have the
224
+ same span *and* label as a constituent in the goldfile.
225
+
226
+ 2) DELETE_LABEL TOP
227
+
228
+ Don't count the "TOP" label (which is always given in the output of tgrep)
229
+ when scoring.
230
+
231
+ 3) DELETE_LABEL -NONE-
232
+
233
+ Remove traces (and all constituents which dominate nothing but traces) when
234
+ scoring. For example
235
+
236
+ .... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
237
+
238
+ would be processed to give
239
+
240
+ .... (VP (VBD reported)) (. .)))
241
+
242
+
243
+ 4)
244
+ DELETE_LABEL , -- for the purposes of scoring remove punctuation
245
+ DELETE_LABEL :
246
+ DELETE_LABEL ``
247
+ DELETE_LABEL ''
248
+ DELETE_LABEL .
249
+
250
+ 5) DELETE_LABEL_FOR_LENGTH -NONE- -- don't include traces when calculating
251
+ the length of a sentence (important
252
+ when classifying a sentence as <=40
253
+ words or >40 words)
254
+
255
+ 6) EQ_LABEL ADVP PRT
256
+
257
+ Count ADVP and PRT as being the same label when scoring.
258
+
259
+
260
+
261
+
262
+ [7] MORE DETAILS ABOUT THE SCORING ALGORITHM
263
+
264
+
265
+ 1) The scorer initially processes the files to remove all nodes specified
266
+ by DELETE_LABEL in the .prm file. It also recursively removes nodes which
267
+ dominate nothing due to all their children being removed. For example, if
268
+ -NONE- is specified as a label to be deleted,
269
+
270
+ .... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
271
+
272
+ would be processed to give
273
+
274
+ .... (VP (VBD reported)) (. .)))
275
+
276
+ 2) The scorer also removes all functional tags attached to non-terminals
277
+ (functional tags are prefixed with "-" or "=" in the treebank). For example
278
+ "NP-SBJ" is processed to give "NP", "NP=2" is changed to "NP".
279
+
280
+
281
+ 3) Tagging accuracy counts tags for all words *except* any tags which are
282
+ deleted by a DELETE_LABEL specification in the .prm file. (For example, for
283
+ COLLINS.prm, punctuation tagged as "," ":" etc. would not be included).
284
+
285
+ 4) When calculating the length of a sentence, all words with POS tags not
286
+ included in the "DELETE_LABEL_FOR_LENGTH" list in the .prm file are
287
+ counted. (For COLLINS.prm, only "-NONE-" is specified in this list, so
288
+ traces are removed before calculating the length of the sentence).
289
+
290
+ 5) There are some subtleties in scoring when either the goldfile or parsed
291
+ file contains multiple constituents for the same span which have the same
292
+ non-terminal label. e.g. (NP (NP the man)) If the goldfile contains n
293
+ constituents for the same span, and the parsed file contains m constituents
294
+ with that nonterminal, the scorer works as follows:
295
+
296
+ i) If m>n, then the precision is n/m, recall is 100%
297
+
298
+ ii) If n>m, then the precision is 100%, recall is m/n.
299
+
300
+ iii) If n==m, recall and precision are both 100%.
parsing/EVALB/bug/bug.gld ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ (TOP (S (NP-SBJ (DT The) (NN Thy-1) (NN gene) (NN promoter) ) (VP (VBZ resembles) (NP (DT a) (`` ") (JJ housekeeping) ('' ") (NN promoter) ) (PP (IN in) (SBAR (IN that) (S (NP-SBJ-68 (PRP it) ) (VP-COOD (VP (VBZ is) (ADJP-PRD (JJ located) (PP (IN within) (NP (DT a) (JJ methylation-free) (NN island) )))) (, ,) (VP (VBZ lacks) (NP (DT a) (JJ canonical) (NN TATA) (NN box) )) (, ,) (CC and) (VP (VBZ displays) (NP (NN heterogeneity) ) (PP (IN in) (NP (NP (DT the) (JJ 5'-end) (NNS termini) ) (PP (IN of) (NP (DT the) (NN mRNA) )))))))))) (. .) ) )
2
+ (TOP (S (NP-SBJ (DT The) (JJ latter) (`` ") (NP (NP (JJ nuclear) (NN factor) ) (PP (IN for) (NP (VBN activated) (NN T) (NNS cells) ))) ('' ") ) (ADVP (RB likely) ) (VP (VBZ contributes) (PP (TO to) (NP (NP (DT the) (NN tissue) (NN specificity) ) (PP (IN of) (NP (NN IL-2) (NN gene) (NN expression) ))))) (. .) ) )
3
+ (TOP (S (ADVP (RB Thus) ) (, ,) (NP-SBJ (PRP we) ) (VP (VBD postulated) (SBAR-COOD (SBAR (IN that) (S (NP-SBJ (NP (DT the) (JJ circadian) (NN modification) ) (PP (IN of) (NP (NN GR) ))) (VP (VBD was) (ADJP-PRD (JJ independent) (PP (IN of) (NP-COOD (NP (NP (DT the) (JJ diurnal) (NNS fluctuations) ) (PP (IN in) (NP (NN plasma) (NN cortisol) (NN level) ))) (CC or) (NP (NP (DT the) (JJ circadian) (NNS variations) ) (PP (IN in) (NP (JJ environmental) (NN lighting) ))))))))) (CC and) (SBAR (IN that) (S (NP-SBJ-79 (DT the) (NN rhythmicity) ) (VP (MD might) (VP (VB be) (VP (VBN regulated) (NP (-NONE- *-79) ) (PP (IN by) (NP-LGS (NP (DT the) (`` ') (JJ circadian) (NN pacemaker) ('' ') ) (ADJP (JJ located) (PP (IN in) (NP (DT the) (JJ human) (JJ basal) (NN brain) )))))))))))) (. .) ) )
4
+ (TOP (S (NP-SBJ-70 (JJ Such) (NN transcription) (NNS factors) ) (VP (VBP play) (NP (DT a) (JJ key) (NN role) ) (PP (IN in) (NP (NP (DT the) (NN development) ) (PP (IN of) (NP (DT the) (JJ mature) (NN T-cell) (NN phenotype) )))) (PP (IN by) (S (NP-SBJ (-NONE- *-70) ) (VP (VBG functioning) (PP (IN as) (`` ') (NP (NP (JJ master) (NNS regulators) ) (PP (IN of) (NP (NN T-cell) (NN differentiation) ))) ('' ') ))))) (. .) ) )
5
+ (TOP (S (NP-SBJ (NP (DT The) (NN conversion) ) (PP (IN of) (NP (DT the) (NN TCEd) )) (PP (TO to) (NP (DT a) (`` ') (JJ perfect) ('' ') (NN NF-kB) (NN binding) (NN site) ))) (VP-COOD (VP (VBZ leads) (PP (TO to) (NP-19 (NP (DT a) (JJR tighter) (NN binding) ) (PP (IN of) (NP (NN NF-kB) )) (PP (TO to) (NP (NN TCEd) (NN DNA) ))))) (CC and) (, ,) (VP (PP (IN as) (NP (DT a) (JJ functional) (NN consequence) )) (, ,) (PP (TO to) (NP=19 (NP (DT the) (NN activity) ) (PP (IN of) (NP (DT the) (`` ') (VBN converted) ('' ') (NN TCEd) (NNS motifs) )) (PP (IN in) (NP (NN HeLa) (NNS cells) )))))) (. .) ) )
parsing/EVALB/bug/bug.rsl-new ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Sent. Matched Bracket Cross Correct Tag
2
+ ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
3
+ ============================================================================
4
+ 1 37 0 77.27 65.38 17 22 26 5 34 27 79.41
5
+ 2 21 0 69.23 64.29 9 13 14 2 20 16 80.00
6
+ 3 47 0 80.00 82.35 28 35 34 4 44 40 90.91
7
+ 4 26 0 35.29 37.50 6 17 16 8 25 18 72.00
8
+ 5 44 0 42.31 33.33 11 26 33 17 38 28 73.68
9
+ ============================================================================
10
+ 62.83 57.72 71 113 123 0 161 129 80.12
11
+ === Summary ===
12
+
13
+ -- All --
14
+ Number of sentence = 5
15
+ Number of Error sentence = 0
16
+ Number of Skip sentence = 0
17
+ Number of Valid sentence = 5
18
+ Bracketing Recall = 62.83
19
+ Bracketing Precision = 57.72
20
+ Bracketing FMeasure = 60.17
21
+ Complete match = 0.00
22
+ Average crossing = 7.20
23
+ No crossing = 0.00
24
+ 2 or less crossing = 20.00
25
+ Tagging accuracy = 80.12
26
+
27
+ -- len<=40 --
28
+ Number of sentence = 3
29
+ Number of Error sentence = 0
30
+ Number of Skip sentence = 0
31
+ Number of Valid sentence = 3
32
+ Bracketing Recall = 61.54
33
+ Bracketing Precision = 57.14
34
+ Bracketing FMeasure = 59.26
35
+ Complete match = 0.00
36
+ Average crossing = 5.00
37
+ No crossing = 0.00
38
+ 2 or less crossing = 33.33
39
+ Tagging accuracy = 77.22
parsing/EVALB/bug/bug.rsl-old ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Sent. Matched Bracket Cross Correct Tag
2
+ ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
3
+ ============================================================================
4
+ 1 : Length unmatch (33|35)
5
+ 1 37 1 0.00 0.00 0 0 0 0 0 0 0.00
6
+ 2 : Length unmatch (19|21)
7
+ 2 21 1 0.00 0.00 0 0 0 0 0 0 0.00
8
+ 3 : Length unmatch (44|45)
9
+ 3 47 1 0.00 0.00 0 0 0 0 0 0 0.00
10
+ 4 : Length unmatch (24|26)
11
+ 4 26 1 0.00 0.00 0 0 0 0 0 0 0.00
12
+ 5 : Length unmatch (38|39)
13
+ 5 44 1 0.00 0.00 0 0 0 0 0 0 0.00
14
+ ============================================================================
15
+ 0 0 0.00
16
+
17
+ === Summary ===
18
+
19
+ -- All --
20
+ Number of sentence = 5
21
+ Number of Error sentence = 5
22
+ Number of Skip sentence = 0
23
+ Number of Valid sentence = 0
24
+ Bracketing Recall = 0.00
25
+ Bracketing Precision = 0.00
26
+ Bracketing FMeasure = nan
27
+ Complete match = 0.00
28
+ Average crossing = 0.00
29
+ No crossing = 0.00
30
+ 2 or less crossing = 0.00
31
+ Tagging accuracy = 0.00
32
+
33
+ -- len<=40 --
34
+ Number of sentence = 3
35
+ Number of Error sentence = 3
36
+ Number of Skip sentence = 0
37
+ Number of Valid sentence = 0
38
+ Bracketing Recall = 0.00
39
+ Bracketing Precision = 0.00
40
+ Bracketing FMeasure = nan
41
+ Complete match = 0.00
42
+ Average crossing = 0.00
43
+ No crossing = 0.00
44
+ 2 or less crossing = 0.00
45
+ Tagging accuracy = 0.00
parsing/EVALB/bug/bug.tst ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ (S1 (S (NP (DT The) (JJ Thy-1) (NN gene) (NN promoter)) (VP (VP (VBZ resembles) (NP (NP (DT a) (ADJP (CD ") (NN housekeeping)) (NN ") (NN promoter)) (SBAR (WHPP (IN in) (WHNP (WDT that))) (S (NP (PRP it)) (VP (VBZ is) (VP (VBN located) (PP (IN within) (NP (DT a) (JJ methylation-free) (NN island))))))))) (, ,) (VP (VBZ lacks) (NP (DT a) (JJ canonical) (NNP TATA) (NN box))) (, ,) (CC and) (VP (VBZ displays) (NP (NP (NN heterogeneity)) (PP (IN in) (NP (NP (DT the) (JJ 5'-end) (NNS termini)) (PP (IN of) (NP (DT the) (NN mRNA)))))))) (. .)))
2
+ (S1 (S (NP (NP (DT The) (JJ latter) (CD ") (JJ nuclear) (NN factor)) (PP (IN for) (NP (VBN activated) (NN T) (NNS cells)))) (VP (VBZ ") (ADJP (JJ likely) (S (VP (VBZ contributes) (PP (TO to) (NP (NP (DT the) (NN tissue) (NN specificity)) (PP (IN of) (NP (JJ IL-2) (NN gene) (NN expression))))))))) (. .)))
3
+ (S1 (S (ADVP (RB Thus)) (, ,) (NP (PRP we)) (VP (VBD postulated) (SBAR (SBAR (IN that) (S (NP (NP (DT the) (JJ circadian) (NN modification)) (PP (IN of) (NP (NNP GR)))) (VP (VBD was) (ADJP (JJ independent) (PP (IN of) (NP (DT the) (JJ diurnal) (NNS fluctuations)))) (PP (IN in) (NP (NP (NN plasma) (JJ cortisol) (NN level)) (CC or) (NP (NP (DT the) (JJ circadian) (NNS variations)) (PP (IN in) (NP (JJ environmental) (NN lighting))))))))) (CC and) (SBAR (IN that) (S (NP (DT the) (NN rhythmicity)) (VP (MD might) (VP (VB be) (VP (VBN regulated) (PP (IN by) (NP (DT the) ('' ') (NP (JJ circadian) (NN pacemaker) (POS ')) (VP (VBN located) (PP (IN in) (NP (DT the) (JJ human) (JJ basal) (NN brain))))))))))))) (. .)))
4
+ (S1 (S (NP (JJ Such) (NN transcription) (NNS factors)) (VP (VBP play) (NP (NP (DT a) (JJ key) (NN role)) (PP (IN in) (NP (NP (DT the) (NN development)) (PP (IN of) (NP (NP (DT the) (JJ mature) (JJ T-cell) (NN phenotype)) (PP (IN by) (NP (NP (NN functioning) (RB as) (POS ')) (NN master) (NNS regulators))))) (PP (IN of) (NP (JJ T-cell) (NN differentiation) (POS '))))))) (. .)))
5
+ (S1 (S (NP (NP (DT The) (NN conversion)) (PP (IN of) (NP (DT the)))) (VP (VBD TCEd) (PP (TO to) (NP (NP (DT a) ('' ') (JJ perfect) ('' ') (NN NF-kB)) (SBAR (S (NP (JJ binding) (NN site)) (VP (VBZ leads) (PP (TO to) (NP (NP (NP (DT a) (ADJP (RBR tighter) (JJ binding)) (PP (IN of) (NP (NP (NNS NF-kB)) (PP (PP (TO to) (NP (JJ TCEd) (NN DNA))) (CC and) (PP (, ,) (PP (IN as) (NP (DT a) (JJ functional) (NN consequence))) (, ,) (TO to) (NP (NP (DT the) (NN activity)) (PP (IN of) (NP (DT the)))))))) (POS ')) (JJ converted) ('' ') (JJ TCEd) (NNS motifs)) (PP (IN in) (NP (NNP HeLa) (NNS cells))))))))))) (. .)))
parsing/EVALB/evalb ADDED
Binary file (59.6 kB). View file
 
parsing/EVALB/evalb.c ADDED
@@ -0,0 +1,1537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*****************************************************************/
2
+ /* evalb [-p param_file] [-dh] [-e n] gold-file test-file */
3
+ /* */
4
+ /* Evaluate bracketing in test-file against gold-file. */
5
+ /* Return recall, precision, tagging accuracy. */
6
+ /* */
7
+ /* <option> */
8
+ /* -p param_file parameter file */
9
+ /* -d debug mode */
10
+ /* -e n number of error to kill (default=10) */
11
+ /* -h help */
12
+ /* */
13
+ /* Satoshi Sekine (NYU) */
14
+ /* Mike Collins (UPenn) */
15
+ /* */
16
+ /* October.1997 */
17
+ /* */
18
+ /* Please refer README for the update information */
19
+ /*****************************************************************/
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h> //### added for exit, atoi decls
23
+ #include <ctype.h>
24
+ #include <string.h>
25
+
26
+
27
+ /* Internal Data format -------------------------------------------*/
28
+ /* */
29
+ /* (S (NP (NNX this)) (VP (VBX is) (NP (DT a) (NNX pen))) (SYM .)) */
30
+ /* */
31
+ /* wn=5 */
32
+ /* word label */
33
+ /* terminal[0] = this NNX */
34
+ /* terminal[1] = is VBX */
35
+ /* terminal[2] = a DT */
36
+ /* terminal[3] = pen NNX */
37
+ /* terminal[4] = . SYM */
38
+ /* */
39
+ /* bn=4 */
40
+ /* start end label */
41
+ /* bracket[0] = 0 5 S */
42
+ /* bracket[1] = 0 0 NP */
43
+ /* bracket[2] = 1 4 VP */
44
+ /* bracket[3] = 2 4 NP */
45
+ /* */
46
+ /* matched bracketing */
47
+ /* Recall = --------------------------- */
48
+ /* # of bracket in ref-data */
49
+ /* */
50
+ /* matched bracketing */
51
+ /* Precision = --------------------------- */
52
+ /* # of bracket in test-data */
53
+ /* */
54
+ /*-----------------------------------------------------------------*/
55
+
56
+ /******************/
57
+ /* constant macro */
58
+ /******************/
59
+
60
+ #define MAX_SENT_LEN 5000
61
+ #define MAX_WORD_IN_SENT 200
62
+ #define MAX_BRACKET_IN_SENT 200
63
+ #define MAX_WORD_LEN 100
64
+ #define MAX_LABEL_LEN 30
65
+ #define MAX_QUOTE_TERM 20
66
+
67
+ #define MAX_DELETE_LABEL 100
68
+ #define MAX_EQ_LABEL 100
69
+ #define MAX_EQ_WORD 100
70
+
71
+ #define MAX_LINE_LEN 500
72
+
73
+ #define DEFAULT_MAX_ERROR 10
74
+ #define DEFAULT_CUT_LEN 40
75
+
76
+ /*************/
77
+ /* structure */
78
+ /*************/
79
+
80
+ typedef struct ss_terminal {
81
+ char word[MAX_WORD_LEN];
82
+ char label[MAX_LABEL_LEN];
83
+ int result; /* 0:unmatch, 1:match, 9:undef */
84
+ } s_terminal;
85
+
86
+ typedef struct ss_term_ind {
87
+ s_terminal term;
88
+ int index;
89
+ int bracket;
90
+ int endslen;
91
+ int ends[MAX_BRACKET_IN_SENT];
92
+ } s_term_ind;
93
+
94
+ typedef struct ss_bracket {
95
+ int start;
96
+ int end;
97
+ unsigned int buf_start;
98
+ unsigned int buf_end;
99
+ char label[MAX_LABEL_LEN];
100
+ int result; /* 0: unmatch, 1:match, 5:delete 9:undef */
101
+ } s_bracket;
102
+
103
+
104
/* A pair of strings (labels or words) declared equivalent */
/* in the parameter file.                                  */
typedef struct ss_equiv {
    char *s1;   /* first member of the pair  */
    char *s2;   /* second member of the pair */
} s_equiv;
108
+
109
+
110
+ /****************************/
111
+ /* global variables */
112
+ /* gold-data: suffix = 1 */
113
+ /* test-data: suffix = 2 */
114
+ /****************************/
115
+
116
+ /*---------------*/
117
+ /* Sentence data */
118
+ /*---------------*/
119
+ int wn1, wn2; /* number of words in sentence */
120
+ int r_wn1; /* number of words in sentence */
121
+ /* which only ignores labels in */
122
+ /* DELETE_LABEL_FOR_LENGTH */
123
+
124
+ s_terminal terminal1[MAX_WORD_IN_SENT]; /* terminal information */
125
+ s_terminal terminal2[MAX_WORD_IN_SENT];
126
+
127
+ s_term_ind quotterm1[MAX_QUOTE_TERM]; /* special terminals ("'","POS") */
128
+ s_term_ind quotterm2[MAX_QUOTE_TERM];
129
+
130
+ int bn1, bn2; /* number of brackets */
131
+
132
+ int r_bn1, r_bn2; /* number of brackets */
133
+ /* after deletion */
134
+
135
+ s_bracket bracket1[MAX_BRACKET_IN_SENT]; /* bracket information */
136
+ s_bracket bracket2[MAX_BRACKET_IN_SENT];
137
+
138
+
139
+ /*------------*/
140
+ /* Total data */
141
+ /*------------*/
142
+ int TOTAL_bn1, TOTAL_bn2, TOTAL_match; /* total number of brackets */
143
+ int TOTAL_sent; /* No. of sentence */
144
+ int TOTAL_error_sent; /* No. of error sentence */
145
+ int TOTAL_skip_sent; /* No. of skip sentence */
146
+ int TOTAL_comp_sent; /* No. of complete match sent */
147
+ int TOTAL_word; /* total number of word */
148
+ int TOTAL_crossing; /* total crossing */
149
+ int TOTAL_no_crossing; /* no crossing sentence */
150
+ int TOTAL_2L_crossing; /* 2 or less crossing sentence */
151
+ int TOTAL_correct_tag; /* total correct tagging */
152
+
153
+ int TOT_cut_len = DEFAULT_CUT_LEN; /* Cut-off length in statistics */
154
+
155
+ /* data for sentences with len <= CUT_LEN */
156
+ /* Historically it was 40. */
157
+ int TOT40_bn1, TOT40_bn2, TOT40_match; /* total number of brackets */
158
+ int TOT40_sent; /* No. of sentence */
159
+ int TOT40_error_sent; /* No. of error sentence */
160
+ int TOT40_skip_sent; /* No. of skip sentence */
161
+ int TOT40_comp_sent; /* No. of complete match sent */
162
+ int TOT40_word; /* total number of word */
163
+ int TOT40_crossing; /* total crossing */
164
+ int TOT40_no_crossing; /* no crossing sentence */
165
+ int TOT40_2L_crossing; /* 2 or less crossing sentence */
166
+ int TOT40_correct_tag; /* total correct tagging */
167
+
168
+ /*------------*/
169
+ /* miscellaneous */
170
+ /*------------*/
171
+ int Line; /* line number */
172
+ int Error_count = 0; /* Error count */
173
+ int Status; /* Result status for each sent */
174
+ /* 0: OK, 1: error, 2: skip */
175
+
176
+ /*-------------------*/
177
+ /* stack manipulation */
178
+ /*-------------------*/
179
+ int stack_top;
180
+ int stack[MAX_BRACKET_IN_SENT];
181
+
182
+ /************************************************************/
183
+ /* User parameters which can be specified in parameter file */
184
+ /************************************************************/
185
+
186
+ /*------------------------------------------*/
187
+ /* Debug mode */
188
+ /* print out data for individual sentence */
189
+ /*------------------------------------------*/
190
+ int DEBUG=0;
191
+
192
+ /*------------------------------------------*/
193
+ /* MAX error */
194
+ /* Number of error to stop the process. */
195
+ /* This is useful if there could be */
196
+ /* tokenization error. */
197
+ /* The process will stop when this number*/
198
+ /* of errors are accumulated. */
199
+ /*------------------------------------------*/
200
+ int Max_error = DEFAULT_MAX_ERROR;
201
+
202
+ /*------------------------------------------*/
203
+ /* Cut-off length for statistics */
204
+ /* int TOT_cut_len = DEFAULT_CUT_LEN; */
205
+ /* (Defined above) */
206
+ /*------------------------------------------*/
207
+
208
+
209
+ /*------------------------------------------*/
210
+ /* unlabeled or labeled bracketing */
211
+ /* 0: unlabeled bracketing */
212
+ /* 1: labeled bracketing */
213
+ /*------------------------------------------*/
214
+ int F_label = 1;
215
+
216
+ /*------------------------------------------*/
217
+ /* Delete labels */
218
+ /* list of labels to be ignored. */
219
+ /* If it is a pre-terminal label, delete */
220
+ /* the word along with the brackets. */
221
+ /* If it is a non-terminal label, just */
222
+ /* delete the brackets (don't delete */
223
+ /* children). */
224
+ /*------------------------------------------*/
225
+ char *Delete_label[MAX_DELETE_LABEL];
226
+ int Delete_label_n = 0;
227
+
228
+ /*------------------------------------------*/
229
+ /* Delete labels for length calculation */
230
+ /* list of labels to be ignored for */
231
+ /* length calculation purpose */
232
+ /*------------------------------------------*/
233
+ char *Delete_label_for_length[MAX_DELETE_LABEL];
234
+ int Delete_label_for_length_n = 0;
235
+
236
+ /*------------------------------------------*/
237
+ /* Labels to be considered for misquote */
238
+ /* (could be possessive or quote) */
239
+ /*------------------------------------------*/
240
+ char *Quote_term[MAX_QUOTE_TERM];
241
+ int Quote_term_n = 0;
242
+
243
+ /*------------------------------------------*/
244
+ /* Equivalent labels, words */
245
+ /* the pairs are considered equivalent */
246
+ /* This is non-directional. */
247
+ /*------------------------------------------*/
248
+ s_equiv EQ_label[MAX_EQ_LABEL];
249
+ int EQ_label_n = 0;
250
+
251
+ s_equiv EQ_word[MAX_EQ_WORD];
252
+ int EQ_word_n = 0;
253
+
254
+
255
+
256
+ /************************/
257
+ /* Function return-type */
258
+ /************************/
259
+ int main();
260
+ void init_global();
261
+ void print_head();
262
+ void init();
263
+ void read_parameter_file();
264
+ void set_param();
265
+ int narg();
266
+ int read_line();
267
+
268
+ void pushb();
269
+ int popb();
270
+ int stackempty();
271
+
272
+ void calc_result(unsigned char *buf1,unsigned char *buf);
273
+ void fix_quote();
274
+ void reinsert_term();
275
+ void massage_data();
276
+ void modify_label();
277
+ void individual_result();
278
+ void print_total();
279
+ void dsp_info();
280
+ int is_terminator();
281
+ int is_deletelabel();
282
+ int is_deletelabel_for_length();
283
+ int is_quote_term();
284
+ int word_comp();
285
+ int label_comp();
286
+
287
+ void Error();
288
+ void Fatal();
289
+ void Usage();
290
+
291
+ /* ### provided by std headers
292
+ int fprintf();
293
+ int printf();
294
+ int atoi();
295
+ int fclose();
296
+ int sscanf();
297
+ */
298
+
299
+ /***********/
300
+ /* program */
301
+ /***********/
302
+ #define ARG_CHECK(st) if(!(*++(*argv) || (--argc && *++argv))){ \
303
+ fprintf(stderr,"Missing argument: %s\n",st); \
304
+ }
305
+
306
+ int
307
+ main(argc,argv)
308
+ int argc;
309
+ char *argv[];
310
+ {
311
+ char *filename1, *filename2;
312
+ FILE *fd1, *fd2;
313
+ unsigned char buff[5000];
314
+ unsigned char buff1[5000];
315
+
316
+ filename1=NULL;
317
+ filename2=NULL;
318
+
319
+ for(argc--,argv++;argc>0;argc--,argv++){
320
+ if(**argv == '-'){
321
+ while(*++(*argv)){
322
+ switch(**argv){
323
+
324
+ case 'h': /* help */
325
+ Usage();
326
+ exit(1);
327
+
328
+ case 'd': /* debug mode */
329
+ DEBUG = 1;
330
+ goto nextarg;
331
+
332
+ case 'D': /* debug mode */
333
+ DEBUG = 2;
334
+ goto nextarg;
335
+
336
+ case 'c': /* cut-off length */
337
+ ARG_CHECK("cut-off length for statistices");
338
+ TOT_cut_len = atoi(*argv);
339
+ goto nextarg;
340
+
341
+ case 'e': /* max error */
342
+ ARG_CHECK("number of error to kill");
343
+ Max_error = atoi(*argv);
344
+ goto nextarg;
345
+
346
+ case 'p': /* parameter file */
347
+ ARG_CHECK("parameter file");
348
+ read_parameter_file(*argv);
349
+ goto nextarg;
350
+
351
+ default:
352
+ Usage();
353
+ exit(0);
354
+ }
355
+ }
356
+ } else {
357
+ if(filename1==NULL){
358
+ filename1 = *argv;
359
+ }else if(filename2==NULL){
360
+ filename2 = *argv;
361
+ }
362
+ }
363
+ nextarg: continue;
364
+ }
365
+
366
+ init_global();
367
+
368
+
369
+ if((fd1 = fopen(filename1,"r"))==NULL){
370
+ Fatal("Can't open gold file (%s)\n",filename1);
371
+ }
372
+ if((fd2 = fopen(filename2,"r"))==NULL){
373
+ Fatal("Can't open test file (%s)\n",filename2);
374
+ }
375
+
376
+ print_head();
377
+
378
+ for(Line=1;fgets(buff,5000,fd1)!=NULL;Line++){
379
+
380
+ init();
381
+
382
+ /* READ 1 */
383
+ r_wn1 = read_line(buff,terminal1,quotterm1,&wn1,bracket1,&bn1);
384
+
385
+ strcpy(buff1,buff);
386
+
387
+ /* READ 2 */
388
+ if(fgets(buff,5000,fd2)==NULL){
389
+ Error("Number of lines unmatch (too many lines in gold file)\n");
390
+ break;
391
+ }
392
+
393
+ read_line(buff,terminal2,quotterm2,&wn2,bracket2,&bn2);
394
+
395
+ /* Calculate result and print it */
396
+ calc_result(buff1,buff);
397
+
398
+ if(DEBUG>=1){
399
+ dsp_info();
400
+ }
401
+ }
402
+
403
+ if(fgets(buff,5000,fd2)!=NULL){
404
+ Error("Number of lines unmatch (too many lines in test file)\n");
405
+ }
406
+
407
+ print_total();
408
+
409
+ return (0);
410
+ }
411
+
412
+
413
+ /*-----------------------------*/
414
+ /* initialize global variables */
415
+ /*-----------------------------*/
416
+ void
417
+ init_global()
418
+ {
419
+ TOTAL_bn1 = TOTAL_bn2 = TOTAL_match = 0;
420
+ TOTAL_sent = TOTAL_error_sent = TOTAL_skip_sent = TOTAL_comp_sent = 0;
421
+ TOTAL_word = TOTAL_correct_tag = 0;
422
+ TOTAL_crossing = 0;
423
+ TOTAL_no_crossing = TOTAL_2L_crossing = 0;
424
+
425
+ TOT40_bn1 = TOT40_bn2 = TOT40_match = 0;
426
+ TOT40_sent = TOT40_error_sent = TOT40_skip_sent = TOT40_comp_sent = 0;
427
+ TOT40_word = TOT40_correct_tag = 0;
428
+ TOT40_crossing = 0;
429
+ TOT40_no_crossing = TOT40_2L_crossing = 0;
430
+
431
+ }
432
+
433
+
434
/*----------------------------------------------*/
/* Write the three-line table header that       */
/* precedes the per-sentence results on stdout. */
/*----------------------------------------------*/
void
print_head(void)
{
    static const char *const header[] = {
        " Sent. Matched Bracket Cross Correct Tag\n",
        " ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy\n",
        "============================================================================\n"
    };
    size_t row;

    for(row=0; row<sizeof(header)/sizeof(header[0]); row++){
        fputs(header[row],stdout);
    }
}
444
+
445
+
446
+ /*-----------------------------------------------*/
447
+ /* initialization at each individual computation */
448
+ /*-----------------------------------------------*/
449
+ void
450
+ init()
451
+ {
452
+ int i;
453
+
454
+ wn1 = 0;
455
+ wn2 = 0;
456
+ bn1 = 0;
457
+ bn2 = 0;
458
+ r_bn1 = 0;
459
+ r_bn2 = 0;
460
+
461
+ for(i=0;i<MAX_WORD_IN_SENT;i++){
462
+ terminal1[i].word[0] = '\0';
463
+ terminal1[i].label[0] = '\0';
464
+ terminal1[i].result = 9;
465
+ terminal2[i].word[0] = '\0';
466
+ terminal2[i].label[0] = '\0';
467
+ terminal2[i].result = 9;
468
+ }
469
+
470
+ for(i=0;i<MAX_QUOTE_TERM;i++){
471
+ quotterm1[i].term.word[0] = '\0';
472
+ quotterm1[i].term.label[0] = '\0';
473
+ quotterm1[i].term.result = 9;
474
+ quotterm1[i].index = -1;
475
+ quotterm1[i].bracket = -1;
476
+ quotterm2[i].term.word[0] = '\0';
477
+ quotterm2[i].term.label[0] = '\0';
478
+ quotterm2[i].term.result = 9;
479
+ quotterm2[i].index = -1;
480
+ quotterm2[i].bracket = -1;
481
+ }
482
+
483
+ for(i=0;i<MAX_BRACKET_IN_SENT;i++){
484
+ bracket1[i].start = -1;
485
+ bracket1[i].end = -1;
486
+ bracket1[i].label[0] = '\0';
487
+ bracket1[i].result = 9;
488
+ bracket2[i].start = -1;
489
+ bracket2[i].end = -1;
490
+ bracket2[i].label[0] = '\0';
491
+ bracket2[i].result = 9;
492
+ }
493
+
494
+ Status = 0;
495
+ }
496
+
497
+ /*----------------*/
498
+ /* parameter file */
499
+ /*----------------*/
500
+ void
501
+ read_parameter_file(filename)
502
+ char *filename;
503
+ {
504
+ char buff[MAX_LINE_LEN];
505
+ FILE *fd;
506
+ int line;
507
+ int i;
508
+
509
+ if((fd=fopen(filename,"r"))==NULL){
510
+ Fatal("Can't open parameter file (%s)\n",filename);
511
+ }
512
+
513
+ for(line=1;fgets(buff,MAX_LINE_LEN,fd)!=NULL;line++){
514
+
515
+ /* clean up the tail and find unvalid line */
516
+ /*-----------------------------------------*/
517
+ for(i=strlen(buff)-1;i>0 && (isspace(buff[i]) || buff[i]=='\n');i--){
518
+ buff[i]='\0';
519
+ }
520
+ if(buff[0]=='#' || /* comment-line */
521
+ strlen(buff)<3){ /* too short, just ignore */
522
+ continue;
523
+ }
524
+
525
+ /* place the parameter and value */
526
+ /*-------------------------------*/
527
+ for(i=0;!isspace(buff[i]);i++);
528
+ for(;isspace(buff[i]) && buff[i]!='\0';i++);
529
+ if(buff[i]=='\0'){
530
+ fprintf(stderr,"Empty value in parameter file (%d)\n",line);
531
+ }
532
+
533
+ /* set parameter and value */
534
+ /*-------------------------*/
535
+ set_param(buff,buff+i);
536
+ }
537
+
538
+ fclose(fd);
539
+ }
540
+
541
+
542
+ #define STRNCMP(s) (strncmp(param,s,strlen(s))==0 && \
543
+ (param[strlen(s)]=='\0' || isspace(param[strlen(s)])))
544
+
545
+
546
+ void
547
+ set_param(param,value)
548
+ char *param, *value;
549
+ {
550
+ char l1[MAX_LABEL_LEN], l2[MAX_LABEL_LEN];
551
+
552
+ if(STRNCMP("DEBUG")){
553
+
554
+ DEBUG = atoi(value);
555
+
556
+ }else if(STRNCMP("MAX_ERROR")){
557
+
558
+ Max_error = atoi(value);
559
+
560
+ }else if(STRNCMP("CUTOFF_LEN")){
561
+
562
+ TOT_cut_len = atoi(value);
563
+
564
+ }else if(STRNCMP("LABELED")){
565
+
566
+ F_label = atoi(value);
567
+
568
+ }else if(STRNCMP("DELETE_LABEL")){
569
+
570
+ Delete_label[Delete_label_n] = (char *)malloc(strlen(value)+1);
571
+ strcpy(Delete_label[Delete_label_n],value);
572
+ Delete_label_n++;
573
+
574
+ }else if(STRNCMP("DELETE_LABEL_FOR_LENGTH")){
575
+
576
+ Delete_label_for_length[Delete_label_for_length_n] = (char *)malloc(strlen(value)+1);
577
+ strcpy(Delete_label_for_length[Delete_label_for_length_n],value);
578
+ Delete_label_for_length_n++;
579
+
580
+ }else if(STRNCMP("QUOTE_LABEL")){
581
+
582
+ Quote_term[Quote_term_n] = (char *)malloc(strlen(value)+1);
583
+ strcpy(Quote_term[Quote_term_n],value);
584
+ Quote_term_n++;
585
+
586
+ }else if(STRNCMP("EQ_LABEL")){
587
+
588
+ if(narg(value)!=2){
589
+ fprintf(stderr,"EQ_LABEL requires two values\n");
590
+ return;
591
+ }
592
+ sscanf(value,"%s %s",l1,l2);
593
+ EQ_label[EQ_label_n].s1 = (char *)malloc(strlen(l1)+1);
594
+ strcpy(EQ_label[EQ_label_n].s1,l1);
595
+ EQ_label[EQ_label_n].s2 = (char *)malloc(strlen(l2)+1);
596
+ strcpy(EQ_label[EQ_label_n].s2,l2);
597
+ EQ_label_n++;
598
+
599
+ }else if(STRNCMP("EQ_WORD")){
600
+
601
+ if(narg(value)!=2){
602
+ fprintf(stderr,"EQ_WORD requires two values\n");
603
+ return;
604
+ }
605
+ sscanf(value,"%s %s",l1,l2);
606
+ EQ_word[EQ_word_n].s1 = (char *)malloc(strlen(l1)+1);
607
+ strcpy(EQ_word[EQ_word_n].s1,l1);
608
+ EQ_word[EQ_word_n].s2 = (char *)malloc(strlen(l2)+1);
609
+ strcpy(EQ_word[EQ_word_n].s2,l2);
610
+ EQ_word_n++;
611
+
612
+ }else{
613
+
614
+ fprintf(stderr,"Unknown keyword (%s) in parameter file\n",param);
615
+
616
+ }
617
+ }
618
+
619
+
620
/*------------------------------------------------------*/
/* Count the whitespace-separated tokens in s.          */
/* Returns 0 for NULL, an empty string or a string that */
/* contains only whitespace.                            */
/*------------------------------------------------------*/
int
narg(char *s)
{
    /* walk the bytes as unsigned char: passing a negative plain char */
    /* (any non-ASCII byte) to isspace() is undefined behaviour       */
    const unsigned char *p = (const unsigned char *)s;
    int n = 0;

    if(p==NULL){
        return(0);
    }

    while(*p!='\0'){
        /* skip the gap before the next token */
        while(isspace(*p)){
            p++;
        }
        if(*p=='\0'){
            break;
        }
        n++;
        /* skip the token itself */
        while(*p!='\0' && !isspace(*p)){
            p++;
        }
    }

    return(n);
}
641
+
642
+ /*-----------------------------*/
643
+ /* Read line and gather data. */
644
+ /* Return langth of sentence. */
645
+ /*-----------------------------*/
646
+ int
647
+ read_line(buff, terminal, quotterm, wn, bracket, bn)
648
+ char *buff;
649
+ s_terminal terminal[];
650
+ s_term_ind quotterm[];
651
+ int *wn;
652
+ s_bracket bracket[];
653
+ int *bn;
654
+ {
655
+ char *p, *q, label[MAX_LABEL_LEN], word[MAX_WORD_LEN];
656
+ int qt; /* quote term counter */
657
+ int wid, bid; /* word ID, bracket ID */
658
+ int n; /* temporary remembering the position */
659
+ int b; /* temporary remembering bid */
660
+ int i;
661
+ int len; /* length of the sentence */
662
+
663
+ len = 0;
664
+ stack_top=0;
665
+
666
+ for(p=buff,qt=0,wid=0,bid=0;*p!='\0';){
667
+
668
+ if(isspace(*p)){
669
+ p++;
670
+ continue;
671
+
672
+ /* open bracket */
673
+ /*--------------*/
674
+ }else if(*p=='('){
675
+
676
+ n=wid;
677
+ for(p++,i=0;!is_terminator(*p);p++,i++){
678
+ label[i]=*p;
679
+ }
680
+ label[i]='\0';
681
+
682
+ /* Find terminals */
683
+ q = p;
684
+ if(isspace(*q)){
685
+ for(q++;isspace(*q);q++);
686
+ for(i=0;!is_terminator(*q);q++,i++){
687
+ word[i]=*q;
688
+ }
689
+ word[i]='\0';
690
+
691
+ /* compute length */
692
+ if(*q==')' && !is_deletelabel_for_length(label)==1){
693
+ len++;
694
+ }
695
+ if (DEBUG>1)
696
+ printf("label=%s, word=%s, wid=%d\n",label,word,wid);
697
+ /* quote terminal */
698
+ if(*q==')' && is_quote_term(label,word)==1){
699
+ strcpy(quotterm[qt].term.word,word);
700
+ strcpy(quotterm[qt].term.label,label);
701
+ quotterm[qt].index = wid;
702
+ quotterm[qt].bracket = bid;
703
+ quotterm[qt].endslen = stack_top;
704
+ //quotterm[qt].ends = (int*)malloc(stack_top*sizeof(int));
705
+ memcpy(quotterm[qt].ends,stack,stack_top*sizeof(int));
706
+ qt++;
707
+ }
708
+
709
+ /* delete terminal */
710
+ if(*q==')' && is_deletelabel(label)==1){
711
+ p = q+1;
712
+ continue;
713
+
714
+ /* valid terminal */
715
+ }else if(*q==')'){
716
+ strcpy(terminal[wid].word,word);
717
+ strcpy(terminal[wid].label,label);
718
+ wid++;
719
+ p = q+1;
720
+ continue;
721
+
722
+ /* error */
723
+ }else if(*q!='('){
724
+ Error("More than two elements in a bracket\n");
725
+ }
726
+ }
727
+
728
+ /* otherwise non-terminal label */
729
+ bracket[bid].start = wid;
730
+ bracket[bid].buf_start = p-buff;
731
+ strcpy(bracket[bid].label,label);
732
+ pushb(bid);
733
+ bid++;
734
+
735
+ /* close bracket */
736
+ /*---------------*/
737
+ }else if(*p==')'){
738
+
739
+ b = popb();
740
+ bracket[b].end = wid;
741
+ bracket[b].buf_end = p-buff;
742
+ p++;
743
+
744
+ /* error */
745
+ /*-------*/
746
+ }else{
747
+
748
+ Error("Reading sentence\n");
749
+ }
750
+ }
751
+
752
+ if(!stackempty()){
753
+ Error("Bracketing is unbalanced (too many open bracket)\n");
754
+ }
755
+
756
+ *wn = wid;
757
+ *bn = bid;
758
+
759
+ return(len);
760
+ }
761
+
762
+
763
+ /*----------------------*/
764
+ /* stack operation */
765
+ /* for bracketing pairs */
766
+ /*----------------------*/
767
+ void
768
+ pushb(item)
769
+ int item;
770
+ {
771
+ stack[stack_top++]=item;
772
+ }
773
+
774
+ int
775
+ popb()
776
+ {
777
+ int item;
778
+
779
+ item = stack[stack_top-1];
780
+
781
+ if(stack_top-- < 0){
782
+ Error("Bracketing unbalance (too many close bracket)\n");
783
+ }
784
+ return(item);
785
+ }
786
+
787
+ int
788
+ stackempty()
789
+ {
790
+ if(stack_top==0){
791
+ return(1);
792
+ }else{
793
+ return(0);
794
+ }
795
+ }
796
+
797
+
798
+ /*------------------*/
799
+ /* calculate result */
800
+ /*------------------*/
801
+ void
802
+ calc_result(unsigned char *buf1,unsigned char *buf)
803
+ {
804
+ int i, j, l;
805
+ int match, crossing, correct_tag;
806
+
807
+ int last_i = -1;
808
+
809
+ char my_buf[1000];
810
+ int match_found = 0;
811
+
812
+ char match_j[200];
813
+ for (j = 0; j < bn2; ++j) {
814
+ match_j[j] = 0;
815
+ }
816
+
817
+ /* ML */
818
+ if (DEBUG>1)
819
+ printf("\n");
820
+
821
+
822
+ /* Find skip and error */
823
+ /*---------------------*/
824
+ if(wn2==0){
825
+ Status = 2;
826
+ individual_result(0,0,0,0,0,0);
827
+ return;
828
+ }
829
+
830
+ if(wn1 != wn2){
831
+ //if (DEBUG>1)
832
+ //Error("Length unmatch (%d|%d)\n",wn1,wn2);
833
+ fix_quote();
834
+ if(wn1 != wn2){
835
+ Error("Length unmatch (%d|%d)\n",wn1,wn2);
836
+ individual_result(0,0,0,0,0,0);
837
+ return;
838
+ }
839
+ }
840
+
841
+ for(i=0;i<wn1;i++){
842
+ if(word_comp(terminal1[i].word,terminal2[i].word)==0){
843
+ Error("Words unmatch (%s|%s)\n",terminal1[i].word,
844
+ terminal2[i].word);
845
+ individual_result(0,0,0,0,0,0);
846
+ return;
847
+ }
848
+ }
849
+
850
+ /* massage the data */
851
+ /*------------------*/
852
+ massage_data();
853
+
854
+ /* matching brackets */
855
+ /*-------------------*/
856
+ match = 0;
857
+ for(i=0;i<bn1;i++){
858
+ for(j=0;j<bn2;j++){
859
+
860
+ if (DEBUG>1)
861
+ printf("1.res=%d, 2.res=%d, 1.start=%d, 2.start=%d, 1.end=%d, 2.end=%d\n",bracket1[i].result,bracket2[j].result,bracket1[i].start,bracket2[j].start,bracket1[i].end,bracket2[j].end);
862
+
863
+ // does bracket match?
864
+ if(bracket1[i].result != 5 &&
865
+ bracket2[j].result == 0 &&
866
+ bracket1[i].start == bracket2[j].start && bracket1[i].end == bracket2[j].end) {
867
+
868
+ // (1) do we not care about the label or (2) does the label match?
869
+ if (F_label==0 || label_comp(bracket1[i].label,bracket2[j].label)==1) {
870
+ bracket1[i].result = bracket2[j].result = 1;
871
+ match++;
872
+ match_found = 1;
873
+ break;
874
+ } else {
875
+ if (DEBUG>1) {
876
+ printf(" LABEL[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
877
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
878
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
879
+ my_buf[l] = '\0';
880
+ printf("%s\n",my_buf);
881
+ }
882
+ match_found = 1;
883
+ match_j[j] = 1;
884
+ }
885
+ }
886
+ }
887
+
888
+ if (!match_found && bracket1[i].result != 5 && DEBUG>1) {
889
+ /* ### ML 09/28/03: gold bracket with no corresponding test bracket */
890
+ printf(" BRACKET[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
891
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
892
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
893
+ my_buf[l] = '\0';
894
+ printf("%s\n",my_buf);
895
+ }
896
+ match_found = 0;
897
+ }
898
+
899
+ for(j=0;j<bn2;j++){
900
+ if (bracket2[j].result==0 && !match_j[j] && DEBUG>1) {
901
+ /* test bracket with no corresponding gold bracket */
902
+ printf(" EXTRA[%d-%d]: ",bracket2[j].start,bracket2[j].end-1);
903
+ l = bracket2[j].buf_end-bracket2[j].buf_start;
904
+ strncpy(my_buf,buf+bracket2[j].buf_start,l);
905
+ my_buf[l] = '\0';
906
+ printf("%s\n",my_buf);
907
+ }
908
+ }
909
+
910
+ /* crossing */
911
+ /*----------*/
912
+ crossing = 0;
913
+
914
+ /* crossing is counted based on the brackets */
915
+ /* in test rather than gold file (by Mike) */
916
+ for(j=0;j<bn2;j++){
917
+ for(i=0;i<bn1;i++){
918
+ if(bracket1[i].result != 5 &&
919
+ bracket2[j].result != 5 &&
920
+ ((bracket1[i].start < bracket2[j].start &&
921
+ bracket1[i].end > bracket2[j].start &&
922
+ bracket1[i].end < bracket2[j].end) ||
923
+ (bracket1[i].start > bracket2[j].start &&
924
+ bracket1[i].start < bracket2[j].end &&
925
+ bracket1[i].end > bracket2[j].end))){
926
+
927
+ /* ### ML 09/01/03: get details on cross-brackettings */
928
+ if (i != last_i) {
929
+ if (DEBUG>1) {
930
+ printf(" CROSSING[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
931
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
932
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
933
+ my_buf[l] = '\0';
934
+ printf("%s\n",my_buf);
935
+
936
+ /* ML
937
+ printf("\n CROSSING at bracket %d:\n",i-1);
938
+ printf(" GOLD (tokens %d-%d): ",bracket1[i].start,bracket1[i].end-1);
939
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
940
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
941
+ my_buf[l] = '\0';
942
+ printf("%s\n",my_buf);
943
+ */
944
+ }
945
+ last_i = i;
946
+ }
947
+
948
+ /* ML
949
+ printf(" TEST (tokens %d-%d): ",bracket2[j].start,bracket2[j].end-1);
950
+ l = bracket2[j].buf_end-bracket2[j].buf_start;
951
+ strncpy(my_buf,buf+bracket2[j].buf_start,l);
952
+ my_buf[l] = '\0';
953
+ printf("%s\n",my_buf);
954
+ */
955
+
956
+ crossing++;
957
+ break;
958
+ }
959
+ }
960
+ }
961
+
962
+ /* Tagging accuracy */
963
+ /*------------------*/
964
+ correct_tag=0;
965
+ for(i=0;i<wn1;i++){
966
+ if(label_comp(terminal1[i].label,terminal2[i].label)==1){
967
+ terminal1[i].result = terminal2[i].result = 1;
968
+ correct_tag++;
969
+ } else {
970
+ terminal1[i].result = terminal2[i].result = 0;
971
+ }
972
+ }
973
+
974
+ individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag);
975
+ }
976
+
977
+ void
978
+ fix_quote()
979
+ {
980
+ int i,j,k;
981
+ if (DEBUG>1) {
982
+ for(i=0;i<MAX_QUOTE_TERM;i++){
983
+ if (quotterm1[i].index!=-1)
984
+ printf("%d: %s - %s\n",quotterm1[i].index,
985
+ quotterm1[i].term.label,
986
+ quotterm1[i].term.word);
987
+ if (quotterm2[i].index!=-1)
988
+ printf("%d: %s - %s\n",quotterm2[i].index,
989
+ quotterm2[i].term.label,
990
+ quotterm2[i].term.word);
991
+ }
992
+ }
993
+ for(i=0;i<MAX_QUOTE_TERM;i++) {
994
+ int ind = quotterm2[i].index;
995
+ if (ind!=-1) {
996
+ for(j=0;j<MAX_QUOTE_TERM;j++){
997
+ if (quotterm1[j].index==ind &&
998
+ strcmp(quotterm1[j].term.label,
999
+ quotterm2[i].term.label)!=0) {
1000
+ if (is_deletelabel(quotterm1[j].term.label) && !is_deletelabel(quotterm2[i].term.label)) {
1001
+ reinsert_term(&quotterm1[j],terminal1,bracket1,&wn1);
1002
+ for (k=j;k<MAX_QUOTE_TERM;k++)
1003
+ if (quotterm1[k].index!=-1)
1004
+ quotterm1[k].index++;
1005
+ } else if (is_deletelabel(quotterm2[i].term.label) && !is_deletelabel(quotterm1[j].term.label)) {
1006
+ reinsert_term(&quotterm2[i],terminal2,bracket2,&wn2);
1007
+ for (k=i;k<MAX_QUOTE_TERM;k++)
1008
+ if (quotterm2[k].index!=-1)
1009
+ quotterm2[k].index++;
1010
+ }
1011
+ }
1012
+ }
1013
+ } else break;
1014
+ }
1015
+ }
1016
+
1017
+ void
1018
+ reinsert_term(quot,terminal,bracket,wn)
1019
+ s_term_ind* quot;
1020
+ s_terminal terminal[];
1021
+ s_bracket bracket[];
1022
+ int* wn;
1023
+ {
1024
+ int ind = quot->index;
1025
+ int bra = quot->bracket;
1026
+ s_terminal* term = &quot->term;
1027
+ int k;
1028
+ memmove(&terminal[ind+1],
1029
+ &terminal[ind],
1030
+ sizeof(s_terminal)*(MAX_WORD_IN_SENT-ind-1));
1031
+ strcpy(terminal[ind].label,term->label);
1032
+ strcpy(terminal[ind].word,term->word);
1033
+ (*wn)++;
1034
+ if (DEBUG>1)
1035
+ printf("bra=%d, ind=%d\n",bra,ind);
1036
+ for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
1037
+ if (bracket[k].start==-1)
1038
+ break;
1039
+ if (DEBUG>1)
1040
+ printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
1041
+ if (k>=bra) {
1042
+ bracket[k].start++;
1043
+ bracket[k].end++;
1044
+ }
1045
+ //if (bracket[k].start<=ind && bracket[k].end>=ind)
1046
+ //bracket[k].end++;
1047
+ }
1048
+ if (DEBUG>1)
1049
+ printf("endslen=%d\n",quot->endslen);
1050
+ for(k=0;k<quot->endslen;k++) {
1051
+ //printf("ends[%d]=%d",k,quot->ends[k]);
1052
+ bracket[quot->ends[k]].end++;
1053
+ }
1054
+ //free(quot->ends);
1055
+ }
1056
+ /*
1057
+ void
1058
+ adjust_end(ind,bra)
1059
+ int ind;
1060
+ int bra;
1061
+ {
1062
+ for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
1063
+ if (bracket[k].start==-1)
1064
+ break;
1065
+ printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
1066
+ if (k>=bra)
1067
+ bracket[k].end++;
1068
+ }
1069
+ }
1070
+ */
1071
+ void
1072
+ massage_data()
1073
+ {
1074
+ int i, j;
1075
+
1076
+ /* for GOLD */
1077
+ /*----------*/
1078
+ for(i=0;i<bn1;i++){
1079
+
1080
+ bracket1[i].result = 0;
1081
+
1082
+ /* Zero element */
1083
+ if(bracket1[i].start == bracket1[i].end){
1084
+ bracket1[i].result = 5;
1085
+ continue;
1086
+ }
1087
+
1088
+ /* Modify label */
1089
+ modify_label(bracket1[i].label);
1090
+
1091
+ /* Delete label */
1092
+ for(j=0;j<Delete_label_n;j++){
1093
+ if(label_comp(bracket1[i].label,Delete_label[j])==1){
1094
+ bracket1[i].result = 5;
1095
+ }
1096
+ }
1097
+ }
1098
+
1099
+ /* for TEST */
1100
+ /*----------*/
1101
+ for(i=0;i<bn2;i++){
1102
+
1103
+ bracket2[i].result = 0;
1104
+
1105
+ /* Zero element */
1106
+ if(bracket2[i].start == bracket2[i].end){
1107
+ bracket2[i].result = 5;
1108
+ continue;
1109
+ }
1110
+
1111
+ /* Modify label */
1112
+ modify_label(bracket2[i].label);
1113
+
1114
+ /* Delete label */
1115
+ for(j=0;j<Delete_label_n;j++){
1116
+ if(label_comp(bracket2[i].label,Delete_label[j])==1){
1117
+ bracket2[i].result = 5;
1118
+ }
1119
+ }
1120
+ }
1121
+
1122
+
1123
+ /* count up real number of brackets (exclude deleted ones) */
1124
+ /*---------------------------------------------------------*/
1125
+ r_bn1 = r_bn2 = 0;
1126
+
1127
+ for(i=0;i<bn1;i++){
1128
+ if(bracket1[i].result != 5){
1129
+ r_bn1++;
1130
+ }
1131
+ }
1132
+
1133
+ for(i=0;i<bn2;i++){
1134
+ if(bracket2[i].result != 5){
1135
+ r_bn2++;
1136
+ }
1137
+ }
1138
+ }
1139
+
1140
+
1141
/*-------------------------------------------------*/
/* Cut a label at its first '-' or '=' (in place). */
/* A label containing neither is left untouched.   */
/*-------------------------------------------------*/
void
modify_label(char *label)
{
    char *cut = strpbrk(label,"-=");

    if(cut!=NULL){
        *cut = '\0';
    }
}
1157
+
1158
+
1159
+ /*-----------------------------------------------*/
1160
+ /* add individual statistics to TOTAL statictics */
1161
+ /*-----------------------------------------------*/
1162
+ void
1163
+ individual_result(wn1,bn1,bn2,match,crossing,correct_tag)
1164
+ int wn1,bn1,bn2,match,crossing,correct_tag;
1165
+ {
1166
+
1167
+ /* Statistics for ALL */
1168
+ /*--------------------*/
1169
+ TOTAL_sent++;
1170
+ if(Status==1){
1171
+ TOTAL_error_sent++;
1172
+ }else if(Status==2){
1173
+ TOTAL_skip_sent++;
1174
+ }else{
1175
+ TOTAL_bn1 += bn1;
1176
+ TOTAL_bn2 += bn2;
1177
+ TOTAL_match += match;
1178
+ if(bn1==bn2 && bn2==match){
1179
+ TOTAL_comp_sent++;
1180
+ }
1181
+ TOTAL_word += wn1;
1182
+ TOTAL_crossing += crossing;
1183
+ if(crossing==0){
1184
+ TOTAL_no_crossing++;
1185
+ }
1186
+ if(crossing <= 2){
1187
+ TOTAL_2L_crossing++;
1188
+ }
1189
+ TOTAL_correct_tag += correct_tag;
1190
+ }
1191
+
1192
+
1193
+ /* Statistics for sent length <= TOT_cut_len */
1194
+ /*-------------------------------------------*/
1195
+ if(r_wn1<=TOT_cut_len){
1196
+ TOT40_sent++;
1197
+ if(Status==1){
1198
+ TOT40_error_sent++;
1199
+ }else if(Status==2){
1200
+ TOT40_skip_sent++;
1201
+ }else{
1202
+ TOT40_bn1 += bn1;
1203
+ TOT40_bn2 += bn2;
1204
+ TOT40_match += match;
1205
+ if(bn1==bn2 && bn2==match){
1206
+ TOT40_comp_sent++;
1207
+ }
1208
+ TOT40_word += wn1;
1209
+ TOT40_crossing += crossing;
1210
+ if(crossing==0){
1211
+ TOT40_no_crossing++;
1212
+ }
1213
+ if(crossing <= 2){
1214
+ TOT40_2L_crossing++;
1215
+ }
1216
+ TOT40_correct_tag += correct_tag;
1217
+ }
1218
+ }
1219
+
1220
+ /* Print individual result */
1221
+ /*-------------------------*/
1222
+ printf("%4d %3d %d ",Line,r_wn1,Status);
1223
+ printf("%6.2f %6.2f %3d %3d %3d %3d",
1224
+ (r_bn1==0?0.0:100.0*match/r_bn1),
1225
+ (r_bn2==0?0.0:100.0*match/r_bn2),
1226
+ match, r_bn1, r_bn2, crossing);
1227
+
1228
+ printf(" %4d %4d %6.2f\n",wn1,correct_tag,
1229
+ (wn1==0?0.0:100.0*correct_tag/wn1));
1230
+ }
1231
+
1232
+
1233
+ /*------------------------*/
1234
+ /* print total statistics */
1235
+ /*------------------------*/
1236
+ void
1237
+ print_total()
1238
+ {
1239
+ int sentn;
1240
+ double r,p,f;
1241
+
1242
+ printf("============================================================================\n");
1243
+
1244
+ if(TOTAL_bn1>0 && TOTAL_bn2>0){
1245
+ printf(" %6.2f %6.2f %6d %5d %5d %5d",
1246
+ (TOTAL_bn1>0?100.0*TOTAL_match/TOTAL_bn1:0.0),
1247
+ (TOTAL_bn2>0?100.0*TOTAL_match/TOTAL_bn2:0.0),
1248
+ TOTAL_match,
1249
+ TOTAL_bn1,
1250
+ TOTAL_bn2,
1251
+ TOTAL_crossing);
1252
+ }
1253
+
1254
+ printf(" %5d %5d %6.2f",
1255
+ TOTAL_word,
1256
+ TOTAL_correct_tag,
1257
+ (TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
1258
+
1259
+ printf("\n");
1260
+ printf("=== Summary ===\n");
1261
+
1262
+ sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent;
1263
+
1264
+ printf("\n-- All --\n");
1265
+ printf("Number of sentence = %6d\n",TOTAL_sent);
1266
+ printf("Number of Error sentence = %6d\n",TOTAL_error_sent);
1267
+ printf("Number of Skip sentence = %6d\n",TOTAL_skip_sent);
1268
+ printf("Number of Valid sentence = %6d\n",sentn);
1269
+
1270
+ r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0;
1271
+ printf("Bracketing Recall = %6.2f\n",r);
1272
+
1273
+ p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0;
1274
+ printf("Bracketing Precision = %6.2f\n",p);
1275
+
1276
+ f = 2*p*r/(p+r);
1277
+ printf("Bracketing FMeasure = %6.2f\n",f);
1278
+
1279
+ printf("Complete match = %6.2f\n",
1280
+ (sentn>0?100.0*TOTAL_comp_sent/sentn:0.0));
1281
+ printf("Average crossing = %6.2f\n",
1282
+ (sentn>0?1.0*TOTAL_crossing/sentn:0.0));
1283
+ printf("No crossing = %6.2f\n",
1284
+ (sentn>0?100.0*TOTAL_no_crossing/sentn:0.0));
1285
+ printf("2 or less crossing = %6.2f\n",
1286
+ (sentn>0?100.0*TOTAL_2L_crossing/sentn:0.0));
1287
+ printf("Tagging accuracy = %6.2f\n",
1288
+ (TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
1289
+
1290
+ sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent;
1291
+
1292
+ printf("\n-- len<=%d --\n",TOT_cut_len);
1293
+ printf("Number of sentence = %6d\n",TOT40_sent);
1294
+ printf("Number of Error sentence = %6d\n",TOT40_error_sent);
1295
+ printf("Number of Skip sentence = %6d\n",TOT40_skip_sent);
1296
+ printf("Number of Valid sentence = %6d\n",sentn);
1297
+
1298
+
1299
+ r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0;
1300
+ printf("Bracketing Recall = %6.2f\n",r);
1301
+
1302
+ p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0;
1303
+ printf("Bracketing Precision = %6.2f\n",p);
1304
+
1305
+ f = 2*p*r/(p+r);
1306
+ printf("Bracketing FMeasure = %6.2f\n",f);
1307
+
1308
+ printf("Complete match = %6.2f\n",
1309
+ (sentn>0?100.0*TOT40_comp_sent/sentn:0.0));
1310
+ printf("Average crossing = %6.2f\n",
1311
+ (sentn>0?1.0*TOT40_crossing/sentn:0.0));
1312
+ printf("No crossing = %6.2f\n",
1313
+ (sentn>0?100.0*TOT40_no_crossing/sentn:0.0));
1314
+ printf("2 or less crossing = %6.2f\n",
1315
+ (sentn>0?100.0*TOT40_2L_crossing/sentn:0.0));
1316
+ printf("Tagging accuracy = %6.2f\n",
1317
+ (TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0));
1318
+
1319
+ }
1320
+
1321
+
1322
+ /*--------------------------------*/
1323
+ /* display individual information */
1324
+ /*--------------------------------*/
1325
+ void
1326
+ dsp_info()
1327
+ {
1328
+ int i, n;
1329
+
1330
+ printf("-<1>---(wn1=%3d, bn1=%3d)- ",wn1,bn1);
1331
+ printf("-<2>---(wn2=%3d, bn2=%3d)-\n",wn2,bn2);
1332
+
1333
+ n = (wn1>wn2?wn1:wn2);
1334
+
1335
+ for(i=0;i<n;i++){
1336
+ if(terminal1[i].word[0]!='\0'){
1337
+ printf("%3d : %d : %-6s %-16s ",i,terminal1[i].result,
1338
+ terminal1[i].label,terminal1[i].word);
1339
+ }else{
1340
+ printf(" ");
1341
+ }
1342
+
1343
+ if(terminal2[i].word[0]!='\0'){
1344
+ printf("%3d : %d : %-6s %-16s\n",i,terminal2[i].result,
1345
+ terminal2[i].label,terminal2[i].word);
1346
+ }else{
1347
+ printf("\n");
1348
+ }
1349
+ }
1350
+ printf("\n");
1351
+
1352
+ n = (bn1>bn2?bn1:bn2);
1353
+
1354
+ for(i=0;i<n;i++){
1355
+ if(bracket1[i].start != -1){
1356
+ printf("%3d : %d : %3d %3d %-6s ",i,bracket1[i].result,
1357
+ bracket1[i].start,bracket1[i].end,
1358
+ bracket1[i].label);
1359
+ } else {
1360
+ printf(" ");
1361
+ }
1362
+
1363
+ if(bracket2[i].start != -1){
1364
+ printf("%3d : %d : %3d %3d %-6s\n",i,bracket2[i].result,
1365
+ bracket2[i].start,bracket2[i].end,
1366
+ bracket2[i].label);
1367
+ } else {
1368
+ printf("\n");
1369
+ }
1370
+ }
1371
+ printf("\n");
1372
+
1373
+ printf("========\n");
1374
+
1375
+ }
1376
+
1377
+
1378
+ /*-----------------*/
1379
+ /* some predicates */
1380
+ /*-----------------*/
1381
/*
 * Return 1 when c ends a token -- a whitespace character or one of the
 * bracket characters '(' and ')' -- and 0 otherwise.
 *
 * The <ctype.h> classifiers are only defined for EOF and for values
 * representable as unsigned char.  Plain char may be signed, so a byte
 * >= 0x80 (any non-ASCII text) would otherwise reach isspace() as a
 * negative int; convert through unsigned char first.
 */
int
is_terminator(c)
char c;
{
    if(isspace((unsigned char)c) || c=='(' || c==')'){
        return(1);
    }else{
        return(0);
    }
}
1391
+
1392
+ int
1393
+ is_deletelabel(s)
1394
+ char *s;
1395
+ {
1396
+ int i;
1397
+
1398
+ for(i=0;i<Delete_label_n;i++){
1399
+ if(strcmp(s,Delete_label[i])==0){
1400
+ return(1);
1401
+ }
1402
+ }
1403
+
1404
+ return(0);
1405
+ }
1406
+
1407
+ int
1408
+ is_deletelabel_for_length(s)
1409
+ char *s;
1410
+ {
1411
+ int i;
1412
+
1413
+ for(i=0;i<Delete_label_for_length_n;i++){
1414
+ if(strcmp(s,Delete_label_for_length[i])==0){
1415
+ return(1);
1416
+ }
1417
+ }
1418
+
1419
+ return(0);
1420
+ }
1421
+
1422
+ int
1423
+ is_quote_term(s,w)
1424
+ char *s;
1425
+ char *w;
1426
+ {
1427
+ int i;
1428
+
1429
+ for(i=0;i<Quote_term_n;i++){
1430
+ if(strcmp(s,Quote_term[i])==0){
1431
+ if (strcmp(w,"'")==0 || strcmp(w,"\"")==0 || strcmp(w,"/")==0)
1432
+ return(1);
1433
+ }
1434
+ }
1435
+
1436
+ return(0);
1437
+ }
1438
+
1439
+
1440
+ /*---------------*/
1441
+ /* compare words */
1442
+ /*---------------*/
1443
+ int
1444
+ word_comp(s1,s2)
1445
+ char *s1,*s2;
1446
+ {
1447
+ int i;
1448
+
1449
+ if(strcmp(s1,s2)==0){
1450
+ return(1);
1451
+ }
1452
+
1453
+ for(i=0;i<EQ_word_n;i++){
1454
+ if((strcmp(s1,EQ_word[i].s1)==0 &&
1455
+ strcmp(s2,EQ_word[i].s2)==0) ||
1456
+ (strcmp(s1,EQ_word[i].s2)==0 &&
1457
+ strcmp(s2,EQ_word[i].s1)==0)){
1458
+ return(1);
1459
+ }
1460
+ }
1461
+
1462
+ return(0);
1463
+ }
1464
+
1465
+ /*----------------*/
1466
+ /* compare labels */
1467
+ /*----------------*/
1468
+ int
1469
+ label_comp(s1,s2)
1470
+ char *s1,*s2;
1471
+ {
1472
+ int i;
1473
+
1474
+ if(strcmp(s1,s2)==0){
1475
+ return(1);
1476
+ }
1477
+
1478
+ for(i=0;i<EQ_label_n;i++){
1479
+ if((strcmp(s1,EQ_label[i].s1)==0 &&
1480
+ strcmp(s2,EQ_label[i].s2)==0) ||
1481
+ (strcmp(s1,EQ_label[i].s2)==0 &&
1482
+ strcmp(s2,EQ_label[i].s1)==0)){
1483
+ return(1);
1484
+ }
1485
+ }
1486
+
1487
+ return(0);
1488
+ }
1489
+
1490
+
1491
+ /*--------*/
1492
+ /* errors */
1493
+ /*--------*/
1494
+ void
1495
+ Error(s,arg1,arg2,arg3)
1496
+ char *s, *arg1, *arg2, *arg3;
1497
+ {
1498
+ Status = 1;
1499
+ fprintf(stderr,"%d : ",Line);
1500
+ fprintf(stderr,s,arg1,arg2,arg3);
1501
+ if(Error_count++>Max_error){
1502
+ exit(1);
1503
+ }
1504
+ }
1505
+
1506
+
1507
+ /*---------------------*/
1508
+ /* fatal error to exit */
1509
+ /*---------------------*/
1510
/*
 * Report an unrecoverable error and terminate: write the message to
 * stderr (s is used as a printf format, arg1..arg3 as its arguments),
 * then exit with status 1.  Does not return.
 */
void
Fatal(s,arg1,arg2,arg3)
char *s, *arg1, *arg2, *arg3;
{
    fprintf(stderr, s, arg1, arg2, arg3);

    exit(1);
}
1517
+
1518
+
1519
+ /*-------*/
1520
+ /* Usage */
1521
+ /*-------*/
1522
/*
 * Write the command-line synopsis and the option summary to stderr.
 */
void
Usage()
{
    /* one entry per output line, written in order */
    static char *help_text[] = {
        " evalb [-dDh][-c n][-e n][-p param_file] gold-file test-file \n",
        " \n",
        " Evaluate bracketing in test-file against gold-file. \n",
        " Return recall, precision, F-Measure, tag accuracy. \n",
        " \n",
        " <option> \n",
        " -d debug mode \n",
        " -D debug mode plus bracketing info \n",
        " -c n cut-off length forstatistics (def.=40)\n",
        " -e n number of error to kill (default=10) \n",
        " -p param_file parameter file \n",
        " -h help \n"
    };
    unsigned int k;

    for(k=0; k<sizeof(help_text)/sizeof(help_text[0]); k++){
        fputs(help_text[k],stderr);
    }
}
parsing/EVALB/new.prm ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##------------------------------------------##
2
+ ## Debug mode ##
3
+ ## 0: No debugging ##
4
+ ## 1: print data for individual sentence ##
5
+ ## 2: print detailed bracketing info ##
6
+ ##------------------------------------------##
7
+ DEBUG 0
8
+
9
+ ##------------------------------------------##
10
+ ## MAX error ##
11
+ ## Number of error to stop the process. ##
12
+ ## This is useful if there could be ##
13
+ ## tokenization error. ##
14
+ ## The process will stop when this number##
15
+ ## of errors are accumulated. ##
16
+ ##------------------------------------------##
17
+ MAX_ERROR 10
18
+
19
+ ##------------------------------------------##
20
+ ## Cut-off length for statistics ##
21
+ ## At the end of evaluation, the ##
22
+ ## statistics for the sentences of length##
23
+ ## less than or equal to this number will##
24
+ ## be shown, on top of the statistics ##
25
+ ## for all the sentences ##
26
+ ##------------------------------------------##
27
+ CUTOFF_LEN 40
28
+
29
+ ##------------------------------------------##
30
+ ## unlabeled or labeled bracketing ##
31
+ ## 0: unlabeled bracketing ##
32
+ ## 1: labeled bracketing ##
33
+ ##------------------------------------------##
34
+ LABELED 1
35
+
36
+ ##------------------------------------------##
37
+ ## Delete labels ##
38
+ ## list of labels to be ignored. ##
39
+ ## If it is a pre-terminal label, delete ##
40
+ ## the word along with the brackets. ##
41
+ ## If it is a non-terminal label, just ##
42
+ ## delete the brackets (don't delete ##
43
+ ## children). ##
44
+ ##------------------------------------------##
45
+ DELETE_LABEL TOP
46
+ DELETE_LABEL S1
47
+ DELETE_LABEL -NONE-
48
+ DELETE_LABEL ,
49
+ DELETE_LABEL :
50
+ DELETE_LABEL ``
51
+ DELETE_LABEL ''
52
+ DELETE_LABEL .
53
+ DELETE_LABEL ?
54
+ DELETE_LABEL !
55
+
56
+ ##------------------------------------------##
57
+ ## Delete labels for length calculation ##
58
+ ## list of labels to be ignored for ##
59
+ ## length calculation purpose ##
60
+ ##------------------------------------------##
61
+ DELETE_LABEL_FOR_LENGTH -NONE-
62
+
63
+ ##------------------------------------------##
64
+ ## Labels to be considered for misquote ##
65
+ ## (could be possessive or quote) ##
66
+ ##------------------------------------------##
67
+ QUOTE_LABEL ``
68
+ QUOTE_LABEL ''
69
+ QUOTE_LABEL POS
70
+
71
+ ##------------------------------------------##
72
+ ## These ones are less common, but ##
73
+ ## are on occasion output by parsers: ##
74
+ ##------------------------------------------##
75
+ QUOTE_LABEL NN
76
+ QUOTE_LABEL CD
77
+ QUOTE_LABEL VBZ
78
+ QUOTE_LABEL :
79
+
80
+ ##------------------------------------------##
81
+ ## Equivalent labels, words ##
82
+ ## the pairs are considered equivalent ##
83
+ ## This is non-directional. ##
84
+ ##------------------------------------------##
85
+ EQ_LABEL ADVP PRT
86
+
87
+ # EQ_WORD Example example
parsing/EVALB/nk.prm ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Based on new.prm (and by extension COLLINS.prm)
2
+ # The only change from new.prm is increasing MAX_ERROR. The evaluation should be
3
+ # identical to the standard setup, except that evalb won't give up early for a
4
+ # parser that has just started training and does not yet produce good results.
5
+
6
+ ##------------------------------------------##
7
+ ## Debug mode ##
8
+ ## 0: No debugging ##
9
+ ## 1: print data for individual sentence ##
10
+ ## 2: print detailed bracketing info ##
11
+ ##------------------------------------------##
12
+ DEBUG 0
13
+
14
+ ##------------------------------------------##
15
+ ## MAX error ##
16
+ ## Number of error to stop the process. ##
17
+ ## This is useful if there could be ##
18
+ ## tokenization error. ##
19
+ ## The process will stop when this number##
20
+ ## of errors are accumulated. ##
21
+ ##------------------------------------------##
22
+ MAX_ERROR 10000
23
+
24
+ ##------------------------------------------##
25
+ ## Cut-off length for statistics ##
26
+ ## At the end of evaluation, the ##
27
+ ## statistics for the sentences of length##
28
+ ## less than or equal to this number will##
29
+ ## be shown, on top of the statistics ##
30
+ ## for all the sentences ##
31
+ ##------------------------------------------##
32
+ CUTOFF_LEN 40
33
+
34
+ ##------------------------------------------##
35
+ ## unlabeled or labeled bracketing ##
36
+ ## 0: unlabeled bracketing ##
37
+ ## 1: labeled bracketing ##
38
+ ##------------------------------------------##
39
+ LABELED 1
40
+
41
+ ##------------------------------------------##
42
+ ## Delete labels ##
43
+ ## list of labels to be ignored. ##
44
+ ## If it is a pre-terminal label, delete ##
45
+ ## the word along with the brackets. ##
46
+ ## If it is a non-terminal label, just ##
47
+ ## delete the brackets (don't delete ##
48
+ ## children). ##
49
+ ##------------------------------------------##
50
+ DELETE_LABEL TOP
51
+ DELETE_LABEL S1
52
+ DELETE_LABEL -NONE-
53
+ DELETE_LABEL ,
54
+ DELETE_LABEL :
55
+ DELETE_LABEL ``
56
+ DELETE_LABEL ''
57
+ DELETE_LABEL .
58
+ DELETE_LABEL ?
59
+ DELETE_LABEL !
60
+
61
+ ##------------------------------------------##
62
+ ## Delete labels for length calculation ##
63
+ ## list of labels to be ignored for ##
64
+ ## length calculation purpose ##
65
+ ##------------------------------------------##
66
+ DELETE_LABEL_FOR_LENGTH -NONE-
67
+
68
+ ##------------------------------------------##
69
+ ## Labels to be considered for misquote ##
70
+ ## (could be possessive or quote) ##
71
+ ##------------------------------------------##
72
+ QUOTE_LABEL ``
73
+ QUOTE_LABEL ''
74
+ QUOTE_LABEL POS
75
+
76
+ ##------------------------------------------##
77
+ ## These ones are less common, but ##
78
+ ## are on occasion output by parsers: ##
79
+ ##------------------------------------------##
80
+ QUOTE_LABEL NN
81
+ QUOTE_LABEL CD
82
+ QUOTE_LABEL VBZ
83
+ QUOTE_LABEL :
84
+
85
+ ##------------------------------------------##
86
+ ## Equivalent labels, words ##
87
+ ## the pairs are considered equivalent ##
88
+ ## This is non-directional. ##
89
+ ##------------------------------------------##
90
+ EQ_LABEL ADVP PRT
91
+
92
+ # EQ_WORD Example example
parsing/EVALB/sample/sample.gld ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
2
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
3
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
4
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
5
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
6
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
7
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
8
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
9
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
10
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
11
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
12
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
13
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
14
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
15
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
16
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
17
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
18
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
19
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
20
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
21
+ (S (A-SBJ-1 (P this)) (B-WHATEVER (Q is) (A (R a) (T test))))
22
+ (S (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))))
23
+ (S (A (P this)) (B (Q is) (A (R a) (T test))) (-NONE- *))
24
+ (S (A (P this)) (B (Q is) (A (R a) (T test))) (: *))
parsing/EVALB/sample/sample.prm ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##------------------------------------------##
2
+ ## Debug mode ##
3
+ ## print out data for individual sentence ##
4
+ ##------------------------------------------##
5
+ DEBUG 0
6
+
7
+ ##------------------------------------------##
8
+ ## MAX error ##
9
+ ## Number of error to stop the process. ##
10
+ ## This is useful if there could be ##
11
+ ## tokenization error. ##
12
+ ## The process will stop when this number##
13
+ ## of errors are accumulated. ##
14
+ ##------------------------------------------##
15
+ MAX_ERROR 10
16
+
17
+ ##------------------------------------------##
18
+ ## Cut-off length for statistics ##
19
+ ## At the end of evaluation, the ##
20
+ ## statistics for the sentences of length##
21
+ ## less than or equal to this number will##
22
+ ## be shown, on top of the statistics ##
23
+ ## for all the sentences ##
24
+ ##------------------------------------------##
25
+ CUTOFF_LEN 40
26
+
27
+ ##------------------------------------------##
28
+ ## unlabeled or labeled bracketing ##
29
+ ## 0: unlabeled bracketing ##
30
+ ## 1: labeled bracketing ##
31
+ ##------------------------------------------##
32
+ LABELED 1
33
+
34
+ ##------------------------------------------##
35
+ ## Delete labels ##
36
+ ## list of labels to be ignored. ##
37
+ ## If it is a pre-terminal label, delete ##
38
+ ## the word along with the brackets. ##
39
+ ## If it is a non-terminal label, just ##
40
+ ## delete the brackets (don't delete ##
41
+ ## children). ##
42
+ ##------------------------------------------##
43
+ DELETE_LABEL TOP
44
+ DELETE_LABEL -NONE-
45
+ DELETE_LABEL ,
46
+ DELETE_LABEL :
47
+ DELETE_LABEL ``
48
+ DELETE_LABEL ''
49
+
50
+ ##------------------------------------------##
51
+ ## Delete labels for length calculation ##
52
+ ## list of labels to be ignored for ##
53
+ ## length calculation purpose ##
54
+ ##------------------------------------------##
55
+ DELETE_LABEL_FOR_LENGTH -NONE-
56
+
57
+
58
+ ##------------------------------------------##
59
+ ## Equivalent labels, words ##
60
+ ## the pairs are considered equivalent ##
61
+ ## This is non-directional. ##
62
+ ##------------------------------------------##
63
+ EQ_LABEL T TT
64
+
65
+ EQ_WORD This this
parsing/EVALB/sample/sample.rsl ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Sent. Matched Bracket Cross Correct Tag
2
+ ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
3
+ ============================================================================
4
+ 1 4 0 100.00 100.00 4 4 4 0 4 4 100.00
5
+ 2 4 0 75.00 75.00 3 4 4 0 4 4 100.00
6
+ 3 4 0 100.00 100.00 4 4 4 0 4 3 75.00
7
+ 4 4 0 75.00 75.00 3 4 4 0 4 3 75.00
8
+ 5 4 0 75.00 75.00 3 4 4 0 4 4 100.00
9
+ 6 4 0 50.00 66.67 2 4 3 1 4 4 100.00
10
+ 7 4 0 25.00 100.00 1 4 1 0 4 4 100.00
11
+ 8 4 0 0.00 0.00 0 4 0 0 4 4 100.00
12
+ 9 4 0 100.00 80.00 4 4 5 0 4 4 100.00
13
+ 10 4 0 100.00 50.00 4 4 8 0 4 4 100.00
14
+ 11 4 2 0.00 0.00 0 0 0 0 4 0 0.00
15
+ 12 4 1 0.00 0.00 0 0 0 0 4 0 0.00
16
+ 13 4 1 0.00 0.00 0 0 0 0 4 0 0.00
17
+ 14 4 2 0.00 0.00 0 0 0 0 4 0 0.00
18
+ 15 4 0 100.00 100.00 4 4 4 0 4 4 100.00
19
+ 16 4 1 0.00 0.00 0 0 0 0 4 0 0.00
20
+ 17 4 1 0.00 0.00 0 0 0 0 4 0 0.00
21
+ 18 4 0 100.00 100.00 4 4 4 0 4 4 100.00
22
+ 19 4 0 100.00 100.00 4 4 4 0 4 4 100.00
23
+ 20 4 1 0.00 0.00 0 0 0 0 4 0 0.00
24
+ 21 4 0 100.00 100.00 4 4 4 0 4 4 100.00
25
+ 22 44 0 100.00 100.00 34 34 34 0 44 44 100.00
26
+ 23 4 0 100.00 100.00 4 4 4 0 4 4 100.00
27
+ 24 5 0 100.00 100.00 4 4 4 0 4 4 100.00
28
+ ============================================================================
29
+ 87.76 90.53 86 98 95 16 108 106 98.15
30
+ === Summary ===
31
+
32
+ -- All --
33
+ Number of sentence = 24
34
+ Number of Error sentence = 5
35
+ Number of Skip sentence = 2
36
+ Number of Valid sentence = 17
37
+ Bracketing Recall = 87.76
38
+ Bracketing Precision = 90.53
39
+ Complete match = 52.94
40
+ Average crossing = 0.06
41
+ No crossing = 94.12
42
+ 2 or less crossing = 100.00
43
+ Tagging accuracy = 98.15
44
+
45
+ -- len<=40 --
46
+ Number of sentence = 23
47
+ Number of Error sentence = 5
48
+ Number of Skip sentence = 2
49
+ Number of Valid sentence = 16
50
+ Bracketing Recall = 81.25
51
+ Bracketing Precision = 85.25
52
+ Complete match = 50.00
53
+ Average crossing = 0.06
54
+ No crossing = 93.75
55
+ 2 or less crossing = 100.00
56
+ Tagging accuracy = 96.88
parsing/EVALB/sample/sample.tst ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
2
+ (S (A (P this)) (B (Q is) (C (R a) (T test))))
3
+ (S (A (P this)) (B (Q is) (A (R a) (U test))))
4
+ (S (C (P this)) (B (Q is) (A (R a) (U test))))
5
+ (S (A (P this)) (B (Q is) (R a) (A (T test))))
6
+ (S (A (P this) (Q is)) (A (R a) (T test)))
7
+ (S (P this) (Q is) (R a) (T test))
8
+ (P this) (Q is) (R a) (T test)
9
+ (S (A (P this)) (B (Q is) (A (A (R a) (T test)))))
10
+ (S (A (P this)) (B (Q is) (A (A (A (A (A (R a) (T test))))))))
11
+
12
+ (S (A (P this)) (B (Q was) (A (A (R a) (T test)))))
13
+ (S (A (P this)) (B (Q is) (U not) (A (A (R a) (T test)))))
14
+
15
+ (TOP (S (A (P this)) (B (Q is) (A (R a) (T test)))))
16
+ (S (A (P this)) (NONE *) (B (Q is) (A (R a) (T test))))
17
+ (S (A (P this)) (S (NONE abc) (A (NONE *))) (B (Q is) (A (R a) (T test))))
18
+ (S (A (P this)) (B (Q is) (A (R a) (TT test))))
19
+ (S (A (P This)) (B (Q is) (A (R a) (T test))))
20
+ (S (A (P That)) (B (Q is) (A (R a) (T test))))
21
+ (S (A (P this)) (B (Q is) (A (R a) (T test))))
22
+ (S (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))))
23
+ (S (A (P this)) (B (Q is) (A (R a) (T test))) (-NONE- *))
24
+ (S (A (P this)) (B (Q is) (A (R a) (T test))) (: *))
parsing/EVALB/tgrep_proc.prl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
#!/usr/local/bin/perl

# Filter for tgrep output: copy to stdout only the input lines that
# contain "TOP"; every other line (blank lines included) is dropped.
while (my $line = <>) {
    print $line if $line =~ /TOP/;
}
parsing/EVALB_SPMRL/Makefile ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TESTFILE=dev.Arabic.gold.ptb.-feat
2
+ JAVA=java
3
+
4
+
5
+ all: evalb
6
+
7
+
8
+ clean:
9
+ rm -f evalb_spmrl
10
+
11
+ install: evalb
12
+ cp evalb_spmrl /usr/local/bin
13
+
14
+
15
+
16
+ evalb: evalb.c
17
+ gcc -Wall -O3 -g -o evalb_spmrl evalb.c
18
+
19
+ evalb_linux: evalb.c
20
+ gcc -Wall -fPIC -O3 -g -o evalb_spmrl evalb.c
21
+ #to compile on linux
22
+
23
+
24
+
25
+ # note: on the original makefile, the funsigned-char option was applied
26
+
27
+ home: install_home
28
+
29
+ install_home: all
30
+ cp evalb_spmrl ${PREFIX}/bin
31
+
32
+ up:
33
+ tar zcvf ../evalb_spmrl2013.tar.gz ../evalb_spmrl2013/
34
+ putW ../evalb_spmrl2013.tar.gz
35
+
36
+
37
+
38
+ #################################
39
+ # stuff to debug some treebanks #
40
+ #################################
41
+ test_full: all
42
+ ./evalb dev.Arabic.gold.ptb dev.Arabic.gold.ptb
43
+
44
+
45
+ test: all
46
+ ./evalb -p ./new.prm ${TESTFILE} ${TESTFILE}
47
+
48
+ debug: all
49
+ ./evalb -D ${TESTFILE} ${TESTFILE}
50
+ echo "./evalb -D ${TESTFILE} ${TESTFILE}"
51
+
52
+ debug_one: all
53
+ lines 616 < ${TESTFILE} > ${TESTFILE}.616
54
+ ./evalb -D ${TESTFILE}.616 ${TESTFILE}.616
55
+ echo "./evalb -D ${TESTFILE}.616 ${TESTFILE}.616"
56
+
57
+ releaf:
58
+ ./evalb -D dev.Arabic.gold.ptb.-feat.616.bug dev.Arabic.gold.ptb.-feat.616.bug
59
+ echo "./evalb -D dev.Arabic.gold.ptb.-feat.616.bug dev.Arabic.gold.ptb.-feat.616.bug" > /dev/stderr
60
+
61
+ java:
62
+ ${JAVA} -jar ./evalC/evalC.jar ${TESTFILE} ${TESTFILE} /dev/stdout
63
+
64
+
65
+
parsing/EVALB_SPMRL/README ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ // Djamé: version record added for history's sake.
3
+ // note to future updater: please add your changelog below
4
+
5
+ (copied from http://nlp.cs.nyu.edu/evalb/ )
6
+ EVALB20080701.tgz (July 1, 2008 version) modified by Don Blaheta (Knox College)
7
+ EVALB20060307.tgz (March 3, 2006 version; debugged version of the Jan. 17, 2006 version) modified by David Ellis (Brown University)
8
+ EVALB20060117.tgz (Jan. 17, 2006 version) modified by David Ellis (Brown University)
9
+ EVALB20050908.tgz (Sept. 8, 2005 version) modified by David Brooks (Birmingham)
10
+ EVALB.tgz (original version).
11
+ Authors
12
+
13
+ Satoshi Sekine (New York University) : e-mail: his last name (at) cs.nyu.edu
14
+ Michael John Collins (University of Pennsylvania)
15
+ Note: the authors are not responsible for the newer versions. We put these versions even without checking the program. Please be responsible for yourself.
16
+
17
+ *************************************************************************
18
+
19
+ Modification
20
+
21
+ David Brooks (Birmingham): fixed the code so that the program can be compiled by the latest gcc (September 2005). Help was given by Peet Morris and Ramon Ziai through the Corpora Mailing list.
22
+ David Ellis (Brown University) : fixes a bug in which sentences were incorrectly categorized as "length mismatch" when the parse output had certain mislabeled parts-of-speech.
23
+ Don Blaheta (Knox College) : fixes a bug in which the last number printed in the total-information line was TOTAL_non_crossing instead of TOTAL_crossing.
24
+
25
+
26
+
27
+ April 2012
28
+ // Modified by Slav Petrov and Ryan McDonald (Google Inc., for the SANCL 2012 shared task)
29
+ // ===> making it less sensitive to punct POS errors leading to
30
+ // mismatch of length
31
+
32
+
33
+ August 2013, 10
34
+ // Modified by Djamé Seddah (Univ. Paris Sorbonne, for the SPMRL 2013 shared task)
35
+ // ===> making it able to cope with Arabic very long lines (byte wise)
36
+ // ===> now limit is 50000 bytes, was 5000 (tricky bug, if you ask me)
37
+ // please check the constant macro section if you encounter weird bugs not present in other
38
+ implementations (check evalC by Federico Sangati for example, http://homepages.inf.ed.ac.uk/fsangati/evalC_25_5_10.zip )
39
+
40
+
41
+ August 2013, 23
42
+ // Modif from Thomas Müller (IMS Stuttgart)
43
+ // ===> adding of # in the stop word modify_label function (so that the
44
+ // lexer will read NPP instead of NPP##feat:...### as in the SPMRL data set)
45
+ // Modif from Djamé Seddah
46
+ // ===> Application of modify_label to all labels (including the POS label
47
+ // which were left untouched for some reason)
48
+ // That should btw be an option (whether to evaluate full labels or not,
49
+ // only stripping of Non Terminal, POS tag and so on)
50
+
51
+
52
+ August 2013, 27
53
+ // Modif from Djamé
54
+ // --> adding of an option to include the non parsed sentences in the
55
+ // --> evaluation (-X option)
56
+ // --> adding an option to evaluate only the first N sentences (-K n)
57
+ // --> adding an option to provide a compact results view (-L) so one can do
58
+ // --> find ./ -name "*parsed.run?" -exec evalb_spmrl -L GOLD {} \; -print |
59
+ // --> grep -v '=====' | grep '='
60
+
61
+ September 2013, 6
62
+ // Modif from DJame
63
+ // fixing the infinite slowness bug (shame on me)
64
+ // now speed is similar to what it was before
65
+
66
+
67
+ October 2013, 13
68
+ // Addition from Djame
69
+ // Adding the spmrl_hebrew.prm if one wants to evaluate hebrew parsing within the
70
+ // same conditions as the state-of-the-art
71
+ // namely without counting the additional SYNpos layer which inflates evalb
72
+ // scores by almost 2 points.
73
+ // Note: for the spmrl shared task, we used the spmrl.prm file (so with
74
+ // these labels. It was too late to modify the rules once again when we
75
+ // realized this)
76
+
parsing/EVALB_SPMRL/README.orig ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################
2
+ # #
3
+ # README file for evalb #
4
+ # #
5
+ # Satoshi Sekine (NYU) #
6
+ # Mike Collins (UPenn) #
7
+ # #
8
+ # October.1997 #
9
+ #################################################################
10
+
11
+ Contents of this README:
12
+
13
+ [0] COPYRIGHT
14
+ [1] INTRODUCTION
15
+ [2] INSTALLATION AND RUN
16
+ [3] OPTIONS
17
+ [4] OUTPUT FORMAT FROM THE SCORER
18
+ [5] HOW TO CREATE A GOLDFILE FROM THE TREEBANK
19
+ [6] THE PARAMETER FILE
20
+ [7] MORE DETAILS ABOUT THE SCORING ALGORITHM
21
+
22
+
23
+ [0] COPYRIGHT
24
+
25
+ The authors abandon the copyright of this program. Everyone is
26
+ permitted to copy and distribute the program or a portion of the program
27
+ with no charge and no restrictions unless it is harmful to someone.
28
+
29
+ However, the authors would be grateful if users used the program properly
30
+ and let the authors know about any bugs or problems.
31
+
32
+ This software is provided "AS IS", and the authors make no warranties,
33
+ express or implied.
34
+
35
+
36
+ [1] INTRODUCTION
37
+
38
+ Evaluation of bracketing looks simple, but in fact, there are minor
39
+ differences from system to system. This is a program to parameterize
40
+ such minor differences and to give an informative result.
41
+
42
+ "evalb" evaluates bracketing accuracy in a test-file against a gold-file.
43
+ It returns recall, precision, tagging accuracy. It uses an identical
44
+ algorithm to that used in (Collins ACL97).
45
+
46
+
47
+ [2] Installation and Run
48
+
49
+ To compile the scorer, type
50
+
51
+ > make
52
+
53
+
54
+ To run the scorer:
55
+
56
+ > evalb -p Parameter_file Gold_file Test_file
57
+
58
+
59
+ For example to use the sample files:
60
+
61
+ > evalb -p sample.prm sample.gld sample.tst
62
+
63
+
64
+
65
+ [3] OPTIONS
66
+
67
+ You can specify system parameters in the command line options.
68
+ Other options concerning the evaluation metrics should be specified
69
+ in parameter file, described later.
70
+
71
+ -p param_file parameter file
72
+ -d debug mode
73
+ -e n number of error to kill (default=10)
74
+ -h help
75
+
76
+
77
+
78
+ [4] OUTPUT FORMAT FROM THE SCORER
79
+
80
+ The scorer gives individual scores for each sentence, for
81
+ example:
82
+
83
+ Sent. Matched Bracket Cross Correct Tag
84
+ ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
85
+ ============================================================================
86
+ 1 8 0 100.00 100.00 5 5 5 0 6 5 83.33
87
+
88
+ At the end of the output the === Summary === section gives statistics
89
+ for all sentences, and for sentences <=40 words in length. The summary
90
+ contains the following information:
91
+
92
+ i) Number of sentences -- total number of sentences.
93
+
94
+ ii) Number of Error/Skip sentences -- should both be 0 if there is no
95
+ problem with the parsed/gold files.
96
+
97
+ iii) Number of valid sentences = Number of sentences - Number of Error/Skip
98
+ sentences
99
+
100
+ iv) Bracketing recall = (number of correct constituents)
101
+ ----------------------------------------
102
+ (number of constituents in the goldfile)
103
+
104
+ v) Bracketing precision = (number of correct constituents)
105
+ ----------------------------------------
106
+ (number of constituents in the parsed file)
107
+
108
+ vi) Complete match = percentage of sentences where recall and precision are
109
+ both 100%.
110
+
111
+ vii) Average crossing = (number of constituents crossing a goldfile constituent)
112
+ ----------------------------------------------------
113
+ (number of sentences)
114
+
115
+ viii) No crossing = percentage of sentences which have 0 crossing brackets.
116
+
117
+ ix) 2 or less crossing = percentage of sentences which have <=2 crossing brackets.
118
+
119
+ x) Tagging accuracy = percentage of correct POS tags (but see [5].3 for exact
120
+ details of what is counted).
121
+
122
+
123
+
124
+ [5] HOW TO CREATE A GOLDFILE FROM THE PENN TREEBANK
125
+
126
+
127
+ The gold and parsed files are in a format similar to this:
128
+
129
+ (TOP (S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .)))
130
+
131
+ To create a gold file from the treebank:
132
+
133
+ tgrep -wn '/.*/' | tgrep_process.prl
134
+
135
+ will produce a goldfile in the required format. ("tgrep -wn '/.*/'" prints
136
+ parse trees, "tgrep_process.prl" just skips blank lines).
137
+
138
+ For example, to produce a goldfile for section 23 of the treebank:
139
+
140
+ tgrep -wn '/.*/' | tail +90895 | tgrep_process.prl | sed 2416q > sec23.gold
141
+
142
+
143
+
144
+ [6] THE PARAMETER (.prm) FILE
145
+
146
+
147
+ The .prm file sets options regarding the scoring method. COLLINS.prm gives
148
+ the same scoring behaviour as the scorer used in (Collins 97). The options
149
+ chosen were:
150
+
151
+ 1) LABELED 1
152
+
153
+ to give labelled precision/recall figures, i.e. a constituent must have the
154
+ same span *and* label as a constituent in the goldfile.
155
+
156
+ 2) DELETE_LABEL TOP
157
+
158
+ Don't count the "TOP" label (which is always given in the output of tgrep)
159
+ when scoring.
160
+
161
+ 3) DELETE_LABEL -NONE-
162
+
163
+ Remove traces (and all constituents which dominate nothing but traces) when
164
+ scoring. For example
165
+
166
+ .... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
167
+
168
+ would be processed to give
169
+
170
+ .... (VP (VBD reported)) (. .)))
171
+
172
+
173
+ 4)
174
+ DELETE_LABEL , -- for the purposes of scoring remove punctuation
175
+ DELETE_LABEL :
176
+ DELETE_LABEL ``
177
+ DELETE_LABEL ''
178
+ DELETE_LABEL .
179
+
180
+ 5) DELETE_LABEL_FOR_LENGTH -NONE- -- don't include traces when calculating
181
+ the length of a sentence (important
182
+ when classifying a sentence as <=40
183
+ words or >40 words)
184
+
185
+ 6) EQ_LABEL ADVP PRT
186
+
187
+ Count ADVP and PRT as being the same label when scoring.
188
+
189
+
190
+
191
+
192
+ [7] MORE DETAILS ABOUT THE SCORING ALGORITHM
193
+
194
+
195
+ 1) The scorer initially processes the files to remove all nodes specified
196
+ by DELETE_LABEL in the .prm file. It also recursively removes nodes which
197
+ dominate nothing due to all their children being removed. For example, if
198
+ -NONE- is specified as a label to be deleted,
199
+
200
+ .... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
201
+
202
+ would be processed to give
203
+
204
+ .... (VP (VBD reported)) (. .)))
205
+
206
+ 2) The scorer also removes all functional tags attached to non-terminals
207
+ (functional tags are prefixed with "-" or "=" in the treebank). For example
208
+ "NP-SBJ" is processed to give "NP", "NP=2" is changed to "NP".
209
+
210
+
211
+ 3) Tagging accuracy counts tags for all words *except* any tags which are
212
+ deleted by a DELETE_LABEL specification in the .prm file. (For example, for
213
+ COLLINS.prm, punctuation tagged as "," ":" etc. would not be included).
214
+
215
+ 4) When calculating the length of a sentence, all words with POS tags not
216
+ included in the "DELETE_LABEL_FOR_LENGTH" list in the .prm file are
217
+ counted. (For COLLINS.prm, only "-NONE-" is specified in this list, so
218
+ traces are removed before calculating the length of the sentence).
219
+
220
+ 5) There are some subtleties in scoring when either the goldfile or parsed
221
+ file contains multiple constituents for the same span which have the same
222
+ non-terminal label. e.g. (NP (NP the man)) If the goldfile contains n
223
+ constituents for the same span, and the parsed file contains m constituents
224
+ with that nonterminal, the scorer works as follows:
225
+
226
+ i) If m>n, then the precision is n/m, recall is 100%
227
+
228
+ ii) If n>m, then the precision is 100%, recall is m/n.
229
+
230
+ iii) If n==m, recall and precision are both 100%.
parsing/EVALB_SPMRL/evalb.c ADDED
@@ -0,0 +1,1724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*****************************************************************/
2
+ /* evalb [-p param_file] [-dh] [-e n] gold-file test-file */
3
+ /* */
4
+ /* Evaluate bracketing in test-file against gold-file. */
5
+ /* Return recall, precision, tagging accuracy. */
6
+ /* */
7
+ /* <option> */
8
+ /* -p param_file parameter file */
9
+ /* -d debug mode */
10
+ /* -e n number of error to kill (default=10) */
11
+ /* -h help */
12
+ /* */
13
+ /* Satoshi Sekine (NYU) */
14
+ /* Mike Collins (UPenn) */
15
+ /* */
16
+ /* October.1997 */
17
+ /* */
18
+ /* Please refer README for the update information */
19
+ /*****************************************************************/
20
+
21
+ // Djamé: version record added for history's sake.
22
+ // note to future updater: please add your changelog below
23
+
24
+ // Modified by Slav Petrov and Ryan McDonald (for the SANCL 2012 shared task)
25
+ // ===> making it less sensitive to punct POS errors leading to
26
+ // ===> mismatch of length
27
+
28
+ // Modified by Djamé Seddah (for the spmrl shared 2013 task)
29
+ // ===> making it able to cope with Arabic very long lines (byte wise)
30
+ // ===> now limit is 50000 bytes, was 5000 (damn bug, if you ask me)
31
+ // ===> modified to cope with spmrl 2013 morpg features (suggested by thomas Muller from IMS)
32
+ // please check the constant macro section
33
+ // Correction of bug causing hard slowdown (due to max_word_in_sent set too high)
34
+ // former version was 78x slower than regular evalb.
35
+
36
+
37
+
38
+
39
+
40
+
41
+ #include <stdio.h>
42
+ #include <stdlib.h> //### added for exit, atoi decls
43
+ #include <ctype.h>
44
+ #include <string.h>
45
+ #ifndef __APPLE__ /* dj: added to compile on mac os x */
46
+ #include <malloc.h>
47
+ #endif
48
+
49
+ /* Internal Data format -------------------------------------------*/
50
+ /* */
51
+ /* (S (NP (NNX this)) (VP (VBX is) (NP (DT a) (NNX pen))) (SYM .)) */
52
+ /* */
53
+ /* wn=5 */
54
+ /* word label */
55
+ /* terminal[0] = this NNX */
56
+ /* terminal[1] = is VBX */
57
+ /* terminal[2] = a DT */
58
+ /* terminal[3] = pen NNX */
59
+ /* terminal[4] = . SYM */
60
+ /* */
61
+ /* bn=4 */
62
+ /* start end label */
63
+ /* bracket[0] = 0 5 S */
64
+ /* bracket[1] = 0 0 NP */
65
+ /* bracket[2] = 1 4 VP */
66
+ /* bracket[3] = 2 4 NP */
67
+ /* */
68
+ /* matched bracketing */
69
+ /* Recall = --------------------------- */
70
+ /* # of bracket in ref-data */
71
+ /* */
72
+ /* matched bracketing */
73
+ /* Precision = --------------------------- */
74
+ /* # of bracket in test-data */
75
+ /* */
76
+ /*-----------------------------------------------------------------*/
77
+
78
+ /******************/
79
+ /* constant macro */
80
+ /******************/
81
+
82
+ #define MAX_SENT_LEN 50000 //Djamé : was not used
83
+ #define MAX_WORD_IN_SENT 1000
84
+ #define MAX_BRACKET_IN_SENT 2000
85
+ #define MAX_WORD_LEN 100
86
+ #define MAX_LABEL_LEN 300
87
+ #define MAX_QUOTE_TERM 20
88
+
89
+ #define MAX_DELETE_LABEL 1000
90
+ #define MAX_EQ_LABEL 1000
91
+ #define MAX_EQ_WORD 1000
92
+
93
+ #define MAX_LINE_LEN 500
94
+
95
+ #define DEFAULT_MAX_ERROR 10
96
+ #define DEFAULT_CUT_LEN 40
97
+
98
+ /*************/
99
+ /* structure */
100
+ /*************/
101
+
102
+ typedef struct ss_terminal {
103
+ char word[MAX_WORD_LEN];
104
+ char label[MAX_LABEL_LEN];
105
+ int result; /* 0:unmatch, 1:match, 9:undef */
106
+ } s_terminal;
107
+
108
+ typedef struct ss_term_ind {
109
+ s_terminal term;
110
+ int index;
111
+ int bracket;
112
+ int endslen;
113
+ int ends[MAX_BRACKET_IN_SENT];
114
+ } s_term_ind;
115
+
116
+ typedef struct ss_bracket {
117
+ int start;
118
+ int end;
119
+ unsigned int buf_start;
120
+ unsigned int buf_end;
121
+ char label[MAX_LABEL_LEN];
122
+ int result; /* 0: unmatch, 1:match, 5:delete 9:undef */
123
+ } s_bracket;
124
+
125
+
126
+ typedef struct ss_equiv {
127
+ char *s1;
128
+ char *s2;
129
+ } s_equiv;
130
+
131
+
132
+ /****************************/
133
+ /* global variables */
134
+ /* gold-data: suffix = 1 */
135
+ /* test-data: suffix = 2 */
136
+ /****************************/
137
+
138
+ /*---------------*/
139
+ /* Sentence data */
140
+ /*---------------*/
141
+ int wn1, wn2; /* number of words in sentence */
142
+ int r_wn1; /* number of words in sentence */
143
+ /* which only ignores labels in */
144
+ /* DELETE_LABEL_FOR_LENGTH */
145
+
146
+ s_terminal terminal1[MAX_WORD_IN_SENT]; /* terminal information */
147
+ s_terminal terminal2[MAX_WORD_IN_SENT];
148
+
149
+ s_term_ind quotterm1[MAX_QUOTE_TERM]; /* special terminals ("'","POS") */
150
+ s_term_ind quotterm2[MAX_QUOTE_TERM];
151
+
152
+ int bn1, bn2; /* number of brackets */
153
+
154
+ int r_bn1, r_bn2; /* number of brackets */
155
+ /* after deletion */
156
+
157
+ s_bracket bracket1[MAX_BRACKET_IN_SENT]; /* bracket information */
158
+ s_bracket bracket2[MAX_BRACKET_IN_SENT];
159
+
160
+
161
+ /*------------*/
162
+ /* Total data */
163
+ /*------------*/
164
+ int TOTAL_bn1, TOTAL_bn2, TOTAL_match; /* total number of brackets */
165
+ int TOTAL_sent; /* No. of sentence */
166
+ int TOTAL_error_sent; /* No. of error sentence */
167
+ int TOTAL_skip_sent; /* No. of skip sentence */
168
+ int TOTAL_comp_sent; /* No. of complete match sent */
169
+ int TOTAL_word; /* total number of word */
170
+ int TOTAL_crossing; /* total crossing */
171
+ int TOTAL_no_crossing; /* no crossing sentence */
172
+ int TOTAL_2L_crossing; /* 2 or less crossing sentence */
173
+ int TOTAL_correct_tag; /* total correct tagging */
174
+
175
+ int TOT_cut_len = DEFAULT_CUT_LEN; /* Cut-off length in statistics */
176
+
177
+ /* data for sentences with len <= CUT_LEN */
178
+ /* Historically it was 40. */
179
+ int TOT40_bn1, TOT40_bn2, TOT40_match; /* total number of brackets */
180
+ int TOT40_sent; /* No. of sentence */
181
+ int TOT40_error_sent; /* No. of error sentence */
182
+ int TOT40_skip_sent; /* No. of skip sentence */
183
+ int TOT40_comp_sent; /* No. of complete match sent */
184
+ int TOT40_word; /* total number of word */
185
+ int TOT40_crossing; /* total crossing */
186
+ int TOT40_no_crossing; /* no crossing sentence */
187
+ int TOT40_2L_crossing; /* 2 or less crossing sentence */
188
+ int TOT40_correct_tag; /* total correct tagging */
189
+
190
+ /*------------*/
191
+ /* miscellaneous */
192
+ /*------------*/
193
+ int Line; /* line number */
194
+ int Error_count = 0; /* Error count */
195
+ int Status; /* Result status for each sent */
196
+ /* 0: OK, 1: skip, 2: error */
197
+
198
+ /*-------------------*/
199
+ /* stack manipulation */
200
+ /*-------------------*/
201
+ int stack_top;
202
+ int stack[MAX_BRACKET_IN_SENT];
203
+
204
+ /************************************************************/
205
+ /* User parameters which can be specified in parameter file */
206
+ /************************************************************/
207
+
208
+ /*------------------------------------------*/
209
+ /* Debug mode */
210
+ /* print out data for individual sentence */
211
+ /*------------------------------------------*/
212
+ int DEBUG=0;
213
+
214
+ /*------------------------------------------*/
215
+ /* MAX error */
216
+ /* Number of error to stop the process. */
217
+ /* This is useful if there could be */
218
+ /* tokenization error. */
219
+ /* The process will stop when this number*/
220
+ /* of errors are accumulated. */
221
+ /*------------------------------------------*/
222
+ int Max_error = DEFAULT_MAX_ERROR;
223
+
224
+ /*------------------------------------------*/
225
+ /* Cut-off length for statistics */
226
+ /* int TOT_cut_len = DEFAULT_CUT_LEN; */
227
+ /* (Defined above) */
228
+ /*------------------------------------------*/
229
+
230
+
231
+ /*------------------------------------------*/
232
+ /* unlabeled or labeled bracketing */
233
+ /* 0: unlabeled bracketing */
234
+ /* 1: labeled bracketing */
235
+ /*------------------------------------------*/
236
+ int F_label = 1;
237
+
238
+ /*------------------------------------------*/
239
+ /* Delete labels */
240
+ /* list of labels to be ignored. */
241
+ /* If it is a pre-terminal label, delete */
242
+ /* the word along with the brackets. */
243
+ /* If it is a non-terminal label, just */
244
+ /* delete the brackets (don't delete */
245
+ /* childrens). */
246
+ /*------------------------------------------*/
247
+ char *Delete_label[MAX_DELETE_LABEL];
248
+ int Delete_label_n = 0;
249
+
250
+ /*------------------------------------------*/
251
+ /* Delete labels for length calculation */
252
+ /* list of labels to be ignored for */
253
+ /* length calculation purpose */
254
+ /*------------------------------------------*/
255
+ char *Delete_label_for_length[MAX_DELETE_LABEL];
256
+ int Delete_label_for_length_n = 0;
257
+
258
+ /*------------------------------------------*/
259
+ /* Labels to be considered for misquote */
260
+ /* (could be possessive or quote) */
261
+ /*------------------------------------------*/
262
+ char *Quote_term[MAX_QUOTE_TERM];
263
+ int Quote_term_n = 0;
264
+
265
+ /*------------------------------------------*/
266
+ /* Equivalent labels, words */
267
+ /* the pairs are considered equivalent */
268
+ /* This is non-directional. */
269
+ /*------------------------------------------*/
270
+ s_equiv EQ_label[MAX_EQ_LABEL];
271
+ int EQ_label_n = 0;
272
+
273
+ s_equiv EQ_word[MAX_EQ_WORD];
274
+ int EQ_word_n = 0;
275
+
276
+
277
+ // added by djame
278
+ int spmrl_max_line_to_read=-1 ;
279
+ int spmrl_compact_view=0; // default : classic view
280
+ int spmrl_compact_view40=0; // if one, prints <40 sentence in compact view
281
+ int spmrl_count_bad_sent=0; // default no count
282
+ int spmrl_print_filename=0; // default not to print name
283
+
284
+ /************************/
285
+ /* Function return-type */
286
+ /************************/
287
+ int main();
288
+ void init_global();
289
+ void print_head();
290
+ void init();
291
+ void read_parameter_file();
292
+ void set_param();
293
+ int narg();
294
+ int read_line();
295
+
296
+ void pushb();
297
+ int popb();
298
+ int stackempty();
299
+
300
+ void calc_result(unsigned char *buf1,unsigned char *buf);
301
+ void fix_quote();
302
+ void reinsert_term();
303
+ void massage_data();
304
+ int massage_data_gold_only(); // djame: non destructive
305
+ void modify_label();
306
+ void individual_result();
307
+ void print_total();
308
+ void dsp_info();
309
+ int my_isspace(char c); // Djamé: added for debugging' sake
310
+
311
+ int is_terminator();
312
+ int is_deletelabel();
313
+ int is_deletelabel_for_length();
314
+ int is_quote_term();
315
+ int word_comp();
316
+ int label_comp();
317
+
318
+ void Error();
319
+ void Fatal();
320
+ void Usage();
321
+
322
+ /* ### provided by std headers
323
+ int fprintf();
324
+ int printf();
325
+ int atoi();
326
+ int fclose();
327
+ int sscanf();
328
+ */
329
+
330
+ /***********/
331
+ /* program */
332
+ /***********/
333
+ #define ARG_CHECK(st) if(!(*++(*argv) || (--argc && *++argv))){ \
334
+ fprintf(stderr,"Missing argument: %s\n",st); \
335
+ }
336
+
337
+
338
+ char *filename1, *filename2;
339
+ int
340
+ main(argc,argv)
341
+ int argc;
342
+ char *argv[];
343
+ {
344
+
345
+ FILE *fd1, *fd2;
346
+ unsigned char buff[MAX_SENT_LEN];
347
+ unsigned char buff1[MAX_SENT_LEN];
348
+ int quiet=0; // Djame
349
+ filename1=NULL;
350
+ filename2=NULL;
351
+
352
+
353
+ for(argc--,argv++;argc>0;argc--,argv++){
354
+ if(**argv == '-'){
355
+ while(*++(*argv)){
356
+ switch(**argv){
357
+
358
+ case 'h': /* help */
359
+ Usage();
360
+ exit(1);
361
+
362
+ case 'd': /* debug mode */
363
+ DEBUG = 1;
364
+ goto nextarg;
365
+
366
+ case 'D': /* debug mode */
367
+ DEBUG = 2;
368
+ goto nextarg;
369
+
370
+ case 'c': /* cut-off length */
371
+ ARG_CHECK("cut-off length for statistices");
372
+ TOT_cut_len = atoi(*argv);
373
+ fprintf(stderr,"cutoff %d\n",TOT_cut_len);
374
+ //exit(0);
375
+ goto nextarg;
376
+
377
+
378
+ case 'e': /* max error */
379
+ ARG_CHECK("number of error to kill");
380
+ Max_error = atoi(*argv);
381
+ goto nextarg;
382
+
383
+ case 'p': /* parameter file */
384
+ ARG_CHECK("parameter file");
385
+ read_parameter_file(*argv);
386
+ goto nextarg;
387
+ case 'K':
388
+ ARG_CHECK("Max nb of sentences to read");
389
+ spmrl_max_line_to_read=atoi(*argv);
390
+ goto nextarg;
391
+ case 'L': // added by djame to maintain compatibility with spmrl 2013 shared task's results extraction rules.
392
+ spmrl_compact_view=1;
393
+ goto nextarg;
394
+ case 'l': // added by djame to maintain compatibility with spmrl 2013 shared task's results extraction rules.
395
+ spmrl_compact_view=1;
396
+ spmrl_compact_view40=1;
397
+ goto nextarg;
398
+ case 'X': // added by djame : count skipping sentences (()) as bad sentence
399
+ spmrl_count_bad_sent=1;
400
+ goto nextarg;
401
+ case 'V': // added by djame to add gold_name vs test_file in the outpu
402
+ spmrl_print_filename=1;
403
+ goto nextarg;
404
+ default:
405
+ Usage();
406
+ exit(0);
407
+ }
408
+ }
409
+ } else {
410
+ if(filename1==NULL){
411
+ filename1 = *argv;
412
+ }else if(filename2==NULL){
413
+ filename2 = *argv;
414
+ }
415
+ }
416
+ nextarg: continue;
417
+ }
418
+
419
+ init_global();
420
+
421
+
422
+ if((fd1 = fopen(filename1,"r"))==NULL){
423
+ Fatal("Can't open gold file (%s)\n",filename1);
424
+ }
425
+ if((fd2 = fopen(filename2,"r"))==NULL){
426
+ Fatal("Can't open test file (%s)\n",filename2);
427
+ }
428
+
429
+ print_head();
430
+
431
+ for(Line=1;fgets(buff,MAX_SENT_LEN,fd1)!=NULL;Line++){
432
+
433
+ init();
434
+
435
+ /* READ 1 */
436
+ r_wn1 = read_line(buff,terminal1,quotterm1,&wn1,bracket1,&bn1);
437
+
438
+ strcpy(buff1,buff);
439
+
440
+ /* READ 2 */
441
+ if(fgets(buff,MAX_SENT_LEN,fd2)==NULL){
442
+ Error("Number of lines unmatch (too many lines in gold file)\n");
443
+ break;
444
+ }
445
+
446
+ read_line(buff,terminal2,quotterm2,&wn2,bracket2,&bn2);
447
+
448
+ /* Calculate result and print it */
449
+ calc_result(buff1,buff);
450
+
451
+ if(DEBUG>=1){
452
+ dsp_info();
453
+ }
454
+ // Added by djame
455
+ if (spmrl_max_line_to_read!=-1){
456
+ if ((Line+1) > spmrl_max_line_to_read ){
457
+ quiet=1;
458
+ break; // evaluate only spmrl_max_line_to_read -1 (to keep compatibility with lines )
459
+ }
460
+ }
461
+
462
+ }
463
+
464
+ if( (quiet==0) && (fgets(buff,MAX_SENT_LEN,fd2)!=NULL)){
465
+ Error("Number of lines unmatch (too many lines in test file)\n");
466
+ }
467
+
468
+ print_total();
469
+
470
+ return (0);
471
+ }
472
+
473
+
474
+ /*-----------------------------*/
475
+ /* initialize global variables */
476
+ /*-----------------------------*/
477
+ void
478
+ init_global()
479
+ {
480
+ TOTAL_bn1 = TOTAL_bn2 = TOTAL_match = 0;
481
+ TOTAL_sent = TOTAL_error_sent = TOTAL_skip_sent = TOTAL_comp_sent = 0;
482
+ TOTAL_word = TOTAL_correct_tag = 0;
483
+ TOTAL_crossing = 0;
484
+ TOTAL_no_crossing = TOTAL_2L_crossing = 0;
485
+
486
+ TOT40_bn1 = TOT40_bn2 = TOT40_match = 0;
487
+ TOT40_sent = TOT40_error_sent = TOT40_skip_sent = TOT40_comp_sent = 0;
488
+ TOT40_word = TOT40_correct_tag = 0;
489
+ TOT40_crossing = 0;
490
+ TOT40_no_crossing = TOT40_2L_crossing = 0;
491
+
492
+ }
493
+
494
+
495
+ /*------------------*/
496
+ /* print head title */
497
+ /*------------------*/
498
/*
 * Write the three-line column header of the per-sentence result table
 * to standard output.
 */
void
print_head()
{
  static const char *const header[] = {
    " Sent. Matched Bracket Cross Correct Tag\n",
    " ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy\n",
    "============================================================================\n"
  };
  unsigned int row;

  for(row=0;row<sizeof(header)/sizeof(header[0]);row++){
    fputs(header[row],stdout);
  }
}
505
+
506
+
507
+ /*-----------------------------------------------*/
508
+ /* initialization at each individual computation */
509
+ /*-----------------------------------------------*/
510
+ void
511
+ init()
512
+ {
513
+ int i;
514
+
515
+ wn1 = 0;
516
+ wn2 = 0;
517
+ bn1 = 0;
518
+ bn2 = 0;
519
+ r_bn1 = 0;
520
+ r_bn2 = 0;
521
+
522
+ for(i=0;i<MAX_WORD_IN_SENT;i++){
523
+ terminal1[i].word[0] = '\0';
524
+ terminal1[i].label[0] = '\0';
525
+ terminal1[i].result = 9;
526
+ terminal2[i].word[0] = '\0';
527
+ terminal2[i].label[0] = '\0';
528
+ terminal2[i].result = 9;
529
+ }
530
+
531
+ for(i=0;i<MAX_QUOTE_TERM;i++){
532
+ quotterm1[i].term.word[0] = '\0';
533
+ quotterm1[i].term.label[0] = '\0';
534
+ quotterm1[i].term.result = 9;
535
+ quotterm1[i].index = -1;
536
+ quotterm1[i].bracket = -1;
537
+ quotterm2[i].term.word[0] = '\0';
538
+ quotterm2[i].term.label[0] = '\0';
539
+ quotterm2[i].term.result = 9;
540
+ quotterm2[i].index = -1;
541
+ quotterm2[i].bracket = -1;
542
+ }
543
+
544
+ for(i=0;i<MAX_BRACKET_IN_SENT;i++){
545
+ bracket1[i].start = -1;
546
+ bracket1[i].end = -1;
547
+ bracket1[i].label[0] = '\0';
548
+ bracket1[i].result = 9;
549
+ bracket2[i].start = -1;
550
+ bracket2[i].end = -1;
551
+ bracket2[i].label[0] = '\0';
552
+ bracket2[i].result = 9;
553
+ }
554
+
555
+ Status = 0;
556
+ }
557
+
558
+ /*----------------*/
559
+ /* parameter file */
560
+ /*----------------*/
561
+ void
562
+ read_parameter_file(filename)
563
+ char *filename;
564
+ {
565
+ char buff[MAX_LINE_LEN];
566
+ FILE *fd;
567
+ int line;
568
+ int i;
569
+
570
+ if((fd=fopen(filename,"r"))==NULL){
571
+ Fatal("Can't open parameter file (%s)\n",filename);
572
+ }
573
+
574
+ for(line=1;fgets(buff,MAX_LINE_LEN,fd)!=NULL;line++){
575
+
576
+ /* clean up the tail and find unvalid line */
577
+ /*-----------------------------------------*/
578
+ for(i=strlen(buff)-1;i>0 && (isspace(buff[i]) || buff[i]=='\n');i--){
579
+ buff[i]='\0';
580
+ }
581
+ if(buff[0]=='#' || /* comment-line */
582
+ strlen(buff)<3){ /* too short, just ignore */
583
+ continue;
584
+ }
585
+
586
+ /* place the parameter and value */
587
+ /*-------------------------------*/
588
+ for(i=0;!isspace(buff[i]);i++);
589
+ for(;isspace(buff[i]) && buff[i]!='\0';i++);
590
+ if(buff[i]=='\0'){
591
+ fprintf(stderr,"Empty value in parameter file (%d)\n",line);
592
+ }
593
+
594
+ /* set parameter and value */
595
+ /*-------------------------*/
596
+ set_param(buff,buff+i);
597
+ }
598
+
599
+ fclose(fd);
600
+ }
601
+
602
+
603
+ #define STRNCMP(s) (strncmp(param,s,strlen(s))==0 && \
604
+ (param[strlen(s)]=='\0' || isspace(param[strlen(s)])))
605
+
606
+
607
+ void
608
+ set_param(param,value)
609
+ char *param, *value;
610
+ {
611
+ char l1[MAX_LABEL_LEN], l2[MAX_LABEL_LEN];
612
+
613
+ if(STRNCMP("DEBUG")){
614
+
615
+ DEBUG = atoi(value);
616
+
617
+ }else if(STRNCMP("MAX_ERROR")){
618
+
619
+ Max_error = atoi(value);
620
+
621
+ }else if(STRNCMP("CUTOFF_LEN")){
622
+
623
+ TOT_cut_len = atoi(value);
624
+
625
+ }else if(STRNCMP("LABELED")){
626
+
627
+ F_label = atoi(value);
628
+
629
+ }else if(STRNCMP("DELETE_LABEL")){
630
+
631
+ Delete_label[Delete_label_n] = (char *)malloc(strlen(value)+1);
632
+ strcpy(Delete_label[Delete_label_n],value);
633
+ Delete_label_n++;
634
+
635
+ }else if(STRNCMP("DELETE_LABEL_FOR_LENGTH")){
636
+
637
+ Delete_label_for_length[Delete_label_for_length_n] = (char *)malloc(strlen(value)+1);
638
+ strcpy(Delete_label_for_length[Delete_label_for_length_n],value);
639
+ Delete_label_for_length_n++;
640
+
641
+ }else if(STRNCMP("QUOTE_LABEL")){
642
+
643
+ Quote_term[Quote_term_n] = (char *)malloc(strlen(value)+1);
644
+ strcpy(Quote_term[Quote_term_n],value);
645
+ Quote_term_n++;
646
+
647
+ }else if(STRNCMP("EQ_LABEL")){
648
+
649
+ if(narg(value)!=2){
650
+ fprintf(stderr,"EQ_LABEL requires two values\n");
651
+ return;
652
+ }
653
+ sscanf(value,"%s %s",l1,l2);
654
+ EQ_label[EQ_label_n].s1 = (char *)malloc(strlen(l1)+1);
655
+ strcpy(EQ_label[EQ_label_n].s1,l1);
656
+ EQ_label[EQ_label_n].s2 = (char *)malloc(strlen(l2)+1);
657
+ strcpy(EQ_label[EQ_label_n].s2,l2);
658
+ EQ_label_n++;
659
+
660
+ }else if(STRNCMP("EQ_WORD")){
661
+
662
+ if(narg(value)!=2){
663
+ fprintf(stderr,"EQ_WORD requires two values\n");
664
+ return;
665
+ }
666
+ sscanf(value,"%s %s",l1,l2);
667
+ EQ_word[EQ_word_n].s1 = (char *)malloc(strlen(l1)+1);
668
+ strcpy(EQ_word[EQ_word_n].s1,l1);
669
+ EQ_word[EQ_word_n].s2 = (char *)malloc(strlen(l2)+1);
670
+ strcpy(EQ_word[EQ_word_n].s2,l2);
671
+ EQ_word_n++;
672
+
673
+ }else{
674
+
675
+ fprintf(stderr,"Unknown keyword (%s) in parameter file\n",param);
676
+
677
+ }
678
+ }
679
+
680
+
681
/*
 * Count the whitespace-separated tokens in s.
 *
 *   s  NUL-terminated string
 *
 * Returns the number of tokens; 0 for an empty or all-whitespace string.
 */
int
narg(s)
char *s;
{
  int n;

  for(n=0;*s!='\0';){
    /* skip the separator; the cast keeps isspace() defined for bytes
       above 0x7f (e.g. UTF-8) on platforms where char is signed */
    for(;isspace((unsigned char)*s);s++);
    if(*s=='\0'){
      break;
    }
    n++;
    /* skip the token itself */
    for(;*s!='\0' && !isspace((unsigned char)*s);s++);
  }

  return(n);
}
702
+
703
+ /*-----------------------------*/
704
+ /* Read line and gather data. */
705
+ /* Return length of sentence. */
706
+ /*-----------------------------*/
707
+ int
708
+ read_line(buff, terminal, quotterm, wn, bracket, bn)
709
+ char *buff;
710
+ s_terminal terminal[];
711
+ s_term_ind quotterm[];
712
+ int *wn;
713
+ s_bracket bracket[];
714
+ int *bn;
715
+ {
716
+ char *p, *q, label[MAX_LABEL_LEN], word[MAX_WORD_LEN];
717
+ int qt; /* quote term counter */
718
+ int wid, bid; /* word ID, bracket ID */
719
+ int n; /* temporary remembering the position */
720
+ int b; /* temporary remembering bid */
721
+ int i;
722
+ int len; /* length of the sentence */
723
+
724
+ len = 0;
725
+ stack_top=0;
726
+
727
+ for(p=buff,qt=0,wid=0,bid=0;*p!='\0';){
728
+
729
+ if(isspace(*p)){
730
+ p++;
731
+ continue;
732
+
733
+ /* open bracket */
734
+ /*--------------*/
735
+ }else if(*p=='('){
736
+
737
+ n=wid;
738
+ for(p++,i=0;!is_terminator(*p);p++,i++){
739
+ label[i]=*p;
740
+ }
741
+ label[i]='\0';
742
+
743
+ /* Find terminals */
744
+ q = p;
745
+ if(isspace(*q)){
746
+ for(q++;isspace(*q);q++);
747
+ for(i=0;!is_terminator(*q);q++,i++){
748
+ word[i]=*q;
749
+ }
750
+ word[i]='\0';
751
+
752
+ /* compute length */
753
+ if(*q==')' && !is_deletelabel_for_length(label)==1){
754
+ len++;
755
+ }
756
+ if (DEBUG>1)
757
+ printf("label=%s, word=%s, wid=%d\n",label,word,wid);
758
+ /* quote terminal */
759
+ if(*q==')' && is_quote_term(label,word)==1){
760
+ strcpy(quotterm[qt].term.word,word);
761
+ strcpy(quotterm[qt].term.label,label);
762
+ quotterm[qt].index = wid;
763
+ quotterm[qt].bracket = bid;
764
+ quotterm[qt].endslen = stack_top;
765
+ //quotterm[qt].ends = (int*)malloc(stack_top*sizeof(int));
766
+ memcpy(quotterm[qt].ends,stack,stack_top*sizeof(int));
767
+ qt++;
768
+ }
769
+
770
+ /* Slav: do not delete terminals */
771
+ /* delete terminal */
772
+ //if(*q==')' && is_deletelabel(label)==1){
773
+ // p = q+1;
774
+ // continue;
775
+
776
+ /* valid terminal */
777
+ //}else
778
+ if(*q==')'){
779
+ strcpy(terminal[wid].word,word);
780
+ strcpy(terminal[wid].label,label);
781
+ wid++;
782
+ p = q+1;
783
+ continue;
784
+
785
+ /* error */
786
+ }else if(*q!='('){
787
+ fprintf(stderr,"debug djam: q= %s\n",q);
788
+ Error("More than two elements in a bracket\n");
789
+ }
790
+ }
791
+
792
+ /* otherwise non-terminal label */
793
+ bracket[bid].start = wid;
794
+ bracket[bid].buf_start = p-buff;
795
+ strcpy(bracket[bid].label,label);
796
+ pushb(bid);
797
+ bid++;
798
+
799
+ /* close bracket */
800
+ /*---------------*/
801
+ }else if(*p==')'){
802
+
803
+ b = popb();
804
+ bracket[b].end = wid;
805
+ bracket[b].buf_end = p-buff;
806
+ p++;
807
+
808
+ /* error */
809
+ /*-------*/
810
+ }else{
811
+
812
+ Error("Reading sentence\n");
813
+ }
814
+ }
815
+
816
+ if(!stackempty()){
817
+ Error("Bracketing is unbalanced (too many open bracket)\n");
818
+ }
819
+
820
+ *wn = wid;
821
+ *bn = bid;
822
+
823
+ return(len);
824
+ }
825
+
826
+
827
+ /*----------------------*/
828
+ /* stack operation */
829
+ /* for bracketing pairs */
830
+ /*----------------------*/
831
+ void
832
+ pushb(item)
833
+ int item;
834
+ {
835
+ stack[stack_top++]=item;
836
+ }
837
+
838
+ int
839
+ popb()
840
+ {
841
+ int item;
842
+
843
+ item = stack[stack_top-1];
844
+
845
+ if(stack_top-- < 0){
846
+ Error("Bracketing unbalance (too many close bracket)\n");
847
+ }
848
+ return(item);
849
+ }
850
+
851
+ int
852
+ stackempty()
853
+ {
854
+ if(stack_top==0){
855
+ return(1);
856
+ }else{
857
+ return(0);
858
+ }
859
+ }
860
+
861
+
862
+ /*------------------*/
863
+ /* calculate result */
864
+ /*------------------*/
865
+ void
866
+ calc_result(unsigned char *buf1,unsigned char *buf)
867
+ {
868
+ int i, j, l;
869
+ int match, crossing, correct_tag;
870
+
871
+ int last_i = -1;
872
+
873
+ char my_buf[10000]; //djame: was 1000
874
+ int match_found = 0;
875
+
876
+ char match_j[2000]; //djame was : 200
877
+ for (j = 0; j < bn2; ++j) {
878
+ match_j[j] = 0;
879
+ }
880
+
881
+ /* ML */
882
+ if (DEBUG>1)
883
+ printf("\n");
884
+
885
+
886
+ /* Find skip and error */
887
+ /*---------------------*/
888
+ if(wn2==0){ // Djame: case of empty lines
889
+ if (spmrl_count_bad_sent==1){
890
+ Status = 3;
891
+ //individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag);
892
+ int n_bracket_gold=massage_data_gold_only();
893
+ r_bn1=n_bracket_gold;
894
+ individual_result(wn1,n_bracket_gold,0,0,0,0); // testing the case of missing analysis was 0,0
895
+ }else {
896
+ Status=2;
897
+ individual_result(0,0,0,0,0,0);
898
+ }
899
+
900
+ return;
901
+ }
902
+
903
+ if(wn1 != wn2){
904
+ //if (DEBUG>1)
905
+ //Error("Length unmatch (%d|%d)\n",wn1,wn2);
906
+ fix_quote();
907
+ if(wn1 != wn2){
908
+ individual_result(0,0,0,0,0,0);
909
+ /* Slav: ignore 1 word sentences */
910
+ if (wn1 > 1) {
911
+ Error("Length unmatch (%d|%d)\n",wn1,wn2);
912
+ return;
913
+ }
914
+ }
915
+ }
916
+
917
+ for(i=0;i<wn1;i++){
918
+ if(word_comp(terminal1[i].word,terminal2[i].word)==0){
919
+ Error("Words unmatch (%s|%s)\n",terminal1[i].word,
920
+ terminal2[i].word);
921
+ individual_result(0,0,0,0,0,0);
922
+ return;
923
+ }
924
+ }
925
+
926
+ /* massage the data */
927
+ /*------------------*/
928
+ massage_data();
929
+
930
+ /* matching brackets */
931
+ /*-------------------*/
932
+ match = 0;
933
+ for(i=0;i<bn1;i++){
934
+ for(j=0;j<bn2;j++){
935
+
936
+ if (DEBUG>1)
937
+ printf("1.res=%d, 2.res=%d, 1.start=%d, 2.start=%d, 1.end=%d, 2.end=%d\n",bracket1[i].result,bracket2[j].result,bracket1[i].start,bracket2[j].start,bracket1[i].end,bracket2[j].end);
938
+
939
+ // does bracket match?
940
+ if(bracket1[i].result != 5 &&
941
+ bracket2[j].result == 0 &&
942
+ bracket1[i].start == bracket2[j].start && bracket1[i].end == bracket2[j].end) {
943
+
944
+ // (1) do we not care about the label or (2) does the label match?
945
+ if (F_label==0 || label_comp(bracket1[i].label,bracket2[j].label)==1) {
946
+ bracket1[i].result = bracket2[j].result = 1;
947
+ match++;
948
+ match_found = 1;
949
+ break;
950
+ } else {
951
+ if (DEBUG>1) {
952
+ printf(" LABEL[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
953
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
954
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
955
+ my_buf[l] = '\0';
956
+ printf("%s\n",my_buf);
957
+ }
958
+ match_found = 1;
959
+ match_j[j] = 1;
960
+ }
961
+ }
962
+ }
963
+
964
+ if (!match_found && bracket1[i].result != 5 && DEBUG>1) {
965
+ /* ### ML 09/28/03: gold bracket with no corresponding test bracket */
966
+ printf(" BRACKET[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
967
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
968
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
969
+ my_buf[l] = '\0';
970
+ printf("%s\n",my_buf);
971
+ }
972
+ match_found = 0;
973
+ }
974
+
975
+ for(j=0;j<bn2;j++){
976
+ if (bracket2[j].result==0 && !match_j[j] && DEBUG>1) {
977
+ /* test bracket with no corresponding gold bracket */
978
+ printf(" EXTRA[%d-%d]: ",bracket2[j].start,bracket2[j].end-1);
979
+ l = bracket2[j].buf_end-bracket2[j].buf_start;
980
+ strncpy(my_buf,buf+bracket2[j].buf_start,l);
981
+ my_buf[l] = '\0';
982
+ printf("%s\n",my_buf);
983
+ }
984
+ }
985
+
986
+ /* crossing */
987
+ /*----------*/
988
+ crossing = 0;
989
+
990
+ /* crossing is counted based on the brackets */
991
+ /* in test rather than gold file (by Mike) */
992
+ for(j=0;j<bn2;j++){
993
+ for(i=0;i<bn1;i++){
994
+ if(bracket1[i].result != 5 &&
995
+ bracket2[j].result != 5 &&
996
+ ((bracket1[i].start < bracket2[j].start &&
997
+ bracket1[i].end > bracket2[j].start &&
998
+ bracket1[i].end < bracket2[j].end) ||
999
+ (bracket1[i].start > bracket2[j].start &&
1000
+ bracket1[i].start < bracket2[j].end &&
1001
+ bracket1[i].end > bracket2[j].end))){
1002
+
1003
+ /* ### ML 09/01/03: get details on cross-brackettings */
1004
+ if (i != last_i) {
1005
+ if (DEBUG>1) {
1006
+ printf(" CROSSING[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
1007
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
1008
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
1009
+ my_buf[l] = '\0';
1010
+ printf("%s\n",my_buf);
1011
+
1012
+ /* ML
1013
+ printf("\n CROSSING at bracket %d:\n",i-1);
1014
+ printf(" GOLD (tokens %d-%d): ",bracket1[i].start,bracket1[i].end-1);
1015
+ l = bracket1[i].buf_end-bracket1[i].buf_start;
1016
+ strncpy(my_buf,buf1+bracket1[i].buf_start,l);
1017
+ my_buf[l] = '\0';
1018
+ printf("%s\n",my_buf);
1019
+ */
1020
+ }
1021
+ last_i = i;
1022
+ }
1023
+
1024
+ /* ML
1025
+ printf(" TEST (tokens %d-%d): ",bracket2[j].start,bracket2[j].end-1);
1026
+ l = bracket2[j].buf_end-bracket2[j].buf_start;
1027
+ strncpy(my_buf,buf+bracket2[j].buf_start,l);
1028
+ my_buf[l] = '\0';
1029
+ printf("%s\n",my_buf);
1030
+ */
1031
+
1032
+ crossing++;
1033
+ break;
1034
+ }
1035
+ }
1036
+ }
1037
+
1038
+ /* Tagging accuracy */
1039
+ /*------------------*/
1040
+ correct_tag=0;
1041
+ for(i=0;i<wn1;i++){
1042
+ if(label_comp(terminal1[i].label,terminal2[i].label)==1){
1043
+ terminal1[i].result = terminal2[i].result = 1;
1044
+ correct_tag++;
1045
+ } else {
1046
+ terminal1[i].result = terminal2[i].result = 0;
1047
+ }
1048
+ }
1049
+
1050
+ individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag);
1051
+ }
1052
+
1053
+ void
1054
+ fix_quote()
1055
+ {
1056
+ int i,j,k;
1057
+ if (DEBUG>1) {
1058
+ for(i=0;i<MAX_QUOTE_TERM;i++){
1059
+ if (quotterm1[i].index!=-1)
1060
+ printf("%d: %s - %s\n",quotterm1[i].index,
1061
+ quotterm1[i].term.label,
1062
+ quotterm1[i].term.word);
1063
+ if (quotterm2[i].index!=-1)
1064
+ printf("%d: %s - %s\n",quotterm2[i].index,
1065
+ quotterm2[i].term.label,
1066
+ quotterm2[i].term.word);
1067
+ }
1068
+ }
1069
+ for(i=0;i<MAX_QUOTE_TERM;i++) {
1070
+ int ind = quotterm2[i].index;
1071
+ if (ind!=-1) {
1072
+ for(j=0;j<MAX_QUOTE_TERM;j++){
1073
+ if (quotterm1[j].index==ind &&
1074
+ strcmp(quotterm1[j].term.label,
1075
+ quotterm2[i].term.label)!=0) {
1076
+ if (is_deletelabel(quotterm1[j].term.label) && !is_deletelabel(quotterm2[i].term.label)) {
1077
+ reinsert_term(&quotterm1[j],terminal1,bracket1,&wn1);
1078
+ for (k=j;k<MAX_QUOTE_TERM;k++)
1079
+ if (quotterm1[k].index!=-1)
1080
+ quotterm1[k].index++;
1081
+ } else if (is_deletelabel(quotterm2[i].term.label) && !is_deletelabel(quotterm1[j].term.label)) {
1082
+ reinsert_term(&quotterm2[i],terminal2,bracket2,&wn2);
1083
+ for (k=i;k<MAX_QUOTE_TERM;k++)
1084
+ if (quotterm2[k].index!=-1)
1085
+ quotterm2[k].index++;
1086
+ }
1087
+ }
1088
+ }
1089
+ } else break;
1090
+ }
1091
+ }
1092
+
1093
+ void
1094
+ reinsert_term(quot,terminal,bracket,wn)
1095
+ s_term_ind* quot;
1096
+ s_terminal terminal[];
1097
+ s_bracket bracket[];
1098
+ int* wn;
1099
+ {
1100
+ int ind = quot->index;
1101
+ int bra = quot->bracket;
1102
+ s_terminal* term = &quot->term;
1103
+ int k;
1104
+ memmove(&terminal[ind+1],
1105
+ &terminal[ind],
1106
+ sizeof(s_terminal)*(MAX_WORD_IN_SENT-ind-1));
1107
+ strcpy(terminal[ind].label,term->label);
1108
+ strcpy(terminal[ind].word,term->word);
1109
+ (*wn)++;
1110
+ if (DEBUG>1)
1111
+ printf("bra=%d, ind=%d\n",bra,ind);
1112
+ for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
1113
+ if (bracket[k].start==-1)
1114
+ break;
1115
+ if (DEBUG>1)
1116
+ printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
1117
+ if (k>=bra) {
1118
+ bracket[k].start++;
1119
+ bracket[k].end++;
1120
+ }
1121
+ //if (bracket[k].start<=ind && bracket[k].end>=ind)
1122
+ //bracket[k].end++;
1123
+ }
1124
+ if (DEBUG>1)
1125
+ printf("endslen=%d\n",quot->endslen);
1126
+ for(k=0;k<quot->endslen;k++) {
1127
+ //printf("ends[%d]=%d",k,quot->ends[k]);
1128
+ bracket[quot->ends[k]].end++;
1129
+ }
1130
+ //free(quot->ends);
1131
+ }
1132
+ /*
1133
+ void
1134
+ adjust_end(ind,bra)
1135
+ int ind;
1136
+ int bra;
1137
+ {
1138
+ for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
1139
+ if (bracket[k].start==-1)
1140
+ break;
1141
+ printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
1142
+ if (k>=bra)
1143
+ bracket[k].end++;
1144
+ }
1145
+ }
1146
+ */
1147
+
1148
+
1149
+
1150
+ int massage_data_gold_only(){
1151
+ int i, j;
1152
+ int gold_valid_bracket=0;
1153
+ char buflabel[MAX_LABEL_LEN]; // djame
1154
+ /* for GOLD */
1155
+ /*----------*/
1156
+ for(i=0;i<bn1;i++){
1157
+
1158
+ bracket1[i].result = 0;
1159
+
1160
+ /* Zero element */
1161
+ if(bracket1[i].start == bracket1[i].end){
1162
+ //bracket1[i].result = bracket1[i].result; // was 5
1163
+ continue;
1164
+ }else {
1165
+ gold_valid_bracket++;
1166
+ }
1167
+
1168
+
1169
+ /* Modify label */
1170
+ strcpy(buflabel,bracket1[i].label); //djame
1171
+ modify_label(buflabel); // Djamé will be called twice
1172
+
1173
+ /* Delete label */
1174
+ for(j=0;j<Delete_label_n;j++){
1175
+ if(label_comp(buflabel,Delete_label[j])!=1){
1176
+ gold_valid_bracket++;
1177
+ }
1178
+ }
1179
+ }
1180
+
1181
+ return gold_valid_bracket;
1182
+ }
1183
+
1184
+
1185
+
1186
+
1187
+
1188
+ void
1189
+ massage_data()
1190
+ {
1191
+ int i, j;
1192
+
1193
+ /* for GOLD */
1194
+ /*----------*/
1195
+ for(i=0;i<bn1;i++){
1196
+
1197
+ bracket1[i].result = 0;
1198
+
1199
+ /* Zero element */
1200
+ if(bracket1[i].start == bracket1[i].end){
1201
+ bracket1[i].result = 5;
1202
+ continue;
1203
+ }
1204
+
1205
+ /* Modify label */
1206
+ modify_label(bracket1[i].label);
1207
+
1208
+ /* Delete label */
1209
+ for(j=0;j<Delete_label_n;j++){
1210
+ if(label_comp(bracket1[i].label,Delete_label[j])==1){
1211
+ bracket1[i].result = 5;
1212
+ }
1213
+ }
1214
+ }
1215
+
1216
+ /* for TEST */
1217
+ /*----------*/
1218
+ for(i=0;i<bn2;i++){
1219
+
1220
+ bracket2[i].result = 0;
1221
+
1222
+ /* Zero element */
1223
+ if(bracket2[i].start == bracket2[i].end){
1224
+ bracket2[i].result = 5;
1225
+ continue;
1226
+ }
1227
+
1228
+ /* Modify label */
1229
+ modify_label(bracket2[i].label);
1230
+
1231
+ /* Delete label */
1232
+ for(j=0;j<Delete_label_n;j++){
1233
+ if(label_comp(bracket2[i].label,Delete_label[j])==1){
1234
+ bracket2[i].result = 5;
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+
1240
+ /* count up real number of brackets (exclude deleted ones) */
1241
+ /*---------------------------------------------------------*/
1242
+ r_bn1 = r_bn2 = 0;
1243
+
1244
+ for(i=0;i<bn1;i++){
1245
+ if(bracket1[i].result != 5){
1246
+ r_bn1++;
1247
+ }
1248
+ }
1249
+
1250
+ for(i=0;i<bn2;i++){
1251
+ if(bracket2[i].result != 5){
1252
+ r_bn2++;
1253
+ }
1254
+ }
1255
+ }
1256
+
1257
+
1258
+ /*------------------------*/
1259
+ /* trim the tail of label */
1260
+ /*------------------------*/
1261
/*
 * Truncate a label, in place, at its first '-', '=' or '#'
 * (the characters that introduce function tags / morph features).
 * A label containing none of them is left untouched.
 */
void
modify_label(label)
char *label;
{
    char *cut = strpbrk(label, "-=#");

    if(cut != NULL){
        *cut = '\0';
    }
}
1274
+
1275
+
1276
+ /*-----------------------------------------------*/
1277
+ /* add individual statistics to TOTAL statictics */
1278
+ /*-----------------------------------------------*/
1279
+ void
1280
+ individual_result(wn1,bn1,bn2,match,crossing,correct_tag)
1281
+ int wn1,bn1,bn2,match,crossing,correct_tag;
1282
+ {
1283
+
1284
+ /* Statistics for ALL */
1285
+ /*--------------------*/
1286
+ TOTAL_sent++;
1287
+ if(Status==1){
1288
+ TOTAL_error_sent++;
1289
+ }else if(Status==2){
1290
+ TOTAL_skip_sent++;
1291
+ }else{
1292
+ TOTAL_bn1 += bn1;
1293
+ TOTAL_bn2 += bn2;
1294
+ TOTAL_match += match;
1295
+ if(bn1==bn2 && bn2==match){
1296
+ TOTAL_comp_sent++;
1297
+ }
1298
+ TOTAL_word += wn1;
1299
+ TOTAL_crossing += crossing;
1300
+ if(crossing==0){
1301
+ TOTAL_no_crossing++;
1302
+ }
1303
+ if(crossing <= 2){
1304
+ TOTAL_2L_crossing++;
1305
+ }
1306
+ TOTAL_correct_tag += correct_tag;
1307
+ }
1308
+
1309
+
1310
+ /* Statistics for sent length <= TOT_cut_len */
1311
+ /*-------------------------------------------*/
1312
+ //fprintf(stderr,"cut-off %d\n",TOT_cut_len);
1313
+ //exit(0);
1314
+ if(r_wn1<=TOT_cut_len){
1315
+ TOT40_sent++;
1316
+ if(Status==1){
1317
+ TOT40_error_sent++;
1318
+ }else if(Status==2){
1319
+ TOT40_skip_sent++;
1320
+ }else{
1321
+ TOT40_bn1 += bn1;
1322
+ TOT40_bn2 += bn2;
1323
+ TOT40_match += match;
1324
+ if(bn1==bn2 && bn2==match){
1325
+ TOT40_comp_sent++;
1326
+ }
1327
+ TOT40_word += wn1;
1328
+ TOT40_crossing += crossing;
1329
+ if(crossing==0){
1330
+ TOT40_no_crossing++;
1331
+ }
1332
+ if(crossing <= 2){
1333
+ TOT40_2L_crossing++;
1334
+ }
1335
+ TOT40_correct_tag += correct_tag;
1336
+ }
1337
+ }
1338
+
1339
+ /* Print individual result */
1340
+ /*-------------------------*/
1341
+ printf("%4d %3d %d ",Line,r_wn1,Status);
1342
+ printf("%6.2f %6.2f %3d %3d %3d %3d",
1343
+ (r_bn1==0?0.0:100.0*match/r_bn1),
1344
+ (r_bn2==0?0.0:100.0*match/r_bn2),
1345
+ match, r_bn1, r_bn2, crossing);
1346
+
1347
+ printf(" %4d %4d %6.2f\n",wn1,correct_tag,
1348
+ (wn1==0?0.0:100.0*correct_tag/wn1));
1349
+ }
1350
+
1351
+
1352
+ /*------------------------*/
1353
+ /* print total statistics */
1354
+ /*------------------------*/
1355
+ void
1356
+ print_total()
1357
+ {
1358
+ int sentn;
1359
+ double r,p,f;
1360
+ FILE *file;
1361
+
1362
+
1363
+
1364
+ r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0;
1365
+ p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0;
1366
+ f = 2*p*r/(p+r);
1367
+
1368
+ if (spmrl_compact_view == 0){
1369
+
1370
+ printf("============================================================================\n");
1371
+
1372
+ if(TOTAL_bn1>0 && TOTAL_bn2>0){
1373
+ printf(" %6.2f %6.2f %6d %5d %5d %5d",
1374
+ (TOTAL_bn1>0?100.0*TOTAL_match/TOTAL_bn1:0.0),
1375
+ (TOTAL_bn2>0?100.0*TOTAL_match/TOTAL_bn2:0.0),
1376
+ TOTAL_match,
1377
+ TOTAL_bn1,
1378
+ TOTAL_bn2,
1379
+ TOTAL_crossing);
1380
+ }
1381
+
1382
+ printf(" %5d %5d %6.2f",
1383
+ TOTAL_word,
1384
+ TOTAL_correct_tag,
1385
+ (TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
1386
+
1387
+ printf("\n");
1388
+ if (spmrl_print_filename==0){
1389
+ printf("=== Summary ===\n");
1390
+ }else {
1391
+ printf("=== Summary: %s\tvs\t%s ===\n",filename1,filename2);
1392
+ }
1393
+
1394
+
1395
+ sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent;
1396
+
1397
+ printf("\n-- All --\n");
1398
+ printf("Number of sentence = %6d\n",TOTAL_sent);
1399
+ printf("Number of Error sentence = %6d\n",TOTAL_error_sent);
1400
+ printf("Number of Skip sentence = %6d\n",TOTAL_skip_sent);
1401
+ printf("Number of Valid sentence = %6d\n",sentn);
1402
+
1403
+ //r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0;
1404
+ printf("Bracketing Recall = %6.2f\n",r);
1405
+
1406
+ // p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0;
1407
+ printf("Bracketing Precision = %6.2f\n",p);
1408
+
1409
+ // f = 2*p*r/(p+r);
1410
+ printf("Bracketing FMeasure = %6.2f\n",f);
1411
+
1412
+ printf("Complete match = %6.2f\n",
1413
+ (sentn>0?100.0*TOTAL_comp_sent/sentn:0.0));
1414
+ printf("Average crossing = %6.2f\n",
1415
+ (sentn>0?1.0*TOTAL_crossing/sentn:0.0));
1416
+ printf("No crossing = %6.2f\n",
1417
+ (sentn>0?100.0*TOTAL_no_crossing/sentn:0.0));
1418
+ printf("2 or less crossing = %6.2f\n",
1419
+ (sentn>0?100.0*TOTAL_2L_crossing/sentn:0.0));
1420
+ printf("Tagging accuracy = %6.2f\n",
1421
+ (TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
1422
+
1423
+ // Write stats also to a file.
1424
+ file = fopen("status", "w");
1425
+ fprintf(file, "---\n");
1426
+ fprintf(file, "F1: %.2f\n", f);
1427
+ fprintf(file, "LP: %.2f\n", p);
1428
+ fprintf(file, "LR: %.2f\n", r);
1429
+ fprintf(file, "POS: %.2f\n",
1430
+ (TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
1431
+ fprintf(file, "errorRate: %.2f\n", 100-f);
1432
+ fclose(file);
1433
+
1434
+ sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent;
1435
+
1436
+ printf("\n-- len<=%d --\n",TOT_cut_len);
1437
+ printf("Number of sentence = %6d\n",TOT40_sent);
1438
+ printf("Number of Error sentence = %6d\n",TOT40_error_sent);
1439
+ printf("Number of Skip sentence = %6d\n",TOT40_skip_sent);
1440
+ printf("Number of Valid sentence = %6d\n",sentn);
1441
+
1442
+
1443
+ r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0;
1444
+ printf("Bracketing Recall = %6.2f\n",r);
1445
+
1446
+ p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0;
1447
+ printf("Bracketing Precision = %6.2f\n",p);
1448
+
1449
+ f = 2*p*r/(p+r);
1450
+ printf("Bracketing FMeasure = %6.2f\n",f);
1451
+
1452
+ printf("Complete match = %6.2f\n",
1453
+ (sentn>0?100.0*TOT40_comp_sent/sentn:0.0));
1454
+ printf("Average crossing = %6.2f\n",
1455
+ (sentn>0?1.0*TOT40_crossing/sentn:0.0));
1456
+ printf("No crossing = %6.2f\n",
1457
+ (sentn>0?100.0*TOT40_no_crossing/sentn:0.0));
1458
+ printf("2 or less crossing = %6.2f\n",
1459
+ (sentn>0?100.0*TOT40_2L_crossing/sentn:0.0));
1460
+ printf("Tagging accuracy = %6.2f\n",
1461
+ (TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0));
1462
+ }else { // else spmrl_compact_view
1463
+ if (spmrl_compact_view40 ==0){
1464
+ double pos=(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0);
1465
+ sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent;
1466
+
1467
+ double EX=(sentn>0?100.0*TOTAL_comp_sent/sentn:0.0);
1468
+
1469
+ printf("F1: %6.2f %%\tPrec: %6.2f %%\tRec: %6.2f %%\t",f,r,p);
1470
+ printf("POS: %6.2f %%\tEX: %6.2f %%\tUnparsed: %6d\tSent: %6d\tfile: %s\n",pos,EX,TOTAL_skip_sent+TOTAL_error_sent,TOTAL_sent,filename2);// ICI
1471
+ }else {
1472
+
1473
+ r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0;
1474
+ p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0;
1475
+ f = 2*p*r/(p+r);
1476
+ double pos=(TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0);
1477
+ sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent;
1478
+ double EX=(sentn>0?100.0*TOT40_comp_sent/sentn:0.0);
1479
+
1480
+ printf("F1: %6.2f %%\tPrec: %6.2f %%\tRec: %6.2f %%\t",f,r,p);
1481
+ printf("POS: %6.2f %%\tEX: %6.2f %%\tUnparsed: %6d\tSent: %6d\tfile: %s\n",pos,EX,TOT40_skip_sent+TOT40_error_sent,TOT40_sent,filename2);// ICI<#statements#>
1482
+ }
1483
+
1484
+ }
1485
+
1486
+ }
1487
+
1488
+
1489
+ /*--------------------------------*/
1490
+ /* display individual information */
1491
+ /*--------------------------------*/
1492
+ void
1493
+ dsp_info()
1494
+ {
1495
+ int i, n;
1496
+
1497
+ printf("-<1>---(wn1=%3d, bn1=%3d)- ",wn1,bn1);
1498
+ printf("-<2>---(wn2=%3d, bn2=%3d)-\n",wn2,bn2);
1499
+
1500
+ n = (wn1>wn2?wn1:wn2);
1501
+
1502
+ for(i=0;i<n;i++){
1503
+ if(terminal1[i].word[0]!='\0'){
1504
+ printf("%3d : %d : %-6s %-16s ",i,terminal1[i].result,
1505
+ terminal1[i].label,terminal1[i].word);
1506
+ }else{
1507
+ printf(" ");
1508
+ }
1509
+
1510
+ if(terminal2[i].word[0]!='\0'){
1511
+ printf("%3d : %d : %-6s %-16s\n",i,terminal2[i].result,
1512
+ terminal2[i].label,terminal2[i].word);
1513
+ }else{
1514
+ printf("\n");
1515
+ }
1516
+ }
1517
+ printf("\n");
1518
+
1519
+ n = (bn1>bn2?bn1:bn2);
1520
+
1521
+ for(i=0;i<n;i++){
1522
+ if(bracket1[i].start != -1){
1523
+ printf("%3d : %d : %3d %3d %-6s ",i,bracket1[i].result,
1524
+ bracket1[i].start,bracket1[i].end,
1525
+ bracket1[i].label);
1526
+ } else {
1527
+ printf(" ");
1528
+ }
1529
+
1530
+ if(bracket2[i].start != -1){
1531
+ printf("%3d : %d : %3d %3d %-6s\n",i,bracket2[i].result,
1532
+ bracket2[i].start,bracket2[i].end,
1533
+ bracket2[i].label);
1534
+ } else {
1535
+ printf("\n");
1536
+ }
1537
+ }
1538
+ printf("\n");
1539
+
1540
+ printf("========\n");
1541
+
1542
+ }
1543
+
1544
+
1545
+ /*-----------------*/
1546
+ /* some predicates */
1547
+ /*-----------------*/
1548
+
1549
+
1550
+ // Djamé: reimplementing isspace (while digging bug in spmrl 2013 arabic gold dev line 616)
1551
/*
 * Locale-independent whitespace test.
 * Returns 1 for space, tab, carriage return, newline, vertical tab and
 * form feed; 0 for every other character.
 */
int my_isspace(char c){
    switch(c){
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\v':
    case '\f':
        return 1;
    default:
        return 0;
    }
}
1556
+
1557
+
1558
+
1559
+
1560
/*
 * Tell whether a character ends a token in a bracketed tree line.
 * Returns 1 for whitespace, '(' and ')'; 0 otherwise.
 *
 * The character is converted to unsigned char before being handed to
 * isspace(): calling isspace() with a negative value (any byte >= 0x80
 * where plain char is signed, e.g. inside UTF-8 text) is undefined.
 */
int
is_terminator(c)
char c;
{
    if(isspace((unsigned char)c) || c=='(' || c==')'){
        return(1);
    }else{
        return(0);
    }
}
1570
+
1571
+ int
1572
+ is_deletelabel(s)
1573
+ char *s;
1574
+ {
1575
+ int i;
1576
+
1577
+ for(i=0;i<Delete_label_n;i++){
1578
+ if(strcmp(s,Delete_label[i])==0){
1579
+ return(1);
1580
+ }
1581
+ }
1582
+
1583
+ return(0);
1584
+ }
1585
+
1586
+ int
1587
+ is_deletelabel_for_length(s)
1588
+ char *s;
1589
+ {
1590
+ int i;
1591
+
1592
+ for(i=0;i<Delete_label_for_length_n;i++){
1593
+ if(strcmp(s,Delete_label_for_length[i])==0){
1594
+ return(1);
1595
+ }
1596
+ }
1597
+
1598
+ return(0);
1599
+ }
1600
+
1601
+ int
1602
+ is_quote_term(s,w)
1603
+ char *s;
1604
+ char *w;
1605
+ {
1606
+ int i;
1607
+
1608
+ for(i=0;i<Quote_term_n;i++){
1609
+ if(strcmp(s,Quote_term[i])==0){
1610
+ // Djame : Arabic word contain quote
1611
+ if (strcmp(w,"'")==0 || strcmp(w,"\"")==0 || strcmp(w,"/")==0)
1612
+ //if (strcmp(w,"\"")==0 || strcmp(w,"/")==0)
1613
+ return(1);
1614
+ }
1615
+ }
1616
+
1617
+ return(0);
1618
+ }
1619
+
1620
+
1621
+ /*---------------*/
1622
+ /* compare words */
1623
+ /*---------------*/
1624
+ int
1625
+ word_comp(s1,s2)
1626
+ char *s1,*s2;
1627
+ {
1628
+ int i;
1629
+
1630
+ if(strcmp(s1,s2)==0){
1631
+ return(1);
1632
+ }
1633
+
1634
+ for(i=0;i<EQ_word_n;i++){
1635
+ if((strcmp(s1,EQ_word[i].s1)==0 &&
1636
+ strcmp(s2,EQ_word[i].s2)==0) ||
1637
+ (strcmp(s1,EQ_word[i].s2)==0 &&
1638
+ strcmp(s2,EQ_word[i].s1)==0)){
1639
+ return(1);
1640
+ }
1641
+ }
1642
+
1643
+ return(0);
1644
+ }
1645
+
1646
+ /*----------------*/
1647
+ /* compare labels */
1648
+ /*----------------*/
1649
+ int
1650
+ label_comp(s1,s2)
1651
+ char *s1,*s2;
1652
+ {
1653
+ int i;
1654
+ // Added by djame for spmrl 2013 so pos tag got filtered too
1655
+
1656
+ modify_label(s1); // djame
1657
+ modify_label(s2); // djame
1658
+ if(strcmp(s1,s2)==0){
1659
+ return(1);
1660
+ }
1661
+
1662
+ for(i=0;i<EQ_label_n;i++){
1663
+ if((strcmp(s1,EQ_label[i].s1)==0 &&
1664
+ strcmp(s2,EQ_label[i].s2)==0) ||
1665
+ (strcmp(s1,EQ_label[i].s2)==0 &&
1666
+ strcmp(s2,EQ_label[i].s1)==0)){
1667
+ return(1);
1668
+ }
1669
+ }
1670
+
1671
+ return(0);
1672
+ }
1673
+
1674
+
1675
+ /*--------*/
1676
+ /* errors */
1677
+ /*--------*/
1678
+ void
1679
+ Error(s,arg1,arg2,arg3)
1680
+ char *s, *arg1, *arg2, *arg3;
1681
+ {
1682
+ Status = 1;
1683
+ fprintf(stderr,"%d : ",Line);
1684
+ fprintf(stderr,s,arg1,arg2,arg3);
1685
+ if(Error_count++>Max_error){
1686
+ exit(1);
1687
+ }
1688
+ }
1689
+
1690
+
1691
+ /*---------------------*/
1692
+ /* fatal error to exit */
1693
+ /*---------------------*/
1694
/*
 * Report an unrecoverable error and terminate.
 *
 *   s                - printf-style format string
 *   arg1, arg2, arg3 - values forwarded to the format
 *
 * The formatted message goes to stderr; the process then exits with
 * code 1.  This function does not return.
 */
void
Fatal(s,arg1,arg2,arg3)
char *s, *arg1, *arg2, *arg3;
{
    FILE *out = stderr;

    fprintf(out,s,arg1,arg2,arg3);
    exit(1);
}
1701
+
1702
+
1703
+ /*-------*/
1704
+ /* Usage */
1705
+ /*-------*/
1706
/*
 * Write the command-line help text to stderr, one line per table entry.
 * None of the lines contains a '%', so they are written verbatim.
 */
void
Usage()
{
    static const char *const usage_lines[] = {
        " evalb [-dDh][-c n][-e n][-p param_file] gold-file test-file \n",
        " \n",
        " Evaluate bracketing in test-file against gold-file. \n",
        " Return recall, precision, F-Measure, tag accuracy. \n",
        " \n",
        " <option> \n",
        " -d debug mode \n",
        " -D debug mode plus bracketing info \n",
        " -c n cut-off length forstatistics (def.=40)\n",
        " -e n number of error to kill (default=10) \n",
        " -p param_file parameter file \n",
        " -K n Evaluate up to n sentences \n",
        " -X Count skipped sentences brackets as not parsed \n",
        " -L Compact view (for use in batch mode, all sentences \n",
        " -h help \n"
    };
    const int line_count = (int)(sizeof(usage_lines)/sizeof(usage_lines[0]));
    int k;

    for(k=0;k<line_count;k++){
        fputs(usage_lines[k],stderr);
    }
}
parsing/EVALB_SPMRL/spmrl.prm ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##------------------------------------------##
2
+ ## Debug mode ##
3
+ ## 0: No debugging ##
4
+ ## 1: print data for individual sentence ##
5
+ ## 2: print detailed bracketing info ##
6
+ ##------------------------------------------##
7
+ DEBUG 0
8
+
9
+ ##------------------------------------------##
10
+ ## MAX error ##
11
+ ## Number of error to stop the process. ##
12
+ ## This is useful if there could be ##
13
+ ## tokenization error. ##
14
+ ## The process will stop when this number##
15
+ ## of errors are accumulated. ##
16
+ ##------------------------------------------##
17
+ MAX_ERROR 10000
18
+
19
+ ##------------------------------------------##
20
+ ## Cut-off length for statistics ##
21
+ ## At the end of evaluation, the ##
22
+ ## statistics for the sentences of length##
23
+ ## less than or equal to this number will##
24
+ ## be shown, on top of the statistics ##
25
+ ## for all the sentences ##
26
+ ##------------------------------------------##
27
+ CUTOFF_LEN 70
28
+
29
+ ##------------------------------------------##
30
+ ## unlabeled or labeled bracketing ##
31
+ ## 0: unlabeled bracketing ##
32
+ ## 1: labeled bracketing ##
33
+ ##------------------------------------------##
34
+ LABELED 1
35
+
36
+ ##------------------------------------------##
37
+ ## Delete labels ##
38
+ ## list of labels to be ignored. ##
39
+ ## If it is a pre-terminal label, delete ##
40
+ ## the word along with the brackets. ##
41
+ ## If it is a non-terminal label, just ##
42
+ ## delete the brackets (don't delete ##
43
+ ## children). ##
44
+ ##------------------------------------------##
45
+ DELETE_LABEL TOP
46
+ DELETE_LABEL ROOT
47
+ DELETE_LABEL S1
48
+ DELETE_LABEL -NONE-
49
+ DELETE_LABEL VROOT
50
+
51
+ #DELETE_LABEL ,
52
+ #DELETE_LABEL :
53
+ #DELETE_LABEL ``
54
+ #DELETE_LABEL ''
55
+ #DELETE_LABEL .
56
+ #DELETE_LABEL ?
57
+ #DELETE_LABEL !
58
+ #DELETE_LABEL PONCT
59
+
60
+ ##------------------------------------------##
61
+ ## Delete labels for length calculation ##
62
+ ## list of labels to be ignored for ##
63
+ ## length calculation purpose ##
64
+ ##------------------------------------------##
65
+ DELETE_LABEL_FOR_LENGTH -NONE-
66
+
67
+ ##------------------------------------------##
68
+ ## Labels to be considered for misquote ##
69
+ ## (could be possessive or quote) ##
70
+ ##------------------------------------------##
71
+ #QUOTE_LABEL ``
72
+ #QUOTE_LABEL ''
73
+ #QUOTE_LABEL POS
74
+
75
+ ##------------------------------------------##
76
+ ## These ones are less common, but ##
77
+ ## are on occasion output by parsers: ##
78
+ ##------------------------------------------##
79
+ #QUOTE_LABEL NN
80
+ #QUOTE_LABEL CD
81
+ #QUOTE_LABEL VBZ
82
+ #QUOTE_LABEL :
83
+
84
+ ##------------------------------------------##
85
+ ## Equivalent labels, words ##
86
+ ## the pairs are considered equivalent ##
87
+ ## This is non-directional. ##
88
+ ##------------------------------------------##
89
+ #EQ_LABEL ADVP PRT
90
+
91
+ # EQ_WORD Example example
parsing/EVALB_SPMRL/spmrl_hebrew.prm ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##------------------------------------------##
2
+ ## Debug mode ##
3
+ ## 0: No debugging ##
4
+ ## 1: print data for individual sentence ##
5
+ ## 2: print detailed bracketing info ##
6
+ ##------------------------------------------##
7
+ DEBUG 0
8
+
9
+ ##------------------------------------------##
10
+ ## MAX error ##
11
+ ## Number of error to stop the process. ##
12
+ ## This is useful if there could be ##
13
+ ## tokenization error. ##
14
+ ## The process will stop when this number##
15
+ ## of errors are accumulated. ##
16
+ ##------------------------------------------##
17
+ MAX_ERROR 10000
18
+
19
+ ##------------------------------------------##
20
+ ## Cut-off length for statistics ##
21
+ ## At the end of evaluation, the ##
22
+ ## statistics for the sentences of length##
23
+ ## less than or equal to this number will##
24
+ ## be shown, on top of the statistics ##
25
+ ## for all the sentences ##
26
+ ##------------------------------------------##
27
+ CUTOFF_LEN 40
28
+
29
+ ##------------------------------------------##
30
+ ## unlabeled or labeled bracketing ##
31
+ ## 0: unlabeled bracketing ##
32
+ ## 1: labeled bracketing ##
33
+ ##------------------------------------------##
34
+ LABELED 1
35
+
36
+ ##------------------------------------------##
37
+ ## Delete labels ##
38
+ ## list of labels to be ignored. ##
39
+ ## If it is a pre-terminal label, delete ##
40
+ ## the word along with the brackets. ##
41
+ ## If it is a non-terminal label, just ##
42
+ ## delete the brackets (don't delete ##
43
+ ## children). ##
44
+ ##------------------------------------------##
45
+ DELETE_LABEL TOP
46
+ DELETE_LABEL ROOT
47
+ DELETE_LABEL S1
48
+ DELETE_LABEL -NONE-
49
+ DELETE_LABEL VROOT
50
+ #DELETE_LABEL SENT
51
+
52
+ #DELETE_LABEL ,
53
+ #DELETE_LABEL :
54
+ #DELETE_LABEL ``
55
+ #DELETE_LABEL ''
56
+ #DELETE_LABEL .
57
+ #DELETE_LABEL ?
58
+ #DELETE_LABEL !
59
+ #DELETE_LABEL PONCT
60
+
61
+ ##------------------------------------------##
62
+ ## Delete labels for length calculation ##
63
+ ## list of labels to be ignored for ##
64
+ ## length calculation purpose ##
65
+ ##------------------------------------------##
66
+ DELETE_LABEL_FOR_LENGTH -NONE-
67
+
68
+ ##------------------------------------------##
69
+ ## Labels to be considered for misquote ##
70
+ ## (could be possessive or quote) ##
71
+ ##------------------------------------------##
72
+ #QUOTE_LABEL ``
73
+ #QUOTE_LABEL ''
74
+ #QUOTE_LABEL POS
75
+
76
+ ##------------------------------------------##
77
+ ## These ones are less common, but ##
78
+ ## are on occasion output by parsers: ##
79
+ ##------------------------------------------##
80
+ #QUOTE_LABEL NN
81
+ #QUOTE_LABEL CD
82
+ #QUOTE_LABEL VBZ
83
+ #QUOTE_LABEL :
84
+
85
+ ##------------------------------------------##
86
+ ## Equivalent labels, words ##
87
+ ## the pairs are considered equivalent ##
88
+ ## This is non-directional. ##
89
+ ##------------------------------------------##
90
+ #EQ_LABEL ADVP PRT
91
+
92
+ # EQ_WORD Example example
93
+ DELETE_LABEL SYN_NN
94
+ DELETE_LABEL SYN_NNP
95
+ DELETE_LABEL SYN_NNT
96
+ DELETE_LABEL SYN_PRP
97
+ DELETE_LABEL SYN_JJ
98
+ DELETE_LABEL SYN_JJT
99
+ DELETE_LABEL SYN_RB
100
+ DELETE_LABEL SYN_RBR
101
+ DELETE_LABEL SYN_MOD
102
+ DELETE_LABEL SYN_VB
103
+ DELETE_LABEL SYN_AUX
104
+ DELETE_LABEL SYN_AGR
105
+ DELETE_LABEL SYN_IN
106
+ DELETE_LABEL SYN_COM
107
+ DELETE_LABEL SYN_REL
108
+ DELETE_LABEL SYN_CC
109
+ DELETE_LABEL SYN_QW
110
+ DELETE_LABEL SYN_HAM
111
+ DELETE_LABEL SYN_WDT
112
+ DELETE_LABEL SYN_DT
113
+ DELETE_LABEL SYN_CD
114
+ DELETE_LABEL SYN_CDT
115
+ DELETE_LABEL SYN_AT
116
+ DELETE_LABEL SYN_H
117
+ DELETE_LABEL SYN_FL
118
+ DELETE_LABEL SYN_ZVL