lingbionlp committed on
Commit
49c5cf1
1 Parent(s): e49befb

Upload 10 files

src/abbre_resolution.py ADDED
@@ -0,0 +1,434 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Aug 11 16:52:40 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import logging
9
+ import regex
10
+ import sys
11
+ import io
12
+
13
+ """
14
+ A Python 3 refactoring of Vincent Van Asch's Python 2 code at
15
+
16
+ http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py
17
+
18
+ Based on
19
+
20
+ A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
21
+ A. Schwartz and M. Hearst
22
+ Pacific Symposium on Biocomputing, 2003, pp 451-462.
23
+
24
+ """
25
+
26
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
27
+ log = logging.getLogger('Abbre')
28
+
29
+
30
+ class Candidate(str):
31
+ def __init__(self, value):
32
+ super().__init__()
33
+ self.start = 0
34
+ self.stop = 0
35
+
36
+ def set_position(self, start, stop):
37
+ self.start = start
38
+ self.stop = stop
39
+
40
+
41
+ def yield_lines_from_file(file_path):
42
+ with open(file_path, 'rb') as f:
43
+ for line in f:
44
+ try:
45
+ line = line.decode('utf-8')
46
+ except UnicodeDecodeError:
47
+ line = line.decode('latin-1').encode('utf-8').decode('utf-8')
48
+ line = line.strip()
49
+ yield line
50
+ f.close()
51
+
52
+
53
+ def yield_lines_from_doc(doc_text):
54
+ for line in doc_text.split("\n"):
55
+ yield line.strip()
56
+
57
+
58
+ def best_candidates(sentence):
59
+ """
60
+ :param sentence: line read from input file
61
+ :return: a Candidate iterator
62
+ """
63
+
64
+ if '(' in sentence:
65
+ # Check some things first
66
+ if sentence.count('(') != sentence.count(')'):
67
+ raise ValueError("Unbalanced parentheses: {}".format(sentence))
68
+
69
+ if sentence.find('(') > sentence.find(')'):
70
+ raise ValueError("Closing parenthesis appears before the first opening parenthesis: {}".format(sentence))
71
+
72
+ closeindex = -1
73
+ while 1:
74
+ # Look for open parenthesis
75
+ openindex = sentence.find('(', closeindex + 1)
76
+
77
+ if openindex == -1: break
78
+
79
+ # Look for closing parentheses
80
+ closeindex = openindex + 1
81
+ open = 1
82
+ skip = False
83
+ while open:
84
+ try:
85
+ char = sentence[closeindex]
86
+ except IndexError:
87
+ # We found an opening bracket but no associated closing bracket
88
+ # Skip the opening bracket
89
+ skip = True
90
+ break
91
+ if char == '(':
92
+ open += 1
93
+ elif char in [')', ';', ':']:
94
+ open -= 1
95
+ closeindex += 1
96
+
97
+ if skip:
98
+ closeindex = openindex + 1
99
+ continue
100
+
101
+ # Output if conditions are met
102
+ start = openindex + 1
103
+ stop = closeindex - 1
104
+ candidate = sentence[start:stop]
105
+
106
+ # Take into account whitespace that should be removed
107
+ start = start + len(candidate) - len(candidate.lstrip())
108
+ stop = stop - len(candidate) + len(candidate.rstrip())
109
+ candidate = sentence[start:stop]
110
+
111
+ if conditions(candidate):
112
+ new_candidate = Candidate(candidate)
113
+ new_candidate.set_position(start, stop)
114
+ yield new_candidate
115
+
116
+
117
+ def conditions(candidate):
118
+ """
119
+ Based on Schwartz&Hearst
120
+
121
+ 2 <= len(str) <= 10
122
+ len(tokens) <= 2
123
+ re.search('\p{L}', str)
124
+ str[0].isalnum()
125
+
126
+ and extra:
127
+ if it matches (\p{L}\.?\s?){2,}
128
+ it is a good candidate.
129
+
130
+ :param candidate: candidate abbreviation
131
+ :return: True if this is a good candidate
132
+ """
133
+ viable = True
134
+ if regex.match(r'(\p{L}\.?\s?){2,}', candidate.lstrip()):
135
+ viable = True
136
+ if len(candidate) < 2 or len(candidate) > 10:
137
+ viable = False
138
+ if len(candidate.split()) > 2:
139
+ viable = False
140
+ if not regex.search(r'\p{L}', candidate):
141
+ viable = False
142
+ if not candidate[0].isalnum():
143
+ viable = False
144
+
145
+ return viable
146
+
147
+
148
+ def get_definition(candidate, sentence):
149
+ """
150
+ Takes a candidate and a sentence and returns the definition candidate.
151
+
152
+ The definition candidate is the set of tokens (in front of the candidate)
153
+ that starts with a token starting with the first character of the candidate
154
+
155
+ :param candidate: candidate abbreviation
156
+ :param sentence: current sentence (single line from input file)
157
+ :return: candidate definition for this abbreviation
158
+ """
159
+ # Take the tokens in front of the candidate
160
+ tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())
161
+ #print(tokens)
162
+ # the char that we are looking for
163
+ key = candidate[0].lower()
164
+
165
+ # Count the number of tokens that start with the same character as the candidate
166
+ # print(tokens)
167
+ firstchars = [t[0] for t in tokens]
168
+ # print(firstchars)
169
+ definition_freq = firstchars.count(key)
170
+ candidate_freq = candidate.lower().count(key)
171
+
172
+ # Look for the list of tokens in front of candidate that
173
+ # have a sufficient number of tokens starting with key
174
+ if candidate_freq <= definition_freq:
175
+ # we should at least have a good number of starts
176
+ count = 0
177
+ start = 0
178
+ startindex = len(firstchars) - 1
179
+
180
+ while count < candidate_freq:
181
+ if abs(start) > len(firstchars):
182
+ raise ValueError("candidate {} not found".format(candidate))
183
+ start -= 1
184
+ # Look up key in the definition
185
+ try:
186
+ startindex = firstchars.index(key, len(firstchars) + start)
187
+ except ValueError:
188
+ pass
189
+
190
+ # Count the number of keys in definition
191
+ count = firstchars[startindex:].count(key)
192
+
193
+ # We found enough keys in the definition so return the definition as a definition candidate
194
+ start = len(' '.join(tokens[:startindex]))
195
+ stop = candidate.start - 1
196
+ candidate = sentence[start:stop]
197
+
198
+ # Remove whitespace
199
+ start = start + len(candidate) - len(candidate.lstrip())
200
+ stop = stop - len(candidate) + len(candidate.rstrip())
201
+ candidate = sentence[start:stop]
202
+
203
+ new_candidate = Candidate(candidate)
204
+ new_candidate.set_position(start, stop)
205
+ #print('new_candidate:')
206
+ #print(new_candidate,start,stop)
207
+ return new_candidate
208
+
209
+ else:
210
+ raise ValueError('There are fewer keys in the tokens in front of the candidate than there are in the candidate')
211
+
212
+
213
+ def select_definition(definition, abbrev):
214
+ """
215
+ Takes a definition candidate and an abbreviation candidate
216
+ and returns the accepted definition (with its offsets) if the characters of the abbreviation occur in the definition
217
+
218
+ Based on
219
+ A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst
220
+ :param definition: candidate definition
221
+ :param abbrev: candidate abbreviation
222
+ :return: dict with keys 'definition', 'start' and 'stop' for the accepted definition
223
+ """
224
+
225
+
226
+ if len(definition) < len(abbrev):
227
+ raise ValueError('Abbreviation is longer than definition')
228
+
229
+ if abbrev in definition.split():
230
+ raise ValueError('Abbreviation is full word of definition')
231
+
232
+ sindex = -1
233
+ lindex = -1
234
+
235
+ while 1:
236
+ try:
237
+ longchar = definition[lindex].lower()
238
+ except IndexError:
239
+ raise
240
+
241
+ shortchar = abbrev[sindex].lower()
242
+
243
+ if not shortchar.isalnum():
244
+ sindex -= 1
245
+
246
+ if sindex == -1 * len(abbrev):
247
+ if shortchar == longchar:
248
+ if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
249
+ break
250
+ else:
251
+ lindex -= 1
252
+ else:
253
+ lindex -= 1
254
+ if lindex == -1 * (len(definition) + 1):
255
+ raise ValueError("definition {} was not found in {}".format(abbrev, definition))
256
+
257
+ else:
258
+ if shortchar == longchar:
259
+ sindex -= 1
260
+ lindex -= 1
261
+ else:
262
+ lindex -= 1
263
+ # print('lindex:',lindex,len(definition),definition[lindex:len(definition)])
264
+ new_candidate = Candidate(definition[lindex:len(definition)])
265
+ new_candidate.set_position(definition.start+lindex+len(definition), definition.stop)
266
+ definition = new_candidate
267
+
268
+ tokens = len(definition.split())
269
+ length = len(abbrev)
270
+
271
+ if tokens > min([length + 5, length * 2]):
272
+ raise ValueError("did not meet min(|A|+5, |A|*2) constraint")
273
+
274
+ # Do not return definitions that contain unbalanced parentheses
275
+ if definition.count('(') != definition.count(')'):
276
+ raise ValueError("Unbalanced parentheses not allowed in a definition")
277
+ # print('select:')
278
+ # print(definition,definition.start, definition.stop)
279
+ new_definition_dict={'definition':definition,'start':definition.start,'stop':definition.stop}
280
+ return new_definition_dict
281
+
282
+
283
+ def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
284
+ abbrev_map = []
285
+ omit = 0
286
+ written = 0
287
+ if file_path:
288
+ sentence_iterator = enumerate(yield_lines_from_file(file_path))
289
+ elif doc_text:
290
+ sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
291
+ else:
292
+ return abbrev_map
293
+
294
+ for i, sentence in sentence_iterator:
295
+ #print(sentence)
296
+ try:
297
+ for candidate in best_candidates(sentence):
298
+ #print(candidate)
299
+ try:
300
+ #print('begin get definition')
301
+ definition = get_definition(candidate, sentence)
302
+ #print('get_definition:')
303
+ #print(definition)
304
+
305
+ except (ValueError, IndexError) as e:
306
+ #log.debug("{} Omitting candidate {}. Reason: {}".format(i, candidate, e.args[0]))
307
+ omit += 1
308
+ else:
309
+ try:
310
+ definition_dict = select_definition(definition, candidate)
311
+ except (ValueError, IndexError) as e:
312
+ #log.debug("{} Omitting definition {} for candidate {}. Reason: {}".format(i, definition_dict, candidate, e.args[0]))
313
+ omit += 1
314
+ else:
315
+ definition_dict['abbre']=candidate
316
+ abbrev_map.append(definition_dict)
317
+ written += 1
318
+ except (ValueError, IndexError) as e:
319
+ log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
320
+ log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
321
+ return abbrev_map
322
+
323
+ def postprocess_abbr(ner_result,ori_text):
324
+
325
+ final_result={}
326
+ if len(ner_result)==0:
327
+ return []
328
+ # abbr recognition
329
+ abbr_result=extract_abbreviation_definition_pairs(doc_text=ori_text)
330
+
331
+ # read ner results
332
+ nor_loc_list={} #{entity_name_location:entity_information}
333
+
334
+ for ele in ner_result:
335
+ nor_loc_list[str(ele[0])+' '+str(ele[1])]=ele
336
+ final_result['\t'.join(ele)]=[int(ele[0]),int(ele[1])]
337
+
338
+ #abbr matching
339
+ for abbr in abbr_result:
340
+ abbr_index=str(abbr['start'])+' '+str(abbr['stop'])
341
+ if abbr_index in nor_loc_list.keys():
342
+
343
+ line=ori_text
344
+ abbr_text=abbr['abbre']
345
+ abbr_eid=0
346
+ while line.find(abbr_text)>=0:
347
+ abbr_sid=line.find(abbr_text)+abbr_eid
348
+ abbr_eid=abbr_sid+len(abbr_text)
349
+ # print(abbr_sid,abbr_eid)
350
+ if abbr_sid>0 and abbr_eid<len(ori_text):
351
+ if ori_text[abbr_sid-1].isalnum()==False and ori_text[abbr_eid].isalnum()==False:
352
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
353
+ elif abbr_sid==0 and abbr_eid<len(ori_text):
354
+ if ori_text[abbr_eid].isalnum()==False:
355
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
356
+ elif abbr_sid>0 and abbr_eid==len(ori_text):
357
+ if ori_text[abbr_sid-1].isalnum()==False :
358
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
359
+ line=ori_text[abbr_eid:]
360
+ # print(final_result)
361
+ sorted_final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
362
+ final_result=[]
363
+ for ele in sorted_final_result:
364
+ final_result.append(ele[0].split('\t'))
365
+ return final_result
366
+
367
+ def ner_abbr(ner_result,abbr_result,ori_text):
368
+ # read ner results
369
+ nor_name_list={} #{entity_name:entity_information}
370
+ nor_loc_list={} #{entity_name_location:entity_information}
371
+ final_result={} #{entity_information:location} use to sort
372
+ for ele in ner_result:
373
+ temp_seg=ele.split('\t')
374
+ nor_loc_list[temp_seg[0]+' '+temp_seg[1]]=temp_seg
375
+ nor_name_list[temp_seg[2].lower()]=temp_seg
376
+ final_result['\t'.join(temp_seg[0:4])]=[int(temp_seg[0]),int(temp_seg[1])]
377
+
378
+ #abbr matching
379
+ for abbr in abbr_result:
380
+ abbr_index=str(abbr['start'])+' '+str(abbr['stop'])
381
+ if abbr_index in nor_loc_list.keys():
382
+
383
+ line=ori_text
384
+ abbr_text=abbr['abbre']
385
+ abbr_eid=0
386
+ while line.find(abbr_text)>=0:
387
+ abbr_sid=line.find(abbr_text)+abbr_eid
388
+ abbr_eid=abbr_sid+len(abbr_text)
389
+ # print(abbr_sid,abbr_eid)
390
+ if abbr_sid>0 and abbr_eid<len(ori_text):
391
+ if ori_text[abbr_sid-1].isalnum()==False and ori_text[abbr_eid].isalnum()==False:
392
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
393
+ elif abbr_sid==0 and abbr_eid<len(ori_text):
394
+ if ori_text[abbr_eid].isalnum()==False:
395
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
396
+ elif abbr_sid>0 and abbr_eid==len(ori_text):
397
+ if ori_text[abbr_sid-1].isalnum()==False :
398
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
399
+ line=ori_text[abbr_eid:]
400
+ # print(final_result)
401
+ final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
402
+
403
+ return final_result
404
+
405
+
406
+
407
+
408
+ if __name__ == '__main__':
409
+ path='//panfs/pan1/bionlp/lulab/luoling/HPO_project/diseaseTag/data/test/results/'
410
+ fin=open(path+'NCBI_test_phecr_95.tsv','r',encoding='utf-8')
411
+ context=fin.read().strip().split('\n\n')
412
+ fin.close()
413
+ fout=open(path+'NCBI_test_phecr_abbre_95.tsv','w',encoding='utf-8')
414
+ for doc in context:
415
+ lines=doc.split('\n')
416
+ ori_text=lines[1]
417
+ # print(ori_text)
418
+ fout.write(lines[0]+'\n'+lines[1]+'\n')
419
+ if len(lines)>2:
420
+ abbr_result=extract_abbreviation_definition_pairs(doc_text=ori_text)
421
+ print(abbr_result)
422
+ abbr_out=ner_abbr(lines[2:],abbr_result,ori_text)
423
+ else:
424
+ abbr_out=[]
425
+ # print('final:',abbr_out)
426
+ for ele in abbr_out:
427
+ fout.write(ele[0]+'\n')
428
+ fout.write('\n')
429
+ # sys.exit()
430
+ fout.close()
431
+ #last_out=combine_ml_dict_fn(abbr_out,infile)
432
+ #print(last_out)
433
+
434
+
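A minimal usage sketch for the resolver above, assuming the repository root is on the Python path; the sample sentence is invented and only illustrates the "long form (SHORT FORM)" pattern the algorithm looks for:

from src.abbre_resolution import extract_abbreviation_definition_pairs

# Illustrative input: any text containing "long form (SHORT FORM)" patterns.
text = ("The patient shows autism spectrum disorder (ASD) and "
        "attention deficit hyperactivity disorder (ADHD).")

# Each result is a dict with 'abbre', 'definition', 'start' and 'stop'
# ('start'/'stop' are character offsets of the definition in the input text).
for pair in extract_abbreviation_definition_pairs(doc_text=text):
    print(pair['abbre'], '->', pair['definition'], pair['start'], pair['stop'])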
src/combine_result.py ADDED
@@ -0,0 +1,102 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Jun 15 11:24:45 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import io
9
+ def nest_overlap_entity(nest_list):
10
+ temp_result_list={}
11
+ for i in range(0, len(nest_list)):
12
+ hpoid=nest_list[i][3]
13
+ if hpoid not in temp_result_list.keys():
14
+ temp_result_list[hpoid]=nest_list[i]
15
+ else:
16
+ score=float(nest_list[i][4])
17
+ old_score=float(temp_result_list[hpoid][4])
18
+ if score>old_score: # retain the higher-scoring concept
19
+ temp_result_list[hpoid]=nest_list[i]
20
+ new_list=[]
21
+ for hpoid in temp_result_list.keys():
22
+ new_list.append(temp_result_list[hpoid])
23
+
24
+ temp_result_list={} #same index, different ids
25
+ for i in range(0, len(new_list)):
26
+ ids=new_list[i][0]+' '+new_list[i][1]
27
+ if ids not in temp_result_list.keys():
28
+ temp_result_list[ids]=new_list[i]
29
+ else:
30
+ score=float(nest_list[i][4])
31
+ old_score=float(temp_result_list[ids][4])
32
+ if score>old_score:
33
+ temp_result_list[ids]=new_list[i]
34
+ final_list=[]
35
+ for ids in temp_result_list.keys():
36
+ final_list.append(temp_result_list[ids])
37
+ return final_list
38
+ def combine_ml_dict(dict_tsv,ml_tsv,nest=True):
39
+ fin_dic=io.StringIO(dict_tsv)
40
+ fin_ml=io.StringIO(ml_tsv)
41
+ fout=io.StringIO()
42
+ all_dic=fin_dic.read().strip().split('\n\n')
43
+ all_ml=fin_ml.read().strip().split('\n\n')
44
+ fin_dic.close()
45
+ fin_ml.close()
46
+
47
+ for i in range(0,len(all_dic)):
48
+ lines_dic=all_dic[i].split('\n')
49
+ lines_ml=all_ml[i].split('\n')
50
+ entity_list={}
51
+ for j in range(1,len(lines_dic)):
52
+ seg=lines_dic[j].split('\t')
53
+ entity_list[lines_dic[j]]=[int(seg[0]),int(seg[1])] #dict results score 1.00
54
+ for j in range(1,len(lines_ml)):
55
+ seg=lines_ml[j].split('\t')
56
+ entity_list[lines_ml[j]]=[int(seg[0]),int(seg[1])]
57
+
58
+ entity_list=sorted(entity_list.items(), key=lambda kv:(kv[1]), reverse=False)
59
+ entity_list_sort=[]
60
+ for ele in entity_list:
61
+ entity_list_sort.append(ele[0])
62
+
63
+ final_entity=[]
64
+ if len(entity_list_sort)!=0:
65
+ first_entity=entity_list_sort[0].split('\t')
66
+ nest_list=[first_entity]
67
+ max_eid=int(first_entity[1])
68
+
69
+ for i in range(1,len(entity_list_sort)):
70
+ segs=entity_list_sort[i].split('\t')
71
+ if int(segs[0])> max_eid:
72
+ if len(nest_list)==1:
73
+ final_entity.append(nest_list[0])
74
+ nest_list=[]
75
+ nest_list.append(segs)
76
+ if int(segs[1])>max_eid:
77
+ max_eid=int(segs[1])
78
+ else:
79
+ tem=nest_overlap_entity(nest_list)
80
+ final_entity.extend(tem)
81
+ nest_list=[]
82
+ nest_list.append(segs)
83
+ if int(segs[1])>max_eid:
84
+ max_eid=int(segs[1])
85
+ else:
86
+ nest_list.append(segs)
87
+ if int(segs[1])>max_eid:
88
+ max_eid=int(segs[1])
89
+ if nest_list!=[]:
90
+ if len(nest_list)==1:
91
+ final_entity.append(nest_list[0])
92
+
93
+ else:
94
+ tem=nest_overlap_entity(nest_list)#find max entity
95
+ final_entity.extend(tem)
96
+
97
+ fout.write(lines_ml[0]+'\n')
98
+ for ele in final_entity:
99
+ fout.write('\t'.join(ele)+'\n')
100
+ fout.write('\n')
101
+ return fout.getvalue()
102
+
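A small sketch of the block format combine_ml_dict expects, assuming src/ is importable: each argument is a string of blank-line-separated blocks whose first line is the sentence and whose remaining lines are tab-separated rows (start, end, mention, ontology id, score). The sentence, offsets and HPO ids below are illustrative only:

from src.combine_result import combine_ml_dict

dict_tsv = ("Short stature and seizures were noted.\n"
            "0\t13\tShort stature\tHP:0004322\t1.00\n")
ml_tsv = ("Short stature and seizures were noted.\n"
          "18\t26\tseizures\tHP:0001250\t0.97\n")

# Merges dictionary-based and ML-based annotations sentence by sentence,
# keeping the higher-scoring entity when spans or concept ids collide.
print(combine_ml_dict(dict_tsv, ml_tsv))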
src/ml_ner.py ADDED
@@ -0,0 +1,563 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 16:41:54 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import io
9
+ import time
10
+ import numpy as np
11
+ NEG_LABEL='ONT:None'
12
+ def ml_intext(infile):
13
+ fin=open(infile,'r',encoding='utf-8')
14
+ alltexts=fin.read().strip().split('\n\n')
15
+ fin.close()
16
+ data_list=[]
17
+ label_list=[]
18
+ for sents in alltexts:
19
+ lines=sents.split('\n')
20
+ temp_sentece=[]
21
+ label=lines[0].split('\t')[0]
22
+ label_list.append(label)
23
+ for i in range(1,len(lines)):
24
+ seg=lines[i].split('\t')
25
+ temp_sentece.append(seg)
26
+ data_list.append(temp_sentece)
27
+ return data_list,label_list
28
+ def ml_intext_fn(ml_input):
29
+ fin=io.StringIO(ml_input)
30
+ alltexts=fin.read().strip().split('\n\n')
31
+ fin.close()
32
+ data_list=[]
33
+ label_list=[]
34
+ for sents in alltexts:
35
+ lines=sents.split('\n')
36
+ temp_sentece=[]
37
+ label=lines[0].split('\t')[0]
38
+ label_list.append(label)
39
+ for i in range(1,len(lines)):
40
+ seg=lines[i].split('\t')
41
+ temp_sentece.append(seg)
42
+ data_list.append(temp_sentece)
43
+ return data_list,label_list
44
+ def pun_filter(temp_entity):
45
+ pun_list=[',','.','!',';',':','?','(',')','[',']','{','}']
46
+ filter_flag=0
47
+ for ele in temp_entity:
48
+ if ele in pun_list:
49
+ filter_flag=1
50
+ break
51
+ return filter_flag
52
+ def pos_filter(temp_pos,temp_entity):
53
+ pos_list_l=['PRP']
54
+ pos_list=['IN','DT','CC','O','MD','EX','POS','WDT','WP','WP$','WRB','TO','PRP$']
55
+ verb_word=['is','are','was','were','had','have','has','be','been','also']
56
+ filter_flag=0
57
+
58
+ if (temp_entity[0] in verb_word) or (temp_entity[-1] in verb_word):
59
+ filter_flag=1
60
+ if (temp_pos[0] in pos_list) or (temp_pos[-1] in pos_list) or (temp_pos[0] in pos_list_l):
61
+ filter_flag=1
62
+ return filter_flag
63
+
64
+ def build_ngram_testset_filted(conll_input,Ngram=8):
65
+
66
+ fin_genia=io.StringIO(conll_input)
67
+ fout_context=io.StringIO()
68
+ fout_txt=io.StringIO()
69
+
70
+ index_dict={}
71
+ allentity=[]
72
+ alltext=fin_genia.read().strip().split('\n\n')
73
+ fin_genia.close()
74
+ num_total=0
75
+ for i in range(0,len(alltext)):
76
+
77
+ lines=alltext[i].split('\n')
78
+ ori_txt=[]
79
+ for ele in lines:
80
+ seg=ele.split('\t')
81
+ ori_txt.append(seg[0])
82
+ fout_txt.write(' '.join(ori_txt)+'\n')
83
+
84
+ if Ngram>len(lines):
85
+ Ngram=len(lines)
86
+
87
+ fout_context_list=[]
88
+ temp_entity=[]
89
+ temp_pos=[]
90
+ for ngram in range(2,Ngram+1):
91
+ if ngram==1:
92
+ for j in range(0, len(lines)):
93
+ sid=0
94
+ eid=0
95
+ for m in range(0,len(lines)):
96
+ if m==j:
97
+ sid=m
98
+ eid=m
99
+ fout_context_list.append(lines[m]+'\tO\tB')
100
+ temp_seg=lines[m].split('\t')
101
+ temp_entity.append(temp_seg[0])
102
+ temp_pos.append(temp_seg[3])
103
+ else:
104
+ pass
105
+ # print(sentence[m])
106
+ # fout_context_list.append(lines[m]+'\tO\tO')
107
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
108
+ num_total+=1
109
+ if ' '.join(temp_entity) not in allentity:
110
+ allentity.append(' '.join(temp_entity))
111
+ fout_context.write(NEG_LABEL+'\t'+' '.join(temp_entity)+'\n')
112
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
113
+ index_dict[str(num_total)]=[i,sid,eid]
114
+ temp_entity=[]
115
+ temp_pos=[]
116
+ fout_context_list=[]
117
+ elif ngram==2:
118
+ for j in range(0, len(lines)-1):
119
+ sid=0
120
+ eid=0
121
+ for m in range(0,len(lines)):
122
+ if m==j:
123
+ fout_context_list.append(lines[m]+'\tO\tB')
124
+ sid=m
125
+ temp_seg=lines[m].split('\t')
126
+ temp_entity.append(temp_seg[0])
127
+ temp_pos.append(temp_seg[3])
128
+ elif m==j+1:
129
+ fout_context_list.append(lines[m]+'\tO\tB')
130
+ eid=m
131
+ temp_seg=lines[m].split('\t')
132
+ temp_entity.append(temp_seg[0])
133
+ temp_pos.append(temp_seg[3])
134
+ else:
135
+ pass
136
+ # fout_context_list.append(lines[m]+'\tO\tO')
137
+
138
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
139
+ num_total+=1
140
+ if ' '.join(temp_entity) not in allentity:
141
+ allentity.append(' '.join(temp_entity))
142
+ fout_context.write(NEG_LABEL+'\t'+' '.join(temp_entity)+'\n')
143
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
144
+ index_dict[str(num_total)]=[i,sid,eid]
145
+ temp_entity=[]
146
+ temp_pos=[]
147
+ fout_context_list=[]
148
+ else :
149
+ for j in range(0, len(lines)-ngram+1):
150
+ sid=0
151
+ eid=0
152
+ for m in range(0,len(lines)):
153
+ if m==j:
154
+ fout_context_list.append(lines[m]+'\tO\tB')
155
+ sid=m
156
+ temp_seg=lines[m].split('\t')
157
+ temp_entity.append(temp_seg[0])
158
+ temp_pos.append(temp_seg[3])
159
+ elif m>j and m<j+ngram-1:
160
+ fout_context_list.append(lines[m]+'\tO\tB')
161
+ temp_seg=lines[m].split('\t')
162
+ temp_entity.append(temp_seg[0])
163
+ temp_pos.append(temp_seg[2])
164
+ elif m==j+ngram-1:
165
+ fout_context_list.append(lines[m]+'\tO\tB')
166
+ eid=m
167
+ temp_seg=lines[m].split('\t')
168
+ temp_entity.append(temp_seg[0])
169
+ temp_pos.append(temp_seg[3])
170
+ else:
171
+ pass
172
+ # fout_context_list.append(lines[m]+'\tO\tO')
173
+
174
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
175
+ num_total+=1
176
+ if ' '.join(temp_entity) not in allentity:
177
+ allentity.append(' '.join(temp_entity))
178
+ fout_context.write(NEG_LABEL+'\t'+' '.join(temp_entity)+'\n')
179
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
180
+ index_dict[str(num_total)]=[i,sid,eid]
181
+
182
+ temp_entity=[]
183
+ temp_pos=[]
184
+ fout_context_list=[]
185
+
186
+ return fout_context.getvalue(),fout_txt.getvalue(),index_dict
187
+
188
+ def build_all_ngram_testset_filted(conll_input,Ngram=8):
189
+
190
+ fin_genia=io.StringIO(conll_input)
191
+ fout_context=io.StringIO()
192
+ fout_txt=io.StringIO()
193
+
194
+ index_dict={}
195
+ allentity=[]
196
+ alltext=fin_genia.read().strip().split('\n\n')
197
+ fin_genia.close()
198
+ num_total=0
199
+ for i in range(0,len(alltext)):
200
+
201
+ lines=alltext[i].split('\n')
202
+ ori_txt=[]
203
+ for ele in lines:
204
+ seg=ele.split('\t')
205
+ ori_txt.append(seg[0])
206
+ fout_txt.write(' '.join(ori_txt)+'\n')
207
+
208
+ if Ngram>len(lines):
209
+ Ngram=len(lines)
210
+
211
+ fout_context_list=[]
212
+ temp_entity=[]
213
+ temp_pos=[]
214
+ for ngram in range(1,Ngram+1):
215
+ if ngram==1:
216
+ for j in range(0, len(lines)):
217
+ sid=0
218
+ eid=0
219
+ for m in range(0,len(lines)):
220
+ if m==j:
221
+ sid=m
222
+ eid=m
223
+ fout_context_list.append(lines[m]+'\tO\tB')
224
+ temp_seg=lines[m].split('\t')
225
+ temp_entity.append(temp_seg[0])
226
+ temp_pos.append(temp_seg[3])
227
+ else:
228
+ pass
229
+ # print(sentence[m])
230
+ # fout_context_list.append(lines[m]+'\tO\tO')
231
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
232
+ num_total+=1
233
+ if ' '.join(temp_entity) not in allentity:
234
+ allentity.append(' '.join(temp_entity))
235
+ fout_context.write(NEG_LABEL+'\t'+' '.join(temp_entity)+'\n')
236
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
237
+ index_dict[str(num_total)]=[i,sid,eid]
238
+ temp_entity=[]
239
+ temp_pos=[]
240
+ fout_context_list=[]
241
+ elif ngram==2:
242
+ for j in range(0, len(lines)-1):
243
+ sid=0
244
+ eid=0
245
+ for m in range(0,len(lines)):
246
+ if m==j:
247
+ fout_context_list.append(lines[m]+'\tO\tB')
248
+ sid=m
249
+ temp_seg=lines[m].split('\t')
250
+ temp_entity.append(temp_seg[0])
251
+ temp_pos.append(temp_seg[3])
252
+ elif m==j+1:
253
+ fout_context_list.append(lines[m]+'\tO\tB')
254
+ eid=m
255
+ temp_seg=lines[m].split('\t')
256
+ temp_entity.append(temp_seg[0])
257
+ temp_pos.append(temp_seg[3])
258
+ else:
259
+ pass
260
+ # fout_context_list.append(lines[m]+'\tO\tO')
261
+
262
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
263
+ num_total+=1
264
+ if ' '.join(temp_entity) not in allentity:
265
+ allentity.append(' '.join(temp_entity))
266
+ fout_context.write(NEG_LABEL+'\t'+' '.join(temp_entity)+'\n')
267
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
268
+ index_dict[str(num_total)]=[i,sid,eid]
269
+ temp_entity=[]
270
+ temp_pos=[]
271
+ fout_context_list=[]
272
+ else :
273
+ for j in range(0, len(lines)-ngram+1):
274
+ sid=0
275
+ eid=0
276
+ for m in range(0,len(lines)):
277
+ if m==j:
278
+ fout_context_list.append(lines[m]+'\tO\tB')
279
+ sid=m
280
+ temp_seg=lines[m].split('\t')
281
+ temp_entity.append(temp_seg[0])
282
+ temp_pos.append(temp_seg[3])
283
+ elif m>j and m<j+ngram-1:
284
+ fout_context_list.append(lines[m]+'\tO\tB')
285
+ temp_seg=lines[m].split('\t')
286
+ temp_entity.append(temp_seg[0])
287
+ temp_pos.append(temp_seg[2])
288
+ elif m==j+ngram-1:
289
+ fout_context_list.append(lines[m]+'\tO\tB')
290
+ eid=m
291
+ temp_seg=lines[m].split('\t')
292
+ temp_entity.append(temp_seg[0])
293
+ temp_pos.append(temp_seg[3])
294
+ else:
295
+ pass
296
+ # fout_context_list.append(lines[m]+'\tO\tO')
297
+
298
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
299
+ num_total+=1
300
+ if ' '.join(temp_entity) not in allentity:
301
+ allentity.append(' '.join(temp_entity))
302
+ fout_context.write(NEG_LABEL+'\t'+' '.join(temp_entity)+'\n')
303
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
304
+ index_dict[str(num_total)]=[i,sid,eid]
305
+
306
+ temp_entity=[]
307
+ temp_pos=[]
308
+ fout_context_list=[]
309
+
310
+ return fout_context.getvalue(),fout_txt.getvalue(),index_dict
311
+
312
+ def output_result(result,label_2_index,Top_N=5):
313
+
314
+ fout=io.StringIO()
315
+ ont_label={}
316
+
317
+ for key in label_2_index.keys():
318
+ ont_label[label_2_index[key]]=key
319
+
320
+
321
+ for line in result:
322
+ #Top_index=line.argsort()[-1*Top_N:][::-1]
323
+ index_top_unsort=np.argpartition(line,-Top_N)[-Top_N:]
324
+ values_top=line[index_top_unsort]
325
+ Top_index=index_top_unsort[np.argsort(-values_top)]
326
+ temp_list=[]
327
+ for max_index in Top_index:
328
+ ont_id=ont_label[max_index]
329
+ ont_id_value=round(line[max_index],5)
330
+ temp_list.append(str(ont_id)+'|'+str(ont_id_value))
331
+ fout.write('\t'.join(temp_list)+'\n')
332
+
333
+ return fout.getvalue()
334
+
335
+ def decode_tsv(test_score, ml_input_index, ml_input_txt, T=0.8):
336
+
337
+ fin_predict=io.StringIO(test_score)
338
+ fin_text=io.StringIO(ml_input_txt)
339
+ fout=io.StringIO()
340
+
341
+ test_txt=fin_text.read().strip().split('\n')
342
+ test_index=ml_input_index
343
+ test_pre=fin_predict.read().strip().split('\n')
344
+
345
+ fin_text.close()
346
+ fin_predict.close()
347
+
348
+ sent_result={}
349
+ for i in range(0,len(test_pre)):
350
+ seg_pre=test_pre[i].split('\t')[0].split('|')
351
+ #print(seg_pre,T)
352
+ if float(seg_pre[1])>T and seg_pre[0]!=NEG_LABEL:
353
+ term_id=str(i+1)
354
+ pre_result=[test_index[term_id][1],test_index[term_id][2],seg_pre[0],seg_pre[1]]
355
+ sent_id=str(test_index[term_id][0])
356
+ if sent_id not in sent_result.keys():
357
+ sent_result[sent_id]=[pre_result]
358
+ else:
359
+ sent_result[sent_id].append(pre_result)
360
+
361
+ for i in range(0,len(test_txt)):
362
+ fout.write(test_txt[i]+'\n')
363
+ if str(i) in sent_result.keys():
364
+ temp_result={}
365
+ for ele in sent_result[str(i)]:
366
+ temp_line=str(ele[0])+'\t'+str(ele[1])+'\t'+' '.join(test_txt[i].split()[ele[0]:ele[1]+1])+'\t'+ele[2]+'\t'+ele[3]
367
+ temp_result[temp_line]=[ele[0],ele[1]]
368
+ if len(temp_result)>=1:
369
+ temp_result=sorted(temp_result.items(), key=lambda d: (d[1][0],d[1][1]), reverse=False)
370
+ for ent in temp_result:
371
+ fout.write(ent[0]+'\n')
372
+ fout.write('\n')
373
+
374
+ return fout.getvalue()
375
+
376
+ def score_filter(temp_entity, T=0.1):
377
+
378
+ result_list=[]
379
+ for i in range(0,len(temp_entity)):
380
+ if float (temp_entity[i][-1])>=T:
381
+ result_list.append(temp_entity[i])
382
+ return(result_list)
383
+ def find_max_entity_nest(nest_list):
384
+ temp_result_list={}
385
+ for i in range(0, len(nest_list)):
386
+ hpoid=nest_list[i][-2]
387
+ score=float(nest_list[i][-1])
388
+ if hpoid not in temp_result_list.keys():
389
+ temp_result_list[hpoid]=nest_list[i]
390
+ else:
391
+ if score>float(temp_result_list[hpoid][-1]):
392
+ temp_result_list[hpoid]=nest_list[i]
393
+ new_list=[]
394
+ for hpoid in temp_result_list.keys():
395
+ new_list.append(temp_result_list[hpoid])
396
+ return new_list
397
+ def duplicate_filter(temp_entity):
398
+ result_list=[]
399
+ if len(temp_entity)>1:
400
+ first_entity=temp_entity[0]
401
+ nest_list=[first_entity]
402
+ max_eid=int(first_entity[1])
403
+
404
+ for i in range(1,len(temp_entity)):
405
+ segs=temp_entity[i]
406
+ if int(segs[0])> max_eid:
407
+ if len(nest_list)==1:
408
+ result_list.append(nest_list[0])
409
+ nest_list=[segs]
410
+ if int(segs[1])>max_eid:
411
+ max_eid=int(segs[1])
412
+ else:
413
+ result_list.extend(find_max_entity_nest(nest_list))
414
+ nest_list=[segs]
415
+
416
+ if int(segs[1])>max_eid:
417
+ max_eid=int(segs[1])
418
+
419
+ else:
420
+ nest_list.append(segs)
421
+ if int(segs[1])>max_eid:
422
+ max_eid=int(segs[1])
423
+ if nest_list!=[]:
424
+ if len(nest_list)==1:
425
+ result_list.append(nest_list[0])
426
+
427
+ else:
428
+ result_list.extend(find_max_entity_nest(nest_list))
429
+ else:
430
+ result_list=temp_entity
431
+ return result_list
432
+ def combine_strategy(test_decode_temp, T=0.8):
433
+ fin=io.StringIO(test_decode_temp)
434
+ fout=io.StringIO()
435
+
436
+ documents=fin.read().strip().split('\n\n')
437
+ fin.close()
438
+
439
+ for doc in documents:
440
+ lines=doc.split('\n')
441
+ context=lines[0]
442
+ final_entity_list=[]
443
+ if len(lines)>1:
444
+ # all entity candidates
445
+ temp_entity=[]
446
+ for i in range(1,len(lines)):
447
+ temp_entity.append(lines[i].split('\t'))
448
+ #print('all entity condidates: ',len(temp_entity))
449
+
450
+ # filter out candidates whose score is below the threshold T
451
+ filter1=score_filter(temp_entity,T)
452
+ # print('filter1:', len(filter1))
453
+ filter2=duplicate_filter(filter1)
454
+ #print('filter2:', filter2)
455
+ final_entity_list=filter2
456
+
457
+ fout.write(context+'\n')
458
+ for ele in final_entity_list:
459
+ fout.write('\t'.join(ele)+'\n')
460
+ fout.write('\n')
461
+
462
+ return fout.getvalue()
463
+
464
+
465
+ def model_predict(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
466
+ if nn_model.model_type=='cnn':
467
+ test_set,test_label = ml_intext_fn(ml_input)
468
+ test_x, test_y = nn_model.rep.represent_instances_all_feas(test_set,test_label,word_max_len=nn_model.hyper['sen_max'],char_max_len=nn_model.hyper['word_max'])
469
+ input_test = []
470
+
471
+ if nn_model.fea_dict['word'] == 1:
472
+ input_test.append(test_x[0])
473
+
474
+ if nn_model.fea_dict['char'] == 1:
475
+ input_test.append(test_x[1])
476
+
477
+ if nn_model.fea_dict['lemma'] == 1:
478
+ input_test.append(test_x[2])
479
+
480
+ if nn_model.fea_dict['pos'] == 1:
481
+ input_test.append(test_x[3])
482
+
483
+ test_pre = nn_model.model.predict(input_test,batch_size=256,verbose=0)
484
+
485
+ elif nn_model.model_type=='bert':
486
+
487
+ test_set,test_label = ml_intext_fn(ml_input)
488
+ test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
489
+ test_pre = nn_model.model.predict(test_x,batch_size=128,verbose=0)
490
+
491
+ test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
492
+ #print('test_score:',test_score)
493
+ test_decode_temp=decode_tsv(test_score, ml_input_index, ml_input_txt, T=Threshold)
494
+ #print('decode_temp:\n',test_decode_temp)
495
+ # test_pre_tsv=combine_strategy(test_decode_temp,T=Threshold)
496
+ return test_decode_temp
497
+
498
+ def model_predict_old(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
499
+ if nn_model.model_type=='cnn':
500
+ test_set,test_label = ml_intext_fn(ml_input)
501
+ test_x, test_y = nn_model.rep.represent_instances_all_feas(test_set,test_label,word_max_len=nn_model.hyper['sen_max'],char_max_len=nn_model.hyper['word_max'])
502
+ input_test = []
503
+
504
+ if nn_model.fea_dict['word'] == 1:
505
+ input_test.append(test_x[0])
506
+
507
+ if nn_model.fea_dict['char'] == 1:
508
+ input_test.append(test_x[1])
509
+
510
+ if nn_model.fea_dict['lemma'] == 1:
511
+ input_test.append(test_x[2])
512
+
513
+ if nn_model.fea_dict['pos'] == 1:
514
+ input_test.append(test_x[3])
515
+
516
+ test_pre = nn_model.model.predict(input_test,batch_size=256)
517
+
518
+ elif nn_model.model_type=='bert':
519
+
520
+ test_set,test_label = ml_intext_fn(ml_input)
521
+ test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
522
+ test_pre = nn_model.model.predict(test_x,batch_size=128)
523
+
524
+ test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
525
+ #print('test_score:',test_score)
526
+ test_decode_temp=decode_tsv(test_score, ml_input_index, ml_input_txt, T=0.0)
527
+ #print('decode_temp:\n',test_decode_temp)
528
+ test_pre_tsv=combine_strategy(test_decode_temp,T=Threshold)
529
+ return test_pre_tsv
530
+
531
+ def output_txt(ml_input_txt):
532
+ fin_text=io.StringIO(ml_input_txt)
533
+ fout=io.StringIO()
534
+
535
+ test_txt=fin_text.read().strip().split('\n')
536
+
537
+ fin_text.close()
538
+
539
+ for i in range(0,len(test_txt)):
540
+ fout.write(test_txt[i]+'\n')
541
+ fout.write('\n')
542
+
543
+ return fout.getvalue()
544
+
545
+ def ml_tagging(ssplit_token,ml_model,Threshold):
546
+ ml_input, ml_input_txt,ml_input_index=build_ngram_testset_filted(ssplit_token)
547
+ #print('ml_input:')
548
+ #print(ml_input)
549
+ if len(ml_input_index)>0:
550
+ ml_pre_tsv=model_predict(ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
551
+ else:
552
+ ml_pre_tsv=output_txt(ml_input_txt)
553
+ return ml_pre_tsv
554
+
555
+ def ml_tagging_allngram(ssplit_token,ml_model,Threshold):
556
+ ml_input, ml_input_txt,ml_input_index=build_all_ngram_testset_filted(ssplit_token)
557
+ #print('ml_input:')
558
+ #print(ml_input)
559
+ if len(ml_input_index)>0:
560
+ ml_pre_tsv=model_predict_old(ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
561
+ else:
562
+ ml_pre_tsv=output_txt(ml_input_txt)
563
+ return ml_pre_tsv
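A sketch of the tab-separated token input that build_ngram_testset_filted consumes (one token per line, sentences separated by blank lines). The column layout used here is an assumption (token, lemma, an extra feature column, POS tag), and the sample sentence is invented; only the standalone n-gram builder is exercised, no trained model is needed:

from src.ml_ner import build_ngram_testset_filted

# Assumed 4-column layout: token \t lemma \t extra feature \t POS.
conll = ("Patients\tpatient\tB-NP\tNNS\n"
         "show\tshow\tB-VP\tVBP\n"
         "muscular\tmuscular\tB-NP\tJJ\n"
         "hypotonia\thypotonia\tI-NP\tNN\n")

ngram_input, sent_txt, index_map = build_ngram_testset_filted(conll, Ngram=4)
print(ngram_input)  # candidate n-grams, each labelled ONT:None for the classifier
print(index_map)    # candidate id -> [sentence index, start token, end token]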
src/nn_model.py ADDED
@@ -0,0 +1,130 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Thu Mar 26 09:04:13 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import time
9
+ import sys
10
+ import numpy as np
11
+ import tensorflow as tf
12
+ from src.nn_represent import CNN_RepresentationLayer,BERT_RepresentationLayer
13
+ from tensorflow.keras.layers import *
14
+ from tensorflow.keras.models import Model
15
+ # from keras_bert import load_trained_model_from_checkpoint
16
+ from transformers import TFAutoModel
17
+
18
+
19
+ '''
20
+ import keras.backend.tensorflow_backend as KTF
21
+ physical_devices =tf.config.experimental.list_physical_devices('GPU')
22
+ '''
23
+
24
+ class bioTag_CNN():
25
+ def __init__(self, model_files):
26
+ self.model_type='cnn'
27
+ model_test_type='cnn'
28
+ self.fea_dict = {'word': 1,
29
+ 'char': 1,
30
+ 'lemma':0,
31
+ 'pos':0}
32
+
33
+ self.hyper = {'sen_max' :20,
34
+ 'word_max' :40,
35
+ 'charvec_size' :50,
36
+ 'pos_size' :50}
37
+
38
+ self.w2vfile=model_files['w2vfile']
39
+ self.charfile=model_files['charfile']
40
+ self.labelfile=model_files['labelfile']
41
+ self.posfile=model_files['posfile']
42
+
43
+ vocab={'char':self.charfile,'label':self.labelfile,'pos':self.posfile}
44
+ print('loading w2v model.....')
45
+ self.rep = CNN_RepresentationLayer(self.w2vfile,vocab_file=vocab, frequency=400000)
46
+
47
+ print('building model......')
48
+ all_fea = []
49
+ fea_list = []
50
+
51
+ if self.fea_dict['word'] == 1:
52
+ word_input = Input(shape=(self.hyper['sen_max'],), dtype='int32', name='word_input')
53
+ all_fea.append(word_input)
54
+ word_fea = Embedding(self.rep.vec_table.shape[0], self.rep.vec_table.shape[1], weights=[self.rep.vec_table], trainable=True,mask_zero=False, input_length=self.hyper['sen_max'], name='word_emd')(word_input)
55
+ fea_list.append(word_fea)
56
+
57
+ if self.fea_dict['char'] == 1:
58
+ char_input = Input(shape=(self.hyper['sen_max'],self.hyper['word_max']), dtype='int32', name='char_input')
59
+ all_fea.append(char_input)
60
+ char_fea = TimeDistributed(Embedding(self.rep.char_table_size, self.hyper['charvec_size'], trainable=True,mask_zero=False), name='char_emd')(char_input)
61
+ char_fea = TimeDistributed(Conv1D(self.hyper['charvec_size']*2, 3, padding='same',activation='relu'), name="char_cnn")(char_fea)
62
+ char_fea_max = TimeDistributed(GlobalMaxPooling1D(), name="char_pooling_max")(char_fea)
63
+ fea_list.append(char_fea_max)
64
+
65
+ if self.fea_dict['lemma'] == 1:
66
+ lemma_input = Input(shape=(self.hyper['sen_max'],), dtype='int32', name='lemma_input')
67
+ all_fea.append(lemma_input)
68
+ lemma_fea = Embedding(self.rep.vec_table.shape[0], self.rep.vec_table.shape[1], weights=[self.rep.vec_table], trainable=True,mask_zero=False, input_length=self.hyper['sen_max'], name='lemma_emd')(lemma_input)
69
+ fea_list.append(lemma_fea)
70
+
71
+ if self.fea_dict['pos'] == 1:
72
+ pos_input = Input(shape=(self.hyper['sen_max'],), dtype='int32', name='pos_input')
73
+ all_fea.append(pos_input)
74
+ pos_fea = Embedding(self.rep.pos_table_size, self.hyper['pos_size'], trainable=True,mask_zero=False, input_length=self.hyper['sen_max'], name='pos_emd')(pos_input)
75
+ fea_list.append(pos_fea)
76
+
77
+ if len(fea_list) == 1:
78
+ concate_vec = fea_list[0]
79
+ else:
80
+ concate_vec = Concatenate()(fea_list)
81
+
82
+ concate_vec = Dropout(0.4)(concate_vec)
83
+
84
+ # model
85
+ if model_test_type=='cnn':
86
+ cnn = Conv1D(1024, 1, padding='valid', activation='relu',name='cnn1')(concate_vec)
87
+ cnn = GlobalMaxPooling1D()(cnn)
88
+ elif model_test_type=='lstm':
89
+ bilstm = Bidirectional(LSTM(200, return_sequences=True, implementation=2, dropout=0.4, recurrent_dropout=0.4), name='bilstm1')(concate_vec)
90
+ cnn = GlobalMaxPooling1D()(bilstm)
91
+
92
+
93
+ dense = Dense(1024, activation='relu')(cnn)
94
+ dense= Dropout(0.4)(dense)
95
+ output = Dense(self.rep.label_table_size, activation='softmax')(dense)
96
+ self.model = Model(inputs=all_fea, outputs=output)
97
+ def load_model(self,model_file):
98
+ self.model.load_weights(model_file)
99
+ self.model.summary()
100
+ print('load model done!')
101
+
102
+ class bioTag_BERT():
103
+ def __init__(self, model_files):
104
+ self.model_type='bert'
105
+ self.maxlen = 32
106
+
107
+ self.checkpoint_path = model_files['checkpoint_path']
108
+ self.label_file=model_files['labelfile']
109
+ self.lowercase=model_files['lowercase']
110
+ self.rep = BERT_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)
111
+
112
+
113
+ plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
114
+
115
+ x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
116
+ x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
117
+ x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
118
+ #x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[1]
119
+ #x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
120
+ #x = GlobalMaxPooling1D()(x)
121
+ x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0][:,0,:] #[CLS] embedding
122
+ outputs = Dense(self.rep.label_table_size, activation='softmax')(x)
123
+
124
+ self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=outputs)
125
+
126
+ def load_model(self,model_file):
127
+ self.model.load_weights(model_file)
128
+ self.model.summary()
129
+
130
+
src/nn_represent.py ADDED
@@ -0,0 +1,289 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 10:02:20 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import time
8
+ import os, sys
9
+ import numpy as np
10
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
11
+ # from keras_bert import Tokenizer
12
+ from transformers import AutoTokenizer
13
+
14
+
15
+ class CNN_RepresentationLayer(object):
16
+
17
+
18
+ def __init__(self, wordvec_file, vocab_file=[],\
19
+ vec_size=50, word_size=10000, frequency=10000):
20
+
21
+ '''
22
+ wordvec_file : the file path of word embedding
23
+ vec_size : the dimension size of word vector
24
+ learned by word2vec tool
25
+
26
+ word_size : the size of word vocabulary
27
+
28
+ frequency : the threshold for the words left according to
29
+ their frequency appeared in the text
30
+ for example, when frequency is 10000, the most
31
+ frequent appeared 10000 words are considered
32
+
33
+ '''
34
+ #load word embedding
35
+ file = open(wordvec_file)
36
+ first_line = file.readline().strip()
37
+ file.close()
38
+ self.word_size = int(first_line.split()[0])
39
+ self.vec_size = int(first_line.split()[1])
40
+ self.frequency = frequency
41
+
42
+ if self.frequency>self.word_size:
43
+ self.vec_table = np.zeros((self.word_size + 2, self.vec_size))
44
+ else:
45
+ self.vec_table = np.zeros((self.frequency + 2, self.vec_size))
46
+ self.word_2_index = {}
47
+ self.load_wordvecs(wordvec_file)
48
+
49
+ #other fea
50
+ self.char_2_index={}
51
+ self.char_table_size=0
52
+ if 'char' in vocab_file.keys():
53
+ self.load_fea_vocab(vocab_file['char'],self.char_2_index)
54
+ self.char_table_size=len(self.char_2_index)
55
+ #print(self.char_table_size)
56
+ #print(self.char_2_index)
57
+
58
+ self.label_2_index={}
59
+ self.label_table_size=0
60
+ if 'label' in vocab_file.keys():
61
+ self.load_label_vocab(vocab_file['label'],self.label_2_index)
62
+ self.label_table_size=len(self.label_2_index)
63
+ #print(self.label_table_size)
64
+ #print(self.char_2_index)
65
+
66
+ self.pos_2_index={}
67
+ self.pos_table_size=0
68
+ if 'pos' in vocab_file.keys():
69
+ self.load_fea_vocab(vocab_file['pos'],self.pos_2_index)
70
+ self.pos_table_size=len(self.pos_2_index)
71
+ #print(self.pos_table_size)
72
+
73
+
74
+
75
+ def load_wordvecs(self, wordvec_file):
76
+
77
+ file = open(wordvec_file,'r',encoding='utf-8')
78
+ file.readline()
79
+ #print(self.word_size)
80
+ #print(self.vec_size)
81
+ row = 0
82
+ self.word_2_index['padding_0'] = row # padding index - zero vector
83
+ row+=1
84
+ for line in file:
85
+ if row <= self.word_size and row <= self.frequency:
86
+ line_split = line.strip().split(' ')
87
+ self.word_2_index[line_split[0]] = row
88
+ for col in range(self.vec_size):
89
+ self.vec_table[row][col] = float(line_split[col + 1])
90
+ row += 1
91
+ else:
92
+ break
93
+
94
+ self.word_2_index['sparse_vectors'] = row #oov-zero vector
95
+ file.close()
96
+
97
+ def load_fea_vocab(self,fea_file,fea_index):
98
+ fin=open(fea_file,'r',encoding='utf-8')
99
+ i=0
100
+ fea_index['padding_0']=i
101
+ i+=1
102
+ fea_index['oov_padding']=i
103
+ i+=1
104
+ for line in fin:
105
+ fea_index[line.strip()]=i
106
+ i+=1
107
+ fin.close()
108
+
109
+ def load_label_vocab(self,fea_file,fea_index):
110
+ fin=open(fea_file,'r',encoding='utf-8')
111
+ i=0
112
+ for line in fin:
113
+ fea_index[line.strip()]=i
114
+ i+=1
115
+ fin.close()
116
+
117
+ '''
118
+ def generate_label_list(self,labels):
119
+ label_list=[]
120
+
121
+ for label in labels:
122
+ temp_label=[0]*self.label_table_size
123
+ temp_label[self.label_2_index[label]]=1
124
+ label_list.append(temp_label)
125
+ return label_list
126
+ '''
127
+ def generate_label_list(self,labels):
128
+ sparse_labels=[]
129
+ for ele in labels:
130
+ sparse_labels.append(self.label_2_index[ele])
131
+ return(sparse_labels)
132
+
133
+ def represent_instances_all_feas(self, instances, labels, word_max_len=100, char_max_len=50, training=False):
134
+
135
+ x_text_list=[]
136
+ x_word_list=[]
137
+ x_char_list=[]
138
+ x_lemma_list=[]
139
+ x_pos_list=[]
140
+
141
+ y_list=[]
142
+
143
+ for sentence in instances:
144
+ sentence_list=[]
145
+ sentence_word_list=[]
146
+ sentence_lemma_list=[]
147
+ sentence_pos_list=[]
148
+ sentence_text=[]
149
+ for j in range(0,len(sentence)):
150
+ word=sentence[j]
151
+ #char fea
152
+ char_list=[0]*char_max_len
153
+ for i in range(len(word[0])):
154
+ if i<char_max_len:
155
+ if word[0][i] in self.char_2_index.keys():
156
+ char_list[i]=self.char_2_index[word[0][i]]
157
+ else:
158
+ char_list[i]=self.char_2_index['oov_padding']
159
+ sentence_word_list.append(char_list)
160
+
161
+ #word fea
162
+ sentence_text.append(word[0].lower())
163
+ if word[0].lower() in self.word_2_index.keys():
164
+ sentence_list.append(self.word_2_index[word[0].lower()])
165
+ else:
166
+ sentence_list.append(self.word_2_index['sparse_vectors'])
167
+
168
+ #lemma fea
169
+ if word[1].lower() in self.word_2_index.keys():
170
+ sentence_lemma_list.append(self.word_2_index[word[1].lower()])
171
+ else:
172
+ sentence_lemma_list.append(self.word_2_index['sparse_vectors'])
173
+
174
+ #pos fea
175
+ if word[3] in self.pos_2_index.keys():
176
+ sentence_pos_list.append(self.pos_2_index[word[3]])
177
+ else:
178
+ sentence_pos_list.append(self.pos_2_index['oov_padding'])
179
+
180
+ x_text_list.append(sentence_text)
181
+ x_word_list.append(sentence_list)
182
+ x_char_list.append(sentence_word_list)
183
+ x_lemma_list.append(sentence_lemma_list)
184
+ x_pos_list.append(sentence_pos_list)
185
+
186
+ if training==True:
187
+ y_list=self.generate_label_list(labels)
188
+ x_word_np = pad_sequences(x_word_list, word_max_len, value=0, padding='post',truncating='post') # right padding
189
+ x_char_np = pad_sequences(x_char_list, word_max_len, value=0, padding='post',truncating='post')
190
+ x_lemma_np = pad_sequences(x_lemma_list, word_max_len, value=0, padding='post',truncating='post')
191
+ x_pos_np = pad_sequences(x_pos_list, word_max_len, value=0, padding='post',truncating='post')
192
+ y_np = np.array(y_list)
193
+
194
+ else:
195
+ x_word_np = pad_sequences(x_word_list, word_max_len, value=0, padding='post',truncating='post') # right padding
196
+ x_char_np = pad_sequences(x_char_list, word_max_len, value=0, padding='post',truncating='post')
197
+ x_lemma_np=[]
198
+ x_pos_np=[]
199
+ y_np=[]
200
+
201
+ return [x_word_np, x_char_np, x_lemma_np, x_pos_np, x_text_list], y_np
202
+
203
+
204
+
205
+ class BERT_RepresentationLayer(object):
206
+
207
+
208
+ def __init__(self, tokenizer_name_or_path, label_file,lowercase=True):
209
+
210
+
211
+ #load vocab
212
+ self.model_type='bert'
213
+ #self.model_type='roberta'
214
+ if self.model_type in {"gpt2", "roberta"}:
215
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True,do_lower_case=lowercase)
216
+ else:
217
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True,do_lower_case=lowercase)
218
+
219
+ #load label
220
+ self.label_2_index={}
221
+ self.index_2_label={}
222
+ self.label_table_size=0
223
+ self.load_label_vocab(label_file,self.label_2_index,self.index_2_label)
224
+ self.label_table_size=len(self.label_2_index)
225
+ self.vocab_len=len(self.tokenizer)
226
+
227
+ def load_label_vocab(self,fea_file,fea_index,index_2_label):
228
+
229
+ fin=open(fea_file,'r',encoding='utf-8')
230
+ all_text=fin.read().strip().split('\n')
231
+ fin.close()
232
+ for i in range(0,len(all_text)):
233
+ fea_index[all_text[i]]=i
234
+ index_2_label[str(i)]=all_text[i]
235
+
236
+ def generate_label_list(self,labels):
237
+ sparse_labels=[]
238
+ for ele in labels:
239
+ sparse_labels.append(self.label_2_index[ele])
240
+ return(sparse_labels)
241
+
242
+ def load_data(self,instances, labels, word_max_len=100,training=False):
243
+
244
+ x_index=[]
245
+ x_seg=[]
246
+ x_mask=[]
247
+ y_list=[]
248
+
249
+ for sentence in instances:
250
+ sentence_text_list=[]
251
+ for j in range(0,len(sentence)):
252
+ sentence_text_list.append(sentence[j][0].lower()) #input lower
253
+
254
+ token_result=self.tokenizer(
255
+ sentence_text_list,
256
+ max_length=word_max_len,
257
+ truncation=True,is_split_into_words=True)
258
+
259
+ bert_tokens=self.tokenizer.convert_ids_to_tokens(token_result['input_ids'])
260
+ word_index=token_result.word_ids(batch_index=0)
261
+
262
+
263
+ x_index.append(token_result['input_ids'])
264
+ if self.model_type in {"gpt2", "roberta"}:
265
+ x_seg.append([0]*len(token_result['input_ids']))
266
+ else:
267
+ x_seg.append(token_result['token_type_ids'])
268
+ x_mask.append(token_result['attention_mask'])
269
+
270
+ if training==True:
271
+ y_list=self.generate_label_list(labels)
272
+
273
+ x1_np = pad_sequences(x_index, word_max_len, value=0, padding='post',truncating='post') # right padding
274
+ x2_np = pad_sequences(x_seg, word_max_len, value=0, padding='post',truncating='post')
275
+ x3_np = pad_sequences(x_mask, word_max_len, value=0, padding='post',truncating='post')
276
+ y_np = np.array(y_list)
277
+
278
+ else:
279
+ x1_np = pad_sequences(x_index, word_max_len, value=0, padding='post',truncating='post') # right padding
280
+ x2_np = pad_sequences(x_seg, word_max_len, value=0, padding='post',truncating='post')
281
+ x3_np = pad_sequences(x_mask, word_max_len, value=0, padding='post',truncating='post')
282
+ y_np=[]
283
+
284
+ return [x1_np, x2_np, x3_np], y_np
285
+
286
+ if __name__ == '__main__':
287
+ pass
288
+
289
+
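A toy sketch of the word2vec text format that CNN_RepresentationLayer.load_wordvecs reads (header line "<vocab size> <dimension>", then one word per line followed by its vector). The file path, words and vectors are made up, and the module's TensorFlow/transformers dependencies are assumed to be installed:

import os, tempfile
from src.nn_represent import CNN_RepresentationLayer

w2v_path = os.path.join(tempfile.gettempdir(), 'toy_w2v.txt')
with open(w2v_path, 'w', encoding='utf-8') as f:
    f.write("2 3\n"
            "stature 0.1 0.2 0.3\n"
            "seizure 0.4 0.5 0.6\n")

rep = CNN_RepresentationLayer(w2v_path, vocab_file={})
print(rep.word_2_index)     # padding row, the two words, then the shared OOV row
print(rep.vec_table.shape)  # (4, 3): padding + 2 words + OOV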
src/post_processing.py ADDED
@@ -0,0 +1,58 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Thu Jun 18 20:08:30 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ def combine_overlap(mention_list):
9
+
10
+ entity_list=[]
11
+ if len(mention_list)>2:
12
+
13
+ first_entity=mention_list[0]
14
+ nest_list=[first_entity]
15
+ max_eid=int(first_entity[1])
16
+ for i in range(1,len(mention_list)):
17
+ segs=mention_list[i]
18
+ if int(segs[0])> max_eid:
19
+ if len(nest_list)==1:
20
+ entity_list.append(nest_list[0])
21
+ nest_list=[]
22
+ nest_list.append(segs)
23
+ if int(segs[1])>max_eid:
24
+ max_eid=int(segs[1])
25
+ else:
26
+ tem=find_max_entity(nest_list)#find max entity
27
+ entity_list.append(tem)
28
+ nest_list=[]
29
+ nest_list.append(segs)
30
+ if int(segs[1])>max_eid:
31
+ max_eid=int(segs[1])
32
+
33
+ else:
34
+ nest_list.append(segs)
35
+ if int(segs[1])>max_eid:
36
+ max_eid=int(segs[1])
37
+ if nest_list!=[]:
38
+ if len(nest_list)==1:
39
+ entity_list.append(nest_list[0])
40
+
41
+ else:
42
+ tem=find_max_entity(nest_list)#find max entity
43
+ entity_list.append(tem)
44
+ else:
45
+ entity_list=mention_list
46
+
47
+ return entity_list
48
+
49
+ def find_max_entity(nest_list):
50
+ max_len=0
51
+ max_entity=[]
52
+ for i in range(0, len(nest_list)):
53
+ length=int(nest_list[i][1])-int(nest_list[i][0])
54
+ if length>max_len:
55
+ max_len=length
56
+ max_entity=nest_list[i]
57
+
58
+ return max_entity
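A short sketch of combine_overlap on hand-made mention rows (start, end, mention, concept id); the spans and ids are illustrative. Overlapping spans are collapsed to the longest one, non-overlapping spans pass through:

from src.post_processing import combine_overlap

mentions = [
    ['0', '5', 'Short', 'HP:0000002'],           # overlaps the longer span below
    ['0', '13', 'Short stature', 'HP:0004322'],
    ['18', '26', 'seizures', 'HP:0001250'],
]
# Keeps ['0','13',...] (the longest of the overlapping pair) and ['18','26',...].
print(combine_overlap(mentions))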
src/restore_index.py ADDED
@@ -0,0 +1,109 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sun Jun 14 17:19:02 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import io
9
+ import sys
10
+
11
+ def restore_index_nest_fn(ori_text,file_pre):
12
+
13
+
14
+ fin_pre=io.StringIO(file_pre)
15
+ #print(file_pre)
16
+ all_pre=fin_pre.read().strip().split('\n\n')
17
+ fin_pre.close()
18
+ #print(len(all_pre))
19
+
20
+ new_sentence=''
21
+ restore_result=[]
22
+
23
+ sentence_ori=ori_text.lower().replace('``','" ')
24
+ sentence_ori=sentence_ori.replace("''",'" ')
25
+ for pre_i in range(0,len(all_pre)):
26
+ pre_lines=all_pre[pre_i].split('\n')
27
+ #print(pre_lines)
28
+ # print(sentence_ori)
29
+ if len(pre_lines)>1:
30
+ #print(pre_lines)
31
+ sentence_pre=pre_lines[0].lower().replace('``','"')
32
+ sentence_pre=sentence_pre.replace("''",'"')
33
+ sentence_pre=sentence_pre.split()
34
+ pre_result=[]
35
+ for i in range(1,len(pre_lines)):
36
+ pre_result.append(pre_lines[i].split('\t'))
37
+
38
+ restore_sid=0
39
+ restore_eid=0
40
+ each_word_id=[]
41
+
42
+ for i in range(0,len(sentence_pre)):
43
+
44
+ temp_id=sentence_ori.find(sentence_pre[i])
45
+ if temp_id<0:
46
+ if sentence_pre[i].find('"')>=0:
47
+ temp_id = sentence_ori.find(sentence_pre[i].replace('"','" '))
48
+ else:
49
+ #print('ori:',sentence_ori)
50
+ print('restore index error:',sentence_pre[i])
51
+ new_sentence+=sentence_ori[0:temp_id]
52
+
53
+ restore_sid=len(new_sentence)
54
+ restore_eid=len(new_sentence)+len(sentence_pre[i])
55
+ each_word_id.append([str(restore_sid),str(restore_eid)])
56
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
57
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
58
+ # print('each_word:',each_word_id)
59
+ for pre_ele in pre_result:
60
+ # if len(pre_ele)>4:
61
+ # temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[3].split('|')[0],pre_ele[4]]
62
+ # else:
63
+ # temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[3].split('|')[0],'1.00']
64
+ temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[3].split('|')[0],pre_ele[4]]
65
+ if temp_pre_result not in restore_result:
66
+ restore_result.append(temp_pre_result)
67
+ else:
68
+ sentence_pre=pre_lines[0].lower().replace('``','"')
69
+ sentence_pre=sentence_pre.replace("''",'"')
70
+ sentence_pre=sentence_pre.split()
71
+
72
+ for i in range(0,len(sentence_pre)):
73
+
74
+ temp_id=sentence_ori.find(sentence_pre[i])
75
+ if temp_id<0:
76
+ if sentence_pre[i].find('"')>=0:
77
+ temp_id = sentence_ori.find(sentence_pre[i].replace('"','" '))
78
+ else:
79
+ print('restore index error:',sentence_pre[i])
80
+ new_sentence+=sentence_ori[0:temp_id]
81
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
82
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
83
+ # print('restore:',restore_result)
84
+ return restore_result
85
+
86
+ if __name__=='__main__':
87
+ path='//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/data/test/gsc/result/'
88
+ fin=open(path+'GSCplus_Nest_biobert.tsv','r',encoding='utf-8')
89
+ fout=open(path+'GSCplus_Nest_restore_biobert.tsv','w',encoding='utf-8')
90
+ all_context=fin.read().strip().split('\n\n\n\n')
91
+ fin.close()
92
+ file_num=0
93
+ for doc in all_context:
94
+ file_num+=1
95
+ print('file_num:',file_num)
96
+ doc_ele=doc.split('\n\n')
97
+ first_line = doc_ele[0].split('\n')
98
+ pmid=first_line[0]
99
+ ori_text=first_line[1]
100
+ pre_result='\n\n'.join(doc_ele[1:])
101
+ # print('pmid:',pmid)
102
+ # print('ori:',ori_text)
103
+ # print('pre:',pre_result)
104
+ final_result=restore_index_nest_fn(ori_text,pre_result)
105
+ fout.write(pmid+'\n'+ori_text+'\n')
106
+ for ele in final_result:
107
+ fout.write('\t'.join(ele)+'\t'+ori_text[int(ele[0]):int(ele[1])]+'\n')
108
+ fout.write('\n')
109
+ fout.close()
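A minimal sketch of the per-sentence block format that restore_index_nest_fn parses, assumed from the parsing code above: the first line is the tokenized sentence, then one tab-separated mention per line with token-level start, token-level end, a column unused by this function (shown here as the mention text), the label(s) joined by '|', and a score. The function maps the token indices back to character offsets in the original text:

    ori_text = 'Patients show severe mental retardation .'
    file_pre = ('patients show severe mental retardation .\n'
                '2\t4\tsevere mental retardation\tHP:0001249\t0.99\n')
    print(restore_index_nest_fn(ori_text, file_pre))
    # [['14', '39', 'HP:0001249', '0.99']]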
src/src_app-old.py ADDED
@@ -0,0 +1,268 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Nov 21 16:21:25 2022
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import streamlit as st
9
+ from src.nn_model import bioTag_CNN,bioTag_BERT
10
+ from src.dic_ner import dic_ont
11
+ from src.tagging_text import bioTag
12
+ import os
13
+ import json
14
+ from pandas import DataFrame
15
+ import nltk
16
+ nltk.download('punkt')
17
+ nltk.download('averaged_perceptron_tagger')
18
+ nltk.download('wordnet')
19
+
20
+ st.set_page_config(
21
+ page_title="PhenoTagger_v1.2",
22
+ page_icon="🎈",
23
+ layout="wide",
24
+ menu_items={
25
+ 'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
26
+ 'About': "PhenoTagger v1.2"
27
+ }
28
+ )
29
+
30
+
31
+ # def _max_width_():
32
+ # max_width_str = f"max-width: 2400px;"
33
+ # st.markdown(
34
+ # f"""
35
+ # <style>
36
+ # .reportview-container .main .block-container{{
37
+ # {max_width_str}
38
+ # }}
39
+ # </style>
40
+ # """,
41
+ # unsafe_allow_html=True,
42
+ # )
43
+
44
+
45
+ # _max_width_()
46
+
47
+ # c30, c31, c32 = st.columns([2.5, 1, 3])
48
+
49
+ # with c30:
50
+ # # st.image("logo.png", width=400)
51
+ st.title("👨‍⚕️ PhenoTagger_v1.2 Demo")
52
+
53
+ with st.expander("ℹ️ - About this app", expanded=True):
54
+
55
+ st.write(
56
+ """
57
+ - This app is an easy-to-use interface built in Streamlit for the [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library!
58
+ - PhenoTagger is a hybrid method that combines dictionary and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
59
+ - Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
60
+ """
61
+ )
62
+
63
+ st.markdown("")
64
+
65
+ st.markdown("")
66
+ st.markdown("## 📌 Paste document ")
67
+ with st.form(key="my_form"):
68
+
69
+
70
+ ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 4, 0.07])
71
+ with c1:
72
+ ModelType = st.radio(
73
+ "Choose your Ontology",
74
+ ["HPO(Default)", "UBERON"],
75
+ #help="Bioformer is more precise, CNN is more efficient",
76
+ )
77
+
78
+ if ModelType == "HPO(Default)":
79
+ # kw_model = KeyBERT(model=roberta)
80
+
81
+ @st.cache(allow_output_mutation=True)
82
+ def load_model():
83
+ ontfiles={'dic_file':'./dict_hpo/noabb_lemma.dic',
84
+ 'word_id_file':'./dict_hpo/word_id_map.json',
85
+ 'id_word_file':'./dict_hpo/id_word_map.json'}
86
+
87
+
88
+ vocabfiles={'labelfile':'./dict_hpo/lable.vocab',
89
+ 'checkpoint_path':'./models_v1.2/bioformer-cased-v1.0/',
90
+ 'lowercase':False}
91
+ modelfile='./models_v1.2/bioformer-HPO.h5'
92
+
93
+
94
+ biotag_dic=dic_ont(ontfiles)
95
+
96
+ nn_model=bioTag_BERT(vocabfiles)
97
+ nn_model.load_model(modelfile)
98
+ return nn_model,biotag_dic
99
+
100
+ nn_model,biotag_dic = load_model()
101
+
102
+ else:
103
+ @st.cache(allow_output_mutation=True)
104
+ def load_model():
105
+ ontfiles={'dic_file':'./dict_uberon/noabb_lemma.dic',
106
+ 'word_id_file':'./dict_uberon/word_id_map.json',
107
+ 'id_word_file':'./dict_uberon/id_word_map.json'}
108
+
109
+ vocabfiles={'labelfile':'./dict_uberon/lable.vocab',
110
+ 'checkpoint_path':'./models_v1.2/bioformer-cased-v1.0/',
111
+ 'lowercase':False}
112
+
113
+ modelfile='./models_v1.2/bioformer-UBERON.h5'
114
+
115
+ biotag_dic=dic_ont(ontfiles)
116
+
117
+ nn_model=bioTag_CNN(vocabfiles)
118
+ nn_model.load_model(modelfile)
119
+
120
+ return nn_model,biotag_dic
121
+
122
+ nn_model,biotag_dic = load_model()
123
+
124
+ para_overlap = st.checkbox(
125
+ "Overlapping concepts",
126
+ value=False,
127
+ help="Tick this box to identify overlapping concepts",
128
+ )
129
+ para_abbr = st.checkbox(
130
+ "Abbreviations",
131
+ value=True,
132
+ help="Tick this box to identify abbreviations",
133
+ )
134
+
135
+ para_threshold = st.slider(
136
+ "Threshold",
137
+ min_value=0.5,
138
+ max_value=1.0,
139
+ value=0.95,
140
+ step=0.05,
141
+ help="Return the predictions whose score is above the threshold.",
142
+ )
143
+
144
+
145
+
146
+
147
+ with c2:
148
+
149
+
150
+ doc = st.text_area(
151
+ "Paste your text below",
152
+ value = 'The clinical features of Angelman syndrome (AS) comprise severe mental retardation, postnatal microcephaly, macrostomia and prognathia, absence of speech, ataxia, and a happy disposition. We report on seven patients who lack most of these features, but presented with obesity, muscular hypotonia and mild mental retardation. Based on the latter findings, the patients were initially suspected of having Prader-Willi syndrome. DNA methylation analysis of SNRPN and D15S63, however, revealed an AS pattern, ie the maternal band was faint or absent. Cytogenetic studies and microsatellite analysis demonstrated apparently normal chromosomes 15 of biparental inheritance. We conclude that these patients have an imprinting defect and a previously unrecognised form of AS. The mild phenotype may be explained by an incomplete imprinting defect or by cellular mosaicism.',
153
+ height=400,
154
+ )
155
+
156
+
157
+
158
+
159
+ # MAX_WORDS = 500
160
+ # import re
161
+ # res = len(re.findall(r"\w+", doc))
162
+ # if res > MAX_WORDS:
163
+ # st.warning(
164
+ # "⚠️ Your text contains "
165
+ # + str(res)
166
+ # + " words."
167
+ # + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
168
+ # )
169
+
170
+ # doc = doc[:MAX_WORDS]
171
+
172
+ submit_button = st.form_submit_button(label="✨ Submit!")
173
+
174
+
175
+ if not submit_button:
176
+ st.stop()
177
+
178
+ #st.write(para_overlap,para_abbr,para_threshold)
179
+ para_set={
180
+ #model_type':para_model, # cnn or bioformer
181
+ 'onlyLongest': not para_overlap, # False: return overlapping concepts, True: only the longest
182
+ 'abbrRecog':para_abbr,# False: don't identify abbr, True: identify abbr
183
+ 'ML_Threshold':para_threshold,# score threshold of the deep learning model
184
+ }
185
+ st.markdown("")
186
+ st.markdown("## 💡 Tagging results:")
187
+ with st.spinner('Wait for tagging...'):
188
+ tag_result=bioTag(doc,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
189
+
190
+ st.markdown('<font style="color: rgb(128, 128, 128);">Move the mouse🖱️ over the entity to display the HPO id.</font>', unsafe_allow_html=True)
191
+ # print('dic...........:',biotag_dic.keys())
192
+ # st.write('parameters:', para_overlap,para_abbr,para_threshold)
193
+
194
+ html_results=''
195
+ text_results=doc+'\n'
196
+ entity_end=0
197
+ hpoid_count={}
198
+ if len(tag_result)>0:
199
+ for ele in tag_result:
200
+ entity_start=int(ele[0])
201
+ html_results+=doc[entity_end:entity_start]
202
+ entity_end=int(ele[1])
203
+ entity_id=ele[2]
204
+ entity_score=ele[3]
205
+ text_results+=ele[0]+'\t'+ele[1]+'\t'+doc[entity_start:entity_end]+'\t'+ele[2]+'\t'+format(float(ele[3]),'.2f')+'\n'
206
+ if entity_id not in hpoid_count.keys():
207
+ hpoid_count[entity_id]=1
208
+ else:
209
+ hpoid_count[entity_id]+=1
210
+
211
+ html_results+='<font style="background-color: rgb(255, 204, 0)'+';" title="'+entity_id+'">'+doc[entity_start:entity_end]+'</font>'
212
+ html_results+=doc[entity_end:]
213
+
214
+ else:
215
+ html_results=doc
216
+
217
+ st.markdown('<table border="1"><tr><td>'+html_results+'</td></tr></table>', unsafe_allow_html=True)
218
+
219
+
220
+ #table
221
+ data_entity=[]
222
+ for ele in hpoid_count.keys():
223
+ segs=ele.split(';')
224
+ term_name=''
225
+ for seg in segs:
226
+ term_name+=biotag_dic.id_word[seg][0]+';'
227
+ temp=[ele,term_name,hpoid_count[ele]] #hpoid, term name, count
228
+ data_entity.append(temp)
229
+
230
+
231
+ st.markdown("")
232
+ st.markdown("")
233
+ # st.markdown("## Table output:")
234
+
235
+ # cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2])
236
+
237
+ # with c1:
238
+ # CSVButton2 = download_button(keywords, "Data.csv", "📥 Download (.csv)")
239
+ # with c2:
240
+ # CSVButton2 = download_button(keywords, "Data.txt", "📥 Download (.txt)")
241
+ # with c3:
242
+ # CSVButton2 = download_button(keywords, "Data.json", "📥 Download (.json)")
243
+
244
+ # st.header("")
245
+
246
+ df = (
247
+ DataFrame(data_entity, columns=["Ontology_id", "Term name","Frequency"])
248
+ .sort_values(by="Frequency", ascending=False)
249
+ .reset_index(drop=True)
250
+ )
251
+
252
+ df.index += 1
253
+
254
+ c1, c2, c3 = st.columns([1, 4, 1])
255
+
256
+ # format_dictionary = {
257
+ # "Relevancy": "{:.1%}",
258
+ # }
259
+
260
+ # df = df.format(format_dictionary)
261
+
262
+ with c2:
263
+ st.table(df)
264
+
265
+ c1, c2, c3 = st.columns([1, 1, 1])
266
+ with c2:
267
+ st.download_button('Download annotations', text_results)
268
+
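A minimal sketch of the data the rendering loop above consumes: bioTag (defined in src/tagging_text.py below) returns one [start, end, ontology_id, score] list of strings per mention, with character offsets into the submitted document (values below are illustrative only):

    tag_result = [['58', '83', 'HP:0001249', '0.99'],
                  ['95', '118', 'HP:0000252', '0.97']]
    for start, end, hpo_id, score in tag_result:
        print(start, end, hpo_id, score)  # the app also slices doc[int(start):int(end)] for the surface form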
src/ssplit_tokenzier.py ADDED
@@ -0,0 +1,45 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 15:26:44 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import nltk
9
+ from nltk.stem import WordNetLemmatizer
10
+ from nltk.corpus import wordnet
11
+ from nltk.stem.porter import PorterStemmer
12
+ lemmatizer = WordNetLemmatizer()
13
+ stemmer = PorterStemmer()
14
+ import io
15
+
16
+ def get_wordnet_pos(treebank_tag):
17
+ if treebank_tag.startswith('J'):
18
+ return wordnet.ADJ
19
+ elif treebank_tag.startswith('V'):
20
+ return wordnet.VERB
21
+ elif treebank_tag.startswith('N'):
22
+ return wordnet.NOUN
23
+ elif treebank_tag.startswith('R') or treebank_tag=='IN':
24
+ return wordnet.ADV
25
+ else:
26
+ return wordnet.NOUN
27
+
28
+ def ssplit_token_pos_lemma(in_text):
29
+
30
+ fout=io.StringIO()
31
+
32
+ line=in_text.strip()
33
+ line=line.replace('-',' - ').replace('/',' / ')
34
+ sentences = nltk.sent_tokenize(line)
35
+ sentences = [nltk.word_tokenize(sent) for sent in sentences]
36
+ # print(sentences)
37
+ for sent in sentences:
38
+ token_pos = nltk.pos_tag(sent)
39
+ for token in token_pos:
40
+ lemma = lemmatizer.lemmatize(token[0].lower(), get_wordnet_pos(token[1]))
41
+ stem = stemmer.stem(token[0].lower())
42
+ fout.write(token[0]+'\t'+lemma+'\t'+stem+'\t'+token[1]+'\n')
43
+ fout.write('\n')
44
+
45
+ return fout.getvalue()
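A minimal sketch of the output format produced above: one tab-separated line per token with the original token, its lemma, its Porter stem and its POS tag, and a blank line after each sentence (exact tags and stems depend on the installed NLTK models):

    print(ssplit_token_pos_lemma('Patients showed seizures. They improved.'))
    # Patients    patient    patient    NNS
    # showed      show       show       VBD
    # seizures    seizure    seizur     NNS
    # .           .          .          .
    #
    # They        they       they       PRP
    # improved    improve    improv     VBD
    # .           .          .          .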
src/tagging_text.py ADDED
@@ -0,0 +1,98 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 11:33:22 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import argparse
8
+ from src.ssplit_tokenzier import ssplit_token_pos_lemma
9
+ from src.ml_ner import ml_tagging,ml_tagging_allngram
10
+ from src.combine_result import combine_ml_dict
11
+ from src.restore_index import restore_index_nest_fn
12
+ from src.dic_ner import dic_ont
13
+ from src.post_processing import combine_overlap
14
+ from src.abbre_resolution import postprocess_abbr
15
+ import os
16
+ import time
17
+ import json
18
+
19
+ #hybrid method
20
+ def bioTag(text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
21
+
22
+ # startTime=time.time()
23
+ ssplit_token=ssplit_token_pos_lemma(text)
24
+ #print(ssplit_token)
25
+ # print('ssplit token:',time.time()-startTime)
26
+
27
+ # startTime=time.time()
28
+ dict_tsv=biotag_dic.matching(ssplit_token)
29
+ # print('dict tsv:\n',dict_tsv)
30
+ # print('dict ner:',time.time()-startTime)
31
+
32
+ # startTime=time.time()
33
+ ml_tsv=ml_tagging(ssplit_token,ml_model,Threshold)
34
+ #print('ml_tsv:\n',ml_tsv)
35
+ # print('ml ner:',time.time()-startTime)
36
+
37
+ # startTime=time.time()
38
+ combine_tsv=combine_ml_dict(dict_tsv,ml_tsv)
39
+ #combine_tsv=combine_ml_dict_fn(ml_tsv,dict_tsv)
40
+ #print('combine:\n',combine_tsv)
41
+
42
+ final_result= restore_index_nest_fn(text,combine_tsv)
43
+ # print('final ner:',time.time()-startTime)
44
+ if onlyLongest==True:
45
+ final_result=combine_overlap(final_result)
46
+ if abbrRecog==True:
47
+ final_result=postprocess_abbr(final_result,text)
48
+ # print('final result:')
49
+ # print(final_result)
50
+
51
+ return final_result
52
+
53
+ # only machine learning-based method
54
+ def bioTag_ml(text,ml_model,onlyLongest=False,abbrRecog=False, Threshold=0.95):
55
+
56
+ # startTime=time.time()
57
+ ssplit_token=ssplit_token_pos_lemma(text)
58
+ # print(ssplit_token)
59
+ # print('ssplit token:',time.time()-startTime)
60
+
61
+ # startTime=time.time()
62
+ ml_tsv=ml_tagging_allngram(ssplit_token,ml_model,Threshold)
63
+ # print('ml_tsv:\n',ml_tsv)
64
+ # print('ml ner:',time.time()-startTime)
65
+
66
+ final_result= restore_index_nest_fn(text,ml_tsv)
67
+ # print('final ner:',time.time()-startTime)
68
+ if onlyLongest==True:
69
+ final_result=combine_overlap(final_result)
70
+
71
+ if abbrRecog==True:
72
+ final_result=postprocess_abbr(final_result,text)
73
+
74
+ return final_result
75
+
76
+ # only dict method
77
+ def bioTag_dic(text,biotag_dic,onlyLongest=False, abbrRecog=False):
78
+
79
+ # startTime=time.time()
80
+ ssplit_token=ssplit_token_pos_lemma(text)
81
+ # print(ssplit_token)
82
+ # print('ssplit token:',time.time()-startTime)
83
+
84
+ # startTime=time.time()
85
+ dict_tsv=biotag_dic.matching(ssplit_token)
86
+ # print('dict tsv:\n',dict_tsv)
87
+ # print('dict ner:',time.time()-startTime)
88
+
89
+ final_result= restore_index_nest_fn(text,dict_tsv)
90
+ # print('final ner:',time.time()-startTime)
91
+ if onlyLongest==True:
92
+ final_result=combine_overlap(final_result)
93
+
94
+ if abbrRecog==True:
95
+ final_result=postprocess_abbr(final_result,text)
96
+
97
+ return final_result
98
+
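A minimal end-to-end usage sketch of the hybrid tagger above, wired up like the HPO branch of the demo app in src/src_app-old.py (paths, vocab files and model name are taken from that script and assume the same repository layout):

    from src.dic_ner import dic_ont
    from src.nn_model import bioTag_BERT
    from src.tagging_text import bioTag

    ontfiles = {'dic_file': './dict_hpo/noabb_lemma.dic',
                'word_id_file': './dict_hpo/word_id_map.json',
                'id_word_file': './dict_hpo/id_word_map.json'}
    vocabfiles = {'labelfile': './dict_hpo/lable.vocab',
                  'checkpoint_path': './models_v1.2/bioformer-cased-v1.0/',
                  'lowercase': False}

    biotag_dic = dic_ont(ontfiles)
    nn_model = bioTag_BERT(vocabfiles)
    nn_model.load_model('./models_v1.2/bioformer-HPO.h5')

    text = 'The patient presented with severe mental retardation and postnatal microcephaly.'
    # each mention is a [start, end, ontology_id, score] list of strings (character offsets)
    for start, end, hpo_id, score in bioTag(text, biotag_dic, nn_model,
                                            onlyLongest=True, abbrRecog=True, Threshold=0.95):
        print(start, end, text[int(start):int(end)], hpo_id, score)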