lingbionlp
commited on
Commit
•
8ab6ceb
1
Parent(s):
5e753a2
Upload 2 files
Browse files- AIO_label.vocab +21 -0
- postprocessing.py +551 -0
AIO_label.vocab
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
O
|
2 |
+
B-Gene
|
3 |
+
I-Gene
|
4 |
+
O-Gene
|
5 |
+
B-FamilyName
|
6 |
+
I-FamilyName
|
7 |
+
B-Disease
|
8 |
+
I-Disease
|
9 |
+
O-Disease
|
10 |
+
B-Chemical
|
11 |
+
I-Chemical
|
12 |
+
O-Chemical
|
13 |
+
B-Mutation
|
14 |
+
I-Mutation
|
15 |
+
O-Mutation
|
16 |
+
B-Species
|
17 |
+
I-Species
|
18 |
+
O-Species
|
19 |
+
B-CellLine
|
20 |
+
I-CellLine
|
21 |
+
O-CellLine
|
postprocessing.py
ADDED
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Nov 03 20:08:30 2022
|
4 |
+
|
5 |
+
@author: luol2
|
6 |
+
"""
|
7 |
+
|
8 |
+
|
9 |
+
import logging
|
10 |
+
import regex
|
11 |
+
import sys
|
12 |
+
import io
|
13 |
+
|
14 |
+
"""
|
15 |
+
A Python 3 refactoring of Vincent Van Asch's Python 2 code at
|
16 |
+
|
17 |
+
http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py
|
18 |
+
|
19 |
+
Based on
|
20 |
+
|
21 |
+
A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text
|
22 |
+
A. Schwartz and M. Hearst
|
23 |
+
Biocomputing, 2003, pp 451-462.
|
24 |
+
|
25 |
+
"""
|
26 |
+
|
27 |
+
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
|
28 |
+
log = logging.getLogger('Abbre')
|
29 |
+
|
30 |
+
|
class Candidate(str):
    """A string subclass that additionally records its character span in a sentence."""

    def __init__(self, value):
        super().__init__()
        # Span is (0, 0) until set_position() is called.
        self.start = self.stop = 0

    def set_position(self, start, stop):
        """Record the [start, stop) character offsets of this candidate."""
        self.start = start
        self.stop = stop
40 |
+
|
41 |
+
|
def yield_lines_from_file(file_path):
    """
    Yield stripped lines from *file_path*.

    Lines are decoded as UTF-8, falling back to latin-1 for lines with
    invalid UTF-8 bytes (latin-1 decodes any byte sequence).

    :param file_path: path of the text file to read
    :return: iterator of stripped unicode lines
    """
    # The with-block closes the file; the original also called f.close()
    # afterwards, which was redundant.
    with open(file_path, 'rb') as f:
        for line in f:
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                # The original round-tripped decode('latin-1').encode('utf-8')
                # .decode('utf-8'), which is equivalent to a single decode.
                line = line.decode('latin-1')
            yield line.strip()
52 |
+
|
53 |
+
|
def yield_lines_from_doc(doc_text):
    """Yield each newline-separated line of *doc_text*, stripped of surrounding whitespace."""
    yield from (raw_line.strip() for raw_line in doc_text.split("\n"))
57 |
+
|
58 |
+
|
def best_candidates(sentence):
    """
    Yield Candidate abbreviations found inside parentheses in *sentence*.

    :param sentence: line read from input file
    :return: a Candidate iterator
    :raises ValueError: if the parentheses in *sentence* are unbalanced or
        the first ')' precedes the first '('
    """
    if '(' not in sentence:
        return

    # Sanity-check the parenthesis structure first.
    if sentence.count('(') != sentence.count(')'):
        raise ValueError("Unbalanced parentheses: {}".format(sentence))
    if sentence.find('(') > sentence.find(')'):
        raise ValueError("First parentheses is right: {}".format(sentence))

    closeindex = -1
    while True:
        # Look for the next opening parenthesis.
        openindex = sentence.find('(', closeindex + 1)
        if openindex == -1:
            break

        # Scan forward for the matching close; ';' and ':' also terminate
        # a candidate, per the Schwartz & Hearst heuristic.
        closeindex = openindex + 1
        depth = 1  # renamed from `open`, which shadowed the builtin
        skip = False
        while depth:
            try:
                char = sentence[closeindex]
            except IndexError:
                # Opening bracket with no associated closing bracket:
                # skip this opening bracket.
                skip = True
                break
            if char == '(':
                depth += 1
            elif char in [')', ';', ':']:
                depth -= 1
            closeindex += 1

        if skip:
            closeindex = openindex + 1
            continue

        # Candidate is the text between the parentheses.
        start = openindex + 1
        stop = closeindex - 1
        candidate = sentence[start:stop]

        # Trim surrounding whitespace while keeping offsets in sync.
        start = start + len(candidate) - len(candidate.lstrip())
        stop = stop - len(candidate) + len(candidate.rstrip())
        candidate = sentence[start:stop]

        if conditions(candidate):
            new_candidate = Candidate(candidate)
            new_candidate.set_position(start, stop)
            yield new_candidate
116 |
+
|
117 |
+
|
def conditions(candidate):
    """
    Decide whether *candidate* is a viable abbreviation, based on
    Schwartz & Hearst:

        2 <= len(str) <= 10
        len(tokens) <= 2
        regex.search(r'\p{L}', str)
        str[0].isalnum()

    :param candidate: candidate abbreviation
    :return: True if this is a good candidate
    """
    # Empty candidates (e.g. "( )") are never viable; the original indexed
    # candidate[0] unconditionally and raised IndexError here.
    if not candidate:
        return False
    viable = True
    # NOTE: the original also had `if regex.match(...): viable = True`,
    # which was a no-op since viable already started as True; removed.
    if len(candidate) < 2 or len(candidate) > 10:
        viable = False
    if len(candidate.split()) > 2:
        viable = False
    # Raw strings: \p{L} is a Unicode "letter" class of the `regex` module.
    if not regex.search(r'\p{L}', candidate):
        viable = False
    if not candidate[0].isalnum():
        viable = False

    return viable
147 |
+
|
148 |
+
|
def get_definition(candidate, sentence):
    """
    Takes a candidate and a sentence and returns the definition candidate.

    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate.

    :param candidate: candidate abbreviation (a Candidate with .start/.stop set)
    :param sentence: current sentence (single line from input file)
    :return: candidate definition (a Candidate) for this abbreviation
    :raises ValueError: if no suitable definition window is found
    """
    # Take the tokens in front of the candidate, split on whitespace/hyphens.
    # The -2 presumably skips the "(" and the space before it — TODO confirm.
    tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())

    # The char that we are looking for: first char of the abbreviation.
    key = candidate[0].lower()

    # First character of every token in front of the candidate.
    firstchars = [t[0] for t in tokens]

    # Count how often the key occurs among token starts vs. in the candidate.
    definition_freq = firstchars.count(key)
    candidate_freq = candidate.lower().count(key)

    # Look for the list of tokens in front of candidate that
    # have a sufficient number of tokens starting with key.
    if candidate_freq <= definition_freq:
        # Grow a suffix window of tokens (from the right) until it contains
        # at least candidate_freq tokens starting with key.
        count = 0
        start = 0
        startindex = len(firstchars) - 1

        while count < candidate_freq:
            if abs(start) > len(firstchars):
                raise ValueError("candiate {} not found".format(candidate))
            start -= 1
            # Look up key in the definition window.
            try:
                startindex = firstchars.index(key, len(firstchars) + start)
            except ValueError:
                pass

            # Count the number of keys in the current window.
            count = firstchars[startindex:].count(key)

        # We found enough keys in the definition so return the definition
        # as a definition candidate, from that token up to just before '('.
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start - 1
        candidate = sentence[start:stop]

        # Remove whitespace while keeping the offsets in sync.
        start = start + len(candidate) - len(candidate.lstrip())
        stop = stop - len(candidate) + len(candidate.rstrip())
        candidate = sentence[start:stop]

        new_candidate = Candidate(candidate)
        new_candidate.set_position(start, stop)
        return new_candidate

    else:
        raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate')
212 |
+
|
213 |
+
|
def select_definition(definition, abbrev):
    """
    Validate a definition candidate against an abbreviation candidate and
    return the matched definition span.

    Based on:
    A simple algorithm for identifying abbreviation definitions in biomedical
    texts, Schwartz & Hearst.

    :param definition: candidate definition (a Candidate with .start/.stop)
    :param abbrev: candidate abbreviation
    :return: dict {'definition': Candidate, 'start': int, 'stop': int}
    :raises ValueError: if the abbreviation cannot be matched inside the
        definition or the definition violates the length constraints
    :raises IndexError: if the scan runs off the front of the definition
    """

    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')

    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')

    # Walk both strings right-to-left, matching each alphanumeric
    # abbreviation character to some definition character.
    sindex = -1  # negative index into abbrev (from the end)
    lindex = -1  # negative index into definition (from the end)

    while 1:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            raise

        shortchar = abbrev[sindex].lower()

        # Non-alphanumeric abbreviation chars need not match; skip them.
        if not shortchar.isalnum():
            sindex -= 1

        if sindex == -1 * len(abbrev):
            # At the first character of the abbreviation: it must match the
            # start of a word in the definition.
            if shortchar == longchar:
                if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
                    break
                else:
                    lindex -= 1
            else:
                lindex -= 1
                if lindex == -1 * (len(definition) + 1):
                    raise ValueError("definition {} was not found in {}".format(abbrev, definition))

        else:
            # Advance through the definition; consume an abbreviation char
            # whenever it matches.
            if shortchar == longchar:
                sindex -= 1
                lindex -= 1
            else:
                lindex -= 1

    # Keep only the matched suffix of the definition; the new start is the
    # document offset of position len(definition)+lindex within it.
    new_candidate = Candidate(definition[lindex:len(definition)])
    new_candidate.set_position(definition.start + lindex + len(definition), definition.stop)
    definition = new_candidate

    tokens = len(definition.split())
    length = len(abbrev)

    # Schwartz & Hearst constraint: |D| <= min(|A| + 5, |A| * 2) tokens.
    if tokens > min([length + 5, length * 2]):
        raise ValueError("did not meet min(|A|+5, |A|*2) constraint")

    # Do not return definitions that contain unbalanced parentheses.
    if definition.count('(') != definition.count(')'):
        raise ValueError("Unbalanced parentheses not allowed in a definition")

    new_definition_dict = {'definition': definition, 'start': definition.start, 'stop': definition.stop}
    return new_definition_dict
282 |
+
|
283 |
+
|
def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
    """
    Scan a document for (abbreviation, definition) pairs.

    Exactly one of *file_path* / *doc_text* should be given; with neither,
    empty results are returned.

    :param file_path: path of a text file to scan (one sentence per line)
    :param doc_text: alternatively, the document text itself
    :return: tuple (abbrev_map, abbr_full_dict, fullloc_abbr_dict) where
             abbrev_map is [{'definition', 'start', 'stop', 'abbre'}, ...],
             abbr_full_dict maps abbre -> (fullname_start, fullname_stop),
             fullloc_abbr_dict maps "fullname_s fullname_e" -> abbre
    """
    abbrev_map = []         # [{definition, start, stop, abbre}]
    abbr_full_dict = {}     # {abbre: (fullname_start, fullname_stop)}
    fullloc_abbr_dict = {}  # {"fullname_s fullname_e": abbre}
    omit = 0
    written = 0
    if file_path:
        sentence_iterator = enumerate(yield_lines_from_file(file_path))
    elif doc_text:
        sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
    else:
        # Fix: return the same 3-tuple shape as the normal path. The
        # original returned only abbrev_map here, which would break callers
        # that unpack three values.
        return abbrev_map, abbr_full_dict, fullloc_abbr_dict

    for i, sentence in sentence_iterator:
        try:
            for candidate in best_candidates(sentence):
                try:
                    definition = get_definition(candidate, sentence)
                except (ValueError, IndexError):
                    # No plausible definition in front of this candidate.
                    omit += 1
                else:
                    try:
                        definition_dict = select_definition(definition, candidate)
                    except (ValueError, IndexError):
                        # Definition did not satisfy the S&H constraints.
                        omit += 1
                    else:
                        definition_dict['abbre'] = candidate
                        abbrev_map.append(definition_dict)
                        abbr_full_dict[definition_dict['abbre']] = (definition_dict['start'], definition_dict['stop'])
                        fullloc_abbr_dict[str(definition_dict['start']) + ' ' + str(definition_dict['stop'])] = definition_dict['abbre']
                        written += 1
        except (ValueError, IndexError) as e:
            log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
    log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
    return abbrev_map, abbr_full_dict, fullloc_abbr_dict
327 |
+
|
328 |
+
|
def postprocess_abbr(ner_result, ori_text):
    """
    Post-process NER results using abbreviation/definition pairs.

    - If an entity is an abbreviation whose full name is itself an entity,
      re-type the abbreviation with the full name's type (CellLine is
      exempt); abbreviations whose full name is not an entity are dropped.
    - If an entity is a full name with a known abbreviation, keep it and add
      the abbreviation mention (same type) if the NER missed it.
    - Drop entities consisting of a single non-alphabetic character.

    :param ner_result: {'entity_s entity_e': [start, end, text, ..., type]}
        (all fields are strings; the last element is the entity type)
    :param ori_text: the original document text
    :return: list of [start, end, text, type] mentions
    """
    final_result = []
    if len(ner_result) == 0:
        # Fix: return an empty list (the original returned {}), so the
        # return type is consistent with the non-empty path.
        return final_result

    # Abbreviation recognition over the whole document.
    abbr_list, abbr_full_dict, fullloc_abbr_dict = extract_abbreviation_definition_pairs(doc_text=ori_text)

    # Index entities by their end offset (used to look up full names).
    ner_loc_result = {}
    for ele in ner_result.keys():
        ner_loc_result[ner_result[ele][1]] = ner_result[ele]

    # Remove wrongly-typed abbreviations, add missed abbreviations.
    for entity_loc in ner_result.keys():
        if (ner_result[entity_loc][-1] != 'CellLine') and (ner_result[entity_loc][2] in abbr_full_dict.keys()):
            # The entity text is a known abbreviation: adopt the entity type
            # of its full name, if the full name is itself an entity.
            fullname_loc_e = str(abbr_full_dict[ner_result[entity_loc][2]][1])
            if fullname_loc_e in ner_loc_result.keys():
                final_result.append([ner_result[entity_loc][0], ner_result[entity_loc][1], ner_result[entity_loc][2], ner_loc_result[fullname_loc_e][-1]])
            # Otherwise the abbreviation is considered spurious and dropped.

        elif entity_loc in fullloc_abbr_dict.keys():
            # The entity is a full name: keep it, and add its abbreviation
            # (searched after the full name) if the NER missed it.
            abbr_loc_s = ori_text.find(fullloc_abbr_dict[entity_loc], int(ner_result[entity_loc][1]))
            final_result.append(ner_result[entity_loc])
            if abbr_loc_s >= 0:
                abbr_loc_e = abbr_loc_s + len(fullloc_abbr_dict[entity_loc])
                abbr_loc = str(abbr_loc_s) + ' ' + str(abbr_loc_e)
                if abbr_loc not in ner_result.keys():  # add missed abbreviation
                    final_result.append([str(abbr_loc_s), str(abbr_loc_e), ori_text[abbr_loc_s:abbr_loc_e], ner_result[entity_loc][-1]])

        else:
            # Drop entities that are a single punctuation character.
            if len(ner_result[entity_loc][2]) == 1 and (not ner_result[entity_loc][2].isalpha()):
                pass
            else:
                final_result.append(ner_result[entity_loc])

    return final_result
394 |
+
|
395 |
+
|
def entity_consistency(ner_result, ori_text):
    """
    Enforce document-level consistency: every other occurrence (on word
    boundaries) of a recognized mention text is added with that text's
    majority entity type.

    All-uppercase mention texts (abbreviations) are matched case-sensitively;
    all other texts are matched case-insensitively. Single-character mention
    texts are not propagated.

    :param ner_result: list of [start, end, text, type] mentions (strings)
    :param ori_text: the original document text
    :return: consistent, non-overlapping list of [start, end, text, type]
    """
    final_result = {}       # {'s\te\ttext\ttype': [int(s), int(e)]}
    entity_loc_set = set()  # {'s e'} spans already covered
    entity_type = {}        # {entity_text: {type: count}}

    for segs in ner_result:
        entity_loc_set.add(segs[0] + ' ' + segs[1])
        final_result['\t'.join(segs)] = [int(segs[0]), int(segs[1])]
        if len(segs[2]) > 1:
            # Case-sensitive key for all-upper abbreviations, lowercased
            # key otherwise (the original duplicated this whole branch).
            key = segs[2] if segs[2].isupper() else segs[2].lower()
            if key not in entity_type.keys():
                entity_type[key] = {segs[-1]: 1}
            elif segs[-1] in entity_type[key]:
                entity_type[key][segs[-1]] += 1
            else:
                entity_type[key][segs[-1]] = 1

    # Majority type per mention text (ties broken by max() on (count, type)).
    entity_type_major = {}
    for ele in entity_type.keys():
        entity_type_major[ele] = max(zip(entity_type[ele].values(), entity_type[ele].keys()))[1]

    # Find missed occurrences of each known mention text.
    for entity_text in entity_type_major.keys():
        if entity_text.isupper():  # abbreviations: match case-sensitively
            new_text = ori_text
        else:
            new_text = ori_text.lower()
        ent_eid = 0
        while new_text.find(entity_text, ent_eid) >= 0:
            ent_sid = new_text.find(entity_text, ent_eid)
            ent_eid = ent_sid + len(entity_text)
            entity_loc = str(ent_sid) + ' ' + str(ent_eid)
            if entity_loc not in entity_loc_set:
                # Accept only matches on word boundaries. Fix: the original's
                # three separate branches skipped the degenerate case where
                # the match spans the entire text; this unified check covers
                # all four boundary combinations.
                left_ok = ent_sid == 0 or not new_text[ent_sid - 1].isalnum()
                right_ok = ent_eid == len(new_text) or not new_text[ent_eid].isalnum()
                if left_ok and right_ok:
                    final_result[str(ent_sid) + '\t' + str(ent_eid) + '\t' + ori_text[ent_sid:ent_eid] + '\t' + entity_type_major[entity_text]] = [ent_sid, ent_eid]
                    entity_loc_set.add(entity_loc)

    if len(final_result) != len(ner_result):
        # New entities were added: sort by position and remove overlaps.
        final_result = sorted(final_result.items(), key=lambda kv: (kv[1]), reverse=False)
        mention_list = []
        for ele in final_result:
            mention_list.append(ele[0].split('\t'))
        final_ner_result = combine_overlap(mention_list)
    else:
        final_ner_result = ner_result
    return final_ner_result
468 |
+
|
def combine_overlap(mention_list):
    """
    Resolve overlapping mentions: within each run of mutually overlapping
    mentions, keep only the longest one (via find_max_entity).

    :param mention_list: list of [start, end, text, type] mentions
        (string offsets), sorted by position
    :return: list of non-overlapping mentions
    """
    entity_list = []
    # Fix: the original tested len(mention_list) > 2, so a document with
    # exactly two (possibly overlapping) mentions was returned unresolved;
    # the loop below handles the two-mention case correctly.
    if len(mention_list) > 1:
        first_entity = mention_list[0]
        nest_list = [first_entity]       # current run of overlapping mentions
        max_eid = int(first_entity[1])   # right edge of the current run
        for i in range(1, len(mention_list)):
            segs = mention_list[i]
            if int(segs[0]) >= max_eid:
                # Current mention starts after the run ends: flush the run.
                if len(nest_list) == 1:
                    entity_list.append(nest_list[0])
                else:
                    entity_list.append(find_max_entity(nest_list))
                nest_list = [segs]
            else:
                # Overlaps the current run: extend it.
                nest_list.append(segs)
            if int(segs[1]) > max_eid:
                max_eid = int(segs[1])

        # Flush the final run.
        if nest_list != []:
            if len(nest_list) == 1:
                entity_list.append(nest_list[0])
            else:
                entity_list.append(find_max_entity(nest_list))
    else:
        entity_list = mention_list

    return entity_list
509 |
+
|
def find_max_entity(nest_list):
    """
    Return the mention with the largest character span from a nest of
    overlapping mentions.

    Ties keep the earliest mention; if every span has zero (or negative)
    length, an empty list is returned.

    :param nest_list: list of [start, end, text, type] mentions
    :return: the longest mention, or []
    """
    best = []
    best_len = 0
    for segs in nest_list:
        span = int(segs[1]) - int(segs[0])
        # Strict '>' keeps the first of equally long mentions.
        if span > best_len:
            best_len = span
            best = segs
    return best
520 |
+
|
521 |
+
|
522 |
+
|
523 |
+
|
if __name__ == '__main__':

    # Apply abbreviation post-processing and entity consistency to a
    # PubTator-format prediction file: per document, a "pmid|t|title" line,
    # a "pmid|a|abstract" line, then tab-separated mention lines; documents
    # are separated by blank lines.
    path = '//panfs/pan1/bionlplab/luol2/PubTator3/example/post-out/'
    # Use context managers so the files are always closed (the original
    # never closed the input handle and only closed the output manually).
    with open(path + 'PubmedBERT-CRF-AIO_ALL.test_preds', 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')
    with open(path + 'PubmedBERT-CRF-AIO_ALL-post4.test_preds', 'w', encoding='utf-8') as fout:
        for doc in all_in:
            lines = doc.split('\n')
            pmid = lines[0].split('|t|')[0]
            # Document text = title + ' ' + abstract.
            ori_text = lines[0].split('|t|')[1] + ' ' + lines[1].split('|a|')[1]
            ner_result = {}
            for i in range(2, len(lines)):
                seg = lines[i].split('\t')
                # Key mentions by "start end"; value drops the pmid column.
                ner_result[seg[1] + ' ' + seg[2]] = seg[1:]
            # Abbreviation post-processing.
            final_ner = postprocess_abbr(ner_result, ori_text)
            # Entity consistency.
            final_ner = entity_consistency(final_ner, ori_text)
            fout.write(lines[0] + '\n' + lines[1] + '\n')
            for ele in final_ner:
                fout.write(pmid + '\t' + '\t'.join(ele) + '\n')
            fout.write('\n')