lingbionlp
commited on
Commit
•
8ab6ceb
1
Parent(s):
5e753a2
Upload 2 files
Browse files- AIO_label.vocab +21 -0
- postprocessing.py +551 -0
AIO_label.vocab
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
O
|
2 |
+
B-Gene
|
3 |
+
I-Gene
|
4 |
+
O-Gene
|
5 |
+
B-FamilyName
|
6 |
+
I-FamilyName
|
7 |
+
B-Disease
|
8 |
+
I-Disease
|
9 |
+
O-Disease
|
10 |
+
B-Chemical
|
11 |
+
I-Chemical
|
12 |
+
O-Chemical
|
13 |
+
B-Mutation
|
14 |
+
I-Mutation
|
15 |
+
O-Mutation
|
16 |
+
B-Species
|
17 |
+
I-Species
|
18 |
+
O-Species
|
19 |
+
B-CellLine
|
20 |
+
I-CellLine
|
21 |
+
O-CellLine
|
postprocessing.py
ADDED
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Nov 03 20:08:30 2022
|
4 |
+
|
5 |
+
@author: luol2
|
6 |
+
"""
|
7 |
+
|
8 |
+
|
9 |
+
import logging
|
10 |
+
import regex
|
11 |
+
import sys
|
12 |
+
import io
|
13 |
+
|
14 |
+
"""
|
15 |
+
A Python 3 refactoring of Vincent Van Asch's Python 2 code at
|
16 |
+
|
17 |
+
http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py
|
18 |
+
|
19 |
+
Based on
|
20 |
+
|
21 |
+
A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text
|
22 |
+
A. Schwartz and M. Hearst
|
23 |
+
Biocomputing, 2003, pp 451-462.
|
24 |
+
|
25 |
+
"""
|
26 |
+
|
27 |
+
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
|
28 |
+
log = logging.getLogger('Abbre')
|
29 |
+
|
30 |
+
|
class Candidate(str):
    """A string subclass that additionally records its character span in a sentence."""

    def __init__(self, value):
        super().__init__()
        # Span is (0, 0) until set_position() is called.
        self.start = self.stop = 0

    def set_position(self, start, stop):
        """Record the [start, stop) character offsets of this candidate."""
        self.start = start
        self.stop = stop
40 |
+
|
41 |
+
|
def yield_lines_from_file(file_path):
    """
    Yield stripped lines from *file_path*.

    Lines are decoded as UTF-8, falling back to latin-1 for lines with
    invalid UTF-8 bytes (latin-1 decodes any byte sequence).

    :param file_path: path of the text file to read
    :return: iterator of stripped unicode lines
    """
    # The with-block closes the file; the original also called f.close()
    # afterwards, which was redundant.
    with open(file_path, 'rb') as f:
        for line in f:
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                # The original round-tripped decode('latin-1').encode('utf-8')
                # .decode('utf-8'), which is equivalent to a single decode.
                line = line.decode('latin-1')
            yield line.strip()
52 |
+
|
53 |
+
|
def yield_lines_from_doc(doc_text):
    """Yield each newline-separated line of *doc_text*, stripped of surrounding whitespace."""
    yield from (raw_line.strip() for raw_line in doc_text.split("\n"))
57 |
+
|
58 |
+
|
def best_candidates(sentence):
    """
    Yield Candidate abbreviations found inside parentheses in *sentence*.

    :param sentence: line read from input file
    :return: a Candidate iterator
    :raises ValueError: if the parentheses in *sentence* are unbalanced or
        the first ')' precedes the first '('
    """
    if '(' not in sentence:
        return

    # Sanity-check the parenthesis structure first.
    if sentence.count('(') != sentence.count(')'):
        raise ValueError("Unbalanced parentheses: {}".format(sentence))
    if sentence.find('(') > sentence.find(')'):
        raise ValueError("First parentheses is right: {}".format(sentence))

    closeindex = -1
    while True:
        # Look for the next opening parenthesis.
        openindex = sentence.find('(', closeindex + 1)
        if openindex == -1:
            break

        # Scan forward for the matching close; ';' and ':' also terminate
        # a candidate, per the Schwartz & Hearst heuristic.
        closeindex = openindex + 1
        depth = 1  # renamed from `open`, which shadowed the builtin
        skip = False
        while depth:
            try:
                char = sentence[closeindex]
            except IndexError:
                # Opening bracket with no associated closing bracket:
                # skip this opening bracket.
                skip = True
                break
            if char == '(':
                depth += 1
            elif char in [')', ';', ':']:
                depth -= 1
            closeindex += 1

        if skip:
            closeindex = openindex + 1
            continue

        # Candidate is the text between the parentheses.
        start = openindex + 1
        stop = closeindex - 1
        candidate = sentence[start:stop]

        # Trim surrounding whitespace while keeping offsets in sync.
        start = start + len(candidate) - len(candidate.lstrip())
        stop = stop - len(candidate) + len(candidate.rstrip())
        candidate = sentence[start:stop]

        if conditions(candidate):
            new_candidate = Candidate(candidate)
            new_candidate.set_position(start, stop)
            yield new_candidate
116 |
+
|
117 |
+
|
def conditions(candidate):
    """
    Decide whether *candidate* is a viable abbreviation, based on
    Schwartz & Hearst:

        2 <= len(str) <= 10
        len(tokens) <= 2
        regex.search(r'\p{L}', str)
        str[0].isalnum()

    :param candidate: candidate abbreviation
    :return: True if this is a good candidate
    """
    # Empty candidates (e.g. "( )") are never viable; the original indexed
    # candidate[0] unconditionally and raised IndexError here.
    if not candidate:
        return False
    viable = True
    # NOTE: the original also had `if regex.match(...): viable = True`,
    # which was a no-op since viable already started as True; removed.
    if len(candidate) < 2 or len(candidate) > 10:
        viable = False
    if len(candidate.split()) > 2:
        viable = False
    # Raw strings: \p{L} is a Unicode "letter" class of the `regex` module.
    if not regex.search(r'\p{L}', candidate):
        viable = False
    if not candidate[0].isalnum():
        viable = False

    return viable
147 |
+
|
148 |
+
|
def get_definition(candidate, sentence):
    """
    Takes a candidate and a sentence and returns the definition candidate.

    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate.

    :param candidate: candidate abbreviation (a Candidate with .start/.stop set)
    :param sentence: current sentence (single line from input file)
    :return: candidate definition (a Candidate) for this abbreviation
    :raises ValueError: if no suitable definition window is found
    """
    # Take the tokens in front of the candidate, split on whitespace/hyphens.
    # The -2 presumably skips the "(" and the space before it — TODO confirm.
    tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())

    # The char that we are looking for: first char of the abbreviation.
    key = candidate[0].lower()

    # First character of every token in front of the candidate.
    firstchars = [t[0] for t in tokens]

    # Count how often the key occurs among token starts vs. in the candidate.
    definition_freq = firstchars.count(key)
    candidate_freq = candidate.lower().count(key)

    # Look for the list of tokens in front of candidate that
    # have a sufficient number of tokens starting with key.
    if candidate_freq <= definition_freq:
        # Grow a suffix window of tokens (from the right) until it contains
        # at least candidate_freq tokens starting with key.
        count = 0
        start = 0
        startindex = len(firstchars) - 1

        while count < candidate_freq:
            if abs(start) > len(firstchars):
                raise ValueError("candiate {} not found".format(candidate))
            start -= 1
            # Look up key in the definition window.
            try:
                startindex = firstchars.index(key, len(firstchars) + start)
            except ValueError:
                pass

            # Count the number of keys in the current window.
            count = firstchars[startindex:].count(key)

        # We found enough keys in the definition so return the definition
        # as a definition candidate, from that token up to just before '('.
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start - 1
        candidate = sentence[start:stop]

        # Remove whitespace while keeping the offsets in sync.
        start = start + len(candidate) - len(candidate.lstrip())
        stop = stop - len(candidate) + len(candidate.rstrip())
        candidate = sentence[start:stop]

        new_candidate = Candidate(candidate)
        new_candidate.set_position(start, stop)
        return new_candidate

    else:
        raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate')
212 |
+
|
213 |
+
|
def select_definition(definition, abbrev):
    """
    Validate a definition candidate against an abbreviation candidate and
    return the matched definition span.

    Based on:
    A simple algorithm for identifying abbreviation definitions in biomedical
    texts, Schwartz & Hearst.

    :param definition: candidate definition (a Candidate with .start/.stop)
    :param abbrev: candidate abbreviation
    :return: dict {'definition': Candidate, 'start': int, 'stop': int}
    :raises ValueError: if the abbreviation cannot be matched inside the
        definition or the definition violates the length constraints
    :raises IndexError: if the scan runs off the front of the definition
    """

    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')

    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')

    # Walk both strings right-to-left, matching each alphanumeric
    # abbreviation character to some definition character.
    sindex = -1  # negative index into abbrev (from the end)
    lindex = -1  # negative index into definition (from the end)

    while 1:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            raise

        shortchar = abbrev[sindex].lower()

        # Non-alphanumeric abbreviation chars need not match; skip them.
        if not shortchar.isalnum():
            sindex -= 1

        if sindex == -1 * len(abbrev):
            # At the first character of the abbreviation: it must match the
            # start of a word in the definition.
            if shortchar == longchar:
                if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
                    break
                else:
                    lindex -= 1
            else:
                lindex -= 1
                if lindex == -1 * (len(definition) + 1):
                    raise ValueError("definition {} was not found in {}".format(abbrev, definition))

        else:
            # Advance through the definition; consume an abbreviation char
            # whenever it matches.
            if shortchar == longchar:
                sindex -= 1
                lindex -= 1
            else:
                lindex -= 1

    # Keep only the matched suffix of the definition; the new start is the
    # document offset of position len(definition)+lindex within it.
    new_candidate = Candidate(definition[lindex:len(definition)])
    new_candidate.set_position(definition.start + lindex + len(definition), definition.stop)
    definition = new_candidate

    tokens = len(definition.split())
    length = len(abbrev)

    # Schwartz & Hearst constraint: |D| <= min(|A| + 5, |A| * 2) tokens.
    if tokens > min([length + 5, length * 2]):
        raise ValueError("did not meet min(|A|+5, |A|*2) constraint")

    # Do not return definitions that contain unbalanced parentheses.
    if definition.count('(') != definition.count(')'):
        raise ValueError("Unbalanced parentheses not allowed in a definition")

    new_definition_dict = {'definition': definition, 'start': definition.start, 'stop': definition.stop}
    return new_definition_dict
282 |
+
|
283 |
+
|
def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
    """
    Scan a document for (abbreviation, definition) pairs.

    Exactly one of *file_path* / *doc_text* should be given; with neither,
    empty results are returned.

    :param file_path: path of a text file to scan (one sentence per line)
    :param doc_text: alternatively, the document text itself
    :return: tuple (abbrev_map, abbr_full_dict, fullloc_abbr_dict) where
             abbrev_map is [{'definition', 'start', 'stop', 'abbre'}, ...],
             abbr_full_dict maps abbre -> (fullname_start, fullname_stop),
             fullloc_abbr_dict maps "fullname_s fullname_e" -> abbre
    """
    abbrev_map = []         # [{definition, start, stop, abbre}]
    abbr_full_dict = {}     # {abbre: (fullname_start, fullname_stop)}
    fullloc_abbr_dict = {}  # {"fullname_s fullname_e": abbre}
    omit = 0
    written = 0
    if file_path:
        sentence_iterator = enumerate(yield_lines_from_file(file_path))
    elif doc_text:
        sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
    else:
        # Fix: return the same 3-tuple shape as the normal path. The
        # original returned only abbrev_map here, which would break callers
        # that unpack three values.
        return abbrev_map, abbr_full_dict, fullloc_abbr_dict

    for i, sentence in sentence_iterator:
        try:
            for candidate in best_candidates(sentence):
                try:
                    definition = get_definition(candidate, sentence)
                except (ValueError, IndexError):
                    # No plausible definition in front of this candidate.
                    omit += 1
                else:
                    try:
                        definition_dict = select_definition(definition, candidate)
                    except (ValueError, IndexError):
                        # Definition did not satisfy the S&H constraints.
                        omit += 1
                    else:
                        definition_dict['abbre'] = candidate
                        abbrev_map.append(definition_dict)
                        abbr_full_dict[definition_dict['abbre']] = (definition_dict['start'], definition_dict['stop'])
                        fullloc_abbr_dict[str(definition_dict['start']) + ' ' + str(definition_dict['stop'])] = definition_dict['abbre']
                        written += 1
        except (ValueError, IndexError) as e:
            log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
    log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
    return abbrev_map, abbr_full_dict, fullloc_abbr_dict
327 |
+
|
328 |
+
|
def postprocess_abbr(ner_result, ori_text):
    """
    Post-process NER results using abbreviation/definition pairs.

    - If an entity is an abbreviation whose full name is itself an entity,
      re-type the abbreviation with the full name's type (CellLine is
      exempt); abbreviations whose full name is not an entity are dropped.
    - If an entity is a full name with a known abbreviation, keep it and add
      the abbreviation mention (same type) if the NER missed it.
    - Drop entities consisting of a single non-alphabetic character.

    :param ner_result: {'entity_s entity_e': [start, end, text, ..., type]}
        (all fields are strings; the last element is the entity type)
    :param ori_text: the original document text
    :return: list of [start, end, text, type] mentions
    """
    final_result = []
    if len(ner_result) == 0:
        # Fix: return an empty list (the original returned {}), so the
        # return type is consistent with the non-empty path.
        return final_result

    # Abbreviation recognition over the whole document.
    abbr_list, abbr_full_dict, fullloc_abbr_dict = extract_abbreviation_definition_pairs(doc_text=ori_text)

    # Index entities by their end offset (used to look up full names).
    ner_loc_result = {}
    for ele in ner_result.keys():
        ner_loc_result[ner_result[ele][1]] = ner_result[ele]

    # Remove wrongly-typed abbreviations, add missed abbreviations.
    for entity_loc in ner_result.keys():
        if (ner_result[entity_loc][-1] != 'CellLine') and (ner_result[entity_loc][2] in abbr_full_dict.keys()):
            # The entity text is a known abbreviation: adopt the entity type
            # of its full name, if the full name is itself an entity.
            fullname_loc_e = str(abbr_full_dict[ner_result[entity_loc][2]][1])
            if fullname_loc_e in ner_loc_result.keys():
                final_result.append([ner_result[entity_loc][0], ner_result[entity_loc][1], ner_result[entity_loc][2], ner_loc_result[fullname_loc_e][-1]])
            # Otherwise the abbreviation is considered spurious and dropped.

        elif entity_loc in fullloc_abbr_dict.keys():
            # The entity is a full name: keep it, and add its abbreviation
            # (searched after the full name) if the NER missed it.
            abbr_loc_s = ori_text.find(fullloc_abbr_dict[entity_loc], int(ner_result[entity_loc][1]))
            final_result.append(ner_result[entity_loc])
            if abbr_loc_s >= 0:
                abbr_loc_e = abbr_loc_s + len(fullloc_abbr_dict[entity_loc])
                abbr_loc = str(abbr_loc_s) + ' ' + str(abbr_loc_e)
                if abbr_loc not in ner_result.keys():  # add missed abbreviation
                    final_result.append([str(abbr_loc_s), str(abbr_loc_e), ori_text[abbr_loc_s:abbr_loc_e], ner_result[entity_loc][-1]])

        else:
            # Drop entities that are a single punctuation character.
            if len(ner_result[entity_loc][2]) == 1 and (not ner_result[entity_loc][2].isalpha()):
                pass
            else:
                final_result.append(ner_result[entity_loc])

    return final_result
394 |
+
|
395 |
+
|
def entity_consistency(ner_result, ori_text):
    """
    Enforce document-level consistency: every other occurrence (on word
    boundaries) of a recognized mention text is added with that text's
    majority entity type.

    All-uppercase mention texts (abbreviations) are matched case-sensitively;
    all other texts are matched case-insensitively. Single-character mention
    texts are not propagated.

    :param ner_result: list of [start, end, text, type] mentions (strings)
    :param ori_text: the original document text
    :return: consistent, non-overlapping list of [start, end, text, type]
    """
    final_result = {}       # {'s\te\ttext\ttype': [int(s), int(e)]}
    entity_loc_set = set()  # {'s e'} spans already covered
    entity_type = {}        # {entity_text: {type: count}}

    for segs in ner_result:
        entity_loc_set.add(segs[0] + ' ' + segs[1])
        final_result['\t'.join(segs)] = [int(segs[0]), int(segs[1])]
        if len(segs[2]) > 1:
            # Case-sensitive key for all-upper abbreviations, lowercased
            # key otherwise (the original duplicated this whole branch).
            key = segs[2] if segs[2].isupper() else segs[2].lower()
            if key not in entity_type.keys():
                entity_type[key] = {segs[-1]: 1}
            elif segs[-1] in entity_type[key]:
                entity_type[key][segs[-1]] += 1
            else:
                entity_type[key][segs[-1]] = 1

    # Majority type per mention text (ties broken by max() on (count, type)).
    entity_type_major = {}
    for ele in entity_type.keys():
        entity_type_major[ele] = max(zip(entity_type[ele].values(), entity_type[ele].keys()))[1]

    # Find missed occurrences of each known mention text.
    for entity_text in entity_type_major.keys():
        if entity_text.isupper():  # abbreviations: match case-sensitively
            new_text = ori_text
        else:
            new_text = ori_text.lower()
        ent_eid = 0
        while new_text.find(entity_text, ent_eid) >= 0:
            ent_sid = new_text.find(entity_text, ent_eid)
            ent_eid = ent_sid + len(entity_text)
            entity_loc = str(ent_sid) + ' ' + str(ent_eid)
            if entity_loc not in entity_loc_set:
                # Accept only matches on word boundaries. Fix: the original's
                # three separate branches skipped the degenerate case where
                # the match spans the entire text; this unified check covers
                # all four boundary combinations.
                left_ok = ent_sid == 0 or not new_text[ent_sid - 1].isalnum()
                right_ok = ent_eid == len(new_text) or not new_text[ent_eid].isalnum()
                if left_ok and right_ok:
                    final_result[str(ent_sid) + '\t' + str(ent_eid) + '\t' + ori_text[ent_sid:ent_eid] + '\t' + entity_type_major[entity_text]] = [ent_sid, ent_eid]
                    entity_loc_set.add(entity_loc)

    if len(final_result) != len(ner_result):
        # New entities were added: sort by position and remove overlaps.
        final_result = sorted(final_result.items(), key=lambda kv: (kv[1]), reverse=False)
        mention_list = []
        for ele in final_result:
            mention_list.append(ele[0].split('\t'))
        final_ner_result = combine_overlap(mention_list)
    else:
        final_ner_result = ner_result
    return final_ner_result
468 |
+
|
def combine_overlap(mention_list):
    """
    Resolve overlapping mentions: within each run of mutually overlapping
    mentions, keep only the longest one (via find_max_entity).

    :param mention_list: list of [start, end, text, type] mentions
        (string offsets), sorted by position
    :return: list of non-overlapping mentions
    """
    entity_list = []
    # Fix: the original tested len(mention_list) > 2, so a document with
    # exactly two (possibly overlapping) mentions was returned unresolved;
    # the loop below handles the two-mention case correctly.
    if len(mention_list) > 1:
        first_entity = mention_list[0]
        nest_list = [first_entity]       # current run of overlapping mentions
        max_eid = int(first_entity[1])   # right edge of the current run
        for i in range(1, len(mention_list)):
            segs = mention_list[i]
            if int(segs[0]) >= max_eid:
                # Current mention starts after the run ends: flush the run.
                if len(nest_list) == 1:
                    entity_list.append(nest_list[0])
                else:
                    entity_list.append(find_max_entity(nest_list))
                nest_list = [segs]
            else:
                # Overlaps the current run: extend it.
                nest_list.append(segs)
            if int(segs[1]) > max_eid:
                max_eid = int(segs[1])

        # Flush the final run.
        if nest_list != []:
            if len(nest_list) == 1:
                entity_list.append(nest_list[0])
            else:
                entity_list.append(find_max_entity(nest_list))
    else:
        entity_list = mention_list

    return entity_list
509 |
+
|
def find_max_entity(nest_list):
    """
    Return the mention with the largest character span from a nest of
    overlapping mentions.

    Ties keep the earliest mention; if every span has zero (or negative)
    length, an empty list is returned.

    :param nest_list: list of [start, end, text, type] mentions
    :return: the longest mention, or []
    """
    best = []
    best_len = 0
    for segs in nest_list:
        span = int(segs[1]) - int(segs[0])
        # Strict '>' keeps the first of equally long mentions.
        if span > best_len:
            best_len = span
            best = segs
    return best
520 |
+
|
521 |
+
|
522 |
+
|
523 |
+
|
if __name__ == '__main__':

    # Apply abbreviation post-processing and entity consistency to a
    # PubTator-format prediction file: per document, a "pmid|t|title" line,
    # a "pmid|a|abstract" line, then tab-separated mention lines; documents
    # are separated by blank lines.
    path = '//panfs/pan1/bionlplab/luol2/PubTator3/example/post-out/'
    # Use context managers so the files are always closed (the original
    # never closed the input handle and only closed the output manually).
    with open(path + 'PubmedBERT-CRF-AIO_ALL.test_preds', 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')
    with open(path + 'PubmedBERT-CRF-AIO_ALL-post4.test_preds', 'w', encoding='utf-8') as fout:
        for doc in all_in:
            lines = doc.split('\n')
            pmid = lines[0].split('|t|')[0]
            # Document text = title + ' ' + abstract.
            ori_text = lines[0].split('|t|')[1] + ' ' + lines[1].split('|a|')[1]
            ner_result = {}
            for i in range(2, len(lines)):
                seg = lines[i].split('\t')
                # Key mentions by "start end"; value drops the pmid column.
                ner_result[seg[1] + ' ' + seg[2]] = seg[1:]
            # Abbreviation post-processing.
            final_ner = postprocess_abbr(ner_result, ori_text)
            # Entity consistency.
            final_ner = entity_consistency(final_ner, ori_text)
            fout.write(lines[0] + '\n' + lines[1] + '\n')
            for ele in final_ner:
                fout.write(pmid + '\t' + '\t'.join(ele) + '\n')
            fout.write('\n')