|
|
|
""" |
|
Created on Wed Jun 8 09:26:57 2022 |
|
|
|
@author: luol2 |
|
|
|
Pipeline: first gene NER, then species assignment |
|
input: species NER results, as BioC XML or PubTator files

output: gene NER and species assignment results, written in the same format as the input
|
""" |
|
import argparse |
|
import os |
|
import io |
|
import time |
|
import sys |
|
import re |
|
import shutil |
|
from src_python.GeneNER import model_ner,ner_tag |
|
from src_python.SpeAss import model_sa,sa_tag |
|
|
|
import tensorflow as tf |
|
|
|
import bioc |
|
import stanza |
|
# Shared stanza pipeline, built once when this module is loaded and passed to
# both taggers below (ner_tag.ML_Tag and sa_tag.ml_tag_main).
# Models are looked up under 'gnorm_trained_models/stanza'.
# NOTE(review): download_method=None presumably disables model downloads, and
# the 'spacy' tokenize processor presumably requires spaCy to be installed;
# confirm both against the stanza documentation.
nlp_token = stanza.Pipeline(model_dir='gnorm_trained_models/stanza', lang='en', processors={'tokenize': 'spacy'},package='None', download_method=None)
|
|
|
def NER_BioC(infolder,infile,outpath,nn_model):
    """Run gene NER over one BioC XML file and write the annotated copy.

    Reads ``infolder/infile`` and tags every passage that has non-blank text
    and whose ``type`` infon is not ``'ref'`` with ``ner_tag.ML_Tag``. One
    annotation is appended per tagged span, all annotations of a document are
    renumbered consecutively from 0, and the collection is dumped to
    ``outpath/infile``.

    Args:
        infolder: folder containing the input file.
        infile: file name, used for both the input and the output.
        outpath: folder that receives the tagged file.
        nn_model: loaded NER model, passed through to ``ner_tag.ML_Tag``.
    """
    source_path = infolder+"/"+infile
    target_path = outpath+"/"+infile
    with open(source_path, 'r',encoding='utf-8') as fin, open(target_path,'w', encoding='utf8') as fout:
        collection = bioc.load(fin)

        doc_total = len(collection.documents)
        print('Total number of sub-documents:', doc_total)

        for doc_index, document in enumerate(collection.documents):
            print("Processing:{0}%".format(round(doc_index * 100 / doc_total)), end="\r")

            # Running annotation id, shared by all passages of this document.
            next_id = 0
            for passage in document.passages:
                text = passage.text
                if text!='' and (not text.isspace()) and passage.infons['type']!='ref':
                    base = passage.offset
                    spans = ner_tag.ML_Tag(text,nn_model,nlp_token)
                    for local_id, span in enumerate(spans):
                        begin = int(span[0])
                        end = int(span[1])
                        note = bioc.BioCAnnotation()
                        note.id = str(local_id)
                        note.infons['type'] = span[2]
                        # Tagger offsets are passage-relative; shift by the passage offset.
                        note.locations.append(bioc.BioCLocation(offset=str(base+begin), length=str(end-begin)))
                        note.text = text[begin:end]
                        passage.annotations.append(note)

                # Renumber every annotation of the passage (existing and new)
                # so that ids are unique within the document.
                for annotation in passage.annotations:
                    annotation.id = str(next_id)
                    next_id += 1

        bioc.dump(collection, fout, pretty_print=True)
|
|
|
def NER_PubTator(infolder,infile,outpath,nn_model):
    """Run gene NER over one PubTator file and write the annotated copy.

    The input is split into records on blank lines. For each record the title
    (text after ``|t|`` on the first line) and the abstract (text after
    ``|a|`` on the second line) are joined with a single space and tagged with
    ``ner_tag.ML_Tag``. The original record is written back unchanged,
    followed by one tab-separated line per tagged span
    (pmid, start, end, mention, type) and a blank separator line.

    A record whose second line is missing or carries no ``|a|`` marker is
    tagged with an empty abstract instead of raising ``IndexError``.

    Args:
        infolder: folder containing the input file.
        infile: file name, used for both the input and the output.
        outpath: folder that receives the tagged file.
        nn_model: loaded NER model, passed through to ``ner_tag.ML_Tag``.
    """
    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        with open(outpath+"/"+infile,'w', encoding='utf-8') as fout:
            all_text=fin.read().strip().split('\n\n')
            Total_n=len(all_text)
            print('Total number of sub-documents:', Total_n)
            for pmid_n, doc in enumerate(all_text):
                print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
                lines = doc.split('\n')

                seg=lines[0].split('|t|')
                pmid=seg[0]
                title=seg[1] if len(seg)>1 else ""

                abstract=""
                if len(lines)>1:
                    seg=lines[1].split('|a|')
                    # Only read the abstract when the marker was actually found.
                    if len(seg)>1:
                        abstract=seg[1]

                intext=title+' '+abstract
                tag_result=ner_tag.ML_Tag(intext,nn_model,nlp_token)
                fout.write(doc+'\n')
                for ele in tag_result:
                    ent_start = ele[0]
                    ent_last = ele[1]
                    ent_type = ele[2]
                    # Offsets index into the title + ' ' + abstract string.
                    ent_mention = intext[int(ent_start):int(ent_last)]
                    fout.write(pmid+"\t"+ent_start+"\t"+ent_last+"\t"+ent_mention+"\t"+ent_type+"\n")
                fout.write('\n')
|
|
|
def geneNER(infolder, outpath, modelfile):
    """Load the NER model and tag every file in ``infolder``.

    The encoder checkpoint and casing are chosen from ``modelfile``: a name
    containing ``'bioformer'`` (case-insensitive) selects the bioformer
    checkpoint, anything else the PubMedBERT one. Vocabulary and checkpoint
    paths are relative to the current working directory.

    Each input file is probed line by line: the first line containing
    ``<collection>`` marks it as BioC, the first line shaped like
    ``id|x|text`` marks it as PubTator. Files whose output already exists in
    ``outpath`` are skipped; files of unrecognised format are reported and
    skipped.

    Args:
        infolder: folder with the input files.
        outpath: folder that receives one tagged file per input file.
        modelfile: path of the trained NER weights to load.
    """
    print('loading NER models........')

    if 'bioformer' in modelfile.lower():
        vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
                    'lowercase':False,
                    }
    else:
        vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
                    'lowercase':True,
                    }

    nn_model=model_ner.HUGFACE_NER(vocabfiles)
    nn_model.build_encoder()
    nn_model.build_softmax_decoder()
    nn_model.load_model(modelfile)

    print("begin GeneNER tagging........")
    start_time=time.time()

    # Compiled once; raw strings keep '\|' from being an invalid str escape.
    pattern_bioc = re.compile(r'.*<collection>.*')
    pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')

    for infile in os.listdir(infolder):
        if os.path.isfile(outpath+"/"+infile):
            print(infile+' has exsited.')
            continue

        print('processing:',infile)
        input_format=""
        with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
            for line in fin:
                if pattern_bioc.search(line):
                    input_format="BioC"
                    break
                if pattern_pubtator.search(line):
                    input_format="PubTator"
                    break

        if input_format == "PubTator":
            NER_PubTator(infolder,infile,outpath,nn_model)
        elif input_format == "BioC":
            NER_BioC(infolder,infile,outpath,nn_model)
        else:
            print('unrecognized input format, skipped:',infile)

    print('tag done:',time.time()-start_time)
|
|
|
|
|
|
|
def _sa_bioc_store(results, pmid, ann_id, value):
    """Set ``results[pmid][ann_id] = value``, creating the inner dict on first use."""
    results.setdefault(pmid, {})[ann_id] = value


def _sa_bioc_count(cache, pmid, mention, species):
    """Increment ``cache[pmid][mention][species]``, creating missing levels."""
    per_mention = cache.setdefault(pmid, {}).setdefault(mention, {})
    per_mention[species] = per_mention.get(species, 0) + 1


def _sa_bioc_most_frequent(counts):
    """Return the key with the highest count; ties go to the greatest key."""
    return max(zip(counts.values(), counts.keys()))[1]


def _sa_bioc_rows(doc_pmid, passage, shift, gene_types, species_counters):
    """Turn the annotations of one passage into PubTator-style rows.

    Offsets are made passage-relative and then moved by ``shift``. Every
    ``Identifier`` infon containing ``'*'`` is counted in each dict of
    ``species_counters``. Returns ``(rows, gene_total)`` where ``gene_total``
    is the number of annotations whose type is in ``gene_types``.
    """
    rows = []
    gene_total = 0
    for annotation in passage.annotations:
        ann_type = annotation.infons['type']
        if ann_type in gene_types:
            gene_total += 1
        location = annotation.locations[0]
        ent_start = shift + location.offset - passage.offset
        ent_end = ent_start + location.length
        row = doc_pmid+'\t'+annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+annotation.text+'\t'+ann_type
        if 'Identifier' in annotation.infons:
            species_id = annotation.infons['Identifier']
            if species_id.find('*')>=0:
                for counter in species_counters:
                    counter[species_id] = counter.get(species_id, 0) + 1
            row += '\t'+species_id
        rows.append(row)
    return rows, gene_total


def _sa_bioc_emit(buffers, doc_pmid, title, abstract, rows, species_counts, gene_total):
    """Append one PubTator record to the buffer chosen by its starred-species count.

    ``buffers`` is indexed 0 / 1 / 2 for zero / exactly one / two-or-more
    distinct starred species. Nothing is written when the record has no gene
    annotation. The single-species buffer gets an extra third line holding
    that species id without its first character.
    """
    if gene_total <= 0:
        return
    bucket = min(len(species_counts), 2)
    out = buffers[bucket]
    out.write(doc_pmid+'|t|'+title+'\n')
    out.write(doc_pmid+'|a|'+abstract+'\n')
    if bucket == 1:
        major_speciesid, = species_counts
        out.write(major_speciesid[1:]+'\n')
    for row in rows:
        out.write(row+'\n')
    out.write('\n')


def SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
    """Assign a species ``Identifier`` to every annotation lacking one in a BioC file.

    Steps, in order:

    1. Load ``infolder/infile`` and convert it to in-memory PubTator records.
       Documents with at most two passages form one record (first passage as
       title, second as abstract); in longer documents every passage with
       non-blank text and a ``type`` infon other than ``'ref'`` forms its own
       record. Records go to one of three buffers depending on how many
       distinct ``Identifier`` values containing ``'*'`` they hold (0, 1, 2+).
    2. Records with 2+ such species are sent to ``sa_tag.ml_tag_main``. Its
       last column is stored as ``'Focus:<value>'``, except that a gene
       mention whose first two characters match an entry of ``prefix_dict``
       for one of the document's species gets that species instead.
    3. Records with one species assign it to every row without a species column.
    4. Records with no species fall back, in order, to the species most often
       recorded for the same mention text in steps 2-3, the most frequent
       starred species of the document, and finally ``'Focus:9606'``.
    5. The results are written into the ``Identifier`` infons (with ``,9606``
       appended when the species id is in ``virus_set``) and the collection is
       dumped to ``outpath/infile``.

    Args:
        infolder: folder containing the input file.
        infile: file name, used for both the input and the output.
        outpath: folder that receives the result file.
        nn_model: loaded species-assignment model, passed to ``sa_tag.ml_tag_main``.
        virus_set: species ids that additionally receive ``,9606``.
        prefix_dict: maps a species id to the list of gene-name prefixes
            that select it.
    """
    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        collection = bioc.load(fin)

    # Buffers for records with zero / one / two-or-more starred species.
    fin_pubtator0=io.StringIO()
    fin_pubtator1=io.StringIO()
    fin_pubtator2=io.StringIO()
    buffers=(fin_pubtator0, fin_pubtator1, fin_pubtator2)

    species_count={}
    gene_set=['Gene','FamilyName']
    final_sa_results={}

    for document in collection.documents:
        doc_pmid=document.id
        doc_species={}
        if len(document.passages)<=2:
            doc_title=''
            doc_abstract=''
            doc_rows=[]
            doc_genes=0
            for passage_num, passage in enumerate(document.passages, 1):
                if passage_num==1:
                    doc_title=passage.text
                    shift=0
                else:
                    doc_abstract=passage.text
                    # Abstract offsets follow the title plus the joining character.
                    shift=len(doc_title)+1
                rows, genes = _sa_bioc_rows(doc_pmid, passage, shift, gene_set, (doc_species,))
                doc_rows.extend(rows)
                doc_genes+=genes
            _sa_bioc_emit(buffers, doc_pmid, doc_title, doc_abstract, doc_rows, doc_species, doc_genes)
        else:
            for passage in document.passages:
                if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':
                    passage_species={}
                    rows, genes = _sa_bioc_rows(doc_pmid, passage, 0, gene_set, (doc_species, passage_species))
                    # Each passage is its own record: text as title, empty abstract.
                    _sa_bioc_emit(buffers, doc_pmid, passage.text, '', rows, passage_species, genes)
        species_count[doc_pmid]=doc_species

    # cache_geneid[pmid][mention text][species] -> number of times assigned.
    cache_geneid={}

    # Records with two or more species: ask the model.
    if fin_pubtator2.getvalue()!='':
        ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
        for doc in ml_out.getvalue().strip().split('\n\n'):
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]

            _prefix_str2id_dict={}
            for starred_id in species_count[pmid]:
                bare_id=starred_id[1:]
                if bare_id in prefix_dict:
                    for ele in prefix_dict[bare_id]:
                        _prefix_str2id_dict[ele]=bare_id

            for line in lines[2:]:
                segs=line.split('\t')
                ml_species='Focus:'+segs[-1]
                _sa_bioc_store(final_sa_results, pmid, segs[1], ml_species)
                if segs[5] in gene_set:
                    prefix=segs[4][0:2]
                    if prefix in _prefix_str2id_dict:
                        # The prefix rule overrides the model's answer.
                        _sa_bioc_store(final_sa_results, pmid, segs[1], 'Focus:'+_prefix_str2id_dict[prefix])
                    _sa_bioc_count(cache_geneid, pmid, segs[4], ml_species)

    # Records with exactly one species: every gene gets that species.
    if fin_pubtator1.getvalue()!='':
        for doc in fin_pubtator1.getvalue().strip().split('\n\n'):
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            major_species='Focus:'+lines[2]
            for line in lines[3:]:
                segs=line.split('\t')
                if len(segs)>=7:
                    # Row already carries a species column; keep it.
                    _sa_bioc_store(final_sa_results, pmid, segs[1], segs[-1])
                else:
                    _sa_bioc_store(final_sa_results, pmid, segs[1], major_species)
                    _sa_bioc_count(cache_geneid, pmid, segs[4], major_species)

    # Records with no species: fall back to the cache, the document, then 9606.
    for doc in fin_pubtator0.getvalue().strip().split('\n\n'):
        lines=doc.split('\n')
        pmid=lines[0].split('|t|')[0]
        for line in lines[2:]:
            segs=line.split('\t')
            mention_counts=cache_geneid.get(pmid, {}).get(segs[4])
            if mention_counts is not None:
                assigned=_sa_bioc_most_frequent(mention_counts)
            elif species_count.get(pmid):
                assigned='Focus:'+_sa_bioc_most_frequent(species_count[pmid])[1:]
            else:
                assigned='Focus:9606'
            _sa_bioc_store(final_sa_results, pmid, segs[1], assigned)

    # Write the results back into the collection loaded above; it has not been
    # modified so far, so re-reading the file is unnecessary.
    for document in collection.documents:
        doc_pmid=document.id
        doc_results=final_sa_results.get(doc_pmid, {})
        doc_cache=cache_geneid.get(doc_pmid, {})
        for passage in document.passages:
            for temp_annotation in passage.annotations:
                if 'Identifier' in temp_annotation.infons:
                    continue
                if temp_annotation.id in doc_results:
                    assigned=doc_results[temp_annotation.id]
                    # [6:] drops the leading 'Focus:' before the virus lookup.
                    if assigned[6:] in virus_set:
                        assigned+=',9606'
                    temp_annotation.infons['Identifier']=assigned
                elif temp_annotation.text in doc_cache:
                    temp_annotation.infons['Identifier']=_sa_bioc_most_frequent(doc_cache[temp_annotation.text])
                else:
                    temp_annotation.infons['Identifier']='Focus:9606'

    with open(outpath+"/"+infile,'w', encoding='utf8') as fout_xml:
        bioc.dump(collection, fout_xml, pretty_print=True)
|
|
|
|
|
|
|
def SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
    """Append a ``Focus:<species>`` column to annotation lines of a PubTator file.

    The file is split into records on blank lines. Annotation lines with six
    or more tab-separated fields are treated as already carrying a species
    column and are copied unchanged. The remaining lines are completed
    according to how many distinct species ids containing ``'*'`` the record
    holds:

    * two or more: the value comes from ``sa_tag.ml_tag_main``, or from the
      prefix rule (first two characters of the mention found in
      ``prefix_dict`` for a species seen earlier in the record);
    * exactly one: that species, without its first character;
    * none: ``9606``.

    ``,9606`` is appended when the assigned id is in ``virus_set``. Records
    without any gene annotation are copied unchanged.

    Args:
        infolder: folder containing the input file.
        infile: file name, used for both the input and the output.
        outpath: folder that receives the result file.
        nn_model: loaded species-assignment model, passed to ``sa_tag.ml_tag_main``.
        virus_set: species ids that additionally receive ``,9606``.
        prefix_dict: maps a species id to the list of gene-name prefixes
            that select it.
    """
    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        all_in_ori=fin.read().strip().split('\n\n')

    # Records with two or more species, handed to the model.
    fin_pubtator2=io.StringIO()
    species_gene_count={}
    gene_set=['Gene','FamilyName']
    # ML_results[pmid]['start-end'] -> assigned species id.
    ML_results={}

    for document in all_in_ori:
        lines=document.split('\n')
        doc_pmid=lines[0].split('|t|')[0]
        doc_title=lines[0].split('|t|')[1]
        doc_abstract=lines[1].split('|a|')[1]
        doc_annotation=[]
        _species_num=set()
        _gene_num=0
        _ML_gene_num=0
        _prefix_str2id_dict={}
        for _entity_num, line in enumerate(lines[2:]):
            segs=line.split('\t')
            if segs[4] in gene_set:
                _gene_num+=1
            # Same line with a running entity number inserted after the pmid.
            numbered=segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])
            if len(segs)>=6:
                doc_annotation.append(numbered)
                species_ID=segs[-1]
                if species_ID.find('*')>=0:
                    _species_num.add(species_ID)
                    if species_ID[1:] in prefix_dict:
                        for ele in prefix_dict[species_ID[1:]]:
                            _prefix_str2id_dict[ele]=species_ID[1:]
            elif segs[3][0:2] in _prefix_str2id_dict:
                # Prefix rule: decided here, no model call needed for this line.
                assigned=_prefix_str2id_dict[segs[3][0:2]]
                if assigned in virus_set:
                    assigned+=',9606'
                doc_annotation.append(numbered+'\tFocus:'+assigned)
                ML_results.setdefault(doc_pmid, {})[segs[1]+'-'+segs[2]]=assigned
            else:
                doc_annotation.append(numbered)
                if segs[4] in gene_set:
                    _ML_gene_num+=1

        if len(_species_num)>=2 and _ML_gene_num>0:
            fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
            fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
            for ele in doc_annotation:
                fin_pubtator2.write(ele+'\n')
            fin_pubtator2.write('\n')

        species_gene_count[doc_pmid]={'spec':_species_num,'gene':_gene_num}

    if fin_pubtator2.getvalue()!='':
        ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
        for doc in ml_out.getvalue().strip().split('\n\n'):
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            for line in lines[2:]:
                segs=line.split('\t')
                # Columns are shifted by the inserted entity number.
                ML_results.setdefault(pmid, {})[segs[2]+'-'+segs[3]]=segs[-1]

    with open(outpath+"/"+infile,'w', encoding='utf8') as fout_pubtator:
        for doc in all_in_ori:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            fout_pubtator.write(lines[0]+'\n'+lines[1]+'\n')

            focus_species=species_gene_count[pmid]['spec']
            has_gene=species_gene_count[pmid]['gene']>0

            for line in lines[2:]:
                segs=line.split('\t')
                if (not has_gene) or len(segs)>=6:
                    fout_pubtator.write(line+'\n')
                    continue

                if len(focus_species)>1:
                    assigned=ML_results[pmid][segs[1]+'-'+segs[2]]
                elif len(focus_species)==1:
                    major_species,=focus_species
                    assigned=major_species[1:]
                else:
                    assigned='9606'

                # Exactly one output line per annotation, in every branch.
                if assigned in virus_set:
                    assigned+=',9606'
                fout_pubtator.write(line+'\tFocus:'+assigned+'\n')
            fout_pubtator.write('\n')
|
|
|
|
|
|
|
def speciesAss(infolder,outpath, modelfile):
    """Load the species-assignment model and process every file in ``infolder``.

    The encoder checkpoint and casing are chosen from ``modelfile``: a name
    containing ``'bioformer'`` (case-insensitive) selects the bioformer
    checkpoint, anything else the PubMedBERT one. Two dictionaries are read
    from ``./Dictionary``: one virus species id per line, and a tab-separated
    prefix table (species id, then ``|``-separated prefixes). Lines of the
    prefix table without a second column are ignored. All of these paths are
    relative to the current working directory.

    Each input file is probed line by line: the first line containing
    ``<collection>`` marks it as BioC, the first line shaped like
    ``id|x|text`` marks it as PubTator. Files whose output already exists in
    ``outpath`` are skipped; files of unrecognised format are reported and
    skipped.

    Args:
        infolder: folder with the NER-tagged input files.
        outpath: folder that receives one result file per input file.
        modelfile: path of the trained species-assignment weights to load.
    """
    print('loading SA models........')

    if 'bioformer' in modelfile.lower():
        vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
                    'lowercase':False,
                    }
    else:
        vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
                    'lowercase':True,
                    }

    nn_model=model_sa.HUGFACE_NER(vocabfiles)
    nn_model.build_encoder()
    nn_model.build_softmax_decoder()
    nn_model.load_model(modelfile)

    dict_filename={'prefix':'./Dictionary/SPPrefix.txt',
                   'virus':'./Dictionary/SP_Virus2HumanList.txt'}

    with open(dict_filename['virus'],'r',encoding='utf-8') as fin:
        virus_set=set(fin.read().strip().split('\n'))

    prefix_dict={}
    with open(dict_filename['prefix'],'r',encoding='utf-8') as fin:
        for line in fin:
            seg= line.strip().split('\t')
            if len(seg)<2:
                # Blank or malformed line: nothing to register.
                continue
            prefix_dict.setdefault(seg[0], []).extend(seg[1].split('|'))

    print("begin species assignment........")
    start_time=time.time()

    # Compiled once; raw strings keep '\|' from being an invalid str escape.
    pattern_bioc = re.compile(r'.*<collection>.*')
    pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')

    for infile in os.listdir(infolder):
        if os.path.isfile(outpath+"/"+infile):
            print(infile+' has exsited.')
            continue

        print('Processing:',infile)
        file_format=""
        with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
            for line in fin:
                if pattern_bioc.search(line):
                    file_format="BioC"
                    break
                if pattern_pubtator.search(line):
                    file_format="PubTator"
                    break

        if file_format == "PubTator":
            SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
        elif file_format == "BioC":
            SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
        else:
            print('unrecognized input format, skipped:',infile)

    print('species assignment done:',time.time()-start_time)
|
|
|
if __name__=='__main__':
    # Command-line entry point. Depending on which model files are given it
    # runs gene NER, species assignment, or NER followed by species assignment
    # (the NER output folder then becomes the species-assignment input).

    parser = argparse.ArgumentParser(description='run GeneNER and species assignment, python GeneNER_SpeAss_run.py -i input -n NERmodel -s SAmodel -r neroutput -a saoutput')
    parser.add_argument('--infolder', '-i', help="input folder",default='./example/input/')
    parser.add_argument('--NERmodel', '-n', help="trained deep learning NER model file",default='')
    parser.add_argument('--SAmodel', '-s', help="trained deep learning species assignment model file",default='')
    parser.add_argument('--NERoutpath', '-r', help="output folder to save the NER tagged results",default='./example/ner_output/')
    parser.add_argument('--SAoutpath', '-a', help="output folder to save the SA tagged results",default='./example/sa_output/')
    parser.add_argument('--NUM_THREADS', '-t', help="Number of threads",default='3')
    args = parser.parse_args()

    def _prepare_folder(path):
        """Return ``path`` ending in '/', creating the folder when it is missing."""
        if not path:
            # An empty value used to fail with IndexError on path[-1].
            parser.error('folder arguments must not be empty')
        if not path.endswith('/'):
            path+='/'
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    # Anything that is not a plain non-negative integer falls back to 3 threads.
    if not args.NUM_THREADS.isdigit():
        args.NUM_THREADS='3'

    tf.config.threading.set_inter_op_parallelism_threads(int(args.NUM_THREADS))
    tf.config.threading.set_intra_op_parallelism_threads(int(args.NUM_THREADS))

    if args.NERmodel!='' and args.SAmodel!='':
        print('==============\n| GeneNER and SpeAss |\n==============')

        args.infolder=_prepare_folder(args.infolder)
        args.NERoutpath=_prepare_folder(args.NERoutpath)
        args.SAoutpath=_prepare_folder(args.SAoutpath)

        geneNER(args.infolder,args.NERoutpath, args.NERmodel)
        # Species assignment reads what the NER step just wrote.
        speciesAss(args.NERoutpath,args.SAoutpath, args.SAmodel)

    elif args.NERmodel!='' and args.SAmodel=='':
        args.infolder=_prepare_folder(args.infolder)
        args.NERoutpath=_prepare_folder(args.NERoutpath)

        print('==============\n| GeneNER |\n==============')
        geneNER(args.infolder,args.NERoutpath,args.NERmodel)

    elif args.NERmodel=='' and args.SAmodel!='':
        # The input folder is used as given here; only the output is prepared.
        args.SAoutpath=_prepare_folder(args.SAoutpath)

        print('==============\n| SpeAss |\n==============')
        speciesAss(args.infolder,args.SAoutpath,args.SAmodel)

    else:
        print('Please provide models!')
|
|
|
|
|
|