Steven Tang
Commit all
69fb171
raw
history blame
37.2 kB
/**
* Project: GNormPlus
* Function: Gene Normalization
*/
package GNormPluslib;
import bioc.BioCAnnotation;
import bioc.BioCCollection;
import bioc.BioCDocument;
import bioc.BioCLocation;
import bioc.BioCPassage;
import bioc.io.BioCDocumentWriter;
import bioc.io.BioCFactory;
import bioc.io.woodstox.ConnectorWoodstox;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;
import java.text.DecimalFormat;
import java.math.RoundingMode;
import javax.xml.stream.XMLStreamException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
public class GN
{
public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>();
private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF)
{
/*
* define gene/homo id
*/
//LF
LF = LF.toLowerCase();
LF = LF.replaceAll("([0-9])([a-z])", "$1 $2");
LF = LF.replaceAll("([a-z])([0-9])", "$1 $2");
LF = LF.replaceAll("([\\W\\-\\_])", " ");
LF = LF.replaceAll("[ ]+", " ");
String LF_tkn[]=LF.split(" ");
int LF_ParticalMatch = 0;
Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)");
Matcher mtmp = ptmp.matcher(geneid);
Pattern ptmp2 = Pattern.compile("([0-9]+)");
Matcher mtmp2 = ptmp.matcher(geneid);
if(mtmp.find())
{
geneid = "Homo:"+mtmp.group(1);
}
else
{
geneid = "Gene:"+geneid;
}
if(GNormPlus.GeneScoring_hash.containsKey(geneid))
{
HashMap<String,Double> TF = new HashMap<String,Double>(); // token i in gene j
HashMap<String,Double> TermFrequency = new HashMap<String,Double>();
/*
* Tokens in Query (Gene id lexicon)
*/
String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
String tkns_Gene[] = l[0].split(",");
for(int i=0;i<tkns_Gene.length;i++)
{
String Tkn_Freq[] = tkns_Gene[i].split("-");
TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
}
Double Cj = Double.parseDouble(l[1]);
Double AllTknNum = Double.parseDouble(l[2]);
//Double Cj_max = Double.parseDouble(l[3]);
//Double MaxTknNum = Double.parseDouble(l[4]);
Double Norm = Double.parseDouble(l[5]);
if(Norm == 0.0){Norm=1.0;}
/*
* Tokens in Document (recognized mentions)
*/
for(String Mention : Mention_hash.keySet())
{
Mention = Mention.toLowerCase();
Mention = Mention.replaceAll("([0-9])([a-z])", "$1 $2");
Mention = Mention.replaceAll("([a-z])([0-9])", "$1 $2");
Mention = Mention.replaceAll("([\\W\\-\\_])", " ");
Mention = Mention.replaceAll("[ ]+", " ");
String tkns_Mention[]=Mention.split(" ");
for(int i=0;i<tkns_Mention.length;i++)
{
if(TermFrequency.containsKey(tkns_Mention[i]))
{
TF.put(tkns_Mention[i], TermFrequency.get(tkns_Mention[i]));
}
}
}
Double score=0.0;
for(String Tkn : TF.keySet())
{
//LF
for(int t=0;t<LF_tkn.length;t++)
{
if(LF_tkn[t].equals(Tkn))
{
LF_ParticalMatch++;
}
}
double TFij = TF.get(Tkn)/AllTknNum;
double IDFi=GNormPlus.GeneScoringDF_hash.get(Tkn);
score=score+TFij*IDFi*(1/(1-TFij));
}
//score = Cj * (1/Norm) *score;
if(LF_ParticalMatch>0){score = score + LF_ParticalMatch;/*System.out.println(geneid+"\t"+LF+"\t"+score);*/}
return score;
}
else
{
//System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash");
return 0.0;
}
}
public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException
{
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
{
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
{
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String start=anno[0];
String last=anno[1];
String mentions=anno[2];
String type=anno[3];
String id="";
if(anno.length>=5)
{
id=anno[4];
}
if(type.equals("Gene"))
{
String mentionArr[] = mentions.split("\\|");
boolean update=false;
for(int m=0;m<mentionArr.length;m++)
{
Pattern ptmp = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
Matcher mtmp = ptmp.matcher(mentionArr[m]);
Pattern ptmp2 = Pattern.compile("^(.+)nu$");
Matcher mtmp2 = ptmp2.matcher(mentionArr[m]);
Pattern ptmp3 = Pattern.compile("^(.*)alpha(.*)$");
Matcher mtmp3 = ptmp3.matcher(mentionArr[m]);
Pattern ptmp4 = Pattern.compile("^(.*)beta(.*)$");
Matcher mtmp4 = ptmp4.matcher(mentionArr[m]);
Pattern ptmp5 = Pattern.compile("^(.+[0-9])a$");
Matcher mtmp5 = ptmp5.matcher(mentionArr[m]);
Pattern ptmp6 = Pattern.compile("^(.+[0-9])b$");
Matcher mtmp6 = ptmp6.matcher(mentionArr[m]);
Pattern ptmp7 = Pattern.compile("^(.+)II([a-z])$");
Matcher mtmp7 = ptmp7.matcher(mentionArr[m]);
Pattern ptmp8 = Pattern.compile("^(.+)III([a-z])$");
Matcher mtmp8 = ptmp8.matcher(mentionArr[m]);
if(mtmp.find())
{
mentions=mentions+"|"+mtmp.group(1);
update=true;
}
if(mtmp2.find())
{
mentions=mentions+"|"+mtmp2.group(1);
update=true;
}
if(mtmp3.find())
{
mentions=mentions+"|"+mtmp3.group(1)+"a"+mtmp3.group(2);
update=true;
}
if(mtmp4.find())
{
mentions=mentions+"|"+mtmp4.group(1)+"b"+mtmp4.group(2);
update=true;
}
if(mtmp5.find())
{
mentions=mentions+"|"+mtmp5.group(1)+"alpha";
update=true;
}
if(mtmp6.find())
{
mentions=mentions+"|"+mtmp6.group(1)+"beta";
update=true;
}
if(mtmp7.find())
{
mentions=mentions+"|"+mtmp7.group(1)+"2"+mtmp7.group(2);
update=true;
}
if(mtmp8.find())
{
mentions=mentions+"|"+mtmp8.group(1)+"3"+mtmp8.group(2);
update=true;
}
}
if(update == true)
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id );
}
}
}
}
}
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
}
public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
{
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
{
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
{
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
/** Chromosome recognition */
ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation");
for (int k = 0 ; k < locations.size() ; k++)
{
String anno[]=locations.get(k).split("\t");
//int start= Integer.parseInt(anno[0]);
//int last= Integer.parseInt(anno[1]);
//String mention = anno[2];
String ids = anno[3];
//GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tChromosomeLocation\t"+ids); //paragraph
String IDs[] = ids.split("[\\|,]");
for(int idcount=0;idcount<IDs.length;idcount++)
{
//IDs[idcount] = IDs[idcount].replaceAll("\\-[0-9]+", "");
GNormPlus.Pmid2ChromosomeGene_hash.put(Pmid+"\t"+IDs[idcount],"");
}
}
}
}
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
}
public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException
{
final DecimalFormat df = new DecimalFormat("0.####");
df.setRoundingMode(RoundingMode.HALF_UP);
//Tokenization
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
{
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
/** Species */
HashMap<String,String> Species_hash = new HashMap<String,String>();
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
{
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String mentions=anno[2];
String type=anno[3];
if(type.matches("(Species|Genus|Strain|CellLine|Cell)"))
{
Species_hash.put(mentions,"");
}
}
}
/*
* Collect Gene mentions :
*
* GeneMention-taxid -> "ID" : geneid
* -> "type" : "Gene"
* -> start1-last1 : ""
* -> start2-last2 : ""
* -> start3-last3 : ""
*/
String tiabs="";
for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
{
tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase();
}
HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>();
HashMap<String,String> Mention_hash = new HashMap<String,String>();
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
{
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String start=anno[0];
String last=anno[1];
String mentions=anno[2];
String type=anno[3];
String taxids="Tax:9606";
if(anno.length>=5)
{
taxids=anno[4];
}
String mentions_tmp=mentions.toLowerCase();
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
if(taxids.equals(""))
{
taxids="9606";
}
/** Filtering */
boolean found_filter = false;
if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering
{
found_filter=true;
}
if(found_filter==false) //abbreviation
{
for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet())
{
if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") ||
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*")
)
{
String lf=GNormPlus.Filtering_WithLongForm_hash.get(f);
if(tiabs.matches(".*"+lf+".*"))
{
found_filter=true;
break;
}
}
}
}
if(found_filter==false)
{
if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's
)
{
found_filter=true;
}
}
if(found_filter == false)
{
if(type.matches("Gene"))
{
if(GeneMention_hash.containsKey(mentions+"\t"+taxids))
{
GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,"");
}
else
{
HashMap<String,String> offset_hash = new HashMap<String,String>();
offset_hash.put(start+"\t"+last,"");
GeneMention_hash.put(mentions+"\t"+taxids, offset_hash);
GeneMention_hash.get(mentions+"\t"+taxids).put("type", type);
Mention_hash.put(mentions,"Gene");
}
}
else if(type.matches("(FamilyName|DomainMotif)"))
{
String GMs[]=mentions.split("\\|");
for(int g=0;g<GMs.length;g++)
{
String mention = GMs[g];
Mention_hash.put(mention,"FamilyDomain");
}
}
}
}
}
/*
* Gene id refinement:
* 1. Official name
* 2. only one gene
*/
HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>();
HashMap<String,String> MultiGene2ID = new HashMap<String,String>();
for(String GeneMentionTax : GeneMention_hash.keySet())
{
String GT[]=GeneMentionTax.split("\\t");
String mentions=GT[0];
String taxids=GT[1];
String GMs[]=mentions.split("\\|");
HashMap<String,String> taxids_hash = new HashMap<String,String>();
String taxids_arr[]=taxids.split(",");
for(int t=0;t<taxids_arr.length;t++)
{
taxids_hash.put(taxids_arr[t], "");
}
for(int ms=0;ms<GMs.length;ms++)
{
String mention = GMs[ms];
String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
String IDs[]=IDstr.split("\\|");
/*
* printing the ambiguous gene mentions and candidates
*/
//String IDs_s[]=IDstr.split(",");
//if(IDs_s.length>1)
//{
// System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
//}
for(int c=0;c<IDs.length;c++)
{
String tax2ID[]=IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
if(taxids_hash.containsKey(tax2ID[0]))
{
String geneid=tax2ID[1];
String TargetTax=tax2ID[0];
GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
break;
}
}
//geneid refinement
if(GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
{
Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
if(mtmp.find()) // 1. Official Name
{
GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1));
GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1));
}
else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
{
GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID"));
}
else
{
String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
boolean FoundByChroLoca=false;
for(int idcount=0;idcount<ID.length;idcount++)
{
if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) // 3. Chromosome location
{
GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]);
FoundByChroLoca=true;
break;
}
}
if(FoundByChroLoca == false)
{
MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
}
}
}
if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3")))
{
break;
}
}
}
/*
* Gene id refinement:
* 3. multiple genes but can be inferred by 1. and 2.
*/
for(String GeneMentionTax_M : MultiGene2ID.keySet())
{
for(String GeneMentionTax_G : GuaranteedGene2ID.keySet())
{
String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
for(int m=0;m<MG.length;m++)
{
if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G)))
{
GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]);
}
}
}
}
/*
* Gene id refinement:
* 4. FullName -> Abbreviation
*/
for(String GeneMentionTax : GeneMention_hash.keySet())
{
String MT[] = GeneMentionTax.split("\\t");
if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0]))
{
String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
{
GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
}
}
}
/*
* Gene id refinement:
* 5. Ranking by scoring function (inference network)
*/
for(String GeneMentionTax : GeneMention_hash.keySet())
{
if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+"))
{
String geneids=GeneMention_hash.get(GeneMentionTax).get("ID");
String geneid[] = geneids.split(",");
String OutputStyle="Top1";
if(OutputStyle.equals("Top1"))
{
//only return the best one
double max_score=0.0;
String target_geneid="";
for(int g=0;g<geneid.length;g++)
{
String MT[] = GeneMentionTax.split("\\t");
String LF="";
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
{
LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
}
double score = ScoringFunction(geneid[g],Mention_hash,LF);
if(score>max_score)
{
max_score=score;
target_geneid=geneid[g];
}
else if(score == 0.0)
{
//System.out.println(GeneMentionTax);
}
}
GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
}
else // "All"
{
//return all geneids
String geneSTR="";
for(int g=0;g<geneid.length;g++)
{
String MT[] = GeneMentionTax.split("\\t");
String LF="";
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
{
LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
}
double score = ScoringFunction(geneid[g],Mention_hash,LF);
String hoge = df.format(score);
score=Double.parseDouble(hoge);
if(geneSTR.equals(""))
{
geneSTR=geneid[g]+"-"+score;
}
else
{
geneSTR=geneSTR+","+geneid[g]+"-"+score;
}
}
GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
}
}
}
/*
* Gene id refinement: - removed (Reason: cause too much False Positive)
* 6. Abbreviation -> FullName
*
*/
for(String GeneMentionTax : GeneMention_hash.keySet())
{
String MT[] = GeneMentionTax.split("\\t");
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
{
String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
{
GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
}
}
}
/*
* Gene id refinement:
* 7. The inference network tokens of Abbreviation.ID should contain at least LF tokens
* 8. The short mention should be filtered if not long form support
*/
ArrayList<String> removeGMT = new ArrayList<String>();
for(String GeneMentionTax : GeneMention_hash.keySet())
{
String GT[]=GeneMentionTax.split("\\t");
String mentions=GT[0];
String tax=GT[1];
if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
{
String type = GeneMention_hash.get(GeneMentionTax).get("type");
String id = GeneMention_hash.get(GeneMentionTax).get("ID");
String geneid="";
Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
Matcher mtmp1 = ptmp1.matcher(id);
Matcher mtmp2 = ptmp2.matcher(id);
//System.out.println(id);
if(mtmp1.find())
{
geneid = "Homo:"+mtmp1.group(2);
}
else if(mtmp2.find())
{
geneid = "Gene:"+mtmp2.group(1);
}
boolean LongFormTknMatch= false;
boolean LongFormExist= true;
if(GNormPlus.GeneScoring_hash.containsKey(geneid))
{
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase()))
{
/*
* token in lexicon : tkn_lexicon
* token in mention : tkn_mention
*/
String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
String tkns_Gene[] = l[0].split(",");
ArrayList<String> tkn_lexicon = new ArrayList<String>();
for(int ti=0;ti<tkns_Gene.length;ti++)
{
String Tkn_Freq[] = tkns_Gene[ti].split("-");
tkn_lexicon.add(Tkn_Freq[0]);
}
String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase());
LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
for(int tl=0;tl<tkn_lexicon.size();tl++)
{
for(int tm=0;tm<tkn_mention.length;tm++)
{
if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+")))
{
LongFormTknMatch = true;
}
}
}
}
else{LongFormExist = false;}
}
else{LongFormTknMatch = true;} // exception
if(LongFormTknMatch == false && LongFormExist == true) // 7.
{
removeGMT.add(GeneMentionTax); //remove short form
removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); //remove long form
}
else if(mentions.length()<=2 && LongFormExist == false) // 8.
{
removeGMT.add(GeneMentionTax);
}
}
}
for(int gmti=0;gmti<removeGMT.size();gmti++) // remove
{
GeneMention_hash.remove(removeGMT.get(gmti));
}
// Append gene ids
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) // Paragraphs : j
{
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String start=anno[0];
String last=anno[1];
String mentions=anno[2];
String type=anno[3];
String taxid_org="Tax:9606";
if(anno.length>=5)
{
taxid_org=anno[4];
}
String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
String GMs[]=mentions.split("\\|");
if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax"))
{
String taxtype=taxid_org.replaceAll(":([0-9,]+)","");
String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax");
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid);
}
if(type.equals("Gene"))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|");
if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID"))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," );
}
else // cannot find appropriate species
{
//System.out.println(mention+"\t"+taxid);
}
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$"
}
}
}
//Extend to all gene mentions
HashMap<String,String> GeneMentions = new HashMap<String,String>(); // Extending Gene mentions
HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); // Extending Gene mentions
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
{
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
int start = Integer.parseInt(anno[0]);
int last = Integer.parseInt(anno[1]);
String mentions=anno[2];
String type=anno[3];
String id="Tax:9606";
if(anno.length>=5)
{
id=anno[4];
}
if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)"))
{
GeneMentions.put(mentions.toLowerCase(), id);
for (int s=start ;s<=last;s++)
{
GeneMentionLocation.put(j+"\t"+s,"");
}
}
else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)"))
{
GeneMentions.put(mentions.toLowerCase(), id);
for (int s=start ;s<=last;s++)
{
GeneMentionLocation.put(j+"\t"+s,"");
}
}
}
}
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
{
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
{
String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
String PassageContexts_tmp = PassageContexts.toLowerCase();
for(String gm : GeneMentions.keySet())
{
String id = GeneMentions.get(gm);
if(gm.length()>=3)
{
gm = gm.replaceAll("[ ]*[\\|]*$", "");
gm = gm.replaceAll("^[\\|]*[ ]*", "");
gm = gm.replaceAll("[\\|][\\|]+", "\\|");
if(!gm.matches("[\\W\\-\\_]*"))
{
gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
while(mtmp.find())
{
String pre = mtmp.group(1);
String gmtmp = mtmp.group(2);
String post = mtmp.group(3);
int start = pre.length()-1;
int last = start+gmtmp.length();
if(PassageContexts.length()>=last+1)
{
String mention = PassageContexts.substring(start+1,last+1);
if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id);
}
}
gmtmp = gmtmp.replaceAll(".", "\\@");
PassageContexts_tmp=pre+""+gmtmp+""+post;
mtmp = ptmp.matcher(PassageContexts_tmp);
}
}
}
}
}
}
//Apply to FamilyNames
HashMap<String,String> geneids = new HashMap<String,String>(); // Extending Gene mentions
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
{
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String type=anno[3];
if(type.equals("Gene"))
{
String id="Tax:9606";
if(anno.length>=5)
{
id=anno[4];
}
Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
Matcher mtmp0 = ptmp0.matcher(id);
Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
Matcher mtmp1 = ptmp1.matcher(id);
if(mtmp0.find())
{
geneids.put(mtmp0.group(3), "");
}
if(mtmp1.find())
{
geneids.put(mtmp1.group(3), "");
}
}
}
}
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
{
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String mention=anno[2];
String type=anno[3];
if(type.matches("(FamilyName|DomainMotif)"))
{
String id="Tax:9606";
if(anno.length>=5)
{
id=anno[4];
}
String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
String IDstr[]=IDstrs.split("\\|");
String ids="";
for(int id_i=0;id_i<IDstr.length;id_i++)
{
if(geneids.containsKey(IDstr[id_i]))
{
if(ids.equals(""))
{
ids=IDstr[id_i];
}
else
{
ids=ids+";"+IDstr[id_i];
}
}
}
if(!ids.equals(""))
{
if(type.equals("FamilyName")){type="Gene";}
String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606";
if(anno.length>=5)
{
Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4];
}
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids);
}
else
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
}
}
}
}
//Species "*" and "(anti)" removed.
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
{
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
String type=anno[3];
if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell"))
{
String id=anno[4];
id=id.replaceAll("\\*", "");
id=id.replaceAll("\\(anti\\)", "");
String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id;
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k);
}
}
}
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
{
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
{
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
int start = Integer.parseInt(anno[0]);
int last = Integer.parseInt(anno[1]);
String mention = anno[2];
String type = anno[3];
String id = anno[4];
if(type.equals("Gene") && Species_hash.containsKey(mention))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
}
else if(type.equals("Gene") && id.equals(""))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
}
else
{
for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k
{
if(k1 != k)
{
String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t");
int start1 = Integer.parseInt(anno1[0]);
int last1 = Integer.parseInt(anno1[1]);
if((start1<start && last1>=last) || (start1<=start && last1>last))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
break;
}
}
}
}
}
}
}
if(GeneIDMatch == true)
{
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
}
else
{
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
}
}
/*
* Search Potential GeneID in the Prefix Tree
*/
public ArrayList<String> SearchGeneIDLocation(String Doc)
{
ArrayList<String> location = new ArrayList<String>();
String Doc_tmp=" "+Doc+" ";
Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
Matcher mtmp = ptmp.matcher(Doc_tmp);
while(mtmp.find())
{
String str1=mtmp.group(1);
String str2=mtmp.group(2);
String str3=mtmp.group(3);
for(int m=str1.length();m<=(str1.length()+str2.length());m++)
{
int start = str1.length()-1;
int last = start+str2.length();
String mention = Doc.substring(start, last);
if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*"))
{
if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) )
{
Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
Matcher mtmp1 = ptmp1.matcher(mention);
Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");
Matcher mtmp2 = ptmp2.matcher(mention);
if(mtmp1.find())
{
String S1 = mtmp1.group(1);
if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6)
{
int Num1 = Integer.parseInt(mtmp1.group(2));
int Num2 = Integer.parseInt(mtmp1.group(3));
String prefix = "";
Pattern ptmp3 = Pattern.compile("^([0]+)");
Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2));
if(mtmp3.find())
{
prefix = mtmp3.group(1);
}
if(Num2-Num1>0 && (Num2-Num1<=20))
{
for(int n=Num1;n<=Num2;n++)
{
String StrNum=S1+prefix+n;
if(StrNum.length()>=5)
{
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
}
}
}
}
}
else if(mtmp2.find())
{
if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6)
{
String S1 = mtmp2.group(1);
int Num1 = Integer.parseInt(mtmp2.group(2));
String S2 = mtmp2.group(3);
int Num2 = Integer.parseInt(mtmp2.group(4));
if(S1.equals(S2))
{
String prefix = "";
Pattern ptmp3 = Pattern.compile("^([0]+)");
Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2));
if(mtmp3.find())
{
prefix = mtmp3.group(1);
}
if(Num2-Num1>0 && (Num2-Num1<=20))
{
for(int n=Num1;n<=Num2;n++)
{
String StrNum=S1+prefix+n;
if(StrNum.length()>=5)
{
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
}
}
}
}
}
}
}
location.add(start+"\t"+last+"\t"+mention+"\tGeneID");
}
}
String men="";
for(int m=0;m<str2.length();m++){men=men+"@";}
Doc_tmp=str1+men+str3;
mtmp = ptmp.matcher(Doc_tmp);
}
return location;
}
public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
{
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
{
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
{
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
/** GeneID recognition by pattern match */
ArrayList<String> locations = SearchGeneIDLocation(PassageContext);
for (int k = 0 ; k < locations.size() ; k++)
{
String anno[]=locations.get(k).split("\t");
String mention = anno[2].toLowerCase();
mention = mention.replaceAll("[\\W\\-\\_]+", "");
if(GNormPlus.GeneIDs_hash.containsKey(mention))
{
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); //paragraph
}
}
}
}
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
}
}