|
|
|
|
|
|
|
|
|
|
|
package GNormPluslib;
|
|
|
|
import bioc.BioCAnnotation;
|
|
import bioc.BioCCollection;
|
|
import bioc.BioCDocument;
|
|
import bioc.BioCLocation;
|
|
import bioc.BioCPassage;
|
|
|
|
import bioc.io.BioCDocumentWriter;
|
|
import bioc.io.BioCFactory;
|
|
import bioc.io.woodstox.ConnectorWoodstox;
|
|
import java.io.BufferedReader;
|
|
import java.io.BufferedWriter;
|
|
import java.io.FileInputStream;
|
|
import java.io.FileNotFoundException;
|
|
import java.io.FileOutputStream;
|
|
import java.io.FileReader;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
import java.io.OutputStreamWriter;
|
|
import java.io.UnsupportedEncodingException;
|
|
import java.time.LocalDate;
|
|
import java.time.ZoneId;
|
|
|
|
import javax.xml.stream.XMLStreamException;
|
|
|
|
import java.util.Map;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.List;
|
|
|
|
public class BioCDoc
|
|
{
|
|
|
|
|
|
|
|
public ArrayList<String> PMIDs=new ArrayList<String>();
|
|
public ArrayList<ArrayList<String>> PassageNames = new ArrayList();
|
|
public ArrayList<ArrayList<Integer>> PassageOffsets = new ArrayList();
|
|
public ArrayList<ArrayList<String>> PassageContexts = new ArrayList();
|
|
public ArrayList<ArrayList<ArrayList<String>>> Annotations = new ArrayList();
|
|
|
|
public String BioCFormatCheck(String InputFile) throws IOException
|
|
{
|
|
|
|
ConnectorWoodstox connector = new ConnectorWoodstox();
|
|
BioCCollection collection = new BioCCollection();
|
|
try
|
|
{
|
|
collection = connector.startRead(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
|
|
}
|
|
catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e)
|
|
{
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
|
|
String line="";
|
|
String status="";
|
|
String Pmid = "";
|
|
boolean tiabs=false;
|
|
Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$");
|
|
while ((line = br.readLine()) != null)
|
|
{
|
|
Matcher mat = patt.matcher(line);
|
|
if(mat.find())
|
|
{
|
|
if(Pmid.equals(""))
|
|
{
|
|
Pmid = mat.group(1);
|
|
}
|
|
else if(!Pmid.equals(mat.group(1)))
|
|
{
|
|
return "[Error]: "+InputFile+" - A blank is needed between "+Pmid+" and "+mat.group(1)+".";
|
|
}
|
|
status = "tiabs";
|
|
tiabs = true;
|
|
}
|
|
else if (line.contains("\t"))
|
|
{
|
|
}
|
|
else if(line.length()==0)
|
|
{
|
|
if(status.equals(""))
|
|
{
|
|
if(Pmid.equals(""))
|
|
{
|
|
return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format. PMID is empty.";
|
|
}
|
|
else
|
|
{
|
|
return "[Error]: "+InputFile+" - A redundant blank is after "+Pmid+".";
|
|
}
|
|
}
|
|
Pmid="";
|
|
status="";
|
|
}
|
|
}
|
|
br.close();
|
|
if(tiabs == false)
|
|
{
|
|
return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format.";
|
|
}
|
|
if(status.equals(""))
|
|
{
|
|
return "PubTator";
|
|
}
|
|
else
|
|
{
|
|
return "[Error]: "+InputFile+" - The last column missed a blank.";
|
|
}
|
|
}
|
|
return "BioC";
|
|
}
|
|
public void PubTator2BioC(String input,String output) throws IOException, XMLStreamException
|
|
{
|
|
|
|
|
|
|
|
String parser = BioCFactory.WOODSTOX;
|
|
BioCFactory factory = BioCFactory.newFactory(parser);
|
|
BioCDocumentWriter BioCOutputFormat = factory.createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
|
|
BioCCollection biocCollection = new BioCCollection();
|
|
|
|
|
|
ZoneId zonedId = ZoneId.of( "America/Montreal" );
|
|
LocalDate today = LocalDate.now( zonedId );
|
|
biocCollection.setDate(today.toString());
|
|
|
|
biocCollection.setKey("BioC.key");
|
|
biocCollection.setSource("GNormPlus");
|
|
|
|
BioCOutputFormat.writeCollectionInfo(biocCollection);
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
|
ArrayList<String> ParagraphType=new ArrayList<String>();
|
|
ArrayList<String> ParagraphContent = new ArrayList<String>();
|
|
ArrayList<String> annotations = new ArrayList<String>();
|
|
String line;
|
|
String Pmid="";
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
if(line.contains("|") && !line.contains("\t"))
|
|
{
|
|
String str[]=line.split("\\|",-1);
|
|
Pmid=str[0];
|
|
if(str[1].equals("t"))
|
|
{
|
|
str[1]="title";
|
|
}
|
|
if(str[1].equals("a"))
|
|
{
|
|
str[1]="abstract";
|
|
}
|
|
ParagraphType.add(str[1]);
|
|
if(str.length==3)
|
|
{
|
|
String txt = str[2];
|
|
txt = txt.replaceAll("ω","w");
|
|
txt = txt.replaceAll("μ","u");
|
|
txt = txt.replaceAll("κ","k");
|
|
txt = txt.replaceAll("α","a");
|
|
txt = txt.replaceAll("γ","g");
|
|
txt = txt.replaceAll("ɣ","g");
|
|
txt = txt.replaceAll("β","b");
|
|
txt = txt.replaceAll("×","x");
|
|
txt = txt.replaceAll("‑","-");
|
|
txt = txt.replaceAll("¹","1");
|
|
txt = txt.replaceAll("²","2");
|
|
txt = txt.replaceAll("°","o");
|
|
txt = txt.replaceAll("ö","o");
|
|
txt = txt.replaceAll("é","e");
|
|
txt = txt.replaceAll("à","a");
|
|
txt = txt.replaceAll("Á","A");
|
|
txt = txt.replaceAll("ε","e");
|
|
txt = txt.replaceAll("θ","O");
|
|
txt = txt.replaceAll("•",".");
|
|
txt = txt.replaceAll("µ","u");
|
|
txt = txt.replaceAll("λ","r");
|
|
txt = txt.replaceAll("⁺","+");
|
|
txt = txt.replaceAll("ν","v");
|
|
txt = txt.replaceAll("ï","i");
|
|
txt = txt.replaceAll("ã","a");
|
|
txt = txt.replaceAll("≡","=");
|
|
txt = txt.replaceAll("ó","o");
|
|
txt = txt.replaceAll("³","3");
|
|
txt = txt.replaceAll("〖","[");
|
|
txt = txt.replaceAll("〗","]");
|
|
txt = txt.replaceAll("Å","A");
|
|
txt = txt.replaceAll("ρ","p");
|
|
txt = txt.replaceAll("ü","u");
|
|
txt = txt.replaceAll("ɛ","e");
|
|
txt = txt.replaceAll("č","c");
|
|
txt = txt.replaceAll("š","s");
|
|
txt = txt.replaceAll("ß","b");
|
|
txt = txt.replaceAll("═","=");
|
|
txt = txt.replaceAll("£","L");
|
|
txt = txt.replaceAll("Ł","L");
|
|
txt = txt.replaceAll("ƒ","f");
|
|
txt = txt.replaceAll("ä","a");
|
|
txt = txt.replaceAll("–","-");
|
|
txt = txt.replaceAll("⁻","-");
|
|
txt = txt.replaceAll("〈","<");
|
|
txt = txt.replaceAll("〉",">");
|
|
txt = txt.replaceAll("χ","X");
|
|
txt = txt.replaceAll("Đ","D");
|
|
txt = txt.replaceAll("‰","%");
|
|
txt = txt.replaceAll("·",".");
|
|
txt = txt.replaceAll("→",">");
|
|
txt = txt.replaceAll("←","<");
|
|
txt = txt.replaceAll("ζ","z");
|
|
txt = txt.replaceAll("π","p");
|
|
txt = txt.replaceAll("τ","t");
|
|
txt = txt.replaceAll("ξ","X");
|
|
txt = txt.replaceAll("η","h");
|
|
txt = txt.replaceAll("ø","0");
|
|
txt = txt.replaceAll("Δ","D");
|
|
txt = txt.replaceAll("∆","D");
|
|
txt = txt.replaceAll("∑","S");
|
|
txt = txt.replaceAll("Ω","O");
|
|
txt = txt.replaceAll("δ","d");
|
|
txt = txt.replaceAll("σ","s");
|
|
txt = txt.replaceAll("Φ","F");
|
|
txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
|
|
ParagraphContent.add(txt);
|
|
}
|
|
else
|
|
{
|
|
ParagraphContent.add("- No text -");
|
|
}
|
|
}
|
|
else if (line.contains("\t"))
|
|
{
|
|
String anno[]=line.split("\t");
|
|
if(anno.length==6)
|
|
{
|
|
annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]+"\t"+anno[5]);
|
|
}
|
|
else if(anno.length==5)
|
|
{
|
|
annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]);
|
|
}
|
|
}
|
|
else if(line.length()==0)
|
|
{
|
|
BioCDocument biocDocument = new BioCDocument();
|
|
biocDocument.setID(Pmid);
|
|
int startoffset=0;
|
|
for(int i=0;i<ParagraphType.size();i++)
|
|
{
|
|
BioCPassage biocPassage = new BioCPassage();
|
|
Map<String, String> Infons = new HashMap<String, String>();
|
|
Infons.put("type", ParagraphType.get(i));
|
|
biocPassage.setInfons(Infons);
|
|
biocPassage.setText(ParagraphContent.get(i));
|
|
biocPassage.setOffset(startoffset);
|
|
startoffset=startoffset+ParagraphContent.get(i).length()+1;
|
|
for(int j=0;j<annotations.size();j++)
|
|
{
|
|
String anno[]=annotations.get(j).split("\t");
|
|
if(Integer.parseInt(anno[0])<startoffset && Integer.parseInt(anno[0])>=startoffset-ParagraphContent.get(i).length()-1)
|
|
{
|
|
BioCAnnotation biocAnnotation = new BioCAnnotation();
|
|
Map<String, String> AnnoInfons = new HashMap<String, String>();
|
|
if(anno.length==5)
|
|
{
|
|
AnnoInfons.put("Identifier", anno[4]);
|
|
}
|
|
AnnoInfons.put("type", anno[3]);
|
|
biocAnnotation.setInfons(AnnoInfons);
|
|
BioCLocation location = new BioCLocation();
|
|
location.setOffset(Integer.parseInt(anno[0]));
|
|
location.setLength(Integer.parseInt(anno[1])-Integer.parseInt(anno[0]));
|
|
biocAnnotation.setLocation(location);
|
|
biocAnnotation.setText(anno[2]);
|
|
biocPassage.addAnnotation(biocAnnotation);
|
|
}
|
|
}
|
|
biocDocument.addPassage(biocPassage);
|
|
}
|
|
biocCollection.addDocument(biocDocument);
|
|
ParagraphType.clear();
|
|
ParagraphContent.clear();
|
|
annotations.clear();
|
|
BioCOutputFormat.writeDocument(biocDocument);
|
|
}
|
|
}
|
|
BioCOutputFormat.close();
|
|
inputfile.close();
|
|
}
|
|
public void BioC2PubTator(String input,String output) throws IOException, XMLStreamException
|
|
{
|
|
|
|
|
|
|
|
HashMap<String, String> pmidlist = new HashMap<String, String>();
|
|
boolean duplicate = false;
|
|
BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
|
|
ConnectorWoodstox connector = new ConnectorWoodstox();
|
|
BioCCollection collection = new BioCCollection();
|
|
collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
|
while (connector.hasNext())
|
|
{
|
|
BioCDocument document = connector.next();
|
|
String PMID = document.getID();
|
|
if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
|
|
else{pmidlist.put(PMID,"");}
|
|
String Anno="";
|
|
for (BioCPassage passage : document.getPassages())
|
|
{
|
|
if(passage.getInfon("type").equals("title"))
|
|
{
|
|
PubTatorOutputFormat.write(PMID+"|t|"+passage.getText()+"\n");
|
|
}
|
|
else if(passage.getInfon("type").equals("abstract"))
|
|
{
|
|
PubTatorOutputFormat.write(PMID+"|a|"+passage.getText()+"\n");
|
|
}
|
|
else
|
|
{
|
|
PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
|
|
}
|
|
|
|
for (BioCAnnotation annotation : passage.getAnnotations())
|
|
{
|
|
String Annotype = annotation.getInfon("type");
|
|
String Annoid="";
|
|
String Proteinid="";
|
|
if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
|
|
{
|
|
if(annotation.getInfons().containsKey("NCBI Gene"))
|
|
{
|
|
Annoid = annotation.getInfon("NCBI Gene");
|
|
String Annoidlist[]=Annoid.split(";");
|
|
Annoid="";
|
|
for(int x=0;x<Annoidlist.length;x++)
|
|
{
|
|
|
|
String proteinid="";
|
|
String homoid="";
|
|
|
|
if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
|
|
{
|
|
proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
|
|
}
|
|
if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
|
|
{
|
|
homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
|
|
}
|
|
|
|
if((!proteinid.equals("")) || (!homoid.equals("")))
|
|
{
|
|
if(Annoid.equals(""))
|
|
{
|
|
Annoid=Annoidlist[x]+"(";
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+"UniProt:"+proteinid;
|
|
}
|
|
if(!homoid.equals(""))
|
|
{
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+";";
|
|
}
|
|
Annoid=Annoid+"Homoid:"+homoid;
|
|
}
|
|
Annoid=Annoid+")";
|
|
}
|
|
else
|
|
{
|
|
Annoid=Annoid+";"+Annoidlist[x]+"(";
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+"UniProt:"+proteinid;
|
|
}
|
|
if(!homoid.equals(""))
|
|
{
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+";";
|
|
}
|
|
Annoid=Annoid+"Homoid:"+homoid;
|
|
}
|
|
Annoid=Annoid+")";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(Annoid.equals(""))
|
|
{
|
|
Annoid=Annoidlist[x];
|
|
}
|
|
else
|
|
{
|
|
Annoid=Annoid+";"+Annoidlist[x];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
}
|
|
else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
|
|
{
|
|
if(annotation.getInfons().containsKey("NCBI Taxonomy"))
|
|
{
|
|
Annoid = annotation.getInfon("NCBI Taxonomy");
|
|
}
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
}
|
|
else if(Annotype.equals("CellLine"))
|
|
{
|
|
if(annotation.getInfons().containsKey("NCBI Taxonomy"))
|
|
{
|
|
Annoid = annotation.getInfon("NCBI Taxonomy");
|
|
}
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
int start = annotation.getLocations().get(0).getOffset();
|
|
int last = start + annotation.getLocations().get(0).getLength();
|
|
String AnnoMention=annotation.getText();
|
|
if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
|
|
{
|
|
Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
|
|
}
|
|
else
|
|
{
|
|
Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
|
|
}
|
|
}
|
|
}
|
|
PubTatorOutputFormat.write(Anno+"\n");
|
|
}
|
|
PubTatorOutputFormat.close();
|
|
if(duplicate == true){System.exit(0);}
|
|
}
|
|
public void BioC2PubTator(String original_input,String input,String output) throws IOException, XMLStreamException
|
|
{
|
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(original_input), "UTF-8"));
|
|
HashMap<String,String> ParagraphContent = new HashMap<String,String>();
|
|
HashMap<String,String> annotations = new HashMap<String,String>();
|
|
String line;
|
|
String Pmid="";
|
|
int count_paragraph=0;
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
if(line.contains("|") && !line.contains("\t"))
|
|
{
|
|
String str[]=line.split("\\|",-1);
|
|
Pmid=str[0];
|
|
ParagraphContent.put(Pmid+"\t"+str[1],str[2]);
|
|
count_paragraph++;
|
|
}
|
|
else if (line.contains("\t"))
|
|
{
|
|
annotations.put(Pmid, annotations.get(Pmid)+line);
|
|
}
|
|
else if(line.length()==0)
|
|
{
|
|
count_paragraph=0;
|
|
}
|
|
}
|
|
inputfile.close();
|
|
|
|
|
|
|
|
|
|
HashMap<String, String> pmidlist = new HashMap<String, String>();
|
|
boolean duplicate = false;
|
|
BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
|
|
ConnectorWoodstox connector = new ConnectorWoodstox();
|
|
BioCCollection collection = new BioCCollection();
|
|
collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
|
while (connector.hasNext())
|
|
{
|
|
BioCDocument document = connector.next();
|
|
String PMID = document.getID();
|
|
if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
|
|
else{pmidlist.put(PMID,"");}
|
|
String Anno="";
|
|
for (BioCPassage passage : document.getPassages())
|
|
{
|
|
if(passage.getInfon("type").equals("title") || passage.getInfon("type").equals("t"))
|
|
{
|
|
PubTatorOutputFormat.write(PMID+"|t|"+ParagraphContent.get(PMID+"\tt")+"\n");
|
|
}
|
|
else if(passage.getInfon("type").equals("abstract") || passage.getInfon("type").equals("a"))
|
|
{
|
|
PubTatorOutputFormat.write(PMID+"|a|"+ParagraphContent.get(PMID+"\ta")+"\n");
|
|
}
|
|
else
|
|
{
|
|
PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
|
|
}
|
|
|
|
for (BioCAnnotation annotation : passage.getAnnotations())
|
|
{
|
|
String Annotype = annotation.getInfon("type");
|
|
String Annoid="";
|
|
String Proteinid="";
|
|
if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
|
|
{
|
|
if(annotation.getInfons().containsKey("NCBI Gene"))
|
|
{
|
|
Annoid = annotation.getInfon("NCBI Gene");
|
|
String Annoidlist[]=Annoid.split(";");
|
|
Annoid="";
|
|
for(int x=0;x<Annoidlist.length;x++)
|
|
{
|
|
|
|
String proteinid="";
|
|
String homoid="";
|
|
|
|
if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
|
|
{
|
|
proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
|
|
}
|
|
if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
|
|
{
|
|
homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
|
|
}
|
|
|
|
if((!proteinid.equals("")) || (!homoid.equals("")))
|
|
{
|
|
if(Annoid.equals(""))
|
|
{
|
|
Annoid=Annoidlist[x]+"(";
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+"UniProt:"+proteinid;
|
|
}
|
|
if(!homoid.equals(""))
|
|
{
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+";";
|
|
}
|
|
Annoid=Annoid+"Homoid:"+homoid;
|
|
}
|
|
Annoid=Annoid+")";
|
|
}
|
|
else
|
|
{
|
|
Annoid=Annoid+";"+Annoidlist[x]+"(";
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+"UniProt:"+proteinid;
|
|
}
|
|
if(!homoid.equals(""))
|
|
{
|
|
if(!proteinid.equals(""))
|
|
{
|
|
Annoid=Annoid+";";
|
|
}
|
|
Annoid=Annoid+"Homoid:"+homoid;
|
|
}
|
|
Annoid=Annoid+")";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(Annoid.equals(""))
|
|
{
|
|
Annoid=Annoidlist[x];
|
|
}
|
|
else
|
|
{
|
|
Annoid=Annoid+";"+Annoidlist[x];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
}
|
|
else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
|
|
{
|
|
if(annotation.getInfons().containsKey("NCBI Taxonomy"))
|
|
{
|
|
Annoid = annotation.getInfon("NCBI Taxonomy");
|
|
}
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
}
|
|
else if(Annotype.equals("CellLine"))
|
|
{
|
|
if(annotation.getInfons().containsKey("NCBI Taxonomy"))
|
|
{
|
|
Annoid = annotation.getInfon("NCBI Taxonomy");
|
|
}
|
|
else
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(annotation.getInfons().containsKey("Identifier"))
|
|
{
|
|
Annoid = annotation.getInfon("Identifier");
|
|
}
|
|
else
|
|
{
|
|
Annoid = "";
|
|
}
|
|
}
|
|
int start = annotation.getLocations().get(0).getOffset();
|
|
int last = start + annotation.getLocations().get(0).getLength();
|
|
String AnnoMention=annotation.getText();
|
|
if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
|
|
{
|
|
Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
|
|
}
|
|
else
|
|
{
|
|
Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
|
|
}
|
|
}
|
|
}
|
|
PubTatorOutputFormat.write(Anno+"\n");
|
|
}
|
|
PubTatorOutputFormat.close();
|
|
if(duplicate == true){System.exit(0);}
|
|
}
|
|
public void BioCReader(String input) throws IOException, XMLStreamException
|
|
{
|
|
ConnectorWoodstox connector = new ConnectorWoodstox();
|
|
BioCCollection collection = new BioCCollection();
|
|
collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
|
|
|
|
|
|
|
|
|
while (connector.hasNext())
|
|
{
|
|
BioCDocument document = connector.next();
|
|
PMIDs.add(document.getID());
|
|
|
|
ArrayList<String> PassageName= new ArrayList<String>();
|
|
ArrayList<Integer> PassageOffset= new ArrayList<Integer>();
|
|
ArrayList<String> PassageContext= new ArrayList<String>();
|
|
ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList();
|
|
|
|
|
|
|
|
|
|
for (BioCPassage passage : document.getPassages())
|
|
{
|
|
PassageName.add(passage.getInfon("type"));
|
|
String txt = passage.getText();
|
|
if(txt.matches("[\t ]+"))
|
|
{
|
|
txt = txt.replaceAll(".","@");
|
|
}
|
|
else
|
|
{
|
|
|
|
|
|
|
|
|
|
txt = txt.replaceAll("ω","w");
|
|
txt = txt.replaceAll("μ","u");
|
|
txt = txt.replaceAll("κ","k");
|
|
txt = txt.replaceAll("α","a");
|
|
txt = txt.replaceAll("γ","g");
|
|
txt = txt.replaceAll("ɣ","g");
|
|
txt = txt.replaceAll("β","b");
|
|
txt = txt.replaceAll("×","x");
|
|
txt = txt.replaceAll("‑","-");
|
|
txt = txt.replaceAll("¹","1");
|
|
txt = txt.replaceAll("²","2");
|
|
txt = txt.replaceAll("°","o");
|
|
txt = txt.replaceAll("ö","o");
|
|
txt = txt.replaceAll("é","e");
|
|
txt = txt.replaceAll("à","a");
|
|
txt = txt.replaceAll("Á","A");
|
|
txt = txt.replaceAll("ε","e");
|
|
txt = txt.replaceAll("θ","O");
|
|
txt = txt.replaceAll("•",".");
|
|
txt = txt.replaceAll("µ","u");
|
|
txt = txt.replaceAll("λ","r");
|
|
txt = txt.replaceAll("⁺","+");
|
|
txt = txt.replaceAll("ν","v");
|
|
txt = txt.replaceAll("ï","i");
|
|
txt = txt.replaceAll("ã","a");
|
|
txt = txt.replaceAll("≡","=");
|
|
txt = txt.replaceAll("ó","o");
|
|
txt = txt.replaceAll("³","3");
|
|
txt = txt.replaceAll("〖","[");
|
|
txt = txt.replaceAll("〗","]");
|
|
txt = txt.replaceAll("Å","A");
|
|
txt = txt.replaceAll("ρ","p");
|
|
txt = txt.replaceAll("ü","u");
|
|
txt = txt.replaceAll("ɛ","e");
|
|
txt = txt.replaceAll("č","c");
|
|
txt = txt.replaceAll("š","s");
|
|
txt = txt.replaceAll("ß","b");
|
|
txt = txt.replaceAll("═","=");
|
|
txt = txt.replaceAll("£","L");
|
|
txt = txt.replaceAll("Ł","L");
|
|
txt = txt.replaceAll("ƒ","f");
|
|
txt = txt.replaceAll("ä","a");
|
|
txt = txt.replaceAll("–","-");
|
|
txt = txt.replaceAll("⁻","-");
|
|
txt = txt.replaceAll("〈","<");
|
|
txt = txt.replaceAll("〉",">");
|
|
txt = txt.replaceAll("χ","X");
|
|
txt = txt.replaceAll("Đ","D");
|
|
txt = txt.replaceAll("‰","%");
|
|
txt = txt.replaceAll("·",".");
|
|
txt = txt.replaceAll("→",">");
|
|
txt = txt.replaceAll("←","<");
|
|
txt = txt.replaceAll("ζ","z");
|
|
txt = txt.replaceAll("π","p");
|
|
txt = txt.replaceAll("τ","t");
|
|
txt = txt.replaceAll("ξ","X");
|
|
txt = txt.replaceAll("η","h");
|
|
txt = txt.replaceAll("ø","0");
|
|
txt = txt.replaceAll("Δ","D");
|
|
txt = txt.replaceAll("∆","D");
|
|
txt = txt.replaceAll("∑","S");
|
|
txt = txt.replaceAll("Ω","O");
|
|
txt = txt.replaceAll("δ","d");
|
|
txt = txt.replaceAll("σ","s");
|
|
txt = txt.replaceAll("Φ","F");
|
|
|
|
}
|
|
if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
|
|
{
|
|
PassageContext.add("-notext-");
|
|
}
|
|
else
|
|
{
|
|
PassageContext.add(txt);
|
|
}
|
|
PassageOffset.add(passage.getOffset());
|
|
ArrayList<String> AnnotationInPassage= new ArrayList<String>();
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
}
|
|
PassageNames.add(PassageName);
|
|
PassageContexts.add(PassageContext);
|
|
PassageOffsets.add(PassageOffset);
|
|
Annotations.add(AnnotationInPMID);
|
|
}
|
|
}
|
|
public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException
|
|
{
|
|
ConnectorWoodstox connector = new ConnectorWoodstox();
|
|
BioCCollection collection = new BioCCollection();
|
|
collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
|
|
|
|
|
|
|
|
|
while (connector.hasNext())
|
|
{
|
|
BioCDocument document = connector.next();
|
|
PMIDs.add(document.getID());
|
|
|
|
ArrayList<String> PassageName= new ArrayList<String>();
|
|
ArrayList<Integer> PassageOffset= new ArrayList<Integer>();
|
|
ArrayList<String> PassageContext= new ArrayList<String>();
|
|
ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList();
|
|
|
|
|
|
|
|
|
|
for (BioCPassage passage : document.getPassages())
|
|
{
|
|
PassageName.add(passage.getInfon("type"));
|
|
|
|
String txt = passage.getText();
|
|
if(txt.matches("[\t ]+"))
|
|
{
|
|
txt = txt.replaceAll(".","@");
|
|
}
|
|
else
|
|
{
|
|
|
|
|
|
|
|
|
|
txt = txt.replaceAll("ω","w");
|
|
txt = txt.replaceAll("μ","u");
|
|
txt = txt.replaceAll("κ","k");
|
|
txt = txt.replaceAll("α","a");
|
|
txt = txt.replaceAll("γ","g");
|
|
txt = txt.replaceAll("ɣ","g");
|
|
txt = txt.replaceAll("β","b");
|
|
txt = txt.replaceAll("×","x");
|
|
txt = txt.replaceAll("‑","-");
|
|
txt = txt.replaceAll("¹","1");
|
|
txt = txt.replaceAll("²","2");
|
|
txt = txt.replaceAll("°","o");
|
|
txt = txt.replaceAll("ö","o");
|
|
txt = txt.replaceAll("é","e");
|
|
txt = txt.replaceAll("à","a");
|
|
txt = txt.replaceAll("Á","A");
|
|
txt = txt.replaceAll("ε","e");
|
|
txt = txt.replaceAll("θ","O");
|
|
txt = txt.replaceAll("•",".");
|
|
txt = txt.replaceAll("µ","u");
|
|
txt = txt.replaceAll("λ","r");
|
|
txt = txt.replaceAll("⁺","+");
|
|
txt = txt.replaceAll("ν","v");
|
|
txt = txt.replaceAll("ï","i");
|
|
txt = txt.replaceAll("ã","a");
|
|
txt = txt.replaceAll("≡","=");
|
|
txt = txt.replaceAll("ó","o");
|
|
txt = txt.replaceAll("³","3");
|
|
txt = txt.replaceAll("〖","[");
|
|
txt = txt.replaceAll("〗","]");
|
|
txt = txt.replaceAll("Å","A");
|
|
txt = txt.replaceAll("ρ","p");
|
|
txt = txt.replaceAll("ü","u");
|
|
txt = txt.replaceAll("ɛ","e");
|
|
txt = txt.replaceAll("č","c");
|
|
txt = txt.replaceAll("š","s");
|
|
txt = txt.replaceAll("ß","b");
|
|
txt = txt.replaceAll("═","=");
|
|
txt = txt.replaceAll("£","L");
|
|
txt = txt.replaceAll("Ł","L");
|
|
txt = txt.replaceAll("ƒ","f");
|
|
txt = txt.replaceAll("ä","a");
|
|
txt = txt.replaceAll("–","-");
|
|
txt = txt.replaceAll("⁻","-");
|
|
txt = txt.replaceAll("〈","<");
|
|
txt = txt.replaceAll("〉",">");
|
|
txt = txt.replaceAll("χ","X");
|
|
txt = txt.replaceAll("Đ","D");
|
|
txt = txt.replaceAll("‰","%");
|
|
txt = txt.replaceAll("·",".");
|
|
txt = txt.replaceAll("→",">");
|
|
txt = txt.replaceAll("←","<");
|
|
txt = txt.replaceAll("ζ","z");
|
|
txt = txt.replaceAll("π","p");
|
|
txt = txt.replaceAll("τ","t");
|
|
txt = txt.replaceAll("ξ","X");
|
|
txt = txt.replaceAll("η","h");
|
|
txt = txt.replaceAll("ø","0");
|
|
txt = txt.replaceAll("Δ","D");
|
|
txt = txt.replaceAll("∆","D");
|
|
txt = txt.replaceAll("∑","S");
|
|
txt = txt.replaceAll("Ω","O");
|
|
txt = txt.replaceAll("δ","d");
|
|
txt = txt.replaceAll("σ","s");
|
|
txt = txt.replaceAll("Φ","F");
|
|
|
|
}
|
|
if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
|
|
{
|
|
PassageContext.add("-notext-");
|
|
}
|
|
else
|
|
{
|
|
PassageContext.add(txt);
|
|
}
|
|
PassageOffset.add(passage.getOffset());
|
|
ArrayList<String> AnnotationInPassage= new ArrayList<String>();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (BioCAnnotation Anno : passage.getAnnotations())
|
|
{
|
|
int start = Anno.getLocations().get(0).getOffset()-passage.getOffset();
|
|
int last = start + Anno.getLocations().get(0).getLength();
|
|
String AnnoMention=Anno.getText();
|
|
String Annotype = Anno.getInfon("type");
|
|
String Annoid = Anno.getInfon("Identifier");
|
|
if(Annoid == null)
|
|
{
|
|
Annoid = Anno.getInfon("Identifier");
|
|
}
|
|
if(Annoid == null || Annoid.equals("null"))
|
|
{
|
|
AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype);
|
|
}
|
|
else
|
|
{
|
|
AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid);
|
|
}
|
|
}
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
}
|
|
PassageNames.add(PassageName);
|
|
PassageContexts.add(PassageContext);
|
|
PassageOffsets.add(PassageOffset);
|
|
Annotations.add(AnnotationInPMID);
|
|
}
|
|
}
|
|
public void BioCOutput(String input,String output, ArrayList<ArrayList<ArrayList<String>>> Annotations,boolean Final,boolean RemovePreviousAnno) throws IOException, XMLStreamException
|
|
{
|
|
boolean ShowUnNormalizedMention = false;
|
|
if(GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True"))
|
|
{
|
|
ShowUnNormalizedMention = true;
|
|
}
|
|
|
|
BioCDocumentWriter BioCOutputFormat = BioCFactory.newFactory(BioCFactory.WOODSTOX).createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
|
|
BioCCollection biocCollection_input = new BioCCollection();
|
|
BioCCollection biocCollection_output = new BioCCollection();
|
|
|
|
|
|
ConnectorWoodstox connector = new ConnectorWoodstox();
|
|
biocCollection_input = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
|
BioCOutputFormat.writeCollectionInfo(biocCollection_input);
|
|
int i=0;
|
|
while (connector.hasNext())
|
|
{
|
|
BioCDocument document_output = new BioCDocument();
|
|
BioCDocument document_input = connector.next();
|
|
String PMID=document_input.getID();
|
|
document_output.setID(PMID);
|
|
int annotation_count=0;
|
|
int j=0;
|
|
for (BioCPassage passage_input : document_input.getPassages())
|
|
{
|
|
BioCPassage passage_output = passage_input;
|
|
|
|
if(RemovePreviousAnno == true)
|
|
{
|
|
passage_output.clearAnnotations();
|
|
}
|
|
else
|
|
{
|
|
for (BioCAnnotation annotation : passage_output.getAnnotations())
|
|
{
|
|
annotation.setID(""+annotation_count);
|
|
annotation_count++;
|
|
}
|
|
}
|
|
|
|
int passage_Offset = passage_input.getOffset();
|
|
String passage_Text = passage_input.getText();
|
|
ArrayList<String> AnnotationInPassage = new ArrayList<String>();
|
|
|
|
if(Annotations.size()>i && Annotations.get(i).size()>j)
|
|
{
|
|
for(int a=0;a<Annotations.get(i).get(j).size();a++)
|
|
{
|
|
String Anno[]=Annotations.get(i).get(j).get(a).split("\\t");
|
|
int start = Integer.parseInt(Anno[0]);
|
|
int last = Integer.parseInt(Anno[1]);
|
|
boolean found = false;
|
|
if(passage_Text.length()>last)
|
|
{
|
|
String mention = Anno[2];
|
|
if(Final == true && passage_Text.length()>=last)
|
|
{
|
|
mention = passage_Text.substring(start, last);
|
|
}
|
|
if(mention.matches(".*\t.*"))
|
|
{
|
|
Anno[3]=Anno[4];
|
|
if(Anno.length>=6)
|
|
{
|
|
Anno[4]=Anno[5];
|
|
}
|
|
}
|
|
String type = Anno[3];
|
|
String id = "";
|
|
if(Anno.length>=5){id = Anno[4];}
|
|
if(Final == true)
|
|
{
|
|
for(int b=0;b<AnnotationInPassage.size();b++)
|
|
{
|
|
String Annob[]=AnnotationInPassage.get(b).split("\\t");
|
|
int startb = Integer.parseInt(Annob[0]);
|
|
int lastb = Integer.parseInt(Annob[1]);
|
|
String mentionb = Annob[2];
|
|
if(Final == true && passage_Text.length()>=lastb)
|
|
{
|
|
mentionb = passage_Text.substring(startb, lastb);
|
|
}
|
|
if(mentionb.matches(".*\t.*"))
|
|
{
|
|
Annob[3]=Annob[4];
|
|
if(Annob.length>=6)
|
|
{
|
|
Annob[4]=Annob[5];
|
|
}
|
|
}
|
|
String typeb = Annob[3];
|
|
String idb = "";
|
|
if(Annob.length>=5){idb = Annob[4];}
|
|
|
|
if(start == startb && last == lastb && type.equals(typeb))
|
|
{
|
|
found = true;
|
|
if(id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals("")))
|
|
{
|
|
}
|
|
else if(idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals("")))
|
|
{
|
|
AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id);
|
|
}
|
|
else
|
|
{
|
|
if(id.equals(""))
|
|
{
|
|
}
|
|
else
|
|
{
|
|
AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+idb+";"+id);
|
|
}
|
|
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(found == false)
|
|
{
|
|
AnnotationInPassage.add(Annotations.get(i).get(j).get(a));
|
|
}
|
|
}
|
|
}
|
|
for(int a=0;a<AnnotationInPassage.size();a++)
|
|
{
|
|
String Anno[]=AnnotationInPassage.get(a).split("\\t");
|
|
HashMap <String,String> id_hash = new HashMap <String,String>();
|
|
if(Anno.length>=5)
|
|
{
|
|
int start = Integer.parseInt(Anno[0]);
|
|
int last = Integer.parseInt(Anno[1]);
|
|
String mention = Anno[2];
|
|
if(Final == true && passage_Text.length()>=last)
|
|
{
|
|
mention = passage_Text.substring(start, last);
|
|
}
|
|
if(mention.matches(".*\t.*"))
|
|
{
|
|
Anno[3]=Anno[4];
|
|
if(Anno.length>=6)
|
|
{
|
|
Anno[4]=Anno[5];
|
|
}
|
|
}
|
|
String ids = Anno[4];
|
|
String idlist[]=ids.split(",");
|
|
for(int b=0;b<idlist.length;b++)
|
|
{
|
|
id_hash.put(idlist[b], "");
|
|
}
|
|
ids = "";
|
|
for(String id :id_hash.keySet())
|
|
{
|
|
if(ids.equals(""))
|
|
{
|
|
ids = id;
|
|
}
|
|
else
|
|
{
|
|
ids = ids + ";" + id;
|
|
}
|
|
}
|
|
AnnotationInPassage.set(a, Anno[0]+"\t"+Anno[1]+"\t"+Anno[2]+"\t"+Anno[3]+"\t"+ids);
|
|
}
|
|
}
|
|
|
|
for(int a=0;a<AnnotationInPassage.size();a++)
|
|
{
|
|
String Anno[]=AnnotationInPassage.get(a).split("\\t");
|
|
int start = Integer.parseInt(Anno[0]);
|
|
int last = Integer.parseInt(Anno[1]);
|
|
if(passage_Text.length()>last)
|
|
{
|
|
String mention = Anno[2];
|
|
if(Final == true && passage_Text.length()>=last)
|
|
{
|
|
mention = passage_Text.substring(start, last);
|
|
}
|
|
if(mention.matches(".*\t.*"))
|
|
{
|
|
Anno[3]=Anno[4];
|
|
if(Anno.length>=6)
|
|
{
|
|
Anno[4]=Anno[5];
|
|
}
|
|
}
|
|
String type = Anno[3];
|
|
if(type.equals("GeneID")){type="Gene";}
|
|
BioCAnnotation biocAnnotation = new BioCAnnotation();
|
|
Map<String, String> AnnoInfons = new HashMap<String, String>();
|
|
AnnoInfons.put("type", type);
|
|
if(Anno.length>=5)
|
|
{
|
|
String identifier = Anno[4];
|
|
if(Final == true && ShowUnNormalizedMention==false)
|
|
{
|
|
if(type.matches("(FamilyName|Domain|Gene)"))
|
|
{
|
|
Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
|
|
Matcher mtmp0 = ptmp0.matcher(identifier);
|
|
Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
|
|
Matcher mtmp1 = ptmp1.matcher(identifier);
|
|
Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$");
|
|
Matcher mtmp2 = ptmp2.matcher(identifier);
|
|
Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$");
|
|
Matcher mtmp3 = ptmp3.matcher(identifier);
|
|
if(mtmp0.find())
|
|
{
|
|
String Method_SA = mtmp0.group(1);
|
|
String TaxonomyID = mtmp0.group(2);
|
|
String NCBIGeneID = mtmp0.group(3);
|
|
if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
|
|
{
|
|
AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
|
|
}
|
|
if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
|
|
{
|
|
AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
|
|
}
|
|
AnnoInfons.put("NCBI Gene", NCBIGeneID);
|
|
}
|
|
else if(mtmp1.find())
|
|
{
|
|
String Method_SA = mtmp1.group(1);
|
|
String TaxonomyID = mtmp1.group(2);
|
|
String NCBIGeneID = mtmp1.group(3);
|
|
String HomoID = mtmp1.group(4);
|
|
if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
|
|
{
|
|
AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
|
|
}
|
|
if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
|
|
{
|
|
AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
|
|
}
|
|
AnnoInfons.put("NCBI Gene", NCBIGeneID);
|
|
}
|
|
else if(mtmp2.find())
|
|
{
|
|
String Method_SA = mtmp2.group(1);
|
|
String TaxonomyID = mtmp2.group(2);
|
|
AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:"+TaxonomyID);
|
|
}
|
|
else if(mtmp3.find())
|
|
{
|
|
String Method_SA = mtmp3.group(1);
|
|
String HomoID = mtmp3.group(2);
|
|
AnnoInfons.put("NCBI Homologene", HomoID);
|
|
}
|
|
else
|
|
{
|
|
String identifiers[] = identifier.split(";");
|
|
if(identifiers.length>1)
|
|
{
|
|
ArrayList<String> identifierSTR = new ArrayList<String>();
|
|
ArrayList<String> ProteinidSTR = new ArrayList<String>();
|
|
ArrayList<String> HomoidSTR = new ArrayList<String>();
|
|
for(int idi=0;idi<identifiers.length;idi++)
|
|
{
|
|
Pattern ptmp4 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
|
|
Matcher mtmp4 = ptmp4.matcher(identifiers[idi]);
|
|
Pattern ptmp5 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
|
|
Matcher mtmp5 = ptmp5.matcher(identifiers[idi]);
|
|
if(mtmp4.find())
|
|
{
|
|
String Method_SA = mtmp4.group(1);
|
|
String TaxonomyID = mtmp4.group(2);
|
|
String NCBIGeneID = mtmp4.group(3);
|
|
String HomoID = mtmp4.group(4);
|
|
if(!identifierSTR.contains(NCBIGeneID))
|
|
{
|
|
identifierSTR.add(NCBIGeneID);
|
|
}
|
|
if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
|
|
{
|
|
if(!ProteinidSTR.contains(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)))
|
|
{
|
|
ProteinidSTR.add(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
|
|
}
|
|
}
|
|
if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
|
|
{
|
|
if(!HomoidSTR.contains(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)))
|
|
{
|
|
HomoidSTR.add(GNormPlus.HomologeneID_hash.get(NCBIGeneID));
|
|
}
|
|
}
|
|
|
|
}
|
|
else if(mtmp5.find())
|
|
{
|
|
String Method_SA = mtmp5.group(1);
|
|
String TaxonomyID = mtmp5.group(2);
|
|
String NCBIGeneID = mtmp5.group(3);
|
|
if(!identifierSTR.contains(NCBIGeneID))
|
|
{
|
|
identifierSTR.add(NCBIGeneID);
|
|
}
|
|
}
|
|
}
|
|
String idSTR="";
|
|
for(int x=0;x<identifierSTR.size();x++)
|
|
{
|
|
if(idSTR.equals(""))
|
|
{
|
|
idSTR = identifierSTR.get(x);
|
|
}
|
|
else
|
|
{
|
|
idSTR = idSTR+";"+identifierSTR.get(x);
|
|
}
|
|
}
|
|
AnnoInfons.put("NCBI Gene", idSTR);
|
|
|
|
String pidSTR="";
|
|
for(int x=0;x<ProteinidSTR.size();x++)
|
|
{
|
|
if(pidSTR.equals(""))
|
|
{
|
|
pidSTR = ProteinidSTR.get(x);
|
|
}
|
|
else
|
|
{
|
|
pidSTR = pidSTR+";"+ProteinidSTR.get(x);
|
|
}
|
|
}
|
|
if(!pidSTR.equals(""))
|
|
{
|
|
AnnoInfons.put("UniProt", pidSTR);
|
|
}
|
|
|
|
String hidSTR="";
|
|
for(int x=0;x<HomoidSTR.size();x++)
|
|
{
|
|
if(hidSTR.equals(""))
|
|
{
|
|
hidSTR = HomoidSTR.get(x);
|
|
}
|
|
else
|
|
{
|
|
hidSTR = hidSTR+";"+HomoidSTR.get(x);
|
|
}
|
|
}
|
|
if(!hidSTR.equals(""))
|
|
{
|
|
AnnoInfons.put("NCBI Homologene", hidSTR);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
}
|
|
else if (type.matches("(Species|Genus|Strain)"))
|
|
{
|
|
AnnoInfons.put("type", type);
|
|
AnnoInfons.put("NCBI Taxonomy", identifier);
|
|
}
|
|
else if (type.matches("Cell"))
|
|
{
|
|
AnnoInfons.put("type", "CellLine");
|
|
AnnoInfons.put("NCBI Taxonomy", identifier);
|
|
}
|
|
else
|
|
{
|
|
AnnoInfons.put("Identifier", identifier);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
AnnoInfons.put("Identifier", identifier);
|
|
}
|
|
}
|
|
biocAnnotation.setInfons(AnnoInfons);
|
|
BioCLocation location = new BioCLocation();
|
|
location.setOffset(start+passage_Offset);
|
|
location.setLength(last-start);
|
|
biocAnnotation.setLocation(location);
|
|
biocAnnotation.setText(mention);
|
|
biocAnnotation.setID(""+annotation_count);
|
|
annotation_count++;
|
|
if(Final == true)
|
|
{
|
|
if(AnnoInfons.containsKey("Identifier") || AnnoInfons.containsKey("NCBI Homologene") || AnnoInfons.containsKey("NCBI Gene") || AnnoInfons.containsKey("NCBI Taxonomy"))
|
|
{
|
|
passage_output.addAnnotation(biocAnnotation);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
passage_output.addAnnotation(biocAnnotation);
|
|
}
|
|
}
|
|
}
|
|
document_output.addPassage(passage_output);
|
|
j++;
|
|
}
|
|
biocCollection_output.addDocument(document_output);
|
|
BioCOutputFormat.writeDocument(document_output);
|
|
i++;
|
|
}
|
|
BioCOutputFormat.close();
|
|
}
|
|
} |