package GNormPluslib;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.stream.XMLStreamException;

import GNormPluslib.PrefixTree;
import GNormPluslib.GNR;
import GNormPluslib.SR;
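/**
 * Entry point of GNormPlus. Loads the dictionaries enabled in the setup
 * file, then runs the pipeline over every file in the input folder:
 * species recognition (SR), gene mention recognition (GNR), species
 * assignment (SA), concept simplification (SimConcept) and gene
 * normalization (GN), reading BioC XML or PubTator input and writing the
 * result in the same format. (Summary inferred from main() below.)
 */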
|
|
|
public class GNormPlus
{
	public static BioCDoc BioCDocobj = new BioCDoc();

	// Prefix trees for dictionary-based mention lookup
	public static PrefixTree PT_Species = new PrefixTree();
	public static PrefixTree PT_Cell = new PrefixTree();
	public static PrefixTree PT_CTDGene = new PrefixTree();
	public static PrefixTree PT_Gene = new PrefixTree();
	public static PrefixTree PT_GeneChromosome = new PrefixTree();
	public static PrefixTree PT_FamilyName = new PrefixTree();

	// Lookup tables populated from the dictionary folder in main()
	public static HashMap<String, String> ent_hash = new HashMap<String, String>();
	public static HashMap<String, String> GenusID_hash = new HashMap<String, String>();
	public static HashMap<String, String> PrefixID_hash = new HashMap<String, String>();
	public static HashMap<String, Double> TaxFreq_hash = new HashMap<String, Double>();
	public static HashMap<String, String> GeneScoring_hash = new HashMap<String, String>();
	public static HashMap<String, Double> GeneScoringDF_hash = new HashMap<String, Double>();
	public static HashMap<String, String> GeneIDs_hash = new HashMap<String, String>();
	public static HashMap<String, String> Normalization2Protein_hash = new HashMap<String, String>();
	public static HashMap<String, String> HomologeneID_hash = new HashMap<String, String>();
	public static HashMap<String, String> SuffixTranslationMap_hash = new HashMap<String, String>();
	public static HashMap<String, String> SuffixTranslationMap2_hash = new HashMap<String, String>();

	// Per-document abbreviation tables (keyed by PMID)
	public static HashMap<String, String> Pmid2Abb_hash = new HashMap<String, String>();
	public static HashMap<String, String> PmidAbb2LF_lc_hash = new HashMap<String, String>();
	public static HashMap<String, String> PmidLF2Abb_lc_hash = new HashMap<String, String>();
	public static HashMap<String, String> PmidAbb2LF_hash = new HashMap<String, String>();
	public static HashMap<String, String> PmidLF2Abb_hash = new HashMap<String, String>();
	public static HashMap<String, String> Pmid2ChromosomeGene_hash = new HashMap<String, String>();

	public static HashMap<String, String> SimConceptMention2Type_hash = new HashMap<String, String>();
	public static HashMap<String, String> Filtering_hash = new HashMap<String, String>();
	public static HashMap<String, String> Filtering_WithLongForm_hash = new HashMap<String, String>();
	public static HashMap<String, String> SP_Virus2Human_hash = new HashMap<String, String>();
	public static HashMap<String, String> GeneWithoutSPPrefix_hash = new HashMap<String, String>();
	public static ArrayList<String> taxid4gene = new ArrayList<String>();
	public static HashMap<String, String> setup_hash = new HashMap<String, String>();
	public static HashMap<String, String> suffixprefix_orig2modified = new HashMap<String, String>();
	public static HashMap<String, String> Abb2Longformtok_hash = new HashMap<String, String>();
	public static HashMap<String, String> StrainID_ancestor2tax_hash = new HashMap<String, String>();
	public static HashMap<String, String> StrainID_taxid2names_hash = new HashMap<String, String>();

	public static String SetupFile = "setup.txt";
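	/**
	 * Usage, as printed below when fewer than two arguments are given:
	 *
	 *   $ java -Xmx30G -Xms10G -jar GNormPlus.jar [InputFolder] [OutputFolder] [SetupFile]
	 *
	 * e.g. "java -jar GNormPlus.jar input output setup.txt". An optional
	 * fourth argument names a focus species and overrides the setup file's
	 * FocusSpecies entry.
	 */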
|
	public static void main(String[] args) throws IOException, InterruptedException, XMLStreamException, SQLException
	{
		String InputFolder = "input";
		String OutputFolder = "output";
		String tmpFolder = "tmp";
		String FocusSpecies = "";
		if (args.length < 2)
		{
			// Print usage, then fall through and run with the defaults above
			System.out.println("\n$ java -Xmx30G -Xms10G -jar GNormPlus.jar [InputFolder] [OutputFolder] [SetupFile]");
			System.out.println("[InputFolder] Default : input");
			System.out.println("[OutputFolder] Default : output");
			System.out.println("[SetupFile] Default : setup.txt\n\n");
		}
		else
		{
			InputFolder = args[0];
			OutputFolder = args[1];
			if (args.length >= 3)
			{
				SetupFile = args[2];
			}
			if (args.length >= 4)
			{
				FocusSpecies = args[3];
			}
		}
|
		// Read " key = value" pairs from the setup file (see the sample below)
		BufferedReader br = new BufferedReader(new FileReader(SetupFile));
		String line = "";
		Pattern ptmp = Pattern.compile("^ ([A-Za-z0-9]+) = ([^ \\t\\n\\r]+)$");
		while ((line = br.readLine()) != null)
		{
			Matcher mtmp = ptmp.matcher(line);
			if (mtmp.find())
			{
				setup_hash.put(mtmp.group(1), mtmp.group(2));
			}
		}
		br.close();
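		/*
		 * Illustrative setup-file fragment (values are examples only; the
		 * pattern above requires a single leading space and " = " as the
		 * separator, and the keys are the ones read in this class):
		 *
		 *  DictionaryFolder = Dictionary
		 *  GNRModel = Model/GNR.Model
		 *  SCModel = Model/SimConcept.Model
		 *  GeneRecognition = True
		 *  SpeciesRecognition = True
		 *  SpeciesAssignment = True
		 *  GeneNormalization = True
		 *  DeleteTmp = True
		 */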
|
		// Defaults for options the setup file may omit
		if (!setup_hash.containsKey("GeneIDMatch"))
		{
			setup_hash.put("GeneIDMatch", "True");
		}
		if (!setup_hash.containsKey("HomologeneID"))
		{
			setup_hash.put("HomologeneID", "False");
		}
		if (!FocusSpecies.equals(""))
		{
			setup_hash.put("FocusSpecies", FocusSpecies);
		}
		if (!setup_hash.containsKey("ShowUnNormalizedMention"))
		{
			setup_hash.put("ShowUnNormalizedMention", "False");
		}
		if (setup_hash.containsKey("tmpFolder"))
		{
			tmpFolder = setup_hash.get("tmpFolder");
		}
|
		double startTime, endTime, totTime;
		startTime = System.currentTimeMillis();

		// Count input files that do not already have an output file
		int NumFiles = 0;
		File folder = new File(InputFolder);
		File[] listOfFiles = folder.listFiles();
		for (int i = 0; i < listOfFiles.length; i++)
		{
			if (listOfFiles[i].isFile())
			{
				String InputFile = listOfFiles[i].getName();
				File f = new File(OutputFolder + "/" + InputFile);
				if (!(f.exists() && !f.isDirectory()))
				{
					NumFiles++;
				}
			}
		}

		System.out.println("Total " + NumFiles + " file(s) waiting to be processed.");
|
		if (NumFiles > 0)
		{
			// Mode flag passed to the CRF-based steps (default "Test")
			String TrainTest = "Test";
			if (setup_hash.containsKey("TrainTest"))
			{
				TrainTest = setup_hash.get("TrainTest");
			}
|
			if (setup_hash.containsKey("GeneRecognition") && setup_hash.get("GeneRecognition").toLowerCase().equals("true"))
			{
				System.out.print("Loading Gene NER Dictionary : Processing ... \r");

				if (setup_hash.containsKey("IgnoreNER") && setup_hash.get("IgnoreNER").toLowerCase().equals("true"))
				{
					// NER disabled: skip the CTD gene dictionary
				}
				else if (setup_hash.containsKey("SpeciesAssignmentOnly") && setup_hash.get("SpeciesAssignmentOnly").toLowerCase().equals("true"))
				{
					// species assignment only: skip the CTD gene dictionary
				}
				else
				{
					PT_CTDGene.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_CTDGene.txt");
				}

				// ent.rev.txt: two tab-separated columns loaded into ent_hash
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/ent.rev.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					ent_hash.put(l[0], l[1]);
				}
				br.close();

				// Load the gene-family dictionary unless IgnoreNER is true.
				// (The original compared strings with !=, which tests reference
				// identity in Java and therefore never skipped this load.)
				if (!setup_hash.containsKey("IgnoreNER") || !setup_hash.get("IgnoreNER").toLowerCase().equals("true"))
				{
					PT_FamilyName.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_FamilyName.txt");
				}

				System.out.println("Loading Gene NER Dictionary : Processing ... done.");
			}
|
			if (setup_hash.containsKey("SpeciesRecognition") && setup_hash.get("SpeciesRecognition").toLowerCase().equals("true"))
			{
				System.out.print("Loading Species NER Dictionary : Processing ... \r");

				PT_Species.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_Species.txt");
				PT_Cell.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_Cell.txt");

				// SPGenus.txt: two tab-separated columns loaded into GenusID_hash
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/SPGenus.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					GenusID_hash.put(l[0], l[1]);
				}
				br.close();

				// tax4gene.txt: one taxonomy id per line
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/tax4gene.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					taxid4gene.add(line);
				}
				br.close();

				System.out.println("Loading Species NER Dictionary : Processing ... done.");
			}
|
			if (setup_hash.containsKey("SpeciesAssignment") && setup_hash.get("SpeciesAssignment").toLowerCase().equals("true"))
			{
				System.out.print("Loading Species Assignment Dictionary : Processing ... \r");

				// GeneWithoutSPPrefix.txt: one gene name per line
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/GeneWithoutSPPrefix.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					GeneWithoutSPPrefix_hash.put(line, "");
				}
				br.close();

				// SPPrefix.txt: two tab-separated columns loaded into PrefixID_hash
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/SPPrefix.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					PrefixID_hash.put(l[0], l[1]);
				}
				br.close();

				// Hard-coded species prefixes for common model organisms
				PrefixID_hash.put("9606", "h");             // human
				PrefixID_hash.put("10090", "m");            // mouse
				PrefixID_hash.put("10116", "r");            // rat
				PrefixID_hash.put("4932", "y");             // yeast
				PrefixID_hash.put("7227", "d");             // fruit fly
				PrefixID_hash.put("7955", "z|dr|Dr|Zf|zf"); // zebrafish
				PrefixID_hash.put("3702", "at|At");         // Arabidopsis thaliana

				// taxonomy_freq.txt: taxonomy id and raw count, normalized by a fixed constant
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/taxonomy_freq.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					TaxFreq_hash.put(l[0], Double.parseDouble(l[1]) / 200000000);
				}
				br.close();

				// SP_Virus2HumanList.txt: viruses whose genes are assigned to the human host (9606)
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/SP_Virus2HumanList.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					SP_Virus2Human_hash.put(line, "9606");
				}
				br.close();

				System.out.println("Loading Species Assignment Dictionary : Processing ... done.");
			}
|
			if (setup_hash.containsKey("GeneNormalization") && setup_hash.get("GeneNormalization").toLowerCase().equals("true"))
			{
				System.out.print("Loading Gene normalization Dictionary : Processing ... \r");

				// PrefixSuffix.txt: tab-separated original form and modified form
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/PrefixSuffix.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					String org = l[0];
					String mod = l[1];
					suffixprefix_orig2modified.put(org, mod);
				}
				br.close();

				// NonGeneAbbr.txt: abbreviation and its long-form tokens
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/NonGeneAbbr.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					String shortform = l[0];
					String longform_toks = l[1];
					Abb2Longformtok_hash.put(shortform, longform_toks);
				}
				br.close();

				// SimConcept.MentionType.txt: mention and its concept type
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/SimConcept.MentionType.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					SimConceptMention2Type_hash.put(l[0], l[1]);
				}
				br.close();

				// Filtering.txt: mentions to filter out, one per line
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/Filtering.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					Filtering_hash.put(line, "");
				}
				br.close();

				// Filtering_WithLongForm.txt: tab-separated mention and long form
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/Filtering_WithLongForm.txt"));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					Filtering_WithLongForm_hash.put(l[0], l[1]);
				}
				br.close();

				// Gene dictionary: species-specific if a FocusSpecies is set, otherwise the full tree
				if (setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
				{
					PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_Gene." + setup_hash.get("FocusSpecies") + ".txt");
				}
				else if ((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
				{
					PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_Gene." + FocusSpecies + ".txt");
				}
				else
				{
					PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder") + "/PT_Gene.txt");
				}

				// GeneScoring.txt: key plus six tab-separated scoring columns
				String FileName = setup_hash.get("DictionaryFolder") + "/GeneScoring.txt";
				if (setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
				{
					FileName = setup_hash.get("DictionaryFolder") + "/GeneScoring." + setup_hash.get("FocusSpecies") + ".txt";
				}
				else if ((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
				{
					FileName = setup_hash.get("DictionaryFolder") + "/GeneScoring." + FocusSpecies + ".txt";
				}
				br = new BufferedReader(new FileReader(FileName));
				line = "";
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					GeneScoring_hash.put(l[0], l[1] + "\t" + l[2] + "\t" + l[3] + "\t" + l[4] + "\t" + l[5] + "\t" + l[6]);
				}
				br.close();

				// GeneScoring.DF.txt: the first line supplies the total (Sum); each
				// remaining line is key<TAB>df, stored as log10(Sum/df)
				FileName = setup_hash.get("DictionaryFolder") + "/GeneScoring.DF.txt";
				if (setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
				{
					FileName = setup_hash.get("DictionaryFolder") + "/GeneScoring.DF." + setup_hash.get("FocusSpecies") + ".txt";
				}
				else if ((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
				{
					FileName = setup_hash.get("DictionaryFolder") + "/GeneScoring.DF." + FocusSpecies + ".txt";
				}
				br = new BufferedReader(new FileReader(FileName));
				double Sum = Double.parseDouble(br.readLine());
				while ((line = br.readLine()) != null)
				{
					String l[] = line.split("\t");
					GeneScoringDF_hash.put(l[0], Math.log10(Sum / Double.parseDouble(l[1])));
				}
				br.close();

				// Interchangeable Greek-letter suffixes (e.g. "a" ~ "alpha")
				SuffixTranslationMap_hash.put("alpha", "a");
				SuffixTranslationMap_hash.put("a", "alpha");
				SuffixTranslationMap_hash.put("beta", "b");
				SuffixTranslationMap_hash.put("b", "beta");
				SuffixTranslationMap_hash.put("delta", "d");
				SuffixTranslationMap_hash.put("d", "delta");
				SuffixTranslationMap_hash.put("z", "zeta");
				SuffixTranslationMap_hash.put("zeta", "z");
				SuffixTranslationMap_hash.put("gamma", "g");
				SuffixTranslationMap_hash.put("g", "gamma");
				SuffixTranslationMap_hash.put("r", "gamma");
				SuffixTranslationMap_hash.put("y", "gamma");

				// Interchangeable Arabic/Roman numeral suffixes (e.g. "2" ~ "II")
				SuffixTranslationMap2_hash.put("2", "ii");
				SuffixTranslationMap2_hash.put("ii", "2");
				SuffixTranslationMap2_hash.put("II", "2");
				SuffixTranslationMap2_hash.put("1", "i");
				SuffixTranslationMap2_hash.put("i", "1");
				SuffixTranslationMap2_hash.put("I", "1");

				// GeneIDs.txt: two tab-separated columns, used for direct gene-id matching
				if (setup_hash.containsKey("GeneIDMatch") && setup_hash.get("GeneIDMatch").toLowerCase().equals("true"))
				{
					br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/GeneIDs.txt"));
					line = "";
					while ((line = br.readLine()) != null)
					{
						String l[] = line.split("\t");
						GeneIDs_hash.put(l[0], l[1]);
					}
					br.close();
				}

				// Gene2Protein.txt: two tab-separated columns mapping genes to proteins
				if (setup_hash.containsKey("Normalization2Protein") && setup_hash.get("Normalization2Protein").toLowerCase().equals("true"))
				{
					br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/Gene2Protein.txt"));
					line = "";
					while ((line = br.readLine()) != null)
					{
						String l[] = line.split("\t");
						Normalization2Protein_hash.put(l[0], l[1]);
					}
					br.close();
				}

				// Gene2Homoid.txt: two tab-separated columns mapping genes to HomoloGene ids
				if (setup_hash.containsKey("HomologeneID") && setup_hash.get("HomologeneID").toLowerCase().equals("true"))
				{
					br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder") + "/Gene2Homoid.txt"));
					line = "";
					while ((line = br.readLine()) != null)
					{
						String l[] = line.split("\t");
						HomologeneID_hash.put(l[0], l[1]);
					}
					br.close();
				}
				System.out.println("Loading Gene normalization Dictionary : Processing ... done.");
			}
|
			endTime = System.currentTimeMillis();
			totTime = endTime - startTime;
			System.out.println("Loading Dictionary : Processing Time:" + totTime / 1000 + "sec");
|
			folder = new File(InputFolder);
			listOfFiles = folder.listFiles();
			for (int i = 0; i < listOfFiles.length; i++)
			{
				if (listOfFiles[i].isFile())
				{
					String InputFile = listOfFiles[i].getName();
					File f = new File(OutputFolder + "/" + InputFile);
					if (f.exists() && !f.isDirectory())
					{
						System.out.println(InputFolder + "/" + InputFile + " - Done. (The output file already exists in the output folder.)");
					}
					else
					{
						// Remove stale temporary files left over for this input file
						String path = tmpFolder;
						File file = new File(path);
						File[] files = file.listFiles();
						for (File ftmp : files)
						{
							if (ftmp.isFile() && ftmp.exists())
							{
								if (ftmp.toString().matches(tmpFolder + "/" + InputFile + ".*"))
								{
									ftmp.delete();
								}
							}
						}

						BioCDocobj = new BioCDoc();

						// Detect the input format; anything else aborts with the checker's message
						String Format = "";
						String checkR = BioCDocobj.BioCFormatCheck(InputFolder + "/" + InputFile);
						if (checkR.equals("BioC"))
						{
							Format = "BioC";
						}
						else if (checkR.equals("PubTator"))
						{
							Format = "PubTator";
						}
						else
						{
							System.out.println(checkR);
							System.exit(0);
						}
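						/*
						 * For reference, PubTator input looks roughly like this
						 * (hypothetical PMID, offsets and id):
						 *
						 *   12345|t|Example title mentioning BRCA1
						 *   12345|a|Example abstract text ...
						 *   12345 <tab> 31 <tab> 36 <tab> BRCA1 <tab> Gene <tab> 672
						 *
						 * BioC input is the corresponding collection/document/passage XML.
						 */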
|
|
|
						System.out.print(InputFolder + "/" + InputFile + " - (" + Format + " format) : Processing ... \r");

						// PubTator input is converted to BioC in the tmp folder; BioC input is
						// copied as-is (note: readLine() strips line breaks and they are not
						// re-written, so the copy ends up on a single line)
						if (Format.equals("PubTator"))
						{
							BioCDocobj.PubTator2BioC(InputFolder + "/" + InputFile, tmpFolder + "/" + InputFile);
						}
						else
						{
							br = new BufferedReader(new FileReader(InputFolder + "/" + InputFile));
							BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpFolder + "/" + InputFile), "UTF-8"));
							line = "";
							while ((line = br.readLine()) != null)
							{
								fr.write(line);
							}
							br.close();
							fr.close();
						}
|
						// Pipeline objects: gene NER (GNR), species recognition (SR),
						// concept simplification (SimConcept) and gene normalization (GN)
						GNR GNRobj = new GNR();
						GNRobj.LoadInputFile(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".Abb", TrainTest);
						SR SRobj = new SR();
						SimConcept SCobj = new SimConcept();
						GN GNobj = new GN();
						String FinalStep = "";

						// Step 1: species recognition
						if (setup_hash.containsKey("SpeciesRecognition") && setup_hash.get("SpeciesRecognition").toLowerCase().equals("true"))
						{
							SRobj.SpeciesRecognition(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".SR.xml", setup_hash.get("DictionaryFolder") + "/SPStrain.txt", setup_hash.get("FilterAntibody"));
							FinalStep = "SpeciesRecognition";
						}
|
						// Step 2: gene mention recognition (CRF), post-processed on the SR output if present
						if (setup_hash.containsKey("GeneRecognition") && setup_hash.get("GeneRecognition").toLowerCase().equals("true"))
						{
							GNRobj.FeatureExtraction(tmpFolder + "/" + InputFile + ".data", tmpFolder + "/" + InputFile + ".loca", TrainTest);
							GNRobj.CRF_test(setup_hash.get("GNRModel"), tmpFolder + "/" + InputFile + ".data", tmpFolder + "/" + InputFile + ".output", "top3");
							GNRobj.ReadCRFresult(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".loca", tmpFolder + "/" + InputFile + ".output", tmpFolder + "/" + InputFile + ".GNR.xml", 0.005, 0.05);
							f = new File(tmpFolder + "/" + InputFile + ".SR.xml");
							if (f.exists())
							{
								GNRobj.PostProcessing(tmpFolder + "/" + InputFile + ".SR.xml", tmpFolder + "/" + InputFile + ".GNR.xml");
							}
							else
							{
								GNRobj.PostProcessing(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".GNR.xml");
							}
							FinalStep = "GeneRecognition";
						}
|
						// Step 3: species assignment, run on the GNR output if present;
						// a FocusSpecies restricts the assignment to that species
						if (setup_hash.containsKey("SpeciesAssignment") && setup_hash.get("SpeciesAssignment").toLowerCase().equals("true"))
						{
							if (setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
							{
								f = new File(tmpFolder + "/" + InputFile + ".GNR.xml");
								if (f.exists())
								{
									SRobj.SpeciesAssignment(tmpFolder + "/" + InputFile + ".GNR.xml", tmpFolder + "/" + InputFile + ".SA.xml", setup_hash.get("FocusSpecies"));
								}
								else
								{
									SRobj.SpeciesAssignment(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".SA.xml", setup_hash.get("FocusSpecies"));
								}
							}
							else
							{
								f = new File(tmpFolder + "/" + InputFile + ".GNR.xml");
								if (f.exists())
								{
									SRobj.SpeciesAssignment(tmpFolder + "/" + InputFile + ".GNR.xml", tmpFolder + "/" + InputFile + ".SA.xml");
								}
								else
								{
									SRobj.SpeciesAssignment(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".SA.xml");
								}
							}
							FinalStep = "SpeciesAssignment";
						}
|
						// Step 4: concept simplification, then gene normalization
						if (setup_hash.containsKey("GeneNormalization") && setup_hash.get("GeneNormalization").toLowerCase().equals("true"))
						{
							// SimConcept: CRF-based typing of the recognized mentions
							{
								SCobj.FeatureExtraction_Test(tmpFolder + "/" + InputFile + ".SC.data");
								SCobj.CRF_test(setup_hash.get("SCModel"), tmpFolder + "/" + InputFile + ".SC.data", tmpFolder + "/" + InputFile + ".SC.output");
								SCobj.ReadCRFresult(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".SC.output", tmpFolder + "/" + InputFile + ".SC.xml");
							}

							// GN: chromosome recognition and normalization to gene ids
							{
								GNobj.PreProcessing4GN(InputFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".PreProcessing4GN.xml");
								GNobj.ChromosomeRecognition(InputFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".GN.xml");
								// (made case-insensitive to match the GeneIDs.txt loading guard above,
								// which used toLowerCase(); the original tested equals("True") here)
								if (setup_hash.containsKey("GeneIDMatch") && setup_hash.get("GeneIDMatch").toLowerCase().equals("true"))
								{
									GNobj.GeneNormalization(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".GN.xml", true);
									GNobj.GeneIDRecognition(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".GN.xml");
								}
								else
								{
									GNobj.GeneNormalization(tmpFolder + "/" + InputFile, tmpFolder + "/" + InputFile + ".GN.xml", false);
								}
							}
							FinalStep = "GeneNormalization";
						}
|
						// Choose the output of the last pipeline stage that ran
						String final_output = "";
						if (FinalStep.equals("GeneNormalization"))
						{
							final_output = tmpFolder + "/" + InputFile + ".GN.xml";
						}
						else if (FinalStep.equals("SpeciesAssignment"))
						{
							final_output = tmpFolder + "/" + InputFile + ".SA.xml";
						}
						else if (FinalStep.equals("SpeciesRecognition"))
						{
							final_output = tmpFolder + "/" + InputFile + ".SR.xml";
						}
						else if (FinalStep.equals("GeneRecognition"))
						{
							final_output = tmpFolder + "/" + InputFile + ".GNR.xml";
						}
|
						// Convert back to PubTator if that was the input format; otherwise
						// copy the BioC result (again without re-writing line breaks)
						if (Format.equals("PubTator"))
						{
							BioCDocobj.BioC2PubTator(final_output, OutputFolder + "/" + InputFile);
						}
						else
						{
							br = new BufferedReader(new FileReader(final_output));
							BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(OutputFolder + "/" + InputFile), "UTF-8"));
							line = "";
							while ((line = br.readLine()) != null)
							{
								fr.write(line);
							}
							br.close();
							fr.close();
						}
|
						// Clean up this file's temporaries unless DeleteTmp is explicitly false.
						// (The original listed the hard-coded "tmp" folder here even when a custom
						// tmpFolder was configured; tmpFolder is now used for listing and matching.)
						if ((!setup_hash.containsKey("DeleteTmp")) || setup_hash.get("DeleteTmp").toLowerCase().equals("true"))
						{
							path = tmpFolder;
							file = new File(path);
							files = file.listFiles();
							for (File ftmp : files)
							{
								if (ftmp.isFile() && ftmp.exists())
								{
									if (ftmp.toString().matches(tmpFolder + "/" + InputFile + ".*"))
									{
										ftmp.delete();
									}
								}
							}
						}
|
						endTime = System.currentTimeMillis();
						totTime = endTime - startTime;
						System.out.println(InputFolder + "/" + InputFile + " - (" + Format + " format) : Processing Time:" + totTime / 1000 + "sec");
					}
				}
			}
		}
	}
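	// A possible refactor, sketched here for illustration only (nothing above
	// calls it): every two-column dictionary load in main() repeats the same
	// tab-separated read loop, which a helper like this would replace.
	private static void loadTwoColumnFile(String FileName, HashMap<String, String> target) throws IOException
	{
		BufferedReader reader = new BufferedReader(new FileReader(FileName));
		String l;
		while ((l = reader.readLine()) != null)
		{
			String[] cols = l.split("\t"); // column 0 is the key, column 1 the value
			target.put(cols[0], cols[1]);
		}
		reader.close();
	}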
|
}
|
|