steventango committed
Commit: d5062c8
Parent(s): fbf0d37

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
- CRF/java/.am +27 -27
- CRF/java/org/chasen/crfpp/Model.java +51 -51
- CRF/perl/Makefile.old +931 -931
- CRF/ruby/Makefile +157 -157
- CRF/winmain.h +69 -69
- GeneNER_SpeAss_run.py +745 -745
- Library/Ab3P.C +110 -110
- Library/Ab3P.h +83 -83
- Library/AbbrStra.C +1426 -1426
- Library/AbbrStra.h +332 -332
- Library/AbbrvE.C +629 -629
- Library/AbbrvE.h +93 -93
- Library/Btree.C +1304 -1304
- Library/Btree.h +547 -547
- Library/FBase.C +600 -600
- Library/FBase.h +248 -248
- Library/Hash.C +733 -733
- Library/Hash.h +92 -92
- Library/MPtok.C +2036 -2036
- Library/MPtok.h +141 -141
- Library/Makefile +13 -13
- Library/WordData/Ab3P_prec.dat +144 -144
- Library/WordData/Lf1chSf +0 -0
- Library/WordData/stop +313 -313
- Library/runn.C +216 -216
- Library/runn.h +392 -392
- gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab1.txt +0 -0
- gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab_ori.txt +0 -0
- requirements-py310.txt +6 -6
- requirements.txt +76 -76
- run_batches.py +62 -48
- src_Java/GNormPluslib/BioCDoc.java +1343 -1343
- src_Java/GNormPluslib/GN.java +1083 -1083
- src_Java/GNormPluslib/GNR.java +0 -0
- src_Java/GNormPluslib/GNormPlus.java +696 -696
- src_Java/GNormPluslib/PrefixTree.java +892 -892
- src_Java/GNormPluslib/SR.java +1043 -1043
- src_Java/GNormPluslib/SimConcept.java +0 -0
- src_python/GeneNER/BIO_format.py +256 -256
- src_python/GeneNER/Evaluation_ner.py +243 -243
- src_python/GeneNER/model_ner.py +102 -102
- src_python/GeneNER/ner_tag.py +85 -85
- src_python/GeneNER/processing_data_ner.py +210 -210
- src_python/GeneNER/represent_ner.py +183 -183
- src_python/GeneNER/restore_index_ner.py +447 -447
- src_python/SpeAss/Evaluation_sa.py +396 -396
- src_python/SpeAss/SA_Pubtator_Conll.py +493 -493
- src_python/SpeAss/ml_tagging_score_sa.py +220 -220
- src_python/SpeAss/model_sa.py +105 -105
- src_python/SpeAss/processing_data_sa.py +201 -201
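The commit message indicates this snapshot was pushed with huggingface_hub's upload_folder. A minimal sketch of that call is below, assuming a hypothetical repo_id (the target repository is not named in this view):

    # Sketch of the upload behind this commit, per the message
    # "Upload folder using huggingface_hub".
    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token saved by `huggingface-cli login`
    api.upload_folder(
        folder_path=".",                    # local folder to push
        repo_id="steventango/some-repo",    # placeholder; the actual repo is not shown here
        repo_type="model",                  # assumption; could also be a dataset repo
        commit_message="Upload folder using huggingface_hub",
    )

A single upload_folder call produces one commit no matter how many files it touches, which is consistent with this diff rewriting many files at once.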
CRF/java/.am
CHANGED
@@ -1,27 +1,27 @@
All 27 lines are deleted and re-added with identical text; the file content is shown once:

TARGET=MeCab
JAVAC=javac
JAVA=java
JAR=jar
CXX=c++
INCLUDE=/usr/lib/jvm/java-6-openjdk/include

PACKAGE=org/chasen/mecab

LIBS=`mecab-config --libs`
INC=`mecab-config --cflags` -I$(INCLUDE) -I$(INCLUDE)/linux

all:
	$(CXX) -O3 -c -fpic $(TARGET)_wrap.cxx $(INC)
	$(CXX) -shared $(TARGET)_wrap.o -o lib$(TARGET).so $(LIBS)
	$(JAVAC) $(PACKAGE)/*.java
	$(JAVAC) test.java
	$(JAR) cfv $(TARGET).jar $(PACKAGE)/*.class

test:
	env LD_LIBRARY_PATH=. $(JAVA) test

clean:
	rm -fr *.jar *.o *.so *.class $(PACKAGE)/*.class

cleanall:
	rm -fr $(TARGET).java *.cxx
CRF/java/org/chasen/crfpp/Model.java
CHANGED
@@ -1,51 +1,51 @@
All 51 lines are deleted and re-added with identical text; the file content is shown once:

/* ----------------------------------------------------------------------------
 * This file was automatically generated by SWIG (http://www.swig.org).
 * Version 1.3.40
 *
 * Do not make changes to this file unless you know what you are doing--modify
 * the SWIG interface file instead.
 * ----------------------------------------------------------------------------- */

package org.chasen.crfpp;

public class Model {
  private long swigCPtr;
  protected boolean swigCMemOwn;

  protected Model(long cPtr, boolean cMemoryOwn) {
    swigCMemOwn = cMemoryOwn;
    swigCPtr = cPtr;
  }

  protected static long getCPtr(Model obj) {
    return (obj == null) ? 0 : obj.swigCPtr;
  }

  protected void finalize() {
    delete();
  }

  public synchronized void delete() {
    if (swigCPtr != 0) {
      if (swigCMemOwn) {
        swigCMemOwn = false;
        CRFPPJNI.delete_Model(swigCPtr);
      }
      swigCPtr = 0;
    }
  }

  public Tagger createTagger() {
    long cPtr = CRFPPJNI.Model_createTagger(swigCPtr, this);
    return (cPtr == 0) ? null : new Tagger(cPtr, false);
  }

  public String what() {
    return CRFPPJNI.Model_what(swigCPtr, this);
  }

  public Model(String arg) {
    this(CRFPPJNI.new_Model(arg), true);
  }

}
CRF/perl/Makefile.old
CHANGED
@@ -1,931 +1,931 @@
All 931 lines are deleted and re-added with identical text (the re-added copy is cut off by the view); the file content is shown once:

# This Makefile is for the CRFPP extension to perl.
#
# It was generated automatically by MakeMaker version
# 6.56 (Revision: 65600) from the contents of
# Makefile.PL. Don't edit this file, edit Makefile.PL instead.
#
# ANY CHANGES MADE HERE WILL BE LOST!
#
# MakeMaker ARGV: ()
#

# MakeMaker Parameters:

# BUILD_REQUIRES => { }
# CC => q[c++]
# INC => q[]
# LD => q[c++]
# LIBS => q[-lpthread -lcrfpp]
# NAME => q[CRFPP]
# OBJECT => q[CRFPP_wrap.o]
# PREREQ_PM => { }

# --- MakeMaker post_initialize section:


# --- MakeMaker const_config section:

# These definitions are from config.sh (via /usr/lib/perl/5.12/Config.pm).
# They may have been overridden via Makefile.PL or on the command line.
AR = ar
CC = c++
CCCDLFLAGS = -fPIC
CCDLFLAGS = -Wl,-E
DLEXT = so
DLSRC = dl_dlopen.xs
EXE_EXT =
FULL_AR = /usr/bin/ar
LD = c++
LDDLFLAGS = -shared -O2 -g -L/usr/local/lib -fstack-protector
LDFLAGS = -fstack-protector -L/usr/local/lib
LIBC =
LIB_EXT = .a
OBJ_EXT = .o
OSNAME = linux
OSVERS = 2.6.24-28-server
RANLIB = :
SITELIBEXP = /usr/local/share/perl/5.12.4
SITEARCHEXP = /usr/local/lib/perl/5.12.4
SO = so
VENDORARCHEXP = /usr/lib/perl5
VENDORLIBEXP = /usr/share/perl5


# --- MakeMaker constants section:
AR_STATIC_ARGS = cr
DIRFILESEP = /
DFSEP = $(DIRFILESEP)
NAME = CRFPP
NAME_SYM = CRFPP
VERSION =
VERSION_MACRO = VERSION
VERSION_SYM =
DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\"
XS_VERSION =
XS_VERSION_MACRO = XS_VERSION
XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\"
INST_ARCHLIB = blib/arch
INST_SCRIPT = blib/script
INST_BIN = blib/bin
INST_LIB = blib/lib
INST_MAN1DIR = blib/man1
INST_MAN3DIR = blib/man3
MAN1EXT = 1p
MAN3EXT = 3pm
INSTALLDIRS = site
DESTDIR =
PREFIX = /usr
PERLPREFIX = $(PREFIX)
SITEPREFIX = $(PREFIX)/local
VENDORPREFIX = $(PREFIX)
INSTALLPRIVLIB = $(PERLPREFIX)/share/perl/5.12
DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB)
INSTALLSITELIB = $(SITEPREFIX)/share/perl/5.12.4
DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB)
INSTALLVENDORLIB = $(VENDORPREFIX)/share/perl5
DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB)
INSTALLARCHLIB = $(PERLPREFIX)/lib/perl/5.12
DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB)
INSTALLSITEARCH = $(SITEPREFIX)/lib/perl/5.12.4
DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH)
INSTALLVENDORARCH = $(VENDORPREFIX)/lib/perl5
DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH)
INSTALLBIN = $(PERLPREFIX)/bin
DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN)
INSTALLSITEBIN = $(SITEPREFIX)/bin
DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN)
INSTALLVENDORBIN = $(VENDORPREFIX)/bin
DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN)
INSTALLSCRIPT = $(PERLPREFIX)/bin
DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT)
INSTALLSITESCRIPT = $(SITEPREFIX)/bin
DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT)
INSTALLVENDORSCRIPT = $(VENDORPREFIX)/bin
DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT)
INSTALLMAN1DIR = $(PERLPREFIX)/share/man/man1
DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR)
INSTALLSITEMAN1DIR = $(SITEPREFIX)/man/man1
DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR)
INSTALLVENDORMAN1DIR = $(VENDORPREFIX)/share/man/man1
DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR)
INSTALLMAN3DIR = $(PERLPREFIX)/share/man/man3
DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR)
INSTALLSITEMAN3DIR = $(SITEPREFIX)/man/man3
DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR)
INSTALLVENDORMAN3DIR = $(VENDORPREFIX)/share/man/man3
DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR)
PERL_LIB = /usr/share/perl/5.12
PERL_ARCHLIB = /usr/lib/perl/5.12
LIBPERL_A = libperl.a
FIRST_MAKEFILE = Makefile
MAKEFILE_OLD = Makefile.old
MAKE_APERL_FILE = Makefile.aperl
PERLMAINCC = $(CC)
PERL_INC = /usr/lib/perl/5.12/CORE
PERL = /usr/bin/perl
FULLPERL = /usr/bin/perl
ABSPERL = $(PERL)
PERLRUN = $(PERL)
FULLPERLRUN = $(FULLPERL)
ABSPERLRUN = $(ABSPERL)
PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
PERL_CORE = 0
PERM_DIR = 755
PERM_RW = 644
PERM_RWX = 755

MAKEMAKER = /usr/share/perl/5.12/ExtUtils/MakeMaker.pm
MM_VERSION = 6.56
MM_REVISION = 65600

# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle).
# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle)
# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar)
# DLBASE = Basename part of dynamic library. May be just equal BASEEXT.
MAKE = make
FULLEXT = CRFPP
BASEEXT = CRFPP
PARENT_NAME =
DLBASE = $(BASEEXT)
VERSION_FROM =
INC =
OBJECT = CRFPP_wrap$(OBJ_EXT)
LDFROM = $(OBJECT)
LINKTYPE = dynamic
BOOTDEP =

# Handy lists of source code files:
XS_FILES =
C_FILES = CRFPP_wrap.cxx
O_FILES = CRFPP_wrap.o
H_FILES =
MAN1PODS =
MAN3PODS =

# Where is the Config information that we are using/depend on
CONFIGDEP = $(PERL_ARCHLIB)$(DFSEP)Config.pm $(PERL_INC)$(DFSEP)config.h

# Where to build things
INST_LIBDIR = $(INST_LIB)
INST_ARCHLIBDIR = $(INST_ARCHLIB)

INST_AUTODIR = $(INST_LIB)/auto/$(FULLEXT)
INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT)

INST_STATIC = $(INST_ARCHAUTODIR)/$(BASEEXT)$(LIB_EXT)
INST_DYNAMIC = $(INST_ARCHAUTODIR)/$(DLBASE).$(DLEXT)
INST_BOOT = $(INST_ARCHAUTODIR)/$(BASEEXT).bs

# Extra linker info
EXPORT_LIST =
PERL_ARCHIVE =
PERL_ARCHIVE_AFTER =


TO_INST_PM = CRFPP.pm

PM_TO_BLIB = CRFPP.pm \
	$(INST_LIB)/CRFPP.pm


# --- MakeMaker platform_constants section:
MM_Unix_VERSION = 6.56
PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc


# --- MakeMaker tool_autosplit section:
# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto
AUTOSPLITFILE = $(ABSPERLRUN) -e 'use AutoSplit; autosplit($$ARGV[0], $$ARGV[1], 0, 1, 1)' --



# --- MakeMaker tool_xsubpp section:

XSUBPPDIR = /usr/share/perl/5.12/ExtUtils
XSUBPP = $(XSUBPPDIR)$(DFSEP)xsubpp
XSUBPPRUN = $(PERLRUN) $(XSUBPP)
XSPROTOARG =
XSUBPPDEPS = /usr/share/perl/5.12/ExtUtils/typemap $(XSUBPP)
XSUBPPARGS = -typemap /usr/share/perl/5.12/ExtUtils/typemap
XSUBPP_EXTRA_ARGS =


# --- MakeMaker tools_other section:
SHELL = /bin/sh
CHMOD = chmod
CP = cp
MV = mv
NOOP = $(TRUE)
NOECHO = @
RM_F = rm -f
RM_RF = rm -rf
TEST_F = test -f
TOUCH = touch
UMASK_NULL = umask 0
DEV_NULL = > /dev/null 2>&1
MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' --
EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' --
FALSE = false
TRUE = true
ECHO = echo
ECHO_N = echo -n
UNINST = 0
VERBINST = 0
MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' --
DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' --
UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' --
WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' --
MACROSTART =
MACROEND =
USEMAKEFILE = -f
FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' --


# --- MakeMaker makemakerdflt section:
makemakerdflt : all
	$(NOECHO) $(NOOP)


# --- MakeMaker dist section:
TAR = tar
TARFLAGS = cvf
ZIP = zip
ZIPFLAGS = -r
COMPRESS = gzip --best
SUFFIX = .gz
SHAR = shar
PREOP = $(NOECHO) $(NOOP)
POSTOP = $(NOECHO) $(NOOP)
TO_UNIX = $(NOECHO) $(NOOP)
CI = ci -u
RCS_LABEL = rcs -Nv$(VERSION_SYM): -q
DIST_CP = best
DIST_DEFAULT = tardist
DISTNAME = CRFPP
DISTVNAME = CRFPP-


# --- MakeMaker macro section:


# --- MakeMaker depend section:


# --- MakeMaker cflags section:

CCFLAGS = -D_REENTRANT -D_GNU_SOURCE -DDEBIAN -fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
OPTIMIZE = -O2 -g
PERLTYPE =
MPOLLUTE =


# --- MakeMaker const_loadlibs section:

# CRFPP might depend on some other libraries:
# See ExtUtils::Liblist for details
#
EXTRALIBS = -lcrfpp
LDLOADLIBS = -lpthread -lcrfpp
BSLOADLIBS =


# --- MakeMaker const_cccmd section:
CCCMD = $(CC) -c $(PASTHRU_INC) $(INC) \
	$(CCFLAGS) $(OPTIMIZE) \
	$(PERLTYPE) $(MPOLLUTE) $(DEFINE_VERSION) \
	$(XS_DEFINE_VERSION)

# --- MakeMaker post_constants section:


# --- MakeMaker pasthru section:

PASTHRU = LIBPERL_A="$(LIBPERL_A)"\
	LINKTYPE="$(LINKTYPE)"\
	OPTIMIZE="$(OPTIMIZE)"\
	PREFIX="$(PREFIX)"\
	PASTHRU_INC="$(PASTHRU_INC)"


# --- MakeMaker special_targets section:
.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT)

.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir



# --- MakeMaker c_o section:

.c.i:
	cc -E -c $(PASTHRU_INC) $(INC) \
	$(CCFLAGS) $(OPTIMIZE) \
	$(PERLTYPE) $(MPOLLUTE) $(DEFINE_VERSION) \
	$(XS_DEFINE_VERSION) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c > $*.i

.c.s:
	$(CCCMD) -S $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c

.c$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c

.cpp$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.cpp

.cxx$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.cxx

.cc$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.cc

.C$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.C


# --- MakeMaker xs_c section:

.xs.c:
	$(XSUBPPRUN) $(XSPROTOARG) $(XSUBPPARGS) $(XSUBPP_EXTRA_ARGS) $*.xs > $*.xsc && $(MV) $*.xsc $*.c


# --- MakeMaker xs_o section:

.xs$(OBJ_EXT):
	$(XSUBPPRUN) $(XSPROTOARG) $(XSUBPPARGS) $*.xs > $*.xsc && $(MV) $*.xsc $*.c
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c


# --- MakeMaker top_targets section:
all :: pure_all manifypods
	$(NOECHO) $(NOOP)


pure_all :: config pm_to_blib subdirs linkext
	$(NOECHO) $(NOOP)

subdirs :: $(MYEXTLIB)
	$(NOECHO) $(NOOP)

config :: $(FIRST_MAKEFILE) blibdirs
	$(NOECHO) $(NOOP)

help :
	perldoc ExtUtils::MakeMaker


# --- MakeMaker blibdirs section:
blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists
	$(NOECHO) $(NOOP)

# Backwards compat with 6.18 through 6.25
blibdirs.ts : blibdirs
	$(NOECHO) $(NOOP)

$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_LIBDIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR)
	$(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists

$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_ARCHLIB)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB)
	$(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists

$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_AUTODIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR)
	$(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists

$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR)
	$(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists

$(INST_BIN)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_BIN)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN)
	$(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists

$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_SCRIPT)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT)
	$(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists

$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_MAN1DIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR)
	$(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists

$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_MAN3DIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR)
	$(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists



# --- MakeMaker linkext section:

linkext :: $(LINKTYPE)
	$(NOECHO) $(NOOP)


# --- MakeMaker dlsyms section:


# --- MakeMaker dynamic section:

dynamic :: $(FIRST_MAKEFILE) $(INST_DYNAMIC) $(INST_BOOT)
	$(NOECHO) $(NOOP)


# --- MakeMaker dynamic_bs section:
BOOTSTRAP = $(BASEEXT).bs

# As Mkbootstrap might not write a file (if none is required)
# we use touch to prevent make continually trying to remake it.
# The DynaLoader only reads a non-empty file.
$(BOOTSTRAP) : $(FIRST_MAKEFILE) $(BOOTDEP) $(INST_ARCHAUTODIR)$(DFSEP).exists
	$(NOECHO) $(ECHO) "Running Mkbootstrap for $(NAME) ($(BSLOADLIBS))"
	$(NOECHO) $(PERLRUN) \
	"-MExtUtils::Mkbootstrap" \
	-e "Mkbootstrap('$(BASEEXT)','$(BSLOADLIBS)');"
	$(NOECHO) $(TOUCH) $@
	$(CHMOD) $(PERM_RW) $@

$(INST_BOOT) : $(BOOTSTRAP) $(INST_ARCHAUTODIR)$(DFSEP).exists
	$(NOECHO) $(RM_RF) $@
	- $(CP) $(BOOTSTRAP) $@
	$(CHMOD) $(PERM_RW) $@


# --- MakeMaker dynamic_lib section:

# This section creates the dynamically loadable $(INST_DYNAMIC)
# from $(OBJECT) and possibly $(MYEXTLIB).
ARMAYBE = :
OTHERLDFLAGS =
INST_DYNAMIC_DEP =
INST_DYNAMIC_FIX =

$(INST_DYNAMIC): $(OBJECT) $(MYEXTLIB) $(BOOTSTRAP) $(INST_ARCHAUTODIR)$(DFSEP).exists $(EXPORT_LIST) $(PERL_ARCHIVE) $(PERL_ARCHIVE_AFTER) $(INST_DYNAMIC_DEP)
	$(RM_F) $@
	$(LD) $(LDDLFLAGS) $(LDFROM) $(OTHERLDFLAGS) -o $@ $(MYEXTLIB) \
	$(PERL_ARCHIVE) $(LDLOADLIBS) $(PERL_ARCHIVE_AFTER) $(EXPORT_LIST) \
	$(INST_DYNAMIC_FIX)
	$(CHMOD) $(PERM_RWX) $@


# --- MakeMaker static section:

## $(INST_PM) has been moved to the all: target.
## It remains here for awhile to allow for old usage: "make static"
static :: $(FIRST_MAKEFILE) $(INST_STATIC)
	$(NOECHO) $(NOOP)


# --- MakeMaker static_lib section:

$(INST_STATIC) : $(OBJECT) $(MYEXTLIB) $(INST_ARCHAUTODIR)$(DFSEP).exists
	$(RM_RF) $@
	$(FULL_AR) $(AR_STATIC_ARGS) $@ $(OBJECT) && $(RANLIB) $@
	$(CHMOD) $(PERM_RWX) $@
	$(NOECHO) $(ECHO) "$(EXTRALIBS)" > $(INST_ARCHAUTODIR)/extralibs.ld


# --- MakeMaker manifypods section:

POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--"
POD2MAN = $(POD2MAN_EXE)


manifypods : pure_all
	$(NOECHO) $(NOOP)




# --- MakeMaker processPL section:


# --- MakeMaker installbin section:


# --- MakeMaker subdirs section:

# none

# --- MakeMaker clean_subdirs section:
clean_subdirs :
	$(NOECHO) $(NOOP)


# --- MakeMaker clean section:

# Delete temporary files but do not touch installed files. We don't delete
# the Makefile here so a later make realclean still has a makefile to use.

clean :: clean_subdirs
	- $(RM_F) \
	*$(LIB_EXT) core \
	core.[0-9] $(INST_ARCHAUTODIR)/extralibs.all \
	core.[0-9][0-9] $(BASEEXT).bso \
	pm_to_blib.ts core.[0-9][0-9][0-9][0-9] \
	$(BASEEXT).x $(BOOTSTRAP) \
	perl$(EXE_EXT) tmon.out \
	*$(OBJ_EXT) pm_to_blib \
	$(INST_ARCHAUTODIR)/extralibs.ld blibdirs.ts \
	core.[0-9][0-9][0-9][0-9][0-9] *perl.core \
	core.*perl.*.? $(MAKE_APERL_FILE) \
	perl $(BASEEXT).def \
	core.[0-9][0-9][0-9] mon.out \
	lib$(BASEEXT).def perlmain.c \
	perl.exe so_locations \
	$(BASEEXT).exp
	- $(RM_RF) \
	blib
	- $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL)


# --- MakeMaker realclean_subdirs section:
realclean_subdirs :
	$(NOECHO) $(NOOP)


# --- MakeMaker realclean section:
# Delete temporary files (via clean) and also delete dist files
realclean purge :: clean realclean_subdirs
	- $(RM_F) \
	$(OBJECT) $(MAKEFILE_OLD) \
	$(FIRST_MAKEFILE)
	- $(RM_RF) \
	$(DISTVNAME)


# --- MakeMaker metafile section:
metafile : create_distdir
	$(NOECHO) $(ECHO) Generating META.yml
	$(NOECHO) $(ECHO) '--- #YAML:1.0' > META_new.yml
	$(NOECHO) $(ECHO) 'name: CRFPP' >> META_new.yml
	$(NOECHO) $(ECHO) 'version: ' >> META_new.yml
	$(NOECHO) $(ECHO) 'abstract: ~' >> META_new.yml
	$(NOECHO) $(ECHO) 'author: []' >> META_new.yml
	$(NOECHO) $(ECHO) 'license: unknown' >> META_new.yml
	$(NOECHO) $(ECHO) 'distribution_type: module' >> META_new.yml
	$(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml
	$(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml
	$(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml
	$(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml
	$(NOECHO) $(ECHO) 'requires: {}' >> META_new.yml
	$(NOECHO) $(ECHO) 'no_index:' >> META_new.yml
	$(NOECHO) $(ECHO) ' directory:' >> META_new.yml
	$(NOECHO) $(ECHO) ' - t' >> META_new.yml
	$(NOECHO) $(ECHO) ' - inc' >> META_new.yml
	$(NOECHO) $(ECHO) 'generated_by: ExtUtils::MakeMaker version 6.56' >> META_new.yml
	$(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml
	$(NOECHO) $(ECHO) ' url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml
	$(NOECHO) $(ECHO) ' version: 1.4' >> META_new.yml
	-$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml


# --- MakeMaker signature section:
signature :
	cpansign -s


# --- MakeMaker dist_basics section:
distclean :: realclean distcheck
	$(NOECHO) $(NOOP)

distcheck :
	$(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck

skipcheck :
	$(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck

manifest :
	$(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest

veryclean : realclean
	$(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old



# --- MakeMaker dist_core section:

dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE)
	$(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \
	-e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' --

tardist : $(DISTVNAME).tar$(SUFFIX)
	$(NOECHO) $(NOOP)

uutardist : $(DISTVNAME).tar$(SUFFIX)
	uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu

$(DISTVNAME).tar$(SUFFIX) : distdir
	$(PREOP)
	$(TO_UNIX)
	$(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME)
	$(RM_RF) $(DISTVNAME)
	$(COMPRESS) $(DISTVNAME).tar
	$(POSTOP)

zipdist : $(DISTVNAME).zip
	$(NOECHO) $(NOOP)

$(DISTVNAME).zip : distdir
	$(PREOP)
	$(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME)
	$(RM_RF) $(DISTVNAME)
	$(POSTOP)

shdist : distdir
	$(PREOP)
	$(SHAR) $(DISTVNAME) > $(DISTVNAME).shar
	$(RM_RF) $(DISTVNAME)
	$(POSTOP)


# --- MakeMaker distdir section:
create_distdir :
	$(RM_RF) $(DISTVNAME)
	$(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \
	-e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');"

distdir : create_distdir distmeta
	$(NOECHO) $(NOOP)



# --- MakeMaker dist_test section:
disttest : distdir
	cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL
	cd $(DISTVNAME) && $(MAKE) $(PASTHRU)
	cd $(DISTVNAME) && $(MAKE) test $(PASTHRU)



# --- MakeMaker dist_ci section:

ci :
	$(PERLRUN) "-MExtUtils::Manifest=maniread" \
	-e "@all = keys %{ maniread() };" \
	-e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \
	-e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});"


# --- MakeMaker distmeta section:
distmeta : create_distdir metafile
	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{META.yml} => q{Module meta-data (added by MakeMaker)}}) } ' \
	-e ' or print "Could not add META.yml to MANIFEST: $${'\''@'\''}\n"' --



# --- MakeMaker distsignature section:
distsignature : create_distdir
	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) } ' \
	-e ' or print "Could not add SIGNATURE to MANIFEST: $${'\''@'\''}\n"' --
	$(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE
	cd $(DISTVNAME) && cpansign -s



# --- MakeMaker install section:

install :: pure_install doc_install
	$(NOECHO) $(NOOP)

install_perl :: pure_perl_install doc_perl_install
	$(NOECHO) $(NOOP)

install_site :: pure_site_install doc_site_install
	$(NOECHO) $(NOOP)

install_vendor :: pure_vendor_install doc_vendor_install
	$(NOECHO) $(NOOP)

pure_install :: pure_$(INSTALLDIRS)_install
	$(NOECHO) $(NOOP)

doc_install :: doc_$(INSTALLDIRS)_install
	$(NOECHO) $(NOOP)

pure__install : pure_site_install
	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site

doc__install : doc_site_install
	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site

pure_perl_install :: all
	$(NOECHO) umask 022; $(MOD_INSTALL) \
	$(INST_LIB) $(DESTINSTALLPRIVLIB) \
	$(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \
	$(INST_BIN) $(DESTINSTALLBIN) \
	$(INST_SCRIPT) $(DESTINSTALLSCRIPT) \
	$(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \
	$(INST_MAN3DIR) $(DESTINSTALLMAN3DIR)
	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
	$(SITEARCHEXP)/auto/$(FULLEXT)


pure_site_install :: all
	$(NOECHO) umask 02; $(MOD_INSTALL) \
	read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \
	write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \
	$(INST_LIB) $(DESTINSTALLSITELIB) \
	$(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \
	$(INST_BIN) $(DESTINSTALLSITEBIN) \
	$(INST_SCRIPT) $(DESTINSTALLSITESCRIPT) \
	$(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \
	$(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR)
	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
	$(PERL_ARCHLIB)/auto/$(FULLEXT)

pure_vendor_install :: all
	$(NOECHO) umask 022; $(MOD_INSTALL) \
	$(INST_LIB) $(DESTINSTALLVENDORLIB) \
	$(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \
	$(INST_BIN) $(DESTINSTALLVENDORBIN) \
	$(INST_SCRIPT) $(DESTINSTALLVENDORSCRIPT) \
	$(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \
	$(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR)

doc_perl_install :: all

doc_site_install :: all
	$(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLSITEARCH)/perllocal.pod
	-$(NOECHO) umask 02; $(MKPATH) $(DESTINSTALLSITEARCH)
	-$(NOECHO) umask 02; $(DOC_INSTALL) \
	"Module" "$(NAME)" \
	"installed into" "$(INSTALLSITELIB)" \
	LINKTYPE "$(LINKTYPE)" \
	VERSION "$(VERSION)" \
	EXE_FILES "$(EXE_FILES)" \
	>> $(DESTINSTALLSITEARCH)/perllocal.pod

doc_vendor_install :: all


uninstall :: uninstall_from_$(INSTALLDIRS)dirs
	$(NOECHO) $(NOOP)

uninstall_from_perldirs ::

uninstall_from_sitedirs ::
	$(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist

uninstall_from_vendordirs ::



# --- MakeMaker force section:
# Phony target to force checking subdirectories.
FORCE :
	$(NOECHO) $(NOOP)


# --- MakeMaker perldepend section:

PERL_HDRS = \
	$(PERL_INC)/EXTERN.h \
	$(PERL_INC)/INTERN.h \
	$(PERL_INC)/XSUB.h \
	$(PERL_INC)/av.h \
	$(PERL_INC)/cc_runtime.h \
	$(PERL_INC)/config.h \
	$(PERL_INC)/cop.h \
	$(PERL_INC)/cv.h \
	$(PERL_INC)/dosish.h \
	$(PERL_INC)/embed.h \
	$(PERL_INC)/embedvar.h \
	$(PERL_INC)/fakethr.h \
	$(PERL_INC)/form.h \
	$(PERL_INC)/gv.h \
	$(PERL_INC)/handy.h \
	$(PERL_INC)/hv.h \
	$(PERL_INC)/intrpvar.h \
	$(PERL_INC)/iperlsys.h \
	$(PERL_INC)/keywords.h \
	$(PERL_INC)/mg.h \
	$(PERL_INC)/nostdio.h \
	$(PERL_INC)/op.h \
	$(PERL_INC)/opcode.h \
	$(PERL_INC)/patchlevel.h \
	$(PERL_INC)/perl.h \
	$(PERL_INC)/perlio.h \
	$(PERL_INC)/perlsdio.h \
	$(PERL_INC)/perlsfio.h \
	$(PERL_INC)/perlvars.h \
	$(PERL_INC)/perly.h \
	$(PERL_INC)/pp.h \
	$(PERL_INC)/pp_proto.h \
	$(PERL_INC)/proto.h \
	$(PERL_INC)/regcomp.h \
	$(PERL_INC)/regexp.h \
	$(PERL_INC)/regnodes.h \
	$(PERL_INC)/scope.h \
	$(PERL_INC)/sv.h \
	$(PERL_INC)/thread.h \
	$(PERL_INC)/unixish.h \
	$(PERL_INC)/util.h

$(OBJECT) : $(PERL_HDRS)


# --- MakeMaker makefile section:

$(OBJECT) : $(FIRST_MAKEFILE)

# We take a very conservative approach here, but it's worth it.
# We move Makefile to Makefile.old here to avoid gnu make looping.
$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
	$(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
	$(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
	-$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
	-$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
	- $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
	$(PERLRUN) Makefile.PL
	$(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
	$(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. <=="
	$(FALSE)



# --- MakeMaker staticmake section:

# --- MakeMaker makeaperl section ---
MAP_TARGET = perl
FULLPERL = /usr/bin/perl

$(MAP_TARGET) :: static $(MAKE_APERL_FILE)
	$(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@

$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
	$(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
	$(NOECHO) $(PERLRUNINST) \
	Makefile.PL DIR= \
	MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
	MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=


# --- MakeMaker test section:

TEST_VERBOSE=0
TEST_TYPE=test_$(LINKTYPE)
TEST_FILE = test.pl
TEST_FILES =
TESTDB_SW = -d

testdb :: testdb_$(LINKTYPE)

test :: $(TEST_TYPE) subdirs-test

subdirs-test ::
	$(NOECHO) $(NOOP)


test_dynamic :: pure_all
	PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)

testdb_dynamic :: pure_all
	PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)

test_ : test_dynamic

test_static :: pure_all $(MAP_TARGET)
	PERL_DL_NONLAZY=1 ./$(MAP_TARGET) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)

testdb_static :: pure_all $(MAP_TARGET)
	PERL_DL_NONLAZY=1 ./$(MAP_TARGET) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)



# --- MakeMaker ppd section:
# Creates a PPD (Perl Package Description) for a binary distribution.
ppd :
	$(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="">' > $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <ABSTRACT></ABSTRACT>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <AUTHOR></AUTHOR>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <IMPLEMENTATION>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <ARCHITECTURE NAME="x86_64-linux-gnu-thread-multi-5.12" />' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <CODEBASE HREF="" />' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' </IMPLEMENTATION>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd


# --- MakeMaker pm_to_blib section:

pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
	$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
	CRFPP.pm $(INST_LIB)/CRFPP.pm
	$(NOECHO) $(TOUCH) pm_to_blib


# --- MakeMaker selfdocument section:


# --- MakeMaker postamble section:


# End.
|
519 |
+
clean_subdirs :
|
520 |
+
$(NOECHO) $(NOOP)
|
521 |
+
|
522 |
+
|
523 |
+
# --- MakeMaker clean section:
|
524 |
+
|
525 |
+
# Delete temporary files but do not touch installed files. We don't delete
|
526 |
+
# the Makefile here so a later make realclean still has a makefile to use.
|
527 |
+
|
528 |
+
clean :: clean_subdirs
|
529 |
+
- $(RM_F) \
|
530 |
+
*$(LIB_EXT) core \
|
531 |
+
core.[0-9] $(INST_ARCHAUTODIR)/extralibs.all \
|
532 |
+
core.[0-9][0-9] $(BASEEXT).bso \
|
533 |
+
pm_to_blib.ts core.[0-9][0-9][0-9][0-9] \
|
534 |
+
$(BASEEXT).x $(BOOTSTRAP) \
|
535 |
+
perl$(EXE_EXT) tmon.out \
|
536 |
+
*$(OBJ_EXT) pm_to_blib \
|
537 |
+
$(INST_ARCHAUTODIR)/extralibs.ld blibdirs.ts \
|
538 |
+
core.[0-9][0-9][0-9][0-9][0-9] *perl.core \
|
539 |
+
core.*perl.*.? $(MAKE_APERL_FILE) \
|
540 |
+
perl $(BASEEXT).def \
|
541 |
+
core.[0-9][0-9][0-9] mon.out \
|
542 |
+
lib$(BASEEXT).def perlmain.c \
|
543 |
+
perl.exe so_locations \
|
544 |
+
$(BASEEXT).exp
|
545 |
+
- $(RM_RF) \
|
546 |
+
blib
|
547 |
+
- $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL)
|
548 |
+
|
549 |
+
|
550 |
+
# --- MakeMaker realclean_subdirs section:
|
551 |
+
realclean_subdirs :
|
552 |
+
$(NOECHO) $(NOOP)
|
553 |
+
|
554 |
+
|
555 |
+
# --- MakeMaker realclean section:
|
556 |
+
# Delete temporary files (via clean) and also delete dist files
|
557 |
+
realclean purge :: clean realclean_subdirs
|
558 |
+
- $(RM_F) \
|
559 |
+
$(OBJECT) $(MAKEFILE_OLD) \
|
560 |
+
$(FIRST_MAKEFILE)
|
561 |
+
- $(RM_RF) \
|
562 |
+
$(DISTVNAME)
|
563 |
+
|
564 |
+
|
565 |
+
# --- MakeMaker metafile section:
|
566 |
+
metafile : create_distdir
|
567 |
+
$(NOECHO) $(ECHO) Generating META.yml
|
568 |
+
$(NOECHO) $(ECHO) '--- #YAML:1.0' > META_new.yml
|
569 |
+
$(NOECHO) $(ECHO) 'name: CRFPP' >> META_new.yml
|
570 |
+
$(NOECHO) $(ECHO) 'version: ' >> META_new.yml
|
571 |
+
$(NOECHO) $(ECHO) 'abstract: ~' >> META_new.yml
|
572 |
+
$(NOECHO) $(ECHO) 'author: []' >> META_new.yml
|
573 |
+
$(NOECHO) $(ECHO) 'license: unknown' >> META_new.yml
|
574 |
+
$(NOECHO) $(ECHO) 'distribution_type: module' >> META_new.yml
|
575 |
+
$(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml
|
576 |
+
$(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml
|
577 |
+
$(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml
|
578 |
+
$(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml
|
579 |
+
$(NOECHO) $(ECHO) 'requires: {}' >> META_new.yml
|
580 |
+
$(NOECHO) $(ECHO) 'no_index:' >> META_new.yml
|
581 |
+
$(NOECHO) $(ECHO) ' directory:' >> META_new.yml
|
582 |
+
$(NOECHO) $(ECHO) ' - t' >> META_new.yml
|
583 |
+
$(NOECHO) $(ECHO) ' - inc' >> META_new.yml
|
584 |
+
$(NOECHO) $(ECHO) 'generated_by: ExtUtils::MakeMaker version 6.56' >> META_new.yml
|
585 |
+
$(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml
|
586 |
+
$(NOECHO) $(ECHO) ' url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml
|
587 |
+
$(NOECHO) $(ECHO) ' version: 1.4' >> META_new.yml
|
588 |
+
-$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml
|
589 |
+
|
590 |
+
|
591 |
+
# --- MakeMaker signature section:
|
592 |
+
signature :
|
593 |
+
cpansign -s
|
594 |
+
|
595 |
+
|
596 |
+
# --- MakeMaker dist_basics section:
|
597 |
+
distclean :: realclean distcheck
|
598 |
+
$(NOECHO) $(NOOP)
|
599 |
+
|
600 |
+
distcheck :
|
601 |
+
$(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck
|
602 |
+
|
603 |
+
skipcheck :
|
604 |
+
$(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck
|
605 |
+
|
606 |
+
manifest :
|
607 |
+
$(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest
|
608 |
+
|
609 |
+
veryclean : realclean
|
610 |
+
$(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old
|
611 |
+
|
612 |
+
|
613 |
+
|
614 |
+
# --- MakeMaker dist_core section:
|
615 |
+
|
616 |
+
dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE)
|
617 |
+
$(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \
|
618 |
+
-e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' --
|
619 |
+
|
620 |
+
tardist : $(DISTVNAME).tar$(SUFFIX)
|
621 |
+
$(NOECHO) $(NOOP)
|
622 |
+
|
623 |
+
uutardist : $(DISTVNAME).tar$(SUFFIX)
|
624 |
+
uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu
|
625 |
+
|
626 |
+
$(DISTVNAME).tar$(SUFFIX) : distdir
|
627 |
+
$(PREOP)
|
628 |
+
$(TO_UNIX)
|
629 |
+
$(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME)
|
630 |
+
$(RM_RF) $(DISTVNAME)
|
631 |
+
$(COMPRESS) $(DISTVNAME).tar
|
632 |
+
$(POSTOP)
|
633 |
+
|
634 |
+
zipdist : $(DISTVNAME).zip
|
635 |
+
$(NOECHO) $(NOOP)
|
636 |
+
|
637 |
+
$(DISTVNAME).zip : distdir
|
638 |
+
$(PREOP)
|
639 |
+
$(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME)
|
640 |
+
$(RM_RF) $(DISTVNAME)
|
641 |
+
$(POSTOP)
|
642 |
+
|
643 |
+
shdist : distdir
|
644 |
+
$(PREOP)
|
645 |
+
$(SHAR) $(DISTVNAME) > $(DISTVNAME).shar
|
646 |
+
$(RM_RF) $(DISTVNAME)
|
647 |
+
$(POSTOP)
|
648 |
+
|
649 |
+
|
650 |
+
# --- MakeMaker distdir section:
|
651 |
+
create_distdir :
|
652 |
+
$(RM_RF) $(DISTVNAME)
|
653 |
+
$(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \
|
654 |
+
-e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');"
|
655 |
+
|
656 |
+
distdir : create_distdir distmeta
|
657 |
+
$(NOECHO) $(NOOP)
|
658 |
+
|
659 |
+
|
660 |
+
|
661 |
+
# --- MakeMaker dist_test section:
|
662 |
+
disttest : distdir
|
663 |
+
cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL
|
664 |
+
cd $(DISTVNAME) && $(MAKE) $(PASTHRU)
|
665 |
+
cd $(DISTVNAME) && $(MAKE) test $(PASTHRU)
|
666 |
+
|
667 |
+
|
668 |
+
|
669 |
+
# --- MakeMaker dist_ci section:
|
670 |
+
|
671 |
+
ci :
|
672 |
+
$(PERLRUN) "-MExtUtils::Manifest=maniread" \
|
673 |
+
-e "@all = keys %{ maniread() };" \
|
674 |
+
-e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \
|
675 |
+
-e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});"
|
676 |
+
|
677 |
+
|
678 |
+
# --- MakeMaker distmeta section:
|
679 |
+
distmeta : create_distdir metafile
|
680 |
+
$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{META.yml} => q{Module meta-data (added by MakeMaker)}}) } ' \
|
681 |
+
-e ' or print "Could not add META.yml to MANIFEST: $${'\''@'\''}\n"' --
|
682 |
+
|
683 |
+
|
684 |
+
|
685 |
+
# --- MakeMaker distsignature section:
|
686 |
+
distsignature : create_distdir
|
687 |
+
$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) } ' \
|
688 |
+
-e ' or print "Could not add SIGNATURE to MANIFEST: $${'\''@'\''}\n"' --
|
689 |
+
$(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE
|
690 |
+
cd $(DISTVNAME) && cpansign -s
|
691 |
+
|
692 |
+
|
693 |
+
|
694 |
+
# --- MakeMaker install section:
|
695 |
+
|
696 |
+
install :: pure_install doc_install
|
697 |
+
$(NOECHO) $(NOOP)
|
698 |
+
|
699 |
+
install_perl :: pure_perl_install doc_perl_install
|
700 |
+
$(NOECHO) $(NOOP)
|
701 |
+
|
702 |
+
install_site :: pure_site_install doc_site_install
|
703 |
+
$(NOECHO) $(NOOP)
|
704 |
+
|
705 |
+
install_vendor :: pure_vendor_install doc_vendor_install
|
706 |
+
$(NOECHO) $(NOOP)
|
707 |
+
|
708 |
+
pure_install :: pure_$(INSTALLDIRS)_install
|
709 |
+
$(NOECHO) $(NOOP)
|
710 |
+
|
711 |
+
doc_install :: doc_$(INSTALLDIRS)_install
|
712 |
+
$(NOECHO) $(NOOP)
|
713 |
+
|
714 |
+
pure__install : pure_site_install
|
715 |
+
$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
|
716 |
+
|
717 |
+
doc__install : doc_site_install
|
718 |
+
$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
|
719 |
+
|
720 |
+
pure_perl_install :: all
|
721 |
+
$(NOECHO) umask 022; $(MOD_INSTALL) \
|
722 |
+
$(INST_LIB) $(DESTINSTALLPRIVLIB) \
|
723 |
+
$(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \
|
724 |
+
$(INST_BIN) $(DESTINSTALLBIN) \
|
725 |
+
$(INST_SCRIPT) $(DESTINSTALLSCRIPT) \
|
726 |
+
$(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \
|
727 |
+
$(INST_MAN3DIR) $(DESTINSTALLMAN3DIR)
|
728 |
+
$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
|
729 |
+
$(SITEARCHEXP)/auto/$(FULLEXT)
|
730 |
+
|
731 |
+
|
732 |
+
pure_site_install :: all
|
733 |
+
$(NOECHO) umask 02; $(MOD_INSTALL) \
|
734 |
+
read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \
|
735 |
+
write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \
|
736 |
+
$(INST_LIB) $(DESTINSTALLSITELIB) \
|
737 |
+
$(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \
|
738 |
+
$(INST_BIN) $(DESTINSTALLSITEBIN) \
|
739 |
+
$(INST_SCRIPT) $(DESTINSTALLSITESCRIPT) \
|
740 |
+
$(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \
|
741 |
+
$(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR)
|
742 |
+
$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
|
743 |
+
$(PERL_ARCHLIB)/auto/$(FULLEXT)
|
744 |
+
|
745 |
+
pure_vendor_install :: all
|
746 |
+
$(NOECHO) umask 022; $(MOD_INSTALL) \
|
747 |
+
$(INST_LIB) $(DESTINSTALLVENDORLIB) \
|
748 |
+
$(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \
|
749 |
+
$(INST_BIN) $(DESTINSTALLVENDORBIN) \
|
750 |
+
$(INST_SCRIPT) $(DESTINSTALLVENDORSCRIPT) \
|
751 |
+
$(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \
|
752 |
+
$(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR)
|
753 |
+
|
754 |
+
doc_perl_install :: all
|
755 |
+
|
756 |
+
doc_site_install :: all
|
757 |
+
$(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLSITEARCH)/perllocal.pod
|
758 |
+
-$(NOECHO) umask 02; $(MKPATH) $(DESTINSTALLSITEARCH)
|
759 |
+
-$(NOECHO) umask 02; $(DOC_INSTALL) \
|
760 |
+
"Module" "$(NAME)" \
|
761 |
+
"installed into" "$(INSTALLSITELIB)" \
|
762 |
+
LINKTYPE "$(LINKTYPE)" \
|
763 |
+
VERSION "$(VERSION)" \
|
764 |
+
EXE_FILES "$(EXE_FILES)" \
|
765 |
+
>> $(DESTINSTALLSITEARCH)/perllocal.pod
|
766 |
+
|
767 |
+
doc_vendor_install :: all
|
768 |
+
|
769 |
+
|
770 |
+
uninstall :: uninstall_from_$(INSTALLDIRS)dirs
|
771 |
+
$(NOECHO) $(NOOP)
|
772 |
+
|
773 |
+
uninstall_from_perldirs ::
|
774 |
+
|
775 |
+
uninstall_from_sitedirs ::
|
776 |
+
$(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist
|
777 |
+
|
778 |
+
uninstall_from_vendordirs ::
|
779 |
+
|
780 |
+
|
781 |
+
|
782 |
+
# --- MakeMaker force section:
|
783 |
+
# Phony target to force checking subdirectories.
|
784 |
+
FORCE :
|
785 |
+
$(NOECHO) $(NOOP)
|
786 |
+
|
787 |
+
|
788 |
+
# --- MakeMaker perldepend section:
|
789 |
+
|
790 |
+
PERL_HDRS = \
|
791 |
+
$(PERL_INC)/EXTERN.h \
|
792 |
+
$(PERL_INC)/INTERN.h \
|
793 |
+
$(PERL_INC)/XSUB.h \
|
794 |
+
$(PERL_INC)/av.h \
|
795 |
+
$(PERL_INC)/cc_runtime.h \
|
796 |
+
$(PERL_INC)/config.h \
|
797 |
+
$(PERL_INC)/cop.h \
|
798 |
+
$(PERL_INC)/cv.h \
|
799 |
+
$(PERL_INC)/dosish.h \
|
800 |
+
$(PERL_INC)/embed.h \
|
801 |
+
$(PERL_INC)/embedvar.h \
|
802 |
+
$(PERL_INC)/fakethr.h \
|
803 |
+
$(PERL_INC)/form.h \
|
804 |
+
$(PERL_INC)/gv.h \
|
805 |
+
$(PERL_INC)/handy.h \
|
806 |
+
$(PERL_INC)/hv.h \
|
807 |
+
$(PERL_INC)/intrpvar.h \
|
808 |
+
$(PERL_INC)/iperlsys.h \
|
809 |
+
$(PERL_INC)/keywords.h \
|
810 |
+
$(PERL_INC)/mg.h \
|
811 |
+
$(PERL_INC)/nostdio.h \
|
812 |
+
$(PERL_INC)/op.h \
|
813 |
+
$(PERL_INC)/opcode.h \
|
814 |
+
$(PERL_INC)/patchlevel.h \
|
815 |
+
$(PERL_INC)/perl.h \
|
816 |
+
$(PERL_INC)/perlio.h \
|
817 |
+
$(PERL_INC)/perlsdio.h \
|
818 |
+
$(PERL_INC)/perlsfio.h \
|
819 |
+
$(PERL_INC)/perlvars.h \
|
820 |
+
$(PERL_INC)/perly.h \
|
821 |
+
$(PERL_INC)/pp.h \
|
822 |
+
$(PERL_INC)/pp_proto.h \
|
823 |
+
$(PERL_INC)/proto.h \
|
824 |
+
$(PERL_INC)/regcomp.h \
|
825 |
+
$(PERL_INC)/regexp.h \
|
826 |
+
$(PERL_INC)/regnodes.h \
|
827 |
+
$(PERL_INC)/scope.h \
|
828 |
+
$(PERL_INC)/sv.h \
|
829 |
+
$(PERL_INC)/thread.h \
|
830 |
+
$(PERL_INC)/unixish.h \
|
831 |
+
$(PERL_INC)/util.h
|
832 |
+
|
833 |
+
$(OBJECT) : $(PERL_HDRS)
|
834 |
+
|
835 |
+
|
836 |
+
# --- MakeMaker makefile section:
|
837 |
+
|
838 |
+
$(OBJECT) : $(FIRST_MAKEFILE)
|
839 |
+
|
840 |
+
# We take a very conservative approach here, but it's worth it.
|
841 |
+
# We move Makefile to Makefile.old here to avoid gnu make looping.
|
842 |
+
$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
|
843 |
+
$(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
|
844 |
+
$(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
|
845 |
+
-$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
|
846 |
+
-$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
|
847 |
+
- $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
|
848 |
+
$(PERLRUN) Makefile.PL
|
849 |
+
$(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
|
850 |
+
$(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. <=="
|
851 |
+
$(FALSE)
|
852 |
+
|
853 |
+
|
854 |
+
|
855 |
+
# --- MakeMaker staticmake section:
|
856 |
+
|
857 |
+
# --- MakeMaker makeaperl section ---
|
858 |
+
MAP_TARGET = perl
|
859 |
+
FULLPERL = /usr/bin/perl
|
860 |
+
|
861 |
+
$(MAP_TARGET) :: static $(MAKE_APERL_FILE)
|
862 |
+
$(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@
|
863 |
+
|
864 |
+
$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
|
865 |
+
$(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
|
866 |
+
$(NOECHO) $(PERLRUNINST) \
|
867 |
+
Makefile.PL DIR= \
|
868 |
+
MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
|
869 |
+
MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=
|
870 |
+
|
871 |
+
|
872 |
+
# --- MakeMaker test section:
|
873 |
+
|
874 |
+
TEST_VERBOSE=0
|
875 |
+
TEST_TYPE=test_$(LINKTYPE)
|
876 |
+
TEST_FILE = test.pl
|
877 |
+
TEST_FILES =
|
878 |
+
TESTDB_SW = -d
|
879 |
+
|
880 |
+
testdb :: testdb_$(LINKTYPE)
|
881 |
+
|
882 |
+
test :: $(TEST_TYPE) subdirs-test
|
883 |
+
|
884 |
+
subdirs-test ::
|
885 |
+
$(NOECHO) $(NOOP)
|
886 |
+
|
887 |
+
|
888 |
+
test_dynamic :: pure_all
|
889 |
+
PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
|
890 |
+
|
891 |
+
testdb_dynamic :: pure_all
|
892 |
+
PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
|
893 |
+
|
894 |
+
test_ : test_dynamic
|
895 |
+
|
896 |
+
test_static :: pure_all $(MAP_TARGET)
|
897 |
+
PERL_DL_NONLAZY=1 ./$(MAP_TARGET) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
|
898 |
+
|
899 |
+
testdb_static :: pure_all $(MAP_TARGET)
|
900 |
+
PERL_DL_NONLAZY=1 ./$(MAP_TARGET) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
|
901 |
+
|
902 |
+
|
903 |
+
|
904 |
+
# --- MakeMaker ppd section:
|
905 |
+
# Creates a PPD (Perl Package Description) for a binary distribution.
|
906 |
+
ppd :
|
907 |
+
$(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="">' > $(DISTNAME).ppd
|
908 |
+
$(NOECHO) $(ECHO) ' <ABSTRACT></ABSTRACT>' >> $(DISTNAME).ppd
|
909 |
+
$(NOECHO) $(ECHO) ' <AUTHOR></AUTHOR>' >> $(DISTNAME).ppd
|
910 |
+
$(NOECHO) $(ECHO) ' <IMPLEMENTATION>' >> $(DISTNAME).ppd
|
911 |
+
$(NOECHO) $(ECHO) ' <ARCHITECTURE NAME="x86_64-linux-gnu-thread-multi-5.12" />' >> $(DISTNAME).ppd
|
912 |
+
$(NOECHO) $(ECHO) ' <CODEBASE HREF="" />' >> $(DISTNAME).ppd
|
913 |
+
$(NOECHO) $(ECHO) ' </IMPLEMENTATION>' >> $(DISTNAME).ppd
|
914 |
+
$(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd
|
915 |
+
|
916 |
+
|
917 |
+
# --- MakeMaker pm_to_blib section:
|
918 |
+
|
919 |
+
pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
|
920 |
+
$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
|
921 |
+
CRFPP.pm $(INST_LIB)/CRFPP.pm
|
922 |
+
$(NOECHO) $(TOUCH) pm_to_blib
|
923 |
+
|
924 |
+
|
925 |
+
# --- MakeMaker selfdocument section:
|
926 |
+
|
927 |
+
|
928 |
+
# --- MakeMaker postamble section:
|
929 |
+
|
930 |
+
|
931 |
+
# End.
|
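The metafile target above assembles META.yml one $(ECHO) append at a time. As a cross-check, a minimal Python sketch that produces an equivalent file (assumes PyYAML is installed; the field values are copied from the echo lines above):

import yaml  # PyYAML; assumed available, not part of this repo

# Same metadata the metafile target writes via shell redirection.
meta = {
    'name': 'CRFPP',
    'version': '',
    'abstract': None,
    'author': [],
    'license': 'unknown',
    'distribution_type': 'module',
    'configure_requires': {'ExtUtils::MakeMaker': 0},
    'build_requires': {'ExtUtils::MakeMaker': 0},
    'requires': {},
    'no_index': {'directory': ['t', 'inc']},
    'generated_by': 'ExtUtils::MakeMaker version 6.56',
    'meta-spec': {'url': 'http://module-build.sourceforge.net/META-spec-v1.4.html',
                  'version': 1.4},
}

with open('META_new.yml', 'w') as f:
    yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False)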
CRF/ruby/Makefile
CHANGED
@@ -1,157 +1,157 @@
SHELL = /bin/sh

#### Start of system configuration section. ####

srcdir = .
topdir = /usr/lib/ruby/1.8/x86_64-linux
hdrdir = $(topdir)
VPATH = $(srcdir):$(topdir):$(hdrdir)
exec_prefix = $(prefix)
prefix = $(DESTDIR)/usr
sharedstatedir = $(prefix)/com
mandir = $(prefix)/share/man
psdir = $(docdir)
oldincludedir = $(DESTDIR)/usr/include
localedir = $(datarootdir)/locale
bindir = $(exec_prefix)/bin
libexecdir = $(prefix)/lib/ruby1.8
sitedir = $(DESTDIR)/usr/local/lib/site_ruby
htmldir = $(docdir)
vendorarchdir = $(vendorlibdir)/$(sitearch)
includedir = $(prefix)/include
infodir = $(prefix)/share/info
vendorlibdir = $(vendordir)/$(ruby_version)
sysconfdir = $(DESTDIR)/etc
libdir = $(exec_prefix)/lib
sbindir = $(exec_prefix)/sbin
rubylibdir = $(libdir)/ruby/$(ruby_version)
docdir = $(datarootdir)/doc/$(PACKAGE)
dvidir = $(docdir)
vendordir = $(libdir)/ruby/vendor_ruby
datarootdir = $(prefix)/share
pdfdir = $(docdir)
archdir = $(rubylibdir)/$(arch)
sitearchdir = $(sitelibdir)/$(sitearch)
datadir = $(datarootdir)
localstatedir = $(DESTDIR)/var
sitelibdir = $(sitedir)/$(ruby_version)

CC = gcc
LIBRUBY = $(LIBRUBY_SO)
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static

RUBY_EXTCONF_H =
CFLAGS = -fPIC -fno-strict-aliasing -g -g -O2 -fPIC $(cflags)
INCFLAGS = -I. -I. -I/usr/lib/ruby/1.8/x86_64-linux -I.
DEFS =
CPPFLAGS = -DHAVE_CRFPP_H
CXXFLAGS = $(CFLAGS)
ldflags = -L. -Wl,-Bsymbolic-functions -rdynamic -Wl,-export-dynamic
dldflags =
archflag =
DLDFLAGS = $(ldflags) $(dldflags) $(archflag)
LDSHARED = $(CC) -shared
AR = ar
EXEEXT =

RUBY_INSTALL_NAME = ruby1.8
RUBY_SO_NAME = ruby1.8
arch = x86_64-linux
sitearch = x86_64-linux
ruby_version = 1.8
ruby = /usr/bin/ruby1.8
RUBY = $(ruby)
RM = rm -f
MAKEDIRS = mkdir -p
INSTALL = /usr/bin/install -c
INSTALL_PROG = $(INSTALL) -m 0755
INSTALL_DATA = $(INSTALL) -m 644
COPY = cp

#### End of system configuration section. ####

preload =

libpath = . $(libdir)
LIBPATH = -L. -L$(libdir)
DEFFILE =

CLEANFILES = mkmf.log
DISTCLEANFILES =

extout =
extout_prefix =
target_prefix =
LOCAL_LIBS =
LIBS = $(LIBRUBYARG_SHARED) -lpthread -lcrfpp -lpthread -lrt -ldl -lcrypt -lm -lc
SRCS = CRFPP_wrap.cpp
OBJS = CRFPP_wrap.o
TARGET = CRFPP
DLLIB = $(TARGET).so
EXTSTATIC =
STATIC_LIB =

BINDIR = $(bindir)
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
RUBYARCHDIR = $(sitearchdir)$(target_prefix)

TARGET_SO = $(DLLIB)
CLEANLIBS = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak

all: $(DLLIB)
static: $(STATIC_LIB)

clean:
	@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)

distclean: clean
	@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
	@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)

realclean: distclean
install: install-so install-rb

install-so: $(RUBYARCHDIR)
install-so: $(RUBYARCHDIR)/$(DLLIB)
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
install-rb: pre-install-rb install-rb-default
install-rb-default: pre-install-rb-default
pre-install-rb: Makefile
pre-install-rb-default: Makefile
$(RUBYARCHDIR):
	$(MAKEDIRS) $@

site-install: site-install-so site-install-rb
site-install-so: install-so
site-install-rb: install-rb

.SUFFIXES: .c .m .cc .cxx .cpp .C .o

.cc.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.cxx.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.cpp.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.C.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.c.o:
	$(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<

$(DLLIB): $(OBJS) Makefile
	@-$(RM) $@
	$(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)

$(OBJS): ruby.h defines.h
CRF/winmain.h
CHANGED
@@ -1,69 +1,69 @@
//
//  CRF++ -- Yet Another CRF toolkit
//
//  $Id: common.h 1588 2007-02-12 09:03:39Z taku $;
//
//  Copyright(C) 2005-2007 Taku Kudo <taku@chasen.org>
//
#if defined(_WIN32) || defined(__CYGWIN__)

#include <windows.h>
#include <string>

namespace {
class CommandLine {
 public:
  CommandLine(int argc, wchar_t **argv) : argc_(argc), argv_(0) {
    argv_ = new char * [argc_];
    for (int i = 0; i < argc_; ++i) {
      const std::string arg = WideToUtf8(argv[i]);
      argv_[i] = new char[arg.size() + 1];
      ::memcpy(argv_[i], arg.data(), arg.size());
      argv_[i][arg.size()] = '\0';
    }
  }
  ~CommandLine() {
    for (int i = 0; i < argc_; ++i) {
      delete [] argv_[i];
    }
    delete [] argv_;
  }

  int argc() const { return argc_; }
  char **argv() const { return argv_; }

 private:
  static std::string WideToUtf8(const std::wstring &input) {
    const int output_length = ::WideCharToMultiByte(CP_UTF8, 0,
                                                    input.c_str(), -1, NULL, 0,
                                                    NULL, NULL);
    if (output_length == 0) {
      return "";
    }

    char *input_encoded = new char[output_length + 1];
    const int result = ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1,
                                             input_encoded,
                                             output_length + 1, NULL, NULL);
    std::string output;
    if (result > 0) {
      output.assign(input_encoded);
    }
    delete [] input_encoded;
    return output;
  }

  int argc_;
  char **argv_;
};
}  // namespace

#define main(argc, argv) wmain_to_main_wrapper(argc, argv)

int wmain_to_main_wrapper(int argc, char **argv);

int wmain(int argc, wchar_t **argv) {
  CommandLine cmd(argc, argv);
  return wmain_to_main_wrapper(cmd.argc(), cmd.argv());
}
#endif
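CommandLine::WideToUtf8 above uses the standard two-pass WideCharToMultiByte idiom: a first call with a NULL buffer to learn the required size, then a second call to do the conversion. A minimal Python sketch of the same idiom (assumes Windows with ctypes; this is an illustration only, not part of the toolkit):

import ctypes  # assumption: running on Windows, where windll is available

CP_UTF8 = 65001  # same code page constant the C++ code passes as CP_UTF8

def wide_to_utf8(ws):
    k32 = ctypes.windll.kernel32
    # Pass 1: NULL output buffer -> required size in bytes (incl. terminator).
    n = k32.WideCharToMultiByte(CP_UTF8, 0, ws, -1, None, 0, None, None)
    if n == 0:
        return b''
    buf = ctypes.create_string_buffer(n)
    # Pass 2: perform the actual conversion into the sized buffer.
    k32.WideCharToMultiByte(CP_UTF8, 0, ws, -1, buf, n, None, None)
    return buf.value

# wide_to_utf8(u'CRF++') -> b'CRF++'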
GeneNER_SpeAss_run.py
CHANGED
@@ -1,746 +1,746 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 8 09:26:57 2022

@author: luol2

Pipeline: first gene NER, then species assignment.
Input: BioC XML file with species NER annotations.
Output: BioC XML file with gene NER and species assignment results.
"""
import argparse
import os
import io
import time
import sys
import re
import shutil
from src_python.GeneNER import model_ner,ner_tag
from src_python.SpeAss import model_sa,sa_tag

import tensorflow as tf

import bioc
import stanza
nlp_token = stanza.Pipeline(model_dir='gnorm_trained_models/stanza', lang='en', processors={'tokenize': 'spacy'}, package='None', download_method=None)  #package='craft' ;./gnorm_trained_models/stanza
def NER_BioC(infolder,infile,outpath,nn_model):

    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        with open(outpath+"/"+infile,'w', encoding='utf8') as fout:
            collection = bioc.load(fin)

            Total_n=len(collection.documents)
            print('Total number of sub-documents:', Total_n)
            pmid_n=0
            for document in collection.documents:
                print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
                pmid_n+=1
                # print(document.id)
                mention_num_new=0
                for passage in document.passages:
                    if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':  # has text and is not a reference passage
                        passage_offset=passage.offset
                        tag_result=ner_tag.ML_Tag(passage.text,nn_model,nlp_token)
                        mention_num=0
                        for ele in tag_result:
                            bioc_note = bioc.BioCAnnotation()
                            bioc_note.id = str(mention_num)
                            mention_num+=1
                            bioc_note.infons['type'] = ele[2]
                            start = int(ele[0])
                            last = int(ele[1])
                            loc = bioc.BioCLocation(offset=str(passage_offset+start), length=str(last-start))
                            bioc_note.locations.append(loc)
                            bioc_note.text = passage.text[start:last]
                            passage.annotations.append(bioc_note)
                    # renumber annotation ids so they are unique within the document
                    for temp_annotation in passage.annotations:
                        temp_annotation.id=str(mention_num_new)
                        mention_num_new+=1
            bioc.dump(collection, fout, pretty_print=True)

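NER_BioC stores document-level offsets: the tagger returns passage-local spans, so passage.offset is added back before the BioCLocation is built. The arithmetic in isolation (the numbers are made up):

passage_offset = 120    # document offset of the passage text
start, last = 5, 10     # passage-local span from ner_tag.ML_Tag
offset, length = passage_offset + start, last - start
print(offset, length)   # 125 5 -> the BioCLocation written above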
def NER_PubTator(infolder,infile,outpath,nn_model):
    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        with open(outpath+"/"+infile,'w', encoding='utf-8') as fout:
            title=''
            abstract=''
            all_text=fin.read().strip().split('\n\n')
            Total_n=len(all_text)
            print('Total number of sub-documents:', Total_n)
            pmid_n=0
            for doc in all_text:
                print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
                pmid_n+=1
                lines = doc.split('\n')
                seg=lines[0].split('|t|')
                pmid=seg[0]
                title=""
                if len(seg)>1:
                    title=seg[1]
                abstract=""
                if len(lines)>1:
                    seg=lines[1].split('|a|')
                    # only take the abstract when the '|a|' separator is present
                    if len(seg)>1:
                        abstract=seg[1]

                intext=title+' '+abstract
                tag_result=ner_tag.ML_Tag(intext,nn_model,nlp_token)
                fout.write(doc+'\n')
                for ele in tag_result:
                    ent_start = ele[0]
                    ent_last = ele[1]
                    ent_mention = intext[int(ele[0]):int(ele[1])]
                    ent_type=ele[2]
                    fout.write(pmid+"\t"+ent_start+"\t"+ent_last+"\t"+ent_mention+"\t"+ent_type+"\n")
                fout.write('\n')
                title=''
                abstract=''

|
102 |
-
|
103 |
-
print('loading NER models........')
|
104 |
-
|
105 |
-
if modelfile.lower().find('bioformer')>=0:
|
106 |
-
vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
|
107 |
-
'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/', #bioformer-cased-v1.0
|
108 |
-
'lowercase':False,
|
109 |
-
}
|
110 |
-
else:
|
111 |
-
vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
|
112 |
-
'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
|
113 |
-
'lowercase':True,
|
114 |
-
}
|
115 |
-
|
116 |
-
nn_model=model_ner.HUGFACE_NER(vocabfiles)
|
117 |
-
nn_model.build_encoder()
|
118 |
-
nn_model.build_softmax_decoder()
|
119 |
-
nn_model.load_model(modelfile)
|
120 |
-
|
121 |
-
#tagging text
|
122 |
-
print("begin GeneNER tagging........")
|
123 |
-
start_time=time.time()
|
124 |
-
|
125 |
-
for infile in os.listdir(infolder):
|
126 |
-
if os.path.isfile(outpath+"/"+infile):
|
127 |
-
print(infile+' has exsited.')
|
128 |
-
else:
|
129 |
-
print('processing:',infile)
|
130 |
-
fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
|
131 |
-
input_format=""
|
132 |
-
for line in fin:
|
133 |
-
pattern_bioc = re.compile('.*<collection>.*')
|
134 |
-
pattern_pubtator = re.compile('^([^\|]+)\|[^\|]+\|(.*)')
|
135 |
-
if pattern_bioc.search(line):
|
136 |
-
input_format="BioC"
|
137 |
-
break
|
138 |
-
elif pattern_pubtator.search(line):
|
139 |
-
input_format="PubTator"
|
140 |
-
break
|
141 |
-
fin.close()
|
142 |
-
if(input_format == "PubTator"):
|
143 |
-
NER_PubTator(infolder,infile,outpath,nn_model)
|
144 |
-
elif(input_format == "BioC"):
|
145 |
-
NER_BioC(infolder,infile,outpath,nn_model)
|
146 |
-
|
147 |
-
print('tag done:',time.time()-start_time)
|
148 |
-
|
149 |
-
|
150 |
-
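geneNER decides the input format by sniffing the first line that matches either pattern. A standalone check of the two sniffers (the sample lines are made up):

import re

pattern_bioc = re.compile('.*<collection>.*')
pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')

assert pattern_bioc.search('  <collection>')                            # -> BioC
assert pattern_pubtator.search('25000001|t|BRCA1 mutations in cancer')  # -> PubTator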
#SA for bioc format
def SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict):

    #BioC xml to pubtator:
    # pmid|t|text1
    # pmid|a|text2
    # pmid sid eid entity_txt entity_type entity_id (the id column is blank for genes)
    fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
    # fout_pubtator=open(outpath+'tmp/input_xml.pubtator','w', encoding='utf-8')
    fin_pubtator0=io.StringIO()  #no *species
    fin_pubtator1=io.StringIO()  #one *species
    fin_pubtator2=io.StringIO()  #two or more *species
    collection = bioc.load(fin)
    fin.close()
    ori_ann_index={}     #{'pmid':{'ent.id':'ent_s-ent_e'}}
    species_count={}     #{pmid:{speid:num}}
    gene_set=['Gene','FamilyName']
    final_sa_results={}  #{'pmid':{'entity_id':species_id}}
    for document in collection.documents:
        doc_pmid=document.id
        doc_title=''
        doc_abstract=''
        doc_annotation=[]
        _ann_index={}
        _species_num={}  #{*speciesid:num}
        _gene_num=0
        _passage_num=0
        if len(document.passages)<=2:  #abstract xml, or PMC xml with title only
            for passage in document.passages:
                passage_offset=passage.offset
                _passage_num+=1
                #if passage.infons['type']=='title' or passage.infons['type']=='front':
                if _passage_num==1:
                    doc_title=passage.text
                    for temp_annotation in passage.annotations:
                        if temp_annotation.infons['type'] in gene_set:
                            _gene_num+=1
                        ent_start=temp_annotation.locations[0].offset-passage_offset
                        ent_end=ent_start+temp_annotation.locations[0].length
                        _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
                        if 'Identifier' in temp_annotation.infons.keys():
                            species_ID=temp_annotation.infons['Identifier']
                            if species_ID.find('*')>=0:
                                if species_ID not in _species_num.keys():
                                    _species_num[species_ID]=1
                                else:
                                    _species_num[species_ID]+=1
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
                        else:
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])

                #elif passage.infons['type']=='abstract' or passage.infons['type']=='paragraph':
                else:
                    doc_abstract=passage.text
                    for temp_annotation in passage.annotations:
                        if temp_annotation.infons['type'] in gene_set:
                            _gene_num+=1
                        ent_start=len(doc_title)+1+temp_annotation.locations[0].offset-passage_offset
                        ent_end=ent_start+temp_annotation.locations[0].length
                        _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
                        if 'Identifier' in temp_annotation.infons.keys():
                            species_ID=temp_annotation.infons['Identifier']
                            if species_ID.find('*')>=0:
                                if species_ID not in _species_num.keys():
                                    _species_num[species_ID]=1
                                else:
                                    _species_num[species_ID]+=1
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
                        else:
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])

            if len(_species_num)>=2 and _gene_num>0:
                fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
                fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
                for ele in doc_annotation:
                    fin_pubtator2.write(ele+'\n')
                fin_pubtator2.write('\n')
            elif len(_species_num)==1 and _gene_num>0:  #the species can be assigned directly
                fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
                fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
                major_speicesid,=_species_num
                fin_pubtator1.write(major_speicesid[1:]+'\n')
                for ele in doc_annotation:
                    fin_pubtator1.write(ele+'\n')
                fin_pubtator1.write('\n')
            elif len(_species_num)==0 and _gene_num>0:
                fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
                fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
                for ele in doc_annotation:
                    fin_pubtator0.write(ele+'\n')
                fin_pubtator0.write('\n')

        else:  # full-text xml
            for passage in document.passages:
                passage_annotation=[]
                _species_num_passage={}
                _gene_num_passage=0
                passage_offset=passage.offset
                if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':
                    doc_title=passage.text
                    for temp_annotation in passage.annotations:
                        if temp_annotation.infons['type'] in gene_set:
                            _gene_num_passage+=1
                        ent_start=temp_annotation.locations[0].offset-passage_offset
                        ent_end=ent_start+temp_annotation.locations[0].length
                        _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
                        if 'Identifier' in temp_annotation.infons.keys():
                            species_ID=temp_annotation.infons['Identifier']
                            if species_ID.find('*')>=0:
                                if species_ID not in _species_num.keys():
                                    _species_num[species_ID]=1
                                else:
                                    _species_num[species_ID]+=1
                                if species_ID not in _species_num_passage.keys():
                                    _species_num_passage[species_ID]=1
                                else:
                                    _species_num_passage[species_ID]+=1
                            passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
                        else:
                            passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])

                if len(_species_num_passage)>=2 and _gene_num_passage>0:
                    fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
                    fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
                    for ele in passage_annotation:
                        fin_pubtator2.write(ele+'\n')
                    fin_pubtator2.write('\n')
                elif len(_species_num_passage)==1 and _gene_num_passage>0:
                    fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
                    fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
                    major_speicesid,=_species_num_passage
                    fin_pubtator1.write(major_speicesid[1:]+'\n')
                    for ele in passage_annotation:
                        fin_pubtator1.write(ele+'\n')
                    fin_pubtator1.write('\n')
                elif len(_species_num_passage)==0 and _gene_num_passage>0:
                    fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
                    fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
                    for ele in passage_annotation:
                        fin_pubtator0.write(ele+'\n')
                    fin_pubtator0.write('\n')
        # print(ori_ann_index)

        ori_ann_index[doc_pmid]=_ann_index
        species_count[doc_pmid]=_species_num

    cache_geneid={}  #{pmid:{gene1:{id1:num,id2:num}}}

    if fin_pubtator2.getvalue()!='':
        #pubtator format ML tagging
        # print(fin_pubtator2.getvalue())
        ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
        #print(ml_out.getvalue())
        fin_result=io.StringIO(ml_out.getvalue())
        all_in=fin_result.read().strip().split('\n\n')
        #print('+2 species:',len(all_in))
        fin_result.close()

        prefix_speid_allset=set(prefix_dict.keys())

        for doc in all_in:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            _prefix_str2id_dict={}
            doc_species=list(species_count[pmid].keys())
            for _spe_ele in doc_species:
                if _spe_ele[1:] in prefix_speid_allset:
                    for ele in prefix_dict[_spe_ele[1:]]:
                        _prefix_str2id_dict[ele]=_spe_ele[1:]

            for i in range(2,len(lines)):
                segs=lines[i].split('\t')
                if pmid not in final_sa_results.keys():
                    final_sa_results[pmid]={segs[1]:'Focus:'+segs[-1]}
                else:
                    final_sa_results[pmid][segs[1]]='Focus:'+segs[-1]

                if segs[5] in gene_set:
                    if segs[4][0:2] in _prefix_str2id_dict:  #prefix rule
                        #print('prefix rule:', pmid)
                        # print(_prefix_str2id_dict)
                        if pmid not in final_sa_results.keys():
                            final_sa_results[pmid]={segs[1]:'Focus:'+_prefix_str2id_dict[segs[4][0:2]]}
                        else:
                            final_sa_results[pmid][segs[1]]='Focus:'+_prefix_str2id_dict[segs[4][0:2]]
                    if pmid not in cache_geneid.keys():
                        cache_geneid[pmid]={segs[4]:{'Focus:'+segs[-1]:1}}
                    else:
                        if segs[4] not in cache_geneid[pmid].keys():
                            cache_geneid[pmid][segs[4]]={'Focus:'+segs[-1]:1}
                        else:
                            # count votes under the same 'Focus:'-prefixed key used above
                            if 'Focus:'+segs[-1] not in cache_geneid[pmid][segs[4]].keys():
                                cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]=1
                            else:
                                cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]+=1

    #print(final_sa_results)

    #one species
    if fin_pubtator1.getvalue()!='':
        fin_result=io.StringIO(fin_pubtator1.getvalue())
        all_in=fin_result.read().strip().split('\n\n')
        fin_result.close()
        #print('1 species:',len(all_in))
        for doc in all_in:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            major_speicesid=lines[2]
            for i in range(3,len(lines)):
                segs=lines[i].split('\t')
                if len(segs)>=7:  #species
                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:segs[-1]}
                    else:
                        final_sa_results[pmid][segs[1]]=segs[-1]
                else:  #gene
                    marjor_species='Focus:'+major_speicesid
                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:marjor_species}
                    else:
                        final_sa_results[pmid][segs[1]]=marjor_species
                    if pmid not in cache_geneid.keys():
                        cache_geneid[pmid]={segs[4]:{marjor_species:1}}
                    else:
                        if segs[4] not in cache_geneid[pmid].keys():
                            cache_geneid[pmid][segs[4]]={marjor_species:1}
                        else:
                            if marjor_species not in cache_geneid[pmid][segs[4]].keys():
                                cache_geneid[pmid][segs[4]][marjor_species]=1
                            else:
                                cache_geneid[pmid][segs[4]][marjor_species]+=1

    #no species
    fin_result=io.StringIO(fin_pubtator0.getvalue())
    all_in=fin_result.read().strip().split('\n\n')
    fin_result.close()
    #print('no species:',len(all_in))
    for doc in all_in:
        lines=doc.split('\n')
        pmid=lines[0].split('|t|')[0]

        for i in range(2,len(lines)):
            segs=lines[i].split('\t')
            if (pmid in cache_geneid.keys()) and (segs[4] in cache_geneid[pmid].keys()):  #same gene seen elsewhere in doc
                marjor_species = max(zip(cache_geneid[pmid][segs[4]].values(), cache_geneid[pmid][segs[4]].keys()))
                if pmid not in final_sa_results.keys():
                    final_sa_results[pmid]={segs[1]:marjor_species[1]}
                else:
                    final_sa_results[pmid][segs[1]]=marjor_species[1]
            else:  #fall back to the majority species in the doc
                if (pmid in species_count.keys()) and len(species_count[pmid])>0:
                    marjor_species = max(zip(species_count[pmid].values(), species_count[pmid].keys()))

                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:'Focus:'+marjor_species[1][1:]}
                    else:
                        final_sa_results[pmid][segs[1]]='Focus:'+marjor_species[1][1:]
                else:  #no species at all in the doc: assign human
                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:'Focus:9606'}
                    else:
                        final_sa_results[pmid][segs[1]]='Focus:9606'

    # print(final_sa_results)
    fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
    fout_xml=open(outpath+"/"+infile,'w', encoding='utf8')
    collection = bioc.load(fin)
    for document in collection.documents:
        doc_pmid=document.id
        # print(final_sa_results[doc_pmid])
        # print(doc_pmid)
        for passage in document.passages:
            for temp_annotation in passage.annotations:
                if 'Identifier' not in temp_annotation.infons.keys():
                    if temp_annotation.id in final_sa_results[doc_pmid].keys():
                        if final_sa_results[doc_pmid][temp_annotation.id][6:] in virus_set:
                            temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]+',9606'
                            # print('!!! virus:', doc_pmid)
                        else:
                            temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]
                    else:  #fallback when the annotation id is missing (same-text bug)
                        if (doc_pmid in cache_geneid.keys()) and (temp_annotation.text in cache_geneid[doc_pmid].keys()):  #same gene in doc
                            marjor_species = max(zip(cache_geneid[doc_pmid][temp_annotation.text].values(), cache_geneid[doc_pmid][temp_annotation.text].keys()))
                            temp_annotation.infons['Identifier']=marjor_species[1]
                        else:
                            temp_annotation.infons['Identifier']='Focus:9606'
    bioc.dump(collection, fout_xml, pretty_print=True)
    fin.close()
    fout_xml.close()

-
def SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
|
459 |
-
|
460 |
-
|
461 |
-
# pmid|t|text1
|
462 |
-
#pmid|a|text2
|
463 |
-
#pmid entity_id sid eid entity_txt entity_type (gene is blank)
|
464 |
-
fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
|
465 |
-
# fout_pubtator=open(outpath+'tmp/input_xml.pubtator','w', encoding='utf-8')
|
466 |
-
fin_pubtator2=io.StringIO() #two or more species
|
467 |
-
all_in_ori=fin.read().strip().split('\n\n')
|
468 |
-
fin.close()
|
469 |
-
species_gene_count={} #{pmid:{'spec':_species_num;'gene':_gene_num}}
|
470 |
-
gene_set=['Gene','FamilyName']
|
471 |
-
ML_results={} #{'pmid':{'sid-eid':species_id}}
|
472 |
-
|
473 |
-
prefix_speid_allset=set(prefix_dict.keys())
|
474 |
-
|
475 |
-
for document in all_in_ori:
|
476 |
-
lines=document.split('\n')
|
477 |
-
doc_pmid=lines[0].split('|t|')[0]
|
478 |
-
doc_title=lines[0].split('|t|')[1]
|
479 |
-
doc_abstract=lines[1].split('|a|')[1]
|
480 |
-
doc_annotation=[]
|
481 |
-
_species_num=set() #(*speciesid)
|
482 |
-
_gene_num=0
|
483 |
-
_ML_gene_num=0
|
484 |
-
_entity_num=0
|
485 |
-
_prefix_str2id_dict={} #{prestr:id}
|
486 |
-
for i in range(2,len(lines)):
|
487 |
-
segs=lines[i].split('\t')
|
488 |
-
if segs[4] in gene_set:
|
489 |
-
_gene_num+=1
|
490 |
-
if len(segs)>=6: #species
|
491 |
-
doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
|
492 |
-
species_ID=segs[-1]
|
493 |
-
if species_ID.find('*')>=0:
|
494 |
-
_species_num.add(species_ID)
|
495 |
-
if species_ID[1:] in prefix_speid_allset:
|
496 |
-
for ele in prefix_dict[species_ID[1:]]:
|
497 |
-
_prefix_str2id_dict[ele]=species_ID[1:]
|
498 |
-
else: #gene
|
499 |
-
if segs[3][0:2] in _prefix_str2id_dict:#prefix rule
|
500 |
-
if _prefix_str2id_dict[segs[3][0:2]] in virus_set:
|
501 |
-
doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]]+',9606')
|
502 |
-
if doc_pmid not in ML_results.keys():
|
503 |
-
ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]+',9606'}
|
504 |
-
else:
|
505 |
-
ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]+',9606'
|
506 |
-
|
507 |
-
# print('!!! prefixr and virus:', doc_pmid)
|
508 |
-
else:
|
509 |
-
doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]])
|
510 |
-
if doc_pmid not in ML_results.keys():
|
511 |
-
ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]}
|
512 |
-
else:
|
513 |
-
ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]
|
514 |
-
# print('prefix rule!!',_prefix_str2id_dict)
|
515 |
-
# print(doc_pmid)
|
516 |
-
else:
|
517 |
-
doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
|
518 |
-
if segs[4] in gene_set:
|
519 |
-
_ML_gene_num+=1
|
520 |
-
_entity_num+=1
|
521 |
-
|
522 |
-
if len(_species_num)>=2 and _ML_gene_num>0:
|
523 |
-
fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
|
524 |
-
fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
|
525 |
-
for ele in doc_annotation:
|
526 |
-
fin_pubtator2.write(ele+'\n')
|
527 |
-
fin_pubtator2.write('\n')
|
528 |
-
|
529 |
-
species_gene_count[doc_pmid]={'spec':_species_num,'gene':_gene_num}
|
530 |
-
|
531 |
-
if fin_pubtator2.getvalue()!='':
|
532 |
-
#pubtator format ML tagging
|
533 |
-
#print(fin_pubtator2.getvalue())
|
534 |
-
ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
|
535 |
-
#print(ml_out.getvalue())
|
536 |
-
fin_result=io.StringIO(ml_out.getvalue())
|
537 |
-
all_in=fin_result.read().strip().split('\n\n')
|
538 |
-
#print('+2 species:',len(all_in))
|
539 |
-
fin_result.close()
|
540 |
-
for doc in all_in:
|
541 |
-
lines=doc.split('\n')
|
542 |
-
pmid=lines[0].split('|t|')[0]
|
543 |
-
|
544 |
-
for i in range(2,len(lines)):
|
545 |
-
segs=lines[i].split('\t')
|
546 |
-
if pmid not in ML_results.keys():
|
547 |
-
ML_results[pmid]={segs[2]+'-'+segs[3]:segs[-1]}
|
548 |
-
else:
|
549 |
-
ML_results[pmid][segs[2]+'-'+segs[3]]=segs[-1]
|
550 |
-
|
551 |
-
#output
|
552 |
-
fout_pubtator=open(outpath+"/"+infile,'w', encoding='utf8')
|
553 |
-
for doc in all_in_ori:
|
554 |
-
lines=doc.split('\n')
|
555 |
-
pmid=lines[0].split('|t|')[0]
|
556 |
-
fout_pubtator.write(lines[0]+'\n'+lines[1]+'\n')
|
557 |
-
if len(species_gene_count[pmid]['spec'])>1 and species_gene_count[pmid]['gene']>0: # ML
|
558 |
-
for i in range(2,len(lines)):
|
559 |
-
segs=lines[i].split('\t')
|
560 |
-
if len(segs)>=6: #species
|
561 |
-
fout_pubtator.write(lines[i]+'\n')
|
562 |
-
else:#gene
|
563 |
-
if ML_results[pmid][segs[1]+'-'+segs[2]] in virus_set:
|
564 |
-
fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+',9606'+'\n')
|
565 |
-
# print('!!! virus:', pmid)
|
566 |
-
else:
|
567 |
-
fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+'\n')
|
568 |
-
fout_pubtator.write('\n')
|
569 |
-
|
570 |
-
elif len(species_gene_count[pmid]['spec'])==1 and species_gene_count[pmid]['gene']>0: #only one species
|
571 |
-
for i in range(2,len(lines)):
|
572 |
-
segs=lines[i].split('\t')
|
573 |
-
if len(segs)>=6: #species
|
574 |
-
fout_pubtator.write(lines[i]+'\n')
|
575 |
-
else:#gene
|
576 |
-
major_species,=species_gene_count[pmid]['spec']
|
577 |
-
if major_species[1:] in virus_set:
|
578 |
-
fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+',9606'+'\n')
|
579 |
-
# print('!!! virus:', pmid)
|
580 |
-
fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+'\n')
|
581 |
-
fout_pubtator.write('\n')
|
582 |
-
|
583 |
-
elif len(species_gene_count[pmid]['spec'])==0 and species_gene_count[pmid]['gene']>0:#no species
|
584 |
-
for i in range(2,len(lines)):
|
585 |
-
segs=lines[i].split('\t')
|
586 |
-
if len(segs)>=6: #species
|
587 |
-
fout_pubtator.write(lines[i]+'\n')
|
588 |
-
else:#gene
|
589 |
-
fout_pubtator.write(lines[i]+'\tFocus:9606'+'\n')
|
590 |
-
fout_pubtator.write('\n')
|
591 |
-
|
592 |
-
else:
|
593 |
-
for i in range(2,len(lines)):
|
594 |
-
fout_pubtator.write(lines[i]+'\n')
|
595 |
-
fout_pubtator.write('\n')
|
596 |
-
fout_pubtator.close()
|
597 |
-
|
598 |
-
|
599 |
-
#SA main
|
600 |
-
def speciesAss(infolder,outpath, modelfile):
|
601 |
-
|
602 |
-
if modelfile.lower().find('bioformer')>=0:
|
603 |
-
model_type='bioformer'
|
604 |
-
else:
|
605 |
-
model_type='pubmedbert'
|
606 |
-
|
607 |
-
print('loading SA models........')
|
608 |
-
if model_type=='bioformer':
|
609 |
-
|
610 |
-
vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
|
611 |
-
'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
|
612 |
-
'lowercase':False,
|
613 |
-
}
|
614 |
-
else:
|
615 |
-
vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
|
616 |
-
'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
|
617 |
-
'lowercase':True,
|
618 |
-
}
|
619 |
-
|
620 |
-
nn_model=model_sa.HUGFACE_NER(vocabfiles)
|
621 |
-
nn_model.build_encoder()
|
622 |
-
nn_model.build_softmax_decoder()
|
623 |
-
nn_model.load_model(modelfile)
|
624 |
-
|
625 |
-
dict_filename={'prefix':'./Dictionary/SPPrefix.txt',
|
626 |
-
'virus':'./Dictionary/SP_Virus2HumanList.txt'}
|
627 |
-
fin=open(dict_filename['virus'],'r',encoding='utf-8')
|
628 |
-
virus_set=set(fin.read().strip().split('\n'))
|
629 |
-
fin.close()
|
630 |
-
|
631 |
-
prefix_dict={}#{id:[prefix1,prefix2]}
|
632 |
-
fin=open(dict_filename['prefix'],'r',encoding='utf-8')
|
633 |
-
for line in fin:
|
634 |
-
seg= line.strip().split('\t')
|
635 |
-
if seg[0] not in prefix_dict.keys():
|
636 |
-
prefix_dict[seg[0]]=seg[1].split('|')
|
637 |
-
else:
|
638 |
-
prefix_dict[seg[0]].extend(seg[1].split('|'))
|
639 |
-
fin.close()
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
print("begin species assignment........")
|
644 |
-
start_time=time.time()
|
645 |
-
|
646 |
-
for infile in os.listdir(infolder):
|
647 |
-
if os.path.isfile(outpath+"/"+infile):
|
648 |
-
print(infile+' has exsited.')
|
649 |
-
else:
|
650 |
-
print('Processing:',infile)
|
651 |
-
fin=open(infolder+"/"+infile, 'r',encoding='utf-8')
|
652 |
-
file_format=""
|
653 |
-
for line in fin:
|
654 |
-
pattern_bioc = re.compile('.*<collection>.*')
|
655 |
-
pattern_pubtator = re.compile('^([^\|]+)\|[^\|]+\|(.*)')
|
656 |
-
if pattern_bioc.search(line):
|
657 |
-
file_format="BioC"
|
658 |
-
break
|
659 |
-
elif pattern_pubtator.search(line):
|
660 |
-
file_format="PubTator"
|
661 |
-
break
|
662 |
-
fin.close()
|
663 |
-
if(file_format == "PubTator"):
|
664 |
-
SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
|
665 |
-
elif(file_format == "BioC"):
|
666 |
-
SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
|
667 |
-
|
668 |
-
|
669 |
-
print('species assignment done:',time.time()-start_time)
|
670 |
-
|
671 |
-
if __name__=='__main__':
|
672 |
-
|
673 |
-
parser = argparse.ArgumentParser(description='run GeneNER and species assignment, python GeneNER_SpeAss_run.py -i input -n NERmodel -s SAmodel -r neroutput -a saoutput')
|
674 |
-
parser.add_argument('--infolder', '-i', help="input folder",default='./example/input/')
|
675 |
-
parser.add_argument('--NERmodel', '-n', help="trained deep learning NER model file",default='')
|
676 |
-
parser.add_argument('--SAmodel', '-s', help="trained deep learning species assignment model file",default='')
|
677 |
-
parser.add_argument('--NERoutpath', '-r', help="output folder to save the NER tagged results",default='./example/ner_output/')
|
678 |
-
parser.add_argument('--SAoutpath', '-a', help="output folder to save the SA tagged results",default='./example/sa_output/')
|
679 |
-
parser.add_argument('--NUM_THREADS', '-t', help="Number of threads",default='3')
|
680 |
-
args = parser.parse_args()
|
681 |
-
|
682 |
-
|
683 |
-
if args.NUM_THREADS.isdigit() == False:
|
684 |
-
args.NUM_THREADS='3'
|
685 |
-
|
686 |
-
tf.config.threading.set_inter_op_parallelism_threads(int(args.NUM_THREADS))
|
687 |
-
tf.config.threading.set_intra_op_parallelism_threads(int(args.NUM_THREADS))
|
688 |
-
|
689 |
-
if args.NERmodel!='' and args.SAmodel!='':
|
690 |
-
|
691 |
-
#pipleline
|
692 |
-
print('==============\n| GeneNER and SpeAss |\n==============')
|
693 |
-
|
694 |
-
#creat output folder
|
695 |
-
|
696 |
-
if args.infolder[-1]!='/':
|
697 |
-
args.infolder+='/'
|
698 |
-
if not os.path.exists(args.infolder):
|
699 |
-
os.makedirs(args.infolder)
|
700 |
-
|
701 |
-
if args.NERoutpath[-1]!='/':
|
702 |
-
args.NERoutpath+='/'
|
703 |
-
if not os.path.exists(args.NERoutpath):
|
704 |
-
os.makedirs(args.NERoutpath)
|
705 |
-
|
706 |
-
if args.SAoutpath[-1]!='/':
|
707 |
-
args.SAoutpath+='/'
|
708 |
-
if not os.path.exists(args.SAoutpath):
|
709 |
-
os.makedirs(args.SAoutpath)
|
710 |
-
|
711 |
-
#1. gene NER, the results are saved in outpath/ner_tmp/
|
712 |
-
geneNER(args.infolder,args.NERoutpath, args.NERmodel)
|
713 |
-
|
714 |
-
|
715 |
-
#2. species assignment, the results are saved in outpath/sa_tmp/
|
716 |
-
speciesAss(args.NERoutpath,args.SAoutpath, args.SAmodel)
|
717 |
-
|
718 |
-
elif args.NERmodel!='' and args.SAmodel=='':
|
719 |
-
if args.infolder[-1]!='/':
|
720 |
-
args.infolder+='/'
|
721 |
-
if not os.path.exists(args.infolder):
|
722 |
-
os.makedirs(args.infolder)
|
723 |
-
|
724 |
-
# only geneNER
|
725 |
-
if args.NERoutpath[-1]!='/':
|
726 |
-
args.NERoutpath+='/'
|
727 |
-
if not os.path.exists(args.NERoutpath):
|
728 |
-
os.makedirs(args.NERoutpath)
|
729 |
-
|
730 |
-
print('==============\n| GeneNER |\n==============')
|
731 |
-
geneNER(args.infolder,args.NERoutpath,args.NERmodel)
|
732 |
-
|
733 |
-
elif args.NERmodel=='' and args.SAmodel!='':
|
734 |
-
# only speass
|
735 |
-
if args.SAoutpath[-1]!='/':
|
736 |
-
args.SAoutpath+='/'
|
737 |
-
if not os.path.exists(args.SAoutpath):
|
738 |
-
os.makedirs(args.SAoutpath)
|
739 |
-
|
740 |
-
print('==============\n| SpeAss |\n==============')
|
741 |
-
speciesAss(args.infolder,args.SAoutpath,args.SAmodel)
|
742 |
-
else:
|
743 |
-
print('Please provide models!')
|
744 |
-
|
745 |
-
|
746 |
|
|
|
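For reference, a minimal sketch of driving the same two stages from Python instead of the command line, mirroring the __main__ pipeline branch above. The model file names are placeholders, not files named by this script; the input/output paths are the argparse defaults:

    # Assumes this file is importable as GeneNER_SpeAss_run; importing it also
    # loads the stanza tokenizer declared at module level, which takes a moment.
    from GeneNER_SpeAss_run import geneNER, speciesAss

    ner_model = './gnorm_trained_models/your_geneNER_model.h5'  # placeholder path
    sa_model = './gnorm_trained_models/your_speass_model.h5'    # placeholder path

    # Stage 1: tag genes; results are written to the NER output folder.
    geneNER('./example/input/', './example/ner_output/', ner_model)
    # Stage 2: assign species to the tagged genes, reading the NER output.
    speciesAss('./example/ner_output/', './example/sa_output/', sa_model)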
Library/Ab3P.C
CHANGED
@@ -1,110 +1,110 @@
#include "Ab3P.h"

Ab3P::Ab3P ( void ) :
  buffer(""),
  wrdData( new WordData )
{

  string sf_grp, sf_nchr, strat;
  double value;

  char file_name[1000];
  get_pathw( file_name, "Ab3P", "prec", "dat" );
  ifstream fin(file_name);
  if(!fin) {
    cout << "Cannot open Ab3P_prec.dat\n";
    exit(1);
  }
  //get precision of a given #-ch SF's strategy
  while(fin>>sf_grp>>sf_nchr>>strat) {
    fin>>value; //precision
    stratPrec.insert(pair<string, double>(sf_grp+sf_nchr+strat, value));
    util.push_back_strat(sf_grp+sf_nchr, strat); //set strategy sequence
  }
}

void Ab3P::get_abbrs( char * text, vector<AbbrOut> & abbrs ) {
  abbrs.clear();

  if( ! text[0] ) return; // skip empty line

  ab.Proc(text); //extract potential SF & LF pairs

  for(int i=0; i<ab.numa; i++) {
    AbbrOut result;

    try_pair( ab.abbs[i], ab.abbl[i], result );

    // preserve results
    if ( result.prec > 0 ) {
      abbrs.push_back( result );
    }
  }
  ab.cleara();

}


void Ab3P::try_pair( char * sf, char * lf, AbbrOut & result ) {

  //process i) lf (sf)
  try_strats( sf, lf, false, result );

  //process ii) sf (lf)
  ab.token(lf);
  try_strats( ab.lst[ab.num-1], sf, true, result );
}


/**
   psf -- pointer short form
   plf -- pointer long form
**/
void Ab3P::try_strats ( char * psf, char * plf, bool swap,
                        AbbrOut & result ) {

  string sfg; //SF group eg) Al1, Num2, Spec3
  //false if sf is not ok, sfg will be assigned

  if(!util.group_sf(psf,plf,sfg)) return;
  if (swap) if(!util.exist_upperal(psf)) return;

  char sf[1000], sfl[1000];

  //strategy sequence for a given #-ch SF group
  vector<string> strats = util.get_strats(sfg);
  util.remove_nonAlnum(psf,sf); //sf will be w/o non-alnum

  //go through strategies
  for( int j=0; j<strats.size(); j++) {
    AbbrStra * strat =
      util.strat_factory(strats[j]); //set a particular strategy
    strat->wData = wrdData; //set wordset, stopword
    if(strat->strategy(sf,plf)) { //case sensitive
      strat->str_tolower(sf,sfl);

      if( strat->lf_ok(psf,strat->lf) ) {

        map<string, double>::iterator p =
          stratPrec.find(sfg+strats[j]);
        if(p==stratPrec.end()) {
          cout << "No precision assigned" << endl;
          exit(1);
        }

        //add outputs
        if( p->second>result.prec ) {
          result.sf = psf;
          result.lf = strat->lf;
          result.prec = p->second;
          result.strat = strats[j];
        }

        delete strat;
        return;
      }
    }
    delete strat;
  }

}
Library/Ab3P.h
CHANGED
@@ -1,83 +1,83 @@
/*
Identify sf & lf pairs from free text using multi-stage algorithm
process one line at a time and print out:
line
sf|lf|P-precision|strategy
*/

#include "AbbrvE.h"
#include "AbbrStra.h"
#include <vector>
#include <map>
#include <string>

using namespace std;
using namespace iret;

namespace iret {

  class AbbrOut {
  public:
    string sf, lf, strat;
    double prec;

    AbbrOut( void ) : sf(""), lf(""), strat(""), prec(0)
    {}

    void print ( ostream & out ) {
      out << " " << sf << "|" << lf << "|" << prec;
    }

  };


  class Ab3P {
  public:
    Ab3P( void );
    ~Ab3P(void) { delete wrdData; }

    /** Collect text for later abbreviation finding. **/
    void add_text( const string & text ) {
      buffer += text;
    }
    void add_text( char * text ) {
      buffer += text;
    }

    /** Sets abbrs to the abbreviations found in previous calls to add_text.
        Afterwards, resets the text buffer. **/
    void get_abbrs( vector<AbbrOut> & abbrs ) {
      get_abbrs( buffer, abbrs );
      buffer = "";
    }

    /** Sets abbrs to the abbreviations found in text.
        Does not interfere with the add_text buffer. **/
    void get_abbrs( const string & text, vector<AbbrOut> & abbrs ) {
      abbrs.clear();

      if(text.empty()) return; // skip empty line
      // const_cast needed so the correct get_abbrs overload gets called,
      // otherwise, infinite loop
      get_abbrs( const_cast<char*>(text.c_str()), abbrs );
    }
    void get_abbrs( char * text, vector<AbbrOut> & abbrs );

    /** Try a potential sf-lf form to find proper lf, strategy used,
        and pseudo-precision of result **/
    void try_pair( char * sf, char * lf, AbbrOut & abbr );

    /**
       psf -- pointer short form
       plf -- pointer long form
    **/
    void try_strats ( char * psf, char * plf, bool swap, AbbrOut & result );

    AbbrvE ab; //default # pairs = 10,000
    map<string, double> stratPrec;
    StratUtil util;
    WordData *wrdData; //set data needed for AbbrStra
    string buffer; // collect text for later use
  };

}
Library/AbbrStra.C
CHANGED
@@ -1,1426 +1,1426 @@
|
|
1 |
-
#include "AbbrStra.h"
|
2 |
-
#include <runn.h>
|
3 |
-
#include <vector>
|
4 |
-
#include <fstream>
|
5 |
-
#include <iostream>
|
6 |
-
|
7 |
-
|
8 |
-
WordData::WordData(const char *wrdnam, const char *stpnam,
|
9 |
-
const char *lfsnam) :
|
10 |
-
wrdset(wrdnam), stp(stpnam), lfs(lfsnam)
|
11 |
-
{
|
12 |
-
wrdset.set_path_name("Ab3P");
|
13 |
-
wrdset.gopen_ctable_map();
|
14 |
-
stp.set_path_name("Ab3P");
|
15 |
-
stp.gopen_htable_map();
|
16 |
-
lfs.set_path_name("Ab3P");
|
17 |
-
lfs.gopen_htable_map();
|
18 |
-
}
|
19 |
-
|
20 |
-
WordData::~WordData()
|
21 |
-
{
|
22 |
-
wrdset.gclose_ctable_map();
|
23 |
-
stp.gclose_htable_map();
|
24 |
-
lfs.gclose_htable_map();
|
25 |
-
}
|
26 |
-
|
27 |
-
|
28 |
-
AbbrStra::AbbrStra()
|
29 |
-
{
|
30 |
-
npairs = tpairs = nsfs = nmatchs = amatchs = 0;
|
31 |
-
}
|
32 |
-
|
33 |
-
|
34 |
-
AbbrStra::~AbbrStra()
|
35 |
-
{
|
36 |
-
}
|
37 |
-
|
38 |
-
|
39 |
-
void AbbrStra::token(const char *str, char lst[1000][1000])
|
40 |
-
{
|
41 |
-
long i,j=0,k=0;
|
42 |
-
long n=strlen(str)-1;
|
43 |
-
|
44 |
-
while(isblank(str[n])) n--;
|
45 |
-
|
46 |
-
while(str[j]){
|
47 |
-
while(isblank(str[j]))j++;
|
48 |
-
i=j;
|
49 |
-
while((str[j])&&(!isblank(str[j])))j++;
|
50 |
-
strncpy(lst[k],str+i,j-i);
|
51 |
-
lst[k][j-i]='\0';
|
52 |
-
if(str[j]){
|
53 |
-
k++;
|
54 |
-
j++;
|
55 |
-
}
|
56 |
-
}
|
57 |
-
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
58 |
-
ntk=k+1; //# tokens, ntk is data member
|
59 |
-
}
|
60 |
-
|
61 |
-
|
62 |
-
long AbbrStra::tokenize(const char *str, char lst[1000][1000])
|
63 |
-
{
|
64 |
-
long i,j=0,k=0;
|
65 |
-
long n=strlen(str)-1;
|
66 |
-
|
67 |
-
while(isblank(str[n])) n--;
|
68 |
-
|
69 |
-
while(str[j]){
|
70 |
-
while(isblank(str[j]))j++;
|
71 |
-
i=j;
|
72 |
-
while((str[j])&&(!isblank(str[j])))j++;
|
73 |
-
strncpy(lst[k],str+i,j-i);
|
74 |
-
lst[k][j-i]='\0';
|
75 |
-
if(str[j]){
|
76 |
-
k++;
|
77 |
-
j++;
|
78 |
-
}
|
79 |
-
}
|
80 |
-
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
81 |
-
return k+1; //# tokens
|
82 |
-
}
|
83 |
-
|
84 |
-
|
85 |
-
long AbbrStra::num_token(const char *str)
|
86 |
-
{
|
87 |
-
long i,j=0,k=0;
|
88 |
-
long n=strlen(str)-1;
|
89 |
-
|
90 |
-
while(isblank(str[n])) n--;
|
91 |
-
|
92 |
-
while(str[j]){
|
93 |
-
while(isblank(str[j]))j++;
|
94 |
-
i=j;
|
95 |
-
while((str[j])&&(!isblank(str[j])))j++;
|
96 |
-
if(str[j]){
|
97 |
-
k++;
|
98 |
-
j++;
|
99 |
-
}
|
100 |
-
}
|
101 |
-
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
102 |
-
return k+1; //# tokens
|
103 |
-
}
|

// fch[i] collects the first character of each of the last num tokens of str
long AbbrStra::first_ch(const char *str, char *fch, long num)
{
  long i, j, numtk;
  char tk[1000][1000];

  numtk = tokenize(str,tk);
  if(num>numtk) return 0;

  for(i=0; i<num; i++)
    fch[i] = tk[numtk-num+i][0];

  return 1;
}

long AbbrStra::is_upperal(const char *str)
{
  for(long i=strlen(str)-1; i>=0; i--)
    if(!isupper(str[i]) || !isalpha(str[i]))
      return 0;
  return 1;
}

long AbbrStra::is_alpha(const char *str)
{
  for(long i=strlen(str)-1; i>=0; i--)
    if(!isalpha(str[i]))
      return 0;
  return 1;
}


// str2 will be the lower-case copy of str1
void AbbrStra::str_tolower(const char *str1, char *str2)
{
  long i=0;

  while(str1[i]) {
    str2[i] = tolower(str1[i]);
    i++;
  }
  str2[i] = '\0';
}

//copy the last num tokens of str1 to str2
long AbbrStra::get_str(const char *str1, char *str2, long num)
{
  char ch, tk[1000][1000];
  long i, j, numtk;

  if(num<0) { cout<<"num<0\n"; exit(1); }
  numtk = tokenize(str1,tk);
  if(numtk<num) return 0;

  strcpy(str2,tk[numtk-num]);
  for(i=1; i<num; i++) {
    strcat(str2," ");
    strcat(str2,tk[numtk-num+i]);
  }

  return 1;
}
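// Example: get_str("a b c", buf, 2) copies the last two tokens into buf ("b c");
// it returns 0 when str1 has fewer than num tokens.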

bool AbbrStra::isupper_str(const char *str)
{
  long i, len=strlen(str);

  for(i=0; i<len; i++)
    if(isalpha(str[i]) && !isupper(str[i]))
      return false;

  return true;
}

bool AbbrStra::is_onealpha(const char *str)
{
  long i, j=0, len=strlen(str);

  for(i=0; i<len; i++)
    if(isalpha(str[i])) j++;

  if(j==1) return true;
  else return false;
}

long AbbrStra::count_upperstr(const char *str)
{
  long i, j, k, numtk;
  char tk[1000][1000];

  numtk = tokenize(str,tk);

  j = 0;
  for(i=numtk-1; i>=0; i--) {
    if(isupper(tk[i][0])) j++;
    else return j;
  }

  return j;
}

void AbbrStra::get_alpha(const char *str1, char *str2)
{
  long i = 0, j = 0;
  long len = strlen(str1);

  while(i<len) {
    if(isalpha(str1[i])) {
      str2[j] = str1[i];
      j++;
    }
    i++;
  }
  str2[j] = '\0';
}


bool AbbrStra::lf_ok(const char *shrtf, const char *longf)
{
  long i;
  long paren=0, sbrac=0;
  string s, l;

  //false for unbalanced parentheses or square brackets
  for(i=strlen(longf)-1; i>=0; i--) {
    if(longf[i]=='(') paren++;
    if(longf[i]==')') paren--;
    if(longf[i]=='[') sbrac++;
    if(longf[i]==']') sbrac--;
  }
  if(paren!=0 || sbrac!=0) return false;

  s.assign(shrtf);
  l.assign(longf);

  for(i=0; i<s.length(); i++) s[i]=tolower(s[i]);
  for(i=0; i<l.length(); i++) l[i]=tolower(l[i]);

  //false if the LF contains the SF as a whole word
  if( (" "+l+" ").find(" "+s+" ")!=string::npos ) return false;

  return true;
}
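// Example: lf_ok("cat", "chloramphenicol acetyl transferase") is true, but
// lf_ok("cat", "the cat gene") is false because the LF contains the SF itself.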


//first=true: allow 1-alpha, 0 don't allow
//On success, mod[i][0] holds the token index and mod[i][1] the character
//offset within that token where abbr[i] matched (alignment built right-to-left).
long AbbrStra::search_backward(long sloc, long tinx, long tloc, const char *abbr, bool first)
{
  long sfloc=sloc, tkinx=tinx, tkloc=tloc;

  while(sfloc>=0) {
    loop1: while((tkloc>=0)&&(tok[tkinx][tkloc]!=abbr[sfloc])) tkloc--;
    if(tkloc<0) {
      tkinx--;
      if(tkinx<0) return 0; //moved to here (Sep-14-07)
      tkloc=strlen(tok[tkinx])-1;
    }
    else {
      if(sfloc==0) {
        if(tkloc!=0) {
          if(!first) { tkloc--; goto loop1; }
          else if(isalnum(tok[tkinx][tkloc-1])) { tkloc--; goto loop1; }
        }
      }
      mod[sfloc][0]=tkinx;
      mod[sfloc][1]=tkloc;
      sfloc--; tkloc--;
    }
  }

  return 1;
}

//advance to the next candidate alignment: re-run search_backward() from each
//SF position, starting just left of its current match, until one succeeds
long AbbrStra::search_backward_adv(const char *abbr, bool flag)
{
  long i;
  long lna=strlen(abbr);

  i=0;
  while(i<lna){
    if(search_backward(i,mod[i][0],mod[i][1]-1,abbr,flag)) return 1;
    i++;
  }
  return 0;
}

void AbbrStra::extract_lf(long begin, long end)
{
  strcpy(lf,tok[begin]);
  for(long i=begin+1; i<=end; i++) {
    strcat(lf," ");
    strcat(lf,tok[i]);
  }
}


void AbbrStra::extract_lf(long begin, long end, const char *str)
{
  token(str,tok);
  strcpy(lf,tok[begin]);
  for(long i=begin+1; i<=end; i++) {
    strcat(lf," ");
    strcat(lf,tok[i]);
  }
}

//---
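// "Skip words" are LF tokens that received no SF character in the current
// alignment; "stop words" are skip words present in the stp table. The
// predicates below bound how many of each an acceptable alignment may contain.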
bool AbbrStra::exist_skipword(long nsf)
{
  long i=0, j=0, k;

  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if(k>0) j+=k;
    i++;
  }

  if(j>0) return true;
  else return false;
}


bool AbbrStra::exist_n_skipwords(long nsf, long n)
{
  long i=0, j, k;
  bool flag=false;

  //k: # skip words
  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if(k>n) return false;
    if(k==n) flag=true;
    i++;
  }

  if(flag) return true;
  else return false;
}

//exists n consecutive skipped stopwords between tokens
bool AbbrStra::exist_n_stopwords(long nsf, long n)
{
  long i=0, j, k;
  bool flag=false;

  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if(k>n) return false;
    if(k==n) flag=true;
    if(k>0) { //skip word exists
      while(k) {
        if(!wData->stp.find(tok[mod[i][0]+k])) return false;
        k--;
      }
    }
    i++;
  }

  if(flag) return true;
  else return false;
}


bool AbbrStra::stopword_ok(long nsf, long nsw)
{
  long i=0, j, k;

  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if(k>nsw) return false;
    if(k>0) { //skip word exists
      while(k) {
        if(!wData->stp.find(tok[mod[i][0]+k])) return false;
        k--;
      }
    }
    i++;
  }

  return true;
}

bool AbbrStra::skip_stop_ok(long nsf, long nsw, long n)
{
  long i=0, j, k, nstp;

  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if(k>nsw) return false;
    //if(k>0) { //skip word exists
    if(k>(nsw-n)) {
      nstp=0; //# skipped stopwords between tokens
      while(k) {
        if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
        k--;
      }
      if(nstp<n) return false;
    }
    i++;
  }

  return true;
}


bool AbbrStra::skip_stop_ok2(long nsf, long nsw, long n)
{
  long i=0, j, k, nstp;

  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if((k>0)&&(k!=nsw)) return false;
    if(k>0) { //skip word exists
      nstp=0; //# skipped stopwords between tokens
      while(k) {
        if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
        k--;
      }
      if(nstp<n) return false;
    }

    i++;
  }

  return true;
}


bool AbbrStra::skipword_ok(long nsf, long nsw)
{
  long i=0, j, k;

  while(i<nsf) {
    if(i==(nsf-1)) k=ntk-mod[i][0]-1;
    else k=mod[i+1][0]-mod[i][0]-1;
    if(k>nsw) return false;
    i++;
  }

  return true;
}


bool AbbrStra::is_subword(long nsf)
{
  long i=0;
  char word[1000];

  while(i<nsf) {
    if(mod[i][1]!=0) {
      strcpy(word,tok[mod[i][0]]+mod[i][1]);
      if(wData->wrdset.count(word)==0) return false;
    }
    i++;
  }

  return true;
}
-
|
471 |
-
|
472 |
-
bool AbbrStra::is_BeginWrdMatch(long nsf, bool general)
|
473 |
-
{
|
474 |
-
long i=0, j;
|
475 |
-
bool *bwm = new bool [ntk]; //BeginWrdMatch of a given tok
|
476 |
-
|
477 |
-
for(j=0; j<ntk; j++) bwm[j] = false;
|
478 |
-
|
479 |
-
while(i<nsf) {
|
480 |
-
if(mod[i][1]==0)
|
481 |
-
bwm[mod[i][0]] = true;
|
482 |
-
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) )
|
483 |
-
bwm[mod[i][0]] = true;
|
484 |
-
i++;
|
485 |
-
}
|
486 |
-
|
487 |
-
for(j=0; j<nsf; j++)
|
488 |
-
if(!bwm[mod[j][0]]) {
|
489 |
-
delete [] bwm;
|
490 |
-
return false;
|
491 |
-
}
|
492 |
-
|
493 |
-
delete [] bwm;
|
494 |
-
|
495 |
-
return true;
|
496 |
-
}
|
497 |
-
|
498 |
-
|
499 |
-
bool AbbrStra::is_WithinWrdMatch(long nsf, bool general)
|
500 |
-
{
|
501 |
-
long i=0, wwm=0;
|
502 |
-
|
503 |
-
while(i<nsf) {
|
504 |
-
if(!general) {
|
505 |
-
if(mod[i][1]>0) wwm++;
|
506 |
-
}
|
507 |
-
else {
|
508 |
-
if(mod[i][1]>0 && isalnum(tok[mod[i][0]][mod[i][1]-1])) wwm++;
|
509 |
-
}
|
510 |
-
i++;
|
511 |
-
}
|
512 |
-
|
513 |
-
if(wwm>0) return true;
|
514 |
-
else return false;
|
515 |
-
}
|
516 |
-
|
517 |
-
|
518 |
-
bool AbbrStra::is_FirstLetMatch(long nsf, bool general)
|
519 |
-
{
|
520 |
-
long i=0, flm=0, flm2=0;
|
521 |
-
|
522 |
-
while(i<nsf) {
|
523 |
-
if(mod[i][1]==0) flm++;
|
524 |
-
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
|
525 |
-
flm++; flm2++;
|
526 |
-
}
|
527 |
-
i++;
|
528 |
-
}
|
529 |
-
|
530 |
-
if(flm==nsf) return true;
|
531 |
-
else return false;
|
532 |
-
}
|
533 |
-
|
534 |
-
|
535 |
-
bool AbbrStra::is_FirstLetMatch2(long nsf, bool general)
|
536 |
-
{
|
537 |
-
long i=0, flm=0, flm2=0;
|
538 |
-
|
539 |
-
while(i<nsf) {
|
540 |
-
if(mod[i][1]==0) flm++;
|
541 |
-
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
|
542 |
-
flm++; flm2++;
|
543 |
-
}
|
544 |
-
i++;
|
545 |
-
}
|
546 |
-
|
547 |
-
if( (flm==nsf) && (flm2>=1) ) return true;
|
548 |
-
else return false;
|
549 |
-
}
|
550 |
-
|
551 |
-
|
552 |
-
bool AbbrStra::is_FirstLetSMatch(const char *abbr, bool general)
|
553 |
-
{
|
554 |
-
long i=0, j=strlen(abbr)-1, flm=0, lsm=0;
|
555 |
-
|
556 |
-
while(i<j) {
|
557 |
-
if(mod[i][1]==0) flm++;
|
558 |
-
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) flm++;
|
559 |
-
i++;
|
560 |
-
}
|
561 |
-
|
562 |
-
if( (tok[mod[j][0]][mod[j][1]]=='s') &&
|
563 |
-
(mod[j][1]==(strlen(tok[mod[j][0]])-1)) &&
|
564 |
-
mod[j][0]==mod[j-1][0] ) lsm++;
|
565 |
-
|
566 |
-
if((flm==j) && (lsm==1)) return true;
|
567 |
-
else return false;
|
568 |
-
}
|
569 |
-
|
570 |
-
|
571 |
-
bool AbbrStra::is_ContLetMatch(long nsf)
|
572 |
-
{
|
573 |
-
long i=0, cl=1;
|
574 |
-
|
575 |
-
while(i<(nsf-1)) {
|
576 |
-
if( mod[i][0]==mod[i+1][0] &&
|
577 |
-
(mod[i][1]+1)==mod[i+1][1] ) cl++;
|
578 |
-
i++;
|
579 |
-
}
|
580 |
-
|
581 |
-
if(cl>=2) return true;
|
582 |
-
else return false;
|
583 |
-
}
|
584 |
-
//----
|


//---1st ch must be alnum & at least one alphabet for all
//str1: sf
bool AbbrStra::set_condition(const char *str1)
{
  int n=0, m=0, o=0;

  switch(setCondition) {
  case 1: //all alphabet SFs
    for(long i=strlen(str1)-1; i>=0; i--)
      if(!isalpha(str1[i]))
        return false;
    return true;
    break;
  case 2: //at least one non-alphabet
    if(!isalnum(str1[0])) return false;
    for(long i=strlen(str1)-1; i>=0; i--) {
      if(isalpha(str1[i])) n++;
      else m++;
    }
    if( (n>0) && (m>0) ) return true;
    else return false;
    break;
  case 3: //only alnum & at least one num
    for(long i=strlen(str1)-1; i>=0; i--) {
      if(!isalnum(str1[i])) return false;
      if(isalpha(str1[i])) n++;
      if(isdigit(str1[i])) m++;
    }
    if( (n>0) && (m>0) ) return true;
    else return false;
    break;
  case 4: //only alpha and non-alnum & at least one non-alnum
    if(!isalpha(str1[0])) return false;
    for(long i=strlen(str1)-1; i>=0; i--) {
      if(isdigit(str1[i])) return false;
      if(!isalnum(str1[i])) n++;
    }
    if(n>0) return true;
    else return false;
    break;
  case 5: //at least one non-alnum
    if(!isalnum(str1[0])) return false;
    for(long i=strlen(str1)-1; i>0; i--) {
      if(!isalnum(str1[i])) return true;
    }
    return false;
    break;
  case 6: //at least one num and non-alnum
    if(!isalnum(str1[0])) return false;
    for(long i=strlen(str1)-1; i>=0; i--) {
      if(isalpha(str1[i])) n++;
      if(isdigit(str1[i])) m++;
      if(!isalnum(str1[i])) o++;
    }
    if( (n>0) && (m>0) && (o>0) ) return true;
    else return false;
    break;
  case 7: //1+2 (SH algorithm)
    if(!isalnum(str1[0])) return false;
    for(long i=strlen(str1)-1; i>=0; i--)
      if(isalpha(str1[i])) return true;
    return false;
    break;
  default:
    cout << "Not defined set condition\n";
    exit(1);
  }
}

//---
//same as FirstLet::set_condition
//but requires extra set conditions
bool FirstLetOneChSF::set_condition(const char *shrtf, const char *longf, char *str)
{
  long i=0, len=strlen(shrtf), numtk;
  char tk[1000][1000];

  //sf conditions: all alphabet
  while(i<len && isalpha(shrtf[i])) i++;
  if(i!=len) return false;

  //lf conditions: #tok>=|SF|, 1st ch of words must be alphabet
  numtk = tokenize(longf,tk);
  if(len>numtk) return false;

  for(i=0; i<len; i++)
    str[i] = tk[numtk-len+i][0];
  str[i] = '\0';

  if(!is_alpha(str)) return false;

  return true;
}


long FirstLetOneChSF::strategy(const char *sf_, const char *str_) {
  long lna,lnt,flag;
  bool genFL=false; //true: allow 1-alpha for 1st ch of SF match, false: don't
  char phr[10000], phrl[10000];

  str_tolower(sf_,sf);
  str_tolower(str_,text);

  get_str(str_,phr,1); //phr: last token of str
  str_tolower(phr,phrl);
  //conditions
  if(is_onealpha(phr)) return 0; //last token includes 1 alphabet
  if(isupper_str(phr)) return 0; //last token is all upper-case alphabet
  if(wData->stp.find(phrl)) return 0; //last token is a stopword
  if(!wData->lfs.find(phrl)) return 0; //lfs (1-ch sf) for FirstLet match cases < 2

  token(text,tok);
  lna = strlen(sf);
  lnt = strlen(tok[ntk-1]);

  flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
  if(!flag) return 0;

  do {
    if(!skipword_ok(lna,0)) continue;
    if(!is_FirstLetMatch(lna,genFL)) continue; //do not allow 1-alpha

    extract_lf(mod[0][0],ntk-1,str_);
    return 1;
  } while(search_backward_adv(sf,genFL));

  return 0;
}
//---
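// Every strategy() below follows the same skeleton: lower-case the SF and the
// candidate text, tokenize, seed an alignment with search_backward(), then
// step through alternative alignments (search_backward_adv) until one passes
// this strategy's predicates; on success extract_lf() leaves the long form in
// lf and 1 is returned.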
715 |
-
//---
|
716 |
-
|
717 |
-
bool FirstLet::set_condition(const char *shrtf, const char *longf, char *str)
|
718 |
-
{
|
719 |
-
long i=0, len=strlen(shrtf), numtk;
|
720 |
-
char tk[1000][1000];
|
721 |
-
|
722 |
-
//sf conditions
|
723 |
-
while(i<len && isalpha(shrtf[i])) i++;
|
724 |
-
if(i!=len) return false;
|
725 |
-
|
726 |
-
//lf conditions
|
727 |
-
numtk = tokenize(longf,tk);
|
728 |
-
if(len>numtk) return false;
|
729 |
-
|
730 |
-
for(i=0; i<len; i++)
|
731 |
-
str[i] = tk[numtk-len+i][0];
|
732 |
-
str[i] = '\0';
|
733 |
-
|
734 |
-
if(!is_alpha(str)) return false;
|
735 |
-
|
736 |
-
return true;
|
737 |
-
}
|
738 |
-
|
739 |
-
|
740 |
-
long FirstLet::strategy(const char *sf_, const char *str_) {
|
741 |
-
long lna,lnt,flag;
|
742 |
-
bool genFL=false; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
743 |
-
|
744 |
-
str_tolower(sf_,sf);
|
745 |
-
str_tolower(str_,text);
|
746 |
-
|
747 |
-
token(text,tok);
|
748 |
-
lna = strlen(sf);
|
749 |
-
lnt = strlen(tok[ntk-1]);
|
750 |
-
|
751 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
752 |
-
if(!flag) return 0;
|
753 |
-
|
754 |
-
do {
|
755 |
-
if(!skipword_ok(lna,0)) continue;
|
756 |
-
if(!is_FirstLetMatch(lna,genFL)) continue; //not allow 1-alpha
|
757 |
-
|
758 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
759 |
-
return 1;
|
760 |
-
} while(search_backward_adv(sf,genFL));
|
761 |
-
|
762 |
-
return 0;
|
763 |
-
}
|
764 |
-
|
765 |
-
|
766 |
-
long FirstLetGen::strategy(const char *sf_, const char *str_)
|
767 |
-
{
|
768 |
-
long lna,lnt,flag;
|
769 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
770 |
-
|
771 |
-
str_tolower(sf_,sf);
|
772 |
-
str_tolower(str_,text);
|
773 |
-
|
774 |
-
token(text,tok);
|
775 |
-
lna = strlen(sf);
|
776 |
-
lnt = strlen(tok[ntk-1]);
|
777 |
-
|
778 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
779 |
-
if(!flag) return 0;
|
780 |
-
|
781 |
-
do {
|
782 |
-
if(!skipword_ok(lna,0)) continue;
|
783 |
-
if(!is_FirstLetMatch2(lna,genFL)) continue; //at least 1-alpha
|
784 |
-
|
785 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
786 |
-
return 1;
|
787 |
-
} while(search_backward_adv(sf,genFL));
|
788 |
-
|
789 |
-
return 0;
|
790 |
-
}
|
791 |
-
|
792 |
-
|
793 |
-
long FirstLetGen2::strategy(const char *sf_, const char *str_)
|
794 |
-
{
|
795 |
-
long lna,lnt,flag;
|
796 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
797 |
-
|
798 |
-
str_tolower(sf_,sf);
|
799 |
-
str_tolower(str_,text);
|
800 |
-
|
801 |
-
token(text,tok);
|
802 |
-
lna = strlen(sf);
|
803 |
-
lnt = strlen(tok[ntk-1]);
|
804 |
-
|
805 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
806 |
-
if(!flag) return 0;
|
807 |
-
|
808 |
-
do {
|
809 |
-
if(!skipword_ok(lna,0)) continue;
|
810 |
-
if(!is_FirstLetMatch(lna,genFL)) continue;
|
811 |
-
|
812 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
813 |
-
return 1;
|
814 |
-
} while(search_backward_adv(sf,genFL));
|
815 |
-
|
816 |
-
return 0;
|
817 |
-
}
|
818 |
-
|
819 |
-
|
820 |
-
bool FirstLetGenS::set_condition(const char *str)
|
821 |
-
{
|
822 |
-
if(str[strlen(str)-1]!='s') return false;
|
823 |
-
|
824 |
-
for(long i=strlen(str)-2; i>=0; i--) {
|
825 |
-
if(!isupper(str[i])) return false;
|
826 |
-
if(!isalpha(str[i])) return false; //necessary?
|
827 |
-
}
|
828 |
-
|
829 |
-
return true;
|
830 |
-
}
|
831 |
-
|
832 |
-
|
833 |
-
long FirstLetGenS::strategy(const char *sf_, const char *str_)
|
834 |
-
{
|
835 |
-
long lna,lnt,flag;
|
836 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
837 |
-
|
838 |
-
if(!set_condition(sf_)) return 0;
|
839 |
-
|
840 |
-
str_tolower(sf_,sf);
|
841 |
-
str_tolower(str_,text);
|
842 |
-
|
843 |
-
token(text,tok);
|
844 |
-
lna = strlen(sf);
|
845 |
-
lnt = strlen(tok[ntk-1]);
|
846 |
-
|
847 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
848 |
-
if(!flag) return 0;
|
849 |
-
|
850 |
-
do {
|
851 |
-
if(!skipword_ok(lna,0)) continue;
|
852 |
-
if(!is_FirstLetSMatch(sf,genFL)) continue;
|
853 |
-
|
854 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
855 |
-
return 1;
|
856 |
-
} while(search_backward_adv(sf,genFL));
|
857 |
-
|
858 |
-
return 0;
|
859 |
-
}
|
860 |
-
|
861 |
-
|
862 |
-
long FirstLetGenStp::strategy(const char *sf_, const char *str_)
|
863 |
-
{
|
864 |
-
long lna,lnt,flag;
|
865 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
866 |
-
|
867 |
-
str_tolower(sf_,sf);
|
868 |
-
str_tolower(str_,text);
|
869 |
-
|
870 |
-
token(text,tok);
|
871 |
-
lna = strlen(sf);
|
872 |
-
lnt = strlen(tok[ntk-1]);
|
873 |
-
|
874 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
875 |
-
if(!flag) return 0;
|
876 |
-
|
877 |
-
do {
|
878 |
-
if(!exist_skipword(lna)) continue;
|
879 |
-
if(!stopword_ok(lna,1)) continue;
|
880 |
-
if(!is_FirstLetMatch(lna,genFL)) continue;
|
881 |
-
|
882 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
883 |
-
return 1;
|
884 |
-
} while(search_backward_adv(sf,genFL));
|
885 |
-
|
886 |
-
return 0;
|
887 |
-
}
|
888 |
-
|
889 |
-
|
890 |
-
long FirstLetGenStp2::strategy(const char *sf_, const char *str_)
|
891 |
-
{
|
892 |
-
long lna,lnt,flag;
|
893 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
894 |
-
|
895 |
-
str_tolower(sf_,sf);
|
896 |
-
str_tolower(str_,text);
|
897 |
-
|
898 |
-
token(text,tok);
|
899 |
-
lna = strlen(sf);
|
900 |
-
lnt = strlen(tok[ntk-1]);
|
901 |
-
|
902 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
903 |
-
if(!flag) return 0;
|
904 |
-
|
905 |
-
do {
|
906 |
-
if(!exist_n_stopwords(lna,2)) continue;
|
907 |
-
if(!is_FirstLetMatch(lna,genFL)) continue;
|
908 |
-
|
909 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
910 |
-
return 1;
|
911 |
-
} while(search_backward_adv(sf,genFL));
|
912 |
-
|
913 |
-
return 0;
|
914 |
-
}
|
915 |
-
|
916 |
-
|
917 |
-
long FirstLetGenSkp::strategy(const char *sf_, const char *str_)
|
918 |
-
{
|
919 |
-
long lna,lnt,flag;
|
920 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
921 |
-
|
922 |
-
str_tolower(sf_,sf);
|
923 |
-
str_tolower(str_,text);
|
924 |
-
|
925 |
-
token(text,tok);
|
926 |
-
lna = strlen(sf);
|
927 |
-
lnt = strlen(tok[ntk-1]);
|
928 |
-
|
929 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
930 |
-
if(!flag) return 0;
|
931 |
-
|
932 |
-
do {
|
933 |
-
if(!exist_skipword(lna)) continue;
|
934 |
-
if(!skipword_ok(lna,1)) continue;
|
935 |
-
if(!is_FirstLetMatch(lna,genFL)) continue;
|
936 |
-
|
937 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
938 |
-
return 1;
|
939 |
-
} while(search_backward_adv(sf,genFL));
|
940 |
-
|
941 |
-
return 0;
|
942 |
-
}
|
943 |
-
|
944 |
-
|
945 |
-
long WithinWrdWrd::strategy(const char *sf_, const char *str_)
|
946 |
-
{
|
947 |
-
long lna,lnt,flag;
|
948 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
949 |
-
|
950 |
-
str_tolower(sf_,sf);
|
951 |
-
str_tolower(str_,text);
|
952 |
-
|
953 |
-
token(text,tok);
|
954 |
-
lna = strlen(sf);
|
955 |
-
lnt = strlen(tok[ntk-1]);
|
956 |
-
|
957 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
958 |
-
if(!flag) return 0;
|
959 |
-
|
960 |
-
do {
|
961 |
-
if(!skipword_ok(lna,0)) continue;
|
962 |
-
if(!is_subword(lna)) continue;
|
963 |
-
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
964 |
-
|
965 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
966 |
-
return 1;
|
967 |
-
} while(search_backward_adv(sf,genFL));
|
968 |
-
|
969 |
-
return 0;
|
970 |
-
}
|
971 |
-
|
972 |
-
|
973 |
-
long WithinWrdFWrd::strategy(const char *sf_, const char *str_)
|
974 |
-
{
|
975 |
-
long lna,lnt,flag;
|
976 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
977 |
-
|
978 |
-
str_tolower(sf_,sf);
|
979 |
-
str_tolower(str_,text);
|
980 |
-
|
981 |
-
token(text,tok);
|
982 |
-
lna = strlen(sf);
|
983 |
-
lnt = strlen(tok[ntk-1]);
|
984 |
-
|
985 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
986 |
-
if(!flag) return 0;
|
987 |
-
|
988 |
-
do {
|
989 |
-
if(!skipword_ok(lna,0)) continue;
|
990 |
-
if(!is_subword(lna)) continue;
|
991 |
-
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
992 |
-
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
993 |
-
|
994 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
995 |
-
return 1;
|
996 |
-
} while(search_backward_adv(sf,genFL));
|
997 |
-
|
998 |
-
return 0;
|
999 |
-
}
|
1000 |
-
|
1001 |
-
|
1002 |
-
long WithinWrdFWrdSkp::strategy(const char *sf_, const char *str_)
|
1003 |
-
{
|
1004 |
-
long lna,lnt,flag;
|
1005 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1006 |
-
|
1007 |
-
str_tolower(sf_,sf);
|
1008 |
-
str_tolower(str_,text);
|
1009 |
-
|
1010 |
-
token(text,tok);
|
1011 |
-
lna = strlen(sf);
|
1012 |
-
lnt = strlen(tok[ntk-1]);
|
1013 |
-
|
1014 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1015 |
-
if(!flag) return 0;
|
1016 |
-
|
1017 |
-
do {
|
1018 |
-
if(!exist_skipword(lna)) continue;
|
1019 |
-
if(!skipword_ok(lna,1)) continue;
|
1020 |
-
if(!is_subword(lna)) continue;
|
1021 |
-
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1022 |
-
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1023 |
-
|
1024 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1025 |
-
return 1;
|
1026 |
-
} while(search_backward_adv(sf,genFL));
|
1027 |
-
|
1028 |
-
return 0;
|
1029 |
-
}
|
1030 |
-
|
1031 |
-
|
1032 |
-
long WithinWrdLet::strategy(const char *sf_, const char *str_)
|
1033 |
-
{
|
1034 |
-
long lna,lnt,flag;
|
1035 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1036 |
-
|
1037 |
-
str_tolower(sf_,sf);
|
1038 |
-
str_tolower(str_,text);
|
1039 |
-
|
1040 |
-
token(text,tok);
|
1041 |
-
lna = strlen(sf);
|
1042 |
-
lnt = strlen(tok[ntk-1]);
|
1043 |
-
|
1044 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1045 |
-
if(!flag) return 0;
|
1046 |
-
|
1047 |
-
do {
|
1048 |
-
if(!skipword_ok(lna,0)) continue;
|
1049 |
-
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1050 |
-
|
1051 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1052 |
-
return 1;
|
1053 |
-
} while(search_backward_adv(sf,genFL));
|
1054 |
-
|
1055 |
-
return 0;
|
1056 |
-
}
|
1057 |
-
|
1058 |
-
|
1059 |
-
long WithinWrdFLet::strategy(const char *sf_, const char *str_)
|
1060 |
-
{
|
1061 |
-
long lna,lnt,flag;
|
1062 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1063 |
-
|
1064 |
-
str_tolower(sf_,sf);
|
1065 |
-
str_tolower(str_,text);
|
1066 |
-
|
1067 |
-
token(text,tok);
|
1068 |
-
lna = strlen(sf);
|
1069 |
-
lnt = strlen(tok[ntk-1]);
|
1070 |
-
|
1071 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1072 |
-
if(!flag) return 0;
|
1073 |
-
|
1074 |
-
do {
|
1075 |
-
if(!skipword_ok(lna,0)) continue;
|
1076 |
-
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1077 |
-
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1078 |
-
|
1079 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1080 |
-
return 1;
|
1081 |
-
} while(search_backward_adv(sf,genFL));
|
1082 |
-
|
1083 |
-
return 0;
|
1084 |
-
}
|
1085 |
-
|
1086 |
-
|
1087 |
-
long WithinWrdFLetSkp::strategy(const char *sf_, const char *str_)
|
1088 |
-
{
|
1089 |
-
long lna,lnt,flag;
|
1090 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1091 |
-
|
1092 |
-
str_tolower(sf_,sf);
|
1093 |
-
str_tolower(str_,text);
|
1094 |
-
|
1095 |
-
token(text,tok);
|
1096 |
-
lna = strlen(sf);
|
1097 |
-
lnt = strlen(tok[ntk-1]);
|
1098 |
-
|
1099 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1100 |
-
if(!flag) return 0;
|
1101 |
-
|
1102 |
-
do {
|
1103 |
-
if(!exist_skipword(lna)) continue;
|
1104 |
-
if(!skipword_ok(lna,1)) continue;
|
1105 |
-
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1106 |
-
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1107 |
-
|
1108 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1109 |
-
return 1;
|
1110 |
-
} while(search_backward_adv(sf,genFL));
|
1111 |
-
|
1112 |
-
return 0;
|
1113 |
-
}
|
1114 |
-
|
1115 |
-
|
1116 |
-
long ContLet::strategy(const char *sf_, const char *str_)
|
1117 |
-
{
|
1118 |
-
long lna,lnt,flag;
|
1119 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1120 |
-
|
1121 |
-
str_tolower(sf_,sf);
|
1122 |
-
str_tolower(str_,text);
|
1123 |
-
|
1124 |
-
token(text,tok);
|
1125 |
-
lna = strlen(sf);
|
1126 |
-
lnt = strlen(tok[ntk-1]);
|
1127 |
-
|
1128 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1129 |
-
if(!flag) return 0;
|
1130 |
-
|
1131 |
-
do {
|
1132 |
-
if(!skipword_ok(lna,0)) continue;
|
1133 |
-
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1134 |
-
if(!is_ContLetMatch(lna)) continue;
|
1135 |
-
|
1136 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1137 |
-
return 1;
|
1138 |
-
} while(search_backward_adv(sf,genFL));
|
1139 |
-
|
1140 |
-
return 0;
|
1141 |
-
}
|
1142 |
-
|
1143 |
-
|
1144 |
-
long ContLetSkp::strategy(const char *sf_, const char *str_)
|
1145 |
-
{
|
1146 |
-
long lna,lnt,flag;
|
1147 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1148 |
-
|
1149 |
-
str_tolower(sf_,sf);
|
1150 |
-
str_tolower(str_,text);
|
1151 |
-
|
1152 |
-
token(text,tok);
|
1153 |
-
lna = strlen(sf);
|
1154 |
-
lnt = strlen(tok[ntk-1]);
|
1155 |
-
|
1156 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1157 |
-
if(!flag) return 0;
|
1158 |
-
|
1159 |
-
do {
|
1160 |
-
if(!exist_skipword(lna)) continue;
|
1161 |
-
if(!skipword_ok(lna,1)) continue;
|
1162 |
-
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1163 |
-
if(!is_ContLetMatch(lna)) continue;
|
1164 |
-
|
1165 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1166 |
-
return 1;
|
1167 |
-
} while(search_backward_adv(sf,genFL));
|
1168 |
-
|
1169 |
-
return 0;
|
1170 |
-
}
|
1171 |
-
|
1172 |
-
|
1173 |
-
long AnyLet::strategy(const char *sf_, const char *str_)
|
1174 |
-
{
|
1175 |
-
long lna,lnt,flag;
|
1176 |
-
bool genFL=true; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
1177 |
-
|
1178 |
-
str_tolower(sf_,sf);
|
1179 |
-
str_tolower(str_,text);
|
1180 |
-
|
1181 |
-
token(text,tok);
|
1182 |
-
lna = strlen(sf);
|
1183 |
-
lnt = strlen(tok[ntk-1]);
|
1184 |
-
|
1185 |
-
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1186 |
-
if(!flag) return 0;
|
1187 |
-
|
1188 |
-
do {
|
1189 |
-
if(!skipword_ok(lna,1)) continue;
|
1190 |
-
|
1191 |
-
extract_lf(mod[0][0],ntk-1,str_);
|
1192 |
-
return 1;
|
1193 |
-
} while(search_backward_adv(sf,genFL));
|
1194 |
-
|
1195 |
-
return 0;
|
1196 |
-
}
|
1197 |
-
|
1198 |
-
|
1199 |
-
|
1200 |
-
//-----
|
1201 |
-
AbbrStra * StratUtil::strat_factory(string name)
|
1202 |
-
{
|
1203 |
-
if(name=="FirstLetOneChSF") return new FirstLetOneChSF;
|
1204 |
-
else if(name=="FirstLet") return new FirstLet;
|
1205 |
-
else if(name=="FirstLetGen") return new FirstLetGen;
|
1206 |
-
else if(name=="FirstLetGen2") return new FirstLetGen2;
|
1207 |
-
else if(name=="FirstLetGenS") return new FirstLetGenS;
|
1208 |
-
else if(name=="FirstLetGenStp") return new FirstLetGenStp;
|
1209 |
-
else if(name=="FirstLetGenStp2") return new FirstLetGenStp2;
|
1210 |
-
else if(name=="FirstLetGenSkp") return new FirstLetGenSkp;
|
1211 |
-
else if(name=="WithinWrdWrd") return new WithinWrdWrd;
|
1212 |
-
else if(name=="WithinWrdFWrd") return new WithinWrdFWrd;
|
1213 |
-
else if(name=="WithinWrdFWrdSkp") return new WithinWrdFWrdSkp;
|
1214 |
-
else if(name=="WithinWrdLet") return new WithinWrdLet;
|
1215 |
-
else if(name=="WithinWrdFLet") return new WithinWrdFLet;
|
1216 |
-
else if(name=="WithinWrdFLetSkp") return new WithinWrdFLetSkp;
|
1217 |
-
else if(name=="ContLet") return new ContLet;
|
1218 |
-
else if(name=="ContLetSkp") return new ContLetSkp;
|
1219 |
-
else if(name=="AnyLet") return new AnyLet;
|
1220 |
-
else { cout << "Fail strat_factory\n"; exit(1); }
|
1221 |
-
}
|
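// A minimal usage sketch (a hypothetical driver, not part of this file); it
// assumes wData and lf are accessible members, as this file suggests, and the
// caller owns the returned strategy object:
//
//   AbbrStra *strat = util.strat_factory("FirstLet");
//   strat->wData = &wdata;              // shared word/stopword/LF tables
//   if(strat->strategy("HIV", "human immunodeficiency virus"))
//     cout << strat->lf << endl;
//   delete strat;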


//check if sf is ok and assign a group
//if sf length > 5, use 5!!
//grp will be Al+#ChInSF, Num+#ChInSF, or Spec+#ChInSF
bool StratUtil::group_sf(const char *sf, string &grp)
{
  long i, j, len=strlen(sf);
  long al=0, num=0, nonalnum=0;
  long paren=0, sbrac=0;

  grp = ""; // if failure, no group

  if(!isalnum(sf[0])) return false; //1st ch must be alnum
  for(i=0; i<len; i++) {
    if(isalpha(sf[i])) al++;
    else if(isdigit(sf[i])) num++;
    else nonalnum++;
  }
  if(al<1) return false; //at least one alphabet

  //false for unbalanced parentheses or square brackets
  for(i=len-1; i>=0; i--) {
    if(sf[i]=='(') paren++;
    if(sf[i]==')') paren--;
    if(sf[i]=='[') sbrac++;
    if(sf[i]==']') sbrac--;
  }
  if(paren!=0 || sbrac!=0) return false;

  if(al==len) grp.assign("Al");
  else if(num>0) grp.assign("Num");
  else if(nonalnum>0) grp.assign("Spec");
  else { cout << "No sf group\n"; exit(1); }

  //append sf length
  len = len>5 ? 5 : len;

  switch(len) {
  case 1:
    grp.append("1");
    break;
  case 2:
    grp.append("2");
    break;
  case 3:
    grp.append("3");
    break;
  case 4:
    grp.append("4");
    break;
  case 5:
    grp.append("5");
    break;
  default:
    cout << "Not defined #-ch SF" << endl;
    exit(1);
  }

  return true;
}
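// Examples: group_sf("PCR", g)    -> g=="Al3"   (all letters)
//           group_sf("IL-2", g)   -> g=="Num4"  (contains a digit)
//           group_sf("T-cell", g) -> g=="Spec5" (length capped at 5)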

//add the condition |lf| >= |sf|
bool StratUtil::group_sf(const char *sf, const char *lf, string &grp)
{
  long i, j, len=strlen(sf);
  long al=0, num=0, nonalnum=0;
  long paren=0, sbrac=0;

  if(strlen(lf)<len) return false; //require |lf| >= |sf|
  if(!isalnum(sf[0])) return false; //1st ch must be alnum
  for(i=0; i<len; i++) {
    if(isalpha(sf[i])) al++;
    else if(isdigit(sf[i])) num++;
    else nonalnum++;
  }
  if(al<1) return false; //at least one alphabet
  if(al>10) return false; //|alpha sf| is at most 10
  if(num_token(sf)>2) return false; //added Feb-21-08

  //false for unbalanced parentheses or square brackets
  for(i=len-1; i>=0; i--) {
    if(sf[i]=='(') paren++;
    if(sf[i]==')') paren--;
    if(sf[i]=='[') sbrac++;
    if(sf[i]==']') sbrac--;
  }
  if(paren!=0 || sbrac!=0) return false;

  if(al==len) grp.assign("Al");
  else if(num>0) grp.assign("Num");
  else if(nonalnum>0) grp.assign("Spec");
  else { cout << "No sf group\n"; exit(1); }

  //append sf length
  len = len>5 ? 5 : len;

  switch(len) {
  case 1:
    grp.append("1");
    break;
  case 2:
    grp.append("2");
    break;
  case 3:
    grp.append("3");
    break;
  case 4:
    grp.append("4");
    break;
  case 5:
    grp.append("5");
    break;
  default:
    cout << "Not defined #-ch SF" << endl;
    exit(1);
  }

  return true;
}


//remove non-alnum in str1 and save it to str2
void StratUtil::remove_nonAlnum(const char *str1, char *str2)
{
  long i=0, j=0;

  while(str1[i]) {
    if(isalnum(str1[i])) {
      str2[j] = str1[i];
      j++;
    }
    i++;
  }
  str2[j] = '\0';
}


vector<string> StratUtil::get_strats(string s)
{
  if(s=="Al1") return Al1;
  else if(s=="Al2") return Al2;
  else if(s=="Al3") return Al3;
  else if(s=="Al4") return Al4;
  else if(s=="Al5") return Al5;
  else if(s=="Num2") return Num2;
  else if(s=="Num3") return Num3;
  else if(s=="Num4") return Num4;
  else if(s=="Num5") return Num5;
  else if(s=="Spec2") return Spec2;
  else if(s=="Spec3") return Spec3;
  else if(s=="Spec4") return Spec4;
  else if(s=="Spec5") return Spec5;
  else { cout << "Incorrect name\n"; exit(1); }
}


void StratUtil::push_back_strat(string sgp, string strat)
{
  if(sgp=="Al1") Al1.push_back(strat);
  else if(sgp=="Al2") Al2.push_back(strat);
  else if(sgp=="Al3") Al3.push_back(strat);
  else if(sgp=="Al4") Al4.push_back(strat);
  else if(sgp=="Al5") Al5.push_back(strat);
  else if(sgp=="Num2") Num2.push_back(strat);
  else if(sgp=="Num3") Num3.push_back(strat);
  else if(sgp=="Num4") Num4.push_back(strat);
  else if(sgp=="Num5") Num5.push_back(strat);
  else if(sgp=="Spec2") Spec2.push_back(strat);
  else if(sgp=="Spec3") Spec3.push_back(strat);
  else if(sgp=="Spec4") Spec4.push_back(strat);
  else if(sgp=="Spec5") Spec5.push_back(strat);
}


long StratUtil::exist_upperal(const char *str)
{
  long i, len=strlen(str);

  for(i=0; i<len; i++)
    if(isupper(str[i]))
      return 1;
  return 0;
}

long StratUtil::num_token(const char *str)
{
  long i,j=0,k=0;
  long n=strlen(str)-1;

  while(isblank(str[n])) n--;

  while(str[j]){
    while(isblank(str[j]))j++;
    i=j;
    while((str[j])&&(!isblank(str[j])))j++;
    if(str[j]){
      k++;
      j++;
    }
  }
  if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
  return k+1; //# tokens
}
//-----
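For orientation, here is a minimal, self-contained sketch of how the pieces above fit together. It is an assumed driver, not part of this file: the WordData table names, the public wData/lf members, and the chosen strategy name are inferred from the code above rather than from a documented API.

#include "AbbrStra.h"
#include <iostream>

int main()
{
  // Table file names below are assumptions, not a documented interface.
  WordData wdata("Ab3P_wrdset", "stop", "Lf1chSf");
  StratUtil util;

  // FirstLet: every SF character must match the first letter of an LF token.
  AbbrStra *strat = util.strat_factory("FirstLet");
  strat->wData = &wdata;   // strategies read stp/wrdset/lfs through wData

  if(strat->strategy("HIV", "infection with human immunodeficiency virus"))
    std::cout << "LF: " << strat->lf << std::endl;  // "human immunodeficiency virus"

  delete strat;
  return 0;
}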
|
|
1 |
+
#include "AbbrStra.h"
|
2 |
+
#include <runn.h>
|
3 |
+
#include <vector>
|
4 |
+
#include <fstream>
|
5 |
+
#include <iostream>
|
6 |
+
|
7 |
+
|
8 |
+
WordData::WordData(const char *wrdnam, const char *stpnam,
|
9 |
+
const char *lfsnam) :
|
10 |
+
wrdset(wrdnam), stp(stpnam), lfs(lfsnam)
|
11 |
+
{
|
12 |
+
wrdset.set_path_name("Ab3P");
|
13 |
+
wrdset.gopen_ctable_map();
|
14 |
+
stp.set_path_name("Ab3P");
|
15 |
+
stp.gopen_htable_map();
|
16 |
+
lfs.set_path_name("Ab3P");
|
17 |
+
lfs.gopen_htable_map();
|
18 |
+
}
|
19 |
+
|
20 |
+
WordData::~WordData()
|
21 |
+
{
|
22 |
+
wrdset.gclose_ctable_map();
|
23 |
+
stp.gclose_htable_map();
|
24 |
+
lfs.gclose_htable_map();
|
25 |
+
}
|
26 |
+
|
27 |
+
|
28 |
+
AbbrStra::AbbrStra()
|
29 |
+
{
|
30 |
+
npairs = tpairs = nsfs = nmatchs = amatchs = 0;
|
31 |
+
}
|
32 |
+
|
33 |
+
|
34 |
+
AbbrStra::~AbbrStra()
|
35 |
+
{
|
36 |
+
}
|
37 |
+
|
38 |
+
|
39 |
+
void AbbrStra::token(const char *str, char lst[1000][1000])
|
40 |
+
{
|
41 |
+
long i,j=0,k=0;
|
42 |
+
long n=strlen(str)-1;
|
43 |
+
|
44 |
+
while(isblank(str[n])) n--;
|
45 |
+
|
46 |
+
while(str[j]){
|
47 |
+
while(isblank(str[j]))j++;
|
48 |
+
i=j;
|
49 |
+
while((str[j])&&(!isblank(str[j])))j++;
|
50 |
+
strncpy(lst[k],str+i,j-i);
|
51 |
+
lst[k][j-i]='\0';
|
52 |
+
if(str[j]){
|
53 |
+
k++;
|
54 |
+
j++;
|
55 |
+
}
|
56 |
+
}
|
57 |
+
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
58 |
+
ntk=k+1; //# tokens, ntk is data member
|
59 |
+
}
|
60 |
+
|
61 |
+
|
62 |
+
long AbbrStra::tokenize(const char *str, char lst[1000][1000])
|
63 |
+
{
|
64 |
+
long i,j=0,k=0;
|
65 |
+
long n=strlen(str)-1;
|
66 |
+
|
67 |
+
while(isblank(str[n])) n--;
|
68 |
+
|
69 |
+
while(str[j]){
|
70 |
+
while(isblank(str[j]))j++;
|
71 |
+
i=j;
|
72 |
+
while((str[j])&&(!isblank(str[j])))j++;
|
73 |
+
strncpy(lst[k],str+i,j-i);
|
74 |
+
lst[k][j-i]='\0';
|
75 |
+
if(str[j]){
|
76 |
+
k++;
|
77 |
+
j++;
|
78 |
+
}
|
79 |
+
}
|
80 |
+
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
81 |
+
return k+1; //# tokens
|
82 |
+
}
|
83 |
+
|
84 |
+
|
85 |
+
long AbbrStra::num_token(const char *str)
|
86 |
+
{
|
87 |
+
long i,j=0,k=0;
|
88 |
+
long n=strlen(str)-1;
|
89 |
+
|
90 |
+
while(isblank(str[n])) n--;
|
91 |
+
|
92 |
+
while(str[j]){
|
93 |
+
while(isblank(str[j]))j++;
|
94 |
+
i=j;
|
95 |
+
while((str[j])&&(!isblank(str[j])))j++;
|
96 |
+
if(str[j]){
|
97 |
+
k++;
|
98 |
+
j++;
|
99 |
+
}
|
100 |
+
}
|
101 |
+
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
102 |
+
return k+1; //# tokens
|
103 |
+
}
|
104 |
+
|
105 |
+
|
106 |
+
// fch is 1st char of str token from backward
|
107 |
+
long AbbrStra::first_ch(const char *str, char *fch, long num)
|
108 |
+
{
|
109 |
+
long i, j, numtk;
|
110 |
+
char tk[1000][1000];
|
111 |
+
|
112 |
+
numtk = tokenize(str,tk);
|
113 |
+
if(num>numtk) return 0;
|
114 |
+
|
115 |
+
for(i=0; i<num; i++)
|
116 |
+
fch[i] = tk[numtk-num+i][0];
|
117 |
+
|
118 |
+
return 1;
|
119 |
+
}
|
120 |
+
|
121 |
+
long AbbrStra::is_upperal(const char *str)
|
122 |
+
{
|
123 |
+
for(long i=strlen(str)-1; i>=0; i--)
|
124 |
+
if(!isupper(str[i]) || !isalpha(str[i]))
|
125 |
+
return 0;
|
126 |
+
return 1;
|
127 |
+
}
|
128 |
+
|
129 |
+
long AbbrStra::is_alpha(const char *str)
|
130 |
+
{
|
131 |
+
for(long i=strlen(str)-1; i>=0; i--)
|
132 |
+
if(!isalpha(str[i]))
|
133 |
+
return 0;
|
134 |
+
return 1;
|
135 |
+
}
|
136 |
+
|
137 |
+
|
138 |
+
// str2 will lower-case of str1
|
139 |
+
void AbbrStra::str_tolower(const char *str1, char *str2)
|
140 |
+
{
|
141 |
+
long i=0;
|
142 |
+
|
143 |
+
while(str1[i]) {
|
144 |
+
str2[i] = tolower(str1[i]);
|
145 |
+
i++;
|
146 |
+
}
|
147 |
+
str2[i] = '\0';
|
148 |
+
}
|
149 |
+
|
150 |
+
//copy num tokens from back of str1 to str2
|
151 |
+
long AbbrStra::get_str(const char *str1, char *str2, long num)
|
152 |
+
{
|
153 |
+
char ch, tk[1000][1000];
|
154 |
+
long i, j, numtk;
|
155 |
+
|
156 |
+
if(num<0) { cout<<"num<0\n"; exit(1); }
|
157 |
+
numtk = tokenize(str1,tk);
|
158 |
+
if(numtk<num) return 0;
|
159 |
+
|
160 |
+
strcpy(str2,tk[numtk-num]);
|
161 |
+
for(i=1; i<num; i++) {
|
162 |
+
strcat(str2," ");
|
163 |
+
strcat(str2,tk[numtk-num+i]);
|
164 |
+
}
|
165 |
+
|
166 |
+
return 1;
|
167 |
+
}
|
168 |
+
|
169 |
+
bool AbbrStra::isupper_str(const char *str)
|
170 |
+
{
|
171 |
+
long i, len=strlen(str);
|
172 |
+
|
173 |
+
for(i=0; i<len; i++)
|
174 |
+
if(isalpha(str[i]) && !isupper(str[i]))
|
175 |
+
return false;
|
176 |
+
|
177 |
+
return true;
|
178 |
+
}
|
179 |
+
|
180 |
+
bool AbbrStra::is_onealpha(const char *str)
|
181 |
+
{
|
182 |
+
long i, j=0, len=strlen(str);
|
183 |
+
|
184 |
+
for(i=0; i<len; i++)
|
185 |
+
if(isalpha(str[i])) j++;
|
186 |
+
|
187 |
+
if(j==1) return true;
|
188 |
+
else return false;
|
189 |
+
}
|
190 |
+
|
191 |
+
long AbbrStra::count_upperstr(const char *str)
|
192 |
+
{
|
193 |
+
long i, j, k, numtk;
|
194 |
+
char tk[1000][1000];
|
195 |
+
|
196 |
+
numtk = tokenize(str,tk);
|
197 |
+
|
198 |
+
j = 0;
|
199 |
+
for(i=numtk-1; i>=0; i--) {
|
200 |
+
if(isupper(tk[i][0])) j++;
|
201 |
+
else return j;
|
202 |
+
}
|
203 |
+
|
204 |
+
return j;
|
205 |
+
}
|
206 |
+
|
207 |
+
void AbbrStra::get_alpha(const char *str1, char *str2)
|
208 |
+
{
|
209 |
+
long i = 0, j = 0;
|
210 |
+
long len = strlen(str1);
|
211 |
+
|
212 |
+
while(i<len) {
|
213 |
+
if(isalpha(str1[i])) {
|
214 |
+
str2[j] = str1[i];
|
215 |
+
j++;
|
216 |
+
}
|
217 |
+
i++;
|
218 |
+
}
|
219 |
+
str2[j] = '\0';
|
220 |
+
}
|
221 |
+
|
222 |
+
|
223 |
+
bool AbbrStra::lf_ok(const char *shrtf, const char *longf)
|
224 |
+
{
|
225 |
+
long i;
|
226 |
+
long paren=0, sbrac=0;
|
227 |
+
string s, l;
|
228 |
+
|
229 |
+
//false for one parenthesis or square bracket
|
230 |
+
for(i=strlen(longf)-1; i>=0; i--) {
|
231 |
+
if(longf[i]=='(') paren++;
|
232 |
+
if(longf[i]==')') paren--;
|
233 |
+
if(longf[i]=='[') sbrac++;
|
234 |
+
if(longf[i]==']') sbrac--;
|
235 |
+
}
|
236 |
+
if(paren!=0 || sbrac!=0) return false;
|
237 |
+
|
238 |
+
s.assign(shrtf);
|
239 |
+
l.assign(longf);
|
240 |
+
|
241 |
+
for(i=0; i<s.length(); i++) s[i]=tolower(s[i]);
|
242 |
+
for(i=0; i<l.length(); i++) l[i]=tolower(l[i]);
|
243 |
+
|
244 |
+
//false if LF words contain SF
|
245 |
+
if( (" "+l+" ").find(" "+s+" ")!=string::npos ) return false;
|
246 |
+
|
247 |
+
return true;
|
248 |
+
}
|
249 |
+
|
250 |
+
|
251 |
+
//first=true: allow 1-ahpha, 0 don't allow
|
252 |
+
long AbbrStra::search_backward(long sloc, long tinx, long tloc, const char *abbr, bool first)
|
253 |
+
{
|
254 |
+
long sfloc=sloc, tkinx=tinx, tkloc=tloc;
|
255 |
+
|
256 |
+
while(sfloc>=0) {
|
257 |
+
loop1: while((tkloc>=0)&&(tok[tkinx][tkloc]!=abbr[sfloc])) tkloc--;
|
258 |
+
if(tkloc<0) {
|
259 |
+
tkinx--;
|
260 |
+
if(tkinx<0) return 0; //moved to here (Sep-14-07)
|
261 |
+
tkloc=strlen(tok[tkinx])-1;
|
262 |
+
}
|
263 |
+
else {
|
264 |
+
if(sfloc==0) {
|
265 |
+
if(tkloc!=0) {
|
266 |
+
if(!first) { tkloc--; goto loop1; }
|
267 |
+
else if(isalnum(tok[tkinx][tkloc-1])) { tkloc--; goto loop1; }
|
268 |
+
}
|
269 |
+
}
|
270 |
+
mod[sfloc][0]=tkinx;
|
271 |
+
mod[sfloc][1]=tkloc;
|
272 |
+
sfloc--; tkloc--;
|
273 |
+
}
|
274 |
+
}
|
275 |
+
|
276 |
+
return 1;
|
277 |
+
}
|
278 |
+
|
279 |
+
long AbbrStra::search_backward_adv(const char *abbr, bool flag)
|
280 |
+
{
|
281 |
+
long i;
|
282 |
+
long lna=strlen(abbr);
|
283 |
+
|
284 |
+
i=0;
|
285 |
+
while(i<lna){
|
286 |
+
if(search_backward(i,mod[i][0],mod[i][1]-1,abbr,flag)) return 1;
|
287 |
+
i++;
|
288 |
+
}
|
289 |
+
return 0;
|
290 |
+
}
|
291 |
+
|
292 |
+
void AbbrStra::extract_lf(long begin, long end)
|
293 |
+
{
|
294 |
+
strcpy(lf,tok[begin]);
|
295 |
+
for(long i=begin+1; i<=end; i++) {
|
296 |
+
strcat(lf," ");
|
297 |
+
strcat(lf,tok[i]);
|
298 |
+
}
|
299 |
+
}
|
300 |
+
|
301 |
+
|
302 |
+
void AbbrStra::extract_lf(long begin, long end, const char *str)
|
303 |
+
{
|
304 |
+
token(str,tok);
|
305 |
+
strcpy(lf,tok[begin]);
|
306 |
+
for(long i=begin+1; i<=end; i++) {
|
307 |
+
strcat(lf," ");
|
308 |
+
strcat(lf,tok[i]);
|
309 |
+
}
|
310 |
+
}
|
311 |
+
|
312 |
+
//---
|
313 |
+
bool AbbrStra::exist_skipword(long nsf)
|
314 |
+
{
|
315 |
+
long i=0, j=0, k;
|
316 |
+
|
317 |
+
while(i<nsf) {
|
318 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
319 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
320 |
+
if(k>0) j+=k;
|
321 |
+
i++;
|
322 |
+
}
|
323 |
+
|
324 |
+
if(j>0) return true;
|
325 |
+
else return false;
|
326 |
+
}
|
327 |
+
|
328 |
+
|
329 |
+
bool AbbrStra::exist_n_skipwords(long nsf, long n)
|
330 |
+
{
|
331 |
+
long i=0, j, k;
|
332 |
+
bool flag=false;
|
333 |
+
|
334 |
+
//k: # skip words
|
335 |
+
while(i<nsf) {
|
336 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
337 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
338 |
+
if(k>n) return false;
|
339 |
+
if(k==n) flag=true;
|
340 |
+
i++;
|
341 |
+
}
|
342 |
+
|
343 |
+
if(flag) return true;
|
344 |
+
else return false;
|
345 |
+
}
|
346 |
+
|
347 |
+
//exists n consecutive skip stopwords between tokens
|
348 |
+
bool AbbrStra::exist_n_stopwords(long nsf, long n)
|
349 |
+
{
|
350 |
+
long i=0, j, k;
|
351 |
+
bool flag=false;
|
352 |
+
|
353 |
+
while(i<nsf) {
|
354 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
355 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
356 |
+
if(k>n) return false;
|
357 |
+
if(k==n) flag=true;
|
358 |
+
if(k>0) { //skip word exists
|
359 |
+
while(k) {
|
360 |
+
if(!wData->stp.find(tok[mod[i][0]+k])) return false;
|
361 |
+
k--;
|
362 |
+
}
|
363 |
+
}
|
364 |
+
i++;
|
365 |
+
}
|
366 |
+
|
367 |
+
if(flag) return true;
|
368 |
+
else return false;
|
369 |
+
}
|
370 |
+
|
371 |
+
|
372 |
+
bool AbbrStra::stopword_ok(long nsf, long nsw)
|
373 |
+
{
|
374 |
+
long i=0, j, k;
|
375 |
+
|
376 |
+
while(i<nsf) {
|
377 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
378 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
379 |
+
if(k>nsw) return false;
|
380 |
+
if(k>0) { //skip word exists
|
381 |
+
while(k) {
|
382 |
+
if(!wData->stp.find(tok[mod[i][0]+k])) return false;
|
383 |
+
k--;
|
384 |
+
}
|
385 |
+
}
|
386 |
+
i++;
|
387 |
+
}
|
388 |
+
|
389 |
+
return true;
|
390 |
+
}
|
391 |
+
|
392 |
+
bool AbbrStra::skip_stop_ok(long nsf, long nsw, long n)
|
393 |
+
{
|
394 |
+
long i=0, j, k, nstp;
|
395 |
+
|
396 |
+
while(i<nsf) {
|
397 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
398 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
399 |
+
if(k>nsw) return false;
|
400 |
+
//if(k>0) { //skip word exists
|
401 |
+
if(k>(nsw-n)) {
|
402 |
+
nstp=0; //# skiped stopword between tokens
|
403 |
+
while(k) {
|
404 |
+
if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
|
405 |
+
k--;
|
406 |
+
}
|
407 |
+
if(nstp<n) return false;
|
408 |
+
}
|
409 |
+
i++;
|
410 |
+
}
|
411 |
+
|
412 |
+
return true;
|
413 |
+
}
|
414 |
+
|
415 |
+
|
416 |
+
bool AbbrStra::skip_stop_ok2(long nsf, long nsw, long n)
|
417 |
+
{
|
418 |
+
long i=0, j, k, nstp;
|
419 |
+
|
420 |
+
while(i<nsf) {
|
421 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
422 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
423 |
+
if((k>0)&&(k!=nsw)) return false;
|
424 |
+
if(k>0) { //skip word exists
|
425 |
+
nstp=0; //# skiped stopword between tokens
|
426 |
+
while(k) {
|
427 |
+
if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
|
428 |
+
k--;
|
429 |
+
}
|
430 |
+
if(nstp<n) return false;
|
431 |
+
}
|
432 |
+
|
433 |
+
i++;
|
434 |
+
}
|
435 |
+
|
436 |
+
return true;
|
437 |
+
}
|
438 |
+
|
439 |
+
|
440 |
+
bool AbbrStra::skipword_ok(long nsf, long nsw)
|
441 |
+
{
|
442 |
+
long i=0, j, k;
|
443 |
+
|
444 |
+
while(i<nsf) {
|
445 |
+
if(i==(nsf-1)) k=ntk-mod[i][0]-1;
|
446 |
+
else k=mod[i+1][0]-mod[i][0]-1;
|
447 |
+
if(k>nsw) return false;
|
448 |
+
i++;
|
449 |
+
}
|
450 |
+
|
451 |
+
return true;
|
452 |
+
}
|
453 |
+
|
454 |
+
|
455 |
+
bool AbbrStra::is_subword(long nsf)
|
456 |
+
{
|
457 |
+
long i=0;
|
458 |
+
char word[1000];
|
459 |
+
|
460 |
+
while(i<nsf) {
|
461 |
+
if(mod[i][1]!=0) {
|
462 |
+
strcpy(word,tok[mod[i][0]]+mod[i][1]);
|
463 |
+
if(wData->wrdset.count(word)==0) return false;
|
464 |
+
}
|
465 |
+
i++;
|
466 |
+
}
|
467 |
+
|
468 |
+
return true;
|
469 |
+
}
|
470 |
+
|
471 |
+
|
472 |
+
bool AbbrStra::is_BeginWrdMatch(long nsf, bool general)
|
473 |
+
{
|
474 |
+
long i=0, j;
|
475 |
+
bool *bwm = new bool [ntk]; //BeginWrdMatch of a given tok
|
476 |
+
|
477 |
+
for(j=0; j<ntk; j++) bwm[j] = false;
|
478 |
+
|
479 |
+
while(i<nsf) {
|
480 |
+
if(mod[i][1]==0)
|
481 |
+
bwm[mod[i][0]] = true;
|
482 |
+
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) )
|
483 |
+
bwm[mod[i][0]] = true;
|
484 |
+
i++;
|
485 |
+
}
|
486 |
+
|
487 |
+
for(j=0; j<nsf; j++)
|
488 |
+
if(!bwm[mod[j][0]]) {
|
489 |
+
delete [] bwm;
|
490 |
+
return false;
|
491 |
+
}
|
492 |
+
|
493 |
+
delete [] bwm;
|
494 |
+
|
495 |
+
return true;
|
496 |
+
}
|
497 |
+
|
498 |
+
|
499 |
+
bool AbbrStra::is_WithinWrdMatch(long nsf, bool general)
|
500 |
+
{
|
501 |
+
long i=0, wwm=0;
|
502 |
+
|
503 |
+
while(i<nsf) {
|
504 |
+
if(!general) {
|
505 |
+
if(mod[i][1]>0) wwm++;
|
506 |
+
}
|
507 |
+
else {
|
508 |
+
if(mod[i][1]>0 && isalnum(tok[mod[i][0]][mod[i][1]-1])) wwm++;
|
509 |
+
}
|
510 |
+
i++;
|
511 |
+
}
|
512 |
+
|
513 |
+
if(wwm>0) return true;
|
514 |
+
else return false;
|
515 |
+
}
|
516 |
+
|
517 |
+
|
518 |
+
bool AbbrStra::is_FirstLetMatch(long nsf, bool general)
|
519 |
+
{
|
520 |
+
long i=0, flm=0, flm2=0;
|
521 |
+
|
522 |
+
while(i<nsf) {
|
523 |
+
if(mod[i][1]==0) flm++;
|
524 |
+
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
|
525 |
+
flm++; flm2++;
|
526 |
+
}
|
527 |
+
i++;
|
528 |
+
}
|
529 |
+
|
530 |
+
if(flm==nsf) return true;
|
531 |
+
else return false;
|
532 |
+
}
|
533 |
+
|
534 |
+
|
535 |
+
bool AbbrStra::is_FirstLetMatch2(long nsf, bool general)
|
536 |
+
{
|
537 |
+
long i=0, flm=0, flm2=0;
|
538 |
+
|
539 |
+
while(i<nsf) {
|
540 |
+
if(mod[i][1]==0) flm++;
|
541 |
+
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
|
542 |
+
flm++; flm2++;
|
543 |
+
}
|
544 |
+
i++;
|
545 |
+
}
|
546 |
+
|
547 |
+
if( (flm==nsf) && (flm2>=1) ) return true;
|
548 |
+
else return false;
|
549 |
+
}
|
550 |
+
|
551 |
+
|
552 |
+
bool AbbrStra::is_FirstLetSMatch(const char *abbr, bool general)
|
553 |
+
{
|
554 |
+
long i=0, j=strlen(abbr)-1, flm=0, lsm=0;
|
555 |
+
|
556 |
+
while(i<j) {
|
557 |
+
if(mod[i][1]==0) flm++;
|
558 |
+
else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) flm++;
|
559 |
+
i++;
|
560 |
+
}
|
561 |
+
|
562 |
+
if( (tok[mod[j][0]][mod[j][1]]=='s') &&
|
563 |
+
(mod[j][1]==(strlen(tok[mod[j][0]])-1)) &&
|
564 |
+
mod[j][0]==mod[j-1][0] ) lsm++;
|
565 |
+
|
566 |
+
if((flm==j) && (lsm==1)) return true;
|
567 |
+
else return false;
|
568 |
+
}
|
569 |
+
|
570 |
+
|
571 |
+
bool AbbrStra::is_ContLetMatch(long nsf)
|
572 |
+
{
|
573 |
+
long i=0, cl=1;
|
574 |
+
|
575 |
+
while(i<(nsf-1)) {
|
576 |
+
if( mod[i][0]==mod[i+1][0] &&
|
577 |
+
(mod[i][1]+1)==mod[i+1][1] ) cl++;
|
578 |
+
i++;
|
579 |
+
}
|
580 |
+
|
581 |
+
if(cl>=2) return true;
|
582 |
+
else return false;
|
583 |
+
}
|
584 |
+
//----
|
585 |
+
|
586 |
+
|
587 |
+
//---1st ch must be alnum & at least one alphabet for all
|
588 |
+
//str1: sf
|
589 |
+
bool AbbrStra::set_condition(const char *str1)
|
590 |
+
{
|
591 |
+
int n=0, m=0, o=0;
|
592 |
+
|
593 |
+
switch(setCondition) {
|
594 |
+
case 1: //all alphabet SFs
|
595 |
+
for(long i=strlen(str1)-1; i>=0; i--)
|
596 |
+
if(!isalpha(str1[i]))
|
597 |
+
return false;
|
598 |
+
return true;
|
599 |
+
break;
|
600 |
+
case 2: //at least one non-alphabet
|
601 |
+
if(!isalnum(str1[0])) return false;
|
602 |
+
for(long i=strlen(str1)-1; i>=0; i--) {
|
603 |
+
if(isalpha(str1[i])) n++;
|
604 |
+
else m++;
|
605 |
+
}
|
606 |
+
if( (n>0) && (m>0) ) return true;
|
607 |
+
else return false;
|
608 |
+
break;
|
609 |
+
case 3: //only alnum & at least one num
|
610 |
+
for(long i=strlen(str1)-1; i>=0; i--) {
|
611 |
+
if(!isalnum(str1[i])) return false;
|
612 |
+
if(isalpha(str1[i])) n++;
|
613 |
+
if(isdigit(str1[i])) m++;
|
614 |
+
}
|
615 |
+
if( (n>0) && (m>0) ) return true;
|
616 |
+
else return false;
|
617 |
+
break;
|
618 |
+
case 4: //only alpha and non-alnum & at least one non-alnum
|
619 |
+
if(!isalpha(str1[0])) return false;
|
620 |
+
for(long i=strlen(str1)-1; i>=0; i--) {
|
621 |
+
if(isdigit(str1[i])) return false;
|
622 |
+
if(!isalnum(str1[i])) n++;
|
623 |
+
}
|
624 |
+
if(n>0) return true;
|
625 |
+
else return false;
|
626 |
+
break;
|
627 |
+
case 5: //at least one non-alnum
|
628 |
+
if(!isalnum(str1[0])) return false;
|
629 |
+
for(long i=strlen(str1)-1; i>0; i--) {
|
630 |
+
if(!isalnum(str1[i])) return true;
|
631 |
+
}
|
632 |
+
return false;
|
633 |
+
break;
|
634 |
+
case 6: //at least one num and non-alnum
|
635 |
+
if(!isalnum(str1[0])) return false;
|
636 |
+
for(long i=strlen(str1)-1; i>=0; i--) {
|
637 |
+
if(isalpha(str1[i])) n++;
|
638 |
+
if(isdigit(str1[i])) m++;
|
639 |
+
if(!isalnum(str1[i])) o++;
|
640 |
+
}
|
641 |
+
if( (n>0) && (m>0) && (o>0) ) return true;
|
642 |
+
else return false;
|
643 |
+
break;
|
644 |
+
case 7: //1+2 (SH algorithm)
|
645 |
+
if(!isalnum(str1[0])) return false;
|
646 |
+
for(long i=strlen(str1)-1; i>=0; i--)
|
647 |
+
if(isalpha(str1[i])) return true;
|
648 |
+
return false;
|
649 |
+
break;
|
650 |
+
default:
|
651 |
+
cout << "Not defined set condition\n";
|
652 |
+
exit(1);
|
653 |
+
}
|
654 |
+
}
|
655 |
+
|
656 |
+
//---
|
657 |
+
//same as FirstLet::set_condition
|
658 |
+
//but requires extra set conditions
|
659 |
+
bool FirstLetOneChSF::set_condition(const char *shrtf, const char *longf, char *str)
|
660 |
+
{
|
661 |
+
long i=0, len=strlen(shrtf), numtk;
|
662 |
+
char tk[1000][1000];
|
663 |
+
|
664 |
+
//sf conditions: all alphabet
|
665 |
+
while(i<len && isalpha(shrtf[i])) i++;
|
666 |
+
if(i!=len) return false;
|
667 |
+
|
668 |
+
//lf conditions: #tok>=|SF|, 1st ch of words must be alphabet
|
669 |
+
numtk = tokenize(longf,tk);
|
670 |
+
if(len>numtk) return false;
|
671 |
+
|
672 |
+
for(i=0; i<len; i++)
|
673 |
+
str[i] = tk[numtk-len+i][0];
|
674 |
+
str[i] = '\0';
|
675 |
+
|
676 |
+
if(!is_alpha(str)) return false;
|
677 |
+
|
678 |
+
return true;
|
679 |
+
}
|
680 |
+
|
681 |
+
|
682 |
+
long FirstLetOneChSF::strategy(const char *sf_, const char *str_) {
|
683 |
+
long lna,lnt,flag;
|
684 |
+
bool genFL=false; //1:allow 1-ahpha for 1ch of SF match, 0:don't
|
685 |
+
char phr[10000], phrl[10000];
|
686 |
+
|
687 |
+
str_tolower(sf_,sf);
|
688 |
+
str_tolower(str_,text);
|
689 |
+
|
690 |
+
get_str(str_,phr,1); //phr: 1st token of str from back
|
691 |
+
str_tolower(phr,phrl);
|
692 |
+
//conditions
|
693 |
+
if(is_onealpha(phr)) return 0; //last token includes 1 alphabet
|
694 |
+
if(isupper_str(phr)) return 0; //last token is all upper-case alphabet
|
695 |
+
if(wData->stp.find(phrl)) return 0; //last token is stopword
|
696 |
+
if(!wData->lfs.find(phrl)) return 0; //lfs (1-ch sf) for FirstLet match cases < 2
|
697 |
+
|
698 |
+
token(text,tok);
|
699 |
+
lna = strlen(sf);
|
700 |
+
lnt = strlen(tok[ntk-1]);
|
701 |
+
|
702 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
703 |
+
if(!flag) return 0;
|
704 |
+
|
705 |
+
do {
|
706 |
+
if(!skipword_ok(lna,0)) continue;
|
707 |
+
if(!is_FirstLetMatch(lna,genFL)) continue; //not allow 1-alpha
|
708 |
+
|
709 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
710 |
+
return 1;
|
711 |
+
} while(search_backward_adv(sf,genFL));
|
712 |
+
|
713 |
+
return 0;
|
714 |
+
}
|
715 |
+
//---
|
716 |
+
|
717 |
+
bool FirstLet::set_condition(const char *shrtf, const char *longf, char *str)
|
718 |
+
{
|
719 |
+
long i=0, len=strlen(shrtf), numtk;
|
720 |
+
char tk[1000][1000];
|
721 |
+
|
722 |
+
//sf conditions
|
723 |
+
while(i<len && isalpha(shrtf[i])) i++;
|
724 |
+
if(i!=len) return false;
|
725 |
+
|
726 |
+
//lf conditions
|
727 |
+
numtk = tokenize(longf,tk);
|
728 |
+
if(len>numtk) return false;
|
729 |
+
|
730 |
+
for(i=0; i<len; i++)
|
731 |
+
str[i] = tk[numtk-len+i][0];
|
732 |
+
str[i] = '\0';
|
733 |
+
|
734 |
+
if(!is_alpha(str)) return false;
|
735 |
+
|
736 |
+
return true;
|
737 |
+
}
|
738 |
+
|
739 |
+
|
740 |
+
long FirstLet::strategy(const char *sf_, const char *str_) {
|
741 |
+
long lna,lnt,flag;
|
742 |
+
bool genFL=false; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
743 |
+
|
744 |
+
str_tolower(sf_,sf);
|
745 |
+
str_tolower(str_,text);
|
746 |
+
|
747 |
+
token(text,tok);
|
748 |
+
lna = strlen(sf);
|
749 |
+
lnt = strlen(tok[ntk-1]);
|
750 |
+
|
751 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
752 |
+
if(!flag) return 0;
|
753 |
+
|
754 |
+
do {
|
755 |
+
if(!skipword_ok(lna,0)) continue;
|
756 |
+
if(!is_FirstLetMatch(lna,genFL)) continue; //don't allow 1-alpha
|
757 |
+
|
758 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
759 |
+
return 1;
|
760 |
+
} while(search_backward_adv(sf,genFL));
|
761 |
+
|
762 |
+
return 0;
|
763 |
+
}
|
764 |
+
|
765 |
+
|
766 |
+
long FirstLetGen::strategy(const char *sf_, const char *str_)
|
767 |
+
{
|
768 |
+
long lna,lnt,flag;
|
769 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
770 |
+
|
771 |
+
str_tolower(sf_,sf);
|
772 |
+
str_tolower(str_,text);
|
773 |
+
|
774 |
+
token(text,tok);
|
775 |
+
lna = strlen(sf);
|
776 |
+
lnt = strlen(tok[ntk-1]);
|
777 |
+
|
778 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
779 |
+
if(!flag) return 0;
|
780 |
+
|
781 |
+
do {
|
782 |
+
if(!skipword_ok(lna,0)) continue;
|
783 |
+
if(!is_FirstLetMatch2(lna,genFL)) continue; //at least 1-alpha
|
784 |
+
|
785 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
786 |
+
return 1;
|
787 |
+
} while(search_backward_adv(sf,genFL));
|
788 |
+
|
789 |
+
return 0;
|
790 |
+
}
|
791 |
+
|
792 |
+
|
793 |
+
long FirstLetGen2::strategy(const char *sf_, const char *str_)
|
794 |
+
{
|
795 |
+
long lna,lnt,flag;
|
796 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
797 |
+
|
798 |
+
str_tolower(sf_,sf);
|
799 |
+
str_tolower(str_,text);
|
800 |
+
|
801 |
+
token(text,tok);
|
802 |
+
lna = strlen(sf);
|
803 |
+
lnt = strlen(tok[ntk-1]);
|
804 |
+
|
805 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
806 |
+
if(!flag) return 0;
|
807 |
+
|
808 |
+
do {
|
809 |
+
if(!skipword_ok(lna,0)) continue;
|
810 |
+
if(!is_FirstLetMatch(lna,genFL)) continue;
|
811 |
+
|
812 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
813 |
+
return 1;
|
814 |
+
} while(search_backward_adv(sf,genFL));
|
815 |
+
|
816 |
+
return 0;
|
817 |
+
}
|
818 |
+
|
819 |
+
|
820 |
+
bool FirstLetGenS::set_condition(const char *str)
|
821 |
+
{
|
822 |
+
if(str[strlen(str)-1]!='s') return false;
|
823 |
+
|
824 |
+
for(long i=strlen(str)-2; i>=0; i--) {
|
825 |
+
if(!isupper(str[i])) return false;
|
826 |
+
if(!isalpha(str[i])) return false; //necessary?
|
827 |
+
}
|
828 |
+
|
829 |
+
return true;
|
830 |
+
}
|
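A quick illustration of the condition above, inferred from the loop: the SF must end in a lower-case 's' and every preceding character must be an upper-case letter.

// Accepted:  "ABs", "ERs"
// Rejected:  "ABS" (no trailing 's'), "Abs" ('b' not upper), "A1s" (digit)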
831 |
+
|
832 |
+
|
833 |
+
long FirstLetGenS::strategy(const char *sf_, const char *str_)
|
834 |
+
{
|
835 |
+
long lna,lnt,flag;
|
836 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
837 |
+
|
838 |
+
if(!set_condition(sf_)) return 0;
|
839 |
+
|
840 |
+
str_tolower(sf_,sf);
|
841 |
+
str_tolower(str_,text);
|
842 |
+
|
843 |
+
token(text,tok);
|
844 |
+
lna = strlen(sf);
|
845 |
+
lnt = strlen(tok[ntk-1]);
|
846 |
+
|
847 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
848 |
+
if(!flag) return 0;
|
849 |
+
|
850 |
+
do {
|
851 |
+
if(!skipword_ok(lna,0)) continue;
|
852 |
+
if(!is_FirstLetSMatch(sf,genFL)) continue;
|
853 |
+
|
854 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
855 |
+
return 1;
|
856 |
+
} while(search_backward_adv(sf,genFL));
|
857 |
+
|
858 |
+
return 0;
|
859 |
+
}
|
860 |
+
|
861 |
+
|
862 |
+
long FirstLetGenStp::strategy(const char *sf_, const char *str_)
|
863 |
+
{
|
864 |
+
long lna,lnt,flag;
|
865 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
866 |
+
|
867 |
+
str_tolower(sf_,sf);
|
868 |
+
str_tolower(str_,text);
|
869 |
+
|
870 |
+
token(text,tok);
|
871 |
+
lna = strlen(sf);
|
872 |
+
lnt = strlen(tok[ntk-1]);
|
873 |
+
|
874 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
875 |
+
if(!flag) return 0;
|
876 |
+
|
877 |
+
do {
|
878 |
+
if(!exist_skipword(lna)) continue;
|
879 |
+
if(!stopword_ok(lna,1)) continue;
|
880 |
+
if(!is_FirstLetMatch(lna,genFL)) continue;
|
881 |
+
|
882 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
883 |
+
return 1;
|
884 |
+
} while(search_backward_adv(sf,genFL));
|
885 |
+
|
886 |
+
return 0;
|
887 |
+
}
|
888 |
+
|
889 |
+
|
890 |
+
long FirstLetGenStp2::strategy(const char *sf_, const char *str_)
|
891 |
+
{
|
892 |
+
long lna,lnt,flag;
|
893 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
894 |
+
|
895 |
+
str_tolower(sf_,sf);
|
896 |
+
str_tolower(str_,text);
|
897 |
+
|
898 |
+
token(text,tok);
|
899 |
+
lna = strlen(sf);
|
900 |
+
lnt = strlen(tok[ntk-1]);
|
901 |
+
|
902 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
903 |
+
if(!flag) return 0;
|
904 |
+
|
905 |
+
do {
|
906 |
+
if(!exist_n_stopwords(lna,2)) continue;
|
907 |
+
if(!is_FirstLetMatch(lna,genFL)) continue;
|
908 |
+
|
909 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
910 |
+
return 1;
|
911 |
+
} while(search_backward_adv(sf,genFL));
|
912 |
+
|
913 |
+
return 0;
|
914 |
+
}
|
915 |
+
|
916 |
+
|
917 |
+
long FirstLetGenSkp::strategy(const char *sf_, const char *str_)
|
918 |
+
{
|
919 |
+
long lna,lnt,flag;
|
920 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
921 |
+
|
922 |
+
str_tolower(sf_,sf);
|
923 |
+
str_tolower(str_,text);
|
924 |
+
|
925 |
+
token(text,tok);
|
926 |
+
lna = strlen(sf);
|
927 |
+
lnt = strlen(tok[ntk-1]);
|
928 |
+
|
929 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
930 |
+
if(!flag) return 0;
|
931 |
+
|
932 |
+
do {
|
933 |
+
if(!exist_skipword(lna)) continue;
|
934 |
+
if(!skipword_ok(lna,1)) continue;
|
935 |
+
if(!is_FirstLetMatch(lna,genFL)) continue;
|
936 |
+
|
937 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
938 |
+
return 1;
|
939 |
+
} while(search_backward_adv(sf,genFL));
|
940 |
+
|
941 |
+
return 0;
|
942 |
+
}
|
943 |
+
|
944 |
+
|
945 |
+
long WithinWrdWrd::strategy(const char *sf_, const char *str_)
|
946 |
+
{
|
947 |
+
long lna,lnt,flag;
|
948 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
949 |
+
|
950 |
+
str_tolower(sf_,sf);
|
951 |
+
str_tolower(str_,text);
|
952 |
+
|
953 |
+
token(text,tok);
|
954 |
+
lna = strlen(sf);
|
955 |
+
lnt = strlen(tok[ntk-1]);
|
956 |
+
|
957 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
958 |
+
if(!flag) return 0;
|
959 |
+
|
960 |
+
do {
|
961 |
+
if(!skipword_ok(lna,0)) continue;
|
962 |
+
if(!is_subword(lna)) continue;
|
963 |
+
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
964 |
+
|
965 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
966 |
+
return 1;
|
967 |
+
} while(search_backward_adv(sf,genFL));
|
968 |
+
|
969 |
+
return 0;
|
970 |
+
}
|
971 |
+
|
972 |
+
|
973 |
+
long WithinWrdFWrd::strategy(const char *sf_, const char *str_)
|
974 |
+
{
|
975 |
+
long lna,lnt,flag;
|
976 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
977 |
+
|
978 |
+
str_tolower(sf_,sf);
|
979 |
+
str_tolower(str_,text);
|
980 |
+
|
981 |
+
token(text,tok);
|
982 |
+
lna = strlen(sf);
|
983 |
+
lnt = strlen(tok[ntk-1]);
|
984 |
+
|
985 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
986 |
+
if(!flag) return 0;
|
987 |
+
|
988 |
+
do {
|
989 |
+
if(!skipword_ok(lna,0)) continue;
|
990 |
+
if(!is_subword(lna)) continue;
|
991 |
+
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
992 |
+
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
993 |
+
|
994 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
995 |
+
return 1;
|
996 |
+
} while(search_backward_adv(sf,genFL));
|
997 |
+
|
998 |
+
return 0;
|
999 |
+
}
|
1000 |
+
|
1001 |
+
|
1002 |
+
long WithinWrdFWrdSkp::strategy(const char *sf_, const char *str_)
|
1003 |
+
{
|
1004 |
+
long lna,lnt,flag;
|
1005 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1006 |
+
|
1007 |
+
str_tolower(sf_,sf);
|
1008 |
+
str_tolower(str_,text);
|
1009 |
+
|
1010 |
+
token(text,tok);
|
1011 |
+
lna = strlen(sf);
|
1012 |
+
lnt = strlen(tok[ntk-1]);
|
1013 |
+
|
1014 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1015 |
+
if(!flag) return 0;
|
1016 |
+
|
1017 |
+
do {
|
1018 |
+
if(!exist_skipword(lna)) continue;
|
1019 |
+
if(!skipword_ok(lna,1)) continue;
|
1020 |
+
if(!is_subword(lna)) continue;
|
1021 |
+
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1022 |
+
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1023 |
+
|
1024 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1025 |
+
return 1;
|
1026 |
+
} while(search_backward_adv(sf,genFL));
|
1027 |
+
|
1028 |
+
return 0;
|
1029 |
+
}
|
1030 |
+
|
1031 |
+
|
1032 |
+
long WithinWrdLet::strategy(const char *sf_, const char *str_)
|
1033 |
+
{
|
1034 |
+
long lna,lnt,flag;
|
1035 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1036 |
+
|
1037 |
+
str_tolower(sf_,sf);
|
1038 |
+
str_tolower(str_,text);
|
1039 |
+
|
1040 |
+
token(text,tok);
|
1041 |
+
lna = strlen(sf);
|
1042 |
+
lnt = strlen(tok[ntk-1]);
|
1043 |
+
|
1044 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1045 |
+
if(!flag) return 0;
|
1046 |
+
|
1047 |
+
do {
|
1048 |
+
if(!skipword_ok(lna,0)) continue;
|
1049 |
+
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1050 |
+
|
1051 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1052 |
+
return 1;
|
1053 |
+
} while(search_backward_adv(sf,genFL));
|
1054 |
+
|
1055 |
+
return 0;
|
1056 |
+
}
|
1057 |
+
|
1058 |
+
|
1059 |
+
long WithinWrdFLet::strategy(const char *sf_, const char *str_)
|
1060 |
+
{
|
1061 |
+
long lna,lnt,flag;
|
1062 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1063 |
+
|
1064 |
+
str_tolower(sf_,sf);
|
1065 |
+
str_tolower(str_,text);
|
1066 |
+
|
1067 |
+
token(text,tok);
|
1068 |
+
lna = strlen(sf);
|
1069 |
+
lnt = strlen(tok[ntk-1]);
|
1070 |
+
|
1071 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1072 |
+
if(!flag) return 0;
|
1073 |
+
|
1074 |
+
do {
|
1075 |
+
if(!skipword_ok(lna,0)) continue;
|
1076 |
+
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1077 |
+
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1078 |
+
|
1079 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1080 |
+
return 1;
|
1081 |
+
} while(search_backward_adv(sf,genFL));
|
1082 |
+
|
1083 |
+
return 0;
|
1084 |
+
}
|
1085 |
+
|
1086 |
+
|
1087 |
+
long WithinWrdFLetSkp::strategy(const char *sf_, const char *str_)
|
1088 |
+
{
|
1089 |
+
long lna,lnt,flag;
|
1090 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1091 |
+
|
1092 |
+
str_tolower(sf_,sf);
|
1093 |
+
str_tolower(str_,text);
|
1094 |
+
|
1095 |
+
token(text,tok);
|
1096 |
+
lna = strlen(sf);
|
1097 |
+
lnt = strlen(tok[ntk-1]);
|
1098 |
+
|
1099 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1100 |
+
if(!flag) return 0;
|
1101 |
+
|
1102 |
+
do {
|
1103 |
+
if(!exist_skipword(lna)) continue;
|
1104 |
+
if(!skipword_ok(lna,1)) continue;
|
1105 |
+
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1106 |
+
if(!is_WithinWrdMatch(lna,genFL)) continue;
|
1107 |
+
|
1108 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1109 |
+
return 1;
|
1110 |
+
} while(search_backward_adv(sf,genFL));
|
1111 |
+
|
1112 |
+
return 0;
|
1113 |
+
}
|
1114 |
+
|
1115 |
+
|
1116 |
+
long ContLet::strategy(const char *sf_, const char *str_)
|
1117 |
+
{
|
1118 |
+
long lna,lnt,flag;
|
1119 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1120 |
+
|
1121 |
+
str_tolower(sf_,sf);
|
1122 |
+
str_tolower(str_,text);
|
1123 |
+
|
1124 |
+
token(text,tok);
|
1125 |
+
lna = strlen(sf);
|
1126 |
+
lnt = strlen(tok[ntk-1]);
|
1127 |
+
|
1128 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1129 |
+
if(!flag) return 0;
|
1130 |
+
|
1131 |
+
do {
|
1132 |
+
if(!skipword_ok(lna,0)) continue;
|
1133 |
+
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1134 |
+
if(!is_ContLetMatch(lna)) continue;
|
1135 |
+
|
1136 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1137 |
+
return 1;
|
1138 |
+
} while(search_backward_adv(sf,genFL));
|
1139 |
+
|
1140 |
+
return 0;
|
1141 |
+
}
|
1142 |
+
|
1143 |
+
|
1144 |
+
long ContLetSkp::strategy(const char *sf_, const char *str_)
|
1145 |
+
{
|
1146 |
+
long lna,lnt,flag;
|
1147 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1148 |
+
|
1149 |
+
str_tolower(sf_,sf);
|
1150 |
+
str_tolower(str_,text);
|
1151 |
+
|
1152 |
+
token(text,tok);
|
1153 |
+
lna = strlen(sf);
|
1154 |
+
lnt = strlen(tok[ntk-1]);
|
1155 |
+
|
1156 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1157 |
+
if(!flag) return 0;
|
1158 |
+
|
1159 |
+
do {
|
1160 |
+
if(!exist_skipword(lna)) continue;
|
1161 |
+
if(!skipword_ok(lna,1)) continue;
|
1162 |
+
if(!is_BeginWrdMatch(lna,genFL)) continue;
|
1163 |
+
if(!is_ContLetMatch(lna)) continue;
|
1164 |
+
|
1165 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1166 |
+
return 1;
|
1167 |
+
} while(search_backward_adv(sf,genFL));
|
1168 |
+
|
1169 |
+
return 0;
|
1170 |
+
}
|
1171 |
+
|
1172 |
+
|
1173 |
+
long AnyLet::strategy(const char *sf_, const char *str_)
|
1174 |
+
{
|
1175 |
+
long lna,lnt,flag;
|
1176 |
+
bool genFL=true; //1:allow 1-alpha for 1ch of SF match, 0:don't
|
1177 |
+
|
1178 |
+
str_tolower(sf_,sf);
|
1179 |
+
str_tolower(str_,text);
|
1180 |
+
|
1181 |
+
token(text,tok);
|
1182 |
+
lna = strlen(sf);
|
1183 |
+
lnt = strlen(tok[ntk-1]);
|
1184 |
+
|
1185 |
+
flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
|
1186 |
+
if(!flag) return 0;
|
1187 |
+
|
1188 |
+
do {
|
1189 |
+
if(!skipword_ok(lna,1)) continue;
|
1190 |
+
|
1191 |
+
extract_lf(mod[0][0],ntk-1,str_);
|
1192 |
+
return 1;
|
1193 |
+
} while(search_backward_adv(sf,genFL));
|
1194 |
+
|
1195 |
+
return 0;
|
1196 |
+
}
|
1197 |
+
|
1198 |
+
|
1199 |
+
|
1200 |
+
//-----
|
1201 |
+
AbbrStra * StratUtil::strat_factory(string name)
|
1202 |
+
{
|
1203 |
+
if(name=="FirstLetOneChSF") return new FirstLetOneChSF;
|
1204 |
+
else if(name=="FirstLet") return new FirstLet;
|
1205 |
+
else if(name=="FirstLetGen") return new FirstLetGen;
|
1206 |
+
else if(name=="FirstLetGen2") return new FirstLetGen2;
|
1207 |
+
else if(name=="FirstLetGenS") return new FirstLetGenS;
|
1208 |
+
else if(name=="FirstLetGenStp") return new FirstLetGenStp;
|
1209 |
+
else if(name=="FirstLetGenStp2") return new FirstLetGenStp2;
|
1210 |
+
else if(name=="FirstLetGenSkp") return new FirstLetGenSkp;
|
1211 |
+
else if(name=="WithinWrdWrd") return new WithinWrdWrd;
|
1212 |
+
else if(name=="WithinWrdFWrd") return new WithinWrdFWrd;
|
1213 |
+
else if(name=="WithinWrdFWrdSkp") return new WithinWrdFWrdSkp;
|
1214 |
+
else if(name=="WithinWrdLet") return new WithinWrdLet;
|
1215 |
+
else if(name=="WithinWrdFLet") return new WithinWrdFLet;
|
1216 |
+
else if(name=="WithinWrdFLetSkp") return new WithinWrdFLetSkp;
|
1217 |
+
else if(name=="ContLet") return new ContLet;
|
1218 |
+
else if(name=="ContLetSkp") return new ContLetSkp;
|
1219 |
+
else if(name=="AnyLet") return new AnyLet;
|
1220 |
+
else { cout << "Fail strat_factory\n"; exit(1); }
|
1221 |
+
}
|
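A minimal usage sketch for the factory above; the helper name is hypothetical, and the assumption that an AbbrStra object is usable right after construction is not verified against the repository:

#include <string>
#include "AbbrStra.h"

// Hypothetical helper: run one named strategy on a (short form, text) pair.
long try_strategy(const std::string &name, const char *sf, const char *text)
{
    StratUtil util;
    AbbrStra *strat = util.strat_factory(name); // exits on an unknown name
    long found = strat->strategy(sf, text);     // 1 on success, 0 otherwise
    delete strat;                               // the factory returns heap objects
    return found;
}

Note that ~AbbrStra() is not declared virtual in the header later in this diff, so deleting a derived strategy through the base pointer relies on the derived classes adding no state of their own.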
1222 |
+
|
1223 |
+
|
1224 |
+
//check if sf is ok and assign a group
|
1225 |
+
//if sf length > 5, use 5!!
|
1226 |
+
//grp will be Al+#ChInSF, Num+#ChInSF, or Spec+#ChInSF
|
1227 |
+
bool StratUtil::group_sf(const char *sf, string &grp)
|
1228 |
+
{
|
1229 |
+
long i, j, len=strlen(sf);
|
1230 |
+
long al=0, num=0, nonalnum=0;
|
1231 |
+
long paren=0, sbrac=0;
|
1232 |
+
|
1233 |
+
grp = ""; // if failure, no group
|
1234 |
+
|
1235 |
+
if(!isalnum(sf[0])) return false; //1st ch must be alnum
|
1236 |
+
for(i=0; i<len; i++) {
|
1237 |
+
if(isalpha(sf[i])) al++;
|
1238 |
+
else if(isdigit(sf[i])) num++;
|
1239 |
+
else nonalnum++;
|
1240 |
+
}
|
1241 |
+
if(al<1) return false; //at least one alphabet
|
1242 |
+
|
1243 |
+
//false for one parenthesis or square bracket
|
1244 |
+
for(i=len-1; i>=0; i--) {
|
1245 |
+
if(sf[i]=='(') paren++;
|
1246 |
+
if(sf[i]==')') paren--;
|
1247 |
+
if(sf[i]=='[') sbrac++;
|
1248 |
+
if(sf[i]==']') sbrac--;
|
1249 |
+
}
|
1250 |
+
if(paren!=0 || sbrac!=0) return false;
|
1251 |
+
|
1252 |
+
if(al==len) grp.assign("Al");
|
1253 |
+
else if(num>0) grp.assign("Num");
|
1254 |
+
else if(nonalnum>0) grp.assign("Spec");
|
1255 |
+
else { cout << "No sf group\n"; exit(1); }
|
1256 |
+
|
1257 |
+
//append sf length
|
1258 |
+
len = len>5 ? 5 : len;
|
1259 |
+
|
1260 |
+
switch(len) {
|
1261 |
+
case 1:
|
1262 |
+
grp.append("1");
|
1263 |
+
break;
|
1264 |
+
case 2:
|
1265 |
+
grp.append("2");
|
1266 |
+
break;
|
1267 |
+
case 3:
|
1268 |
+
grp.append("3");
|
1269 |
+
break;
|
1270 |
+
case 4:
|
1271 |
+
grp.append("4");
|
1272 |
+
break;
|
1273 |
+
case 5:
|
1274 |
+
grp.append("5");
|
1275 |
+
break;
|
1276 |
+
default:
|
1277 |
+
cout << "Not defined #-ch SF" << endl;
|
1278 |
+
exit(1);
|
1279 |
+
}
|
1280 |
+
|
1281 |
+
return true;
|
1282 |
+
}
|
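A hedged illustration of the grouping rule above; the expected outputs are inferred from the code, not taken from project documentation:

#include <iostream>
#include <string>
#include "AbbrStra.h"

int main()
{
    StratUtil util;
    std::string grp;
    // Inferred: "ABC" -> Al3 (all letters), "IL2" -> Num3 (has a digit),
    // "a-b" -> Spec3 (has a non-alnum), "(AB" -> rejected (1st ch not alnum)
    for (const char *sf : {"ABC", "IL2", "a-b", "(AB"}) {
        if (util.group_sf(sf, grp)) std::cout << sf << " -> " << grp << '\n';
        else std::cout << sf << " -> rejected\n";
    }
    return 0;
}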
1283 |
+
|
1284 |
+
//add the condition |lf|>|sf|
|
1285 |
+
bool StratUtil::group_sf(const char *sf, const char *lf, string &grp)
|
1286 |
+
{
|
1287 |
+
long i, j, len=strlen(sf);
|
1288 |
+
long al=0, num=0, nonalnum=0;
|
1289 |
+
long paren=0, sbrac=0;
|
1290 |
+
|
1291 |
+
if(strlen(lf)<len) return false; //|lf|>|sf|
|
1292 |
+
if(!isalnum(sf[0])) return false; //1st ch must be alnum
|
1293 |
+
for(i=0; i<len; i++) {
|
1294 |
+
if(isalpha(sf[i])) al++;
|
1295 |
+
else if(isdigit(sf[i])) num++;
|
1296 |
+
else nonalnum++;
|
1297 |
+
}
|
1298 |
+
if(al<1) return false; //at least one alphabet
|
1299 |
+
if(al>10) return false; //|alpha sf| is at most 10
|
1300 |
+
if(num_token(sf)>2) return false; //added Feb-21-08
|
1301 |
+
|
1302 |
+
//false for one parenthesis or square bracket
|
1303 |
+
for(i=len-1; i>=0; i--) {
|
1304 |
+
if(sf[i]=='(') paren++;
|
1305 |
+
if(sf[i]==')') paren--;
|
1306 |
+
if(sf[i]=='[') sbrac++;
|
1307 |
+
if(sf[i]==']') sbrac--;
|
1308 |
+
}
|
1309 |
+
if(paren!=0 || sbrac!=0) return false;
|
1310 |
+
|
1311 |
+
if(al==len) grp.assign("Al");
|
1312 |
+
else if(num>0) grp.assign("Num");
|
1313 |
+
else if(nonalnum>0) grp.assign("Spec");
|
1314 |
+
else { cout << "No sf group\n"; exit(1); }
|
1315 |
+
|
1316 |
+
//append sf length
|
1317 |
+
len = len>5 ? 5 : len;
|
1318 |
+
|
1319 |
+
switch(len) {
|
1320 |
+
case 1:
|
1321 |
+
grp.append("1");
|
1322 |
+
break;
|
1323 |
+
case 2:
|
1324 |
+
grp.append("2");
|
1325 |
+
break;
|
1326 |
+
case 3:
|
1327 |
+
grp.append("3");
|
1328 |
+
break;
|
1329 |
+
case 4:
|
1330 |
+
grp.append("4");
|
1331 |
+
break;
|
1332 |
+
case 5:
|
1333 |
+
grp.append("5");
|
1334 |
+
break;
|
1335 |
+
default:
|
1336 |
+
cout << "Not defined #-ch SF" << endl;
|
1337 |
+
exit(1);
|
1338 |
+
}
|
1339 |
+
|
1340 |
+
return true;
|
1341 |
+
}
|
1342 |
+
|
1343 |
+
|
1344 |
+
//remove non-alnum in str1 and save it to str2
|
1345 |
+
void StratUtil::remove_nonAlnum(const char *str1, char *str2)
|
1346 |
+
{
|
1347 |
+
long i=0, j=0;
|
1348 |
+
|
1349 |
+
while(str1[i]) {
|
1350 |
+
if(isalnum(str1[i])) {
|
1351 |
+
str2[j] = str1[i];
|
1352 |
+
j++;
|
1353 |
+
}
|
1354 |
+
i++;
|
1355 |
+
}
|
1356 |
+
str2[j] = '\0';
|
1357 |
+
}
|
1358 |
+
|
1359 |
+
|
1360 |
+
vector<string> StratUtil::get_strats(string s)
|
1361 |
+
{
|
1362 |
+
if(s=="Al1") return Al1;
|
1363 |
+
else if(s=="Al2") return Al2;
|
1364 |
+
else if(s=="Al3") return Al3;
|
1365 |
+
else if(s=="Al4") return Al4;
|
1366 |
+
else if(s=="Al5") return Al5;
|
1367 |
+
else if(s=="Num2") return Num2;
|
1368 |
+
else if(s=="Num3") return Num3;
|
1369 |
+
else if(s=="Num4") return Num4;
|
1370 |
+
else if(s=="Num5") return Num5;
|
1371 |
+
else if(s=="Spec2") return Spec2;
|
1372 |
+
else if(s=="Spec3") return Spec3;
|
1373 |
+
else if(s=="Spec4") return Spec4;
|
1374 |
+
else if(s=="Spec5") return Spec5;
|
1375 |
+
else { cout << "Incorrect name\n"; exit(1); }
|
1376 |
+
}
|
1377 |
+
|
1378 |
+
|
1379 |
+
void StratUtil::push_back_strat(string sgp, string strat)
|
1380 |
+
{
|
1381 |
+
if(sgp=="Al1") Al1.push_back(strat);
|
1382 |
+
else if(sgp=="Al2") Al2.push_back(strat);
|
1383 |
+
else if(sgp=="Al3") Al3.push_back(strat);
|
1384 |
+
else if(sgp=="Al4") Al4.push_back(strat);
|
1385 |
+
else if(sgp=="Al5") Al5.push_back(strat);
|
1386 |
+
else if(sgp=="Num2") Num2.push_back(strat);
|
1387 |
+
else if(sgp=="Num3") Num3.push_back(strat);
|
1388 |
+
else if(sgp=="Num4") Num4.push_back(strat);
|
1389 |
+
else if(sgp=="Num5") Num5.push_back(strat);
|
1390 |
+
else if(sgp=="Spec2") Spec2.push_back(strat);
|
1391 |
+
else if(sgp=="Spec3") Spec3.push_back(strat);
|
1392 |
+
else if(sgp=="Spec4") Spec4.push_back(strat);
|
1393 |
+
else if(sgp=="Spec5") Spec5.push_back(strat);
|
1394 |
+
}
|
1395 |
+
|
1396 |
+
|
1397 |
+
long StratUtil::exist_upperal(const char *str)
|
1398 |
+
{
|
1399 |
+
long i, len=strlen(str);
|
1400 |
+
|
1401 |
+
for(i=0; i<len; i++)
|
1402 |
+
if(isupper(str[i]))
|
1403 |
+
return 1;
|
1404 |
+
return 0;
|
1405 |
+
}
|
1406 |
+
|
1407 |
+
long StratUtil::num_token(const char *str)
|
1408 |
+
{
|
1409 |
+
long i,j=0,k=0;
|
1410 |
+
long n=strlen(str)-1;
|
1411 |
+
|
1412 |
+
while(isblank(str[n])) n--;
|
1413 |
+
|
1414 |
+
while(str[j]){
|
1415 |
+
while(isblank(str[j]))j++;
|
1416 |
+
i=j;
|
1417 |
+
while((str[j])&&(!isblank(str[j])))j++;
|
1418 |
+
if(str[j]){
|
1419 |
+
k++;
|
1420 |
+
j++;
|
1421 |
+
}
|
1422 |
+
}
|
1423 |
+
if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
|
1424 |
+
return k+1; //# tokens
|
1425 |
+
}
|
1426 |
+
//-----
|
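A short sketch exercising the two utility helpers above; the inputs and buffer size are arbitrary illustrations:

#include <iostream>
#include "AbbrStra.h"

int main()
{
    StratUtil util;
    char out[64];
    util.remove_nonAlnum("G-protein (beta)", out);
    std::cout << out << '\n';                      // prints "Gproteinbeta"
    std::cout << util.num_token("ab cd ") << '\n'; // prints 2: trailing blank dropped
    return 0;
}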
Library/AbbrStra.h
CHANGED
@@ -1,332 +1,332 @@
|
|
1 |
-
#ifndef ABBRSTRA_H
|
2 |
-
#define ABBRSTRA_H
|
3 |
-
|
4 |
-
#include <vector>
|
5 |
-
#include <string>
|
6 |
-
#include <Hash.h>
|
7 |
-
|
8 |
-
using namespace std;
|
9 |
-
using namespace iret;
|
10 |
-
|
11 |
-
|
12 |
-
class WordData {
|
13 |
-
public:
|
14 |
-
WordData(const char *wrdname="wrdset3", const char *stpname="stop",
|
15 |
-
const char *lfsname="Lf1chSf");
|
16 |
-
|
17 |
-
~WordData();
|
18 |
-
|
19 |
-
Chash wrdset; //single words in MEDLINE
|
20 |
-
Hash stp; //stopword
|
21 |
-
Hash lfs; //lfs (1-ch sf) for FirstLet match cases >=2
|
22 |
-
};
|
23 |
-
|
24 |
-
|
25 |
-
class AbbrStra {
|
26 |
-
public:
|
27 |
-
AbbrStra();
|
28 |
-
~AbbrStra();
|
29 |
-
void token(const char *str, char lst[1000][1000]); // tokenize & set ntk
|
30 |
-
long tokenize(const char *str, char lst[1000][1000]); //tokenize & return # tokens
|
31 |
-
long num_token(const char *str); //return # tokens
|
32 |
-
long first_ch(const char *str, char *fch, long num);
|
33 |
-
long is_upperal(const char *str);
|
34 |
-
long is_alpha(const char *str);
|
35 |
-
void str_tolower(const char *str1, char *str2);
|
36 |
-
long get_str(const char *str1, char *str2, long num);
|
37 |
-
bool isupper_str(const char *str);
|
38 |
-
bool is_onealpha(const char *str);
|
39 |
-
long count_upperstr(const char *str);
|
40 |
-
//return # upper-case 1st letter of consecutive tokens (backward)
|
41 |
-
void get_alpha(const char *str1, char *str2);
|
42 |
-
//set str2 with only alphabet of str1
|
43 |
-
bool lf_ok(const char *shrtf, const char *longf);
|
44 |
-
|
45 |
-
virtual bool set_condition(const char *sf);
|
46 |
-
//must set nonAlphaSF=true if want to use SF containing non-alphabet
|
47 |
-
virtual long strategy(const char *sf, const char *str) = 0;
|
48 |
-
//sf & str will be lower-cased (Oct-25-2007)
|
49 |
-
long search_backward(long sloc, long tnum, long tloc, const char *sf, bool first);
|
50 |
-
//search backward to find match starting from sf[sloc]
|
51 |
-
//Returns 1 if matches. sf[0] must match with begin word
|
52 |
-
long search_backward_adv(const char *sf, bool first);
|
53 |
-
//Searches for next model setting. Returns 1 if finds one.
|
54 |
-
void extract_lf(long begin, long end);
|
55 |
-
//save strings from begin to end of tok to lf
|
56 |
-
void extract_lf(long begin, long end, const char *str);
|
57 |
-
//save strings from begin to end of str's tok to lf
|
58 |
-
|
59 |
-
//---after set mod check conditions
|
60 |
-
//nsf:# ch in sf, nsw:# allowed skipword, general:true allow 1st ch match after non-alnum
|
61 |
-
bool exist_skipword(long nsf);
|
62 |
-
//true if at least one skip word exists
|
63 |
-
bool exist_n_skipwords(long nsf, long n);
|
64 |
-
//true if a run of n consecutive skip words exists between tokens, and no run exceeds n
|
65 |
-
bool exist_n_stopwords(long nsf, long n);
|
66 |
-
//true if a run of n consecutive skip stopwords exists between tokens, and no run exceeds n
|
67 |
-
bool stopword_ok(long nsf, long nsw);
|
68 |
-
//true if at most (can be 0) nsw skip stopword in row exists
|
69 |
-
bool skip_stop_ok(long nsf, long nsw, long n);
|
70 |
-
//true if at most (can be 0) nsw skip word, which include at least n stopwords, in row exists
|
71 |
-
bool skip_stop_ok2(long nsf, long nsw, long n);
|
72 |
-
//true if nsw skip word, which include at least n stopwords, in row exists
|
73 |
-
bool skipword_ok(long nsf, long nsw);
|
74 |
-
//true if at most (can be 0) nsw skip word in row exists
|
75 |
-
bool is_subword(long nsf);
|
76 |
-
//true if matching string is begin of a tok or a word in wrdlist
|
77 |
-
bool is_BeginWrdMatch(long nsf, bool general);
|
78 |
-
//true if beginning ch of a word matches
|
79 |
-
//if general is true, allow match after non-alnum (eg, 1-alpha)
|
80 |
-
bool is_WithinWrdMatch(long nsf, bool general);
|
81 |
-
//true if within word match
|
82 |
-
//if general is true, 1-Alpha: 'A' is not within word match
|
83 |
-
bool is_FirstLetMatch(long nsf, bool general);
|
84 |
-
//true if each ch of sf matches the 1st ch of a word
|
85 |
-
//(true: Alpha anyword Beta (AB))
|
86 |
-
//if general=true, true: 1-Alpha Beta, Alpha-Beta
|
87 |
-
bool is_FirstLetMatch2(long nsf, bool general);
|
88 |
-
//at least one 1-Alpha
|
89 |
-
bool is_FirstLetSMatch(const char *sf, bool general);
|
90 |
-
//true if first letter match & 's' match with last ch of lf
|
91 |
-
bool is_ContLetMatch(long nsf);
|
92 |
-
//true if two or more consecutive letters match
|
93 |
-
//---
|
94 |
-
|
95 |
-
char *pch; //sf applied to a strategy
|
96 |
-
char *ps, *pl; //sf, potential lf
|
97 |
-
char sf[100], text[10000]; //sf & potential lf used in a strategy
|
98 |
-
char lf[10000]; //lf found by a strategy
|
99 |
-
char tok[1000][1000]; //token of potential lf
|
100 |
-
//lower after strategy, original after extract_lf(b,e,str)
|
101 |
-
long ntk; //# tokens
|
102 |
-
long mod[100][2]; //match locations of tok with a given sf
|
103 |
-
//mod[sf_inx][0]=tok inx, mod[sf_inx][1]=match loc in tok[mod[sf_inx][0]]
|
104 |
-
|
105 |
-
//for each n_ch-SF
|
106 |
-
long npairs; //selected pairs for this strategy
|
107 |
-
long tpairs; //total pairs
|
108 |
-
long nsfs; //# selected unique sfs for this strategy
|
109 |
-
long nmatchs; //# matches (successful strategy & given sf == real sf)
|
110 |
-
long amatchs; //# accumulated matches up to this strategy
|
111 |
-
long setCondition; //SF condition
|
112 |
-
long greaterEqNsf; //if 1 select SF |SF|>=nsf
|
113 |
-
|
114 |
-
WordData *wData;
|
115 |
-
};
|
116 |
-
|
117 |
-
|
118 |
-
/*
|
119 |
-
alpha beta gamma (ABG)
|
120 |
-
*/
|
121 |
-
class FirstLet : public AbbrStra {
|
122 |
-
public:
|
123 |
-
virtual bool set_condition(const char *str1, const char *str2, char *str);
|
124 |
-
virtual long strategy(const char *sf, const char *str);
|
125 |
-
};
|
126 |
-
|
127 |
-
|
128 |
-
class FirstLetOneChSF : public AbbrStra {
|
129 |
-
public:
|
130 |
-
virtual bool set_condition(const char *str1, const char *str2, char *str);
|
131 |
-
virtual long strategy(const char *sf, const char *str);
|
132 |
-
};
|
133 |
-
|
134 |
-
|
135 |
-
/*
|
136 |
-
- sf ch matches with 1st ch or ch right after non-alphanum of lf
|
137 |
-
but at least one match right after non-alphanum
|
138 |
-
(eg, success: 1-alpha 2-beta (AB), alpha-beta(AB),
|
139 |
-
fail: alpha beta(AB))
|
140 |
-
*/
|
141 |
-
class FirstLetGen : public AbbrStra {
|
142 |
-
public:
|
143 |
-
virtual long strategy(const char *sf, const char *str);
|
144 |
-
};
|
145 |
-
|
146 |
-
|
147 |
-
/*
|
148 |
-
- sf ch matches with 1st ch or ch right after non-alphanum of lf
|
149 |
-
(eg, success: 1-alpha 2-beta (AB), alpha-beta(AB),
|
150 |
-
alpha beta(AB))
|
151 |
-
*/
|
152 |
-
class FirstLetGen2 : public AbbrStra {
|
153 |
-
public:
|
154 |
-
virtual long strategy(const char *sf, const char *str);
|
155 |
-
};
|
156 |
-
|
157 |
-
|
158 |
-
/*
|
159 |
-
For sf consisting of capital letters & lower-case 's'
|
160 |
-
- First letter & 's' in the last token of lf
|
161 |
-
(success: Alpha Betas (ABs), 1-Alpha Betas (ABs),
|
162 |
-
1-Alpha-Betas (ABs), Alpha BetaS (ABs)
|
163 |
-
fail: Alpha Beta xxs (ABs) )
|
164 |
-
*/
|
165 |
-
class FirstLetGenS : public AbbrStra {
|
166 |
-
public:
|
167 |
-
virtual bool set_condition(const char *sf); //sf must be an original sf
|
168 |
-
//true if sf is like ABCs
|
169 |
-
virtual long strategy(const char *sf, const char *str);
|
170 |
-
};
|
171 |
-
|
172 |
-
|
173 |
-
/*
|
174 |
-
- sf ch matches with 1st ch or ch right after non-alphanum of lf
|
175 |
-
- allowing one skip stopword between tokens (no more than one in row)
|
176 |
-
at least one skip stopword in total
|
177 |
-
(eg, success: alpha and beta (AB), 1-alpha and beta (AB)
|
178 |
-
fail: alpha beta (AB), alpha word beta (AB))
|
179 |
-
*/
|
180 |
-
class FirstLetGenStp : public AbbrStra {
|
181 |
-
public:
|
182 |
-
virtual long strategy(const char *sf, const char *str);
|
183 |
-
};
|
184 |
-
|
185 |
-
|
186 |
-
/*
|
187 |
-
- same as FirstLetGenStp except for 2 skip stopwords
|
188 |
-
& at least one two consecutive skip stopwords
|
189 |
-
*/
|
190 |
-
class FirstLetGenStp2 : public AbbrStra {
|
191 |
-
public:
|
192 |
-
virtual long strategy(const char *sf, const char *str);
|
193 |
-
};
|
194 |
-
|
195 |
-
|
196 |
-
/*
|
197 |
-
- same as FirstLetGenStp except using skip any word instead of stopword
|
198 |
-
*/
|
199 |
-
class FirstLetGenSkp : public AbbrStra {
|
200 |
-
public:
|
201 |
-
virtual long strategy(const char *sf, const char *str);
|
202 |
-
};
|
203 |
-
|
204 |
-
|
205 |
-
/*
|
206 |
-
- a matching sub-string must be word
|
207 |
-
(eg, success: AlphaBeta (AB), Beta is word
|
208 |
-
x-AlphaBeta (AB) )
|
209 |
-
- at least one within word match
|
210 |
-
(eg,fail: Alpha Beta Word (ABW), Alpha x-Beta x-Word (ABW)
|
211 |
-
success: AlphaBeta Word (ABW), x-AlphaBeta inWord (ABW))
|
212 |
-
*/
|
213 |
-
class WithinWrdWrd : public AbbrStra {
|
214 |
-
public:
|
215 |
-
virtual long strategy(const char *sf, const char *str);
|
216 |
-
};
|
217 |
-
|
218 |
-
|
219 |
-
/*
|
220 |
-
- WithinWrdWrd w/ Begin Word Match
|
221 |
-
(success: AlphaBeta x-Word (ABW)
|
222 |
-
fail: AlphaBeta inWord (ABW) )
|
223 |
-
*/
|
224 |
-
class WithinWrdFWrd : public AbbrStra {
|
225 |
-
public:
|
226 |
-
virtual long strategy(const char *sf, const char *str);
|
227 |
-
};
|
228 |
-
|
229 |
-
|
230 |
-
/*
|
231 |
-
- WithinWrdFWrd w/ allowing one skip word between tokens (no more than one in row)
|
232 |
-
at least one skip word in total
|
233 |
-
(success: AlphaBeta zzz x-Word zzz (ABW)
|
234 |
-
fail: AlphaBeta x-Word (ABW), AlphaBeta zzz yyy x-Word (ABW))
|
235 |
-
*/
|
236 |
-
class WithinWrdFWrdSkp : public AbbrStra {
|
237 |
-
public:
|
238 |
-
virtual long strategy(const char *sf, const char *str);
|
239 |
-
};
|
240 |
-
|
241 |
-
|
242 |
-
/*
|
243 |
-
- at least one within word match
|
244 |
-
( success: Alpha InXyy (AX), x-Alpha InXyy (AX))
|
245 |
-
fail: Alpha Xyy (AX), Alpha 1-Xyy (AX))
|
246 |
-
*/
|
247 |
-
class WithinWrdLet : public AbbrStra {
|
248 |
-
public:
|
249 |
-
virtual long strategy(const char *sf, const char *str);
|
250 |
-
};
|
251 |
-
|
252 |
-
|
253 |
-
/*
|
254 |
-
- WithinWrdLet w/ Begin Word Match
|
255 |
-
(fail: Alpha InXyy (AX), x-Alpha InXyy (AX)
|
256 |
-
success: AlphaXyy Word (AXW), x-AlphaXyy 1-Word (AXW))
|
257 |
-
*/
|
258 |
-
class WithinWrdFLet : public AbbrStra {
|
259 |
-
public:
|
260 |
-
virtual long strategy(const char *sf, const char *str);
|
261 |
-
};
|
262 |
-
|
263 |
-
|
264 |
-
/*
|
265 |
-
- WithinWrdFLet w/ allowing one skip word between tokens (no more than one in row)
|
266 |
-
at least one skip word in total
|
267 |
-
(success: AlphaXyy zzz Word zzz (AXW)
|
268 |
-
fail: AlphaXyy Word (AXW), AlphaXyy zzz yyy Word (AXW))
|
269 |
-
*/
|
270 |
-
class WithinWrdFLetSkp : public AbbrStra {
|
271 |
-
public:
|
272 |
-
virtual long strategy(const char *sf, const char *str);
|
273 |
-
};
|
274 |
-
|
275 |
-
|
276 |
-
/*
|
277 |
-
- any two consecutive letter matching w/ begin word match
|
278 |
-
eg) ABxxx (AB), 1-ABxxx (AB), ABxxx Cxxx (ABC), Axxx BCxxx (ABC)
|
279 |
-
prolactin (PRL), succinylcholine (SCh)
|
280 |
-
*/
|
281 |
-
class ContLet : public AbbrStra {
|
282 |
-
public:
|
283 |
-
virtual long strategy(const char *sf, const char *str);
|
284 |
-
};
|
285 |
-
|
286 |
-
|
287 |
-
/*
|
288 |
-
- ContLet w/ allowing one skip word between tokens (no more than one in row)
|
289 |
-
at least one skip word in total
|
290 |
-
*/
|
291 |
-
class ContLetSkp : public AbbrStra {
|
292 |
-
public:
|
293 |
-
virtual long strategy(const char *sf, const char *str);
|
294 |
-
};
|
295 |
-
|
296 |
-
|
297 |
-
/*
|
298 |
-
- match can occur anywhere
|
299 |
-
- allow one skip word between tokens (no more than one in row)
|
300 |
-
(success: Alpha yXyy (AX), Alpha yXyy word (AX)
|
301 |
-
1-Alpha yXyy word (AX))
|
302 |
-
*/
|
303 |
-
class AnyLet : public AbbrStra {
|
304 |
-
public:
|
305 |
-
virtual long strategy(const char *sf, const char *str);
|
306 |
-
};
|
307 |
-
|
308 |
-
|
309 |
-
class StratUtil {
|
310 |
-
public:
|
311 |
-
AbbrStra *strat_factory(string name);
|
312 |
-
vector<string> get_strats(string s);
|
313 |
-
//get the strategy sequence for a given #-ch SF group
|
314 |
-
void push_back_strat(string sgp, string strat);
|
315 |
-
bool group_sf(const char *sf, string &grp);
|
316 |
-
//check if sf is ok and assign a group
|
317 |
-
bool group_sf(const char *sf, const char *lf, string &grp);
|
318 |
-
//add the condition |lf|>|sf|
|
319 |
-
void remove_nonAlnum(const char *str1, char *str2);
|
320 |
-
//remove non-alnum in str1 and save it to str2
|
321 |
-
long exist_upperal(const char *str); //return 1 if an upper-case char exists, 0 otherwise
|
322 |
-
long num_token(const char *str); //return # tokens
|
323 |
-
|
324 |
-
vector<string> Al1, Al2, Al3, Al4, Al5;
|
325 |
-
vector<string> Num2, Num3, Num4, Num5;
|
326 |
-
vector<string> Spec2, Spec3, Spec4, Spec5;
|
327 |
-
};
|
328 |
-
|
329 |
-
|
330 |
-
#endif
|
331 |
-
|
332 |
-
|
|
|
1 |
+
#ifndef ABBRSTRA_H
|
2 |
+
#define ABBRSTRA_H
|
3 |
+
|
4 |
+
#include <vector>
|
5 |
+
#include <string>
|
6 |
+
#include <Hash.h>
|
7 |
+
|
8 |
+
using namespace std;
|
9 |
+
using namespace iret;
|
10 |
+
|
11 |
+
|
12 |
+
class WordData {
|
13 |
+
public:
|
14 |
+
WordData(const char *wrdname="wrdset3", const char *stpname="stop",
|
15 |
+
const char *lfsname="Lf1chSf");
|
16 |
+
|
17 |
+
~WordData();
|
18 |
+
|
19 |
+
Chash wrdset; //single words in MEDLINE
|
20 |
+
Hash stp; //stopword
|
21 |
+
Hash lfs; //lfs (1-ch sf) for FirstLet match cases >=2
|
22 |
+
};
|
23 |
+
|
24 |
+
|
25 |
+
class AbbrStra {
|
26 |
+
public:
|
27 |
+
AbbrStra();
|
28 |
+
~AbbrStra();
|
29 |
+
void token(const char *str, char lst[1000][1000]); // tokenize & set ntk
|
30 |
+
long tokenize(const char *str, char lst[1000][1000]); //tokenize & return # tokens
|
31 |
+
long num_token(const char *str); //return # tokens
|
32 |
+
long first_ch(const char *str, char *fch, long num);
|
33 |
+
long is_upperal(const char *str);
|
34 |
+
long is_alpha(const char *str);
|
35 |
+
void str_tolower(const char *str1, char *str2);
|
36 |
+
long get_str(const char *str1, char *str2, long num);
|
37 |
+
bool isupper_str(const char *str);
|
38 |
+
bool is_onealpha(const char *str);
|
39 |
+
long count_upperstr(const char *str);
|
40 |
+
//return # upper-case 1st letter of consecutive tokens (backward)
|
41 |
+
void get_alpha(const char *str1, char *str2);
|
42 |
+
//set str2 with only alphabet of str1
|
43 |
+
bool lf_ok(const char *shrtf, const char *longf);
|
44 |
+
|
45 |
+
virtual bool set_condition(const char *sf);
|
46 |
+
//must set nonAlphaSF=true if want to use SF containing non-alphabet
|
47 |
+
virtual long strategy(const char *sf, const char *str) = 0;
|
48 |
+
//sf & str will be lower-cased (Oct-25-2007)
|
49 |
+
long search_backward(long sloc, long tnum, long tloc, const char *sf, bool first);
|
50 |
+
//search backward to find match starting from sf[sloc]
|
51 |
+
//Returns 1 if matches. sf[0] must match with begin word
|
52 |
+
long search_backward_adv(const char *sf, bool first);
|
53 |
+
//Searches for next model setting. Returns 1 if finds one.
|
54 |
+
void extract_lf(long begin, long end);
|
55 |
+
//save strings from begin to end of tok to lf
|
56 |
+
void extract_lf(long begin, long end, const char *str);
|
57 |
+
//save strings from begin to end of str's tok to lf
|
58 |
+
|
59 |
+
//---after set mod check conditions
|
60 |
+
//nsf:# ch in sf, nsw:# allowed skipword, general:true allow 1st ch match after non-alnum
|
61 |
+
bool exist_skipword(long nsf);
|
62 |
+
//true if at least one skip word exists
|
63 |
+
bool exist_n_skipwords(long nsf, long n);
|
64 |
+
//true if a run of n consecutive skip words exists between tokens, and no run exceeds n
|
65 |
+
bool exist_n_stopwords(long nsf, long n);
|
66 |
+
//true if a run of n consecutive skip stopwords exists between tokens, and no run exceeds n
|
67 |
+
bool stopword_ok(long nsf, long nsw);
|
68 |
+
//true if at most (can be 0) nsw skip stopword in row exists
|
69 |
+
bool skip_stop_ok(long nsf, long nsw, long n);
|
70 |
+
//true if at most (can be 0) nsw skip word, which include at least n stopwords, in row exists
|
71 |
+
bool skip_stop_ok2(long nsf, long nsw, long n);
|
72 |
+
//true if nsw skip word, which include at least n stopwords, in row exists
|
73 |
+
bool skipword_ok(long nsf, long nsw);
|
74 |
+
//true if at most (can be 0) nsw skip word in row exists
|
75 |
+
bool is_subword(long nsf);
|
76 |
+
//true if matching string is begin of a tok or a word in wrdlist
|
77 |
+
bool is_BeginWrdMatch(long nsf, bool general);
|
78 |
+
//true if beginning ch of a word matches
|
79 |
+
//if general is true, allow match after non-alnum (eg, 1-alpha)
|
80 |
+
bool is_WithinWrdMatch(long nsf, bool general);
|
81 |
+
//true if within word match
|
82 |
+
//if general is true, 1-Alpha: 'A' is not within word match
|
83 |
+
bool is_FirstLetMatch(long nsf, bool general);
|
84 |
+
//true if each ch of sf matches the 1st ch of a word
|
85 |
+
//(true: Alpha anyword Beta (AB))
|
86 |
+
//if general=true, true: 1-Alpha Beta, Alpha-Beta
|
87 |
+
bool is_FirstLetMatch2(long nsf, bool general);
|
88 |
+
//at least one 1-Alpha
|
89 |
+
bool is_FirstLetSMatch(const char *sf, bool general);
|
90 |
+
//true if first letter match & 's' match with last ch of lf
|
91 |
+
bool is_ContLetMatch(long nsf);
|
92 |
+
//true if two or more consecutive letters match
|
93 |
+
//---
|
94 |
+
|
95 |
+
char *pch; //sf applied to a strategy
|
96 |
+
char *ps, *pl; //sf, potential lf
|
97 |
+
char sf[100], text[10000]; //sf & potential lf used in a strategy
|
98 |
+
char lf[10000]; //lf found by a strategy
|
99 |
+
char tok[1000][1000]; //token of potential lf
|
100 |
+
//lower after strategy, original after extract_lf(b,e,str)
|
101 |
+
long ntk; //# tokens
|
102 |
+
long mod[100][2]; //match locations of tok with a given sf
|
103 |
+
//mod[sf_inx][0]=tok inx, mod[sf_inx][1]=match loc in tok[mod[sf_inx][0]]
|
104 |
+
|
105 |
+
//for each n_ch-SF
|
106 |
+
long npairs; //selected pairs for this strategy
|
107 |
+
long tpairs; //total pairs
|
108 |
+
long nsfs; //# selected unique sfs for this strategy
|
109 |
+
long nmatchs; //# matches (successful strategy & given sf == real sf)
|
110 |
+
long amatchs; //# accumulated matches up to this strategy
|
111 |
+
long setCondition; //SF condition
|
112 |
+
long greaterEqNsf; //if 1 select SF |SF|>=nsf
|
113 |
+
|
114 |
+
WordData *wData;
|
115 |
+
};
|
116 |
+
|
117 |
+
|
118 |
+
/*
|
119 |
+
alpha beta gamma (ABG)
|
120 |
+
*/
|
121 |
+
class FirstLet : public AbbrStra {
|
122 |
+
public:
|
123 |
+
virtual bool set_condition(const char *str1, const char *str2, char *str);
|
124 |
+
virtual long strategy(const char *sf, const char *str);
|
125 |
+
};
|
126 |
+
|
127 |
+
|
128 |
+
class FirstLetOneChSF : public AbbrStra {
|
129 |
+
public:
|
130 |
+
virtual bool set_condition(const char *str1, const char *str2, char *str);
|
131 |
+
virtual long strategy(const char *sf, const char *str);
|
132 |
+
};
|
133 |
+
|
134 |
+
|
135 |
+
/*
|
136 |
+
- sf ch matches with 1st ch or ch right after non-alphanum of lf
|
137 |
+
but at least one match right after non-alphanum
|
138 |
+
(eg, success: 1-alpha 2-beta (AB), alpha-beta(AB),
|
139 |
+
fail: alpha beta(AB))
|
140 |
+
*/
|
141 |
+
class FirstLetGen : public AbbrStra {
|
142 |
+
public:
|
143 |
+
virtual long strategy(const char *sf, const char *str);
|
144 |
+
};
|
145 |
+
|
146 |
+
|
147 |
+
/*
|
148 |
+
- sf ch matches with 1st ch or ch right after non-alphanum of lf
|
149 |
+
(eg, success: 1-alpha 2-beta (AB), alpha-beta(AB),
|
150 |
+
alpha beta(AB))
|
151 |
+
*/
|
152 |
+
class FirstLetGen2 : public AbbrStra {
|
153 |
+
public:
|
154 |
+
virtual long strategy(const char *sf, const char *str);
|
155 |
+
};
|
156 |
+
|
157 |
+
|
158 |
+
/*
|
159 |
+
For sf consisting of capital letters & lower-case 's'
|
160 |
+
- First letter & 's' in the last token of lf
|
161 |
+
(success: Alpha Betas (ABs), 1-Alpha Betas (ABs),
|
162 |
+
1-Alpha-Betas (ABs), Alpha BetaS (ABs)
|
163 |
+
fail: Alpha Beta xxs (ABs) )
|
164 |
+
*/
|
165 |
+
class FirstLetGenS : public AbbrStra {
|
166 |
+
public:
|
167 |
+
virtual bool set_condition(const char *sf); //sf must be an original sf
|
168 |
+
//true if sf is like ABCs
|
169 |
+
virtual long strategy(const char *sf, const char *str);
|
170 |
+
};
|
171 |
+
|
172 |
+
|
173 |
+
/*
|
174 |
+
- sf ch matches with 1st ch or ch right after non-alphanum of lf
|
175 |
+
- allowing one skip stopword between tokens (no more than one in row)
|
176 |
+
at least one skip stopword in total
|
177 |
+
(eg, success: alpha and beta (AB), 1-alpha and beta (AB)
|
178 |
+
fail: alpha beta (AB), alpha word beta (AB))
|
179 |
+
*/
|
180 |
+
class FirstLetGenStp : public AbbrStra {
|
181 |
+
public:
|
182 |
+
virtual long strategy(const char *sf, const char *str);
|
183 |
+
};
|
184 |
+
|
185 |
+
|
186 |
+
/*
|
187 |
+
- same as FirstLetGenStp except for 2 skip stopwords
|
188 |
+
& at least one two consecutive skip stopwords
|
189 |
+
*/
|
190 |
+
class FirstLetGenStp2 : public AbbrStra {
|
191 |
+
public:
|
192 |
+
virtual long strategy(const char *sf, const char *str);
|
193 |
+
};
|
194 |
+
|
195 |
+
|
196 |
+
/*
|
197 |
+
- same as FirstLetGenStp except using skip any word instead of stopword
|
198 |
+
*/
|
199 |
+
class FirstLetGenSkp : public AbbrStra {
|
200 |
+
public:
|
201 |
+
virtual long strategy(const char *sf, const char *str);
|
202 |
+
};
|
203 |
+
|
204 |
+
|
205 |
+
/*
|
206 |
+
- a matching sub-string must be word
|
207 |
+
(eg, success: AlphaBeta (AB), Beta is word
|
208 |
+
x-AlphaBeta (AB) )
|
209 |
+
- at least one within word match
|
210 |
+
(eg,fail: Alpha Beta Word (ABW), Alpha x-Beta x-Word (ABW)
|
211 |
+
success: AlphaBeta Word (ABW), x-AlphaBeta inWord (ABW))
|
212 |
+
*/
|
213 |
+
class WithinWrdWrd : public AbbrStra {
|
214 |
+
public:
|
215 |
+
virtual long strategy(const char *sf, const char *str);
|
216 |
+
};
|
217 |
+
|
218 |
+
|
219 |
+
/*
|
220 |
+
- WithinWrdWrd w/ Begin Word Match
|
221 |
+
(success: AlphaBeta x-Word (ABW)
|
222 |
+
fail: AlphaBeta inWord (ABW) )
|
223 |
+
*/
|
224 |
+
class WithinWrdFWrd : public AbbrStra {
|
225 |
+
public:
|
226 |
+
virtual long strategy(const char *sf, const char *str);
|
227 |
+
};
|
228 |
+
|
229 |
+
|
230 |
+
/*
|
231 |
+
- WithinWrdFWrd w/ allowing one skip word between tokens (no more than one in row)
|
232 |
+
at least one skip word in total
|
233 |
+
(success: AlphaBeta zzz x-Word zzz (ABW)
|
234 |
+
fail: AlphaBeta x-Word (ABW), AlphaBeta zzz yyy x-Word (ABW))
|
235 |
+
*/
|
236 |
+
class WithinWrdFWrdSkp : public AbbrStra {
|
237 |
+
public:
|
238 |
+
virtual long strategy(const char *sf, const char *str);
|
239 |
+
};
|
240 |
+
|
241 |
+
|
242 |
+
/*
|
243 |
+
- at least one within word match
|
244 |
+
( success: Alpha InXyy (AX), x-Alpha InXyy (AX))
|
245 |
+
fail: Alpha Xyy (AX), Alpha 1-Xyy (AX))
|
246 |
+
*/
|
247 |
+
class WithinWrdLet : public AbbrStra {
|
248 |
+
public:
|
249 |
+
virtual long strategy(const char *sf, const char *str);
|
250 |
+
};
|
251 |
+
|
252 |
+
|
253 |
+
/*
|
254 |
+
- WithinWrdLet w/ Begin Word Match
|
255 |
+
(fail: Alpha InXyy (AX), x-Alpha InXyy (AX)
|
256 |
+
success: AlphaXyy Word (AXW), x-AlphaXyy 1-Word (AXW))
|
257 |
+
*/
|
258 |
+
class WithinWrdFLet : public AbbrStra {
|
259 |
+
public:
|
260 |
+
virtual long strategy(const char *sf, const char *str);
|
261 |
+
};
|
262 |
+
|
263 |
+
|
264 |
+
/*
|
265 |
+
- WithinWrdFLet w/ allowing one skip word between tokens (no more than one in row)
|
266 |
+
at least one skip word in total
|
267 |
+
(success: AlphaXyy zzz Word zzz (AXW)
|
268 |
+
fail: AlphaXyy Word (AXW), AlphaXyy zzz yyy Word (AXW))
|
269 |
+
*/
|
270 |
+
class WithinWrdFLetSkp : public AbbrStra {
|
271 |
+
public:
|
272 |
+
virtual long strategy(const char *sf, const char *str);
|
273 |
+
};
|
274 |
+
|
275 |
+
|
276 |
+
/*
|
277 |
+
- any two consecutive letter matching w/ begin word match
|
278 |
+
eg) ABxxx (AB), 1-ABxxx (AB), ABxxx Cxxx (ABC), Axxx BCxxx (ABC)
|
279 |
+
prolactin (PRL), succinylcholine (SCh)
|
280 |
+
*/
|
281 |
+
class ContLet : public AbbrStra {
|
282 |
+
public:
|
283 |
+
virtual long strategy(const char *sf, const char *str);
|
284 |
+
};
|
285 |
+
|
286 |
+
|
287 |
+
/*
|
288 |
+
- ContLet w/ allowing one skip word between tokens (no more than one in row)
|
289 |
+
at least one skip word in total
|
290 |
+
*/
|
291 |
+
class ContLetSkp : public AbbrStra {
|
292 |
+
public:
|
293 |
+
virtual long strategy(const char *sf, const char *str);
|
294 |
+
};
|
295 |
+
|
296 |
+
|
297 |
+
/*
|
298 |
+
- match can occur anywhere
|
299 |
+
- allow one skip word between tokens (no more than one in row)
|
300 |
+
(success: Alpha yXyy (AX), Alpha yXyy word (AX)
|
301 |
+
1-Alpha yXyy word (AX))
|
302 |
+
*/
|
303 |
+
class AnyLet : public AbbrStra {
|
304 |
+
public:
|
305 |
+
virtual long strategy(const char *sf, const char *str);
|
306 |
+
};
|
307 |
+
|
308 |
+
|
309 |
+
class StratUtil {
|
310 |
+
public:
|
311 |
+
AbbrStra *strat_factory(string name);
|
312 |
+
vector<string> get_strats(string s);
|
313 |
+
//get the strategy sequence for a given #-ch SF group
|
314 |
+
void push_back_strat(string sgp, string strat);
|
315 |
+
bool group_sf(const char *sf, string &grp);
|
316 |
+
//check if sf is ok and assign a group
|
317 |
+
bool group_sf(const char *sf, const char *lf, string &grp);
|
318 |
+
//add the condition |lf|>|sf|
|
319 |
+
void remove_nonAlnum(const char *str1, char *str2);
|
320 |
+
//remove non-alnum in str1 and save it to str2
|
321 |
+
long exist_upperal(const char *str); //return 1 if an upper-case char exists, 0 otherwise
|
322 |
+
long num_token(const char *str); //return # tokens
|
323 |
+
|
324 |
+
vector<string> Al1, Al2, Al3, Al4, Al5;
|
325 |
+
vector<string> Num2, Num3, Num4, Num5;
|
326 |
+
vector<string> Spec2, Spec3, Spec4, Spec5;
|
327 |
+
};
|
328 |
+
|
329 |
+
|
330 |
+
#endif
|
331 |
+
|
332 |
+
|
Library/AbbrvE.C
CHANGED
@@ -1,629 +1,629 @@
|
|
1 |
-
#include "AbbrvE.h"
|
2 |
-
#include <sstream>
|
3 |
-
|
4 |
-
namespace iret {
|
5 |
-
|
6 |
-
Find_Seq::Find_Seq( void )
|
7 |
-
/* initializers work in C++0x
|
8 |
-
:
|
9 |
-
seq_i ( { "i", "ii", "iii", "iv", "v", "vi" } ),
|
10 |
-
seq_I ( { "I", "II", "III", "IV", "V", "VI" } ),
|
11 |
-
seq_a ( { "a", "b", "c", "d", "e", "f" } ),
|
12 |
-
seq_A ( { "A", "B", "C", "D", "E", "F" } )
|
13 |
-
*/
|
14 |
-
{
|
15 |
-
seq_i.push_back("i");
|
16 |
-
seq_i.push_back("ii");
|
17 |
-
seq_i.push_back("iii");
|
18 |
-
seq_i.push_back("iv");
|
19 |
-
seq_i.push_back("v");
|
20 |
-
seq_i.push_back("vi");
|
21 |
-
|
22 |
-
seq_I.push_back("I");
|
23 |
-
seq_I.push_back("II");
|
24 |
-
seq_I.push_back("III");
|
25 |
-
seq_I.push_back("IV");
|
26 |
-
seq_I.push_back("V");
|
27 |
-
seq_I.push_back("VI");
|
28 |
-
|
29 |
-
seq_a.push_back("a");
|
30 |
-
seq_a.push_back("b");
|
31 |
-
seq_a.push_back("c");
|
32 |
-
seq_a.push_back("d");
|
33 |
-
seq_a.push_back("e");
|
34 |
-
seq_a.push_back("f");
|
35 |
-
|
36 |
-
seq_A.push_back("A");
|
37 |
-
seq_A.push_back("B");
|
38 |
-
seq_A.push_back("C");
|
39 |
-
seq_A.push_back("D");
|
40 |
-
seq_A.push_back("E");
|
41 |
-
seq_A.push_back("F");
|
42 |
-
|
43 |
-
}
|
44 |
-
|
45 |
-
void
|
46 |
-
Find_Seq::flag_seq( int numa, char* abbs[] ) {
|
47 |
-
|
48 |
-
my_numa = numa;
|
49 |
-
my_abbs = abbs;
|
50 |
-
|
51 |
-
my_rate.resize(numa);
|
52 |
-
for ( int i = 0; i < numa; ++i )
|
53 |
-
my_rate[i] = true;
|
54 |
-
|
55 |
-
find_seq(seq_i);
|
56 |
-
find_seq(seq_I);
|
57 |
-
find_seq(seq_a);
|
58 |
-
find_seq(seq_A);
|
59 |
-
|
60 |
-
create_seq();
|
61 |
-
}
|
62 |
-
|
63 |
-
|
64 |
-
void
|
65 |
-
Find_Seq::find_seq( const vector<string> & seq ) {
|
66 |
-
|
67 |
-
for ( int i_abbr = 0; i_abbr < my_numa-1; ++i_abbr ) {
|
68 |
-
// need to see at least 2 in sequence
|
69 |
-
|
70 |
-
if ( seq[0] == my_abbs[i_abbr] ) {
|
71 |
-
int i_seq = 1;
|
72 |
-
while ( i_seq < seq.size() and
|
73 |
-
i_seq + i_abbr < my_numa and
|
74 |
-
seq[i_seq] == my_abbs[i_abbr + i_seq ] )
|
75 |
-
++i_seq;
|
76 |
-
|
77 |
-
if ( i_seq > 1 )
|
78 |
-
for ( int i = 0; i < i_seq; ++i )
|
79 |
-
my_rate[i_abbr+i] = false;
|
80 |
-
}
|
81 |
-
|
82 |
-
}
|
83 |
-
}
|
84 |
-
|
85 |
-
void
|
86 |
-
Find_Seq::create_seq( void ) {
|
87 |
-
|
88 |
-
for ( int i_abbr = 0; i_abbr < my_numa; ++i_abbr ) {
|
89 |
-
|
90 |
-
size_t len = std::strlen( my_abbs[i_abbr] );
|
91 |
-
|
92 |
-
if ( my_abbs[i_abbr][len-1] == '1' ) {
|
93 |
-
// create sequence and test
|
94 |
-
|
95 |
-
string prefix( my_abbs[i_abbr], len-1 );
|
96 |
-
size_t seq_len = my_numa - i_abbr; // max possible length
|
97 |
-
vector<string> seq;
|
98 |
-
// sequence starts with 1
|
99 |
-
for ( int i= 1; i <= seq_len; ++i ) {
|
100 |
-
std::ostringstream stream(prefix,std::ios::app);
|
101 |
-
stream << i;
|
102 |
-
seq.push_back( stream.str() );
|
103 |
-
}
|
104 |
-
|
105 |
-
// cout << seq << '\n';
|
106 |
-
find_seq(seq);
|
107 |
-
}
|
108 |
-
}
|
109 |
-
}
|
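A hedged illustration of the sequence filter above: short forms that form a counted or lettered series are flagged (my_rate set to false) so they are not treated as genuine abbreviations. The example is inferred from the code:

// abbs = { "G1", "G2", "G3", "TNF" }
// "G1" ends in '1', so create_seq generates "G1","G2","G3",... and
// find_seq flags all three entries; "TNF" keeps my_rate == true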
110 |
-
|
111 |
-
|
112 |
-
AbbrvE::AbbrvE(long ta,long wrd_spc){
|
113 |
-
tta=ta;
|
114 |
-
word_space=wrd_spc;
|
115 |
-
abbl=new char*[tta];
|
116 |
-
abbs=new char*[tta];
|
117 |
-
nt=new int[tta];
|
118 |
-
lst=new char*[word_space];
|
119 |
-
numa=num=0;
|
120 |
-
pMt=new MPtok;
|
121 |
-
setup_Test();
|
122 |
-
}
|
123 |
-
|
124 |
-
AbbrvE::~AbbrvE(){
|
125 |
-
if(numa)cleara();
|
126 |
-
clear();
|
127 |
-
delete [] abbl;
|
128 |
-
delete [] abbs;
|
129 |
-
delete [] nt;
|
130 |
-
delete [] lst;
|
131 |
-
delete pMt;
|
132 |
-
}
|
133 |
-
|
134 |
-
void AbbrvE::Extract(char*pch){
|
135 |
-
long i,j,k,u,flag;
|
136 |
-
int ix;
|
137 |
-
|
138 |
-
if ( strlen(pch) <= 0 ) // no text to look at
|
139 |
-
return;
|
140 |
-
|
141 |
-
token(pch);
|
142 |
-
|
143 |
-
i=j=k=0;
|
144 |
-
flag=0;
|
145 |
-
while(i<num){
|
146 |
-
if(!strcmp("(",lst[i])){
|
147 |
-
if(flag)k=j+1;
|
148 |
-
if((i>k)&&(strcmp(")",lst[i-1]))){
|
149 |
-
j=i;
|
150 |
-
flag=1;
|
151 |
-
}
|
152 |
-
}
|
153 |
-
if(!strcmp(")",lst[i])){
|
154 |
-
if(!flag){j=k=i+1;}
|
155 |
-
else {
|
156 |
-
if(((j>k)&&(i<j+12))&&(i>j+1)){
|
157 |
-
if(k<j-10)k=j-10;
|
158 |
-
strcpy(cnam,lst[k]);
|
159 |
-
for(u=k+1;u<j;u++){
|
160 |
-
strcat(cnam," ");
|
161 |
-
strcat(cnam,lst[u]);
|
162 |
-
}
|
163 |
-
ix=strlen(cnam);
|
164 |
-
abbl[numa]=new char[ix+1];
|
165 |
-
strcpy(abbl[numa],cnam);
|
166 |
-
|
167 |
-
strcpy(cnam,lst[j+1]);
|
168 |
-
for(u=j+2;u<i;u++){
|
169 |
-
strcat(cnam," ");
|
170 |
-
strcat(cnam,lst[u]);
|
171 |
-
}
|
172 |
-
nt[numa]=i-j-1;
|
173 |
-
ix=strlen(cnam);
|
174 |
-
abbs[numa]=new char[ix+1];
|
175 |
-
strcpy(abbs[numa],cnam);
|
176 |
-
if(Test(abbs[numa]))numa++;
|
177 |
-
else{ //if test done earlier would not need to allocate memory
|
178 |
-
//until known to be needed
|
179 |
-
delete [] abbs[numa];
|
180 |
-
delete [] abbl[numa];
|
181 |
-
}
|
182 |
-
flag=0;
|
183 |
-
}
|
184 |
-
else {
|
185 |
-
flag=0;
|
186 |
-
k=i+1;
|
187 |
-
}
|
188 |
-
}
|
189 |
-
}
|
190 |
-
i++;
|
191 |
-
}
|
192 |
-
}
|
193 |
-
|
194 |
-
|
195 |
-
//modified Jan-9-2008
|
196 |
-
//extract SF in [], parse until ';' or ',' in () or []
|
197 |
-
void AbbrvE::Extract2(const char*pch){
|
198 |
-
long i,j,k,u,ii,jj,kk,flag;
|
199 |
-
int ix;
|
200 |
-
char openCh[2], closeCh[2];
|
201 |
-
|
202 |
-
token2(pch); // alpha beta (AB) -> alpha beta ( AB )
|
203 |
-
|
204 |
-
for(jj=0; jj<2; jj++) {//deal with both () & []
|
205 |
-
i=j=k=0;
|
206 |
-
flag=0;
|
207 |
-
|
208 |
-
      if(jj==0) { strcpy(openCh,"("); strcpy(closeCh,")"); }
      else if(jj==1) { strcpy(openCh,"["); strcpy(closeCh,"]"); }

      while(i<num){
         if(!strcmp(openCh,lst[i])){
            if(flag)k=j+1; //increment after seeing both '(' and ')'
            if((i>k)&&(strcmp(closeCh,lst[i-1]))){
               j=i; //index of '('
               flag=1;
            }
         }
         if(!strcmp(closeCh,lst[i])){
            if(!flag){j=k=i+1;} //next token
            else {
               if(((j>k)&&(i<j+12))&&(i>j+1)){
                  if(k<j-10)k=j-10;
                  strcpy(cnam,lst[k]);
                  for(u=k+1;u<j;u++){
                     strcat(cnam," ");
                     strcat(cnam,lst[u]);
                  }
                  ix=strlen(cnam);
                  abbl[numa]=new char[ix+1];
                  strcpy(abbl[numa],cnam);

                  strcpy(cnam,lst[j+1]);
                  for(u=j+2;u<i;u++){
                     strcat(cnam," ");
                     strcat(cnam,lst[u]);
                  }
                  nt[numa]=i-j-1; //# abbr tokens
                  ix=strlen(cnam);

                  //---- parse until ';' or ','
                  ii=0;
                  while(ii<ix) {
                     if( ((cnam[ii]==';')&&(cnam[ii+1]==' ')) ||
                         ((cnam[ii]==',')&&(cnam[ii+1]==' ')) ) {
                        ix=ii+1;
                        cnam[ii]='\0';
                        break;
                     }
                     ii++;
                  }
                  //----

                  abbs[numa]=new char[ix+1];
                  strcpy(abbs[numa],cnam);
                  if(Test(abbs[numa]))numa++;
                  else{ //if test done earlier would not need to allocate memory
                        //until known to be needed
                     delete [] abbs[numa];
                     delete [] abbl[numa];
                  }
                  flag=0;
               }
               else {
                  flag=0;
                  k=i+1;
               }
            }
         }
         i++;
      }
   }
}


void AbbrvE::token(const char *pch){
   long i=1,j=0,k=0;
   long u=1,flag=0;
   char c,*str=cnam;
   int size=strlen(pch);
   if(size>cnam_size) {
      cerr<<"Scratch space "<<cnam_size<<", needed "<<size<<endl;
      exit(1);
   }
   clear(); // ready space for tokens
   cnam[0]=pch[0];
   while(c=pch[i]){
      switch(c){
         case '(': if(isblank(str[u-1])){
                      str[u++]=pch[i++];
                      if(!isblank(pch[i])){
                         str[u++]=' ';
                      }
                      flag=1;
                   }
                   else str[u++]=pch[i++];
                   break;
         case ')': if(flag){
                      if(!isblank(str[u-1])){
                         str[u++]=' ';
                         str[u++]=pch[i++];
                      }
                      if(!isblank(pch[i]))str[u++]=' ';
                      flag=0;
                   }
                   else str[u++]=pch[i++];
                   break;
         default: str[u++]=pch[i++];
      }
   }
   while((u>0)&&(isblank(str[u-1])))u--;
   str[u]='\0';

   while(str[j]){
      while(isblank(str[j]))j++;
      i=j;
      while((str[j])&&(!isblank(str[j])))j++;
      lst[k]=new char[j-i+1];
      strncpy(lst[k],str+i,j-i);
      lst[k][j-i]='\0';
      if(str[j]){
         k++;
         j++;
      }
   }
   num=k+1;
}


//both () & [] Jan-9-2008
//(G(1)) -> ( G(1) ) Jan-28-2008
void AbbrvE::token2(const char *pch){
   long i=1,j=0,k=0;
   long u=1;
   vector<bool> openChFlag1,openChFlag2;
   long cflag;
   long ii, jj, kk, sz;
   char c,*str=cnam;
   clear(); // ready space for tokens
   cnam[0]=pch[0];
   while(c=pch[i]){
      switch(c){
         case '(':
            //--- (h)alpha -> (h)alpha, (h)-alpha -> ( h ) -alpha
            ii=kk=i;
            cflag=0;
            while(pch[ii] && !isblank(pch[ii])) { //pch[ii] can be '\0'
               if(pch[ii]=='(') cflag -= 1;
               else if(pch[ii]==')') { cflag += 1; kk=ii; }
               ii++;
            }

            if(!cflag && isalnum(pch[kk+1])) { //if alnum right after ')'
               while(i<ii) str[u++]=pch[i++];
               break;
            }
            //---

            if(isblank(str[u-1])){
               str[u++]=pch[i++];
               if(!isblank(pch[i])){
                  str[u++]=' ';
               }
               openChFlag1.push_back(true);
            }
            else {
               str[u++]=pch[i++];
               openChFlag1.push_back(false);
            }

            break;

         case ')': sz = openChFlag1.size();
            if(sz>0 && openChFlag1[sz-1]){ //modified Jan-28-08
               if(!isblank(str[u-1])){
                  str[u++]=' ';
                  str[u++]=pch[i++]; //pch[i++] is ')'
               }
               //---added (Jan-11-08): (BIV; ), -> ( BIV; ) ,
               else if(!isblank(pch[i+1])){
                  str[u++]=pch[i++]; //pch[i++] is ')'
               }
               //---

               if(!isblank(pch[i]))str[u++]=' '; //pch[i] must be after ')'
            }
            else str[u++]=pch[i++];

            if(sz>0) openChFlag1.pop_back();

            break;

         case '[':
            //--- [h]alpha -> [h]alpha
            ii=kk=i;
            cflag=0;
            while(pch[ii] && !isblank(pch[ii])) { //pch[ii] can be '\0'
               if(pch[ii]=='[') cflag -= 1;
               else if(pch[ii]==']') { cflag += 1; kk=ii; }
               ii++;
            }

            if(!cflag && isalnum(pch[kk+1])) { //if alnum right after ')'
               while(i<ii) str[u++]=pch[i++];
               break;
            }
            //---

            if(isblank(str[u-1])){
               str[u++]=pch[i++];
               if(!isblank(pch[i])){
                  str[u++]=' ';
               }
               openChFlag2.push_back(true);
            }
            else {
               str[u++]=pch[i++];
               openChFlag2.push_back(false);
            }

            break;

         case ']': sz=openChFlag2.size();
            if(sz>0 && openChFlag2[sz-1]){ //modified Jan-28-08
               if(!isblank(str[u-1])){
                  str[u++]=' ';
                  str[u++]=pch[i++];
               }
               //---added (Jan-11-08): [BIV; ], -> [ BIV; ] ,
               else if(!isblank(pch[i+1])){
                  str[u++]=pch[i++];
               }
               //---
               if(!isblank(pch[i]))str[u++]=' ';
            }
            else str[u++]=pch[i++];

            if(sz>0) openChFlag2.pop_back();

            break;
         default: str[u++]=pch[i++];
      }
   }
   while((u>0)&&(isblank(str[u-1])))u--;
   str[u]='\0';

   while(str[j]){
      while(isblank(str[j]))j++;
      i=j;
      while((str[j])&&(!isblank(str[j])))j++;
      lst[k]=new char[j-i+1];
      strncpy(lst[k],str+i,j-i);
      lst[k][j-i]='\0';
      if(str[j]){
         k++;
         j++;
      }
   }
   num=k+1;
}

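// --- Illustrative sketch, not part of the original AbbrvE.C ---
// Per the comments above, token2 sets a short form's parentheses off
// with spaces ("alpha beta (AB)" -> "alpha beta ( AB )") but leaves a
// token such as "(h)alpha", where an alphanumeric follows ')', intact.
// A minimal driver under that assumption (the sentence is invented):
#include "AbbrvE.h"
#include <iostream>
using namespace iret;

int main(){
   AbbrvE ab;                        // default sizes: 10000/10000
   ab.token2("alpha beta (AB) and (h)alpha");
   for(long t=0;t<ab.num;t++)        // expected tokens: alpha beta ( AB )
      std::cout<<ab.lst[t]<<"\n";    // and (h)alpha -- "( AB )" split out,
   return 0;                         // "(h)alpha" kept as one token
}
// --- end sketch ---
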
void AbbrvE::clear(void){
   for ( int i=0; i<num; i++ ) {
      delete [] lst[i];
   }
   num=0;
}

void AbbrvE::cleara(void){
   long i;
   for(i=0;i<numa;i++){
      delete [] abbl[i];
      delete [] abbs[i];
   }
   numa=0;
}

#if 0

//no space before and after abbs[] (because of using token)
int AbbrvE::Test(const char *str){
   long i,j,k;
   char b,c;

   if(strchr(str,'='))return(0);
   if(!strcmp(str,"author's transl"))return(0);
   if(!strcmp(str,"proceedings"))return(0);
   //---added (Jan-11-08) & (Apr 08)
   if(!strcmp(str,"see"))return(0);
   if(!strcmp(str,"and"))return(0);
   if(!strcmp(str,"comment"))return(0);
   if(!strcmp(str,"letter"))return(0);
   //---
   if((str[0]=='e')&&(str[1]=='g')){
      if(!(c=str[2])||(c=='.')||(c==','))return(0);
   }
   if((str[0]=='s')&&(str[1]=='e')&&(str[2]=='e')&&(((b=str[3])==' ')||(b==',')))return(0);
   if('p'==tolower(str[0])){
      if(strchr(str+1,'<'))return(0);
   }
   i=j=k=0;
   while((c=str[i])&&(c!=' ')){
      i++;
      if(isdigit(c))j++;
      if(isalpha(c))k++;
      if((i==j)&&(i==3))return(0);
   }
   if((i==j)||(k==0))return(0);
   else return(1);
}

#endif

bool AbbrvE::prefix_match( const char *str ) {
   size_t size = strlen(str);
   for ( int i = 0; i < prefix.size(); ++i ) {
      string& pre = prefix[i];
      if ( size > pre.size() and
           0 == pre.compare( 0, pre.size(), str, pre.size() ) )
         return true;
   }
   return false;
}


//no space before and after abbs[] (because of using token)
bool AbbrvE::Test(const char *str){

   if ( match.find(str) != match.end() ) return false;
   if ( prefix_match(str) ) return false;

   size_t length, letters, digits;
   length = letters = digits = 0;

   char c;
   while((c=str[length])&&(c!=' ')){
      length++;
      if ( isdigit(c) ) digits++;
      if ( isalpha(c) ) letters++;

      if( length==digits and length>=3 ) return false;
   }
   if ( digits == length ) return false;
   if ( letters <= 0 ) return false;

   return true;
}

void AbbrvE::setup_Test( void ) {

   match.insert("author's transl");
   match.insert("proceedings");
   match.insert("see");
   match.insert("and");
   match.insert("comment");
   match.insert("letter");
   match.insert("eg");

   prefix.push_back("=");
   prefix.push_back("eg.");
   prefix.push_back("eg,");
   prefix.push_back("see ");
   prefix.push_back("see,");
   prefix.push_back("p<");
   prefix.push_back("P<");

   // rules added in 2010
   match.insert("e.g.");
   match.insert("ie");
   match.insert("i.e.");
   match.insert("mean");
   match.insert("age");
   match.insert("std");
   match.insert("range");
   match.insert("young");
   match.insert("old");
   match.insert("male");
   match.insert("female");

}

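// --- Illustrative sketch, not part of the original AbbrvE.C ---
// Combined effect of setup_Test/prefix_match/Test above: a candidate
// short form survives only if it is not an exact stop word, does not
// begin with a stop prefix, and its first token is neither all digits
// nor letter-free. The candidate strings below are invented:
#include "AbbrvE.h"
#include <iostream>
using namespace iret;

int main(){
   AbbrvE ab;                          // constructor calls setup_Test()
   const char *cand[]={"BIV","p<0.05","see also","123","IL-2"};
   for(int i=0;i<5;i++)                // expect: BIV and IL-2 kept;
      std::cout<<cand[i]<<" -> "       // "p<" prefix, "see " prefix, and
               <<(ab.Test(cand[i])?"keep":"drop")<<"\n"; // all-digit dropped
   return 0;
}
// --- end sketch ---
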
void AbbrvE::Proc(char *pxh){
   long i,j;
   char *pch,*ptr;
   pMt->segment(pxh);
   for(i=0;i<pMt->sent.size();i++){
      Extract2( (pMt->sent[i]).c_str() );
   }

   seq.flag_seq( numa, abbs );
   j=0;
   for(i=0;i<numa;i++){
      if( seq.rate(i) ){
         if(j<i){
            pch=abbl[i];
            if(ptr=strchr(pch,'|')){
               *ptr='/';
               ptr++;
               while(ptr=strchr(pch,'|')){
                  *ptr='/';
                  ptr++;
               }
            }
            abbl[j]=pch;
            pch=abbs[i];
            if(ptr=strchr(pch,'|')){
               *ptr='/';
               ptr++;
               while(ptr=strchr(pch,'|')){
                  *ptr='/';
                  ptr++;
               }
            }
            abbs[j]=pch;
            nt[j]=nt[i];
         }
         j++;
      }
      else {
         delete [] abbl[i];
         delete [] abbs[i];
      }
   }

   numa=j;
}

}
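
Putting the pieces together: Proc segments free text, runs Extract2 on each sentence, filters sequence markers through Find_Seq, and leaves numa candidate pairs in abbl/abbs. A minimal driver, assuming the library builds as shipped (the input sentence is invented for the example):

#include "AbbrvE.h"
#include <iostream>
using namespace iret;

int main(){
   AbbrvE ab;
   char text[]="Bovine immunodeficiency virus (BIV) was studied.";
   ab.Proc(text);                      // fills abbl/abbs/nt
   for(long i=0;i<ab.numa;i++)         // expected pair: long form | short form
      std::cout<<ab.abbl[i]<<" | "<<ab.abbs[i]<<"\n";
   ab.cleara();                        // required after each Proc call
   return 0;
}
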
Library/AbbrvE.h
CHANGED
@@ -1,93 +1,93 @@
#ifndef ABBRVE_H
#define ABBRVE_H
#include <fstream>
#include <iostream>
#include <runn.h>
#include <MPtok.h>
#include <vector>
#include <set>
using namespace std;
namespace iret {

typedef vector<string> strings;


class Find_Seq {
   public:

      Find_Seq( void );

      // flag the SFs whether part of sequence or not
      void flag_seq( int numa, char* abbs[] );

      // true if good SF, false if part of sequence
      bool rate( int i ) const { return my_rate[i]; }

   private:
      void find_seq( const vector<string> & seq );
      void create_seq( void );

      // const works with c++0x
      /* const */ strings seq_i;
      /* const */ strings seq_I;
      /* const */ strings seq_a;
      /* const */ strings seq_A;

      vector<bool> my_rate;
      int my_numa;
      char ** my_abbs; // really char *[], but that doesn't work

};


class AbbrvE {
   public:
      AbbrvE(long ta=10000,long wrd_spc=10000); //Sets space for extracted
         //potential abbreviations to ta & word_space to wrd_spc
      ~AbbrvE(void);
      void Extract(char *pch); //Extracts possible long-short form
         //pairs, but does not attempt to find the relationship
      void Extract2(const char *pch); //extended version (Jan-9-2008)
      bool Test(const char *str); //Tests a single token and returns true
         //if the token should be a possible first token of a short form
      void Rate(void); //Sets ratings for the proposed pairs. Effort to
         //remove (a), (b), etc., sequence markers
      void token(const char *str); //Produces a list of tokens in order
         //of occurrence in the string.
      void token2(const char *str); //extended version (Jan-9-2008)
      void cleara(void); //Clear the abbl & abbs memory of strings
      void clear(void); //Clear the lst memory of words

      //Application functions
      void Proc(char *pch); //Accepts a natural language statement and
         //processes to final results stored in tta, abbs, and abbl
         //Need to call cleara function after each use of this function

      // Internal routines:
      // setup data for Test method
      void setup_Test( void );
      bool prefix_match( const char *str ); // does str begin with a prefix?

      //Data
      long tta; //Total possible abbreviations extracted
         //default 10k
      long numa; //number of abbreviations in current extract
      char **abbl; //Long form space, hold up to 10 tokens
      char **abbs; //Short form space, hold up to 10 tokens
      Find_Seq seq; // identify sequences to ignore
      int *nt; //Number of tokens within parentheses
      long word_space; //Space in lst for tokens
         //default 10k
      long num; //Number of tokens
      char **lst; //Holds the tokens

      static const int cnam_size=100000;
      char cnam[cnam_size]; //Work space
      MPtok *pMt; //Pointer at tokenizer class. Used to segment text
         //in Proc function.

      // Test data
      set<string> match; // bad SF to match exactly
      vector<string> prefix; // bad SF to match prefix
};
}
#endif
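
Find_Seq exists to keep enumeration markers out of the results: flag_seq scans the extracted short forms for runs such as i, ii, iii or A, B, C (and, via create_seq, prefix1, prefix2, ...), and rate(i) then reports false for members of such a run. A sketch of that behavior, with an invented candidate array:

#include "AbbrvE.h"
#include <iostream>
using namespace iret;

int main(){
   char a0[]="BIV", a1[]="i", a2[]="ii", a3[]="iii";
   char *abbs[]={a0,a1,a2,a3};
   Find_Seq seq;
   seq.flag_seq(4,abbs);               // flags the i, ii, iii run
   for(int i=0;i<4;i++)                // expect: BIV -> 1; i, ii, iii -> 0
      std::cout<<abbs[i]<<" -> "<<seq.rate(i)<<"\n";
   return 0;
}
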
Library/Btree.C
CHANGED
@@ -1,1304 +1,1304 @@
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <cassert>
#include "Btree.h"
#include "runn.h"

using namespace std;
namespace iret {

Node::Node(void){
   str=NULL;
   rel=NULL;
   pdn=NULL;
}

Node::Node(const char *ptr){
   int i=strlen(ptr);
   str = new char[i+1];
   strcpy(str,ptr);
   rel=NULL;
   pdn=NULL;
}

Node::Node(char const *ptr,void *dtr){
   int i=strlen(ptr);
   str = new char[i+1];
   strcpy(str,ptr);
   rel = dtr;
   pdn=NULL;
}

Node::~Node(){
   if(str)delete [] str;
}

void Node::debug(void){
   cout << "Node {" << endl;
   cout << "  str: " << this->str << endl;
   if(rel==NULL)cout << "  rel: NULL" << endl;
   else cout << "  rel: " << (long)rel << endl;
   if(pdn==NULL)cout << "  pdn: NULL" << endl;
   else cout << "  pdn: " << (long)pdn << endl;
   cout << "  }" << endl;
}

Page::Page(){
   pdn=NULL;
   ndnm='\0';
}

Page::Page(Page *const pz,Page *const pn,const int n){
   pdn=pn;
   int j=(int)(pz->ndnm)-n;
   ndnm=(char)(j>0 ? j : 0);
   for(int i=0;i<(int)ndnm;i++){pnd[i]=(pz->pnd)[n+i];}
}

Page::~Page(){
   for(int i=0;i<(int)ndnm;i++){
      delete pnd[i];
   }
}

void Page::clean(void){
   for(int i=0;i<(int)ndnm;i++){
      pnd[i]->str=NULL;
   }
}

void Page::insert(const int n,Node * const nd,const int j){
   assert(j<ord2);
   assert(n<=j);
   if(n==j){
      pnd[j]=nd;
   }
   else {
      for(int i=j;i>n;i--)pnd[i]=pnd[i-1];
      pnd[n]=nd;
   }
   ndnm++;
}

int Page::search(int &a,int &b,const char *str,int &p){
   int j;
   if((j=stc_my(a,b,str,pnd[0]->str))<0){
      p=0;
      return(0);
   }
   else if(j==0){
      p=0;
      return(1);
   }
   if((j=stc_my(a,b,str,pnd[(int)(ndnm-1)]->str))>0){
      p=(int)ndnm;
      return(0);
   }
   else if(j==0){
      p=(int)(ndnm-1);
      return(1);
   }
   int x=0,i;
   int y=(int)(ndnm-1);
   while(y-x>1){
      i=(y+x)/2;
      if((j=stc_my(a,b,str,pnd[i]->str))==0){p=i;return(1);}
      else if(j<0)y=i;
      else x=i;
   }
   p=y;
   return(0);
}

int Page::search(int &a,int &b,char *str,int &p,Partial_match *btr){
   int j;
   if((j=btr->stc_my_long(a,b,str,pnd[0]->str,0))<0){
      p=0;
      return(0);
   }
   else if(j==0){
      p=0;
      return(1);
   }
   if((j=btr->stc_my_long(a,b,str,pnd[(int)(ndnm-1)]->str,(int)(ndnm-1)))>0){
      p=(int)ndnm;
      return(0);
   }
   else if(j==0){
      p=(int)(ndnm-1);
      return(1);
   }
   int x=0,i;
   int y=(int)(ndnm-1);
   while(y-x>1){
      i=(y+x)/2;
      if((j=btr->stc_my_long(a,b,str,pnd[i]->str,i))==0){p=i;return(1);}
      else if(j<0)y=i;
      else x=i;
   }
   p=y;
   return(0);
}

void Page::debug(void){
   cout << "Page {" << endl;
   cout << "  ndnm: " << (int)ndnm << endl;
   if(pdn==NULL)cout << "  pdn: NULL" << endl;
   else cout << "  pdn: " << (long)pdn << endl;
   for(int i=0;i<(int)ndnm;i++){
      cout << i << " ";
      (this->pnd[i])->debug();
   }
   cout << "  }" << endl;
}

int stc_my(int &a,int &b,const char *str,const char *ptr)
{register int i=(a<b) ? a : b;
   register const char *p1=str+i;
   register const char *p2=ptr+i;
   register int j=0;
   while((*p1==*p2)&&(*p1!='\0')){
      j++;
      p1++;
      p2++;
   }
   if(*p1==*p2)return(0);
   else if(*p1<*p2){
      b=i+j;
      return(-1);
   }
   else {
      a=i+j;
      return(1);
   }
}

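// --- Illustrative sketch, not part of the original Btree.C ---
// stc_my is a three-way string compare that reuses known-prefix
// information: a and b track how many leading characters of the probe
// already matched the keys bounding it from below and above, so each
// comparison can start at min(a,b) instead of position 0, and the bound
// on the matching side is tightened as the search descends. The helper
// below reproduces that logic so the sketch compiles stand-alone:
#include <cstdio>

static int stc_my_demo(int &a,int &b,const char *s,const char *t){
   int i=(a<b)?a:b, j=0;
   const char *p1=s+i, *p2=t+i;
   while(*p1==*p2 && *p1){ j++; p1++; p2++; }
   if(*p1==*p2) return 0;
   if(*p1<*p2){ b=i+j; return -1; }
   a=i+j; return 1;
}

int main(){
   int a=0,b=0;
   int r=stc_my_demo(a,b,"abcde","abcxy"); // differs at index 3
   printf("%d a=%d b=%d\n",r,a,b);         // "abcde"<"abcxy": r=-1, b=3
   r=stc_my_demo(a,b,"abcde","abba");      // restarts at min(a,b)=0
   printf("%d a=%d b=%d\n",r,a,b);         // "abcde">"abba": r=1, a=2
   return 0;
}
// --- end sketch ---
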
Btree::Btree(){
   iclean=0;
   copy=false;
   depth=0;
   root=new Page;
   root->ndnm = 1;
   (root->pnd)[0]=new Node("");
}

int Btree::search(const char *str){
   depth=-1;
   Page *pu=root;
   register int a=0,b=0,i,j;
   while(pu!=NULL){
      depth++;
      pg[depth]=pu;
      j=(pu->search)(a,b,str,i);
      cnd[depth]=i;
      if(j==1)return(1);
      if(i==0)pu=pu->pdn;
      else pu=(pu->pnd)[i-1]->pdn;
   }
   return(0);
}

int Btree::insert(Node *nd){
   int w,k;
   Page *pm,*pz;
   while((nd!=NULL)&&(depth)){
      pm=pg[depth];
      w=pm->ndnm;
      if(w<ord2){
         pm->insert(cnd[depth],nd,w);
         nd=NULL;
      }
      else {
         k=cnd[depth];
         if(k<order){
            pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
            pm->insert(k,nd,order);
            nd=pm->pnd[order];
            nd->pdn=pz;
            pm->ndnm=order;
         }
         else if(k>order){
            pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
            pz->insert(k-order-1,nd,order-1);
            nd=pm->pnd[order];
            nd->pdn=pz;
            pm->ndnm=order;
         }
         else {
            pz=new Page(pm,nd->pdn,order);
            nd->pdn=pz;
            pm->ndnm=order;
         }
      }
      depth--;
   }
   if(nd!=NULL){
      pm=pg[depth];
      w=pm->ndnm;
      if(w<ord2)pm->insert(cnd[depth],nd,w);
      else {
         root=new Page();
         root->pdn=pm;
         k=cnd[depth];
         if(k<order){
            pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
            pm->insert(k,nd,order);
            (root->pnd)[0]=pm->pnd[order];
            ((root->pnd)[0])->pdn=pz;
            root->ndnm=1;
            pm->ndnm=order;
         }
         else if(k>order){
            pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
            pz->insert(k-order-1,nd,order-1);
            (root->pnd)[0]=pm->pnd[order];
            ((root->pnd)[0])->pdn=pz;
            root->ndnm=1;
            pm->ndnm=order;
         }
         else {
            pz=new Page(pm,nd->pdn,order);
            (root->pnd)[0]=nd;
            nd->pdn=pz;
            root->ndnm=1;
            pm->ndnm=order;
         }
      }
   }
   return(1);
}

void Btree::node_first(void){
   depth=0;
   pg[depth]=root;
   cnd[depth]=0;
   Page *pm;
   while((pm=(pg[depth]->pdn))!=NULL){
      depth++;
      pg[depth]=pm;
      cnd[depth]=0;
   }
}

int Btree::node_next(){
   int i=cnd[depth];
   Page *pd=((pg[depth]->pnd)[i])->pdn;
   if(pd!=NULL){
      (cnd[depth])++;
      depth++;
      pg[depth]=pd;
      cnd[depth]=0;
      while((pd=(pg[depth]->pdn))!=NULL){
         depth++;
         pg[depth]=pd;
         cnd[depth]=0;
      }
   }
   else {
      cnd[depth]=++i;
      while((depth>=1)&&(i==(pg[depth]->ndnm))){depth--;i=cnd[depth];}
      if((depth==0)&&(i==(pg[depth]->ndnm)))depth--;
      if(depth<0)return(0);
   }
   return(1);
}

char *Btree::show_str(){
   return(((pg[depth]->pnd)[cnd[depth]])->str);
}

void *Btree::give_ptr(){
   return(((pg[depth]->pnd)[cnd[depth]])->rel);
}

void Btree::set_ptr(void *dtr){
   ((pg[depth]->pnd)[cnd[depth]])->rel=dtr;
}

Btree::~Btree(){
   int pflag=get_qflag();
   long k=0;
   if (copy) return; // only delete original
   if(!iclean){
      node_first();
      int i=depth,j;
      do{
         j=node_next();
         if(depth<i){
            while(i>depth){
               delete pg[i];
               i--;
               mark(pflag,++k,1000,"pages deleted");
            }
         }
         else i=depth;
      } while(j);
   }
   else {
      node_first();
      int i=depth,j;
      do{
         j=node_next();
         if(depth<i){
            while(i>depth){
               pg[i]->clean();
               delete pg[i];
               i--;
               mark(pflag,++k,1000,"pages deleted");
            }
         }
         else i=depth;
      } while(j);
   }
}

long Btree::list_write(ofstream &fout){
   int pflag=get_qflag();
   long ct=0;
   node_first();
   while(node_next()){
      fout << show_str() << endl;
      mark(pflag,++ct,1000,"strings written");
   }
   fout.close();
   return((int)fout.good());
}

Btree::Btree(ifstream &fin){
   copy=false;
   char cnam[256];
   int pflag=get_qflag();
   depth=0;
   pg[0]=root=new Page();
   cnd[0]=root->ndnm = 1;
   (root->pnd)[0]=new Node("");
   Node *pno;
   long ct=0;
   while(get_string(cnam,fin,'\n')){
      pno = new Node(cnam);
      add(pno);
      mark(pflag,++ct,10000,"strings read");
   }
   fin.close();
}

int Btree::add(Node *nd){
   int w,k,dp;
   Page *pm,*pz;
   dp=depth;   //uses dp in place of depth in insert.
   while((nd!=NULL)&&(dp)){
      pm=pg[dp];
      w=pm->ndnm;
      if(w<ord2){
         pm->insert(cnd[dp],nd,w);
         nd=NULL;
         (cnd[dp])++;   //variation from insert.
      }
      else {
         k=cnd[dp];
         if(k<order){
            pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
            pm->insert(k,nd,order);
            nd=pm->pnd[order];
            nd->pdn=pz;
            pm->ndnm=order;
         }
         else if(k>order){
            pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
            pz->insert(k-order-1,nd,order-1);
            nd=pm->pnd[order];
            nd->pdn=pz;
            pm->ndnm=order;
         }
         else {
            pz=new Page(pm,nd->pdn,order);
            nd->pdn=pz;
            pm->ndnm=order;
         }
         pg[dp]=pz;   //2 lines of variation from insert.
         cnd[dp]=order;
      }
      dp--;
   }
   if(nd!=NULL){
      pm=pg[dp];
      w=pm->ndnm;
      if(w<ord2){
         pm->insert(cnd[dp],nd,w);
         (cnd[dp])++;   //variation from insert.
      }
      else {
         root=new Page();
         root->pdn=pm;
         k=cnd[dp];
         if(k<order){
            pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
            pm->insert(k,nd,order);
            (root->pnd)[0]=pm->pnd[order];
            ((root->pnd)[0])->pdn=pz;
            root->ndnm=1;
            pm->ndnm=order;
         }
         else if(k>order){
            pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
            pz->insert(k-order-1,nd,order-1);
            (root->pnd)[0]=pm->pnd[order];
            ((root->pnd)[0])->pdn=pz;
            root->ndnm=1;
            pm->ndnm=order;
         }
         else {
            pz=new Page(pm,nd->pdn,order);
            (root->pnd)[0]=nd;
            nd->pdn=pz;
            root->ndnm=1;
            pm->ndnm=order;
         }
         next_empty();   //variation from insert.
      }
   }
   return(1);
}

void Btree::next_empty(){
   depth=0;
   pg[depth]=root;
   int i=cnd[depth]=root->ndnm;
   Page *pm;
   while((pm=((pg[depth]->pnd)[i-1])->pdn)!=NULL){
      depth++;
      pg[depth]=pm;
      i=cnd[depth]=pm->ndnm;
   }
}

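// --- Illustrative sketch, not part of the original Btree.C ---
// The pattern used by the derived classes below is search-then-insert:
// Btree::search records the descent path in pg/cnd, and Btree::insert
// then splits full pages (order/ord2 are expected to come from Btree.h)
// while attaching a payload through Node::rel. This mirrors what
// add_pair and add_count do; the key string is invented:
#include "Btree.h"
#include <iostream>
using namespace iret;

int main(){
   Btree bt;
   long payload=42;                    // value carried on the node
   if(!bt.search("hemoglobin"))        // search() records the path...
      bt.insert(new Node("hemoglobin",(void*)&payload)); // ...insert() uses it
   if(bt.search("hemoglobin"))         // found: cursor sits on the node
      std::cout << *(long*)bt.give_ptr() << "\n";        // prints 42
   bt.node_first();                    // in-order traversal idiom
   while(bt.node_next()) std::cout << bt.show_str() << "\n";
   return 0;
}
// --- end sketch ---
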
Str_str::Str_str() : Btree() {
}

Str_str::~Str_str(){
   if(copy)return;
   this->node_first();
   while(this->node_next())delete [] (char*)this->give_ptr();
}

void Str_str::add_pair(const char *one,const char *two){
   Node *pnd;
   if(search(one)){
      cout << "Duplicate string in keys list = " << one << endl;
      exit(0);
   }
   else {
      int i=strlen(two);
      char *st=new char[i+1];
      strcpy(st,two);
      pnd=new Node(one,(void *)st);
      add(pnd);
   }
}

char *Str_str::match(const char *one){
   if(search(one)){
      return((char*)give_ptr());
   }
   else {
      cout << "String not a key = " << one << endl;
      exit(0);
   }
}

List::List() : Btree() {
   cnt_key=0;
}

List::~List(){
}

void List::add_key(const char *str){
   Node *pnd;
   if(!search(str)){
      pnd=new Node(str);
      add(pnd);
   }
}

void List::add_key_count(const char *str){
   Node *pnd;
   if(!search(str)){
      pnd=new Node(str);
      add(pnd);
      cnt_key++;
   }
}

void List::addp_key_count(char *str){
   Node *pnd;
   if(!search(str)){
      pnd=new Node;
      pnd->str=str;
      add(pnd);
      cnt_key++;
   }
}

Num_num::Num_num() : Btree() {
}

Num_num::~Num_num(){
   if(copy)return;
   this->node_first();
   while(this->node_next())delete (long*)this->give_ptr();
}

void Num_num::add_pair(long i,long j){
   Node *pnd;
   char cnam[256];
   long_str(cnam,i);
   if(!search(cnam)){
      long *st=new long;
      *st=j;
      pnd=new Node(cnam,(void *)st);
      add(pnd);
   }
}

long Num_num::match(long i){
   char cnam[256];
   long_str(cnam,i);
   if(search(cnam)){
      return(*((long*)give_ptr()));
   }
   else return(LNEG);
}

Count::Count() : List() {
   total=0;
}

Count::~Count(){
   if(copy)return;
   long *pk;
   this->node_first();
   while(this->node_next()){
      pk=(long*)(this->give_ptr());
      if(pk)delete pk;
   }
}

void Count::add_count(const char *pch,long n){
   long *ppt;
   Node *np;
   total+=n;
   if(this->search(pch)==0){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      (*(long*) this->give_ptr())+=n;
   }
}

void Count::add_countz(const char *pch,long n){
   long *ppt;
   Node *np;
   if(this->search(pch)==0){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(long*) this->give_ptr())+=n;
   }
}

void Count::add_count2(const char *pch,long n){
   long *ppt;
   Node *np;
   total+=n;
   if(this->search(pch)==0){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(long*) this->give_ptr())+=n;
   }
}

void Count::addp_count2(char *pch,long n){
   long *ppt;
   Node *np;
   total+=n;
   if(this->search(pch)==0){
      ppt = new long;
      (*ppt) =n;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(long*) this->give_ptr())+=n;
   }
}

void Count::correct(const char *pch,long n){
   if(this->search(pch)){
      (*(long*) this->give_ptr())=n;
   }
}

long Count::count(const char *pch){
   if(this->search(pch)==0){
      return(0);
   }
   else {
      return(*((long*) this->give_ptr()));
   }
}

long Count::count(void){
   return(*((long*) this->give_ptr()));
}

void Count::max_count(const char *pch,long n){
   long *ppt,i;
   Node *np;
   total+=n;
   if(!search(pch)){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      ppt=(long*)give_ptr();
      if(*ppt<n)*ppt=n;
   }
}

void Count::max_count2(const char *pch,long n){
   long *ppt,i;
   Node *np;
   total+=n;
   if(!search(pch)){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(long*)give_ptr();
      if(*ppt<n)*ppt=n;
   }
}

void Count::maxp_count2(char *pch,long n){
   long *ppt,i;
   Node *np;
   total+=n;
   if(!search(pch)){
      ppt = new long;
      (*ppt) =n;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(long*)give_ptr();
      if(*ppt<n)*ppt=n;
   }
}

void Count::min_count(const char *pch,long n){
   long *ppt,i;
   Node *np;
   total+=n;
   if(!search(pch)){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      ppt=(long*)give_ptr();
      if(*ppt>n)*ppt=n;
   }
}

void Count::min_count2(const char *pch,long n){
   long *ppt,i;
   Node *np;
   total+=n;
   if(!search(pch)){
      ppt = new long;
      (*ppt) =n;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(long*)give_ptr();
      if(*ppt>n)*ppt=n;
   }
}

void Count::minp_count2(char *pch,long n){
   long *ppt,i;
   Node *np;
   total+=n;
   if(!search(pch)){
      ppt = new long;
      (*ppt) =n;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(long*)give_ptr();
      if(*ppt>n)*ppt=n;
   }
}

//FCount (float count tree)

FCount::FCount() : List() {
   total=0;
}

FCount::~FCount(){
   if(copy)return;
   float *pk;
   this->node_first();
   while(this->node_next()){
      pk=(float*)(this->give_ptr());
      if(pk)delete pk;
   }
}

void FCount::Copy(FCount &Fc){
   char *pch;
   float *xx,*zz;
   Node *pN;

   pg[0]=root;
   cnd[0]=root->ndnm;

   Fc.node_first();
   while(Fc.node_next()){
      pch=Fc.show_str();
      xx=(float*)Fc.give_ptr();
      zz=new float;
      *zz=*xx;
      pN=new Node(pch,(void*)zz);
      add(pN);
   }
}

void FCount::add_count(const char *pch,float z){
   float *ppt;
   Node *np;
   total+=z;
   if(this->search(pch)==0){
      ppt = new float;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      (*(float*) this->give_ptr())+=z;
   }
}

void FCount::add_count2(const char *pch,float z){
   float *ppt;
   Node *np;
   total+=z;
   if(this->search(pch)==0){
      ppt = new float;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(float*) this->give_ptr())+=z;
   }
}

void FCount::addp_count2(char *pch,float z){
   float *ppt;
   Node *np;
   total+=z;
   if(this->search(pch)==0){
      ppt = new float;
      (*ppt) =z;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(float*) this->give_ptr())+=z;
   }
}

float FCount::count(const char *pch){
   if(this->search(pch)==0){
      return(0);
   }
   else {
      return(*((float*) this->give_ptr()));
   }
}

float FCount::count(void){
   return(*((float*) this->give_ptr()));
}

//DCount (double precision count tree)

DCount::DCount() : List() {
   total=0;
}

DCount::~DCount(){
   if(copy)return;
   double *pk;
   this->node_first();
   while(this->node_next()){
      pk=(double*)(this->give_ptr());
      if(pk)delete pk;
   }
}

void DCount::Copy(DCount &Dc){
   char *pch;
   double *xx,*zz;
   Node *pN;

   pg[0]=root;
   cnd[0]=root->ndnm;

   Dc.node_first();
   while(Dc.node_next()){
      pch=Dc.show_str();
      xx=(double*)Dc.give_ptr();
      zz=new double;
      *zz=*xx;
      pN=new Node(pch,(void*)zz);
      add(pN);
   }
}

void DCount::add_count(const char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(this->search(pch)==0){
      ppt = new double;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      (*(double*) this->give_ptr())+=z;
   }
}

void DCount::add_count2(const char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(this->search(pch)==0){
      ppt = new double;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(double*) this->give_ptr())+=z;
   }
}

void DCount::addp_count2(char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(this->search(pch)==0){
      ppt = new double;
      (*ppt) =z;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      (*(double*) this->give_ptr())+=z;
   }
}

double DCount::count(const char *pch){
   if(this->search(pch)==0){
      return(0);
   }
   else {
      return(*((double*) this->give_ptr()));
   }
}

double DCount::count(void){
   return(*((double*) this->give_ptr()));
}

void DCount::max_count(const char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(!search(pch)){
      ppt = new double;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      ppt=(double*)give_ptr();
      if(*ppt<z)*ppt=z;
   }
}

void DCount::max_count2(const char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(!search(pch)){
      ppt = new double;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(double*)give_ptr();
      if(*ppt<z)*ppt=z;
   }
}

void DCount::maxp_count2(char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(!search(pch)){
      ppt = new double;
      (*ppt) =z;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(double*)give_ptr();
      if(*ppt<z)*ppt=z;
   }
}

void DCount::min_count(const char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(!search(pch)){
      ppt = new double;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
   }
   else {
      ppt=(double*)give_ptr();
      if(*ppt>z)*ppt=z;
   }
}

void DCount::min_count2(const char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(!search(pch)){
      ppt = new double;
      (*ppt) =z;
      np=new Node(pch,(void*)ppt);
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(double*)give_ptr();
      if(*ppt>z)*ppt=z;
   }
}

void DCount::minp_count2(char *pch,double z){
   double *ppt;
   Node *np;
   total+=z;
   if(!search(pch)){
      ppt = new double;
      (*ppt) =z;
      np=new Node;
      np->str=pch;
      np->rel=ppt;
      this->insert(np);
      cnt_key++;
   }
   else {
      ppt=(double*)give_ptr();
      if(*ppt>z)*ppt=z;
   }
}

void DCount::debug(void){
   node_first();
   while(node_next()){
      cout << count() << " " << show_str() << endl;
   }
}

//Partial Match

Partial_match::Partial_match() : Count() {
}

Partial_match::~Partial_match(){
}

void Partial_match::long_match(char *str,List &Lst){
   char *pch;
   while(*str!='\0'){
      if(this->search_long(str)){
         pch=this->show_str();
         Lst.add_key_count(pch);
      }
      if((pch=strchr(str,' '))!=NULL)str=pch+1;
      else str=str+strlen(str);
   }
}

void Partial_match::local_match(char *str,List &Lst){
   char *pch;
   int i,j;
   if(*str!='\0'){
      if(this->search_long(str)){
         pch=this->show_str();
         Lst.add_key_count(pch);
         i=strlen(pch)-1;
         while(0<i){
            while((0<i)&&(*(str+i)!=' '))i--;
            if(0<i){
               *(str+i)='\0';
               j=this->search(str);
               *(str+i)=' ';
               if(j){
                  pch=this->show_str();
                  Lst.add_key_count(pch);
               }
               i--;
            }
         }
      }
   }
}

void Partial_match::all_match(char *str,List &Lst){
   char *pch;
   int i,j;
   while(*str!='\0'){
      if(this->search_long(str)){
         pch=this->show_str();
         Lst.add_key_count(pch);
         i=strlen(pch)-1;
         while(0<i){
            while((0<i)&&(*(str+i)!=' '))i--;
-
*(str+i)='\0';
|
1140 |
-
j=this->search(str);
|
1141 |
-
*(str+i)=' ';
|
1142 |
-
if(j){
|
1143 |
-
pch=this->show_str();
|
1144 |
-
Lst.add_key_count(pch);
|
1145 |
-
}
|
1146 |
-
i--;
|
1147 |
-
}
|
1148 |
-
}
|
1149 |
-
}
|
1150 |
-
if((pch=strchr(str,' '))!=NULL)str=pch+1;
|
1151 |
-
else str=str+strlen(str);
|
1152 |
-
}
|
1153 |
-
}
|
1154 |
-
|
1155 |
-
void Partial_match::long_match(char *str,Count &Cnt,long n){
|
1156 |
-
char *pch;
|
1157 |
-
while(*str!='\0'){
|
1158 |
-
if(this->search_long(str)){
|
1159 |
-
pch=this->show_str();
|
1160 |
-
Cnt.add_count2(pch,n);
|
1161 |
-
}
|
1162 |
-
if((pch=strchr(str,' '))!=NULL)str=pch+1;
|
1163 |
-
else str=str+strlen(str);
|
1164 |
-
}
|
1165 |
-
}
|
1166 |
-
|
1167 |
-
void Partial_match::local_match(char *str,Count &Cnt,long n){
|
1168 |
-
char *pch;
|
1169 |
-
int i,j;
|
1170 |
-
if(*str!='\0'){
|
1171 |
-
if(this->search_long(str)){
|
1172 |
-
pch=this->show_str();
|
1173 |
-
Cnt.add_count2(pch,n);
|
1174 |
-
i=strlen(pch)-1;
|
1175 |
-
while(0<i){
|
1176 |
-
while((0<i)&&(*(str+i)!=' '))i--;
|
1177 |
-
if(0<i){
|
1178 |
-
*(str+i)='\0';
|
1179 |
-
j=this->search(str);
|
1180 |
-
*(str+i)=' ';
|
1181 |
-
if(j){
|
1182 |
-
pch=this->show_str();
|
1183 |
-
Cnt.add_count2(pch,n);
|
1184 |
-
}
|
1185 |
-
i--;
|
1186 |
-
}
|
1187 |
-
}
|
1188 |
-
}
|
1189 |
-
}
|
1190 |
-
}
|
1191 |
-
|
1192 |
-
void Partial_match::all_match(char *str,Count &Cnt,long n){
|
1193 |
-
char *pch;
|
1194 |
-
int i,j;
|
1195 |
-
while(*str!='\0'){
|
1196 |
-
if(this->search_long(str)){
|
1197 |
-
pch=this->show_str();
|
1198 |
-
Cnt.add_count2(pch,n);
|
1199 |
-
i=strlen(pch)-1;
|
1200 |
-
while(0<i){
|
1201 |
-
while((0<i)&&(*(str+i)!=' '))i--;
|
1202 |
-
if(0<i){
|
1203 |
-
*(str+i)='\0';
|
1204 |
-
j=this->search(str);
|
1205 |
-
*(str+i)=' ';
|
1206 |
-
if(j){
|
1207 |
-
pch=this->show_str();
|
1208 |
-
Cnt.add_count2(pch,n);
|
1209 |
-
}
|
1210 |
-
i--;
|
1211 |
-
}
|
1212 |
-
}
|
1213 |
-
}
|
1214 |
-
if((pch=strchr(str,' '))!=NULL)str=pch+1;
|
1215 |
-
else str=str+strlen(str);
|
1216 |
-
}
|
1217 |
-
}
|
1218 |
-
|
1219 |
-
int Partial_match::search_long(char *str){
|
1220 |
-
int a=0,b=0,i,j;
|
1221 |
-
len=strlen(str);
|
1222 |
-
if(this->step_one(a,b,str))return(1);
|
1223 |
-
i=(a<b)?b:a;
|
1224 |
-
while(cln_o<i){
|
1225 |
-
while((cln_o<i)&&(*(str+i)!=' '))i--;
|
1226 |
-
if(cln_o<i){
|
1227 |
-
*(str+i)='\0';
|
1228 |
-
j=this->search(str);
|
1229 |
-
*(str+i)=' ';
|
1230 |
-
if(j)return(1);
|
1231 |
-
i--;
|
1232 |
-
}
|
1233 |
-
}
|
1234 |
-
if(cln_o){
|
1235 |
-
depth=depth_o;
|
1236 |
-
cnd[depth]=index_o;
|
1237 |
-
return(1);
|
1238 |
-
}
|
1239 |
-
else return(0);
|
1240 |
-
}
|
1241 |
-
|
1242 |
-
int Partial_match::step_one(int &a,int &b,char *str){
|
1243 |
-
char c;
|
1244 |
-
cln_o=0;
|
1245 |
-
cln=0;
|
1246 |
-
while((c=*(str+cln))&&c!=32)cln++;
|
1247 |
-
*(str+cln)='\0';
|
1248 |
-
depth=-1;
|
1249 |
-
Page *pu=root;
|
1250 |
-
int i,j;
|
1251 |
-
while(pu!=NULL){
|
1252 |
-
depth++;
|
1253 |
-
pg[depth]=pu;
|
1254 |
-
j=(pu->search)(a,b,str,i,this);
|
1255 |
-
cnd[depth]=i;
|
1256 |
-
if(j==1)return(1);
|
1257 |
-
if(i==0)pu=pu->pdn;
|
1258 |
-
else pu=(pu->pnd)[i-1]->pdn;
|
1259 |
-
}
|
1260 |
-
|
1261 |
-
if(cln<len)*(str+cln)=' ';
|
1262 |
-
return(0);
|
1263 |
-
}
|
1264 |
-
|
1265 |
-
int Partial_match::stc_my_long(int &a,int &b,char *str,const char *ptr,int index)
|
1266 |
-
{char c;
|
1267 |
-
int i=(a<b) ? a : b;
|
1268 |
-
const char *p1=str+i;
|
1269 |
-
const char *p2=ptr+i;
|
1270 |
-
int j=0;
|
1271 |
-
while((*p1==*p2)&&(*p1!='\0')){
|
1272 |
-
j++;
|
1273 |
-
p1++;
|
1274 |
-
p2++;
|
1275 |
-
if((*p1=='\0'&&*p2!='\0')&&(cln<len)){
|
1276 |
-
*(str+cln++)=' ';
|
1277 |
-
while((c=*(str+cln))&&c!=32)cln++;
|
1278 |
-
*(str+cln)='\0';
|
1279 |
-
}
|
1280 |
-
}
|
1281 |
-
if(*p1==*p2){
|
1282 |
-
if(cln<len){
|
1283 |
-
depth_o=depth;
|
1284 |
-
index_o=index;
|
1285 |
-
cln_o=cln;
|
1286 |
-
*(str+cln++)=' ';
|
1287 |
-
while((c=*(str+cln))&&c!=32)cln++;
|
1288 |
-
*(str+cln)='\0';
|
1289 |
-
a=i+j;
|
1290 |
-
return(1);
|
1291 |
-
}
|
1292 |
-
else return(0);
|
1293 |
-
}
|
1294 |
-
else if(*p1<*p2){
|
1295 |
-
b=i+j;
|
1296 |
-
return(-1);
|
1297 |
-
}
|
1298 |
-
else {
|
1299 |
-
a=i+j;
|
1300 |
-
return(1);
|
1301 |
-
}
|
1302 |
-
}
|
1303 |
-
|
1304 |
-
}
|
|
|
1 |
+
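Before the re-added copy of Btree.C below, a minimal usage sketch of the counting-tree API implemented in this file. It is not part of the commit; it assumes Btree.h is on the include path and the program is linked against this library, and the key strings are invented for illustration.

#include <iostream>
#include "Btree.h"
using namespace std;
using namespace iret;

int main(){
  Count freq;                  // B-tree keyed by string, long count as payload
  freq.add_count2("kinase",1); // new key: count set to 1, cnt_key incremented
  freq.add_count2("kinase",2); // existing key: count becomes 3, cnt_key unchanged
  freq.add_count2("p53",5);
  cout << "unique keys: " << freq.cnt_key
       << ", total: " << freq.total << endl;   // 2 and 8
  cout << freq.count("kinase") << endl;        // 3
  cout << freq.count("myc") << endl;           // 0 for absent keys
  freq.node_first();           // in-order traversal, same pattern as DCount::debug
  while(freq.node_next()){
    cout << freq.count() << " " << freq.show_str() << endl;
  }
  return 0;
}

Note the design split visible above: add_count2 differs from add_count only in also incrementing cnt_key, so cnt_key tracks the number of distinct keys while total accumulates every count that was added.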
+#include <iostream>
+#include <fstream>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include "Btree.h"
+#include "runn.h"
+
+using namespace std;
+namespace iret {
+
+Node::Node(void){
+  str=NULL;
+  rel=NULL;
+  pdn=NULL;
+}
+
+Node::Node(const char *ptr){
+  int i=strlen(ptr);
+  str = new char[i+1];
+  strcpy(str,ptr);
+  rel=NULL;
+  pdn=NULL;
+}
+
+Node::Node(char const *ptr,void *dtr){
+  int i=strlen(ptr);
+  str = new char[i+1];
+  strcpy(str,ptr);
+  rel = dtr;
+  pdn=NULL;
+}
+
+Node::~Node(){
+  if(str)delete [] str;
+}
+
+void Node::debug(void){
+  cout << "Node {" << endl;
+  cout << " str: " << this->str << endl;
+  if(rel==NULL)cout << " rel: NULL" << endl;
+  else cout << " rel: " << (long)rel << endl;
+  if(pdn==NULL)cout << " pdn: NULL" << endl;
+  else cout << " pdn: " << (long)pdn << endl;
+  cout << " }" << endl;
+}
+
+Page::Page(){
+  pdn=NULL;
+  ndnm='\0';
+}
+
+Page::Page(Page *const pz,Page *const pn,const int n){
+  pdn=pn;
+  int j=(int)(pz->ndnm)-n;
+  ndnm=(char)(j>0 ? j : 0);
+  for(int i=0;i<(int)ndnm;i++){pnd[i]=(pz->pnd)[n+i];}
+}
+
+Page::~Page(){
+  for(int i=0;i<(int)ndnm;i++){
+    delete pnd[i];
+  }
+}
+
+void Page::clean(void){
+  for(int i=0;i<(int)ndnm;i++){
+    pnd[i]->str=NULL;
+  }
+}
+
+void Page::insert(const int n,Node * const nd,const int j){
+  assert(j<ord2);
+  assert(n<=j);
+  if(n==j){
+    pnd[j]=nd;
+  }
+  else {
+    for(int i=j;i>n;i--)pnd[i]=pnd[i-1];
+    pnd[n]=nd;
+  }
+  ndnm++;
+}
+
+int Page::search(int &a,int &b,const char *str,int &p){
+  int j;
+  if((j=stc_my(a,b,str,pnd[0]->str))<0){
+    p=0;
+    return(0);
+  }
+  else if(j==0){
+    p=0;
+    return(1);
+  }
+  if((j=stc_my(a,b,str,pnd[(int)(ndnm-1)]->str))>0){
+    p=(int)ndnm;
+    return(0);
+  }
+  else if(j==0){
+    p=(int)(ndnm-1);
+    return(1);
+  }
+  int x=0,i;
+  int y=(int)(ndnm-1);
+  while(y-x>1){
+    i=(y+x)/2;
+    if((j=stc_my(a,b,str,pnd[i]->str))==0){p=i;return(1);}
+    else if(j<0)y=i;
+    else x=i;
+  }
+  p=y;
+  return(0);
+}
+
+int Page::search(int &a,int &b,char *str,int &p,Partial_match *btr){
+  int j;
+  if((j=btr->stc_my_long(a,b,str,pnd[0]->str,0))<0){
+    p=0;
+    return(0);
+  }
+  else if(j==0){
+    p=0;
+    return(1);
+  }
+  if((j=btr->stc_my_long(a,b,str,pnd[(int)(ndnm-1)]->str,(int)(ndnm-1)))>0){
+    p=(int)ndnm;
+    return(0);
+  }
+  else if(j==0){
+    p=(int)(ndnm-1);
+    return(1);
+  }
+  int x=0,i;
+  int y=(int)(ndnm-1);
+  while(y-x>1){
+    i=(y+x)/2;
+    if((j=btr->stc_my_long(a,b,str,pnd[i]->str,i))==0){p=i;return(1);}
+    else if(j<0)y=i;
+    else x=i;
+  }
+  p=y;
+  return(0);
+}
+
+void Page::debug(void){
+  cout << "Page {" << endl;
+  cout << " ndnm: " << (int)ndnm << endl;
+  if(pdn==NULL)cout << " pdn: NULL" << endl;
+  else cout << " pdn: " << (long)pdn << endl;
+  for(int i=0;i<(int)ndnm;i++){
+    cout << i << " ";
+    (this->pnd[i])->debug();
+  }
+  cout << " }" << endl;
+}
+
+int stc_my(int &a,int &b,const char *str,const char *ptr)
+{register int i=(a<b) ? a : b;
+  register const char *p1=str+i;
+  register const char *p2=ptr+i;
+  register int j=0;
+  while((*p1==*p2)&&(*p1!='\0')){
+    j++;
+    p1++;
+    p2++;
+  }
+  if(*p1==*p2)return(0);
+  else if(*p1<*p2){
+    b=i+j;
+    return(-1);
+  }
+  else {
+    a=i+j;
+    return(1);
+  }
+}
+
+Btree::Btree(){
+  iclean=0;
+  copy=false;
+  depth=0;
+  root=new Page;
+  root->ndnm = 1;
+  (root->pnd)[0]=new Node("");
+}
+
+int Btree::search(const char *str){
+  depth=-1;
+  Page *pu=root;
+  register int a=0,b=0,i,j;
+  while(pu!=NULL){
+    depth++;
+    pg[depth]=pu;
+    j=(pu->search)(a,b,str,i);
+    cnd[depth]=i;
+    if(j==1)return(1);
+    if(i==0)pu=pu->pdn;
+    else pu=(pu->pnd)[i-1]->pdn;
+  }
+  return(0);
+}
+
+int Btree::insert(Node *nd){
+  int w,k;
+  Page *pm,*pz;
+  while((nd!=NULL)&&(depth)){
+    pm=pg[depth];
+    w=pm->ndnm;
+    if(w<ord2){
+      pm->insert(cnd[depth],nd,w);
+      nd=NULL;
+    }
+    else {
+      k=cnd[depth];
+      if(k<order){
+        pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
+        pm->insert(k,nd,order);
+        nd=pm->pnd[order];
+        nd->pdn=pz;
+        pm->ndnm=order;
+      }
+      else if(k>order){
+        pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
+        pz->insert(k-order-1,nd,order-1);
+        nd=pm->pnd[order];
+        nd->pdn=pz;
+        pm->ndnm=order;
+      }
+      else {
+        pz=new Page(pm,nd->pdn,order);
+        nd->pdn=pz;
+        pm->ndnm=order;
+      }
+    }
+    depth--;
+  }
+  if(nd!=NULL){
+    pm=pg[depth];
+    w=pm->ndnm;
+    if(w<ord2)pm->insert(cnd[depth],nd,w);
+    else {
+      root=new Page();
+      root->pdn=pm;
+      k=cnd[depth];
+      if(k<order){
+        pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
+        pm->insert(k,nd,order);
+        (root->pnd)[0]=pm->pnd[order];
+        ((root->pnd)[0])->pdn=pz;
+        root->ndnm=1;
+        pm->ndnm=order;
+      }
+      else if(k>order){
+        pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
+        pz->insert(k-order-1,nd,order-1);
+        (root->pnd)[0]=pm->pnd[order];
+        ((root->pnd)[0])->pdn=pz;
+        root->ndnm=1;
+        pm->ndnm=order;
+      }
+      else {
+        pz=new Page(pm,nd->pdn,order);
+        (root->pnd)[0]=nd;
+        nd->pdn=pz;
+        root->ndnm=1;
+        pm->ndnm=order;
+      }
+    }
+  }
+  return(1);
+}
+
+void Btree::node_first(void){
+  depth=0;
+  pg[depth]=root;
+  cnd[depth]=0;
+  Page *pm;
+  while((pm=(pg[depth]->pdn))!=NULL){
+    depth++;
+    pg[depth]=pm;
+    cnd[depth]=0;
+  }
+}
+
+int Btree::node_next(){
+  int i=cnd[depth];
+  Page *pd=((pg[depth]->pnd)[i])->pdn;
+  if(pd!=NULL){
+    (cnd[depth])++;
+    depth++;
+    pg[depth]=pd;
+    cnd[depth]=0;
+    while((pd=(pg[depth]->pdn))!=NULL){
+      depth++;
+      pg[depth]=pd;
+      cnd[depth]=0;
+    }
+  }
+  else {
+    cnd[depth]=++i;
+    while((depth>=1)&&(i==(pg[depth]->ndnm))){depth--;i=cnd[depth];}
+    if((depth==0)&&(i==(pg[depth]->ndnm)))depth--;
+    if(depth<0)return(0);
+  }
+  return(1);
+}
+
+char *Btree::show_str(){
+  return(((pg[depth]->pnd)[cnd[depth]])->str);
+}
+
+void *Btree::give_ptr(){
+  return(((pg[depth]->pnd)[cnd[depth]])->rel);
+}
+
+void Btree::set_ptr(void *dtr){
+  ((pg[depth]->pnd)[cnd[depth]])->rel=dtr;
+}
+
+Btree::~Btree(){
+  int pflag=get_qflag();
+  long k=0;
+  if (copy) return; // only delete original
+  if(!iclean){
+    node_first();
+    int i=depth,j;
+    do{
+      j=node_next();
+      if(depth<i){
+        while(i>depth){
+          delete pg[i];
+          i--;
+          mark(pflag,++k,1000,"pages deleted");
+        }
+      }
+      else i=depth;
+    } while(j);
+  }
+  else {
+    node_first();
+    int i=depth,j;
+    do{
+      j=node_next();
+      if(depth<i){
+        while(i>depth){
+          pg[i]->clean();
+          delete pg[i];
+          i--;
+          mark(pflag,++k,1000,"pages deleted");
+        }
+      }
+      else i=depth;
+    } while(j);
+  }
+}
+
+long Btree::list_write(ofstream &fout){
+  int pflag=get_qflag();
+  long ct=0;
+  node_first();
+  while(node_next()){
+    fout << show_str() << endl;
+    mark(pflag,++ct,1000,"strings written");
+  }
+  fout.close();
+  return((int)fout.good());
+}
+
+Btree::Btree(ifstream &fin){
+  copy=false;
+  char cnam[256];
+  int pflag=get_qflag();
+  depth=0;
+  pg[0]=root=new Page();
+  cnd[0]=root->ndnm = 1;
+  (root->pnd)[0]=new Node("");
+  Node *pno;
+  long ct=0;
+  while(get_string(cnam,fin,'\n')){
+    pno = new Node(cnam);
+    add(pno);
+    mark(pflag,++ct,10000,"strings read");
+  }
+  fin.close();
+}
+
+int Btree::add(Node *nd){
+  int w,k,dp;
+  Page *pm,*pz;
+  dp=depth; //uses dp in place of depth in insert.
+  while((nd!=NULL)&&(dp)){
+    pm=pg[dp];
+    w=pm->ndnm;
+    if(w<ord2){
+      pm->insert(cnd[dp],nd,w);
+      nd=NULL;
+      (cnd[dp])++; //variation from insert.
+    }
+    else {
+      k=cnd[dp];
+      if(k<order){
+        pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
+        pm->insert(k,nd,order);
+        nd=pm->pnd[order];
+        nd->pdn=pz;
+        pm->ndnm=order;
+      }
+      else if(k>order){
+        pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
+        pz->insert(k-order-1,nd,order-1);
+        nd=pm->pnd[order];
+        nd->pdn=pz;
+        pm->ndnm=order;
+      }
+      else {
+        pz=new Page(pm,nd->pdn,order);
+        nd->pdn=pz;
+        pm->ndnm=order;
+      }
+      pg[dp]=pz; //2 lines of variation from insert.
+      cnd[dp]=order;
+    }
+    dp--;
+  }
+  if(nd!=NULL){
+    pm=pg[dp];
+    w=pm->ndnm;
+    if(w<ord2){
+      pm->insert(cnd[dp],nd,w);
+      (cnd[dp])++; //variation from insert.
+    }
+    else {
+      root=new Page();
+      root->pdn=pm;
+      k=cnd[dp];
+      if(k<order){
+        pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
+        pm->insert(k,nd,order);
+        (root->pnd)[0]=pm->pnd[order];
+        ((root->pnd)[0])->pdn=pz;
+        root->ndnm=1;
+        pm->ndnm=order;
+      }
+      else if(k>order){
+        pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
+        pz->insert(k-order-1,nd,order-1);
+        (root->pnd)[0]=pm->pnd[order];
+        ((root->pnd)[0])->pdn=pz;
+        root->ndnm=1;
+        pm->ndnm=order;
+      }
+      else {
+        pz=new Page(pm,nd->pdn,order);
+        (root->pnd)[0]=nd;
+        nd->pdn=pz;
+        root->ndnm=1;
+        pm->ndnm=order;
+      }
+      next_empty(); //variation from insert.
+    }
+  }
+  return(1);
+}
+
+void Btree::next_empty(){
+  depth=0;
+  pg[depth]=root;
+  int i=cnd[depth]=root->ndnm;
+  Page *pm;
+  while((pm=((pg[depth]->pnd)[i-1])->pdn)!=NULL){
+    depth++;
+    pg[depth]=pm;
+    i=cnd[depth]=pm->ndnm;
+  }
+}
+
+Str_str::Str_str() : Btree() {
+}
+
+Str_str::~Str_str(){
+  if(copy)return;
+  this->node_first();
+  while(this->node_next())delete [] (char*)this->give_ptr();
+}
+
+void Str_str::add_pair(const char *one,const char *two){
+  Node *pnd;
+  if(search(one)){
+    cout << "Duplicate string in keys list = " << one << endl;
+    exit(0);
+  }
+  else {
+    int i=strlen(two);
+    char *st=new char[i+1];
+    strcpy(st,two);
+    pnd=new Node(one,(void *)st);
+    add(pnd);
+  }
+}
+
+char *Str_str::match(const char *one){
+  if(search(one)){
+    return((char*)give_ptr());
+  }
+  else {
+    cout << "String not a key = " << one << endl;
+    exit(0);
+  }
+}
+
+List::List() : Btree() {
+  cnt_key=0;
+}
+
+List::~List(){
+}
+
+void List::add_key(const char *str){
+  Node *pnd;
+  if(!search(str)){
+    pnd=new Node(str);
+    add(pnd);
+  }
+}
+
+void List::add_key_count(const char *str){
+  Node *pnd;
+  if(!search(str)){
+    pnd=new Node(str);
+    add(pnd);
+    cnt_key++;
+  }
+}
+
+void List::addp_key_count(char *str){
+  Node *pnd;
+  if(!search(str)){
+    pnd=new Node;
+    pnd->str=str;
+    add(pnd);
+    cnt_key++;
+  }
+}
+
+Num_num::Num_num() : Btree() {
+}
+
+Num_num::~Num_num(){
+  if(copy)return;
+  this->node_first();
+  while(this->node_next())delete (long*)this->give_ptr();
+}
+
+void Num_num::add_pair(long i,long j){
+  Node *pnd;
+  char cnam[256];
+  long_str(cnam,i);
+  if(!search(cnam)){
+    long *st=new long;
+    *st=j;
+    pnd=new Node(cnam,(void *)st);
+    add(pnd);
+  }
+}
+
+long Num_num::match(long i){
+  char cnam[256];
+  long_str(cnam,i);
+  if(search(cnam)){
+    return(*((long*)give_ptr()));
+  }
+  else return(LNEG);
+}
+
+Count::Count() : List() {
+  total=0;
+}
+
+Count::~Count(){
+  if(copy)return;
+  long *pk;
+  this->node_first();
+  while(this->node_next()){
+    pk=(long*)(this->give_ptr());
+    if(pk)delete pk;
+  }
+}
+
+void Count::add_count(const char *pch,long n){
+  long *ppt;
+  Node *np;
+  total+=n;
+  if(this->search(pch)==0){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    (*(long*) this->give_ptr())+=n;
+  }
+}
+
+void Count::add_countz(const char *pch,long n){
+  long *ppt;
+  Node *np;
+  if(this->search(pch)==0){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(long*) this->give_ptr())+=n;
+  }
+}
+
+void Count::add_count2(const char *pch,long n){
+  long *ppt;
+  Node *np;
+  total+=n;
+  if(this->search(pch)==0){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(long*) this->give_ptr())+=n;
+  }
+}
+
+void Count::addp_count2(char *pch,long n){
+  long *ppt;
+  Node *np;
+  total+=n;
+  if(this->search(pch)==0){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(long*) this->give_ptr())+=n;
+  }
+}
+
+void Count::correct(const char *pch,long n){
+  if(this->search(pch)){
+    (*(long*) this->give_ptr())=n;
+  }
+}
+
+long Count::count(const char *pch){
+  if(this->search(pch)==0){
+    return(0);
+  }
+  else {
+    return(*((long*) this->give_ptr()));
+  }
+}
+
+long Count::count(void){
+  return(*((long*) this->give_ptr()));
+}
+
+void Count::max_count(const char *pch,long n){
+  long *ppt,i;
+  Node *np;
+  total+=n;
+  if(!search(pch)){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    ppt=(long*)give_ptr();
+    if(*ppt<n)*ppt=n;
+  }
+}
+
+void Count::max_count2(const char *pch,long n){
+  long *ppt,i;
+  Node *np;
+  total+=n;
+  if(!search(pch)){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(long*)give_ptr();
+    if(*ppt<n)*ppt=n;
+  }
+}
+
+void Count::maxp_count2(char *pch,long n){
+  long *ppt,i;
+  Node *np;
+  total+=n;
+  if(!search(pch)){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(long*)give_ptr();
+    if(*ppt<n)*ppt=n;
+  }
+}
+
+void Count::min_count(const char *pch,long n){
+  long *ppt,i;
+  Node *np;
+  total+=n;
+  if(!search(pch)){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    ppt=(long*)give_ptr();
+    if(*ppt>n)*ppt=n;
+  }
+}
+
+void Count::min_count2(const char *pch,long n){
+  long *ppt,i;
+  Node *np;
+  total+=n;
+  if(!search(pch)){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(long*)give_ptr();
+    if(*ppt>n)*ppt=n;
+  }
+}
+
+void Count::minp_count2(char *pch,long n){
+  long *ppt,i;
+  Node *np;
+  total+=n;
+  if(!search(pch)){
+    ppt = new long;
+    (*ppt) =n;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(long*)give_ptr();
+    if(*ppt>n)*ppt=n;
+  }
+}
+
+//FCount (float count tree)
+
+FCount::FCount() : List() {
+  total=0;
+}
+
+FCount::~FCount(){
+  if(copy)return;
+  float *pk;
+  this->node_first();
+  while(this->node_next()){
+    pk=(float*)(this->give_ptr());
+    if(pk)delete pk;
+  }
+}
+
+void FCount::Copy(FCount &Fc){
+  char *pch;
+  float *xx,*zz;
+  Node *pN;
+
+  pg[0]=root;
+  cnd[0]=root->ndnm;
+
+  Fc.node_first();
+  while(Fc.node_next()){
+    pch=Fc.show_str();
+    xx=(float*)Fc.give_ptr();
+    zz=new float;
+    *zz=*xx;
+    pN=new Node(pch,(void*)zz);
+    add(pN);
+  }
+}
+
+void FCount::add_count(const char *pch,float z){
+  float *ppt;
+  Node *np;
+  total+=z;
+  if(this->search(pch)==0){
+    ppt = new float;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    (*(float*) this->give_ptr())+=z;
+  }
+}
+
+void FCount::add_count2(const char *pch,float z){
+  float *ppt;
+  Node *np;
+  total+=z;
+  if(this->search(pch)==0){
+    ppt = new float;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(float*) this->give_ptr())+=z;
+  }
+}
+
+void FCount::addp_count2(char *pch,float z){
+  float *ppt;
+  Node *np;
+  total+=z;
+  if(this->search(pch)==0){
+    ppt = new float;
+    (*ppt) =z;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(float*) this->give_ptr())+=z;
+  }
+}
+
+float FCount::count(const char *pch){
+  if(this->search(pch)==0){
+    return(0);
+  }
+  else {
+    return(*((float*) this->give_ptr()));
+  }
+}
+
+float FCount::count(void){
+  return(*((float*) this->give_ptr()));
+}
+
+//DCount (double precision count tree)
+
+DCount::DCount() : List() {
+  total=0;
+}
+
+DCount::~DCount(){
+  if(copy)return;
+  double *pk;
+  this->node_first();
+  while(this->node_next()){
+    pk=(double*)(this->give_ptr());
+    if(pk)delete pk;
+  }
+}
+
+void DCount::Copy(DCount &Dc){
+  char *pch;
+  double *xx,*zz;
+  Node *pN;
+
+  pg[0]=root;
+  cnd[0]=root->ndnm;
+
+  Dc.node_first();
+  while(Dc.node_next()){
+    pch=Dc.show_str();
+    xx=(double*)Dc.give_ptr();
+    zz=new double;
+    *zz=*xx;
+    pN=new Node(pch,(void*)zz);
+    add(pN);
+  }
+}
+
+void DCount::add_count(const char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(this->search(pch)==0){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    (*(double*) this->give_ptr())+=z;
+  }
+}
+
+void DCount::add_count2(const char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(this->search(pch)==0){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(double*) this->give_ptr())+=z;
+  }
+}
+
+void DCount::addp_count2(char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(this->search(pch)==0){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    (*(double*) this->give_ptr())+=z;
+  }
+}
+
+double DCount::count(const char *pch){
+  if(this->search(pch)==0){
+    return(0);
+  }
+  else {
+    return(*((double*) this->give_ptr()));
+  }
+}
+
+double DCount::count(void){
+  return(*((double*) this->give_ptr()));
+}
+
+void DCount::max_count(const char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(!search(pch)){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    ppt=(double*)give_ptr();
+    if(*ppt<z)*ppt=z;
+  }
+}
+
+void DCount::max_count2(const char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(!search(pch)){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(double*)give_ptr();
+    if(*ppt<z)*ppt=z;
+  }
+}
+
+void DCount::maxp_count2(char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(!search(pch)){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(double*)give_ptr();
+    if(*ppt<z)*ppt=z;
+  }
+}
+
+void DCount::min_count(const char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(!search(pch)){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+  }
+  else {
+    ppt=(double*)give_ptr();
+    if(*ppt>z)*ppt=z;
+  }
+}
+
+void DCount::min_count2(const char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(!search(pch)){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node(pch,(void*)ppt);
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(double*)give_ptr();
+    if(*ppt>z)*ppt=z;
+  }
+}
+
+void DCount::minp_count2(char *pch,double z){
+  double *ppt;
+  Node *np;
+  total+=z;
+  if(!search(pch)){
+    ppt = new double;
+    (*ppt) =z;
+    np=new Node;
+    np->str=pch;
+    np->rel=ppt;
+    this->insert(np);
+    cnt_key++;
+  }
+  else {
+    ppt=(double*)give_ptr();
+    if(*ppt>z)*ppt=z;
+  }
+}
+
+void DCount::debug(void){
+  node_first();
+  while(node_next()){
+    cout << count() << " " << show_str() << endl;
+  }
+}
+
+//Partial Match
+
+Partial_match::Partial_match() : Count() {
+}
+
+Partial_match::~Partial_match(){
+}
+
+void Partial_match::long_match(char *str,List &Lst){
+  char *pch;
+  while(*str!='\0'){
+    if(this->search_long(str)){
+      pch=this->show_str();
+      Lst.add_key_count(pch);
+    }
+    if((pch=strchr(str,' '))!=NULL)str=pch+1;
+    else str=str+strlen(str);
+  }
+}
+
+void Partial_match::local_match(char *str,List &Lst){
+  char *pch;
+  int i,j;
+  if(*str!='\0'){
+    if(this->search_long(str)){
+      pch=this->show_str();
+      Lst.add_key_count(pch);
+      i=strlen(pch)-1;
+      while(0<i){
+        while((0<i)&&(*(str+i)!=' '))i--;
+        if(0<i){
+          *(str+i)='\0';
+          j=this->search(str);
+          *(str+i)=' ';
+          if(j){
+            pch=this->show_str();
+            Lst.add_key_count(pch);
+          }
+          i--;
+        }
+      }
+    }
+  }
+}
+
+void Partial_match::all_match(char *str,List &Lst){
+  char *pch;
+  int i,j;
+  while(*str!='\0'){
+    if(this->search_long(str)){
+      pch=this->show_str();
+      Lst.add_key_count(pch);
+      i=strlen(pch)-1;
+      while(0<i){
+        while((0<i)&&(*(str+i)!=' '))i--;
+        if(0<i){
+          *(str+i)='\0';
+          j=this->search(str);
+          *(str+i)=' ';
+          if(j){
+            pch=this->show_str();
+            Lst.add_key_count(pch);
+          }
+          i--;
+        }
+      }
+    }
+    if((pch=strchr(str,' '))!=NULL)str=pch+1;
+    else str=str+strlen(str);
+  }
+}
+
+void Partial_match::long_match(char *str,Count &Cnt,long n){
+  char *pch;
+  while(*str!='\0'){
+    if(this->search_long(str)){
+      pch=this->show_str();
+      Cnt.add_count2(pch,n);
+    }
+    if((pch=strchr(str,' '))!=NULL)str=pch+1;
+    else str=str+strlen(str);
+  }
+}
+
+void Partial_match::local_match(char *str,Count &Cnt,long n){
+  char *pch;
+  int i,j;
+  if(*str!='\0'){
+    if(this->search_long(str)){
+      pch=this->show_str();
+      Cnt.add_count2(pch,n);
+      i=strlen(pch)-1;
+      while(0<i){
+        while((0<i)&&(*(str+i)!=' '))i--;
+        if(0<i){
+          *(str+i)='\0';
+          j=this->search(str);
+          *(str+i)=' ';
+          if(j){
+            pch=this->show_str();
+            Cnt.add_count2(pch,n);
+          }
+          i--;
+        }
+      }
+    }
+  }
+}
+
+void Partial_match::all_match(char *str,Count &Cnt,long n){
+  char *pch;
+  int i,j;
+  while(*str!='\0'){
+    if(this->search_long(str)){
+      pch=this->show_str();
+      Cnt.add_count2(pch,n);
+      i=strlen(pch)-1;
+      while(0<i){
+        while((0<i)&&(*(str+i)!=' '))i--;
+        if(0<i){
+          *(str+i)='\0';
+          j=this->search(str);
+          *(str+i)=' ';
+          if(j){
+            pch=this->show_str();
+            Cnt.add_count2(pch,n);
+          }
+          i--;
+        }
+      }
+    }
+    if((pch=strchr(str,' '))!=NULL)str=pch+1;
+    else str=str+strlen(str);
+  }
+}
+
+int Partial_match::search_long(char *str){
+  int a=0,b=0,i,j;
+  len=strlen(str);
+  if(this->step_one(a,b,str))return(1);
+  i=(a<b)?b:a;
+  while(cln_o<i){
+    while((cln_o<i)&&(*(str+i)!=' '))i--;
+    if(cln_o<i){
+      *(str+i)='\0';
+      j=this->search(str);
+      *(str+i)=' ';
+      if(j)return(1);
+      i--;
+    }
+  }
+  if(cln_o){
+    depth=depth_o;
+    cnd[depth]=index_o;
+    return(1);
+  }
+  else return(0);
+}
+
+int Partial_match::step_one(int &a,int &b,char *str){
+  char c;
+  cln_o=0;
+  cln=0;
+  while((c=*(str+cln))&&c!=32)cln++;
+  *(str+cln)='\0';
+  depth=-1;
+  Page *pu=root;
+  int i,j;
+  while(pu!=NULL){
+    depth++;
+    pg[depth]=pu;
+    j=(pu->search)(a,b,str,i,this);
+    cnd[depth]=i;
+    if(j==1)return(1);
+    if(i==0)pu=pu->pdn;
+    else pu=(pu->pnd)[i-1]->pdn;
+  }
+
+  if(cln<len)*(str+cln)=' ';
+  return(0);
+}
+
+int Partial_match::stc_my_long(int &a,int &b,char *str,const char *ptr,int index)
+{char c;
+  int i=(a<b) ? a : b;
+  const char *p1=str+i;
+  const char *p2=ptr+i;
+  int j=0;
+  while((*p1==*p2)&&(*p1!='\0')){
+    j++;
+    p1++;
+    p2++;
+    if((*p1=='\0'&&*p2!='\0')&&(cln<len)){
+      *(str+cln++)=' ';
+      while((c=*(str+cln))&&c!=32)cln++;
+      *(str+cln)='\0';
+    }
+  }
+  if(*p1==*p2){
+    if(cln<len){
+      depth_o=depth;
+      index_o=index;
+      cln_o=cln;
+      *(str+cln++)=' ';
+      while((c=*(str+cln))&&c!=32)cln++;
+      *(str+cln)='\0';
+      a=i+j;
+      return(1);
+    }
+    else return(0);
+  }
+  else if(*p1<*p2){
+    b=i+j;
+    return(-1);
+  }
+  else {
+    a=i+j;
+    return(1);
+  }
+}
+
+}
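Before the Btree.h diff below, a second hedged sketch: how the Partial_match longest-match routines above are typically driven. This is not part of the commit; the dictionary entries and input text are invented, and matching mutates the input buffer in place (search_long temporarily writes '\0' and ' ' into it), so a writable char array is required.

#include <iostream>
#include "Btree.h"
using namespace std;
using namespace iret;

int main(){
  Partial_match dict;                    // dictionary of space-separated phrases
  dict.add_count2("protein kinase",1);   // keys loaded via the inherited Count API
  dict.add_count2("protein",1);
  dict.add_count2("kinase",1);

  char text[] = "protein kinase activity"; // must be mutable, not a string literal
  List hits;
  dict.long_match(text,hits);            // longest dictionary phrase at each word start
  hits.node_first();
  while(hits.node_next()){
    cout << hits.show_str() << endl;     // expected: "kinase", "protein kinase"
  }
  return 0;
}

long_match records, for every word start in the input, the longest dictionary phrase beginning there; local_match and all_match use the same in-place truncation trick to also report shorter matches at each position.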
Library/Btree.h
CHANGED
@@ -1,547 +1,547 @@
-#ifndef BTREE_H
-#define BTREE_H
-
-#define LNEG -100000000
-
-#include <iostream>
-#include <fstream>
-using namespace std;
-namespace iret {
-
-const int order = 5; //Half the order of the Btree that we build.
-const int height_limit =12; //Limit on the height of the Btree.
-const int ord2 = order*2; //The order of the Btree.
-
-int stc_my(int &,int &,const char *,const char *); //Function used to compare
-//two strings. The first two arguments hold information about how much the
-//string can be ignored in the comparison.
-
-class Page; //forward declaration
-class Btree; //forward declaration
-class Partial_match; //forward declaration
-
-class Node {
-  friend int stc_my(int &,int &,const char *,const char *);
-  friend class Page;
-  friend class Btree;
-  friend class List;
-  friend class Count;
-  friend class FCount;
-  friend class DCount;
-  template<class Z> friend class BCount;
-  friend class Partial_match;
-  friend class Thes;
-public:
-  Node(void); //Sets all pointers to NULL.
-  Node(const char * ); //Argument is the string for this node.
-  Node(const char * ,void *); //Arguments are first the string and then the
-//data pointer.
-  ~Node();
-  void debug(); //Prints out the node in simple format.
-private:
-  char *str; //String pointer.
-  void *rel; //Data pointer.
-  Page *pdn; //Points down to the page below or to NULL.
-};
-
-class Page {
-  friend int stc_my(int &,int &,const char *,const char *);
-  friend class Btree;
-  friend class Partial_match;
-  friend class FCount;
-  friend class DCount;
-public:
-  Page(); //Constructs a new empty page. Only happens at the root.
-  Page(Page * const pz,Page * const pn,const int n); //Constructs a page that
-//holds the right half of a full page. The full page is pointed at by
-//pz. The new page's downward pointer is set to pn.
-//n tells how much of the full page is to remain or where to begin removal.
-  ~Page();
-  void clean(void); //Used to delete without touching search keys in the nodes
-//which were created with addp functions and do not belong to the tree.
-  void insert(const int n,Node * const nd,const int j); //Inserts in a partially empty
-//page. n is the insertion point, j is the number of nodes on the page that are viable.
-  int search(int &a,int &b,const char *,int &p); //Searches for a string on
-//the page. Returns 1 if found, 0 otherwise. If found p is the index; otherwise
-//if p is 0 then the page downward pointer is to the next page to search, but if
-//p is positive then p-1 is the number of the node that has the downward pointer to
-//the next page to search.
-  int search(int &a,int &b,char *,int &p,Partial_match *btr); //Looks for the longest
-//partial match.
-  void debug(); //Prints out the page for debugging purposes.
-
-private:
-  char ndnm; //Indicates the number of Nodes on the page.
-  Page *pdn; //Pointer that points to the page below and also lexically below.
-//May be NULL.
-  Node *pnd[ord2]; //Pointers to the nodes on the page. Some may be NULL.
-};
-
-class Btree {
-  friend class Page;
-public:
-  Btree(void);
-  Btree(ifstream &); //Reads in a Btree in the form of a list written out by
-//list_write() from disc. String arguments mark the path in the proj file.
-  Btree( const Btree & btree ) {copy = true; root = btree.root;} // Actually
-// creates another reference to the same tree. Take great care to
-// avoid simultaneously modifying both copies.
-  ~Btree(void);
-  int search(const char *); //Searches for a string and sets the path to that
-//string or its insertion point.
-  int insert(Node *);//Only to be called after a search has failed to find the
-//string.
-  void node_first();//Finds the first node in the tree and sets the path to it.
-  int node_next(); //Given the path is already set to a node, this function
-//finds the next node in lexicographic order.
-  char *show_str();//Used to show the string after a call to next is successful.
-  void *give_ptr();//Used to give the data pointer in the current node.
-  void set_ptr(void *); //Used to set the data pointer after a call to search
-//has found the string.
-  int add(Node *); //Only to be used to construct a tree from a lexical list
-//as written out by list_write();
-  void next_empty(); //Only used to reset the pointer arrays when the root is
-//split. Used in add().
-  long list_write(ofstream &); //Writes out a lexical list of the strings in
-//the tree.
-  int iclean; //Default 0, but set to 1 if you want the destructor to run without
-//touching key strings (if addp was used in making the tree).
-protected:
-  int depth; //Tells the depth in the tree that marks the current location.
-  Page *root; //Points at the root page of the tree.
-  Page *pg[height_limit]; //Descending list of pointers that mark the pages.
-  int cnd[height_limit]; //Mark the positions of the nodes just above the
-//downward page pointer at each level. Thus 0 marks the page's downward
-//pointer, but a nonzero value must have 1 subtracted and then it gives
-//the node whose downward pointer is the correct downward pointer.
-  bool copy; //flags copies of a tree with true.
-};
-
-class List : public Btree {
-public:
-  List();
-  List(const List & list) : Btree(list) {}
-  ~List();
-  void add_key(const char *str); //Adds the string *str to the tree if not already in the list.
-  void add_key_count(const char *str); //Adds the string *str to the tree if
-//not already in the list and counts it.
-  void addp_key_count(char *str); //Adds the string *str to the tree if
-//not already in the list and counts it. Uses the actual string pointer instead
-//of making a copy.
-  long cnt_key; //Used to count the number of keys.
-};
-
-class Count : public List {
-public:
-  Count();
-  Count(const Count & Ct) : List(Ct){}
-  ~Count();
-  void add_count(const char *str,long n); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is incremented by n.
-  void add_countz(const char *str,long n); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-//Does not add the count to the total variable, unlike add_count2.
-  void add_count2(const char *str,long n); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-  void addp_count2(char *str,long n); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-  void correct(const char *str,long n); //If str is in the tree the count is
-//changed to n. Otherwise nothing is done.
-
-//Functions for maximum calculation
-  void max_count(const char *str,long n); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is the max of n and the prior value.
-  void max_count2(const char *str,long n); //Adds the string *str with its count
-//just as max_count, but also counts the number of unique keys in count.
-  void maxp_count2(char *str,long n); //Adds the string *str with its count
-//just as max_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-
-//Functions for minimum calculation
-  void min_count(const char *str,long n); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is the min of n and the prior value.
-  void min_count2(const char *str,long n); //Adds the string *str with its count
-//just as min_count, but also counts the number of unique keys in count.
-  void minp_count2(char *str,long n); //Adds the string *str with its count
-//just as min_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-
-  long count(const char *str); //Returns the count if a key (in the list), otherwise
-//returns 0.
-  long count(void); //Returns the count of the current string. Assumes the
-//pointers have already been set by a search or node_next call.
-  long total; //Holds the total of all counts added for all keys.
-};
-
-class FCount : public List {
-public:
-  FCount();
-  FCount(const FCount & Ct) : List(Ct){}
-  ~FCount();
-  void Copy(FCount &Dc); //Makes a copy of the tree Dc in the current tree.
-  void add_count(const char *str,float z); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is incremented by z.
-  void add_count2(const char *str,float z); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-  void addp_count2(char *str,float z); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-  float count(const char *str); //Returns the count if a key (in the list), otherwise
-//returns 0.
-  float count(void); //Returns the count of the current string. Assumes the
-//pointers have already been set by a search or node_next call.
-  float total; //Holds the total of all counts added for all keys.
-};
-
-class DCount : public List {
-public:
-  DCount();
-  DCount(const DCount & Ct) : List(Ct){}
-  ~DCount();
-  void Copy(DCount &Dc); //Makes a copy of the tree Dc in the current tree.
-  void add_count(const char *str,double z); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is incremented by z.
-  void add_count2(const char *str,double z); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-  void addp_count2(char *str,double z); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-  double count(const char *str); //Returns the count if a key (in the list), otherwise
-//returns 0.
-  double count(void); //Returns the count of the current string. Assumes the
-//pointers have already been set by a search or node_next call.
-
-//Functions for maximum calculation
-  void max_count(const char *str,double z); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is the max of z and the prior value.
-  void max_count2(const char *str,double z); //Adds the string *str with its count
-//just as max_count, but also counts the number of unique keys in count.
-  void maxp_count2(char *str,double z); //Adds the string *str with its count
-//just as max_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-
-//Functions for minimum calculation
-  void min_count(const char *str,double z); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is the min of z and the prior value.
-  void min_count2(const char *str,double z); //Adds the string *str with its count
-//just as min_count, but also counts the number of unique keys in count.
-  void minp_count2(char *str,double z); //Adds the string *str with its count
-//just as min_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-
-  void debug(void); //Prints to stdout a list "i str[i]"
-  double total; //Holds the total of all counts added for all keys.
-};
-
-class Partial_match : public Count {
-  friend class Page;
-public:
-  Partial_match();
-  Partial_match(const Partial_match & Par_mat) : Count(Par_mat){}
-  ~Partial_match();
-  void long_match(char *,List &); //Finds the longest matches for all word
-//starts in the string and adds them to the list.
-  void local_match(char *,List &); //Finds all matches that start at the
-//beginning of the string and adds them to the list.
-  void all_match(char *,List &); //Finds all matches within the string and
-//adds them to the list.
-  void long_match(char *,Count &,long n); //Finds the longest matches for all word
-//starts in the string and adds them to the list in Count.
-  void local_match(char *,Count &,long n); //Finds all matches that start at the
-//beginning of the string and adds them to the list in Count.
-  void all_match(char *,Count &,long n); //Finds all matches within the string and
-//adds them to the list in Count.
-  int search_long(char *); //Searches for the longest partial match to an initial
-//segment of a string that ends at a word boundary and
-//sets the path to that string or its insertion point.
-
-private:
-  int stc_my_long(int &,int &,char *,const char *,int); //Function used to compare
-//two strings. The first two arguments hold information about how much the
-//string can be ignored in the comparison. The last argument holds the index
-//or number of the string's node on the page.
-  int step_one(int &,int &,char *); //Looks for a partial or complete match and
-//returns 1 if complete is found. Partial is reflected in the parameters.
-
-//Special parameters used in partial matching.
-  int depth_o; //Depth of the longest partial match thus far.
-  int index_o; //Index of the longest partial match thus far.
-  int cln_o; //String length of the longest partial match thus far.
-  int len; //Length of the query string.
-  int cln; //Current null position in the string.
-};
-
-class Str_str : public Btree {
-public:
-  Str_str();
-  Str_str(const Str_str & Stst) : Btree(Stst){}
-  ~Str_str();
-  void add_pair(const char *one,const char *two); //Adds the string *one to the tree and stores
-//the string *two at that node.
-  char *match(const char *one); //Returns a pointer to the string stored under string *one.
-};
-
-class Num_num : public Btree {
-public:
-  Num_num();
-  Num_num(const Num_num & Nmnm) : Btree(Nmnm){}
-  ~Num_num();
-  void add_pair(long i, long j); //Adds the string for i to the tree and
-//stores the number j at that node.
-  long match(long i); //Returns the number stored under the string for i.
-};
-
-template<class Z>
-class BCount : public List {
-public:
-  BCount();
-  BCount(const BCount<Z> & Ct) : List(Ct){}
-  ~BCount();
-  void add_count(const char *str,Z n); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is incremented by n.
-  void add_count2(const char *str,Z n); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-  void addp_count2(char *str,Z n); //Adds the string *str with its count
-//just as add_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-  void correct(const char *str,Z n); //If str is in the tree the count is
-//changed to n. Otherwise nothing is done.
-
-//Functions for maximum calculation
-  void max_count(const char *str,Z n); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is the max of n and the prior value.
-  void max_count2(const char *str,Z n); //Adds the string *str with its count
-//just as max_count, but also counts the number of unique keys in count.
-  void maxp_count2(char *str,Z n); //Adds the string *str with its count
-//just as max_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-
-//Functions for minimum calculation
-  void min_count(const char *str,Z n); //Adds the string *str with its count
-//to the tree if not already in the list. String is key and count is data.
-//If the string is already a key the count is the min of n and the prior value.
-  void min_count2(const char *str,Z n); //Adds the string *str with its count
-//just as min_count, but also counts the number of unique keys in count.
-  void minp_count2(char *str,Z n); //Adds the string *str with its count
-//just as min_count, but also counts the number of unique keys in count.
-//Does not make a copy of the string, but uses the pointer str as the key pointer.
-
-  Z count(const char *str); //Returns the count if a key (in the list), otherwise
-//returns 0.
-  Z count(void); //Returns the count of the current string. Assumes the
-//pointers have already been set by a search or node_next call.
-  Z total; //Holds the total of all counts added for all keys.
-};
-
-template<class Z>
-BCount<Z>::BCount() : List() {
-  total=0;
-}
-
-template<class Z>
-BCount<Z>::~BCount(){
-  if(copy)return;
-  Z *pk;
-  this->node_first();
-  while(this->node_next()){
-    pk=(Z *)(this->give_ptr());
|
358 |
-
if(pk)delete pk;
|
359 |
-
}
|
360 |
-
}
|
361 |
-
|
362 |
-
template<class Z>
|
363 |
-
void BCount<Z>::add_count(const char *pch,Z n){
|
364 |
-
Z *ppt;
|
365 |
-
Node *np;
|
366 |
-
total+=n;
|
367 |
-
if(this->search(pch)==0){
|
368 |
-
ppt = new Z;
|
369 |
-
(*ppt) =n;
|
370 |
-
np=new Node(pch,(void*)ppt);
|
371 |
-
this->insert(np);
|
372 |
-
}
|
373 |
-
else {
|
374 |
-
(*(Z *) this->give_ptr())+=n;
|
375 |
-
}
|
376 |
-
}
|
377 |
-
|
378 |
-
template<class Z>
|
379 |
-
void BCount<Z>::add_count2(const char *pch,Z n){
|
380 |
-
Z *ppt;
|
381 |
-
Node *np;
|
382 |
-
total+=n;
|
383 |
-
if(this->search(pch)==0){
|
384 |
-
ppt = new Z;
|
385 |
-
(*ppt) =n;
|
386 |
-
np=new Node(pch,(void*)ppt);
|
387 |
-
this->insert(np);
|
388 |
-
cnt_key++;
|
389 |
-
}
|
390 |
-
else {
|
391 |
-
(*(Z *) this->give_ptr())+=n;
|
392 |
-
}
|
393 |
-
}
|
394 |
-
|
395 |
-
template<class Z>
|
396 |
-
void BCount<Z>::addp_count2(char *pch,Z n){
|
397 |
-
Z *ppt;
|
398 |
-
Node *np;
|
399 |
-
total+=n;
|
400 |
-
if(this->search(pch)==0){
|
401 |
-
ppt = new Z;
|
402 |
-
(*ppt) =n;
|
403 |
-
np=new Node;
|
404 |
-
np->str=pch;
|
405 |
-
np->rel=ppt;
|
406 |
-
this->insert(np);
|
407 |
-
cnt_key++;
|
408 |
-
}
|
409 |
-
else {
|
410 |
-
(*(Z *) this->give_ptr())+=n;
|
411 |
-
}
|
412 |
-
}
|
413 |
-
|
414 |
-
template<class Z>
|
415 |
-
void BCount<Z>::correct(const char *pch,Z n){
|
416 |
-
if(this->search(pch)){
|
417 |
-
(*(Z *) this->give_ptr())=n;
|
418 |
-
}
|
419 |
-
}
|
420 |
-
|
421 |
-
template<class Z>
|
422 |
-
Z BCount<Z>::count(const char *pch){
|
423 |
-
if(this->search(pch)==0){
|
424 |
-
return(0);
|
425 |
-
}
|
426 |
-
else {
|
427 |
-
return(*((Z *) this->give_ptr()));
|
428 |
-
}
|
429 |
-
}
|
430 |
-
|
431 |
-
template<class Z>
|
432 |
-
Z BCount<Z>::count(void){
|
433 |
-
return(*((Z *) this->give_ptr()));
|
434 |
-
}
|
435 |
-
|
436 |
-
template<class Z>
|
437 |
-
void BCount<Z>::max_count(const char *pch,Z n){
|
438 |
-
Z *ppt,i;
|
439 |
-
Node *np;
|
440 |
-
total+=n;
|
441 |
-
if(!search(pch)){
|
442 |
-
ppt = new Z;
|
443 |
-
(*ppt) =n;
|
444 |
-
np=new Node(pch,(void*)ppt);
|
445 |
-
this->insert(np);
|
446 |
-
}
|
447 |
-
else {
|
448 |
-
ppt=(Z *)give_ptr();
|
449 |
-
if(*ppt<n)*ppt=n;
|
450 |
-
}
|
451 |
-
}
|
452 |
-
|
453 |
-
template<class Z>
|
454 |
-
void BCount<Z>::max_count2(const char *pch,Z n){
|
455 |
-
Z *ppt,i;
|
456 |
-
Node *np;
|
457 |
-
total+=n;
|
458 |
-
if(!search(pch)){
|
459 |
-
ppt = new Z;
|
460 |
-
(*ppt) =n;
|
461 |
-
np=new Node(pch,(void*)ppt);
|
462 |
-
this->insert(np);
|
463 |
-
cnt_key++;
|
464 |
-
}
|
465 |
-
else {
|
466 |
-
ppt=(Z *)give_ptr();
|
467 |
-
if(*ppt<n)*ppt=n;
|
468 |
-
}
|
469 |
-
}
|
470 |
-
|
471 |
-
template<class Z>
|
472 |
-
void BCount<Z>::maxp_count2(char *pch,Z n){
|
473 |
-
Z *ppt,i;
|
474 |
-
Node *np;
|
475 |
-
total+=n;
|
476 |
-
if(!search(pch)){
|
477 |
-
ppt = new Z;
|
478 |
-
(*ppt) =n;
|
479 |
-
np=new Node;
|
480 |
-
np->str=pch;
|
481 |
-
np->rel=ppt;
|
482 |
-
this->insert(np);
|
483 |
-
cnt_key++;
|
484 |
-
}
|
485 |
-
else {
|
486 |
-
ppt=(Z *)give_ptr();
|
487 |
-
if(*ppt<n)*ppt=n;
|
488 |
-
}
|
489 |
-
}
|
490 |
-
|
491 |
-
template<class Z>
|
492 |
-
void BCount<Z>::min_count(const char *pch,Z n){
|
493 |
-
Z *ppt,i;
|
494 |
-
Node *np;
|
495 |
-
total+=n;
|
496 |
-
if(!search(pch)){
|
497 |
-
ppt = new Z;
|
498 |
-
(*ppt) =n;
|
499 |
-
np=new Node(pch,(void*)ppt);
|
500 |
-
this->insert(np);
|
501 |
-
}
|
502 |
-
else {
|
503 |
-
ppt=(Z *)give_ptr();
|
504 |
-
if(*ppt>n)*ppt=n;
|
505 |
-
}
|
506 |
-
}
|
507 |
-
|
508 |
-
template<class Z>
|
509 |
-
void BCount<Z>::min_count2(const char *pch,Z n){
|
510 |
-
Z *ppt,i;
|
511 |
-
Node *np;
|
512 |
-
total+=n;
|
513 |
-
if(!search(pch)){
|
514 |
-
ppt = new Z;
|
515 |
-
(*ppt) =n;
|
516 |
-
np=new Node(pch,(void*)ppt);
|
517 |
-
this->insert(np);
|
518 |
-
cnt_key++;
|
519 |
-
}
|
520 |
-
else {
|
521 |
-
ppt=(Z *)give_ptr();
|
522 |
-
if(*ppt>n)*ppt=n;
|
523 |
-
}
|
524 |
-
}
|
525 |
-
|
526 |
-
template<class Z>
|
527 |
-
void BCount<Z>::minp_count2(char *pch,Z n){
|
528 |
-
Z *ppt,i;
|
529 |
-
Node *np;
|
530 |
-
total+=n;
|
531 |
-
if(!search(pch)){
|
532 |
-
ppt = new Z;
|
533 |
-
(*ppt) =n;
|
534 |
-
np=new Node;
|
535 |
-
np->str=pch;
|
536 |
-
np->rel=ppt;
|
537 |
-
this->insert(np);
|
538 |
-
cnt_key++;
|
539 |
-
}
|
540 |
-
else {
|
541 |
-
ppt=(Z *)give_ptr();
|
542 |
-
if(*ppt>n)*ppt=n;
|
543 |
-
}
|
544 |
-
}
|
545 |
-
|
546 |
-
}
|
547 |
-
#endif
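The counting classes above all share one pattern: each key string owns a node, and the node's data pointer holds the running tally. A minimal sketch of that pattern with the BCount<Z> template follows; it assumes the header above is saved as Btree.h and compiled together with the library's Btree.C, and the file name bcount_demo.C and the sample tokens are illustrative only, not part of this upload.

// bcount_demo.C - minimal sketch of using the BCount<Z> counting tree.
#include <iostream>
#include "Btree.h"
using namespace std;
using namespace iret;

int main(){
  BCount<long> bc;
  bc.cnt_key=0; //List() is expected to zero this; the demo does not rely on it
  bc.add_count2("kinase",2); //new key: node inserted, cnt_key -> 1
  bc.add_count2("p53",1);    //new key: cnt_key -> 2
  bc.add_count2("kinase",3); //existing key: stored count becomes 5
  cout << "keys=" << bc.cnt_key << " total=" << bc.total << endl; //keys=2 total=6
  bc.node_first();           //walk the keys in lexicographic order
  while(bc.node_next()){
    cout << bc.show_str() << " " << bc.count() << endl; //kinase 5, then p53 1
  }
  return 0;
}

The add_count variants accumulate, while max_count/min_count keep only the extreme value seen for a key; the "p" variants trade a string copy for the requirement that the caller's key pointers outlive the tree.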
Library/FBase.C
CHANGED
@@ -1,600 +1,600 @@
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <iomanip>
#include <cstring>
#include <cmath>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "runn.h"
#include "FBase.h"

using namespace std;
namespace iret {

FBase::FBase(const char *typ,const char *nam){
  int lxn=strlen(typ);
  type=new char[lxn+1];
  tpnm=-1;
  nmnm=-1;
  strcpy(type,typ);
  lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
  cflag=0;
  oflag=0;
  pflag=get_qflag();
  eflag=1;
}

FBase::FBase(const char *typ,int tpn,const char *nam){
  int lxn=strlen(typ);
  type=new char[lxn+1];
  tpnm=tpn;
  nmnm=-1;
  strcpy(type,typ);
  lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
  cflag=0;
  oflag=0;
  pflag=get_qflag();
  eflag=1;
}

FBase::FBase(const char *typ,const char *nam,const char *pt){
  int lxn=strlen(typ);
  type=new char[lxn+1];
  tpnm=-1;
  nmnm=-1;
  strcpy(type,typ);
  lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
  cflag=0;
  oflag=0;
  pflag=get_qflag();
  if(*pt!=':')set_path_name(pt);
  else set_path_internal(pt+1);
}

FBase::~FBase(void){
  delete [] type;
  delete [] name;
}

void FBase::set_type_num(int tn){tpnm=tn;}

void FBase::set_name_num(int nn){nmnm=nn;}

void FBase::change_type(const char *typ){
  if(type!=NULL)delete [] type;
  int lxn=strlen(typ);
  type=new char[lxn+1];
  strcpy(type,typ);
}

void FBase::change_name(const char *nam){
  if(name!=NULL)delete [] name;
  int lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
}

void FBase::set_name(const char *nam){
  if(name!=NULL)delete [] name;
  int lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
}

void FBase::subname(const char *tph,const char *tpl,const char *nm){
  char cnam[max_str];
  long i=strlen(tpl);
  strcpy(cnam,tpl);
  cnam[i]='_';
  cnam[i+1]='\0';
  strcat(cnam,nm);
  change_type(tph);
  change_name(cnam);
}

void FBase::set_path_internal(const char *pt){
  long len;
  if(pt&&(len=strlen(pt))){
    eflag=0;
    path=new char[len+1];
    strcpy(path,pt);
  }
  else eflag=1;
}

void FBase::set_path_name(const char *pa){
  long len;
  if(pa&&(len=strlen(pa))){
    eflag=2;
    pnam=new char[len+1];
    strcpy(pnam,pa);
  }
  else eflag=1;
}

void FBase::map_down(FBase *pFb){
  pFb->change_type(type);
  pFb->change_name(name);
  pFb->set_type_num(tpnm);
  pFb->set_name_num(nmnm);
  pFb->pflag=pflag;
  if(eflag==2)pFb->set_path_name(pnam);
  else if(!eflag)pFb->set_path_internal(path);
}

void FBase::map_down_sub(FBase *pFb,const char *subtype){
  pFb->subname(type,name,subtype);
  pFb->set_type_num(tpnm);
  pFb->set_name_num(nmnm);
  pFb->pflag=pflag;
  if(eflag==2)pFb->set_path_name(pnam);
  else if(!eflag)pFb->set_path_internal(path);
}

//Builds in nam the full file name <dir><type>_<name>.<ch>. The directory
//prefix <dir> is read from the first line of a lookup file: path_<pnam>
//(falling back to path) when a path name was set; otherwise the most
//specific existing file of path_<type>_<name>.<ch>, path_<type>_<name>,
//path_<type>, path. A path set internally is used directly.
void FBase::get_pathx(char *nam,const char *ch){
  char cnam[256];
  ifstream fin;

  if(eflag==2){
    strcpy(cnam,"path_");
    strcat(cnam,pnam);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path");
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        cout << "Path file for type " << type << " does not exist!" << endl;
        exit(0);
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else if(eflag){
    strcpy(cnam,"path_");
    strcat(cnam,type);
    strcat(cnam,"_");
    strcat(cnam,name);
    strcat(cnam,".");
    strcat(cnam,ch);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path_");
      strcat(cnam,type);
      strcat(cnam,"_");
      strcat(cnam,name);
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        fin.clear();
        strcpy(cnam,"path_");
        strcat(cnam,type);
        fin.open(cnam,ios::in);
        if(!fin.is_open()){
          fin.clear();
          strcpy(cnam,"path");
          fin.open(cnam,ios::in);
          if(!fin.is_open()){
            cout << "Path file for type " << type << " does not exist!" << endl;
            exit(0);
          }
        }
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else {
    strcpy(nam,path);
  }

  if(tpnm<0)strcat(nam,type);
  else cat_num(type,tpnm,nam);
  strcat(nam,"_");
  if(nmnm<0)strcat(nam,name);
  else cat_num(name,nmnm,nam);
  strcat(nam,".");
  strcat(nam,ch);
}

//Same as above, but appends the number n to the name component.
void FBase::get_pathx(char *nam,long n,const char *ch){
  char cnam[256],bnam[256];
  ifstream fin;

  if(eflag==2){
    strcpy(cnam,"path_");
    strcat(cnam,pnam);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path");
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        cout << "Path file for type " << type << " does not exist!" << endl;
        exit(0);
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else if(eflag){
    strcpy(cnam,"path_");
    strcat(cnam,type);
    strcat(cnam,"_");
    strcat(cnam,name);
    strcat(cnam,".");
    strcat(cnam,ch);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path_");
      strcat(cnam,type);
      strcat(cnam,"_");
      strcat(cnam,name);
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        fin.clear();
        strcpy(cnam,"path_");
        strcat(cnam,type);
        fin.open(cnam,ios::in);
        if(!fin.is_open()){
          fin.clear();
          strcpy(cnam,"path");
          fin.open(cnam,ios::in);
          if(!fin.is_open()){
            cout << "Path file for type " << type << " does not exist!" << endl;
            exit(0);
          }
        }
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else {
    strcpy(nam,path);
  }

  if(tpnm<0)strcat(nam,type);
  else cat_num(type,tpnm,nam);
  strcat(nam,"_");
  strcat(nam,add_num(name,n,bnam));
  strcat(nam,".");
  strcat(nam,ch);
}

char *FBase::add_num(const char *ptr,long n,char *buf){
  char cnam[100];
  long_str(cnam,n);
  strcpy(buf,ptr);
  strcat(buf,cnam);
  return(buf);
}

char *FBase::cat_num(const char *ptr,long n,char *buf){
  char cnam[100];
  long_str(cnam,n);
  strcat(buf,ptr);
  strcat(buf,cnam);
  return(buf);
}

int FBase::Gcom(int sflag){ //Returns 1 and marks sflag open if commanded (cflag) but not yet open.
  if((cflag&sflag)&&!(oflag&sflag)){
    oflag=oflag|sflag;
    return(1);
  }
  else return(0);
}

int FBase::Rcom(int sflag){ //Returns 1 and clears sflag if commanded and currently open.
  if((cflag&sflag)&&(oflag&sflag)){
    oflag=oflag&(~sflag);
    return(1);
  }
  else return(0);
}

ifstream *FBase::get_Istr(const char *a,ios::openmode mode){
  char cnam[max_str];
  get_pathx(cnam,a);
  ifstream *pfin=new ifstream(cnam,mode);
  if(pfin->is_open())return(pfin);
  else {
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
}

ofstream *FBase::get_Ostr(const char *a,ios::openmode mode){
  char cnam[max_str];
  get_pathx(cnam,a);
  ofstream *pfout=new ofstream(cnam,mode);
  if(pfout->is_open())return(pfout);
  else {
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
}

fstream *FBase::get_Fstr(const char *a,ios::openmode mode){
  char cnam[max_str];
  get_pathx(cnam,a);
  fstream *pfstr=new fstream(cnam,mode);
  if(pfstr->is_open())return(pfstr);
  else {
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
}

ifstream *FBase::get_Istr(long n,const char *a,ios::openmode mode){
  char cnam[max_str];
  get_pathx(cnam,n,a);
  ifstream *pfin=new ifstream(cnam,mode);
  if(pfin->is_open())return(pfin);
  else {
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
}

ofstream *FBase::get_Ostr(long n,const char *a,ios::openmode mode){
  char cnam[max_str];
  get_pathx(cnam,n,a);
  ofstream *pfout=new ofstream(cnam,mode);
  if(pfout->is_open())return(pfout);
  else {
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
}

fstream *FBase::get_Fstr(long n,const char *a,ios::openmode mode){
  char cnam[max_str];
  get_pathx(cnam,n,a);
  fstream *pfstr=new fstream(cnam,mode);
  if(pfstr->is_open())return(pfstr);
  else {
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
}

void FBase::dst_Istr(ifstream *pfin){
  if(!pfin)return;
  if(!pfin->is_open()){
    cout << "File not open!" << endl;
    exit(0);
  }
  delete pfin;
}

void FBase::dst_Ostr(ofstream *pfout){
  if(!pfout)return;
  if(!pfout->is_open()){
    cout << "File not open!" << endl;
    exit(0);
  }
  delete pfout;
}

void FBase::dst_Fstr(fstream *pfstr){
  if(!pfstr)return;
  if(!pfstr->is_open()){
    cout << "File not open!" << endl;
    exit(0);
  }
  delete pfstr;
}

long FBase::get_Fsiz(const char *a){
  if(!Exists(a))return(0);
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,a);
  fld=::open(cnam,O_RDONLY);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  ::close(fld);
  return(datf.st_size);
}

long FBase::get_Fsiz(long n,const char *a){
  if(!Exists(n,a))return(0);
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,n,a);
  fld=::open(cnam,O_RDONLY);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  ::close(fld);
  return(datf.st_size);
}

char *FBase::get_Read(const char *a){
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,a);
  fld=::open(cnam,O_RDONLY);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  ::close(fld);
  char *ptr=new char[datf.st_size];
  ifstream fin(cnam,ios::in);
  if(!fin.is_open()){
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
  fin.read(ptr,datf.st_size);
  return(ptr);
}

char *FBase::get_Read(long n,const char *a){
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,n,a);
  fld=::open(cnam,O_RDONLY);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  ::close(fld);
  char *ptr=new char[datf.st_size];
  ifstream fin(cnam,ios::in);
  if(!fin.is_open()){
    cout << "Error: " << cnam << " failed to open!" << endl;
    exit(0);
  }
  fin.read(ptr,datf.st_size);
  return(ptr);
}

char *FBase::get_Mmap(const char *a){
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,a);
  fld=::open(cnam,O_RDONLY);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  char *ptr=(char*)mmap(0,datf.st_size,PROT_READ,MAP_PRIVATE|MAP_NORESERVE,fld,0);
  if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
  ::close(fld);
  return(ptr);
}

char *FBase::get_Mmap(long n,const char *a){
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,n,a);
  fld=::open(cnam,O_RDONLY);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  char *ptr=(char*)mmap(0,datf.st_size,PROT_READ,MAP_PRIVATE|MAP_NORESERVE,fld,0);
  if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
  ::close(fld);
  return(ptr);
}

char *FBase::get_Wmap(const char *a){
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,a);
  fld=::open(cnam,O_RDWR);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  char *ptr=(char*)mmap(0,datf.st_size,PROT_READ|PROT_WRITE,MAP_SHARED,fld,0);
  if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
  ::close(fld);
  return(ptr);
}

char *FBase::get_Wmap(long n,const char *a){
  int fld;
  struct stat datf;
  char cnam[max_str];
  get_pathx(cnam,n,a);
  fld=::open(cnam,O_RDWR);
  if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
  if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  char *ptr=(char*)mmap(0,datf.st_size,PROT_READ|PROT_WRITE,MAP_SHARED,fld,0);
  if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
  ::close(fld);
  return(ptr);
}

void FBase::dst_Mmap(const char *a,char *ptr){
  struct stat datf;
  char cnam[max_str];
  if(ptr==NULL){cout << "NULL pointer" << endl;return;}
  get_pathx(cnam,a);
  if(stat(cnam,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  if(munmap(ptr,datf.st_size)){cout << cnam << " failed to unmap" << endl;exit(0);}
  ptr=NULL; //note: ptr is passed by value, so the caller's pointer is not nulled
}

void FBase::dst_Mmap(long n,const char *a,char *ptr){
  struct stat datf;
  char cnam[max_str];
  if(ptr==NULL){cout << "NULL pointer" << endl;return;}
  get_pathx(cnam,n,a);
  if(stat(cnam,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
  if(munmap(ptr,datf.st_size)){cout << cnam << " failed to unmap" << endl;exit(0);}
  ptr=NULL; //note: ptr is passed by value, so the caller's pointer is not nulled
}

void FBase::bin_Writ(const char *a,long nm,char *ptr){
  ofstream *pfout=get_Ostr(a,ios::out);
  long k=100000,i=0;
  while(i+k<nm){ //write in 100000-byte chunks
    pfout->write((char*)ptr,k);
    i+=k;
    ptr=ptr+k;
  }
  pfout->write((char*)ptr,nm-i);
  pfout->close();
  delete pfout;
}

void FBase::bin_Writ(long n,const char *a,long nm,char *ptr){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  long k=100000,i=0;
  while(i+k<nm){ //write in 100000-byte chunks
    pfout->write((char*)ptr,k);
    i+=k;
    ptr=ptr+k;
  }
  pfout->write((char*)ptr,nm-i);
  pfout->close();
  delete pfout;
}

int FBase::Exists(const char *a){
  char cnam[max_str];
  get_pathx(cnam,a);
  ifstream fin(cnam,ios::in);
  if(fin.is_open()){
    fin.close();
    return(1);
  }
  else return(0);
}

int FBase::Exists(long n,const char *a){
  char cnam[max_str];
  get_pathx(cnam,n,a);
  ifstream fin(cnam,ios::in);
  if(fin.is_open()){
    fin.close();
    return(1);
  }
  else return(0);
}

void FBase::mark(long ct, int ivl, const char *what){
  if(pflag&&(ct%ivl==0)){
    cout << what << " count=" << ct << endl;
  }
}

}
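Every accessor above funnels through get_pathx, so a file's full name is always <dir><type>_<name>.<ext>. A minimal sketch of the convention follows, assuming the FBase constructors and stream helpers are public as declared in FBase.h earlier in this diff; the file name fbase_demo.C, the type/name strings, and the /tmp/ directory are illustrative only.

// fbase_demo.C - sketch of the FBase file-naming convention.
#include <iostream>
#include <fstream>
#include "FBase.h"
using namespace std;
using namespace iret;

int main(){
  //A ':' prefix on the third argument sets the directory internally,
  //so no "path*" lookup file is consulted.
  FBase fb("postings","medline",":/tmp/");
  ofstream *pout=fb.get_Ostr("set",ios::out); //creates /tmp/postings_medline.set
  *pout << "hello" << endl;
  fb.dst_Ostr(pout); //flushes and deletes the stream
  cout << fb.Exists("set") << " " << fb.get_Fsiz("set") << endl; //prints: 1 6
  return 0;
}

Without the ':' form, the directory prefix would instead be read from the first line of the most specific lookup file found among path_postings_medline.set, path_postings_medline, path_postings, and path in the working directory.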
|
349 |
+
exit(0);
|
350 |
+
}
|
351 |
+
}
|
352 |
+
|
353 |
+
ofstream *FBase::get_Ostr(long n,const char *a,ios::openmode mode){
|
354 |
+
char cnam[max_str];
|
355 |
+
get_pathx(cnam,n,a);
|
356 |
+
ofstream *pfout=new ofstream(cnam,mode);
|
357 |
+
if(pfout->is_open())return(pfout);
|
358 |
+
else {
|
359 |
+
cout << "Error: " << cnam << " failed to open!" << endl;
|
360 |
+
exit(0);
|
361 |
+
}
|
362 |
+
}
|
363 |
+
|
364 |
+
fstream *FBase::get_Fstr(long n,const char *a,ios::openmode mode){
|
365 |
+
char cnam[max_str];
|
366 |
+
get_pathx(cnam,n,a);
|
367 |
+
fstream *pfstr=new fstream(cnam,mode);
|
368 |
+
if(pfstr->is_open())return(pfstr);
|
369 |
+
else {
|
370 |
+
cout << "Error: " << cnam << " failed to open!" << endl;
|
371 |
+
exit(0);
|
372 |
+
}
|
373 |
+
}
|
374 |
+
|
375 |
+
void FBase::dst_Istr(ifstream *pfin){
|
376 |
+
if(!pfin)return;
|
377 |
+
if(!pfin->is_open()){
|
378 |
+
cout << "File not open!" << endl;
|
379 |
+
exit(0);
|
380 |
+
}
|
381 |
+
delete pfin;
|
382 |
+
}
|
383 |
+
|
384 |
+
void FBase::dst_Ostr(ofstream *pfout){
|
385 |
+
if(!pfout)return;
|
386 |
+
if(!pfout->is_open()){
|
387 |
+
cout << "File not open!" << endl;
|
388 |
+
exit(0);
|
389 |
+
}
|
390 |
+
delete pfout;
|
391 |
+
}
|
392 |
+
|
393 |
+
void FBase::dst_Fstr(fstream *pfstr){
|
394 |
+
if(!pfstr)return;
|
395 |
+
if(!pfstr->is_open()){
|
396 |
+
cout << "File not open!" << endl;
|
397 |
+
exit(0);
|
398 |
+
}
|
399 |
+
delete pfstr;
|
400 |
+
}
|
401 |
+
|
402 |
+
long FBase::get_Fsiz(const char *a){
|
403 |
+
if(!Exists(a))return(0);
|
404 |
+
int fld;
|
405 |
+
struct stat datf;
|
406 |
+
char cnam[max_str];
|
407 |
+
get_pathx(cnam,a);
|
408 |
+
fld=::open(cnam,O_RDONLY);
|
409 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
410 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size \
|
411 |
+
determination" << endl;exit(0);}
|
412 |
+
::close(fld);
|
413 |
+
return(datf.st_size);
|
414 |
+
}
|
415 |
+
|
416 |
+
long FBase::get_Fsiz(long n,const char *a){
|
417 |
+
if(!Exists(n,a))return(0);
|
418 |
+
int fld;
|
419 |
+
struct stat datf;
|
420 |
+
char cnam[max_str];
|
421 |
+
get_pathx(cnam,n,a);
|
422 |
+
fld=::open(cnam,O_RDONLY);
|
423 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
424 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size \
|
425 |
+
determination" << endl;exit(0);}
|
426 |
+
::close(fld);
|
427 |
+
return(datf.st_size);
|
428 |
+
}
|
429 |
+
|
430 |
+
char *FBase::get_Read(const char *a){
|
431 |
+
int fld;
|
432 |
+
struct stat datf;
|
433 |
+
char cnam[max_str];
|
434 |
+
get_pathx(cnam,a);
|
435 |
+
fld=::open(cnam,O_RDONLY);
|
436 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
437 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size \
|
438 |
+
determination" << endl;exit(0);}
|
439 |
+
::close(fld);
|
440 |
+
char *ptr=new char[datf.st_size];
|
441 |
+
ifstream fin(cnam,ios::in);
|
442 |
+
if(!fin.is_open()){
|
443 |
+
cout << "Error: " << cnam << " failed to open!" << endl;
|
444 |
+
exit(0);
|
445 |
+
}
|
446 |
+
fin.read(ptr,datf.st_size);
|
447 |
+
return(ptr);
|
448 |
+
}
|
449 |
+
|
450 |
+
char *FBase::get_Read(long n,const char *a){
|
451 |
+
int fld;
|
452 |
+
struct stat datf;
|
453 |
+
char cnam[max_str];
|
454 |
+
get_pathx(cnam,n,a);
|
455 |
+
fld=::open(cnam,O_RDONLY);
|
456 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
457 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size \
|
458 |
+
determination" << endl;exit(0);}
|
459 |
+
::close(fld);
|
460 |
+
char *ptr=new char[datf.st_size];
|
461 |
+
ifstream fin(cnam,ios::in);
|
462 |
+
if(!fin.is_open()){
|
463 |
+
cout << "Error: " << cnam << " failed to open!" << endl;
|
464 |
+
exit(0);
|
465 |
+
}
|
466 |
+
fin.read(ptr,datf.st_size);
|
467 |
+
return(ptr);
|
468 |
+
}
|
469 |
+
|
470 |
+
char *FBase::get_Mmap(const char *a){
|
471 |
+
int fld;
|
472 |
+
struct stat datf;
|
473 |
+
char cnam[max_str];
|
474 |
+
get_pathx(cnam,a);
|
475 |
+
fld=::open(cnam,O_RDONLY);
|
476 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
477 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
|
478 |
+
char *ptr=(char*)mmap(0,datf.st_size,PROT_READ,MAP_PRIVATE|MAP_NORESERVE,fld,0);
|
479 |
+
if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
|
480 |
+
::close(fld);
|
481 |
+
return(ptr);
|
482 |
+
}
|
483 |
+
|
484 |
+
char *FBase::get_Mmap(long n,const char *a){
|
485 |
+
int fld;
|
486 |
+
struct stat datf;
|
487 |
+
char cnam[max_str];
|
488 |
+
get_pathx(cnam,n,a);
|
489 |
+
fld=::open(cnam,O_RDONLY);
|
490 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
491 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
|
492 |
+
char *ptr=(char*)mmap(0,datf.st_size,PROT_READ,MAP_PRIVATE|MAP_NORESERVE,fld,0);
|
493 |
+
if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
|
494 |
+
::close(fld);
|
495 |
+
return(ptr);
|
496 |
+
}
|
497 |
+
|
498 |
+
char *FBase::get_Wmap(const char *a){
|
499 |
+
int fld;
|
500 |
+
struct stat datf;
|
501 |
+
char cnam[max_str];
|
502 |
+
get_pathx(cnam,a);
|
503 |
+
fld=::open(cnam,O_RDWR);
|
504 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
505 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
|
506 |
+
char *ptr=(char*)mmap(0,datf.st_size,PROT_READ|PROT_WRITE,MAP_SHARED,fld,0);
|
507 |
+
if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
|
508 |
+
::close(fld);
|
509 |
+
return(ptr);
|
510 |
+
}
|
511 |
+
|
512 |
+
char *FBase::get_Wmap(long n,const char *a){
|
513 |
+
int fld;
|
514 |
+
struct stat datf;
|
515 |
+
char cnam[max_str];
|
516 |
+
get_pathx(cnam,n,a);
|
517 |
+
fld=::open(cnam,O_RDWR);
|
518 |
+
if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
|
519 |
+
if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
|
520 |
+
char *ptr=(char*)mmap(0,datf.st_size,PROT_READ|PROT_WRITE,MAP_SHARED,fld,0);
|
521 |
+
if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
|
522 |
+
::close(fld);
|
523 |
+
return(ptr);
|
524 |
+
}
|
525 |
+
|
526 |
+
void FBase::dst_Mmap(const char *a,char *ptr){
|
527 |
+
struct stat datf;
|
528 |
+
char cnam[max_str];
|
529 |
+
if(ptr==NULL){cout << "NULL pointer" << endl;return;}
|
530 |
+
get_pathx(cnam,a);
|
531 |
+
if(stat(cnam,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
|
532 |
+
if(munmap(ptr,datf.st_size)){cout << cnam << " failed to unmap" << endl;exit(0);}
|
533 |
+
ptr=NULL;
|
534 |
+
}
|
535 |
+
|
536 |
+
void FBase::dst_Mmap(long n,const char *a,char *ptr){
|
537 |
+
struct stat datf;
|
538 |
+
char cnam[max_str];
|
539 |
+
if(ptr==NULL){cout << "NULL pointer" << endl;return;}
|
540 |
+
get_pathx(cnam,n,a);
|
541 |
+
if(stat(cnam,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
|
542 |
+
if(munmap(ptr,datf.st_size)){cout << cnam << " failed to unmap" << endl;exit(0);}
|
543 |
+
ptr=NULL;
|
544 |
+
}
|
545 |
+
|
546 |
+
void FBase::bin_Writ(const char *a,long nm,char *ptr){
|
547 |
+
ofstream *pfout=get_Ostr(a,ios::out);
|
548 |
+
long k=100000,i=0;
|
549 |
+
while(i+k<nm){
|
550 |
+
pfout->write((char*)ptr,k);
|
551 |
+
i+=k;
|
552 |
+
ptr=ptr+k;
|
553 |
+
}
|
554 |
+
pfout->write((char*)ptr,nm-i);
|
555 |
+
pfout->close();
|
556 |
+
delete pfout;
|
557 |
+
}
|
558 |
+
|
559 |
+
void FBase::bin_Writ(long n,const char *a,long nm,char *ptr){
|
560 |
+
ofstream *pfout=get_Ostr(n,a,ios::out);
|
561 |
+
long k=100000,i=0;
|
562 |
+
while(i+k<nm){
|
563 |
+
pfout->write((char*)ptr,k);
|
564 |
+
i+=k;
|
565 |
+
ptr=ptr+k;
|
566 |
+
}
|
567 |
+
pfout->write((char*)ptr,nm-i);
|
568 |
+
pfout->close();
|
569 |
+
delete pfout;
|
570 |
+
}
|
571 |
+
|
572 |
+
int FBase::Exists(const char *a){
|
573 |
+
char cnam[max_str];
|
574 |
+
get_pathx(cnam,a);
|
575 |
+
ifstream fin(cnam,ios::in);
|
576 |
+
if(fin.is_open()){
|
577 |
+
fin.close();
|
578 |
+
return(1);
|
579 |
+
}
|
580 |
+
else return(0);
|
581 |
+
}
|
582 |
+
|
583 |
+
int FBase::Exists(long n,const char *a){
|
584 |
+
char cnam[max_str];
|
585 |
+
get_pathx(cnam,n,a);
|
586 |
+
ifstream fin(cnam,ios::in);
|
587 |
+
if(fin.is_open()){
|
588 |
+
fin.close();
|
589 |
+
return(1);
|
590 |
+
}
|
591 |
+
else return(0);
|
592 |
+
}
|
593 |
+
|
594 |
+
void FBase::mark(long ct, int ivl, const char *what){
|
595 |
+
if(pflag&&(ct%ivl==0)){
|
596 |
+
cout << what << " count=" << ct << endl;
|
597 |
+
}
|
598 |
+
}
|
599 |
+
|
600 |
+
}
|
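The path logic above reduces to a fixed naming scheme: the directory prefix comes either from the internal path string (eflag=0), from the file path_<pnam> (eflag=2), or from a one-line path file found by trying progressively shorter names (path_<type>_<name>.<ext>, path_<type>_<name>, path_<type>, path); the file name itself is <type>[tpnm]_<name>[nmnm].<ext>. A minimal standalone sketch of just the name composition, with illustrative inputs (this is not code from the library):

    // Sketch of the FBase::get_pathx naming scheme; values are illustrative.
    #include <cstdio>
    #include <string>

    std::string compose(const std::string &path, const std::string &type, int tpnm,
                        const std::string &name, int nmnm, const std::string &ext) {
        std::string out = path + type;
        if (tpnm >= 0) out += std::to_string(tpnm);   // set_type_num appends a nonnegative tn
        out += "_" + name;
        if (nmnm >= 0) out += std::to_string(nmnm);   // set_name_num appends a nonnegative nn
        return out + "." + ext;
    }

    int main() {
        // Hash below uses type "hshset", so a set named "mylist" stored under
        // /data/ would resolve its hash-array file to /data/hshset_mylist.ha
        std::printf("%s\n", compose("/data/", "hshset", -1, "mylist", -1, "ha").c_str());
        return 0;
    }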
Library/FBase.h
CHANGED
@@ -1,248 +1,248 @@
#ifndef FBASE_H
#define FBASE_H

#include <iostream>
#include <fstream>

using namespace std;
namespace iret {

typedef char *pChr;

class FBase {
public:
  FBase(const char *tp,const char *nm); //tp is type name, nm is name
  FBase(const char *tp,int tn,const char *nm); //tp is type name, if
    //nonnegative tn is appended to end of tp, nm is name
  FBase(const char *tp,const char *nm,const char *pt); //tp is type name, nm is name
    //pt is pointer at a string sss and reads the path from file path_sss in
    //current directory. But if sss begins with ':' then skips this character
    //and remaining string is the path string itself.
  ~FBase();
  void set_type_num(int tn); //Sets tpnm and uses if nonnegative: appended
    //to end of type name
  void set_name_num(int nn); //Sets nmnm and uses if nonnegative: appended
    //to end of name
  void change_type(const char *nm); //Allows change of type string for class.
  void change_name(const char *nm); //Allows change of name string for class.
  void set_name(const char *nm); //Allows change of name string for class.
    //Included for compatibility
  void subname(const char *tph,const char *tpl,const char *nm); //Uses the
    //higher level type tph as type and combines lower level tpl_nm with
    //name to allow one to keep track of file types.
  void set_path_internal(const char *pt); //Path is by default external with
    //eflag=1. But if this function called with nonempty string, then eflag=0
    //and pt stored in path and used for access to data.
  void set_path_name(const char *pa); //path will be extracted from path_pa
    //and eflag=2. Naming conventions for files are unchanged
  void map_down(FBase *pFb); //Maps naming parameters to class instance pFb
  void map_down_sub(FBase *pFb,const char *subtype); //Maps naming parameters to class instance pFb;
    //combines subtype with name to make a new name for pFb and type becomes its type

  //Path access functions
  void get_pathx(char *cn,const char *a);
    //Reads the path from a file "path_(*name)" and constructs the
    //file name as "(*type)_(*name).(*a)". Cats path and file
    //name and returns the full info in cn.
  void get_pathx(char *cn,long n,const char *a);
  char *add_num(const char *ptr,long n,char *buf); //converts long to ascii
    //and cats to end of string and returns pointer to new string
    //that results. Does not change input string. The new string is
    //held in buffer space and this is overwritten at each call.
  char *cat_num(const char *ptr,long n,char *buf); //converts long to ascii
    //and cats to end of ptr string and then cats result to end of
    //whatever is in buffer. Does not change input string. The new string is
    //held in buffer space.

  //Stream object pointers
  ifstream *get_Istr(const char *a,ios::openmode m=ios::in);
    //Opens input file stream by path and name composition.
  ofstream *get_Ostr(const char *a,ios::openmode m=ios::out);
    //Opens output file stream by path and name composition.
  fstream *get_Fstr(const char *a,ios::openmode m=ios::in|ios::out);
    //Opens input/output file stream by path and name composition.
  ifstream *get_Istr(long n,const char *a,ios::openmode m=ios::in);
  ofstream *get_Ostr(long n,const char *a,ios::openmode m=ios::out);
  fstream *get_Fstr(long n,const char *a,ios::openmode m=ios::in|ios::out);
  void dst_Istr(ifstream *pfin);
  void dst_Ostr(ofstream *pfout);
  void dst_Fstr(fstream *pfstr);

  //Get file size in bytes
  long get_Fsiz(const char *a);
  long get_Fsiz(long n,const char *a);

  //File existence
  int Exists(const char *a); //returns 1 if file exists
  int Exists(long n,const char *a); //returns 1 if file exists

  //Read in array pointers
  char *get_Read(const char *a);
    //Reads in a file into a char array and returns pointer
  char *get_Read(long n,const char *a);

  //Memory map pointers
  char *get_Mmap(const char *a);
    //Memory maps file by path and name composition.
  char *get_Mmap(long n,const char *a);
  char *get_Wmap(const char *a);
    //Memory maps file by path and name composition.
    //Allows to modify contents and is written out when dst_Mmap called
  char *get_Wmap(long n,const char *a);
    //Allows to modify contents and is written out when dst_Mmap called
  void dst_Mmap(const char *a,char *ptr);
    //Removes the memory map for ptr based on path and name composition.
  void dst_Mmap(long n,const char *a,char *ptr);

  //Array of chars and binary write
  void bin_Writ(const char *a,long nm,char *ptr);
    //Writes out nm bytes binary
  void bin_Writ(long n,const char *a,long nm,char *ptr);

  //Write and read 1, 2, or 3 long integers to or from a file
  template <typename X>
  void get_Nnum(const char *a,X &m1);
  template <typename X,typename Y>
  void get_Nnum(const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void get_Nnum(const char *a,X &m1,Y &m2,Z &m3);
  template <typename X>
  void get_Nnum(long n,const char *a,X &m1);
  template <typename X,typename Y>
  void get_Nnum(long n,const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void get_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3);
  template <typename X>
  void put_Nnum(const char *a,X &m1);
  template <typename X,typename Y>
  void put_Nnum(const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void put_Nnum(const char *a,X &m1,Y &m2,Z &m3);
  template <typename X>
  void put_Nnum(long n,const char *a,X &m1);
  template <typename X,typename Y>
  void put_Nnum(long n,const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void put_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3);

  //Logical accounting functions
  int Gcom(int sflag); //sflag is bit marker such as READ_W, etc.
    //This returns 1 if sflag bit not set in oflag and is in cflag.
    //If this is the case then it sets sflag in oflag.
  int Rcom(int sflag);
    //This returns 1 if sflag bit set in oflag and in cflag.
    //If this is the case then it turns off sflag in oflag.
  void mark(long,int,const char*);
    //This function prints out string in 3rd argument and count
    //if first argument is multiple of the second

  //Data
  int cflag; //Command, what should happen to resources.
  int oflag; //Bit string status of resources, 1 open, 0 closed.
  int open1; //flags to mark whether a resource is open or not
  int open2; //0 means closed, 1 means open
  int open3; //Used for those resources that are either completely
  int open4; //closed or completely open.
  int open5;
  char *type;
  int tpnm; //If nonnegative integer it is appended to end of type
    //in constructing file name
  char *name;
  int nmnm; //If nonnegative integer it is appended to end of name
    //in constructing file name
  int pflag; //Usual print flag, 1 for verbose output, 0 for none.
    //Print flag set to 1 by default.
  int eflag; //Flag set to 1 for external path from path file, 0
    //for internal path
  char *path; //Path stored here if eflag=0.
  char *pnam; //Path extension stored here if eflag=2.
};

//Template functions

template <typename X>
void FBase::get_Nnum(const char *a,X &m1){
  ifstream *pfin=get_Istr(a,ios::in);
  *pfin >> m1;
  dst_Istr(pfin);
}

template <typename X,typename Y>
void FBase::get_Nnum(const char *a,X &m1,Y &m2){
  ifstream *pfin=get_Istr(a,ios::in);
  *pfin >> m1 >> m2;
  dst_Istr(pfin);
}

template <typename X,typename Y,typename Z>
void FBase::get_Nnum(const char *a,X &m1,Y &m2,Z &m3){
  ifstream *pfin=get_Istr(a,ios::in);
  *pfin >> m1 >> m2 >> m3;
  dst_Istr(pfin);
}

template <typename X>
void FBase::get_Nnum(long n,const char *a,X &m1){
  ifstream *pfin=get_Istr(n,a,ios::in);
  *pfin >> m1;
  dst_Istr(pfin);
}

template <typename X,typename Y>
void FBase::get_Nnum(long n,const char *a,X &m1,Y &m2){
  ifstream *pfin=get_Istr(n,a,ios::in);
  *pfin >> m1 >> m2;
  dst_Istr(pfin);
}

template <typename X,typename Y,typename Z>
void FBase::get_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3){
  ifstream *pfin=get_Istr(n,a,ios::in);
  *pfin >> m1 >> m2 >> m3;
  dst_Istr(pfin);
}

template <typename X>
void FBase::put_Nnum(const char *a,X &m1){
  ofstream *pfout=get_Ostr(a,ios::out);
  *pfout << m1 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y>
void FBase::put_Nnum(const char *a,X &m1,Y &m2){
  ofstream *pfout=get_Ostr(a,ios::out);
  *pfout << m1 << " " << m2 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y,typename Z>
void FBase::put_Nnum(const char *a,X &m1,Y &m2,Z &m3){
  ofstream *pfout=get_Ostr(a,ios::out);
  *pfout << m1 << " " << m2 << " " << m3 << endl;
  dst_Ostr(pfout);
}

template <typename X>
void FBase::put_Nnum(long n,const char *a,X &m1){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  *pfout << m1 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y>
void FBase::put_Nnum(long n,const char *a,X &m1,Y &m2){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  *pfout << m1 << " " << m2 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y,typename Z>
void FBase::put_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  *pfout << m1 << " " << m2 << " " << m3 << endl;
  dst_Ostr(pfout);
}

}
#endif
Library/Hash.C
CHANGED
@@ -1,733 +1,733 @@
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <cmath>
#include <cstring>
#include <cassert>
#include "runn.h"
#include "Hash.h"

using namespace std;
namespace iret {

Hash::Hash(void) : FBase("hshset","null"){
}

Hash::Hash(const char *nam) : FBase("hshset",nam){
}

Hash::Hash(int n,const char *nam) : FBase("hshset",n,nam){
}

Hash::~Hash(){
}

void Hash::create_htable(List &Lst,int excess){
  char cnam[max_str],*cptr,*uptr;
  int u,len;
  long ct,i,j,k;
  ofstream *pfout;

  nwrds=Lst.cnt_key;
  ct=nwrds;
  tnum=1;
  u=0;
  while(ct=ct/2){tnum*=2;u++;}
  if(u>30){cout << "Error in size, " << u << endl;exit(0);}
  i=0;
  while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
  tnum--;
  harr=new long[tnum+2];
  for(ct=0;ct<tnum+2;ct++)harr[ct]=0;

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
  long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
  long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
  long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;

  Lst.node_first();
  while(Lst.node_next()){
    cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    (harr[ct&tnum])++;
  }

  //Set start points in harr.
  k=0;
  for(i=0;i<tnum+2;i++){
    j=harr[i];
    harr[i]=k;
    k+=j;
  }
  if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}

  //Write out harr.
  bin_Writ("ha",(tnum+2)*sizeof(long),(char*)harr);

  //Set addresses
  char **addt=new char*[nwrds];
  Lst.node_first();
  while(Lst.node_next()){
    uptr=cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    k=ct&tnum;
    addt[harr[k]]=uptr;
    (harr[k])++;
  }

  //Write out string file
  pfout=get_Ostr("str");
  k=0;
  for(i=0;i<nwrds;i++){
    *pfout << addt[i] << ends;
    len=strlen((char*)addt[i])+1;
    addt[i]=(char*)k;
    k+=len;
  }
  dst_Ostr(pfout);

  //Write out addr file
  bin_Writ("ad",nwrds*sizeof(long),(char*)addt);
  delete [] addt;

  //Write out counts
  pfout=get_Ostr("nm");
  *pfout << nwrds << " " << tnum << " " << k << endl;
  dst_Ostr(pfout);
  delete [] harr;
  delete [] farr;
}

//In memory model intended for small sets
void Hash::create_htableM(List &Lst,int excess){
  char cnam[max_str],*cptr,*uptr;
  int u,len;
  long ct,i,j,k,*barr;
  ofstream *pfout;

  nwrds=Lst.cnt_key;
  ct=nwrds;
  tnum=1;
  u=0;
  while(ct=ct/2){tnum*=2;u++;}
  if(u>30){cout << "Error in size, " << u << endl;exit(0);}
  i=0;
  while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
  tnum--;
  harr=new long[tnum+2];
  barr=new long[tnum+2];
  for(ct=0;ct<tnum+2;ct++)harr[ct]=0;

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;

  Lst.node_first();
  while(Lst.node_next()){
    cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(px0+u); break;
        case 1: ct+=*(px1+u); break;
        case 2: ct+=*(px2+u); break;
        case 3: ct+=*(px3+u); break;
        case 4: ct+=*(px4+u); break;
        case 5: ct+=*(px5+u); break;
        case 6: ct+=*(px6+u); break;
        case 7: ct+=*(px7+u); break;
        case 8: ct+=*(px8+u); break;
        case 9: ct+=*(px9+u); break;
        case 10: ct+=*(px10+u); break;
        case 11: ct+=*(px11+u); i-=12; break;
      }
      i++;
    }
    (harr[ct&tnum])++;
  }

  //Set start points in harr.
  k=0;
  for(i=0;i<tnum+2;i++){
    j=harr[i];
    barr[i]=harr[i]=k;
    k+=j;
  }
  if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}

  //Set addresses
  len=0;
  char **addt=new char*[nwrds];
  Lst.node_first();
  while(Lst.node_next()){
    uptr=cptr=Lst.show_str();
    len+=strlen(uptr)+1;
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(px0+u); break;
        case 1: ct+=*(px1+u); break;
        case 2: ct+=*(px2+u); break;
        case 3: ct+=*(px3+u); break;
        case 4: ct+=*(px4+u); break;
        case 5: ct+=*(px5+u); break;
        case 6: ct+=*(px6+u); break;
        case 7: ct+=*(px7+u); break;
        case 8: ct+=*(px8+u); break;
        case 9: ct+=*(px9+u); break;
        case 10: ct+=*(px10+u); break;
        case 11: ct+=*(px11+u); i-=12; break;
      }
      i++;
    }
    k=ct&tnum;
    addt[barr[k]]=uptr;
    (barr[k])++;
  }
  strmap=new char[len];

  //Set up string array
  k=0;
  for(i=0;i<nwrds;i++){
    len=strlen((char*)addt[i])+1;
    strcpy(strmap+k,addt[i]);
    addt[i]=(char*)k;
    k+=len;
  }
  addr=(long*)addt;
  delete [] barr;
}

void Hash::create_htable(int mz,List &Lst,int excess){
  char cnam[max_str],*cptr,*uptr;
  int u,len;
  long ct,i,j,k;
  ofstream *pfout;

  nwrds=Lst.cnt_key;
  ct=nwrds;
  tnum=1;
  u=0;
  while(ct=ct/2){tnum*=2;u++;}
  if(u>30){cout << "Error in size, " << u << endl;exit(0);}
  i=0;
  while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
  tnum--;
  harr=new long[tnum+2];
  for(ct=0;ct<tnum+2;ct++)harr[ct]=0;

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
  long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
  long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
  long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;

  Lst.node_first();
  while(Lst.node_next()){
    cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    (harr[ct&tnum])++;
  }

  //Set start points in harr.
  k=0;
  for(i=0;i<tnum+2;i++){
    j=harr[i];
    harr[i]=k;
    k+=j;
  }
  if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}

  //Write out harr.
  bin_Writ(mz,"ha",(tnum+2)*sizeof(long),(char*)harr);

  //Set addresses
  char **addt=new char*[nwrds];
  Lst.node_first();
  while(Lst.node_next()){
    uptr=cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    k=ct&tnum;
    addt[harr[k]]=uptr;
    (harr[k])++;
  }

  //Write out string file
  pfout=get_Ostr(mz,"str");
  k=0;
  for(i=0;i<nwrds;i++){
    *pfout << addt[i] << ends;
    len=strlen((char*)addt[i])+1;
    addt[i]=(char*)k;
    k+=len;
  }
  dst_Ostr(pfout);

  //Write out addr file
  bin_Writ(mz,"ad",nwrds*sizeof(long),(char*)addt);
  delete [] addt;

  //Write out counts
  pfout=get_Ostr(mz,"nm");
  *pfout << nwrds << " " << tnum << " " << k << endl;
  dst_Ostr(pfout);
  delete [] harr;
  delete [] farr;
}

void Hash::gopen_htable_map(void){
  char cnam[max_str],*cptr;
  int fld;
  long ct,asize,i;

  ifstream *pfin=get_Istr("nm");
  *pfin >> nwrds >> tnum >> asize;
  dst_Istr(pfin);

  harr=(long*)get_Mmap("ha");
  addr=(long*)get_Mmap("ad");
  strmap=get_Mmap("str");

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;
}

void Hash::gopen_htable_map(int mz){
  char cnam[max_str],*cptr;
  int fld;
  long ct,asize,i;

  ifstream *pfin=get_Istr(mz,"nm");
  *pfin >> nwrds >> tnum >> asize;
  dst_Istr(pfin);

  harr=(long*)get_Mmap(mz,"ha");
  addr=(long*)get_Mmap(mz,"ad");
  strmap=get_Mmap(mz,"str");

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;
}

void Hash::gopen_htable_copy(Hash *pH){
  char cnam[max_str],*cptr;
  int fld;
  long ct,asize,i;

  nwrds=pH->nwrds;
  tnum=pH->tnum;

  harr=pH->harr;
  addr=pH->addr;
  strmap=pH->strmap;

  farr=pH->farr;

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;
}

long Hash::find(const char *str){
  register long ct=0,i=0,k;
  register int ic;
  register const char *utr=str;
  while(ic=*(utr++)){
    switch(i){
      case 0: ct+=*(px0+ic); break;
      case 1: ct+=*(px1+ic); break;
      case 2: ct+=*(px2+ic); break;
      case 3: ct+=*(px3+ic); break;
      case 4: ct+=*(px4+ic); break;
      case 5: ct+=*(px5+ic); break;
      case 6: ct+=*(px6+ic); break;
      case 7: ct+=*(px7+ic); break;
      case 8: ct+=*(px8+ic); break;
      case 9: ct+=*(px9+ic); break;
      case 10: ct+=*(px10+ic); break;
      case 11: ct+=*(px11+ic); i-=12; break;
    }
    i++;
  }
  k=ct&tnum;
  ct=harr[k+1];
  i=harr[k];
  //cout << k << " " << i << " " << addr[i] << " " << ct << " " << addr[ct] << endl;
  switch(ct-i){
    case 0: return(0);
      break;
    case 1: if(!strcmp(str,strmap+addr[i]))return(i+1);
      else return(0);
      break;
    case 2: ic=strcmp(str,strmap+addr[i]);
      if(ic>0){
        if(!strcmp(str,strmap+addr[i+1]))return(i+2);
        else return(0);
      }
      else if(ic<0)return(0);
      else return(i+1);
      break;
    default: ic=strcmp(str,strmap+addr[i]);
      if(ic<0)return(0);
      else if(!ic)return(i+1);
      ct--;
      ic=strcmp(str,strmap+addr[ct]);
      if(ic>0)return(0);
      else if(!ic)return(ct+1);
      while(ct-i>1){
        k=(ct+i)/2;
        ic=strcmp(str,strmap+addr[k]);
        if(ic>0)i=k;
        else if(ic<0)ct=k;
        else return(k+1);
      }
      return(0);
  }
}

void Hash::gclose_htable_map(void){
  dst_Mmap("ha",(char*)harr);
  dst_Mmap("ad",(char*)addr);
  dst_Mmap("str",strmap);
  delete [] farr;
}

void Hash::gclose_htable_map(int mz){
  dst_Mmap(mz,"ha",(char*)harr);
  dst_Mmap(mz,"ad",(char*)addr);
  dst_Mmap(mz,"str",strmap);
  delete [] farr;
}

//Chash code

Chash::Chash() : Hash(){
  change_type("cshset");
}

Chash::Chash(const char *str) : Hash(str){
  change_type("cshset");
}

Chash::Chash(int n,const char *str) : Hash(n,str){
  change_type("cshset");
}

Chash::~Chash(void){}

void Chash::create_ctable(Count &Ct,int excess){
  create_htable(Ct,excess);
  gopen_htable_map();
  long n,i=0;
  long *pct=new long[Ct.cnt_key];
  Ct.node_first();
  while(Ct.node_next()){
    if(n=find(Ct.show_str())){
      pct[n-1]=Ct.count();
    }
    else {
      cout << "Error in Count tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ("ct",Ct.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap("ct");
  gclose_htable_map();
}

void Chash::create_ctable(List &Lt,int excess){
  create_htable(Lt,excess);
  gopen_htable_map();
  long n,i=1;
  long *pct=new long[Lt.cnt_key];
  Lt.node_first();
  while(Lt.node_next()){
    if(n=find(Lt.show_str())){
      pct[n-1]=i;
    }
    else {
      cout << "Error in List tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ("ct",Lt.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap("ct");
  gclose_htable_map();
}

void Chash::create_ctable(int mz,Count &Ct,int excess){
  create_htable(mz,Ct,excess);
  gopen_htable_map(mz);
  long n,i=0;
  long *pct=new long[Ct.cnt_key];
  Ct.node_first();
  while(Ct.node_next()){
    if(n=find(Ct.show_str())){
      pct[n-1]=Ct.count();
    }
    else {
      cout << "Error in Count tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ(mz,"ct",Ct.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap(mz,"ct");
  gclose_htable_map(mz);
}

void Chash::create_ctable(int mz,List &Lt,int excess){
  create_htable(mz,Lt,excess);
  gopen_htable_map(mz);
  long n,i=1;
  long *pct=new long[Lt.cnt_key];
  Lt.node_first();
  while(Lt.node_next()){
    if(n=find(Lt.show_str())){
      pct[n-1]=i;
    }
    else {
      cout << "Error in List tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ(mz,"ct",Lt.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap(mz,"ct");
  gclose_htable_map(mz);
}

void Chash::gopen_ctable_map(void){
  gopen_htable_map();
  cnt=(long*)get_Mmap("ct");
}

void Chash::gopen_ctable_map(int mz){
  gopen_htable_map(mz);
  cnt=(long*)get_Mmap(mz,"ct");
}

void Chash::gclose_ctable_map(void){
  gclose_htable_map();
  dst_Mmap("ct",(char*)cnt);
}

void Chash::gclose_ctable_map(int mz){
  gclose_htable_map(mz);
  dst_Mmap(mz,"ct",(char*)cnt);
}

long Chash::count(const char *str){
  long n=find(str);
  if(n)return(cnt[n-1]);
  else return(0);
}

}
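The hash that find computes can be restated compactly: a table of 12*128 pseudo-random longs (seeded with ct=1 and iterated as ct=(ct*331)&tnum) is indexed by character value and by position modulo 12, the increments are summed, and the sum is masked to the power-of-two table size. A standalone sketch with an illustrative table size (this is not library code):

    #include <cstdio>

    long hash_key(const char *s, const long *farr, long tnum) {
        long ct = 0;
        int i = 0, c;
        while ((c = *(s++))) {
            ct += farr[i * 128 + c];   // position-dependent pseudo-random increment
            if (++i == 12) i = 0;      // the 12 sub-tables cycle with character position
        }
        return ct & tnum;              // tnum is table size minus one (a power of two)
    }

    int main() {
        const long tnum = (1L << 16) - 1;   // illustrative: 2^16 buckets
        static long farr[1536];
        long ct = 1;
        for (int i = 0; i < 1536; i++) farr[i] = ct = (ct * 331) & tnum;
        std::printf("bucket(\"protein\") = %ld\n", hash_key("protein", farr, tnum));
        return 0;
    }

Within a bucket, find then compares against the stored strings (binary search in the default case) and returns the 1-based index of the match, or 0 on a miss.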
|
|
|
1 |
+
#include <iostream>
|
2 |
+
#include <fstream>
|
3 |
+
#include <cstdlib>
|
4 |
+
#include <sys/types.h>
|
5 |
+
#include <sys/stat.h>
|
6 |
+
#include <fcntl.h>
|
7 |
+
#include <sys/mman.h>
|
8 |
+
#include <cmath>
|
9 |
+
#include <cstring>
|
10 |
+
#include <cassert>
|
11 |
+
#include "runn.h"
|
12 |
+
#include "Hash.h"
|
13 |
+
|
14 |
+
using namespace std;
|
15 |
+
namespace iret {
|
16 |
+
|
17 |
+
Hash::Hash(void) : FBase("hshset","null"){
|
18 |
+
}
|
19 |
+
|
20 |
+
Hash::Hash(const char *nam) : FBase("hshset",nam){
|
21 |
+
}
|
22 |
+
|
23 |
+
Hash::Hash(int n,const char *nam) : FBase("hshset",n,nam){
|
24 |
+
}
|
25 |
+
|
26 |
+
Hash::~Hash(){
|
27 |
+
}
|
28 |
+
|
29 |
+
void Hash::create_htable(List &Lst,int excess){
|
30 |
+
char cnam[max_str],*cptr,*uptr;
|
31 |
+
int u,len;
|
32 |
+
long ct,i,j,k;
|
33 |
+
ofstream *pfout;
|
34 |
+
|
35 |
+
nwrds=Lst.cnt_key;
|
36 |
+
ct=nwrds;
|
37 |
+
tnum=1;
|
38 |
+
u=0;
|
39 |
+
while(ct=ct/2){tnum*=2;u++;}
|
40 |
+
if(u>30){cout << "Error in size, " << u << endl;exit(0);}
|
41 |
+
i=0;
|
42 |
+
while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
|
43 |
+
tnum--;
|
44 |
+
harr=new long[tnum+2];
|
45 |
+
for(ct=0;ct<tnum+2;ct++)harr[ct]=0;
|
46 |
+
|
47 |
+
farr=new long[1536];
|
48 |
+
ct=1;
|
49 |
+
for(i=0;i<1536;i++){
|
50 |
+
farr[i]=ct=(ct*331)&tnum;
|
51 |
+
}
|
52 |
+
|
53 |
+
long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
|
54 |
+
long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
|
55 |
+
long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
|
56 |
+
long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;
|
57 |
+
|
58 |
+
Lst.node_first();
|
59 |
+
while(Lst.node_next()){
|
60 |
+
cptr=Lst.show_str();
|
61 |
+
ct=0;
|
62 |
+
i=0;
|
63 |
+
while(u=*(cptr++)){
|
64 |
+
switch(i){
|
65 |
+
case 0: ct+=*(pc0+u);
|
66 |
+
break;
|
67 |
+
case 1: ct+=*(pc1+u);
|
68 |
+
break;
|
69 |
+
case 2: ct+=*(pc2+u);
|
70 |
+
break;
|
71 |
+
case 3: ct+=*(pc3+u);
|
72 |
+
break;
|
73 |
+
case 4: ct+=*(pc4+u);
|
74 |
+
break;
|
75 |
+
case 5: ct+=*(pc5+u);
|
76 |
+
break;
|
77 |
+
case 6: ct+=*(pc6+u);
|
78 |
+
break;
|
79 |
+
case 7: ct+=*(pc7+u);
|
80 |
+
break;
|
81 |
+
case 8: ct+=*(pc8+u);
|
82 |
+
break;
|
83 |
+
case 9: ct+=*(pc9+u);
|
84 |
+
break;
|
85 |
+
case 10: ct+=*(pc10+u);
|
86 |
+
break;
|
87 |
+
case 11: ct+=*(pc11+u);
|
88 |
+
i-=12;
|
89 |
+
break;
|
90 |
+
}
|
91 |
+
i++;
|
92 |
+
}
|
93 |
+
(harr[ct&tnum])++;
|
94 |
+
}
|
95 |
+
|
96 |
+
//Set start points in harr.
|
97 |
+
k=0;
|
98 |
+
for(i=0;i<tnum+2;i++){
|
99 |
+
j=harr[i];
|
100 |
+
harr[i]=k;
|
101 |
+
k+=j;
|
102 |
+
}
|
103 |
+
if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}
|
104 |
+
|
105 |
+
//Write out harr.
|
106 |
+
bin_Writ("ha",(tnum+2)*sizeof(long),(char*)harr);
|
107 |
+
|
108 |
+
//Set addresses
|
109 |
+
char **addt=new char*[nwrds];
|
110 |
+
Lst.node_first();
|
111 |
+
while(Lst.node_next()){
|
112 |
+
uptr=cptr=Lst.show_str();
|
113 |
+
ct=0;
|
114 |
+
i=0;
|
115 |
+
while(u=*(cptr++)){
|
116 |
+
switch(i){
|
117 |
+
case 0: ct+=*(pc0+u);
|
118 |
+
break;
|
119 |
+
case 1: ct+=*(pc1+u);
|
120 |
+
break;
|
121 |
+
case 2: ct+=*(pc2+u);
|
122 |
+
break;
|
123 |
+
case 3: ct+=*(pc3+u);
|
124 |
+
break;
|
125 |
+
case 4: ct+=*(pc4+u);
|
126 |
+
break;
|
127 |
+
case 5: ct+=*(pc5+u);
|
128 |
+
break;
|
129 |
+
case 6: ct+=*(pc6+u);
|
130 |
+
break;
|
131 |
+
case 7: ct+=*(pc7+u);
|
132 |
+
break;
|
133 |
+
case 8: ct+=*(pc8+u);
|
134 |
+
break;
|
135 |
+
case 9: ct+=*(pc9+u);
|
136 |
+
break;
|
137 |
+
case 10: ct+=*(pc10+u);
|
138 |
+
break;
|
139 |
+
case 11: ct+=*(pc11+u);
|
140 |
+
i-=12;
|
141 |
+
break;
|
142 |
+
}
|
143 |
+
i++;
|
144 |
+
}
|
145 |
+
k=ct&tnum;
|
146 |
+
addt[harr[k]]=uptr;
|
147 |
+
(harr[k])++;
|
148 |
+
}
|
149 |
+
|
150 |
+
//Write out string file
|
151 |
+
pfout=get_Ostr("str");
|
152 |
+
k=0;
|
153 |
+
for(i=0;i<nwrds;i++){
|
154 |
+
*pfout << addt[i] << ends;
|
155 |
+
len=strlen((char*)addt[i])+1;
|
156 |
+
addt[i]=(char*)k;
|
157 |
+
k+=len;
|
158 |
+
}
|
159 |
+
dst_Ostr(pfout);
|
160 |
+
|
161 |
+
//Write out addr file
|
162 |
+
bin_Writ("ad",nwrds*sizeof(long),(char*)addt);
|
163 |
+
delete [] addt;
|
164 |
+
|
165 |
+
//Write out counts
|
166 |
+
pfout=get_Ostr("nm");
|
167 |
+
*pfout << nwrds << " " << tnum << " " << k << endl;
|
168 |
+
dst_Ostr(pfout);
|
169 |
+
delete [] harr;
|
170 |
+
delete [] farr;
|
171 |
+
}
|
172 |
+
|
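The loop above implements a position-dependent additive hash: farr holds twelve banks of 128 coefficients, generated by the multiplicative sequence ct = (ct*331) & tnum, and each character adds the coefficient for its (position mod 12, byte) pair; the case 11 branch resets the position counter via i -= 12. A minimal standalone sketch of the same computation (not part of the library), assuming a farr table and mask tnum built exactly as above:

    // The hash shared by create_htable and find(), written as one function.
    long hash_string(const char *s, const long *farr, long tnum)
    {
        long ct = 0;
        int i = 0;                                   // character position mod 12
        while (*s) {
            ct += farr[128*i + (unsigned char)*s++]; // bank i, coefficient for this byte
            if (++i == 12) i = 0;                    // banks cycle with period 12
        }
        return ct & tnum;                            // tnum = 2^k - 1, so this is mod 2^k
    }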
173 |
+
//In memory model intended for small sets
|
174 |
+
void Hash::create_htableM(List &Lst,int excess){
|
175 |
+
char cnam[max_str],*cptr,*uptr;
|
176 |
+
int u,len;
|
177 |
+
long ct,i,j,k,*barr;
|
178 |
+
ofstream *pfout;
|
179 |
+
|
180 |
+
nwrds=Lst.cnt_key;
|
181 |
+
ct=nwrds;
|
182 |
+
tnum=1;
|
183 |
+
u=0;
|
184 |
+
while(ct=ct/2){tnum*=2;u++;}
|
185 |
+
if(u>30){cout << "Error in size, " << u << endl;exit(0);}
|
186 |
+
i=0;
|
187 |
+
while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
|
188 |
+
tnum--;
|
189 |
+
harr=new long[tnum+2];
|
190 |
+
barr=new long[tnum+2];
|
191 |
+
for(ct=0;ct<tnum+2;ct++)harr[ct]=0;
|
192 |
+
|
193 |
+
farr=new long[1536];
|
194 |
+
ct=1;
|
195 |
+
for(i=0;i<1536;i++){
|
196 |
+
farr[i]=ct=(ct*331)&tnum;
|
197 |
+
}
|
198 |
+
|
199 |
+
px0=farr,px1=farr+128,px2=farr+256;
|
200 |
+
px3=farr+384,px4=farr+512,px5=farr+640;
|
201 |
+
px6=farr+768,px7=farr+896,px8=farr+1024;
|
202 |
+
px9=farr+1152,px10=farr+1280,px11=farr+1408;
|
203 |
+
|
204 |
+
Lst.node_first();
|
205 |
+
while(Lst.node_next()){
|
206 |
+
cptr=Lst.show_str();
|
207 |
+
ct=0;
|
208 |
+
i=0;
|
209 |
+
while(u=*(cptr++)){
|
210 |
+
switch(i){
|
211 |
+
case 0: ct+=*(px0+u);
|
212 |
+
break;
|
213 |
+
case 1: ct+=*(px1+u);
|
214 |
+
break;
|
215 |
+
case 2: ct+=*(px2+u);
|
216 |
+
break;
|
217 |
+
case 3: ct+=*(px3+u);
|
218 |
+
break;
|
219 |
+
case 4: ct+=*(px4+u);
|
220 |
+
break;
|
221 |
+
case 5: ct+=*(px5+u);
|
222 |
+
break;
|
223 |
+
case 6: ct+=*(px6+u);
|
224 |
+
break;
|
225 |
+
case 7: ct+=*(px7+u);
|
226 |
+
break;
|
227 |
+
case 8: ct+=*(px8+u);
|
228 |
+
break;
|
229 |
+
case 9: ct+=*(px9+u);
|
230 |
+
break;
|
231 |
+
case 10: ct+=*(px10+u);
|
232 |
+
break;
|
233 |
+
case 11: ct+=*(px11+u);
|
234 |
+
i-=12;
|
235 |
+
break;
|
236 |
+
}
|
237 |
+
i++;
|
238 |
+
}
|
239 |
+
(harr[ct&tnum])++;
|
240 |
+
}
|
241 |
+
|
242 |
+
//Set start points in harr.
|
243 |
+
k=0;
|
244 |
+
for(i=0;i<tnum+2;i++){
|
245 |
+
j=harr[i];
|
246 |
+
barr[i]=harr[i]=k;
|
247 |
+
k+=j;
|
248 |
+
}
|
249 |
+
if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}
|
250 |
+
|
251 |
+
//Set addresses
|
252 |
+
len=0;
|
253 |
+
char **addt=new char*[nwrds];
|
254 |
+
Lst.node_first();
|
255 |
+
while(Lst.node_next()){
|
256 |
+
uptr=cptr=Lst.show_str();
|
257 |
+
len+=strlen(uptr)+1;
|
258 |
+
ct=0;
|
259 |
+
i=0;
|
260 |
+
while(u=*(cptr++)){
|
261 |
+
switch(i){
|
262 |
+
case 0: ct+=*(px0+u);
|
263 |
+
break;
|
264 |
+
case 1: ct+=*(px1+u);
|
265 |
+
break;
|
266 |
+
case 2: ct+=*(px2+u);
|
267 |
+
break;
|
268 |
+
case 3: ct+=*(px3+u);
|
269 |
+
break;
|
270 |
+
case 4: ct+=*(px4+u);
|
271 |
+
break;
|
272 |
+
case 5: ct+=*(px5+u);
|
273 |
+
break;
|
274 |
+
case 6: ct+=*(px6+u);
|
275 |
+
break;
|
276 |
+
case 7: ct+=*(px7+u);
|
277 |
+
break;
|
278 |
+
case 8: ct+=*(px8+u);
|
279 |
+
break;
|
280 |
+
case 9: ct+=*(px9+u);
|
281 |
+
break;
|
282 |
+
case 10: ct+=*(px10+u);
|
283 |
+
break;
|
284 |
+
case 11: ct+=*(px11+u);
|
285 |
+
i-=12;
|
286 |
+
break;
|
287 |
+
}
|
288 |
+
i++;
|
289 |
+
}
|
290 |
+
k=ct&tnum;
|
291 |
+
addt[barr[k]]=uptr;
|
292 |
+
(barr[k])++;
|
293 |
+
}
|
294 |
+
strmap=new char[len];
|
295 |
+
|
296 |
+
//Set up string array
|
297 |
+
k=0;
|
298 |
+
for(i=0;i<nwrds;i++){
|
299 |
+
len=strlen((char*)addt[i])+1;
|
300 |
+
strcpy(strmap+k,addt[i]);
|
301 |
+
addt[i]=(char*)k;
|
302 |
+
k+=len;
|
303 |
+
}
|
304 |
+
addr=(long*)addt;
|
305 |
+
delete [] barr;
|
306 |
+
}
|
307 |
+
|
308 |
+
void Hash::create_htable(int mz,List &Lst,int excess){
|
309 |
+
char cnam[max_str],*cptr,*uptr;
|
310 |
+
int u,len;
|
311 |
+
long ct,i,j,k;
|
312 |
+
ofstream *pfout;
|
313 |
+
|
314 |
+
nwrds=Lst.cnt_key;
|
315 |
+
ct=nwrds;
|
316 |
+
tnum=1;
|
317 |
+
u=0;
|
318 |
+
while(ct=ct/2){tnum*=2;u++;}
|
319 |
+
if(u>30){cout << "Error in size, " << u << endl;exit(0);}
|
320 |
+
i=0;
|
321 |
+
while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
|
322 |
+
tnum--;
|
323 |
+
harr=new long[tnum+2];
|
324 |
+
for(ct=0;ct<tnum+2;ct++)harr[ct]=0;
|
325 |
+
|
326 |
+
farr=new long[1536];
|
327 |
+
ct=1;
|
328 |
+
for(i=0;i<1536;i++){
|
329 |
+
farr[i]=ct=(ct*331)&tnum;
|
330 |
+
}
|
331 |
+
|
332 |
+
long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
|
333 |
+
long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
|
334 |
+
long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
|
335 |
+
long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;
|
336 |
+
|
337 |
+
Lst.node_first();
|
338 |
+
while(Lst.node_next()){
|
339 |
+
cptr=Lst.show_str();
|
340 |
+
ct=0;
|
341 |
+
i=0;
|
342 |
+
while(u=*(cptr++)){
|
343 |
+
switch(i){
|
344 |
+
case 0: ct+=*(pc0+u);
|
345 |
+
break;
|
346 |
+
case 1: ct+=*(pc1+u);
|
347 |
+
break;
|
348 |
+
case 2: ct+=*(pc2+u);
|
349 |
+
break;
|
350 |
+
case 3: ct+=*(pc3+u);
|
351 |
+
break;
|
352 |
+
case 4: ct+=*(pc4+u);
|
353 |
+
break;
|
354 |
+
case 5: ct+=*(pc5+u);
|
355 |
+
break;
|
356 |
+
case 6: ct+=*(pc6+u);
|
357 |
+
break;
|
358 |
+
case 7: ct+=*(pc7+u);
|
359 |
+
break;
|
360 |
+
case 8: ct+=*(pc8+u);
|
361 |
+
break;
|
362 |
+
case 9: ct+=*(pc9+u);
|
363 |
+
break;
|
364 |
+
case 10: ct+=*(pc10+u);
|
365 |
+
break;
|
366 |
+
case 11: ct+=*(pc11+u);
|
367 |
+
i-=12;
|
368 |
+
break;
|
369 |
+
}
|
370 |
+
i++;
|
371 |
+
}
|
372 |
+
(harr[ct&tnum])++;
|
373 |
+
}
|
374 |
+
|
375 |
+
//Set start points in harr.
|
376 |
+
k=0;
|
377 |
+
for(i=0;i<tnum+2;i++){
|
378 |
+
j=harr[i];
|
379 |
+
harr[i]=k;
|
380 |
+
k+=j;
|
381 |
+
}
|
382 |
+
if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}
|
383 |
+
|
384 |
+
//Write out harr.
|
385 |
+
bin_Writ(mz,"ha",(tnum+2)*sizeof(long),(char*)harr);
|
386 |
+
|
387 |
+
//Set addresses
|
388 |
+
char **addt=new char*[nwrds];
|
389 |
+
Lst.node_first();
|
390 |
+
while(Lst.node_next()){
|
391 |
+
uptr=cptr=Lst.show_str();
|
392 |
+
ct=0;
|
393 |
+
i=0;
|
394 |
+
while(u=*(cptr++)){
|
395 |
+
switch(i){
|
396 |
+
case 0: ct+=*(pc0+u);
|
397 |
+
break;
|
398 |
+
case 1: ct+=*(pc1+u);
|
399 |
+
break;
|
400 |
+
case 2: ct+=*(pc2+u);
|
401 |
+
break;
|
402 |
+
case 3: ct+=*(pc3+u);
|
403 |
+
break;
|
404 |
+
case 4: ct+=*(pc4+u);
|
405 |
+
break;
|
406 |
+
case 5: ct+=*(pc5+u);
|
407 |
+
break;
|
408 |
+
case 6: ct+=*(pc6+u);
|
409 |
+
break;
|
410 |
+
case 7: ct+=*(pc7+u);
|
411 |
+
break;
|
412 |
+
case 8: ct+=*(pc8+u);
|
413 |
+
break;
|
414 |
+
case 9: ct+=*(pc9+u);
|
415 |
+
break;
|
416 |
+
case 10: ct+=*(pc10+u);
|
417 |
+
break;
|
418 |
+
case 11: ct+=*(pc11+u);
|
419 |
+
i-=12;
|
420 |
+
break;
|
421 |
+
}
|
422 |
+
i++;
|
423 |
+
}
|
424 |
+
k=ct&tnum;
|
425 |
+
addt[harr[k]]=uptr;
|
426 |
+
(harr[k])++;
|
427 |
+
}
|
428 |
+
|
429 |
+
//Write out string file
|
430 |
+
pfout=get_Ostr(mz,"str");
|
431 |
+
k=0;
|
432 |
+
for(i=0;i<nwrds;i++){
|
433 |
+
*pfout << addt[i] << ends;
|
434 |
+
len=strlen((char*)addt[i])+1;
|
435 |
+
addt[i]=(char*)k;
|
436 |
+
k+=len;
|
437 |
+
}
|
438 |
+
dst_Ostr(pfout);
|
439 |
+
|
440 |
+
//Write out addr file
|
441 |
+
bin_Writ(mz,"ad",nwrds*sizeof(long),(char*)addt);
|
442 |
+
delete [] addt;
|
443 |
+
|
444 |
+
//Write out counts
|
445 |
+
pfout=get_Ostr(mz,"nm");
|
446 |
+
*pfout << nwrds << " " << tnum << " " << k << endl;
|
447 |
+
dst_Ostr(pfout);
|
448 |
+
delete [] harr;
|
449 |
+
delete [] farr;
|
450 |
+
}
|
451 |
+
|
452 |
+
void Hash::gopen_htable_map(void){
|
453 |
+
char cnam[max_str],*cptr;
|
454 |
+
int fld;
|
455 |
+
long ct,asize,i;
|
456 |
+
|
457 |
+
ifstream *pfin=get_Istr("nm");
|
458 |
+
*pfin >> nwrds >> tnum >> asize;
|
459 |
+
dst_Istr(pfin);
|
460 |
+
|
461 |
+
harr=(long*)get_Mmap("ha");
|
462 |
+
addr=(long*)get_Mmap("ad");
|
463 |
+
strmap=get_Mmap("str");
|
464 |
+
|
465 |
+
farr=new long[1536];
|
466 |
+
ct=1;
|
467 |
+
for(i=0;i<1536;i++){
|
468 |
+
farr[i]=ct=(ct*331)&tnum;
|
469 |
+
}
|
470 |
+
|
471 |
+
px0=farr,px1=farr+128,px2=farr+256;
|
472 |
+
px3=farr+384,px4=farr+512,px5=farr+640;
|
473 |
+
px6=farr+768,px7=farr+896,px8=farr+1024;
|
474 |
+
px9=farr+1152,px10=farr+1280,px11=farr+1408;
|
475 |
+
}
|
476 |
+
|
477 |
+
void Hash::gopen_htable_map(int mz){
|
478 |
+
char cnam[max_str],*cptr;
|
479 |
+
int fld;
|
480 |
+
long ct,asize,i;
|
481 |
+
|
482 |
+
ifstream *pfin=get_Istr(mz,"nm");
|
483 |
+
*pfin >> nwrds >> tnum >> asize;
|
484 |
+
dst_Istr(pfin);
|
485 |
+
|
486 |
+
harr=(long*)get_Mmap(mz,"ha");
|
487 |
+
addr=(long*)get_Mmap(mz,"ad");
|
488 |
+
strmap=get_Mmap(mz,"str");
|
489 |
+
|
490 |
+
farr=new long[1536];
|
491 |
+
ct=1;
|
492 |
+
for(i=0;i<1536;i++){
|
493 |
+
farr[i]=ct=(ct*331)&tnum;
|
494 |
+
}
|
495 |
+
|
496 |
+
px0=farr,px1=farr+128,px2=farr+256;
|
497 |
+
px3=farr+384,px4=farr+512,px5=farr+640;
|
498 |
+
px6=farr+768,px7=farr+896,px8=farr+1024;
|
499 |
+
px9=farr+1152,px10=farr+1280,px11=farr+1408;
|
500 |
+
}
|
501 |
+
|
502 |
+
void Hash::gopen_htable_copy(Hash *pH){
|
503 |
+
char cnam[max_str],*cptr;
|
504 |
+
int fld;
|
505 |
+
long ct,asize,i;
|
506 |
+
|
507 |
+
nwrds=pH->nwrds;
|
508 |
+
tnum=pH->tnum;
|
509 |
+
|
510 |
+
harr=pH->harr;
|
511 |
+
addr=pH->addr;
|
512 |
+
strmap=pH->strmap;
|
513 |
+
|
514 |
+
farr=pH->farr;
|
515 |
+
|
516 |
+
px0=farr,px1=farr+128,px2=farr+256;
|
517 |
+
px3=farr+384,px4=farr+512,px5=farr+640;
|
518 |
+
px6=farr+768,px7=farr+896,px8=farr+1024;
|
519 |
+
px9=farr+1152,px10=farr+1280,px11=farr+1408;
|
520 |
+
}
|
521 |
+
|
522 |
+
long Hash::find(const char *str){
|
523 |
+
register long ct=0,i=0,k;
|
524 |
+
register int ic;
|
525 |
+
register const char *utr=str;
|
526 |
+
while(ic=*(utr++)){
|
527 |
+
switch(i){
|
528 |
+
case 0: ct+=*(px0+ic);
|
529 |
+
break;
|
530 |
+
case 1: ct+=*(px1+ic);
|
531 |
+
break;
|
532 |
+
case 2: ct+=*(px2+ic);
|
533 |
+
break;
|
534 |
+
case 3: ct+=*(px3+ic);
|
535 |
+
break;
|
536 |
+
case 4: ct+=*(px4+ic);
|
537 |
+
break;
|
538 |
+
case 5: ct+=*(px5+ic);
|
539 |
+
break;
|
540 |
+
case 6: ct+=*(px6+ic);
|
541 |
+
break;
|
542 |
+
case 7: ct+=*(px7+ic);
|
543 |
+
break;
|
544 |
+
case 8: ct+=*(px8+ic);
|
545 |
+
break;
|
546 |
+
case 9: ct+=*(px9+ic);
|
547 |
+
break;
|
548 |
+
case 10: ct+=*(px10+ic);
|
549 |
+
break;
|
550 |
+
case 11: ct+=*(px11+ic);
|
551 |
+
i-=12;
|
552 |
+
break;
|
553 |
+
}
|
554 |
+
i++;
|
555 |
+
}
|
556 |
+
k=ct&tnum;
|
557 |
+
ct=harr[k+1];
|
558 |
+
i=harr[k];
|
559 |
+
//cout << k << " " << i << " " << addr[i] << " " << ct << " " << addr[ct] << endl;
|
560 |
+
switch(ct-i){
|
561 |
+
case 0: return(0);
|
562 |
+
break;
|
563 |
+
case 1: if(!strcmp(str,strmap+addr[i]))return(i+1);
|
564 |
+
else return(0);
|
565 |
+
break;
|
566 |
+
case 2: ic=strcmp(str,strmap+addr[i]);
|
567 |
+
if(ic>0){
|
568 |
+
if(!strcmp(str,strmap+addr[i+1]))return(i+2);
|
569 |
+
else return(0);
|
570 |
+
}
|
571 |
+
else if(ic<0)return(0);
|
572 |
+
else return(i+1);
|
573 |
+
break;
|
574 |
+
default: ic=strcmp(str,strmap+addr[i]);
|
575 |
+
if(ic<0)return(0);
|
576 |
+
else if(!ic)return(i+1);
|
577 |
+
ct--;
|
578 |
+
ic=strcmp(str,strmap+addr[ct]);
|
579 |
+
if(ic>0)return(0);
|
580 |
+
else if(!ic)return(ct+1);
|
581 |
+
while(ct-i>1){
|
582 |
+
k=(ct+i)/2;
|
583 |
+
ic=strcmp(str,strmap+addr[k]);
|
584 |
+
if(ic>0)i=k;
|
585 |
+
else if(ic<0)ct=k;
|
586 |
+
else return(k+1);
|
587 |
+
}
|
588 |
+
return(0);
|
589 |
+
}
|
590 |
+
}
|
591 |
+
|
592 |
+
void Hash::gclose_htable_map(void){
|
593 |
+
dst_Mmap("ha",(char*)harr);
|
594 |
+
dst_Mmap("ad",(char*)addr);
|
595 |
+
dst_Mmap("str",strmap);
|
596 |
+
delete [] farr;
|
597 |
+
}
|
598 |
+
|
599 |
+
void Hash::gclose_htable_map(int mz){
|
600 |
+
dst_Mmap(mz,"ha",(char*)harr);
|
601 |
+
dst_Mmap(mz,"ad",(char*)addr);
|
602 |
+
dst_Mmap(mz,"str",strmap);
|
603 |
+
delete [] farr;
|
604 |
+
}
|
605 |
+
|
606 |
+
//Chash code
|
607 |
+
|
608 |
+
Chash::Chash() : Hash(){
|
609 |
+
change_type("cshset");
|
610 |
+
}
|
611 |
+
|
612 |
+
Chash::Chash(const char *str) : Hash(str){
|
613 |
+
change_type("cshset");
|
614 |
+
}
|
615 |
+
|
616 |
+
Chash::Chash(int n,const char *str) : Hash(n,str){
|
617 |
+
change_type("cshset");
|
618 |
+
}
|
619 |
+
|
620 |
+
Chash::~Chash(void){}
|
621 |
+
|
622 |
+
void Chash::create_ctable(Count &Ct,int excess){
|
623 |
+
create_htable(Ct,excess);
|
624 |
+
gopen_htable_map();
|
625 |
+
long n,i=0;
|
626 |
+
long *pct=new long[Ct.cnt_key];
|
627 |
+
Ct.node_first();
|
628 |
+
while(Ct.node_next()){
|
629 |
+
if(n=find(Ct.show_str())){
|
630 |
+
pct[n-1]=Ct.count();
|
631 |
+
}
|
632 |
+
else {
|
633 |
+
cout << "Error in Count tree!" << endl;exit(0);
|
634 |
+
}
|
635 |
+
mark(++i,10000,"count terms");
|
636 |
+
}
|
637 |
+
bin_Writ("ct",Ct.cnt_key*sizeof(long),(char*)pct);
|
638 |
+
delete [] pct;
|
639 |
+
cnt=(long*)get_Mmap("ct");
|
640 |
+
gclose_htable_map();
|
641 |
+
}
|
642 |
+
|
643 |
+
void Chash::create_ctable(List &Lt,int excess){
|
644 |
+
create_htable(Lt,excess);
|
645 |
+
gopen_htable_map();
|
646 |
+
long n,i=1;
|
647 |
+
long *pct=new long[Lt.cnt_key];
|
648 |
+
Lt.node_first();
|
649 |
+
while(Lt.node_next()){
|
650 |
+
if(n=find(Lt.show_str())){
|
651 |
+
pct[n-1]=i;
|
652 |
+
}
|
653 |
+
else {
|
654 |
+
cout << "Error in List tree!" << endl;exit(0);
|
655 |
+
}
|
656 |
+
mark(++i,10000,"count terms");
|
657 |
+
}
|
658 |
+
bin_Writ("ct",Lt.cnt_key*sizeof(long),(char*)pct);
|
659 |
+
delete [] pct;
|
660 |
+
cnt=(long*)get_Mmap("ct");
|
661 |
+
gclose_htable_map();
|
662 |
+
}
|
663 |
+
|
664 |
+
void Chash::create_ctable(int mz,Count &Ct,int excess){
|
665 |
+
create_htable(mz,Ct,excess);
|
666 |
+
gopen_htable_map(mz);
|
667 |
+
long n,i=0;
|
668 |
+
long *pct=new long[Ct.cnt_key];
|
669 |
+
Ct.node_first();
|
670 |
+
while(Ct.node_next()){
|
671 |
+
if(n=find(Ct.show_str())){
|
672 |
+
pct[n-1]=Ct.count();
|
673 |
+
}
|
674 |
+
else {
|
675 |
+
cout << "Error in Count tree!" << endl;exit(0);
|
676 |
+
}
|
677 |
+
mark(++i,10000,"count terms");
|
678 |
+
}
|
679 |
+
bin_Writ(mz,"ct",Ct.cnt_key*sizeof(long),(char*)pct);
|
680 |
+
delete [] pct;
|
681 |
+
cnt=(long*)get_Mmap(mz,"ct");
|
682 |
+
gclose_htable_map(mz);
|
683 |
+
}
|
684 |
+
|
685 |
+
void Chash::create_ctable(int mz,List &Lt,int excess){
|
686 |
+
create_htable(mz,Lt,excess);
|
687 |
+
gopen_htable_map(mz);
|
688 |
+
long n,i=1;
|
689 |
+
long *pct=new long[Lt.cnt_key];
|
690 |
+
Lt.node_first();
|
691 |
+
while(Lt.node_next()){
|
692 |
+
if(n=find(Lt.show_str())){
|
693 |
+
pct[n-1]=i;
|
694 |
+
}
|
695 |
+
else {
|
696 |
+
cout << "Error in List tree!" << endl;exit(0);
|
697 |
+
}
|
698 |
+
mark(++i,10000,"count terms");
|
699 |
+
}
|
700 |
+
bin_Writ(mz,"ct",Lt.cnt_key*sizeof(long),(char*)pct);
|
701 |
+
delete [] pct;
|
702 |
+
cnt=(long*)get_Mmap(mz,"ct");
|
703 |
+
gclose_htable_map(mz);
|
704 |
+
}
|
705 |
+
|
706 |
+
void Chash::gopen_ctable_map(void){
|
707 |
+
gopen_htable_map();
|
708 |
+
cnt=(long*)get_Mmap("ct");
|
709 |
+
}
|
710 |
+
|
711 |
+
void Chash::gopen_ctable_map(int mz){
|
712 |
+
gopen_htable_map(mz);
|
713 |
+
cnt=(long*)get_Mmap(mz,"ct");
|
714 |
+
}
|
715 |
+
|
716 |
+
void Chash::gclose_ctable_map(void){
|
717 |
+
gclose_htable_map();
|
718 |
+
dst_Mmap("ct",(char*)cnt);
|
719 |
+
}
|
720 |
+
|
721 |
+
void Chash::gclose_ctable_map(int mz){
|
722 |
+
gclose_htable_map(mz);
|
723 |
+
dst_Mmap(mz,"ct",(char*)cnt);
|
724 |
+
}
|
725 |
+
|
726 |
+
long Chash::count(const char *str){
|
727 |
+
long n=find(str);
|
728 |
+
if(n)return(cnt[n-1]);
|
729 |
+
else return(0);
|
730 |
+
}
|
731 |
+
|
732 |
+
}
|
733 |
+
|
Library/Hash.h
CHANGED
@@ -1,92 +1,92 @@
|
|
1 |
-
#ifndef HASH_H
|
2 |
-
#define HASH_H
|
3 |
-
|
4 |
-
#include <iostream>
|
5 |
-
#include <fstream>
|
6 |
-
#include <Btree.h>
|
7 |
-
#include <FBase.h>
|
8 |
-
|
9 |
-
namespace iret {
|
10 |
-
|
11 |
-
class Hash : public FBase {
|
12 |
-
public:
|
13 |
-
Hash(void);
|
14 |
-
Hash(const char *nm);
|
15 |
-
Hash(int n,const char *nm); //n gets appended to type if >-1
|
16 |
-
~Hash();
|
17 |
-
|
18 |
-
void create_htable(List &Lst,int excess); //"str" for file of strings,
|
19 |
-
//"ad" for address file, "nm" numbers,
|
20 |
-
//"ha" hash array. Excess is # powers of 2 above size.
|
21 |
-
void create_htableM(List &Lst,int excess); //creates in memory ready for use
|
22 |
-
//and no need to call gopen or gclose functions
|
23 |
-
void create_htable(int mz,List &Lst,int excess); //"str" for file of strings,
|
24 |
-
//Creates a numbered version of above
|
25 |
-
|
26 |
-
void gopen_htable_map(void); //Creates memory maps
|
27 |
-
void gopen_htable_map(int mz); //Creates memory maps
|
28 |
-
void gclose_htable_map(void); //Destroys memory maps
|
29 |
-
//and deletes memory
|
30 |
-
void gclose_htable_map(int mz); //Destroys memory maps
|
31 |
-
//and deletes memory
|
32 |
-
void gopen_htable_copy(Hash *pH); //Copies memory maps
|
33 |
-
|
34 |
-
long find(const char *str); //Return number+1 if present, else 0.
|
35 |
-
//Number is not lexical order but hash order and then lexical
|
36 |
-
//within collision groups.
|
37 |
-
|
38 |
-
//Data
|
39 |
-
char *strmap; //Holds the mapped string data.
|
40 |
-
long *addr; //Holds the offsets to strmap.
|
41 |
-
long nwrds; //Number of words.
|
42 |
-
long tnum; //Truncation number, size of harr.
|
43 |
-
long *harr; //Holds hash array.
|
44 |
-
long *farr; //Holds the hash coefficients.
|
45 |
-
long *px0;
|
46 |
-
long *px1;
|
47 |
-
long *px2;
|
48 |
-
long *px3;
|
49 |
-
long *px4;
|
50 |
-
long *px5;
|
51 |
-
long *px6;
|
52 |
-
long *px7;
|
53 |
-
long *px8;
|
54 |
-
long *px9;
|
55 |
-
long *px10;
|
56 |
-
long *px11;
|
57 |
-
};
|
58 |
-
|
59 |
-
class Chash : public Hash {
|
60 |
-
public:
|
61 |
-
Chash(void);
|
62 |
-
Chash(const char *nm);
|
63 |
-
Chash(int n,const char *nm); //n gets appended to type if >-1
|
64 |
-
~Chash(void);
|
65 |
-
|
66 |
-
void create_ctable(Count &Ct,int excess); //Adds "ct" for counts
|
67 |
-
//Calls create_htable and then produces the array of counts.
|
68 |
-
void create_ctable(int mz,Count &Ct,int excess); //Adds "ct" for counts
|
69 |
-
//Creates a numbered version of above
|
70 |
-
void create_ctable(List &Lt,int excess); //Adds "ct" for term #
|
71 |
-
//and starts the count at 1 and in lexical order. count() will
|
72 |
-
//return 0 if term not in list.
|
73 |
-
void create_ctable(int mz,List &Lt,int excess); //Adds "ct" for term #
|
74 |
-
//Creates a numbered version of above
|
75 |
-
|
76 |
-
void gopen_ctable_map(void); //Calls gopen_htable_map and also
|
77 |
-
//maps "ct" file.
|
78 |
-
void gopen_ctable_map(int mz); //Calls gopen_htable_map and also
|
79 |
-
//maps "ct" file.
|
80 |
-
void gclose_ctable_map(void); //Calls gclose_htable_map and also
|
81 |
-
//Unmaps "ct" file.
|
82 |
-
void gclose_ctable_map(int mz); //Calls gclose_htable_map and also
|
83 |
-
//Unmaps "ct" file.
|
84 |
-
|
85 |
-
long count(const char *str); //Returns count if present, else 0.
|
86 |
-
|
87 |
-
//Data
|
88 |
-
long *cnt;
|
89 |
-
};
|
90 |
-
|
91 |
-
}
|
92 |
-
#endif
|
|
|
1 |
+
#ifndef HASH_H
|
2 |
+
#define HASH_H
|
3 |
+
|
4 |
+
#include <iostream>
|
5 |
+
#include <fstream>
|
6 |
+
#include <Btree.h>
|
7 |
+
#include <FBase.h>
|
8 |
+
|
9 |
+
namespace iret {
|
10 |
+
|
11 |
+
class Hash : public FBase {
|
12 |
+
public:
|
13 |
+
Hash(void);
|
14 |
+
Hash(const char *nm);
|
15 |
+
Hash(int n,const char *nm); //n gets appended to type if >-1
|
16 |
+
~Hash();
|
17 |
+
|
18 |
+
void create_htable(List &Lst,int excess); //"str" for file of strings,
|
19 |
+
//"ad" for address file, "nm" numbers,
|
20 |
+
//"ha" hash array. Excess is # powers of 2 above size.
|
21 |
+
void create_htableM(List &Lst,int excess); //creates in memory ready for use
|
22 |
+
//and no need to call gopen or gclose functions
|
23 |
+
void create_htable(int mz,List &Lst,int excess); //"str" for file of strings,
|
24 |
+
//Creates a numbered version of above
|
25 |
+
|
26 |
+
void gopen_htable_map(void); //Creates memory maps
|
27 |
+
void gopen_htable_map(int mz); //Creates memory maps
|
28 |
+
void gclose_htable_map(void); //Destroys memory maps
|
29 |
+
//and deletes memory
|
30 |
+
void gclose_htable_map(int mz); //Destroys memory maps
|
31 |
+
//and deletes memory
|
32 |
+
void gopen_htable_copy(Hash *pH); //Copies memory maps
|
33 |
+
|
34 |
+
long find(const char *str); //Return number+1 if present, else 0.
|
35 |
+
//Number is not lexical order but hash order and then lexical
|
36 |
+
//within collision groups.
|
37 |
+
|
38 |
+
//Data
|
39 |
+
char *strmap; //Holds the mapped string data.
|
40 |
+
long *addr; //Holds the offsets to strmap.
|
41 |
+
long nwrds; //Number of words.
|
42 |
+
long tnum; //Truncation number, size of harr.
|
43 |
+
long *harr; //Holds hash array.
|
44 |
+
long *farr; //Holds the hash coefficients.
|
45 |
+
long *px0;
|
46 |
+
long *px1;
|
47 |
+
long *px2;
|
48 |
+
long *px3;
|
49 |
+
long *px4;
|
50 |
+
long *px5;
|
51 |
+
long *px6;
|
52 |
+
long *px7;
|
53 |
+
long *px8;
|
54 |
+
long *px9;
|
55 |
+
long *px10;
|
56 |
+
long *px11;
|
57 |
+
};
|
58 |
+
|
59 |
+
class Chash : public Hash {
|
60 |
+
public:
|
61 |
+
Chash(void);
|
62 |
+
Chash(const char *nm);
|
63 |
+
Chash(int n,const char *nm); //n gets appended to type if >-1
|
64 |
+
~Chash(void);
|
65 |
+
|
66 |
+
void create_ctable(Count &Ct,int excess); //Adds "ct" for counts
|
67 |
+
//Calls create_htable and then produces the array of counts.
|
68 |
+
void create_ctable(int mz,Count &Ct,int excess); //Adds "ct" for counts
|
69 |
+
//Creates a numbered version of above
|
70 |
+
void create_ctable(List &Lt,int excess); //Adds "ct" for term #
|
71 |
+
//and starts the count at 1 and in lexical order. count() will
|
72 |
+
//return 0 if term not in list.
|
73 |
+
void create_ctable(int mz,List &Lt,int excess); //Adds "ct" for term #
|
74 |
+
//Creates a numbered version of above
|
75 |
+
|
76 |
+
void gopen_ctable_map(void); //Calls gopen_htable_map and also
|
77 |
+
//maps "ct" file.
|
78 |
+
void gopen_ctable_map(int mz); //Calls gopen_htable_map and also
|
79 |
+
//maps "ct" file.
|
80 |
+
void gclose_ctable_map(void); //Calls gclose_htable_map and also
|
81 |
+
//Unmaps "ct" file.
|
82 |
+
void gclose_ctable_map(int mz); //Calls gclose_htable_map and also
|
83 |
+
//Unmaps "ct" file.
|
84 |
+
|
85 |
+
long count(const char *str); //Returns count if present, else 0.
|
86 |
+
|
87 |
+
//Data
|
88 |
+
long *cnt;
|
89 |
+
};
|
90 |
+
|
91 |
+
}
|
92 |
+
#endif
|
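A note on the excess parameter declared above: create_htable sizes the table at the power of two at or below cnt_key, doubles it excess more times, then stores size-1 in tnum so that ct & tnum is the bucket index. Worked numbers under that logic:

    // nwrds = 1000, excess = 2:
    //   while(ct=ct/2) runs 9 times -> tnum = 512 (2^9, at or below 1000)
    //   two excess doublings        -> tnum = 2048
    //   tnum--                      -> tnum = 2047 = 0x7FF, the bucket mask
    // A larger excess buys smaller collision groups at the cost of table space.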
Library/MPtok.C
CHANGED
@@ -1,2036 +1,2036 @@
|
|
1 |
-
#include <stdio.h>
|
2 |
-
#include <ctype.h>
|
3 |
-
#include <string.h>
|
4 |
-
#include <stdlib.h>
|
5 |
-
|
6 |
-
#include <string>
|
7 |
-
#include <iostream>
|
8 |
-
#include <fstream>
|
9 |
-
#include <sstream>
|
10 |
-
|
11 |
-
#include "MPtok.h"
|
12 |
-
|
13 |
-
// These options are probably compile time constants
|
14 |
-
|
15 |
-
static char option_tagsep = '_'; // The tagsep character
|
16 |
-
static char option_replacesep = '-'; // Replace tagsep with this
|
17 |
-
|
18 |
-
static void chomp(char *line)
|
19 |
-
{
|
20 |
-
int i;
|
21 |
-
|
22 |
-
i = strlen(line) - 1;
|
23 |
-
while (i >= 0 && (line[i] == '\n' || line[i] == '\r'))
|
24 |
-
line[i--] = '\0';
|
25 |
-
}
|
26 |
-
|
27 |
-
// Data structure and algorithm for finding common pairs.
|
28 |
-
|
29 |
-
// read a file of pairs into a data structure,
|
30 |
-
// the file must be sorted first
|
31 |
-
|
32 |
-
void MPtok::init_pair(const string& file_name)
|
33 |
-
{
|
34 |
-
filebuf fb;
|
35 |
-
fb.open(file_name.c_str(), ios::in);
|
36 |
-
istream is(&fb);
|
37 |
-
string pair;
|
38 |
-
|
39 |
-
while (1)
|
40 |
-
{
|
41 |
-
getline(is, pair);
|
42 |
-
if (is.fail()) break;
|
43 |
-
if (pair.size() > 0) common_pair.insert(pair);
|
44 |
-
}
|
45 |
-
|
46 |
-
fb.close();
|
47 |
-
}
|
48 |
-
|
49 |
-
// List of abbreviations in 3 categories
|
50 |
-
// ABB = can occur mid sentence
|
51 |
-
// EOS = can occur at end of sentence
|
52 |
-
// NUM = only used before numbers
|
53 |
-
|
54 |
-
void MPtok::init_abbr(const string& file_name)
|
55 |
-
{
|
56 |
-
filebuf fb;
|
57 |
-
fb.open(file_name.c_str(), ios::in);
|
58 |
-
istream is(&fb);
|
59 |
-
string typ, abb;
|
60 |
-
map<string,int> val;
|
61 |
-
val["ABB"] = ABB_ABB; val["EOS"] = ABB_EOS; val["NUM"] = ABB_NUM;
|
62 |
-
|
63 |
-
while (is.good())
|
64 |
-
{
|
65 |
-
is >> typ;
|
66 |
-
if (val.count(typ))
|
67 |
-
{
|
68 |
-
is >> abb;
|
69 |
-
if (abb.size() > 0) common_abbr[abb] = val[typ];
|
70 |
-
}
|
71 |
-
}
|
72 |
-
fb.close();
|
73 |
-
}
|
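init_abbr reads whitespace-separated TYPE ABBREV pairs and keeps only the three known types. The data file itself is not part of this diff; from the parser, its format would look like the following (these entries are hypothetical, not from the shipped list):

    ABB e.g.
    ABB i.e.
    EOS etc.
    NUM No.
    NUM Fig.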
74 |
-
|
75 |
-
static char nextchar(const char *t, int i)
|
76 |
-
{
|
77 |
-
while (isspace(t[i])) i++;
|
78 |
-
return t[i];
|
79 |
-
}
|
80 |
-
|
81 |
-
// Look for a token at or prior to the text position
|
82 |
-
|
83 |
-
static int lookbehind(const char *t, int i, const char *s, int *tokflag)
|
84 |
-
{
|
85 |
-
int k = (int) strlen(s) - 1;
|
86 |
-
|
87 |
-
while (i > 0 && isspace(t[i])) i--;
|
88 |
-
|
89 |
-
while (k >= 0 && i >= 0)
|
90 |
-
{
|
91 |
-
if (k > 0 && tokflag[i]) break;
|
92 |
-
|
93 |
-
if (tolower(s[k]) != tolower(t[i]))
|
94 |
-
return -1;
|
95 |
-
k--;
|
96 |
-
i--;
|
97 |
-
}
|
98 |
-
|
99 |
-
return (k < 0 && tokflag[i+1]) ? i + 1 : -1;
|
100 |
-
}
|
101 |
-
|
102 |
-
// Look for a token at or following the text position
|
103 |
-
|
104 |
-
static int lookahead(const char *t, int i, const char *s, int *tokflag)
|
105 |
-
{
|
106 |
-
int k = 0;
|
107 |
-
|
108 |
-
while (isspace(t[i])) i++;
|
109 |
-
|
110 |
-
while (k < strlen(s) && i < strlen(t))
|
111 |
-
{
|
112 |
-
if (k > 0 && tokflag[i]) break;
|
113 |
-
|
114 |
-
if (tolower(s[k]) != tolower(t[i]))
|
115 |
-
return -1;
|
116 |
-
k++;
|
117 |
-
i++;
|
118 |
-
}
|
119 |
-
|
120 |
-
return (k == strlen(s) && tokflag[i]) ? i - (int) strlen(s) : -1;
|
121 |
-
}
|
122 |
-
|
123 |
-
// Set the initial tokens at spaces
|
124 |
-
|
125 |
-
void MPtok::tok_0()
|
126 |
-
{
|
127 |
-
int i;
|
128 |
-
|
129 |
-
tokflag[0] = 1;
|
130 |
-
for (i = 1; i < text_len; i++)
|
131 |
-
{
|
132 |
-
tokflag[i] = isspace(text[i]) || (i > 0 && isspace(text[i - 1])) ? 1 : 0;
|
133 |
-
}
|
134 |
-
tokflag[i] = 1;
|
135 |
-
}
|
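Every tok_* pass below works on the same parallel array: tokflag[i] == 1 marks a token starting at text[i]; tok_0 above seeds boundaries at whitespace edges, and the later passes set or clear flags to split and merge tokens. A hedged sketch (not part of MPtok) of how tokens would be read back out of that representation, using only the <cstdio>/<ctype.h> facilities already included above:

    // Materialize the tokens implied by text/tokflag/text_len.
    void print_tokens(const char *text, const int *tokflag, int text_len)
    {
        for (int i = 0; i < text_len; ) {
            int j = i + 1;
            while (j < text_len && !tokflag[j]) j++;   // run to the next boundary
            if (!isspace((unsigned char)text[i]))      // whitespace runs are gaps
                printf("%.*s\n", j - i, text + i);
            i = j;
        }
    }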
136 |
-
|
137 |
-
// Get quotes preceded by open parens
|
138 |
-
//
|
139 |
-
// A double quote, preceded by a space or open bracket is a separate token
|
140 |
-
//
|
141 |
-
|
142 |
-
void MPtok::tok_1()
|
143 |
-
{
|
144 |
-
for (int i = 1; i < text_len; i++)
|
145 |
-
{
|
146 |
-
if (text[i] == '"' && strchr("([{<", text[i-1]))
|
147 |
-
{
|
148 |
-
tokflag[i] = 1;
|
149 |
-
if (i + 1 < text_len) tokflag[i+1] = 1;
|
150 |
-
}
|
151 |
-
}
|
152 |
-
}
|
153 |
-
|
154 |
-
// Look for ellipses
|
155 |
-
//
|
156 |
-
// Three dots in a row is a separate token
|
157 |
-
|
158 |
-
void MPtok::tok_2()
|
159 |
-
{
|
160 |
-
for (int i = 1; i + 2 < text_len; i++)
|
161 |
-
{
|
162 |
-
if (strncmp(&text[i], "...", 3) == 0)
|
163 |
-
{
|
164 |
-
tokflag[i] = 1;
|
165 |
-
if (i + 3 < text_len) tokflag[i+3] = 1;
|
166 |
-
}
|
167 |
-
}
|
168 |
-
}
|
169 |
-
|
170 |
-
// Non-sentence-ending punctuation
|
171 |
-
//
|
172 |
-
// Certain punctuation characters are separate tokens
|
173 |
-
|
174 |
-
void MPtok::tok_3()
|
175 |
-
{
|
176 |
-
for (int i = 0; i < text_len; i++)
|
177 |
-
{
|
178 |
-
// If it is a comma and the next char is not a space and option_comma = 0
|
179 |
-
|
180 |
-
if (option_comma == 0 && text[i] == ',' && isspace(text[i + 1]) == 0)
|
181 |
-
{
|
182 |
-
// do nothing
|
183 |
-
} else if (strchr(",;:@#$%&", text[i]))
|
184 |
-
{
|
185 |
-
tokflag[i] = 1;
|
186 |
-
tokflag[i + 1] = 1;
|
187 |
-
}
|
188 |
-
}
|
189 |
-
}
|
190 |
-
|
191 |
-
// Separate the slashes
|
192 |
-
//
|
193 |
-
// Slashes are a separate token
|
194 |
-
// except for +/-, +/+, -/-, -/+, and and/or.
|
195 |
-
|
196 |
-
void MPtok::tok_5_6_7()
|
197 |
-
{
|
198 |
-
for (int i = 0; i < text_len; i++)
|
199 |
-
{
|
200 |
-
if (text[i] == '/')
|
201 |
-
{
|
202 |
-
tokflag[i] = 1;
|
203 |
-
if (i+1 < text_len) tokflag[i+1] = 1;
|
204 |
-
|
205 |
-
// Put back +/-, etc, unless option_hyphen is 1
|
206 |
-
|
207 |
-
if (i - 1 >= 0
|
208 |
-
&& i + 1 < text_len
|
209 |
-
&& ((option_new < 9
|
210 |
-
&& text[i - 1] == '+' || (text[i - 1] == '-' && option_hyphen == 0)
|
211 |
-
&& text[i + 1] == '+' || (text[i + 1] == '-' && option_hyphen == 0))
|
212 |
-
|| (option_new >= 9
|
213 |
-
&& (text[i - 1] == '+' || text[i - 1] == '-')
|
214 |
-
&& (text[i + 1] == '+' || text[i + 1] == '-'))))
|
215 |
-
{
|
216 |
-
tokflag[i - 1] = 1;
|
217 |
-
tokflag[i] = tokflag[i+1] = 0;
|
218 |
-
tokflag[i + 2] = 1;
|
219 |
-
}
|
220 |
-
|
221 |
-
// Put back and/or, etc
|
222 |
-
|
223 |
-
if (option_new <= 7)
|
224 |
-
{
|
225 |
-
if (i > 5 && strncmp(text + i - 5, " and/or ", 8) == 0)
|
226 |
-
{
|
227 |
-
for (int j = 1; j < 5; j++)
|
228 |
-
tokflag[i - 2 + j] = 0;
|
229 |
-
}
|
230 |
-
} else
|
231 |
-
{
|
232 |
-
if (i > 4 && strncmp(text + i - 4, " and/or ", 8) == 0)
|
233 |
-
{
|
234 |
-
for (int j = 1; j < 6; j++)
|
235 |
-
tokflag[i - 3 + j] = 0;
|
236 |
-
}
|
237 |
-
}
|
238 |
-
}
|
239 |
-
}
|
240 |
-
}
|
241 |
-
|
242 |
-
// All brackets
|
243 |
-
//
|
244 |
-
// Any open or closed bracket is a separate token
|
245 |
-
//
|
246 |
-
// Exclamation and question mark
|
247 |
-
//
|
248 |
-
// Any question or exclamation mark is a separate token
|
249 |
-
|
250 |
-
void MPtok::tok_8_9()
|
251 |
-
{
|
252 |
-
for (int i = 0; i < text_len; i++)
|
253 |
-
{
|
254 |
-
if (strchr("[](){}<>", text[i])
|
255 |
-
|| strchr("?!", text[i]))
|
256 |
-
{
|
257 |
-
tokflag[i] = 1;
|
258 |
-
if (i + 1 < text_len) tokflag[i+1] = 1;
|
259 |
-
}
|
260 |
-
}
|
261 |
-
}
|
262 |
-
|
263 |
-
// Period at the end of a string may be followed by closed-bracket or quote
|
264 |
-
//
|
265 |
-
// A period that is preceded by a non-period
|
266 |
-
// and optionally followed by a close paren
|
267 |
-
// and any amount of space at the end of the string
|
268 |
-
// is a separate token.
|
269 |
-
|
270 |
-
void MPtok::tok_10()
|
271 |
-
{
|
272 |
-
for (int i = text_len - 1; i >= 0; i--)
|
273 |
-
{
|
274 |
-
if (isspace(text[i])) continue;
|
275 |
-
if (strchr("])}>\"'", text[i])) continue;
|
276 |
-
if (text[i] != '.') break;
|
277 |
-
if (text[i] == '.' && (i - 1 < 0 || text[i-1] != '.'))
|
278 |
-
{
|
279 |
-
tokflag[i] = 1;
|
280 |
-
if (i + 1 < text_len) tokflag[i+1] = 1;
|
281 |
-
}
|
282 |
-
}
|
283 |
-
}
|
284 |
-
|
285 |
-
// Period followed by a capitalized word
|
286 |
-
//
|
287 |
-
// A period preceded by a character that is not another period and not a space
|
288 |
-
// and followed by a space then an upper case letter is a separate token
|
289 |
-
|
290 |
-
void MPtok::tok_11()
|
291 |
-
{
|
292 |
-
for (int i = 0; i < text_len; i++)
|
293 |
-
{
|
294 |
-
if (text[i] == '.'
|
295 |
-
&& (i + 1 < text_len && isspace(text[i+1]))
|
296 |
-
&& (i - 1 < 0 || text[i - 1] != '.' || isspace(text[i-1]) == 0)
|
297 |
-
&& isupper(nextchar(text, i + 1)))
|
298 |
-
tokflag[i] = 1;
|
299 |
-
}
|
300 |
-
}
|
301 |
-
|
302 |
-
// A normal word followed by a period
|
303 |
-
//
|
304 |
-
// A period followed by a space
|
305 |
-
// and preceded by 2 or more alphabetic characters or hyphens
|
306 |
-
// is a separate token
|
307 |
-
|
308 |
-
void MPtok::tok_12()
|
309 |
-
{
|
310 |
-
int wcnt = 0;
|
311 |
-
|
312 |
-
for (int i = 0; i < text_len; i++)
|
313 |
-
{
|
314 |
-
if (text[i] == '.'
|
315 |
-
&& tokflag[i + 1]
|
316 |
-
&& wcnt >= 2)
|
317 |
-
tokflag[i] = 1;
|
318 |
-
|
319 |
-
if (isalpha(text[i]) || text[i] == '-')
|
320 |
-
++wcnt;
|
321 |
-
else
|
322 |
-
wcnt = 0;
|
323 |
-
}
|
324 |
-
}
|
325 |
-
|
326 |
-
// A non-normal token (that has no lower case letters) followed by a period
|
327 |
-
//
|
328 |
-
// A period at the end of a token made of characters excluding lower case
|
329 |
-
// is a separate token
|
330 |
-
|
331 |
-
void MPtok::tok_13()
|
332 |
-
{
|
333 |
-
int stok = 0;
|
334 |
-
int wcnt = 0;
|
335 |
-
|
336 |
-
for (int i = 0; i < text_len; i++)
|
337 |
-
{
|
338 |
-
if (text[i] == '.'
|
339 |
-
&& tokflag[i + 1]
|
340 |
-
&& wcnt >= 2)
|
341 |
-
tokflag[i] = 1;
|
342 |
-
|
343 |
-
if (tokflag[i] == 1) stok = 1;
|
344 |
-
|
345 |
-
if (islower(text[i]) || text[i] == '.')
|
346 |
-
{
|
347 |
-
stok = 0;
|
348 |
-
wcnt = 0;
|
349 |
-
}
|
350 |
-
|
351 |
-
if (stok)
|
352 |
-
wcnt++;
|
353 |
-
}
|
354 |
-
}
|
355 |
-
|
356 |
-
// put some periods with single-letter abbreviations
|
357 |
-
//
|
358 |
-
// A single alphabetic token followed by a period followed
|
359 |
-
// by a token that does not begin with an upper case letter
|
360 |
-
// or number is taken to be an abbreviation and the period
|
361 |
-
// does not start a new token.
|
362 |
-
//
|
363 |
-
// NOTE: This does not recognize initials in people's names,
|
364 |
-
// that problem is not simply solved.
|
365 |
-
|
366 |
-
void MPtok::tok_14()
|
367 |
-
{
|
368 |
-
for (int i = 0; i < text_len; i++)
|
369 |
-
{
|
370 |
-
if (text[i] == '.'
|
371 |
-
&& i - 1 >= 0 && isalpha(text[i - 1]) && tokflag[i - 1]
|
372 |
-
&& tokflag[i + 1]
|
373 |
-
&& isupper(nextchar(text, i + 1)) == 0
|
374 |
-
&& isdigit(nextchar(text, i + 1)) == 0
|
375 |
-
&& nextchar(text, i + 1) != '('
|
376 |
-
)
|
377 |
-
{
|
378 |
-
tokflag[i] = 0;
|
379 |
-
}
|
380 |
-
}
|
381 |
-
}
|
382 |
-
|
383 |
-
void MPtok::tok_15()
|
384 |
-
{
|
385 |
-
int i, j, k, a;
|
386 |
-
char buff[MAX_ABB + 1];
|
387 |
-
|
388 |
-
for (i = 0; i < text_len; i++)
|
389 |
-
{
|
390 |
-
// only start at a current token
|
391 |
-
|
392 |
-
if (! tokflag[i]) continue;
|
393 |
-
|
394 |
-
// find alphabetic followed by period
|
395 |
-
|
396 |
-
buff[0] = '\0';
|
397 |
-
for (k = 0; i + k < text_len && k < MAX_ABB; k++)
|
398 |
-
{
|
399 |
-
buff[k] = text[i+k]; buff[k+1] = '\0';
|
400 |
-
if (k > 0 && buff[k] == '.') break; // this is good
|
401 |
-
if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
|
402 |
-
}
|
403 |
-
|
404 |
-
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
|
405 |
-
|
406 |
-
// at this point, buff[k] == '.' add 1 to make it the length
|
407 |
-
|
408 |
-
k++;
|
409 |
-
|
410 |
-
// if not found, try finding a concatenated abbrev
|
411 |
-
|
412 |
-
if (! common_abbr.count(buff))
|
413 |
-
{
|
414 |
-
for (; i + k < text_len && k < MAX_ABB; k++)
|
415 |
-
{
|
416 |
-
buff[k] = text[i+k]; buff[k+1] = '\0';
|
417 |
-
if (k > 0 && buff[k] == '.') break; // this is good
|
418 |
-
if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
|
419 |
-
}
|
420 |
-
|
421 |
-
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
|
422 |
-
|
423 |
-
// at this point, buff[k] == '.' add 1 to make it the length
|
424 |
-
|
425 |
-
k++;
|
426 |
-
}
|
427 |
-
|
428 |
-
// if not found, give up
|
429 |
-
|
430 |
-
if (! common_abbr.count(buff)) continue;
|
431 |
-
|
432 |
-
if (common_abbr[buff] == ABB_NUM)
|
433 |
-
{
|
434 |
-
for (j = i + k; j < text_len && isspace(text[j]); j++) ; // next must be a number
|
435 |
-
if (! isdigit(text[j])) continue; // go to next abbreviation
|
436 |
-
} else if (common_abbr[buff] == ABB_EOS)
|
437 |
-
{
|
438 |
-
for (j = i + k; j < text_len && isspace(text[j]); j++) ; // if next token is upper case letter
|
439 |
-
if (isupper(text[j])) tokflag[i + (--k)] = 1; // tokenize the final period of this abbreviation
|
440 |
-
}
|
441 |
-
|
442 |
-
// clear all token flags
|
443 |
-
|
444 |
-
for (j = 1; j < k; j++) tokflag[i + j] = 0;
|
445 |
-
}
|
446 |
-
}
|
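Summarizing tok_15's three behaviors on a match (the abbreviation entries here are hypothetical):

    // ABB  "vs."  : "A vs. B"       -> '.' stays inside the token "vs."
    // NUM  "No."  : "No. 7"         -> '.' kept only because a digit follows
    //               "No. further"   -> no digit, so the flags are left untouched
    // EOS  "etc." : "etc. The ..."  -> trailing '.' re-tokenized, may end the sentence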
447 |
-
|
448 |
-
// Check for common pairs that should not be considered sentence breaks
|
449 |
-
|
450 |
-
void MPtok::tok_15_1()
|
451 |
-
{
|
452 |
-
int i, j, k, tnum, p;
|
453 |
-
char buff[MAX_ABB + 1];
|
454 |
-
|
455 |
-
for (i = 0; i < text_len; i++)
|
456 |
-
{
|
457 |
-
if (! tokflag[i]) continue;
|
458 |
-
|
459 |
-
// must be alphanumeric token followed by period token followed by space followed by alphanumeric token
|
460 |
-
|
461 |
-
tnum = 0;
|
462 |
-
buff[0] = '\0';
|
463 |
-
for (p = k = 0; i + k < text_len && k < MAX_ABB; k++)
|
464 |
-
{
|
465 |
-
buff[k] = text[i+k]; buff[k+1] = '\0';
|
466 |
-
|
467 |
-
if (isspace(buff[k]))
|
468 |
-
{
|
469 |
-
if (tnum == 2) break; // this is good
|
470 |
-
else if (tnum == 1) continue; // ok
|
471 |
-
else { buff[0] = '\0'; break; } // this shouldn't happen
|
472 |
-
}
|
473 |
-
|
474 |
-
if (tokflag[i+k])
|
475 |
-
{
|
476 |
-
if (tnum > 2) break; // done
|
477 |
-
else tnum++;
|
478 |
-
}
|
479 |
-
|
480 |
-
if (tnum == 1 && buff[k] == '.') p = k;
|
481 |
-
if (tnum == 1 && buff[k] != '.') { buff[0] = '\0'; break; } // nope
|
482 |
-
if (! isalnum(buff[k])) { buff[0] = '\0'; break; } // nope
|
483 |
-
}
|
484 |
-
|
485 |
-
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
|
486 |
-
|
487 |
-
// at this point buff is a potential pair, so untokenize the period, that's all
|
488 |
-
|
489 |
-
if (common_pair.count(buff))
|
490 |
-
tokflag[p] = 0;
|
491 |
-
}
|
492 |
-
}
|
493 |
-
|
494 |
-
// Get cases where a space after a sentence has been omitted
|
495 |
-
//
|
496 |
-
// A period that occurs in a token consisting of alphabetic
|
497 |
-
// letters with a vowel to the left and the right is a
|
498 |
-
// separate token.
|
499 |
-
|
500 |
-
void MPtok::tok_16()
|
501 |
-
{
|
502 |
-
int j;
|
503 |
-
int has_vowel;
|
504 |
-
|
505 |
-
for (int i = 0; i < text_len; i++)
|
506 |
-
{
|
507 |
-
if (text[i] == '.' && tokflag[i] == 0)
|
508 |
-
{
|
509 |
-
has_vowel = 0;
|
510 |
-
for (j = i - 1; j >= 0; --j)
|
511 |
-
{
|
512 |
-
if (isalpha(text[j]) == 0)
|
513 |
-
break;
|
514 |
-
if (strchr("aeiouAEIOU", text[j]))
|
515 |
-
has_vowel = 1;
|
516 |
-
if (tokflag[j])
|
517 |
-
break;
|
518 |
-
}
|
519 |
-
if ((j >= 0 && tokflag[j] == 0) || has_vowel == 0)
|
520 |
-
continue;
|
521 |
-
|
522 |
-
j = i + 1;
|
523 |
-
|
524 |
-
has_vowel = 0;
|
525 |
-
for (; j < text_len && tokflag[j] == 0; ++j)
|
526 |
-
{
|
527 |
-
if (isalpha(text[j]) == 0)
|
528 |
-
break;
|
529 |
-
if (strchr("aeiouAEIOU", text[j]))
|
530 |
-
has_vowel = 1;
|
531 |
-
}
|
532 |
-
|
533 |
-
if ((j < text_len && tokflag[j] == 0) || has_vowel == 0)
|
534 |
-
continue;
|
535 |
-
|
536 |
-
tokflag[i] = 1;
|
537 |
-
tokflag[i + 1] = 1;
|
538 |
-
}
|
539 |
-
}
|
540 |
-
}
|
541 |
-
|
542 |
-
// Correction to tok_16,
|
543 |
-
// Don't count if the token before is a single letter
|
544 |
-
// or the token following is a single letter other than 'a'.
|
545 |
-
// Also, don't count if the token to the right is gov, com, edu, etc.
|
546 |
-
// because those are web addresses!
|
547 |
-
|
548 |
-
#define COMPLEX_WINDOW 40
|
549 |
-
|
550 |
-
enum {COMPLEX_NOT = 0, COMPLEX_YES, COMPLEX_DONE};
|
551 |
-
|
552 |
-
struct _complex {
|
553 |
-
int flag;
|
554 |
-
int offset;
|
555 |
-
const char *str;
|
556 |
-
int len;
|
557 |
-
} complex[] = {
|
558 |
-
COMPLEX_YES, 0, "complex", 7,
|
559 |
-
COMPLEX_NOT, 0, "complexi", 8,
|
560 |
-
COMPLEX_NOT, 0, "complexed", 9,
|
561 |
-
COMPLEX_NOT, 0, "complexa", 8,
|
562 |
-
COMPLEX_NOT, 0, "complex-", 8,
|
563 |
-
COMPLEX_NOT, 0, "complexl", 8,
|
564 |
-
COMPLEX_NOT, 0, "complexu", 8,
|
565 |
-
COMPLEX_NOT, -1, "-complex", 7,
|
566 |
-
COMPLEX_NOT, -2, "nocomplex", 9,
|
567 |
-
COMPLEX_NOT, -3, "subcomplex", 10,
|
568 |
-
COMPLEX_YES, 0, "hybrid", 6,
|
569 |
-
COMPLEX_NOT, 0, "hybridi", 7,
|
570 |
-
COMPLEX_NOT, 0, "hybrido", 7,
|
571 |
-
COMPLEX_NOT, 0, "hybrida", 7,
|
572 |
-
COMPLEX_NOT, 0, "hybrid-", 7,
|
573 |
-
COMPLEX_NOT, -1, "-hybrid", 7,
|
574 |
-
COMPLEX_YES, 0, "duplex", 6,
|
575 |
-
COMPLEX_NOT, -1, "oduplex", 7,
|
576 |
-
COMPLEX_DONE, 0, NULL, 0,
|
577 |
-
};
|
578 |
-
|
579 |
-
int MPtok::complex_check()
|
580 |
-
{
|
581 |
-
int last_period = -2*COMPLEX_WINDOW;
|
582 |
-
int last_complex = -2*COMPLEX_WINDOW;
|
583 |
-
int i, j;
|
584 |
-
int complex_match;
|
585 |
-
|
586 |
-
for (i = 0; i < text_len; i++)
|
587 |
-
{
|
588 |
-
if (text[i] == '.')
|
589 |
-
{
|
590 |
-
if (i - last_complex <= COMPLEX_WINDOW)
|
591 |
-
return 1;
|
592 |
-
last_period = i;
|
593 |
-
}
|
594 |
-
|
595 |
-
complex_match = 0;
|
596 |
-
for (j = 0; complex[j].str; j++)
|
597 |
-
{
|
598 |
-
if (complex[j].flag == COMPLEX_NOT)
|
599 |
-
{
|
600 |
-
if (i + complex[j].offset >= 0
|
601 |
-
&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
|
602 |
-
{
|
603 |
-
// don't match here
|
604 |
-
complex_match = 0;
|
605 |
-
}
|
606 |
-
} else if (complex[j].flag == COMPLEX_YES)
|
607 |
-
{
|
608 |
-
if (i + complex[j].offset >= 0
|
609 |
-
&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
|
610 |
-
{
|
611 |
-
// match here
|
612 |
-
complex_match = 1;
|
613 |
-
}
|
614 |
-
}
|
615 |
-
}
|
616 |
-
|
617 |
-
if (complex_match)
|
618 |
-
{
|
619 |
-
if (i - last_period <= COMPLEX_WINDOW)
|
620 |
-
return 1;
|
621 |
-
last_complex = i;
|
622 |
-
}
|
623 |
-
}
|
624 |
-
return 0;
|
625 |
-
}
|
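complex_check above is a windowed guard: it returns 1 as soon as any period and any COMPLEX_YES match ("complex", "hybrid", "duplex", minus the COMPLEX_NOT derivational forms) fall within COMPLEX_WINDOW (40) characters of each other, and tok_16_1 below bails out entirely in that case, since biological names such as "a.b complex" look exactly like a missing sentence break. For example:

    // "the Myc.Max hybrid construct"  -> '.' within 40 chars of "hybrid",
    //                                    complex_check() == 1, tok_16_1 skipped
    // "no break here.Next sentence"   -> no trigger word in the window,
    //                                    tok_16_1 may still split at the '.'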
626 |
-
|
627 |
-
void MPtok::tok_16_1()
|
628 |
-
{
|
629 |
-
int i, j;
|
630 |
-
char v1, v2;
|
631 |
-
int c1, c2;
|
632 |
-
|
633 |
-
if (option_new == 3 && strstr(text, "complex"))
|
634 |
-
return;
|
635 |
-
|
636 |
-
if (option_new >= 4 && complex_check())
|
637 |
-
return;
|
638 |
-
|
639 |
-
for (i = 0; i < text_len; i++)
|
640 |
-
{
|
641 |
-
if (text[i] == '.' && tokflag[i] == 0)
|
642 |
-
{
|
643 |
-
char suffix[10];
|
644 |
-
int s_i;
|
645 |
-
|
646 |
-
v1 = '\0';
|
647 |
-
c1 = 0;
|
648 |
-
for (j = i - 1; j >= 0; --j)
|
649 |
-
{
|
650 |
-
if (isalpha(text[j]) == 0)
|
651 |
-
break;
|
652 |
-
if (strchr("aeiouAEIOU", text[j]))
|
653 |
-
v1 = tolower(text[j]);
|
654 |
-
c1++;
|
655 |
-
if (tokflag[j])
|
656 |
-
break;
|
657 |
-
}
|
658 |
-
if ((j >= 0 && tokflag[j] == 0)
|
659 |
-
|| v1 == '\0'
|
660 |
-
|| c1 == 1)
|
661 |
-
continue;
|
662 |
-
|
663 |
-
j = i + 1;
|
664 |
-
|
665 |
-
v2 = '\0';
|
666 |
-
c2 = 0;
|
667 |
-
s_i = 0;
|
668 |
-
for (; j < text_len && tokflag[j] == 0; ++j)
|
669 |
-
{
|
670 |
-
if (isalpha(text[j]) == 0)
|
671 |
-
break;
|
672 |
-
if (strchr("aeiouAEIOU", text[j]))
|
673 |
-
v2 = tolower(text[j]);
|
674 |
-
if (s_i < 3)
|
675 |
-
suffix[s_i++] = tolower(text[j]); suffix[s_i] = '\0';
|
676 |
-
c2++;
|
677 |
-
}
|
678 |
-
|
679 |
-
if ((j < text_len && tokflag[j] == 0)
|
680 |
-
|| v2 == '\0'
|
681 |
-
|| (c2 == 1 && v2 != 'a')
|
682 |
-
|| (c2 == 3 && tokflag[j] == 1 && s_i == 3
|
683 |
-
&& (strcmp(suffix, "gov") == 0
|
684 |
-
|| strcmp(suffix, "edu") == 0
|
685 |
-
|| strcmp(suffix, "org") == 0
|
686 |
-
|| strcmp(suffix, "com") == 0)))
|
687 |
-
continue;
|
688 |
-
|
689 |
-
tokflag[i] = 1;
|
690 |
-
tokflag[i + 1] = 1;
|
691 |
-
}
|
692 |
-
}
|
693 |
-
}
|
694 |
-
|
695 |
-
|
696 |
-
// Numeric endings of sentences
|
697 |
-
//
|
698 |
-
// A period after a numeric token followed by a token that starts
|
699 |
-
// with an alphabetic character, is a separate token.
|
700 |
-
//
|
701 |
-
// This should be covered already by tok_13
|
702 |
-
|
703 |
-
void MPtok::tok_17()
|
704 |
-
{
|
705 |
-
int j;
|
706 |
-
|
707 |
-
for (int i = 0; i < text_len; i++)
|
708 |
-
{
|
709 |
-
if (text[i] == '.'
|
710 |
-
&& tokflag[i] == 0
|
711 |
-
&& tokflag[i + 1])
|
712 |
-
{
|
713 |
-
for (j = i - 1; j >= 0 && isdigit(text[j]) && tokflag[j] == 0; --j)
|
714 |
-
;
|
715 |
-
if (j >= 0 && j < i - 1 && tokflag[j] && isalpha(nextchar(text, i + 1)))
|
716 |
-
tokflag[i] = 1;
|
717 |
-
}
|
718 |
-
}
|
719 |
-
}
|
720 |
-
|
721 |
-
// period at end of string is a token
|
722 |
-
|
723 |
-
void MPtok::tok_20()
|
724 |
-
{
|
725 |
-
for (int i = text_len - 1; i >= 0; --i)
|
726 |
-
{
|
727 |
-
if (isspace(text[i]))
|
728 |
-
continue;
|
729 |
-
|
730 |
-
if (strchr(".!?", text[i]))
|
731 |
-
tokflag[i] = 1;
|
732 |
-
|
733 |
-
break;
|
734 |
-
}
|
735 |
-
}
|
736 |
-
|
737 |
-
// a period that follows a non-common word, and that is
|
738 |
-
// followed by a lower case common word is probably not a token
|
739 |
-
|
740 |
-
void MPtok::tok_20_1()
|
741 |
-
{
|
742 |
-
int j;
|
743 |
-
|
744 |
-
for (int i = 0; i < text_len; ++i)
|
745 |
-
{
|
746 |
-
if (text[i] == '.' && tokflag[i] == 1)
|
747 |
-
{
|
748 |
-
int tcnt, lcnt, ocnt;
|
749 |
-
tcnt = lcnt = ocnt = 0;
|
750 |
-
|
751 |
-
// make sure the previous word was *not* common
|
752 |
-
|
753 |
-
for (j = i - 1; j >= 0; j--)
|
754 |
-
{
|
755 |
-
if (isspace(text[j])) continue;
|
756 |
-
if (option_new >= 2)
|
757 |
-
{
|
758 |
-
if (islower(text[j]) == 0 && text[j] != '-') ocnt++;
|
759 |
-
} else
|
760 |
-
{
|
761 |
-
if (! islower(text[j])) ocnt++;
|
762 |
-
}
|
763 |
-
|
764 |
-
if (tokflag[j] || j == 0)
|
765 |
-
{
|
766 |
-
if (ocnt == 0)
|
767 |
-
{
|
768 |
-
goto nexti;
|
769 |
-
}
|
770 |
-
break;
|
771 |
-
}
|
772 |
-
}
|
773 |
-
|
774 |
-
tcnt = lcnt = ocnt = 0;
|
775 |
-
|
776 |
-
// make sure the next word is common
|
777 |
-
|
778 |
-
for (j = i + 1; j < text_len; j++)
|
779 |
-
{
|
780 |
-
if (isspace(text[j])) continue;
|
781 |
-
if (tokflag[j]) tcnt++;
|
782 |
-
|
783 |
-
if (tcnt == 2 || j == text_len - 1)
|
784 |
-
{
|
785 |
-
if (lcnt > 0 && ocnt == 0) tokflag[i] = 0;
|
786 |
-
break;
|
787 |
-
}
|
788 |
-
|
789 |
-
if (islower(text[j])) lcnt++;
|
790 |
-
else ocnt++;
|
791 |
-
}
|
792 |
-
}
|
793 |
-
nexti: ;
|
794 |
-
}
|
795 |
-
}
|
796 |
-
|
797 |
-
// tokenized period followed by non-space other than close paren
|
798 |
-
// is not a token
|
799 |
-
|
800 |
-
void MPtok::tok_20_2()
|
801 |
-
{
|
802 |
-
int j;
|
803 |
-
|
804 |
-
for (int i = 0; i < text_len - 1; ++i)
|
805 |
-
{
|
806 |
-
if (text[i] == '.' && tokflag[i] == 1
|
807 |
-
&& strchr(" ()[]\"\'\n\t\r", text[i+1]) == 0)
|
808 |
-
{
|
809 |
-
tokflag[i] = 0;
|
810 |
-
}
|
811 |
-
}
|
812 |
-
}
|
813 |
-
|
814 |
-
|
815 |
-
// long dash
|
816 |
-
//
|
817 |
-
// A pair of hyphens is a complete token
|
818 |
-
|
819 |
-
void MPtok::tok_21()
|
820 |
-
{
|
821 |
-
for (int i = 0; i + 1 < text_len; i++)
|
822 |
-
{
|
823 |
-
if (strncmp(&text[i], "--", 2) == 0)
|
824 |
-
{
|
825 |
-
tokflag[i] = 1;
|
826 |
-
if (i + 2 < text_len)
|
827 |
-
{
|
828 |
-
i += 2;
|
829 |
-
tokflag[i] = 1;
|
830 |
-
}
|
831 |
-
}
|
832 |
-
}
|
833 |
-
}
|
834 |
-
|
835 |
-
// hyphens
|
836 |
-
//
|
837 |
-
// If specified as an option, a hyphen between letters is a complete token
|
838 |
-
|
839 |
-
void MPtok::tok_21a()
|
840 |
-
{
|
841 |
-
if (option_hyphen == 0) return;
|
842 |
-
|
843 |
-
for (int i = 0; i + 1 < text_len; i++)
|
844 |
-
{
|
845 |
-
if (text[i] == '-'
|
846 |
-
&& (i == 0 || text[i-1] != '-')
|
847 |
-
&& text[i+1] != '-')
|
848 |
-
{
|
849 |
-
tokflag[i] = 1;
|
850 |
-
tokflag[i+1] = 1;
|
851 |
-
}
|
852 |
-
}
|
853 |
-
}
|
854 |
-
|
855 |
-
|
856 |
-
// quote
|
857 |
-
//
|
858 |
-
// Any double quote is a separate token
|
859 |
-
|
860 |
-
void MPtok::tok_22()
|
861 |
-
{
|
862 |
-
for (int i = 0; i < text_len; i++)
|
863 |
-
{
|
864 |
-
if (text[i] == '"')
|
865 |
-
{
|
866 |
-
tokflag[i] = 1;
|
867 |
-
if (i + 1 < text_len)
|
868 |
-
{
|
869 |
-
i += 1;
|
870 |
-
tokflag[i] = 1;
|
871 |
-
}
|
872 |
-
}
|
873 |
-
}
|
874 |
-
}
|
875 |
-
|
876 |
-
// possessive
|
877 |
-
//
|
878 |
-
// Any single quote at the end of a token that is not
|
879 |
-
// preceded by a single quote is a separate token
|
880 |
-
|
881 |
-
void MPtok::tok_23()
|
882 |
-
{
|
883 |
-
for (int i = 0; i < text_len; i++)
|
884 |
-
{
|
885 |
-
if (text[i] == '\''
|
886 |
-
&& (i - 1 >= 0 && text[i - 1] != '\'')
|
887 |
-
&& tokflag[i + 1])
|
888 |
-
{
|
889 |
-
tokflag[i] = 1;
|
890 |
-
}
|
891 |
-
}
|
892 |
-
}
|
893 |
-
|
894 |
-
|
895 |
-
// quote
|
896 |
-
//
|
897 |
-
// If a single quote starts a token, or is preceded by a
|
898 |
-
// single quote, and followed by a character
|
899 |
-
// that is not a single quote, then
|
900 |
-
// the character to its right is the start of a new token
|
901 |
-
|
902 |
-
void MPtok::tok_24()
|
903 |
-
{
|
904 |
-
for (int i = 0; i < text_len; i++)
|
905 |
-
{
|
906 |
-
if (text[i] == '\''
|
907 |
-
&& (tokflag[i] == 1 || (i - 1 >= 0 && text[i - 1] == '\''))
|
908 |
-
&& (i + 1 < text_len && text[i + 1] != '\''))
|
909 |
-
{
|
910 |
-
tokflag[i + 1] = 1;
|
911 |
-
}
|
912 |
-
}
|
913 |
-
}
|
914 |
-
|
915 |
-
// put back possessive
|
916 |
-
//
|
917 |
-
// A single quote that is a whole token followed by a lower case s
|
918 |
-
// that is also a whole token (without space between them)
|
919 |
-
// should be merged into a single token
|
920 |
-
|
921 |
-
void MPtok::tok_25()
|
922 |
-
{
|
923 |
-
for (int i = 0; i < text_len; i++)
|
924 |
-
{
|
925 |
-
if (text[i] == '\''
|
926 |
-
&& tokflag[i] == 1
|
927 |
-
&& i + 1 < text_len && text[i + 1] == 's'
|
928 |
-
&& tokflag[i+1] == 1
|
929 |
-
&& (i + 2 >= text_len || isspace(text[i + 2]) || tokflag[i + 2] == 1))
|
930 |
-
{
|
931 |
-
tokflag[i + 1] = 0;
|
932 |
-
}
|
933 |
-
}
|
934 |
-
}
|
935 |
-
|
936 |
-
// quote
|
937 |
-
//
|
938 |
-
// A pair of single quotes is a separate token
|
939 |
-
|
940 |
-
void MPtok::tok_26()
|
941 |
-
{
|
942 |
-
for (int i = 0; i < text_len; i++)
|
943 |
-
{
|
944 |
-
if (strncmp(&text[i], "''", 2) == 0
|
945 |
-
|| strncmp(&text[i], "``", 2) == 0)
|
946 |
-
{
|
947 |
-
tokflag[i] = 1;
|
948 |
-
if (i + 2 < text_len) tokflag[i + 2] = 1;
|
949 |
-
}
|
950 |
-
}
|
951 |
-
}
|
952 |
-
|
953 |
-
// possessive
|
954 |
-
//
|
955 |
-
// A single quote followed by a letter s is a possessive
|
956 |
-
|
957 |
-
void MPtok::tok_27()
|
958 |
-
{
|
959 |
-
for (int i = 0; i < text_len; i++)
|
960 |
-
{
|
961 |
-
if (text[i] == '\''
|
962 |
-
&& i + 1 < text_len
|
963 |
-
&& tolower(text[i + 1]) == 's'
|
964 |
-
&& (i + 2 >= text_len || tokflag[i + 2]))
|
965 |
-
{
|
966 |
-
tokflag[i] = 1;
|
967 |
-
}
|
968 |
-
}
|
969 |
-
}
|
970 |
-
|
971 |
-
// split "cannot" to "can not"
|
972 |
-
//
|
973 |
-
// A single token that is the word cannot (in any case)
|
974 |
-
// is split into two words
|
975 |
-
|
976 |
-
void MPtok::tok_28()
|
977 |
-
{
|
978 |
-
for (int i = 0; i < text_len; i++)
|
979 |
-
{
|
980 |
-
if ((strncmp(&text[i], "cannot", 6) == 0
|
981 |
-
|| strncmp(&text[i], "Cannot", 6) == 0)
|
982 |
-
&& tokflag[i + 6])
|
983 |
-
{
|
984 |
-
tokflag[i + 3] = 1;
|
985 |
-
}
|
986 |
-
}
|
987 |
-
}
|
988 |
-
|
989 |
-
// put list item elements back at sentence end
|
990 |
-
//
|
991 |
-
// A period that is preceded by an alphanumeric (no space)
|
992 |
-
// and any amount of preceding space and an end-mark
|
993 |
-
// stays with the alphanumeric.
|
994 |
-
|
995 |
-
void MPtok::tok_29()
|
996 |
-
{
|
997 |
-
int j;
|
998 |
-
|
999 |
-
for (int i = 0; i < text_len; i++)
|
1000 |
-
{
|
1001 |
-
if (text[i] == '.'
|
1002 |
-
&& tokflag[i] && tokflag[i + 1]
|
1003 |
-
&& i - 1 >= 0 && isalnum(text[i - 1])
|
1004 |
-
&& tokflag[i - 1]
|
1005 |
-
&& ((j = lookbehind(text, i-2, ".", tokflag)) >= 0
|
1006 |
-
|| (j = lookbehind(text, i-2, "?", tokflag)) >= 0
|
1007 |
-
|| (j = lookbehind(text, i-2, "!", tokflag)) >= 0)
|
1008 |
-
&& tokflag[j])
|
1009 |
-
{
|
1010 |
-
tokflag[i] = 0;
|
1011 |
-
}
|
1012 |
-
}
|
1013 |
-
}
|
1014 |
-
|
1015 |
-
// attach list elements to the beginnings of their sentences
|
1016 |
-
// this means, attach the period to the list element
|
1017 |
-
//
|
1018 |
-
// a list element is a single letter or a one or two digits
|
1019 |
-
// which is preceded by an end of sentence ".!?;"
|
1020 |
-
// or colon (provided it doesn't belong to a proportion construct)
|
1021 |
-
|
1022 |
-
void MPtok::tok_29a()
|
1023 |
-
{
|
1024 |
-
int i, j;
|
1025 |
-
|
1026 |
-
for (i = 0; i < text_len; i++)
|
1027 |
-
{
|
1028 |
-
if (text[i] == '.' && tokflag[i])
|
1029 |
-
{
|
1030 |
-
// Look back, make sure the token before the period
|
1031 |
-
// is either single alphanumeric, or at most a two digit number
|
1032 |
-
// and the character before that is a punctuation ".?!:,"
|
1033 |
-
|
1034 |
-
int tcnt, acnt, dcnt, pcnt, ocnt, scnt;
|
1035 |
-
tcnt = acnt = dcnt = pcnt = ocnt = scnt = 0;
|
1036 |
-
char p;
|
1037 |
-
|
1038 |
-
for (j = i - 1; j >= 0; j--)
|
1039 |
-
{
|
1040 |
-
if (isspace(text[j])) { scnt++; continue; }
|
1041 |
-
else if (tcnt == 0 && isalpha(text[j])) ++acnt;
|
1042 |
-
else if (tcnt == 0 && isdigit(text[j])) ++dcnt;
|
1043 |
-
else if (tcnt == 1 && strchr(".!?:;,", text[j])) { pcnt++; p = text[j]; }
|
1044 |
-
else ocnt++;
|
1045 |
-
|
1046 |
-
if (tokflag[j] || j == 0)
|
1047 |
-
{
|
1048 |
-
tcnt++;
|
1049 |
-
if (tcnt == 1 && ocnt == 0 && scnt == 0
|
1050 |
-
&& ((acnt == 1 && dcnt == 0) || (acnt == 0 && dcnt > 0 && dcnt <= 2)))
|
1051 |
-
{
|
1052 |
-
// This is acceptable
|
1053 |
-
} else if (tcnt == 2 && pcnt <= 1 && ocnt == 0 && scnt > 0)
|
1054 |
-
{
|
1055 |
-
if (p == ':')
|
1056 |
-
{
|
1057 |
-
while (--j >= 0 && isspace(text[j]))
|
1058 |
-
;
|
1059 |
-
if (j >= 0 && isdigit(text[j]))
|
1060 |
-
{
|
1061 |
-
// It's probably a proportion
|
1062 |
-
break;
|
1063 |
-
}
|
1064 |
-
}
|
1065 |
-
// Jackpot
|
1066 |
-
tokflag[i] = 0;
|
1067 |
-
} else
|
1068 |
-
{
|
1069 |
-
// This is not
|
1070 |
-
break;
|
1071 |
-
}
|
1072 |
-
scnt = 0;
|
1073 |
-
}
|
1074 |
-
}
|
1075 |
-
}
|
1076 |
-
}
|
1077 |
-
}
|
1078 |
-
|
1079 |
-
// list elements at the beginning of a string
|
1080 |
-
//
|
1081 |
-
// An alphanumeric token followed by a period
|
1082 |
-
// at the beginning of the line stays with the
|
1083 |
-
// alphanumeric
|
1084 |
-
|
1085 |
-
void MPtok::tok_30()
|
1086 |
-
{
|
1087 |
-
int i = 0;
|
1088 |
-
|
1089 |
-
while (isspace(text[i])) i++;
|
1090 |
-
|
1091 |
-
if (isalnum(text[i])
|
1092 |
-
&& tokflag[i]
|
1093 |
-
&& i + 1 < text_len
|
1094 |
-
&& text[i + 1] == '.'
|
1095 |
-
&& tokflag[i + 1])
|
1096 |
-
{
|
1097 |
-
tokflag[i + 1] = 0;
|
1098 |
-
}
|
1099 |
-
}
|
1100 |
-
|
1101 |
-
// process American style numbers
|
1102 |
-
|
1103 |
-
void MPtok::tok_31()
|
1104 |
-
{
|
1105 |
-
int j;
|
1106 |
-
|
1107 |
-
for (int i = 0; i < text_len; i++)
|
1108 |
-
{
|
1109 |
-
if (text[i] == ','
|
1110 |
-
&& i + 3 < text_len
|
1111 |
-
&& tokflag[i] && tokflag[i + 1]
|
1112 |
-
&& isdigit(text[i + 1])
|
1113 |
-
&& isdigit(text[i + 2])
|
1114 |
-
&& isdigit(text[i + 3])
|
1115 |
-
&& i - 1 >= 0 && isdigit(text[i - 1])
|
1116 |
-
)
|
1117 |
-
{
|
1118 |
-
tokflag[i] = 0;
|
1119 |
-
tokflag[i + 1] = 0;
|
1120 |
-
}
|
1121 |
-
}
|
1122 |
-
}
|
1123 |
-
|
1124 |
-
// process British style numbers
|
1125 |
-
|
1126 |
-
void MPtok::tok_32()
|
1127 |
-
{
|
1128 |
-
int j;
|
1129 |
-
|
1130 |
-
for (int i = 0; i < text_len; i++)
|
1131 |
-
{
|
1132 |
-
if (text[i] == ' '
|
1133 |
-
&& i + 3 < text_len
|
1134 |
-
&& tokflag[i] && tokflag[i + 1]
|
1135 |
-
&& isdigit(text[i + 1])
|
1136 |
-
&& isdigit(text[i + 2])
|
1137 |
-
&& isdigit(text[i + 3])
|
1138 |
-
&& i - 1 >= 0 && isdigit(text[i - 1])
|
1139 |
-
)
|
1140 |
-
{
|
1141 |
-
tokflag[i] = 0;
|
1142 |
-
tokflag[i + 1] = 0;
|
1143 |
-
}
|
1144 |
-
}
|
1145 |
-
}
|

// tokenize unicode escapes
//
// Added: handles escapes of the form &#NNN; and &name;

void MPtok::tok_33()
{
    int j;

    for (int i = 0; i < text_len; i++)
    {
        if (text[i] == '&')
        {
            if (text[i + 1] == '#')
            {
                for (j = i + 2; isdigit(text[j]); j++)
                    ;
            } else
            {
                for (j = i + 1; isalpha(text[j]); j++)
                    ;
            }

            if (text[j] == ';')
            {
                // Tokenize the escape, untokenize everything inside

                tokflag[i] = 1;
                for (i++; i <= j; i++) tokflag[i] = 0;
                tokflag[i] = 1;
            }
        }
    }
}
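
// For example, in "caf&#233; &amp; bar" tok_33 marks each whole escape as a
// single token and clears the flags inside it, so "&#233;" and "&amp;"
// survive tokenization intact.  A self-contained sketch of the same scan
// (illustration only, not part of MPtok):

static void demo_escape_spans(const char *text)
{
    for (int i = 0; text[i]; i++)
    {
        if (text[i] != '&') continue;
        int j = (text[i + 1] == '#') ? i + 2 : i + 1;
        while (text[i + 1] == '#' ? isdigit(text[j]) : isalpha(text[j]))
            j++;
        if (text[j] == ';')
            printf("escape token: %.*s\n", j - i + 1, &text[i]);
    }
    // demo_escape_spans("caf&#233; &amp; bar") prints "&#233;" and "&amp;"
}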

// Remove tags if they are present

void MPtok::tok_un()
{
    int untok = 0;
    for (int i = 0; text[i]; ++i)
    {
        if (isspace(text[i])) untok = 0;
        if (text[i] == option_tagsep) untok = 1;
        if (untok) text[i] = ' ';
    }
}


void MPtok::set_tokflag()
{
    tok_0();
    tok_1();
    tok_2();
    tok_3();

    // step 4 replaces the tag char, this is done at output

    tok_5_6_7();
    tok_8_9();

    tok_10();
    tok_11();
    if (option_new >= 1)
    {
        tok_21();
        tok_21a();
        tok_22();
        tok_23();
        tok_24();
        tok_25();
        tok_26();
        tok_27();
    }
    tok_12();
    tok_13();
    tok_14();
    if (option_new <= 5)
        tok_15();
    if (option_new < 2)
        tok_16();
    tok_17();

    // steps 18 and 19 recognize periods within parens,
    // and this is moved to the segmentation section

    tok_20();
    if (option_new >= 1)
    {
        tok_20_1();
        tok_20_2();
        if (option_new >= 2)
            tok_16_1();
        if (option_new >= 6)
            tok_15();
        if (option_new >= 7)
            tok_15_1();
    }
    if (option_new < 1)
    {
        tok_21();
        tok_21a();
        tok_22();
        tok_23();
        tok_24();
        tok_25();
        tok_26();
        tok_27();
    }
    tok_28();
    if (option_new >= 1)
        tok_29a();
    else
        tok_29();
    tok_30();
    tok_31();
    tok_32();

    tok_33();
}
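
// The dispatch order above matters: under option_new >= 1 the quote and
// hyphen rules (tok_21 through tok_27) run before the period rules
// (tok_12 through tok_14), the abbreviation pass tok_15 moves after
// tok_20_2 from version 6 on, and tok_29a supersedes tok_29, so each rule
// sees flags already normalized by the rules that ran before it.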

/* set_endflag
**
** After tokflag has been set, find the possible sentence endings.
*/

void MPtok::set_endflag()
{
    int i;

    // The following tests look for end-stops and label them.
    // They include steps 18 and 19

    for (i = 0; i <= text_len; i++)
        endflag[i] = 0;

    // Count the number of unmatched parens

    int up = 0;     // unmatched round parens
    int ub = 0;     // unmatched brackets

    for (i = 0; i < text_len; i++)
    {
        if (text[i] == '(') ++up;
        if (text[i] == ')') --up;
        if (text[i] == '[') ++ub;
        if (text[i] == ']') --ub;
        if (up < 0) up = 0;
        if (ub < 0) ub = 0;
    }

    // Now find the end-of-sentence marks

    // tok_18: periods within parentheses, allow for nesting
    // tok_19: periods within brackets, allow for nesting
    // The perl version solves this by putting the period
    // back with the previous token, but a better solution
    // is to allow it to be tokenized but just not
    // allow it to be an end-of-sentence.
    // Therefore, these are moved to the segmentation
    // section.

    int p = 0;      // round parens
    int b = 0;      // brackets

    for (i = 0; i < text_len; i++)
    {
        if (text[i] == '(') ++p;
        if (text[i] == ')') --p;
        if (text[i] == '[') ++b;
        if (text[i] == ']') --b;
        if (p < 0) p = 0;
        if (b < 0) b = 0;

        if (strchr(".!?", text[i])
            && tokflag[i]
            && tokflag[i + 1])
        {
            if (option_segment && p <= up && b <= ub)
                endflag[i] = 1;

            // This is optional, to join periods with
            // probable abbreviations

            if (p > up || b > ub)
                tokflag[i] = 0;
        }
    }

    // End tokens followed by a single or double quote, which matches
    // a single or double quote in the previous sentence

    if (option_new >= 1)
    {
        int dquo, squo;
        dquo = squo = 0;

        for (i = 0; i < text_len; i++)
        {
            if (text[i] == '"') dquo = ! dquo;
            else if (text[i] == '\'') squo = ! squo;
            else if (endflag[i])
            {
                if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
                {
                    endflag[i] = 0;

                    // But don't end at all if the next token is something
                    // other than an upper case letter (or an open paren).

                    if (option_new >= 2)
                    {
                        int j;
                        int ok = 0;

                        for (j = i + 2; j < text_len; j++)
                        {
                            if (isspace(text[j])) continue;
                            if (isupper(text[j]) || text[j] == '(')
                            {
                                ok = 1;
                                break;
                            }
                            if (tokflag[j]) break;
                        }

                        if (ok)
                            endflag[i+1] = 1;
                    } else
                    {
                        endflag[i+1] = 1;
                    }
                }
                dquo = squo = 0;
            }
        }
    }
}
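
// The dquo/squo toggles implement a simple heuristic: an end mark followed
// by a closing quote only moves the sentence boundary onto the quote
// (endflag[i+1]) when an odd number of quotes has been seen, i.e. the quote
// closes one opened earlier in the sentence.  A reduced sketch of the
// double-quote case (illustration only; the real code also resets the
// parity at each sentence end):

static void demo_quote_boundary(const char *s)
{
    int dquo = 0;
    for (int i = 0; s[i]; i++)
    {
        if (s[i] == '"') { dquo = ! dquo; continue; }
        if (strchr(".!?", s[i]) && s[i + 1] == '"' && dquo)
            printf("boundary moves to the quote at offset %d\n", i + 1);
    }
    // demo_quote_boundary("He said \"Stop.\" Then he left.") reports offset 14
}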


/* set_endflag_01
**
** After tokflag has been set, find the possible sentence endings.
** This has improved paren matching.
*/

#define MAX_MATCH 500   // Maximum length to get a paren match

void MPtok::set_endflag_01()
{
    int *match = new int[text_len];   // (was a variable-length array, which is not standard C++)
    int i, j;

    // The following tests look for end-stops and label them.
    // They include steps 18 and 19

    for (i = 0; i <= text_len; i++)
        endflag[i] = 0;

    for (i = 0; i < text_len; i++)
        match[i] = 0;

    for (i = text_len - 1; i >= 0; i--)
    {
        if (text[i] == '(' || text[i] == '[')
        {
            for (j = i + 1; text[j] && j - i <= MAX_MATCH; j++)
            {
                // Skip parens that are already matched

                if (match[j] > j)
                {
                    j = match[j];
                    continue;
                }

                // Look for a matching close paren

                if (match[j] == 0
                    && ((text[i] == '(' && text[j] == ')')
                    || (text[i] == '[' && text[j] == ']')))
                {
                    match[i] = j;
                    match[j] = i;
                    break;
                }
            }
        }
    }

    int next_match = 0;
    for (i = 0; i < text_len; i++)
    {
        if (match[i] > next_match)
            next_match = match[i];

        if (strchr(".!?", text[i])
            && tokflag[i]
            && tokflag[i + 1]
            && (option_new <= 4 || option_doteos == 1 || (i > 0 && isspace(text[i-1]) == 0)))
        {
            if (i <= next_match)
                tokflag[i] = 0;
            else if (option_segment)
                endflag[i] = 1;
        }
    }

    // End tokens followed by a single or double quote, which matches
    // a single or double quote in the previous sentence

    int dquo, squo;
    dquo = squo = 0;

    for (i = 0; i < text_len; i++)
    {
        if (option_new <= 7 && text[i] == '"') dquo = ! dquo;
        else if (option_new >= 8 && text[i] == '"' && tokflag[i] && tokflag[i+1]) dquo = ! dquo;
        else if (option_new <= 7 && text[i] == '\'') squo = ! squo;
        else if (option_new >= 8 && text[i] == '\''
            && tokflag[i] && (tokflag[i+1] || (text[i+1] == '\'' && tokflag[i+2]))) squo = ! squo;
        else if (endflag[i])
        {
            if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
            {
                endflag[i] = 0;

                // But don't end at all if the next token is something
                // other than an upper case letter (or an open paren).

                if (option_new >= 2)
                {
                    int j;
                    int ok = 0;

                    for (j = i + 2; j < text_len; j++)
                    {
                        if (isspace(text[j])) continue;
                        if (isupper(text[j]) || text[j] == '(')
                        {
                            ok = 1;
                            break;
                        }
                        if (tokflag[j]) break;
                    }

                    if (ok)
                        endflag[i+1] = 1;
                } else
                {
                    endflag[i+1] = 1;
                }
            }
            dquo = squo = 0;
        }
    }

    delete[] match;
}
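
// The match[] pass pairs each '(' or '[' with its partner within MAX_MATCH
// characters, working right to left so that inner pairs are matched first
// and can be hopped over with j = match[j].  An end mark is then suppressed
// whenever it lies at or before the furthest matched close paren seen so
// far (i <= next_match), i.e. inside some bracketed span.  A compact
// standalone sketch of the matcher (illustration only):

static void demo_match_parens(const char *text, int n, int *match)
{
    // match[] must hold n zeros on entry; on exit match[i] holds the
    // partner position of each paired bracket (0 where unmatched)
    for (int i = n - 1; i >= 0; i--)
    {
        if (text[i] != '(' && text[i] != '[') continue;
        char close = (text[i] == '(') ? ')' : ']';
        for (int j = i + 1; j < n; j++)
        {
            if (match[j] > j) { j = match[j]; continue; }   // hop over a matched span
            if (match[j] == 0 && text[j] == close)
            {
                match[i] = j;
                match[j] = i;
                break;
            }
        }
    }
    // For "See (Fig. 1 [panel A]). Done." this pairs '[' with ']' first,
    // then '(' with the ')' beyond them, so the period after "Fig" falls
    // inside a matched span and cannot end the sentence.
}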


// Size buffer: return the size of the buffer required to hold all of the tokenized text.
// It can be estimated by a simple formula that depends only on the length of the text and the number of tokens.

int MPtok::size_buff()
{
    int size = 1;                       // Start with the null terminator
    int t = option_pretag.size();       // for each tag, the length of the UNTAG string

    if (t <= 0) t = 1;                  // Make sure there is at least one
    t += 2;                             // Add one for the underscore and one for the space

    for (int i = 0; i < text_len; i++)
    {
        size++;                         // Count all characters
        if (tokflag[i]) size += t;      // Count token delimiters (may overcount)
        if (endflag[i]) size++;         // Add one for the newline
    }
    return size;
}
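
// For example, with option_pretag == "NN" (so t == 2 + 2 == 4), a
// 20-character text with 5 token starts and 1 sentence end reserves
// 1 + 20 + 5*4 + 1 == 42 bytes, regardless of where the tokens fall.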


/* append_token
**
** Save a single token to a buffer.
*/

void MPtok::append_token(string& buff, int& sp, char *tok, int ef)
{
    // Convert tag separator chars and back quotes (?)

    for (int i = 0; tok[i]; i++)
    {
        if (tok[i] == option_tagsep) tok[i] = option_replacesep;
        if (tok[i] == '`') tok[i] = '\'';
    }

    // Skip whitespace if tokens are being output;
    // otherwise, skip whitespace at the start of a sentence

    if (option_token || ! sp) while (isspace(*tok)) ++tok;

    // Save the token

    if (strlen(tok) > 0)
    {
        // Add a delimiter if needed

        if (option_token && sp) buff += ' ';

        // Append the token to the output

        if (option_new < 9)
        {
            while (*tok && (! option_token || ! isspace(*tok)))
                buff += *(tok++);
        } else
        {
            while (*tok)
                buff += *(tok++);
        }

        sp = 1;

        // Add tag holders

        if (option_token && option_pretag.size() > 0)
        {
            buff += option_tagsep;
            buff += option_pretag;
        }

        // If it was the end of a sentence, then add a newline

        if (ef)
        {
            buff += '\n';
            sp = 0;
        }
    }
}

// Strip whitespace after sentences

static void adjust_space(string& buff)
{
    while (buff.size() > 0 && isspace(buff[0])) buff.erase(0, 1);

    // Delete two spaces in a row, but keep newlines:
    // when one of the pair is a newline, erase the other one

    for (int i = 1; i < buff.size(); i++)
    {
        if (isspace(buff[i]) && isspace(buff[i-1]))
            buff.erase((buff[i] == '\n') ? (--i) : (i--), 1);
    }

    for (int i = buff.size() - 1; i >= 0 && isspace(buff[i]); i--)
        buff.erase(i, 1);
}

/* token_string
**
** After the tokflag and endflag have been set, copy the tokens to the buffer.
*/

string MPtok::token_string()
{
    string buff;

    int i;

    // Move token starts to non-whitespace chars

    int last_tok = 0;
    for (i = 0; i < text_len; i++)
    {
        if (tokflag[i] == 1 && isspace(text[i]))
        {
            tokflag[i] = 0;
            last_tok = 1;
        } else if (isspace(text[i]) == 0 && last_tok)
        {
            tokflag[i] = 1;
            last_tok = 0;
        }
    }

    // Extract the tokens and print them out now

    char *tok = new char[text_len + 1];
    int pos = 0;
    int sp = 0;
    int ef = 0;

    tok[pos] = '\0';

    for (i = 0; i <= text_len; i++)
    {
        // The start of a new token

        if (tokflag[i])
        {
            // Print the current token

            append_token(buff, sp, tok, ef);

            // Start a new token

            pos = 0;
            tok[pos] = '\0';

            ef = 0;
        }

        // Append to the current token

        tok[pos++] = text[i];
        tok[pos] = '\0';

        // If any of the characters in the token are endflagged,
        // then pass this information along for end-of-sentence

        if (endflag[i]) ef = 1;
    }

    // Print the last token

    append_token(buff, sp, tok, ef);

    delete[] tok;

    // Adjust the end-of-sentence boundaries

    adjust_space(buff);

    return buff;
}

void MPtok::map_escapes()
{
    char *s;
    int j, k, ch;
    char buff[10];

    k = 0;
    for (int i = 0; text[i]; i++)
    {
        if (text[i] == '&' && text[i + 1] == '#')
        {
            for (s = &buff[0], j = 2; j <= 4 && i + j < text_len && isdigit(text[i + j]); j++)
                *s++ = text[i + j];
            *s = '\0';
            ch = atoi(buff);
            if (strlen(buff) > 0 && text[i + j] == ';' && ch > 0 && ch <= 256)
            {
                text[k] = ch;
                if (! text[k]) text[k] = ' ';   // a value that wraps to 0 becomes a space
                k++;
                i = i + j;
                continue;
            }
        }
        text[k++] = text[i];
    }
    text[k] = '\0';
    text_len = k;
}
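
// So "A&#66;C" collapses in place to "ABC" and text_len shrinks from 7 to 3,
// while escapes that are malformed (no digits, no trailing ';') or out of
// range are copied through unchanged.  A standalone sketch of the same
// decoding rule (illustration only; it tightens the range test to 1..255,
// where the code above lets 256 wrap and then maps it to a space):

static int demo_decode(char *text)
{
    int k = 0;
    for (int i = 0; text[i]; i++)
    {
        if (text[i] == '&' && text[i + 1] == '#')
        {
            char buff[10];
            int j, b = 0;
            for (j = 2; j <= 4 && isdigit(text[i + j]); j++)
                buff[b++] = text[i + j];
            buff[b] = '\0';
            int ch = atoi(buff);
            if (b > 0 && text[i + j] == ';' && ch > 0 && ch < 256)
            {
                text[k++] = ch;   // replace the whole escape with one byte
                i += j;
                continue;
            }
        }
        text[k++] = text[i];
    }
    text[k] = '\0';
    return k;   // demo_decode on "A&#66;C" yields "ABC" and returns 3
}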

MPtok::MPtok(string idir, const string& cnam)
{
    tok_initialized = 0;

    if (idir.size() == 0)
    {
        char *p = getenv("MEDPOST_HOME");
        if (p && strlen(p))
        {
            idir = p;

            string::size_type found = idir.find("=");
            if (found != string::npos)
                idir = idir.substr(found + 1);
        }
    }

    if (idir.size() == 0)
    {
        char buff[1000];
        FILE *fp = fopen("path_medpost", "r");
        if (fp)
        {
            if (fgets(buff, 1000, fp))
            {
                chomp(buff);
                idir = &buff[0];
            }
            fclose(fp);
        }
    }

    if (idir.size() == 0)
        idir = "/home/natxie/CPP64/lib/FIXED_DATA/";

    option_dir = idir;

    option_token = 1;
    option_segment = 1;
    option_hyphen = 0;
    option_comma = 1;
    option_pretok = 0;
    option_new = MPTOK_VERSION;
    option_doteos = 0;

    if (cnam.size() > 0)
    {
        option_cnam = "_";
        option_cnam += cnam;
    }

    init();
}

void MPtok::init(void)
{
    if (tok_initialized) return;

    string fname;

    fname = option_dir + "/medpost" + option_cnam + ".pairs";
    init_pair(fname);

    fname = option_dir + "/medpost" + option_cnam + ".abbr";
    init_abbr(fname);

    tok_initialized = 1;
}

MPtok::~MPtok()
{
}

// Global tokenizer

string MPtok::tokenize(const string& txt, int mt)
{
    if (option_pretok) return save_string(txt);

    option_token = mt;
    text_len = txt.size();
    if (text_len == 0) return string("");

    text = new char[text_len + 1];
    strcpy(text, txt.c_str());

    map_escapes();

    if (text_len == 0)
    {
        // map_escapes() can empty the text; release the buffer and return
        // an empty string rather than constructing a string from NULL
        delete[] text; text = NULL;
        return string("");
    }

    tokflag = new int[text_len + 1];
    endflag = new int[text_len + 1];

    set_tokflag();
    if (option_new < 3)
        set_endflag();
    else
        set_endflag_01();

    string buff = token_string();
    save_string(buff);

    delete[] text; text = NULL;
    delete[] tokflag; tokflag = NULL;
    delete[] endflag; endflag = NULL;

    return buff;
}
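
// Typical use (illustration only; empty constructor arguments defer to the
// MEDPOST_HOME / path_medpost lookup described above):
//
//     MPtok mp("", "");
//     string out = mp.tokenize("Dr. Smith arrived. He sat down.");
//     // out holds one tokenized sentence per line, tokens separated by
//     // spaces, each carrying the option_pretag placeholder if one is set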

string MPtok::tokenize(const string& text)
{
    return tokenize(text, 1);
}

string MPtok::segment(const string& text)
{
    sent.clear();

    // tokenize the text

    int save_option_segment = option_segment;
    option_segment = 1;
    string buff = tokenize(text, 0);
    option_segment = save_option_segment;

    if (buff.size() == 0) return text;

    string::size_type found;
    string::size_type pos = 0;

    while (pos < buff.size())
    {
        found = buff.find('\n', pos);
        if (found == string::npos)
        {
            sent.push_back(buff.substr(pos));
            pos = buff.size();
        } else
        {
            sent.push_back(buff.substr(pos, found - pos));
            pos = found + 1;
        }
    }

    return buff;
}
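
// Typical use of segment() (illustration only, assuming sent is an
// accessible std::vector<std::string>, as its use above suggests): the
// return value is the segmented text and sent holds one entry per sentence.
//
//     MPtok mp("", "");
//     mp.segment("First sentence. Second sentence.");
//     for (size_t k = 0; k < mp.sent.size(); k++)
//         printf("%zu: %s\n", k, mp.sent[k].c_str());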

string MPtok::save_string(const string& s)
{
    stringstream ss (stringstream::in | stringstream::out);
    string w, t;
    string::size_type found;
    string ret;

    word.clear();
    tag.clear();

    ss << s;
    while (ss >> w)   // extraction as the loop test avoids reprocessing a stale token at EOF
    {
        found = w.find('_');

        if (found != string::npos)
        {
            t = w.substr(found + 1);
            w.resize(found);
            word.push_back(w);
            tag.push_back(t);
        } else
        {
            word.push_back(w);
            tag.push_back(option_pretag);
        }
        if (ret.size() > 0) ret += " ";
        ret += w;
    }

    // Now look for continuation tags: a tag ending in '+' joins its word
    // with the following one into a single multi-word entry

    for (int i = 0; i < word.size(); i++)
    {
        int j = tag[i].size() - 1;
        if (j >= 0 && tag[i][j] == '+' && i < tag.size() - 1)
        {
            word[i] = word[i] + " " + word[i + 1];
            tag[i] = tag[i + 1];
            word.erase(word.begin() + i + 1, word.begin() + i + 2);
            tag.erase(tag.begin() + i + 1, tag.begin() + i + 2);
            i--;
        }
    }

    return ret;
}
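
// For instance, the tagged stream "in_II+ spite_II+ of_II" is split at the
// underscores and the '+' continuations are then folded, leaving a single
// entry with word "in spite of" and tag "II"; the returned string is the
// untagged text "in spite of".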


// Count the space-separated words in a string

static int count_words(const char *s)
{
    int i;

    i = 1;
    for (; *s; ++s)
    {
        if (*s == ' ') ++i;
    }
    return i;
}

// Print the i-th space-separated word of s (0-based)

static void print_word(const char *s, int i)
{
    for (; i > 0 && *s; ++s) { if (*s == ' ') --i; }
    while (*s && *s != ' ') { printf("%c", *s); ++s; }
}

void MPtok::print(int how)
{
    int i, w;

    if (how != 0 && how != 2)
    {
        printf("print(%d) not defined\n", how);
        return;
    }

    for (i = 0; i < word.size(); ++i)
    {
        // Get the words from an idiom

        for (w = 0; w < count_words(word[i].c_str()); ++w)
        {
            if (how == 2 && i + w > 0) printf(" ");

            print_word(word[i].c_str(), w);

            if (how == 0)
            {
                printf(" tagged %s", tag[i].c_str());
                if (w < count_words(word[i].c_str()) - 1) printf("+");
                printf("\n");
            } else if (how == 2)
            {
                printf("%s%s", "_", tag[i].c_str());
                if (w < count_words(word[i].c_str()) - 1) printf("+");
            }
        }
    }
    if (how == 2)
        printf("\n");
}
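
// With word = { "in spite of" } and tag = { "II" }, print(0) writes one word
// per line ("in tagged II+", "spite tagged II+", "of tagged II"), while
// print(2) writes the inline form "in_II+ spite_II+ of_II" on one line.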

void MPtok::merge_words(int s, int n)
{
    string tmp = word[s];

    for (int i = s + 1; i < s + n; i++)
    {
        tmp += " ";
        tmp += word[i];
    }

    // Shift the remaining words and tags down

    for (int k = s; k + n < word.size(); k++)
    {
        word[k+1] = word[k+n];
        tag[k+1] = tag[k+n];
    }

    // Fix up the remaining array

    word.resize(word.size() - n + 1);
    tag.resize(word.size());

    word[s] = tmp;
}

void MPtok::split_words()
{
    for (int i = 0; i < word.size(); i++)
    {
        string::size_type found = word[i].find(' ');

        if (found != string::npos)
        {
            string tmp1(word[i], 0, found);
            string tmp2(word[i], found + 1, string::npos);

            // Move all the words and tags down

            word.resize(word.size() + 1);
            tag.resize(tag.size() + 1);

            for (int j = word.size() - 1; j > i; j--)
            {
                word[j] = word[j - 1];
                tag[j] = tag[j - 1];
            }

            word[i] = tmp1;
            tag[i] = tag[i+1];
            tag[i] += "+";

            word[i+1] = tmp2;
        }
    }
}

// Callable functions to set internal options

void MPtok::set_segment(int i)  { option_segment = i; }
void MPtok::set_hyphen(int i)   { option_hyphen = i; }
void MPtok::set_comma(int i)    { option_comma = i; }
void MPtok::set_pretag(char *a) { option_pretag = a; }
void MPtok::set_pretok(int i)   { option_pretok = i; }
void MPtok::set_new(int i)      { option_new = i; }
void MPtok::set_doteos(int i)   { option_doteos = i; }
|
|
1 |
+
#include <stdio.h>
|
2 |
+
#include <ctype.h>
|
3 |
+
#include <string.h>
|
4 |
+
#include <stdlib.h>
|
5 |
+
|
6 |
+
#include <string>
|
7 |
+
#include <iostream>
|
8 |
+
#include <fstream>
|
9 |
+
#include <sstream>
|
10 |
+
|
11 |
+
#include "MPtok.h"
|
12 |
+
|
13 |
+
// These options are probably compile time constants
|
14 |
+
|
15 |
+
static char option_tagsep = '_'; // The tagsep character
|
16 |
+
static char option_replacesep = '-'; // Replace tagsep with this
|
17 |
+
|
18 |
+
static void chomp(char *line)
|
19 |
+
{
|
20 |
+
int i;
|
21 |
+
|
22 |
+
i = strlen(line) - 1;
|
23 |
+
while (i >= 0 && line[i] == '\n' || line[i] == '\r')
|
24 |
+
line[i--] = '\0';
|
25 |
+
}
|
26 |
+
|
27 |
+
// Data structure and algorithm for finding common pairs.
|
28 |
+
|
29 |
+
// read a file of pairs into a data structure,
|
30 |
+
// the file must be sorted first
|
31 |
+
|
32 |
+
void MPtok::init_pair(const string& file_name)
|
33 |
+
{
|
34 |
+
filebuf fb;
|
35 |
+
fb.open(file_name.c_str(), ios::in);
|
36 |
+
istream is(&fb);
|
37 |
+
string pair;
|
38 |
+
|
39 |
+
while (1)
|
40 |
+
{
|
41 |
+
getline(is, pair);
|
42 |
+
if (is.fail()) break;
|
43 |
+
if (pair.size() > 0) common_pair.insert(pair);
|
44 |
+
}
|
45 |
+
|
46 |
+
fb.close();
|
47 |
+
}
|
48 |
+
|
49 |
+
// List of abbreviations in 3 categories
|
50 |
+
// ABB = can occur mid sentence
|
51 |
+
// EOS = can occur at end of sentence
|
52 |
+
// NUM = only used before numbers
|
53 |
+
|
54 |
+
void MPtok::init_abbr(const string& file_name)
|
55 |
+
{
|
56 |
+
filebuf fb;
|
57 |
+
fb.open(file_name.c_str(), ios::in);
|
58 |
+
istream is(&fb);
|
59 |
+
string typ, abb;
|
60 |
+
map<string,int> val;
|
61 |
+
val["ABB"] = ABB_ABB; val["EOS"] = ABB_EOS; val["NUM"] = ABB_NUM;
|
62 |
+
|
63 |
+
while (is.good())
|
64 |
+
{
|
65 |
+
is >> typ;
|
66 |
+
if (val.count(typ))
|
67 |
+
{
|
68 |
+
is >> abb;
|
69 |
+
if (abb.size() > 0) common_abbr[abb] = val[typ];
|
70 |
+
}
|
71 |
+
}
|
72 |
+
fb.close();
|
73 |
+
}
|
74 |
+
|
75 |
+
static char nextchar(const char *t, int i)
|
76 |
+
{
|
77 |
+
while (isspace(t[i])) i++;
|
78 |
+
return t[i];
|
79 |
+
}
|
80 |
+
|
81 |
+
// Look for a token at or prior to the text position
|
82 |
+
|
83 |
+
static int lookbehind(const char *t, int i, const char *s, int *tokflag)
|
84 |
+
{
|
85 |
+
int k = (int) strlen(s) - 1;
|
86 |
+
|
87 |
+
while (i > 0 && isspace(t[i])) i--;
|
88 |
+
|
89 |
+
while (k >= 0 && i >= 0)
|
90 |
+
{
|
91 |
+
if (k > 0 && tokflag[i]) break;
|
92 |
+
|
93 |
+
if (tolower(s[k]) != tolower(t[i]))
|
94 |
+
return -1;
|
95 |
+
k--;
|
96 |
+
i--;
|
97 |
+
}
|
98 |
+
|
99 |
+
return (k < 0 && tokflag[i+1]) ? i + 1 : -1;
|
100 |
+
}
|
101 |
+
|
102 |
+
// Look for a token at or following the text position
|
103 |
+
|
104 |
+
static int lookahead(const char *t, int i, const char *s, int *tokflag)
|
105 |
+
{
|
106 |
+
int k = 0;
|
107 |
+
|
108 |
+
while (isspace(t[i])) i++;
|
109 |
+
|
110 |
+
while (k < strlen(s) && i < strlen(t))
|
111 |
+
{
|
112 |
+
if (k > 0 && tokflag[i]) break;
|
113 |
+
|
114 |
+
if (tolower(s[k]) != tolower(t[i]))
|
115 |
+
return -1;
|
116 |
+
k++;
|
117 |
+
i++;
|
118 |
+
}
|
119 |
+
|
120 |
+
return (k == strlen(s) && tokflag[i]) ? i - (int) strlen(s) : -1;
|
121 |
+
}
|
122 |
+
|
123 |
+
// Set the initial tokens at spaces
|
124 |
+
|
125 |
+
void MPtok::tok_0()
|
126 |
+
{
|
127 |
+
int i;
|
128 |
+
|
129 |
+
tokflag[0] = 1;
|
130 |
+
for (i = 1; i < text_len; i++)
|
131 |
+
{
|
132 |
+
tokflag[i] = isspace(text[i]) || (i > 0 && isspace(text[i - 1])) ? 1 : 0;
|
133 |
+
}
|
134 |
+
tokflag[i] = 1;
|
135 |
+
}
|
136 |
+
|
137 |
+
// Get quotes preceded by open parens
|
138 |
+
//
|
139 |
+
// A double quote, preceded by a space or open bracket is a separate token
|
140 |
+
//
|
141 |
+
|
142 |
+
void MPtok::tok_1()
|
143 |
+
{
|
144 |
+
for (int i = 1; i < text_len; i++)
|
145 |
+
{
|
146 |
+
if (text[i] == '"' && strchr("([{<", text[i-1]))
|
147 |
+
{
|
148 |
+
tokflag[i] = 1;
|
149 |
+
if (i + 1 < text_len) tokflag[i+1] = 1;
|
150 |
+
}
|
151 |
+
}
|
152 |
+
}
|
153 |
+
|
154 |
+
// Look for ellipses
|
155 |
+
//
|
156 |
+
// Three dots in a row is a separate token
|
157 |
+
|
158 |
+
void MPtok::tok_2()
|
159 |
+
{
|
160 |
+
for (int i = 1; i + 2 < text_len; i++)
|
161 |
+
{
|
162 |
+
if (strncmp(&text[i], "...", 3) == 0)
|
163 |
+
{
|
164 |
+
tokflag[i] = 1;
|
165 |
+
if (i + 3 < text_len) tokflag[i+3] = 1;
|
166 |
+
}
|
167 |
+
}
|
168 |
+
}
|
169 |
+
|
170 |
+
// Non-sentence-ending punctuation
|
171 |
+
//
|
172 |
+
// Certain punctuation characters are separate tokens
|
173 |
+
|
174 |
+
void MPtok::tok_3()
|
175 |
+
{
|
176 |
+
for (int i = 0; i < text_len; i++)
|
177 |
+
{
|
178 |
+
// If it is a comma and the next char is not a space and option_comma = 0
|
179 |
+
|
180 |
+
if (option_comma == 0 && text[i] == ',' && isspace(text[i + 1]) == 0)
|
181 |
+
{
|
182 |
+
// do nothing
|
183 |
+
} else if (strchr(",;:@#$%&", text[i]))
|
184 |
+
{
|
185 |
+
tokflag[i] = 1;
|
186 |
+
tokflag[i + 1] = 1;
|
187 |
+
}
|
188 |
+
}
|
189 |
+
}
|
190 |
+
|
191 |
+
// Separate the slashes
|
192 |
+
//
|
193 |
+
// Slashes are a separate token
|
194 |
+
// except for +/-, +/+, -/-, -/+, and and/or.
|
195 |
+
|
196 |
+
void MPtok::tok_5_6_7()
|
197 |
+
{
|
198 |
+
for (int i = 0; i < text_len; i++)
|
199 |
+
{
|
200 |
+
if (text[i] == '/')
|
201 |
+
{
|
202 |
+
tokflag[i] = 1;
|
203 |
+
if (i+1 < text_len) tokflag[i+1] = 1;
|
204 |
+
|
205 |
+
// Put back +/-, etc, unless option_hyphen is 1
|
206 |
+
|
207 |
+
if (i - 1 >= 0
|
208 |
+
&& i + 1 < text_len
|
209 |
+
&& ((option_new < 9
|
210 |
+
&& text[i - 1] == '+' || (text[i - 1] == '-' && option_hyphen == 0)
|
211 |
+
&& text[i + 1] == '+' || (text[i + 1] == '-' && option_hyphen == 0))
|
212 |
+
|| (option_new >= 9
|
213 |
+
&& (text[i - 1] == '+' || text[i - 1] == '-')
|
214 |
+
&& (text[i + 1] == '+' || text[i + 1] == '-'))))
|
215 |
+
{
|
216 |
+
tokflag[i - 1] = 1;
|
217 |
+
tokflag[i] = tokflag[i+1] = 0;
|
218 |
+
tokflag[i + 2] = 1;
|
219 |
+
}
|
220 |
+
|
221 |
+
// Put back and/or, etc
|
222 |
+
|
223 |
+
if (option_new <= 7)
|
224 |
+
{
|
225 |
+
if (i > 5 && strncmp(text + i - 5, " and/or ", 8) == 0)
|
226 |
+
{
|
227 |
+
for (int j = 1; j < 5; j++)
|
228 |
+
tokflag[i - 2 + j] = 0;
|
229 |
+
}
|
230 |
+
} else
|
231 |
+
{
|
232 |
+
if (i > 4 && strncmp(text + i - 4, " and/or ", 8) == 0)
|
233 |
+
{
|
234 |
+
for (int j = 1; j < 6; j++)
|
235 |
+
tokflag[i - 3 + j] = 0;
|
236 |
+
}
|
237 |
+
}
|
238 |
+
}
|
239 |
+
}
|
240 |
+
}
|
241 |
+
|
242 |
+
// All brackets
|
243 |
+
//
|
244 |
+
// Any open or closed bracket is a separate token
|
245 |
+
//
|
246 |
+
// Exclamation and question mark
|
247 |
+
//
|
248 |
+
// Any question or exclamation mark is a separate token
|
249 |
+
|
250 |
+
void MPtok::tok_8_9()
|
251 |
+
{
|
252 |
+
for (int i = 0; i < text_len; i++)
|
253 |
+
{
|
254 |
+
if (strchr("[](){}<>", text[i])
|
255 |
+
|| strchr("?!", text[i]))
|
256 |
+
{
|
257 |
+
tokflag[i] = 1;
|
258 |
+
if (i + 1 < text_len) tokflag[i+1] = 1;
|
259 |
+
}
|
260 |
+
}
|
261 |
+
}
|
262 |
+
|
263 |
+
// Period at the end of a string may be followed by closed-bracket or quote
|
264 |
+
//
|
265 |
+
// A period that is preceded by a non-period
|
266 |
+
// and optionally followed by a close paren
|
267 |
+
// and any amount of space at the end of the string
|
268 |
+
// is a separate token.
|
269 |
+
|
270 |
+
void MPtok::tok_10()
|
271 |
+
{
|
272 |
+
for (int i = text_len - 1; i >= 0; i--)
|
273 |
+
{
|
274 |
+
if (isspace(text[i])) continue;
|
275 |
+
if (strchr("])}>\"'", text[i])) continue;
|
276 |
+
if (text[i] != '.') break;
|
277 |
+
if (text[i] == '.' && (i - 1 < 0 || text[i-1] != '.'))
|
278 |
+
{
|
279 |
+
tokflag[i] = 1;
|
280 |
+
if (i + 1 < text_len) tokflag[i+1] = 1;
|
281 |
+
}
|
282 |
+
}
|
283 |
+
}
|
284 |
+
|
285 |
+
// Period followed by a capitalized word
|
286 |
+
//
|
287 |
+
// A period preceded by a character that is not another period and not a space
|
288 |
+
// and followed by a space then an upper case letter is a separate token
|
289 |
+
|
290 |
+
void MPtok::tok_11()
|
291 |
+
{
|
292 |
+
for (int i = 0; i < text_len; i++)
|
293 |
+
{
|
294 |
+
if (text[i] == '.'
|
295 |
+
&& (i + 1 < text_len && isspace(text[i+1]))
|
296 |
+
&& (i - 1 < 0 || text[i - 1] != '.' || isspace(text[i-1]) == 0)
|
297 |
+
&& isupper(nextchar(text, i + 1)))
|
298 |
+
tokflag[i] = 1;
|
299 |
+
}
|
300 |
+
}
|
301 |
+
|
302 |
+
// A normal word followed by a period
|
303 |
+
//
|
304 |
+
// A period followed by a space
|
305 |
+
// and preceded by 2 or more alphabetic characters or hyphens
|
306 |
+
// is a separate token
|
307 |
+
|
308 |
+
void MPtok::tok_12()
|
309 |
+
{
|
310 |
+
int wcnt = 0;
|
311 |
+
|
312 |
+
for (int i = 0; i < text_len; i++)
|
313 |
+
{
|
314 |
+
if (text[i] == '.'
|
315 |
+
&& tokflag[i + 1]
|
316 |
+
&& wcnt >= 2)
|
317 |
+
tokflag[i] = 1;
|
318 |
+
|
319 |
+
if (isalpha(text[i]) || text[i] == '-')
|
320 |
+
++wcnt;
|
321 |
+
else
|
322 |
+
wcnt = 0;
|
323 |
+
}
|
324 |
+
}
|
325 |
+
|
326 |
+
// A non-normal token (that has no lower case letters) followed by a period
|
327 |
+
//
|
328 |
+
// A period at the end of a token made of characters excluding lower case
|
329 |
+
// is a separate token
|
330 |
+
|
331 |
+
void MPtok::tok_13()
|
332 |
+
{
|
333 |
+
int stok = 0;
|
334 |
+
int wcnt = 0;
|
335 |
+
|
336 |
+
for (int i = 0; i < text_len; i++)
|
337 |
+
{
|
338 |
+
if (text[i] == '.'
|
339 |
+
&& tokflag[i + 1]
|
340 |
+
&& wcnt >= 2)
|
341 |
+
tokflag[i] = 1;
|
342 |
+
|
343 |
+
if (tokflag[i] == 1) stok = 1;
|
344 |
+
|
345 |
+
if (islower(text[i]) || text[i] == '.')
|
346 |
+
{
|
347 |
+
stok = 0;
|
348 |
+
wcnt = 0;
|
349 |
+
}
|
350 |
+
|
351 |
+
if (stok)
|
352 |
+
wcnt++;
|
353 |
+
}
|
354 |
+
}
|
355 |
+
|
356 |
+
// put some periods with single-letter abbreviations
|
357 |
+
//
|
358 |
+
// A single alphabetic token followed by a period followed
|
359 |
+
// by a token that does not begin with an upper case letter
|
360 |
+
// or number is taken to be an abbreviation and the period
|
361 |
+
// does not start a new token.
|
362 |
+
//
|
363 |
+
// NOTE: This does not recognize initials in people's names,
|
364 |
+
// that problem is not simply solved.
|
365 |
+
|
366 |
+
void MPtok::tok_14()
|
367 |
+
{
|
368 |
+
for (int i = 0; i < text_len; i++)
|
369 |
+
{
|
370 |
+
if (text[i] == '.'
|
371 |
+
&& i - 1 >= 0 && isalpha(text[i - 1]) && tokflag[i - 1]
|
372 |
+
&& tokflag[i + 1]
|
373 |
+
&& isupper(nextchar(text, i + 1)) == 0
|
374 |
+
&& isdigit(nextchar(text, i + 1)) == 0
|
375 |
+
&& nextchar(text, i + 1) != '('
|
376 |
+
)
|
377 |
+
{
|
378 |
+
tokflag[i] = 0;
|
379 |
+
}
|
380 |
+
}
|
381 |
+
}
|
382 |
+
|
383 |
+
void MPtok::tok_15()
|
384 |
+
{
|
385 |
+
int i, j, k, a;
|
386 |
+
char buff[MAX_ABB + 1];
|
387 |
+
|
388 |
+
for (i = 0; i < text_len; i++)
|
389 |
+
{
|
390 |
+
// only start at a current token
|
391 |
+
|
392 |
+
if (! tokflag[i]) continue;
|
393 |
+
|
394 |
+
// find alphabetic followed by period
|
395 |
+
|
396 |
+
buff[0] = '\0';
|
397 |
+
for (k = 0; i + k < text_len && k < MAX_ABB; k++)
|
398 |
+
{
|
399 |
+
buff[k] = text[i+k]; buff[k+1] = '\0';
|
400 |
+
if (k > 0 && buff[k] == '.') break; // this is good
|
401 |
+
if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
|
402 |
+
}
|
403 |
+
|
404 |
+
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
|
405 |
+
|
406 |
+
// at this point, buff[k] == '.' add 1 to make it the length
|
407 |
+
|
408 |
+
k++;
|
409 |
+
|
410 |
+
// if not found, try finding a concatenated abbrev
|
411 |
+
|
412 |
+
if (! common_abbr.count(buff))
|
413 |
+
{
|
414 |
+
for (; i + k < text_len && k < MAX_ABB; k++)
|
415 |
+
{
|
416 |
+
buff[k] = text[i+k]; buff[k+1] = '\0';
|
417 |
+
if (k > 0 && buff[k] == '.') break; // this is good
|
418 |
+
if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
|
419 |
+
}
|
420 |
+
|
421 |
+
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
|
422 |
+
|
423 |
+
// at this point, buff[k] == '.' add 1 to make it the length
|
424 |
+
|
425 |
+
k++;
|
426 |
+
}
|
427 |
+
|
428 |
+
// if not found, give up
|
429 |
+
|
430 |
+
if (! common_abbr.count(buff)) continue;
|
431 |
+
|
432 |
+
if (common_abbr[buff] == ABB_NUM)
|
433 |
+
{
|
434 |
+
for (j = i + k; j < text_len && isspace(text[j]); j++) ; // next must be a number
|
435 |
+
if (! isdigit(text[j])) continue; // go to next abbreviation
|
436 |
+
} else if (common_abbr[buff] == ABB_EOS)
|
437 |
+
{
|
438 |
+
for (j = i + k; j < text_len && isspace(text[j]); j++) ; // if next token is upper case letter
|
439 |
+
if (isupper(text[j])) tokflag[i + (--k)] = 1; // tokenize the final period of this abbreviation
|
440 |
+
}
|
441 |
+
|
442 |
+
// clear all token flags
|
443 |
+
|
444 |
+
for (j = 1; j < k; j++) tokflag[i + j] = 0;
|
445 |
+
}
|
446 |
+
}
|
447 |
+
|
448 |
+
// Check for common pairs that should not be considered sentence breaks
|
449 |
+
|
450 |
+
void MPtok::tok_15_1()
|
451 |
+
{
|
452 |
+
int i, j, k, tnum, p;
|
453 |
+
char buff[MAX_ABB + 1];
|
454 |
+
|
455 |
+
for (i = 0; i < text_len; i++)
|
456 |
+
{
|
457 |
+
if (! tokflag[i]) continue;
|
458 |
+
|
459 |
+
// must be alphanumeric token followed by period token followed by space followed by alphanumeric token
|
460 |
+
|
461 |
+
tnum = 0;
|
462 |
+
buff[0] = '\0';
|
463 |
+
for (p = k = 0; i + k < text_len && k < MAX_ABB; k++)
|
464 |
+
{
|
465 |
+
buff[k] = text[i+k]; buff[k+1] = '\0';
|
466 |
+
|
467 |
+
if (isspace(buff[k]))
|
468 |
+
{
|
469 |
+
if (tnum == 2) break; // this is good
|
470 |
+
else if (tnum == 1) continue; // ok
|
471 |
+
else { buff[0] = '\0'; break; } // this shouldn't happen
|
472 |
+
}
|
473 |
+
|
474 |
+
if (tokflag[i+k])
|
475 |
+
{
|
476 |
+
if (tnum > 2) break; // done
|
477 |
+
else tnum++;
|
478 |
+
}
|
479 |
+
|
480 |
+
if (tnum == 1 && buff[k] == '.') p = k;
|
481 |
+
if (tnum == 1 && buff[k] != '.') { buff[0] = '\0'; break; } // nope
|
482 |
+
if (! isalnum(buff[k])) { buff[0] = '\0'; break; } // nope
|
483 |
+
}
|
484 |
+
|
485 |
+
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
|
486 |
+
|
487 |
+
// at this point buff is a potential pair, so untokenize the period, that's all
|
488 |
+
|
489 |
+
if (common_pair.count(buff))
|
490 |
+
tokflag[p] = 0;
|
491 |
+
}
|
492 |
+
}
|
493 |
+
|
494 |
+
// Get cases where a space after a sentence has been omitted
|
495 |
+
//
|
496 |
+
// A period that occurs in a token consisting of alphabetic
|
497 |
+
// letters with a vowel to the left and the right is a
|
498 |
+
// separate token.
|
499 |
+
|
500 |
+
void MPtok::tok_16()
|
501 |
+
{
|
502 |
+
int j;
|
503 |
+
int has_vowel;
|
504 |
+
|
505 |
+
for (int i = 0; i < text_len; i++)
|
506 |
+
{
|
507 |
+
if (text[i] == '.' && tokflag[i] == 0)
|
508 |
+
{
|
509 |
+
has_vowel = 0;
|
510 |
+
for (j = i - 1; j >= 0; --j)
|
511 |
+
{
|
512 |
+
if (isalpha(text[j]) == 0)
|
513 |
+
break;
|
514 |
+
if (strchr("aeiouAEIOU", text[j]))
|
515 |
+
has_vowel = 1;
|
516 |
+
if (tokflag[j])
|
517 |
+
break;
|
518 |
+
}
|
519 |
+
if ((j >= 0 && tokflag[j] == 0) || has_vowel == 0)
|
520 |
+
continue;
|
521 |
+
|
522 |
+
j = i + 1;
|
523 |
+
|
524 |
+
has_vowel = 0;
|
525 |
+
for (; j < text_len && tokflag[j] == 0; ++j)
|
526 |
+
{
|
527 |
+
if (isalpha(text[j]) == 0)
|
528 |
+
break;
|
529 |
+
if (strchr("aeiouAEIOU", text[j]))
|
530 |
+
has_vowel = 1;
|
531 |
+
}
|
532 |
+
|
533 |
+
if ((j < text_len && tokflag[j] == 0) || has_vowel == 0)
|
534 |
+
continue;
|
535 |
+
|
536 |
+
tokflag[i] = 1;
|
537 |
+
tokflag[i + 1] = 1;
|
538 |
+
}
|
539 |
+
}
|
540 |
+
}
|
541 |
+
|
542 |
+
// Correction to tok_16,
|
543 |
+
// Don't count if the token before is a single letter
|
544 |
+
// or the token following is a single letter other than 'a'.
|
545 |
+
// Also, don't count if the token to the right is gov, com, edu, etc.
|
546 |
+
// because those are web addresses!
|
547 |
+
|
548 |
+
#define COMPLEX_WINDOW 40
|
549 |
+
|
550 |
+
enum {COMPLEX_NOT = 0, COMPLEX_YES, COMPLEX_DONE};
|
551 |
+
|
552 |
+
struct _complex {
|
553 |
+
int flag;
|
554 |
+
int offset;
|
555 |
+
const char *str;
|
556 |
+
int len;
|
557 |
+
} complex[] = {
|
558 |
+
COMPLEX_YES, 0, "complex", 7,
|
559 |
+
COMPLEX_NOT, 0, "complexi", 8,
|
560 |
+
COMPLEX_NOT, 0, "complexed", 9,
|
561 |
+
COMPLEX_NOT, 0, "complexa", 8,
|
562 |
+
COMPLEX_NOT, 0, "complex-", 8,
|
563 |
+
COMPLEX_NOT, 0, "complexl", 8,
|
564 |
+
COMPLEX_NOT, 0, "complexu", 8,
|
565 |
+
COMPLEX_NOT, -1, "-complex", 7,
|
566 |
+
COMPLEX_NOT, -2, "nocomplex", 9,
|
567 |
+
COMPLEX_NOT, -3, "subcomplex", 10,
|
568 |
+
COMPLEX_YES, 0, "hybrid", 6,
|
569 |
+
COMPLEX_NOT, 0, "hybridi", 7,
|
570 |
+
COMPLEX_NOT, 0, "hybrido", 7,
|
571 |
+
COMPLEX_NOT, 0, "hybrida", 7,
|
572 |
+
COMPLEX_NOT, 0, "hybrid-", 7,
|
573 |
+
COMPLEX_NOT, -1, "-hybrid", 7,
|
574 |
+
COMPLEX_YES, 0, "duplex", 6,
|
575 |
+
COMPLEX_NOT, -1, "oduplex", 7,
|
576 |
+
COMPLEX_DONE, 0, NULL, 0,
|
577 |
+
};
|
578 |
+
|
579 |
+
int MPtok::complex_check()
|
580 |
+
{
|
581 |
+
int last_period = -2*COMPLEX_WINDOW;
|
582 |
+
int last_complex = -2*COMPLEX_WINDOW;
|
583 |
+
int i, j;
|
584 |
+
int complex_match;
|
585 |
+
|
586 |
+
for (i = 0; i < text_len; i++)
|
587 |
+
{
|
588 |
+
if (text[i] == '.')
|
589 |
+
{
|
590 |
+
if (i - last_complex <= COMPLEX_WINDOW)
|
591 |
+
return 1;
|
592 |
+
last_period = i;
|
593 |
+
}
|
594 |
+
|
595 |
+
complex_match = 0;
|
596 |
+
for (j = 0; complex[j].str; j++)
|
597 |
+
{
|
598 |
+
if (complex[j].flag == COMPLEX_NOT)
|
599 |
+
{
|
600 |
+
if (i + complex[j].offset >= 0
|
601 |
+
&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
|
602 |
+
{
|
603 |
+
// don't match here
|
604 |
+
complex_match = 0;
|
605 |
+
}
|
606 |
+
} else if (complex[j].flag == COMPLEX_YES)
|
607 |
+
{
|
608 |
+
if (i + complex[j].offset >= 0
|
609 |
+
&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
|
610 |
+
{
|
611 |
+
// match here
|
612 |
+
complex_match = 1;
|
613 |
+
}
|
614 |
+
}
|
615 |
+
}
|
616 |
+
|
617 |
+
if (complex_match)
|
618 |
+
{
|
619 |
+
if (i - last_period <= COMPLEX_WINDOW)
|
620 |
+
return 1;
|
621 |
+
last_complex = i;
|
622 |
+
}
|
623 |
+
}
|
624 |
+
return 0;
|
625 |
+
}
|
626 |
+
|
627 |
+
void MPtok::tok_16_1()
|
628 |
+
{
|
629 |
+
int i, j;
|
630 |
+
char v1, v2;
|
631 |
+
int c1, c2;
|
632 |
+
|
633 |
+
if (option_new == 3 && strstr(text, "complex"))
|
634 |
+
return;
|
635 |
+
|
636 |
+
if (option_new >= 4 && complex_check())
|
637 |
+
return;
|
638 |
+
|
639 |
+
for (i = 0; i < text_len; i++)
|
640 |
+
{
|
641 |
+
if (text[i] == '.' && tokflag[i] == 0)
|
642 |
+
{
|
643 |
+
char suffix[10];
|
644 |
+
int s_i;
|
645 |
+
|
646 |
+
v1 = '\0';
|
647 |
+
c1 = 0;
|
648 |
+
for (j = i - 1; j >= 0; --j)
|
649 |
+
{
|
650 |
+
if (isalpha(text[j]) == 0)
|
651 |
+
break;
|
652 |
+
if (strchr("aeiouAEIOU", text[j]))
|
653 |
+
v1 = tolower(text[j]);
|
654 |
+
c1++;
|
655 |
+
if (tokflag[j])
|
656 |
+
break;
|
657 |
+
}
|
658 |
+
if ((j >= 0 && tokflag[j] == 0)
|
659 |
+
|| v1 == '\0'
|
660 |
+
|| c1 == 1)
|
661 |
+
continue;
|
662 |
+
|
663 |
+
j = i + 1;
|
664 |
+
|
665 |
+
v2 = '\0';
|
666 |
+
c2 = 0;
|
667 |
+
s_i = 0;
|
668 |
+
for (; j < text_len && tokflag[j] == 0; ++j)
|
669 |
+
{
|
670 |
+
if (isalpha(text[j]) == 0)
|
671 |
+
break;
|
672 |
+
if (strchr("aeiouAEIOU", text[j]))
|
673 |
+
v2 = tolower(text[j]);
|
674 |
+
if (s_i < 3)
|
675 |
+
suffix[s_i++] = tolower(text[j]); suffix[s_i] = '\0';
|
676 |
+
c2++;
|
677 |
+
}
|
678 |
+
|
679 |
+
if ((j < text_len && tokflag[j] == 0)
|
680 |
+
|| v2 == '\0'
|
681 |
+
|| (c2 == 1 && v2 != 'a')
|
682 |
+
|| (c2 == 3 && tokflag[j] == 1 && s_i == 3
|
683 |
+
&& (strcmp(suffix, "gov") == 0
|
684 |
+
|| strcmp(suffix, "edu") == 0
|
685 |
+
|| strcmp(suffix, "org") == 0
|
686 |
+
|| strcmp(suffix, "com") == 0)))
|
687 |
+
continue;
|
688 |
+
|
689 |
+
tokflag[i] = 1;
|
690 |
+
tokflag[i + 1] = 1;
|
691 |
+
}
|
692 |
+
}
|
693 |
+
}
|
694 |
+
|
695 |
+
|
696 |
+
// Numeric endings of sentences
|
697 |
+
//
|
698 |
+
// A period after a numeric token followed by a token that starts
|
699 |
+
// with an alphabetic character, is a separate token.
|
700 |
+
//
|
701 |
+
// This should be covered already by tok_13
|
702 |
+
|
703 |
+
void MPtok::tok_17()
|
704 |
+
{
|
705 |
+
int j;
|
706 |
+
|
707 |
+
for (int i = 0; i < text_len; i++)
|
708 |
+
{
|
709 |
+
if (text[i] == '.'
|
710 |
+
&& tokflag[i] == 0
|
711 |
+
&& tokflag[i + 1])
|
712 |
+
{
|
713 |
+
for (j = i - 1; j >= 0 && isdigit(text[j]) && tokflag[j] == 0; --j)
|
714 |
+
;
|
715 |
+
if (j >= 0 && j < i - 1 && tokflag[j] && isalpha(nextchar(text, i + 1)))
|
716 |
+
tokflag[i] = 1;
|
717 |
+
}
|
718 |
+
}
|
719 |
+
}
|
720 |
+
|
721 |
+
// period at end of string is a token
|
722 |
+
|
723 |
+
void MPtok::tok_20()
|
724 |
+
{
|
725 |
+
for (int i = text_len - 1; i >= 0; --i)
|
726 |
+
{
|
727 |
+
if (isspace(text[i]))
|
728 |
+
continue;
|
729 |
+
|
730 |
+
if (strchr(".!?", text[i]))
|
731 |
+
tokflag[i] = 1;
|
732 |
+
|
733 |
+
break;
|
734 |
+
}
|
735 |
+
}
|
736 |
+
|
737 |
+
// a period that follows a non-common word, and that is
|
738 |
+
// followed by a lower case common word is probably not a token
|
739 |
+
|
740 |
+
void MPtok::tok_20_1()
|
741 |
+
{
|
742 |
+
int j;
|
743 |
+
|
744 |
+
for (int i = 0; i < text_len; ++i)
|
745 |
+
{
|
746 |
+
if (text[i] == '.' && tokflag[i] == 1)
|
747 |
+
{
|
748 |
+
int tcnt, lcnt, ocnt;
|
749 |
+
tcnt = lcnt = ocnt = 0;
|
750 |
+
|
751 |
+
// make sure the previous word was *not* common
|
752 |
+
|
753 |
+
for (j = i - 1; j >= 0; j--)
|
754 |
+
{
|
755 |
+
if (isspace(text[j])) continue;
|
756 |
+
if (option_new >= 2)
|
757 |
+
{
|
758 |
+
if (islower(text[j]) == 0 && text[j] != '-') ocnt++;
|
759 |
+
} else
|
760 |
+
{
|
761 |
+
if (! islower(text[j])) ocnt++;
|
762 |
+
}
|
763 |
+
|
764 |
+
if (tokflag[j] || j == 0)
|
765 |
+
{
|
766 |
+
if (ocnt == 0)
|
767 |
+
{
|
768 |
+
goto nexti;
|
769 |
+
}
|
770 |
+
break;
|
771 |
+
}
|
772 |
+
}
|
773 |
+
|
774 |
+
tcnt = lcnt = ocnt = 0;
|
775 |
+
|
776 |
+
// make sure the next word is common
|
777 |
+
|
778 |
+
for (j = i + 1; j < text_len; j++)
|
779 |
+
{
|
780 |
+
if (isspace(text[j])) continue;
|
781 |
+
if (tokflag[j]) tcnt++;
|
782 |
+
|
783 |
+
if (tcnt == 2 || j == text_len - 1)
|
784 |
+
{
|
785 |
+
if (lcnt > 0 && ocnt == 0) tokflag[i] = 0;
|
786 |
+
break;
|
787 |
+
}
|
788 |
+
|
789 |
+
if (islower(text[j])) lcnt++;
|
790 |
+
else ocnt++;
|
791 |
+
}
|
792 |
+
}
|
793 |
+
nexti: ;
|
794 |
+
}
|
795 |
+
}
|
796 |
+
|
797 |
+
// tokenized period followed by non-space other than close paren
|
798 |
+
// is not a token
|
799 |
+
|
800 |
+
void MPtok::tok_20_2()
|
801 |
+
{
|
802 |
+
int j;
|
803 |
+
|
804 |
+
for (int i = 0; i < text_len - 1; ++i)
|
805 |
+
{
|
806 |
+
if (text[i] == '.' && tokflag[i] == 1
|
807 |
+
&& strchr(" ()[]\"\'\n\t\r", text[i+1]) == 0)
|
808 |
+
{
|
809 |
+
tokflag[i] = 0;
|
810 |
+
}
|
811 |
+
}
|
812 |
+
}
|
813 |
+
|
814 |
+
|
815 |
+
// long dash
|
816 |
+
//
|
817 |
+
// A pair of hyphens is a complete token
|
818 |
+
|
819 |
+
void MPtok::tok_21()
|
820 |
+
{
|
821 |
+
for (int i = 0; i + 1 < text_len; i++)
|
822 |
+
{
|
823 |
+
if (strncmp(&text[i], "--", 2) == 0)
|
824 |
+
{
|
825 |
+
tokflag[i] = 1;
|
826 |
+
if (i + 2 < text_len)
|
827 |
+
{
|
828 |
+
i += 2;
|
829 |
+
tokflag[i] = 1;
|
830 |
+
}
|
831 |
+
}
|
832 |
+
}
|
833 |
+
}
|
834 |
+
|
835 |
+
// hyphens
|
836 |
+
//
|
837 |
+
// If specified as an option, a hyphen between letters is a complete token
|
838 |
+
|
839 |
+
void MPtok::tok_21a()
|
840 |
+
{
|
841 |
+
if (option_hyphen == 0) return;
|
842 |
+
|
843 |
+
for (int i = 0; i + 1 < text_len; i++)
|
844 |
+
{
|
845 |
+
if (text[i] == '-'
|
846 |
+
&& (i == 0 || text[i-1] != '-')
|
847 |
+
&& text[i+1] != '-')
|
848 |
+
{
|
849 |
+
tokflag[i] = 1;
|
850 |
+
tokflag[i+1] = 1;
|
851 |
+
}
|
852 |
+
}
|
853 |
+
}
|
854 |
+
|
855 |
+
|
856 |
+
// quote
|
857 |
+
//
|
858 |
+
// Any double quote is a separate token
|
859 |
+
|
860 |
+
void MPtok::tok_22()
|
861 |
+
{
|
862 |
+
for (int i = 0; i < text_len; i++)
|
863 |
+
{
|
864 |
+
if (text[i] == '"')
|
865 |
+
{
|
866 |
+
tokflag[i] = 1;
|
867 |
+
if (i + 1 < text_len)
|
868 |
+
{
|
869 |
+
i += 1;
|
870 |
+
tokflag[i] = 1;
|
871 |
+
}
|
872 |
+
}
|
873 |
+
}
|
874 |
+
}
|
875 |
+
|
876 |
+
// possessive
|
877 |
+
//
|
878 |
+
// Any single quote at the end of a token that is not
|
879 |
+
// preceded by a single quote is a separate token
|
880 |
+
|
881 |
+
void MPtok::tok_23()
|
882 |
+
{
|
883 |
+
for (int i = 0; i < text_len; i++)
|
884 |
+
{
|
885 |
+
if (text[i] == '\''
|
886 |
+
&& (i - 1 >= 0 && text[i - 1] != '\'')
|
887 |
+
&& tokflag[i + 1])
|
888 |
+
{
|
889 |
+
tokflag[i] = 1;
|
890 |
+
}
|
891 |
+
}
|
892 |
+
}
|
893 |
+
|
894 |
+
|
895 |
+
// quote
|
896 |
+
//
|
897 |
+
// If a single quote starts a token, or is preceded by a
|
898 |
+
// single quote, and followed by a character
|
899 |
+
// that is not a single quote, then
|
900 |
+
// the character to it's right is the start of a new token
|
901 |
+
|
902 |
+
void MPtok::tok_24()
|
903 |
+
{
|
904 |
+
for (int i = 0; i < text_len; i++)
|
905 |
+
{
|
906 |
+
if (text[i] == '\''
|
907 |
+
&& (tokflag[i] == 1 || (i - 1 >= 0 && text[i - 1] == '\''))
|
908 |
+
&& (i + 1 < text_len && text[i + 1] != '\''))
|
909 |
+
{
|
910 |
+
tokflag[i + 1] = 1;
|
911 |
+
}
|
912 |
+
}
|
913 |
+
}
|
914 |
+
|
915 |
+
// put back possessive
|
916 |
+
//
|
917 |
+
// A single quote that is a whole token followed by a lower case s
|
918 |
+
// that is also a whole token (without space between them)
|
919 |
+
// should be merged into a single token
|
920 |
+
|
921 |
+
void MPtok::tok_25()
|
922 |
+
{
|
923 |
+
for (int i = 0; i < text_len; i++)
|
924 |
+
{
|
925 |
+
if (text[i] == '\''
|
926 |
+
&& tokflag[i] == 1
|
927 |
+
&& i + 1 < text_len && text[i + 1] == 's'
|
928 |
+
&& tokflag[i+1] == 1
|
929 |
+
&& (i + 2 >= text_len || isspace(text[i + 2]) || tokflag[i + 2] == 1))
|
930 |
+
{
|
931 |
+
tokflag[i + 1] = 0;
|
932 |
+
}
|
933 |
+
}
|
934 |
+
}
|
935 |
+
|
936 |
+
// quote
|
937 |
+
//
|
938 |
+
// A pair of single quotes is a separate token
|
939 |
+
|
940 |
+
void MPtok::tok_26()
|
941 |
+
{
|
942 |
+
for (int i = 0; i < text_len; i++)
|
943 |
+
{
|
944 |
+
if (strncmp(&text[i], "''", 2) == 0
|
945 |
+
|| strncmp(&text[i], "``", 2) == 0)
|
946 |
+
{
|
947 |
+
tokflag[i] = 1;
|
948 |
+
if (i + 2 < text_len) tokflag[i + 2] = 1;
|
949 |
+
}
|
950 |
+
}
|
951 |
+
}
|
952 |
+
|
953 |
+
// possessive
|
954 |
+
//
|
955 |
+
// A single quote followed by a letter s is a possessive
|
956 |
+
|
957 |
+
void MPtok::tok_27()
|
958 |
+
{
|
959 |
+
for (int i = 0; i < text_len; i++)
|
960 |
+
{
|
961 |
+
if (text[i] == '\''
|
962 |
+
&& i + 1 < text_len
|
963 |
+
&& tolower(text[i + 1]) == 's'
|
964 |
+
&& (i + 2 >= text_len || tokflag[i + 2]))
|
965 |
+
{
|
966 |
+
tokflag[i] = 1;
|
967 |
+
}
|
968 |
+
}
|
969 |
+
}
|
970 |
+
|
971 |
+
// split "cannot" to "can not"
|
972 |
+
//
|
973 |
+
// A single token that is the word cannot (in any case)
|
974 |
+
// is split into two words
|
975 |
+
|
976 |
+
void MPtok::tok_28()
|
977 |
+
{
|
978 |
+
for (int i = 0; i < text_len; i++)
|
979 |
+
{
|
980 |
+
if ((strncmp(&text[i], "cannot", 6) == 0
|
981 |
+
|| strncmp(&text[i], "Cannot", 6) == 0)
|
982 |
+
&& tokflag[i + 6])
|
983 |
+
{
|
984 |
+
tokflag[i + 3] = 1;
|
985 |
+
}
|
986 |
+
}
|
987 |
+
}
|
988 |
+
|
989 |
+
// put list item elements back at sentence end
|
990 |
+
//
|
991 |
+
// A period that is preceded by an alphanumeric (no space)
|
992 |
+
// and any amount of preceding space and an end-mark
|
993 |
+
// stays with the alphanumeric.
|
994 |
+
|
995 |
+
void MPtok::tok_29()
|
996 |
+
{
|
997 |
+
int j;
|
998 |
+
|
999 |
+
for (int i = 0; i < text_len; i++)
|
1000 |
+
{
|
1001 |
+
if (text[i] == '.'
|
1002 |
+
&& tokflag[i] && tokflag[i + 1]
|
1003 |
+
&& i - 1 >= 0 && isalnum(text[i - 1])
|
1004 |
+
&& tokflag[i - 1]
|
1005 |
+
&& ((j = lookbehind(text, i-2, ".", tokflag)) >= 0
|
1006 |
+
|| (j = lookbehind(text, i-2, "?", tokflag)) >= 0
|
1007 |
+
|| (j = lookbehind(text, i-2, "!", tokflag)) >= 0)
|
1008 |
+
&& tokflag[j])
|
1009 |
+
{
|
1010 |
+
tokflag[i] = 0;
|
1011 |
+
}
|
1012 |
+
}
|
1013 |
+
}
|
1014 |
+
|
1015 |
+
// attach list elements to the beginnings of their sentences
|
1016 |
+
// this means, attach the period to the list element
|
1017 |
+
//
|
1018 |
+
// a list element is a single letter or a one or two digits
|
1019 |
+
// which is preceded by an end of sentence ".!?;"
|
1020 |
+
// or colon (provided it doesn't belong to a proportion construct)
|
1021 |
+
|
1022 |
+
void MPtok::tok_29a()
|
1023 |
+
{
|
1024 |
+
int i, j;
|
1025 |
+
|
1026 |
+
for (i = 0; i < text_len; i++)
|
1027 |
+
{
|
1028 |
+
if (text[i] == '.' && tokflag[i])
|
1029 |
+
{
|
1030 |
+
// Look back, make sure the token before the period
|
1031 |
+
// is either single alphanumeric, or at most a two digit number
|
1032 |
+
// and the character before that is a punctuation ".?!:,"
|
1033 |
+
|
1034 |
+
int tcnt, acnt, dcnt, pcnt, ocnt, scnt;
|
1035 |
+
tcnt = acnt = dcnt = pcnt = ocnt = scnt = 0;
|
1036 |
+
char p;
|
1037 |
+
|
1038 |
+
for (j = i - 1; j >= 0; j--)
|
1039 |
+
{
|
1040 |
+
if (isspace(text[j])) { scnt++; continue; }
|
1041 |
+
else if (tcnt == 0 && isalpha(text[j])) ++acnt;
|
1042 |
+
else if (tcnt == 0 && isdigit(text[j])) ++dcnt;
|
1043 |
+
else if (tcnt == 1 && strchr(".!?:;,", text[j])) { pcnt++; p = text[j]; }
|
1044 |
+
else ocnt++;
|
1045 |
+
|
1046 |
+
if (tokflag[j] || j == 0)
|
1047 |
+
{
|
1048 |
+
tcnt++;
|
1049 |
+
if (tcnt == 1 && ocnt == 0 && scnt == 0
|
1050 |
+
&& ((acnt == 1 && dcnt == 0) || (acnt == 0 && dcnt > 0 && dcnt <= 2)))
|
1051 |
+
{
|
1052 |
+
// This is acceptable
|
1053 |
+
} else if (tcnt == 2 && pcnt <= 1 && ocnt == 0 && scnt > 0)
|
1054 |
+
{
|
1055 |
+
if (p == ':')
|
1056 |
+
{
|
1057 |
+
while (--j >= 0 && isspace(text[j]))
|
1058 |
+
;
|
1059 |
+
if (j >= 0 && isdigit(text[j]))
|
1060 |
+
{
|
1061 |
+
// It's probably a proportion
|
1062 |
+
break;
|
1063 |
+
}
|
1064 |
+
}
|
1065 |
+
// Jackpot
|
1066 |
+
tokflag[i] = 0;
|
1067 |
+
} else
|
1068 |
+
{
|
1069 |
+
// This is not
|
1070 |
+
break;
|
1071 |
+
}
|
1072 |
+
scnt = 0;
|
1073 |
+
}
|
1074 |
+
}
|
1075 |
+
}
|
1076 |
+
}
|
1077 |
+
}
|
1078 |
+
|
1079 |
+
// list elements at the beginning of a string
|
1080 |
+
//
|
1081 |
+
// An alphanumeric token followed by a period
|
1082 |
+
// at the beginning of the line stays with the
|
1083 |
+
// alphanumeric
|
1084 |
+
|
1085 |
+
void MPtok::tok_30()
|
1086 |
+
{
|
1087 |
+
int i = 0;
|
1088 |
+
|
1089 |
+
while (isspace(text[i])) i++;
|
1090 |
+
|
1091 |
+
if (isalnum(text[i])
|
1092 |
+
&& tokflag[i]
|
1093 |
+
&& i + 1 < text_len
|
1094 |
+
&& text[i + 1] == '.'
|
1095 |
+
&& tokflag[i + 1])
|
1096 |
+
{
|
1097 |
+
tokflag[i + 1] = 0;
|
1098 |
+
}
|
1099 |
+
}
|
1100 |
+
|
1101 |
+
// process American style numbers
|
1102 |
+
|
1103 |
+
void MPtok::tok_31()
|
1104 |
+
{
|
1105 |
+
int j;
|
1106 |
+
|
1107 |
+
for (int i = 0; i < text_len; i++)
|
1108 |
+
{
|
1109 |
+
if (text[i] == ','
|
1110 |
+
&& i + 3 < text_len
|
1111 |
+
&& tokflag[i] && tokflag[i + 1]
|
1112 |
+
&& isdigit(text[i + 1])
|
1113 |
+
&& isdigit(text[i + 2])
|
1114 |
+
&& isdigit(text[i + 3])
|
1115 |
+
&& i - 1 >= 0 && isdigit(text[i - 1])
|
1116 |
+
)
|
1117 |
+
{
|
1118 |
+
tokflag[i] = 0;
|
1119 |
+
tokflag[i + 1] = 0;
|
1120 |
+
}
|
1121 |
+
}
|
1122 |
+
}
|
1123 |
+
|
1124 |
+
// process British style numbers
|
1125 |
+
|
1126 |
+
void MPtok::tok_32()
|
1127 |
+
{
|
1128 |
+
int j;
|
1129 |
+
|
1130 |
+
for (int i = 0; i < text_len; i++)
|
1131 |
+
{
|
1132 |
+
if (text[i] == ' '
|
1133 |
+
&& i + 3 < text_len
|
1134 |
+
&& tokflag[i] && tokflag[i + 1]
|
1135 |
+
&& isdigit(text[i + 1])
|
1136 |
+
&& isdigit(text[i + 2])
|
1137 |
+
&& isdigit(text[i + 3])
|
1138 |
+
&& i - 1 >= 0 && isdigit(text[i - 1])
|
1139 |
+
)
|
1140 |
+
{
|
1141 |
+
tokflag[i] = 0;
|
1142 |
+
tokflag[i + 1] = 0;
|
1143 |
+
}
|
1144 |
+
}
|
1145 |
+
}
|
1146 |
+
|
// tokenize unicode escapes
//
// Added

void MPtok::tok_33()
{
	int j;

	for (int i = 0; i < text_len; i++)
	{
		if (text[i] == '&')
		{
			if (text[i + 1] == '#')
			{
				for (j = i + 2; isdigit(text[j]); j++)
					;
			} else
			{
				for (j = i + 1; isalpha(text[j]); j++)
					;
			}

			if (text[j] == ';')
			{
				// Tokenize the escape, untokenize everything inside

				tokflag[i] = 1;
				for (i++; i <= j; i++) tokflag[i] = 0;
				tokflag[i] = 1;
			}
		}
	}
}

// Remove tags if they are present

void MPtok::tok_un()
{
	int untok = 0;
	for (int i = 0; text[i]; ++i)
	{
		if (isspace(text[i])) untok = 0;
		if (text[i] == option_tagsep) untok = 1;
		if (untok) text[i] = ' ';
	}
}

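For reference, the scan tok_33 performs can be restated as a small free function; this is an illustrative sketch, not part of MPtok:

	#include <cctype>

	// Return the index of the ';' that closes an entity starting at text[i]
	// ("&#945;" or "&amp;"), or -1 when there is no well-formed entity here.
	static int entity_end(const char *text, int i)
	{
		if (text[i] != '&') return -1;
		int j;
		if (text[i + 1] == '#')
			for (j = i + 2; isdigit((unsigned char)text[j]); j++)
				;
		else
			for (j = i + 1; isalpha((unsigned char)text[j]); j++)
				;
		return (text[j] == ';') ? j : -1;
	}

	// entity_end("&#945;", 0) == 5: tok_33 then flags position 0 as a token
	// start, zeroes the flags inside the escape, and restarts a token after ';'.
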
void MPtok::set_tokflag()
{
	tok_0();
	tok_1();
	tok_2();
	tok_3();

	// step 4 replaces the tag char; this is done at output

	tok_5_6_7();
	tok_8_9();

	tok_10();
	tok_11();
	if (option_new >= 1)
	{
		tok_21();
		tok_21a();
		tok_22();
		tok_23();
		tok_24();
		tok_25();
		tok_26();
		tok_27();
	}
	tok_12();
	tok_13();
	tok_14();
	if (option_new <= 5)
		tok_15();
	if (option_new < 2)
		tok_16();
	tok_17();

	// steps 18 and 19 recognize periods within parens,
	// and this is moved to the segmentation section

	tok_20();
	if (option_new >= 1)
	{
		tok_20_1();
		tok_20_2();
		if (option_new >= 2)
			tok_16_1();
		if (option_new >= 6)
			tok_15();
		if (option_new >= 7)
			tok_15_1();
	}
	if (option_new < 1)
	{
		tok_21();
		tok_21a();
		tok_22();
		tok_23();
		tok_24();
		tok_25();
		tok_26();
		tok_27();
	}
	tok_28();
	if (option_new >= 1)
		tok_29a();
	else
		tok_29();
	tok_30();
	tok_31();
	tok_32();

	tok_33();
}

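set_tokflag is the dispatcher for the numbered passes, and option_new gates which ones run and in what order. A hypothetical driver that pins an older pass ordering before tokenizing, using the public set_new() knob declared in MPtok.h:

	#include "MPtok.h"

	string tokenize_with_old_passes(const string& s)
	{
		MPtok tok;
		tok.set_new(2);  // per the gating above, runs the tok_2x passes after tok_11
		return tok.tokenize(s);
	}
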
/* set_endflag
**
** After tokflag has been set, find the possible sentence endings.
*/

void MPtok::set_endflag()
{
	int i;

	// The following tests look for end-stops and label them.
	// They include steps 18 and 19

	for (i = 0; i <= text_len; i++)
		endflag[i] = 0;

	// Count the number of unmatched parens

	int up = 0; // unmatched round parens
	int ub = 0; // unmatched brackets

	for (i = 0; i < text_len; i++)
	{
		if (text[i] == '(') ++up;
		if (text[i] == ')') --up;
		if (text[i] == '[') ++ub;
		if (text[i] == ']') --ub;
		if (up < 0) up = 0;
		if (ub < 0) ub = 0;
	}

	// Now find the end-of-sentence marks

	// tok_18: periods within parentheses, allow for nesting
	// tok_19: periods within brackets, allow for nesting
	// the perl version solves this by putting the period
	// back with the previous token, but a better solution
	// is to allow it to be tokenized but just don't
	// allow it to be an end-of-sentence.
	// Therefore, these are moved to the segmentation
	// section

	int p = 0; // round parens
	int b = 0; // brackets

	for (i = 0; i < text_len; i++)
	{
		if (text[i] == '(') ++p;
		if (text[i] == ')') --p;
		if (text[i] == '[') ++b;
		if (text[i] == ']') --b;
		if (p < 0) p = 0;
		if (b < 0) b = 0;

		if (strchr(".!?", text[i])
			&& tokflag[i]
			&& tokflag[i + 1])
		{
			if (option_segment && p <= up && b <= ub)
				endflag[i] = 1;

			// This is optional to join periods with
			// probable abbreviations

			if (p > up || b > ub)
				tokflag[i] = 0;
		}
	}

	// endtokens followed by a single or double quote, which matches
	// a single or double quote in the previous sentence

	if (option_new >= 1)
	{
		int dquo, squo;
		dquo = squo = 0;

		for (i = 0; i < text_len; i++)
		{
			if (text[i] == '"') dquo = ! dquo;
			else if (text[i] == '\'') squo = ! squo;
			else if (endflag[i])
			{
				if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
				{
					endflag[i] = 0;

					// But don't end at all if the next token is something
					// other than an upper case letter.

					if (option_new >= 2)
					{
						int j;
						int ok = 0;

						for (j = i + 2; j < text_len; j++)
						{
							if (isspace(text[j])) continue;
							// if (isupper(text[j]))
							if (isupper(text[j]) || text[j] == '(')
							{
								ok = 1;
								break;
							}
							if (tokflag[j]) break;
						}

						if (ok)
							endflag[i+1] = 1;
					} else
					{
						endflag[i+1] = 1;
					}
				}
				dquo = squo = 0;
			}
		}
	}
}

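A worked example of the paren guard: in "He left (see Fig. 2.) early." the parentheses balance over the whole text, so up == 0. At the period after "2" the running depth is p == 1 > up, so that period gets no endflag and its tokflag is cleared (it stays attached to the token). Only the final period, reached at p == 0, can be marked as a sentence end when option_segment is set.
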
/* set_endflag_01
**
** After tokflag has been set, find the possible sentence endings.
** This has improved paren matching.
*/

#define MAX_MATCH 500 // Maximum length to get a paren match

void MPtok::set_endflag_01()
{
	int match[text_len];
	int i, j;

	// The following tests look for end-stops and label them.
	// They include steps 18 and 19

	for (i = 0; i <= text_len; i++)
		endflag[i] = 0;

	for (i = 0; i < text_len; i++)
		match[i] = 0;

	for (i = text_len - 1; i >= 0; i--)
	{
		if (text[i] == '(' || text[i] == '[')
		{
			for (j = i + 1; text[j] && j - i <= MAX_MATCH; j++)
			{
				// Skip parens that are already matched

				if (match[j] > j)
				{
					j = match[j];
					continue;
				}

				// Look for a matching close paren

				if (match[j] == 0
					&& ((text[i] == '(' && text[j] == ')')
					|| (text[i] == '[' && text[j] == ']')))
				{
					match[i] = j;
					match[j] = i;
					break;
				}
			}
		}
	}

	int next_match = 0;
	for (i = 0; i < text_len; i++)
	{
		if (match[i] > next_match)
			next_match = match[i];

		if (strchr(".!?", text[i])
			&& tokflag[i]
			&& tokflag[i + 1]
			&& (option_new <= 4 || option_doteos == 1 || (i > 0 && isspace(text[i-1]) == 0)))
		{
			if (i <= next_match)
				tokflag[i] = 0;
			else if (option_segment)
				endflag[i] = 1;
		}
	}

	// endtokens followed by a single or double quote, which matches
	// a single or double quote in the previous sentence

	int dquo, squo;
	dquo = squo = 0;

	for (i = 0; i < text_len; i++)
	{
		if (option_new <= 7 && text[i] == '"') dquo = ! dquo;
		else if (option_new >= 8 && text[i] == '"' && tokflag[i] && tokflag[i+1]) dquo = ! dquo;
		else if (option_new <= 7 && text[i] == '\'') squo = ! squo;
		else if (option_new >= 8 && text[i] == '\''
			&& tokflag[i] && (tokflag[i+1] || (text[i+1] == '\'' && tokflag[i+2]))) squo = ! squo;
		else if (endflag[i])
		{
			if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
			{
				endflag[i] = 0;

				// But don't end at all if the next token is something
				// other than an upper case letter.

				if (option_new >= 2)
				{
					int j;
					int ok = 0;

					for (j = i + 2; j < text_len; j++)
					{
						if (isspace(text[j])) continue;
						// if (isupper(text[j]))
						if (isupper(text[j]) || text[j] == '(')
						{
							ok = 1;
							break;
						}
						if (tokflag[j]) break;
					}

					if (ok)
						endflag[i+1] = 1;
				} else
				{
					endflag[i+1] = 1;
				}
			}
			dquo = squo = 0;
		}
	}
}

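To see the matching pass at work on "a (b [c]) d" (indices 0-10): the backward scan first pairs '[' at 5 with ']' at 7 (match[5] = 7, match[7] = 5), then '(' at 2 skips over that already-matched stretch and pairs with ')' at 8 (match[2] = 8). In the forward scan, next_match reaches 8 while inside the parenthetical, so any '.', '!' or '?' up to index 8 is demoted (tokflag cleared) instead of being marked as a sentence end.
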
// Size buffer: return the size of the buffer required to hold all of the tokenized text.
// It can be simply estimated by a formula that depends only on the length of the text and the number of tokens.

int MPtok::size_buff()
{
	int size = 1; // Start with null terminator
	int t = option_pretag.size(); // for each tag, the length of the UNTAG string

	if (t <= 0) t = 1; // Make sure there is at least one
	t += 2; // Add one for underscore and one for space

	for (int i = 0; i < text_len; i++)
	{
		size++; // Count all characters
		if (tokflag[i]) size += t; // Count token delimiters (may overcount)
		if (endflag[i]) size++; // Add one for newline
	}
	return size;
}

+
/* append_token
|
1531 |
+
**
|
1532 |
+
** Save a single token to a buffer.
|
1533 |
+
*/
|
1534 |
+
|
1535 |
+
void MPtok::append_token(string& buff, int& sp, char *tok, int ef)
|
1536 |
+
{
|
1537 |
+
// Convert tag separator chars and back quotes (?)
|
1538 |
+
|
1539 |
+
for (int i = 0; tok[i]; i++)
|
1540 |
+
{
|
1541 |
+
if (tok[i] == option_tagsep) tok[i] = option_replacesep;
|
1542 |
+
if (tok[i] == '`') tok[i] = '\'';
|
1543 |
+
}
|
1544 |
+
|
1545 |
+
// Skip whitespace if tokens are being output
|
1546 |
+
// Otherwise, skip whitespace at the start of a sentence
|
1547 |
+
|
1548 |
+
if (option_token || ! sp) while (isspace(*tok)) ++tok;
|
1549 |
+
|
1550 |
+
// Save the token
|
1551 |
+
|
1552 |
+
if (strlen(tok) > 0)
|
1553 |
+
{
|
1554 |
+
// Add delimiter if needed
|
1555 |
+
|
1556 |
+
if (option_token && sp) buff += ' ';
|
1557 |
+
|
1558 |
+
// Append token to output
|
1559 |
+
|
1560 |
+
if (option_new < 9)
|
1561 |
+
{
|
1562 |
+
while (*tok && (! option_token || ! isspace(*tok)))
|
1563 |
+
buff += *(tok++);
|
1564 |
+
} else
|
1565 |
+
{
|
1566 |
+
while (*tok)
|
1567 |
+
buff += *(tok++);
|
1568 |
+
}
|
1569 |
+
|
1570 |
+
sp = 1;
|
1571 |
+
|
1572 |
+
// Add tag holders
|
1573 |
+
|
1574 |
+
if (option_token && option_pretag.size() > 0)
|
1575 |
+
{
|
1576 |
+
buff += option_tagsep;
|
1577 |
+
buff += option_pretag;
|
1578 |
+
}
|
1579 |
+
|
1580 |
+
// If it was end of sentence, then add newline
|
1581 |
+
|
1582 |
+
if (ef)
|
1583 |
+
{
|
1584 |
+
buff += '\n';
|
1585 |
+
sp = 0;
|
1586 |
+
}
|
1587 |
+
}
|
1588 |
+
}
|
1589 |
+
|
1590 |
+
// Strip whitespace after sentences
|
1591 |
+
|
1592 |
+
static void adjust_space(string& buff)
|
1593 |
+
{
|
1594 |
+
while (buff.size() > 0 && isspace(buff[0])) buff.erase(0, 1);
|
1595 |
+
|
1596 |
+
// delete two spaces in a row, but keep newlines
|
1597 |
+
|
1598 |
+
for (int i = 1; i < buff.size(); i++)
|
1599 |
+
{
|
1600 |
+
if (isspace(buff[i]) && isspace(buff[i-1]))
|
1601 |
+
buff.erase((buff[i] == '\n')?(--i):(i--), 1);
|
1602 |
+
}
|
1603 |
+
|
1604 |
+
for (int i = buff.size() - 1; i >= 0 && isspace(buff[i]); i--)
|
1605 |
+
buff.erase(i, 1);
|
1606 |
+
}
|
1607 |
+
|
1608 |
+
/* token_string
|
1609 |
+
**
|
1610 |
+
** After the tokflag and endflag have been set, copy the tokens to the buffer.
|
1611 |
+
*/
|
1612 |
+
|
1613 |
+
string MPtok::token_string()
|
1614 |
+
{
|
1615 |
+
string buff;
|
1616 |
+
|
1617 |
+
int i;
|
1618 |
+
|
1619 |
+
// Move token starts to non-whitespace chars
|
1620 |
+
|
1621 |
+
int last_tok = 0;
|
1622 |
+
for (i = 0; i < text_len; i++)
|
1623 |
+
{
|
1624 |
+
if (tokflag[i] == 1 && isspace(text[i]))
|
1625 |
+
{
|
1626 |
+
tokflag[i] = 0;
|
1627 |
+
last_tok = 1;
|
1628 |
+
} else if (isspace(text[i]) == 0 && last_tok)
|
1629 |
+
{
|
1630 |
+
tokflag[i] = 1;
|
1631 |
+
last_tok = 0;
|
1632 |
+
}
|
1633 |
+
}
|
1634 |
+
|
1635 |
+
// Extract the tokens and print them out now
|
1636 |
+
|
1637 |
+
char *tok = new char[text_len + 1];
|
1638 |
+
int pos = 0;
|
1639 |
+
int sp = 0;
|
1640 |
+
int ef = 0;
|
1641 |
+
|
1642 |
+
tok[pos] = '\0';
|
1643 |
+
|
1644 |
+
for (i = 0; i <= text_len; i++)
|
1645 |
+
{
|
1646 |
+
// The start of a new token
|
1647 |
+
|
1648 |
+
if (tokflag[i])
|
1649 |
+
{
|
1650 |
+
// Print the current token
|
1651 |
+
|
1652 |
+
append_token(buff, sp, tok, ef);
|
1653 |
+
|
1654 |
+
// Start a new token
|
1655 |
+
|
1656 |
+
pos = 0;
|
1657 |
+
tok[pos] = '\0';
|
1658 |
+
|
1659 |
+
ef = 0;
|
1660 |
+
}
|
1661 |
+
|
1662 |
+
// Append to the current token
|
1663 |
+
|
1664 |
+
tok[pos++] = text[i];
|
1665 |
+
tok[pos] = '\0';
|
1666 |
+
|
1667 |
+
// If any of the characters in the token are endflagged,
|
1668 |
+
// Then pass this information along for end-of-sentence
|
1669 |
+
|
1670 |
+
if (endflag[i]) ef = 1;
|
1671 |
+
}
|
1672 |
+
|
1673 |
+
// Print the last token
|
1674 |
+
|
1675 |
+
append_token(buff, sp, tok, ef);
|
1676 |
+
|
1677 |
+
delete[] tok;
|
1678 |
+
|
1679 |
+
// Adjust the end of sentence boundaries
|
1680 |
+
|
1681 |
+
adjust_space(buff);
|
1682 |
+
|
1683 |
+
return buff;
|
1684 |
+
}
|
1685 |
+
|
1686 |
+
void MPtok::map_escapes()
|
1687 |
+
{
|
1688 |
+
char *s;
|
1689 |
+
int j, k, ch;
|
1690 |
+
char buff[10];
|
1691 |
+
|
1692 |
+
k = 0;
|
1693 |
+
for (int i = 0; text[i]; i++)
|
1694 |
+
{
|
1695 |
+
if (text[i] == '&' && text[i + 1] == '#')
|
1696 |
+
{
|
1697 |
+
for (s = &buff[0], j = 2; j <= 4 && i + j < text_len && isdigit(text[i + j]); j++)
|
1698 |
+
*s++ = text[i + j];
|
1699 |
+
*s = '\0';
|
1700 |
+
ch = atoi(buff);
|
1701 |
+
if (strlen(buff) > 0 && text[i + j] == ';' && ch > 0 && ch <= 256)
|
1702 |
+
{
|
1703 |
+
text[k] = ch;
|
1704 |
+
if (! text[k]) text[k] = ' ';
|
1705 |
+
k++;
|
1706 |
+
i = i + j;
|
1707 |
+
continue;
|
1708 |
+
}
|
1709 |
+
}
|
1710 |
+
text[k++] = text[i];
|
1711 |
+
}
|
1712 |
+
text[k] = '\0';
|
1713 |
+
text_len = k;
|
1714 |
+
}
|
1715 |
+
|
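For instance, "T&#65;G" is rewritten in place by map_escapes: the digits "65" are collected into buff, text[i + j] is the closing ';', and atoi gives 65 ('A'), so the buffer becomes "TAG" and text_len drops from 7 to 3.
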
MPtok::MPtok(string idir, const string& cnam)
{
	tok_initialized = 0;

	if (idir.size() == 0)
	{
		char *p = getenv("MEDPOST_HOME");
		if (p && strlen(p))
		{
			idir = p;

			string::size_type found = idir.find("=");
			if (found != string::npos)
				idir = idir.substr(found + 1);
		}
	}

	if (idir.size() == 0)
	{
		char buff[1000];
		FILE *fp = fopen("path_medpost", "r");
		if (fp)
		{
			if (fgets(buff, 1000, fp))
			{
				chomp(buff);
				idir = &buff[0];
			}
			fclose(fp);
		}
	}

	if (idir.size() == 0)
		idir = "/home/natxie/CPP64/lib/FIXED_DATA/";

	option_dir = idir;

	option_token = 1;
	option_segment = 1;
	option_hyphen = 0;
	option_comma = 1;
	option_pretok = 0;
	option_new = MPTOK_VERSION;
	option_doteos = 0;

	if (cnam.size() > 0)
	{
		option_cnam = "_";
		option_cnam += cnam;
	}

	init();
}

void MPtok::init(void)
{
	if (tok_initialized) return;

	string fname;

	fname = option_dir + "/medpost" + option_cnam + ".pairs";
	init_pair(fname);

	fname = option_dir + "/medpost" + option_cnam + ".abbr";
	init_abbr(fname);

	tok_initialized = 1;
}

MPtok::~MPtok()
{
}

// Global tokenizer

string MPtok::tokenize(const string& txt, int mt)
{
	if (option_pretok) return save_string(txt);

	option_token = mt;
	text_len = txt.size();
	if (text_len == 0) return string("");

	text = new char[text_len + 1];
	strcpy(text, txt.c_str());

	map_escapes();

	if (text_len == 0)
	{
		delete[] text; text = NULL;
		return string("");
	}

	tokflag = new int[text_len + 1];
	endflag = new int[text_len + 1];

	set_tokflag();
	if (option_new < 3)
		set_endflag();
	else
		set_endflag_01();

	string buff = token_string();
	save_string(buff);

	delete[] text; text = NULL;
	delete[] tokflag; tokflag = NULL;
	delete[] endflag; endflag = NULL;

	return buff;
}

string MPtok::tokenize(const string& text)
{
	return tokenize(text, 1);
}

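A usage sketch of this interface (hypothetical driver; the constructor picks up MEDPOST_HOME or a local path_medpost file as shown above, and segment(), defined next, additionally records sentences):

	#include "MPtok.h"
	#include <iostream>

	int main()
	{
		MPtok tok;
		cout << tok.tokenize("We measured IL-2 (n=5).") << endl;
		return 0;
	}
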
string MPtok::segment(const string& text)
{
	sent.clear();

	// tokenize the text

	int save_option_segment = option_segment;
	option_segment = 1;
	string buff = tokenize(text, 0);
	option_segment = save_option_segment;

	if (buff.size() == 0) return text;

	string::size_type found;
	string::size_type pos = 0;

	while (pos < buff.size())
	{
		found = buff.find('\n', pos);
		if (found == string::npos)
		{
			sent.push_back(buff.substr(pos));
			pos = buff.size();
		} else
		{
			sent.push_back(buff.substr(pos, found - pos));
			pos = found + 1;
		}
	}

	return buff;
}

string MPtok::save_string(const string& s)
{
	stringstream ss (stringstream::in | stringstream::out);
	string w, t;
	string::size_type found;
	string ret;

	word.clear();
	tag.clear();

	ss << s;
	while (ss.good())
	{
		ss >> w;
		if (w.size() == 0) break;

		found = w.find('_');

		if (found != string::npos)
		{
			t = w.substr(found + 1);
			w.resize(found);
			word.push_back(w);
			tag.push_back(t);
		} else
		{
			word.push_back(w);
			tag.push_back(option_pretag);
		}
		if (ret.size() > 0) ret += " ";
		ret += w;
	}

	// now look for continuation tags...

	for (int i = 0; i < word.size(); i++)
	{
		int j = (int)tag[i].size() - 1;
		if (j >= 0 && tag[i][j] == '+' && i < tag.size() - 1)
		{
			word[i] = word[i] + " " + word[i + 1];
			tag[i] = tag[i + 1];
			word.erase(word.begin() + i + 1, word.begin() + i + 2);
			tag.erase(tag.begin() + i + 1, tag.begin() + i + 2);
			i--;
		}
	}

	return ret;
}

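Continuation tags let one token span several input words. As a worked example (tag names purely illustrative): given the pretagged input "in_II+ vitro_RR", the '+' on the first tag merges the pair, leaving word = { "in vitro" } and tag = { "RR" }, while the returned string is "in vitro".
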
static int count_words(const char *s)
{
	int i;

	i = 1;
	for (; *s; ++s)
	{
		if (*s == ' ') ++i;
	}
	return i;
}

static void print_word(const char *s, int i)
{
	for (; i > 0 && *s; ++s) { if (*s == ' ') --i; }
	while (*s && *s != ' ') { printf("%c", *s); ++s; }
}

void MPtok::print(int how)
{
	int i, w;

	if (how != 0 && how != 2)
	{
		printf("print(%d) not defined\n", how);
		return;
	}

	for (i = 0; i < word.size(); ++i)
	{
		// Get the words from an idiom

		for (w = 0; w < count_words(word[i].c_str()); ++w)
		{
			if (how == 2 && i + w > 0) printf(" ");

			print_word(word[i].c_str(), w);

			if (how == 0)
			{
				printf(" tagged %s", tag[i].c_str());
				if (w < count_words(word[i].c_str()) - 1) printf("+");
				printf("\n");
			} else if (how == 2)
			{
				printf("%s%s", "_", tag[i].c_str());
				if (w < count_words(word[i].c_str()) - 1) printf("+");
			}
		}
	}
	if (how == 2)
		printf("\n");
}

void MPtok::merge_words(int s, int n)
{
	string tmp = word[s];

	for (int i = s + 1; i < s + n; i++)
	{
		tmp += " ";
		tmp += word[i];
	}

	// printf("merging words : '%s' n = %d\n", tmp.c_str(), n);

	for (int k = s; k + n < word.size(); k++)
	{
		word[k+1] = word[k+n];
		tag[k+1] = tag[k+n];
	}

	// Fixup the remaining array

	word.resize(word.size() - n + 1);
	tag.resize(word.size());

	word[s] = tmp;
}

void MPtok::split_words()
{
	for (int i = 0; i < word.size(); i++)
	{
		string::size_type found = word[i].find(' ');

		if (found != string::npos)
		{
			string tmp1(word[i], 0, found);
			string tmp2(word[i], found + 1, string::npos);

			// Move all the words and tags down

			word.resize(word.size() + 1);
			tag.resize(tag.size() + 1);

			for (int j = word.size() - 1; j > i; j--)
			{
				word[j] = word[j - 1];
				tag[j] = tag[j - 1];
			}

			word[i] = tmp1;
			tag[i] = tag[i+1];
			tag[i] += "+";

			word[i+1] = tmp2;
		}
	}
}

// Callable functions to set internal options

void MPtok::set_segment(int i) { option_segment = i; }
void MPtok::set_hyphen(int i) { option_hyphen = i; }
void MPtok::set_comma(int i) { option_comma = i; }
void MPtok::set_pretag(char *a) { option_pretag = a; }
void MPtok::set_pretok(int i) { option_pretok = i; }
void MPtok::set_new(int i) { option_new = i; }
void MPtok::set_doteos(int i) { option_doteos = i; }
Library/MPtok.h
CHANGED
@@ -1,141 +1,141 @@
#ifndef _MPTOK_H
#define _MPTOK_H

#include <stdio.h>

#include <string>
#include <vector>
#include <map>
#include <set>

using namespace std;

#define MPTOK_VERSION 11 // The latest version

// Maximum number of words in a sentence

#define MAX_WORDS 10000

enum { ABB_ABB, ABB_EOS, ABB_NUM };
#define MAX_ABB 100

/*! \brief A class to perform tokenization.
 *
 * The MPtok class can be used to perform tokenization and segmentation
 * of strings into tokens or sentences. It is inherited and used by MPtag,
 * so if the user is only interested in tagging, this class does not
 * need to be referenced.
 */

class MPtok
{
public:
	/// \brief An MPtok object, giving the install directory \p idir where data files can be found
	MPtok(string idir = "", const string& cnam = "");
	~MPtok();

	void init(); // Initialize (call only once)
	void init(const string& idir) { option_dir = idir; init(); } // Initialize using specified install directory

	string option_pretag; // The tag to use on tokens
	int option_segment; // Segment into sentences
	int option_hyphen; // Hyphens are separate tokens
	int option_comma; // Commas are always tokenized
	int option_pretok; // The text is pre-tokenized
	int option_new; // Use new algorithms, used in development only
	int option_doteos; // If " . " occurs, it's always an EOS (new >= 5)

	void set_segment(int i); ///< \brief Sentences are broken up during tokenization (default 1)
	void set_token(int i); ///< \brief Break tokens apart with white space (default 1)
	void set_hyphen(int i); ///< \brief Hyphens are separate tokens (default 0)
	void set_comma(int i); ///< \brief Commas are separate tokens (default 1)
	void set_pretag(char *a); ///< \brief Use this tag on all tokens (default empty string)
	void set_pretok(int i); ///< \brief Assume string is already tokenized using spaces (default 0)
	void set_new(int i); ///< \brief Use a previous algorithm (defaults to most recent)
	void set_doteos(int i); ///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)

	void merge_words(int s, int n); // merge n words starting at s into one idiom
	void split_words(void); // split all merged words

	string tokenize(const string&); ///< \brief Tokenize, save (in \p word), and return space delimited tokens
	string segment(const string&); ///< \brief Segment, save (in \p sent), and return newline delimited sentences

	string save_string(const string&); // save a buffer
	string tokenize_nosave(const string&); // tokenize without saving
	string tokenize(const string&,int); // do tokenization with or without inserting spaces between tokens

	void print(int); ///< \brief Print tokens/tags with given verbosity

	vector<string> word; ///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
	vector<string> tag; ///< \brief Vector of tags of most recently tagged (or tokenized) text
	vector<string> sent; ///< \brief Vector of sentences of most recently sentence-segmented text

	char *text; // Input text arg
	int text_len; // Its length
	int *tokflag; // token flags
	int *endflag; // end-sentence flags

	string option_cnam; // A suffix, for opening variant support files
	string option_dir; // Directory to find things

protected:

	void set_tokflag();
	void set_endflag();
	void set_endflag_01();
	int size_buff();

	void init_pair(const string& file_name); // read a file of common pairs
	void init_abbr(const string& file_name); // read a file of abbreviations

	void tok_0();
	void tok_1();
	void tok_2();
	void tok_3();
	void tok_5_6_7();
	void tok_8_9();
	void tok_10();
	void tok_11();
	void tok_12();
	void tok_13();
	void tok_14();
	void tok_15();
	void tok_15_1();
	void tok_16();
	void tok_16_1();
	void tok_17();
	void tok_20();
	void tok_20_1();
	void tok_20_2();
	void tok_21();
	void tok_21a();
	void tok_22();
	void tok_23();
	void tok_24();
	void tok_25();
	void tok_26();
	void tok_27();
	void tok_28();
	void tok_29();
	void tok_29a();
	void tok_30();
	void tok_31();
	void tok_32();
	void tok_33();
	int complex_check();
	void map_escapes();
	void tok_un();

	void append_token(string&, int&, char*, int);
	string token_string();

	set<string> common_pair;
	map<string,int> common_abbr;

private:
	int option_token; // Output tokenized text (only use internally)
	int tok_initialized; // is it inited?
};

#endif
Library/Makefile
CHANGED
@@ -1,13 +1,13 @@
SRC_DIR=./
TRASHFILES = *.o *~ *.bak core
LIB_INC=-I./
#.KEEP_STATE:
libops.a: runn.o Btree.o FBase.o Hash.o MPtok.o \
	AbbrStra.o AbbrvE.o Ab3P.o
	ar rus $@ $?
OS=-g
%.o: $(SRC_DIR)/%.C
	g++ -c $(OS) $< -o $@ $(LIB_INC)

# Recipes must sit on their own tab-indented line, not on the rule line.
clean:
	rm -f $(TRASHFILES)
Library/WordData/Ab3P_prec.dat
CHANGED
@@ -1,145 +1,145 @@
Al 1 FirstLetOneChSF 0.967224
Al 2 FirstLet 0.99818
Al 2 FirstLetGen 0.994292
Al 2 WithinWrdFWrd 0.989054
Al 2 FirstLetGenStp 0.970019
Al 2 ContLet 0.96935
Al 2 WithinWrdFLet 0.941981
Al 2 FirstLetGenSkp 0.949988
Al 2 WithinWrdFWrdSkp 0.947364
Al 2 ContLetSkp 0.877216
Al 2 WithinWrdWrd 0.74768
Al 2 WithinWrdFLetSkp 0.640805
Num 2 ContLet 0.975372
Num 2 ContLetSkp 0.96617
Num 2 WithinWrdFWrdSkp 0.988426
Num 2 FirstLetGen2 0.909995
Num 2 FirstLetGenStp 0.856401
Num 2 FirstLetGenSkp 0.858132
Num 2 WithinWrdFWrd 0.726155
Num 2 WithinWrdFLetSkp 0.607829
Num 2 WithinWrdFLet 0.493922
Spec 2 FirstLetGen2 0.854368
Spec 2 FirstLetGenStp 0.664622
Spec 2 FirstLetGenSkp 0.657475
Al 3 FirstLet 0.999808
Al 3 FirstLetGen 0.999408
Al 3 FirstLetGenS 0.998732
Al 3 WithinWrdFWrd 0.997824
Al 3 FirstLetGenStp 0.997839
Al 3 FirstLetGenStp2 0.997264
Al 3 FirstLetGenSkp 0.988583
Al 3 ContLet 0.987697
Al 3 WithinWrdFWrdSkp 0.981107
Al 3 WithinWrdFLet 0.981322
Al 3 ContLetSkp 0.968185
Al 3 WithinWrdWrd 0.9437
Al 3 WithinWrdFLetSkp 0.904799
Al 3 WithinWrdLet 0.663735
Al 3 AnyLet 0.303503
Num 3 FirstLetGen2 0.998497
Num 3 WithinWrdFWrd 0.99964
Num 3 FirstLetGenStp 0.998807
Num 3 FirstLetGenStp2 0.991256
Num 3 FirstLetGenSkp 0.991202
Num 3 ContLet 0.996938
Num 3 WithinWrdFWrdSkp 0.998821
Num 3 WithinWrdFLet 0.985676
Num 3 ContLetSkp 0.995076
Num 3 WithinWrdWrd 0.999245
Num 3 WithinWrdFLetSkp 0.971123
Num 3 WithinWrdLet 0.819989
Num 3 AnyLet 0.797932
Spec 3 FirstLetGen2 0.978311
Spec 3 FirstLetGenStp 0.977779
Spec 3 FirstLetGenStp2 0.929197
Spec 3 WithinWrdFWrd 0.930654
Spec 3 ContLet 0.923911
Spec 3 FirstLetGenSkp 0.904086
Spec 3 WithinWrdFWrdSkp 0.893989
Spec 3 ContLetSkp 0.851583
Spec 3 WithinWrdFLet 0.712331
Spec 3 WithinWrdFLetSkp 0.64667
Spec 3 WithinWrdWrd 0.428
Al 4 FirstLet 0.999964
Al 4 FirstLetGen 0.99993
Al 4 FirstLetGenS 0.999811
Al 4 WithinWrdFWrd 0.999616
Al 4 FirstLetGenStp 0.999868
Al 4 FirstLetGenStp2 0.999948
Al 4 FirstLetGenSkp 0.998534
Al 4 ContLet 0.992792
Al 4 WithinWrdFWrdSkp 0.997097
Al 4 WithinWrdFLet 0.992955
Al 4 ContLetSkp 0.985568
Al 4 WithinWrdWrd 0.995823
Al 4 WithinWrdFLetSkp 0.976873
Al 4 WithinWrdLet 0.917863
Al 4 AnyLet 0.696532
Num 4 FirstLetGen2 0.99992
Num 4 WithinWrdFWrd 0.999835
Num 4 FirstLetGenStp 0.999903
Num 4 FirstLetGenStp2 0.999936
Num 4 FirstLetGenSkp 0.999577
Num 4 ContLet 0.999555
Num 4 WithinWrdFWrdSkp 0.999885
Num 4 WithinWrdFLet 0.9975
Num 4 ContLetSkp 0.998578
Num 4 WithinWrdWrd 0.997703
Num 4 WithinWrdFLetSkp 0.996501
Num 4 WithinWrdLet 0.986326
Num 4 AnyLet 0.953126
Spec 4 FirstLetGen2 0.99278
Spec 4 FirstLetGenStp 0.98597
Spec 4 FirstLetGenStp2 0.982127
Spec 4 WithinWrdFWrd 0.997649
Spec 4 ContLet 0.980869
Spec 4 FirstLetGenSkp 0.944843
Spec 4 WithinWrdFWrdSkp 0.985685
Spec 4 ContLetSkp 0.973983
Spec 4 WithinWrdFLet 0.992773
Spec 4 WithinWrdFLetSkp 0.863247
Spec 4 WithinWrdWrd 0.931745
Spec 4 WithinWrdLet 0.418068
Spec 4 AnyLet 0.223562
Al 5 FirstLet 0.999979
Al 5 FirstLetGen 0.999979
Al 5 FirstLetGenS 0.999913
Al 5 WithinWrdFWrd 0.999928
Al 5 FirstLetGenStp 0.999989
Al 5 FirstLetGenStp2 0.999887
Al 5 FirstLetGenSkp 0.999852
Al 5 ContLet 0.997596
Al 5 WithinWrdFWrdSkp 0.999602
Al 5 WithinWrdFLet 0.997473
Al 5 ContLetSkp 0.989703
Al 5 WithinWrdWrd 0.999812
Al 5 WithinWrdFLetSkp 0.986066
Al 5 WithinWrdLet 0.889324
Al 5 AnyLet 0.73859
Num 5 FirstLetGen2 0.999987
Num 5 WithinWrdFWrd 0.999922
Num 5 FirstLetGenStp 0.99998
Num 5 FirstLetGenStp2 1
Num 5 FirstLetGenSkp 0.999901
Num 5 ContLet 0.999613
Num 5 WithinWrdFWrdSkp 0.999937
Num 5 WithinWrdFLet 0.999386
Num 5 ContLetSkp 0.999312
Num 5 WithinWrdWrd 1
Num 5 WithinWrdFLetSkp 0.998939
Num 5 WithinWrdLet 0.996068
Num 5 AnyLet 0.986193
Spec 5 FirstLetGen2 0.999701
Spec 5 FirstLetGenStp 0.9999
Spec 5 FirstLetGenStp2 0.999757
Spec 5 WithinWrdFWrd 0.999517
Spec 5 ContLet 0.994648
Spec 5 FirstLetGenSkp 0.997065
Spec 5 WithinWrdFWrdSkp 0.998513
Spec 5 ContLetSkp 0.992445
Spec 5 WithinWrdFLet 0.996623
Spec 5 WithinWrdFLetSkp 0.978026
Spec 5 WithinWrdWrd 0.996879
Spec 5 WithinWrdLet 0.862993
Spec 5 AnyLet 0.745608
Library/WordData/Lf1chSf
CHANGED
The diff for this file is too large to render.
See raw diff
Library/WordData/stop
CHANGED
@@ -1,313 +1,313 @@
a
about
above
across
after
afterwards
again
against
al
all
almost
alone
along
already
also
although
always
am
among
amongst
an
analyze
and
another
any
anyhow
anyone
anything
anywhere
applicable
apply
are
around
as
assume
at
be
became
because
become
becomes
becoming
been
before
beforehand
being
below
beside
besides
between
beyond
both
but
by
came
cannot
cc
cm
come
compare
could
de
dealing
department
depend
did
discover
dl
do
does
during
each
ec
ed
effected
eg
either
else
elsewhere
enough
et
etc
ever
every
everyone
everything
everywhere
except
find
for
found
from
further
get
give
go
gov
had
has
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hr
ie
if
ii
iii
in
inc
incl
indeed
into
investigate
is
it
its
itself
j
jour
journal
just
kg
last
latter
latterly
lb
ld
letter
like
ltd
made
make
many
may
me
meanwhile
mg
might
ml
mm
mo
more
moreover
most
mostly
mr
much
must
my
myself
namely
neither
never
nevertheless
next
no
nobody
noone
nor
not
nothing
now
nowhere
of
off
often
on
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
oz
per
perhaps
pm
precede
presently
previously
pt
rather
regarding
relate
said
same
seem
seemed
seeming
seems
seriously
several
she
should
show
showed
shown
since
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
studied
sub
such
take
tell
th
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
this
thorough
those
though
through
throughout
thru
thus
to
together
too
toward
towards
try
type
ug
under
unless
until
up
upon
us
used
using
various
very
via
was
we
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whom
whose
why
will
with
within
without
wk
would
wt
yet
you
your
yours
yourself
yourselves
yr
Library/runn.C
CHANGED
@@ -1,216 +1,216 @@
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <sstream>
#include <iomanip>
#include <cstring>
#include <cmath>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "runn.h"
using namespace std;
namespace iret {

int mark(int pflag, long ct, int ivl, const char *what){
  if(pflag&&((ct%ivl)==0)){cout << what << " count=" << ct << endl;
    return(1);}
  else return(0);
}

int get_qflag(){
  int pflag=1;
  ifstream fin("quiet.flag",ios::in);
  if(fin.is_open()){
    fin >> pflag;
    fin.close();
    fin.clear();
  }
  return(pflag);
}

int get_pathw(char *nam,const char *pfl,const char *pex,const char *ch){
  char cnam[256];

  strcpy(cnam,"path_");
  strcat(cnam,pfl);
  strcat(cnam,"_");
  strcat(cnam,pex);
  strcat(cnam,".");
  strcat(cnam,ch);
  ifstream fin(cnam,ios::in);
  if(!fin.is_open()){
    fin.clear();
    strcpy(cnam,"path_");
    strcat(cnam,pfl);
    strcat(cnam,"_");
    strcat(cnam,pex);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path_");
      strcat(cnam,pfl);
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        fin.clear();
        strcpy(cnam,"path");
        fin.open(cnam,ios::in);
        if(!fin.is_open()){
          cout << "Path file for type " << pfl
               << " does not exist!" << endl;
          exit(1);
        }
      }
    }
  }

  fin.getline(nam,256);
  fin.close();
  strcat(nam,pfl);
  strcat(nam,"_");
  strcat(nam,pex);
  strcat(nam,".");
  strcat(nam,ch);
  return(1);
}

char *add_num(const char *ptr,long n,char *buf){
  char cnam[100];
  long_str(cnam,n);
  strcpy(buf,ptr);
  strcat(buf,cnam);
  return(buf);
}

long gseed(int x, char **v, const char *c){
  long seed;

  seed=clnga(x,v,c,"seed for random number generator");
  srandom((unsigned int)seed);
  return seed;
}

long zrand(long p){
  return(((long)random())%p);
}

void shuffle(long n,long *idx){
  long i,j,k;
  for(i=0;i<n;i++){
    k=zrand(n);
    j=*(idx+i);
    *(idx+i)=*(idx+k);
    *(idx+k)=j;
  }
}

void dshuffle(long n,long *idx){
  long i,j,k;
  for(i=n-1;i>0;i--){
    k=zrand(i+1);
    j=idx[i];
    idx[i]=idx[k];
    idx[k]=j;
  }
}

long clnga(int x, char **v, const char *c, const char *name){
  int i,flag=1;
  long num;

  for(i=1;i<x-1;i++)
    if(strcmp(c,*(v+i))==0){
      flag=0;
      istringstream oss(*(v+i+1) );
      oss >> num;
      if(oss.fail()){
        cout << "Enter " << name << ":" << endl;
        cin >> num;
      }
    }
  if(flag==1){
    cout << "Enter " << name << ":" << endl;
    cin >> num;
    cin.get();
  }
  return(num);
}

long rnd(double p)
{
  return((long)floor(p+.5));
}

double cdbla(int x, char **v, const char *c, const char *name){
  int i,flag=1;
  double num;

  for(i=1;i<x-1;i++)
    if(strcmp(c,*(v+i))==0){
      flag=0;
      istringstream oss(*(v+i+1));
      oss >> num;
      if(oss.fail()){
        cout << "Enter " << name << ":" << endl;
        cin >> num;
      }
    }
  if(flag==1){
    cout << "Enter " << name << ":" << endl;
    cin >> num;
    cin.get();
  }
  return(num);
}

char *cstra(int x, char **v, const char *c, const char *name){
  int i;
  char cnam[max_str];

  for(i=1;i<x-1;i++){
    if(strcmp(c,*(v+i))==0){
      return(*(v+i+1));
    }
  }

  restart:
  cout << "Enter " << name << ":" << endl;
  cin.getline(cnam,max_str);
  if(i=cin.gcount()){
    char *pch=new char[i+1];
    strcpy(pch,cnam);
    return(pch);
  }
  else {
    cin.clear();
    goto restart;
  }
}

//Function to convert a long to a null terminated string.
void long_str(char *cnam,long n){
  ostringstream oss;
  oss << n;
  const string & str = oss.str();
  str.copy(cnam,20);
  cnam[str.length()]='\0';
}

//Function to convert a string with null termination
//to a long.
void str_long(char *cnam,long &n){
  istringstream(cnam) >> n;
}

//Function to convert first two char of string to an
//integer. Should be an ASCII null terminated string
int trac(const char *str){
  if(!(*str))return(0);
  else {
    return((int)(*(str+1))+128*((int)(*str)));
  }
}

}

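get_pathw() above resolves its data location by falling back through progressively more generic path files (path_<pfl>_<pex>.<ch>, then path_<pfl>_<pex>, then path_<pfl>, then path) and appends "<pfl>_<pex>.<ch>" to the first line of whichever file opens. A rough Python sketch of that lookup order (an illustration, not code from this upload):

from pathlib import Path

def get_pathw(pfl, pex, ch):
    # Most specific path file first, mirroring the fallback chain in runn.C.
    for name in (f"path_{pfl}_{pex}.{ch}", f"path_{pfl}_{pex}", f"path_{pfl}", "path"):
        if Path(name).is_file():
            prefix = Path(name).read_text(encoding="utf-8").splitlines()[0]
            return f"{prefix}{pfl}_{pex}.{ch}"
    raise SystemExit(f"Path file for type {pfl} does not exist!")
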
Library/runn.h
CHANGED
@@ -1,392 +1,392 @@
#ifndef RUNN_H
#define RUNN_H

#include <fstream>
#include <iostream>
#include <cctype>
#include <cstring>
#include <cstdlib>
using namespace std;
namespace iret {

const int word_cnt = 5000; //Maximum number of words in a document.
const int word_len = 1500; //Maximum word length.
const long max_str=1500; //Maximum string length.

int get_pathw(char *cn,const char *dfl,const char *dex,const char *a);
//Reads the path from a file "path_(*dfl)" and constructs the
//file name from as "(*dfl)_(*dex).(*a)". Cats path and file
//name and returns the full info in cn.
char *add_num(const char *ptr,long n,char *buf); //converts long to ascii
//and cats to end of string and returns pointer to new string
//that results. Does not change input string. The new string is
//held in buffer space and this is overwritten at each call.

int get_qflag();
//This function gets the value of the print flag pflag that is
//used to control output.
int mark(int,long,int,const char*);
//This function is used to print out information that indicates
//how a function is progressing. It is dependent on the value of
//pflag.
long gseed(int,char**,const char*);
//This function is called to allow the input of a seed value for
//the random number generator. It must be called in main or the
//arguments of main must be passed down to it if it is to allow
//command line entry. Otherwise the first argument may be set to
//zero and it may be used to enter the seed at run time from the
//console.
long clnga(int,char**,const char*,const char*);
//Allows a long to be entered from the console at run time if the
//first argument is set to zero. If the first two arguments are
//the arguments of main, then it allows command line entry with
//the flag that is the third argument and with a statement about
//the input that is the fourth argument.
double cdbla(int,char**,const char*,const char*);
char *cstra(int,char**,const char*,const char*);
long zrand(long);
//Produces a random long integer that is in the range [0,argument).
//Machinery of the random number generator.
void shuffle(long n,long *idx); //Randomly shuffles an array of longs.
void dshuffle(long n,long *idx); //Randomly shuffles an array of longs.
//Improved version suggested by Don Comeau
long rnd(double);
//Rounds off a double and returns the integer that results.

//Reads in a string including white space and ends the string
//just before the character a.
inline int get_string(char *cnam,ifstream &ifile,char a){
  char *pch = cnam;
  long j=1;

  start:
  if((*(pch++)=ifile.get())!=EOF){
    if(*(pch-1)==a){pch--;goto start;}
    while(((*(pch++)=ifile.get())!=a)&&(j<max_str))j++;
    if(j<max_str){
      *(--pch)='\0';
      return(j);
    }
    else return(0);
  }
  return(0);
}

inline int get_strinf(char *cnam,fstream &ifile,char a){
  char *pch = cnam;
  long j=1;
  if((*(pch++)=ifile.get())!=EOF){
    while(((*(pch++)=ifile.get())!=a)&&(j<max_str))j++;
    if(j<max_str){
      *(--pch)='\0';
      return(j);
    }
    else return(0);
  }
  return(0);
}

//Function to lower case a string.
inline void lower_case(char *cnam){
  int i=0;
  char ch;

  while((ch=cnam[i])!='\0'){
    cnam[i++]=tolower(ch);
  }
}

//Note that ordering functions beginning with sS or hS
//produce an order that is increasing with increasing
//index, while sR or hR produces the reverse order.

template <class X>
void sSort(const long ix, X *idx){
  long k, j, ir, i;
  X rra;

  if(ix<=1)return;

  k=(ix>>1);
  ir=ix-1;
  for(;;) {
    if(k>0) {
      rra=idx[--k];
    }
    else {
      rra=idx[ir];
      idx[ir] = idx[0];
      if(--ir ==0) {
        idx[0]=rra;
        return;
      }
    }
    i=k;
    j=((k+1)<<1)-1;
    while(j<=ir) {
      if(j<ir && (idx[j]<idx[j+1])) ++j;
      if(rra<idx[j]) {
        idx[i]=idx[j];
        j +=(i=j)+1;
      }
      else j=ir+1;
    }
    idx[i]=rra;
  }
}

template <class X>
void sRort(const long ix, X *idx){
  long k, j, ir, i;
  X rra;

  if(ix<=1)return;

  k=(ix>>1);
  ir=ix-1;
  for(;;) {
    if(k>0) {
      rra=idx[--k];
    }
    else {
      rra=idx[ir];
      idx[ir] = idx[0];
      if(--ir ==0) {
        idx[0]=rra;
        return;
      }
    }
    i=k;
    j=((k+1)<<1)-1;
    while(j<=ir) {
      if(j<ir && (idx[j]>idx[j+1])) ++j;
      if(rra>idx[j]) {
        idx[i]=idx[j];
        j +=(i=j)+1;
      }
      else j=ir+1;
    }
    idx[i]=rra;
  }
}

template <class X, class Y>
void hSort(const long n, X *ra, Y *rb) {
  long k, j, ir, i;
  X rra;
  Y rrb;

  if(n<=1)return;

  k=(n>>1);
  ir=n-1;
  for(;;) {
    if(k>0) {
      rra=ra[--k];
      rrb=rb[k];
    }
    else {
      rra=ra[ir];
      rrb=rb[ir];
      ra[ir] = ra[0];
      rb[ir] = rb[0];
      if(--ir ==0) {
        ra[0]=rra;
        rb[0]=rrb;
        return;
      }
    }
    i=k;
    j=((k+1)<<1)-1;
    while(j<=ir) {
      if(j<ir && ra[j] < ra[j+1]) ++j;
      if(rra<ra[j]) {
        ra[i]=ra[j];
        rb[i]=rb[j];
        j +=(i=j)+1;
      }
      else j=ir+1;
    }
    ra[i]=rra;
    rb[i]=rrb;
  }
}

template <class X, class Y, class Z>
void hSort(const long n, X *ra, Y *rb, Z *rc) {
  long k, j, ir, i;
  X rra;
  Y rrb;
  Z rrc;

  if(n<=1)return;

  k=(n>>1);
  ir=n-1;
  for(;;) {
    if(k>0) {
      rra=ra[--k];
      rrb=rb[k];
      rrc=rc[k];
    }
    else {
      rra=ra[ir];
      rrb=rb[ir];
      rrc=rc[ir];
      ra[ir] = ra[0];
      rb[ir] = rb[0];
      rc[ir] = rc[0];
      if(--ir ==0) {
        ra[0]=rra;
        rb[0]=rrb;
        rc[0]=rrc;
        return;
      }
    }
    i=k;
    j=((k+1)<<1)-1;
    while(j<=ir) {
      if(j<ir && ra[j] < ra[j+1]) ++j;
      if(rra<ra[j]) {
        ra[i]=ra[j];
        rb[i]=rb[j];
        rc[i]=rc[j];
        j +=(i=j)+1;
      }
      else j=ir+1;
    }
    ra[i]=rra;
    rb[i]=rrb;
    rc[i]=rrc;
  }
}

template <class X, class Y>
void hRort(const long n, X *ra, Y *rb) {
  long k, j, ir, i;
  X rra;
  Y rrb;

  if(n<=1)return;

  k=(n>>1);
  ir=n-1;
  for(;;) {
    if(k>0) {
      rra=ra[--k];
      rrb=rb[k];
    }
    else {
      rra=ra[ir];
      rrb=rb[ir];
      ra[ir] = ra[0];
      rb[ir] = rb[0];
      if(--ir ==0) {
        ra[0]=rra;
        rb[0]=rrb;
        return;
      }
    }
    i=k;
    j=((k+1)<<1)-1;
    while(j<=ir) {
      if(j<ir && ra[j] > ra[j+1]) ++j;
      if(rra>ra[j]) {
        ra[i]=ra[j];
        rb[i]=rb[j];
        j +=(i=j)+1;
      }
      else j=ir+1;
    }
    ra[i]=rra;
    rb[i]=rrb;
  }
}

template <class X, class Y, class Z>
void hRort(const long n, X *ra, Y *rb, Z *rc) {
  long k, j, ir, i;
  X rra;
  Y rrb;
  Z rrc;

  if(n<=1)return;

  k=(n>>1);
  ir=n-1;
  for(;;) {
    if(k>0) {
      rra=ra[--k];
      rrb=rb[k];
      rrc=rc[k];
    }
    else {
      rra=ra[ir];
      rrb=rb[ir];
      rrc=rc[ir];
      ra[ir] = ra[0];
      rb[ir] = rb[0];
      rc[ir] = rc[0];
      if(--ir ==0) {
        ra[0]=rra;
        rb[0]=rrb;
        rc[0]=rrc;
        return;
      }
    }
    i=k;
    j=((k+1)<<1)-1;
    while(j<=ir) {
      if(j<ir && ra[j] > ra[j+1]) ++j;
      if(rra>ra[j]) {
        ra[i]=ra[j];
        rb[i]=rb[j];
        rc[i]=rc[j];
        j +=(i=j)+1;
      }
      else j=ir+1;
    }
    ra[i]=rra;
    rb[i]=rrb;
    rc[i]=rrc;
  }
}


//Function to convert a long to a null terminated string.
void long_str(char *cnam,long n);

//Function to convert a string with null termination
//to a long.
void str_long(char *cnam,long &n);

//Function to convert first two char of string to an
//integer. Should be an ASCII null terminated string
int trac(const char *str);

template<typename Y,typename Z>
void xshuffle(Y n,Z *idx){ //Randomly shuffles an array of longs.
  Y i,k;
  Z u;
  for(i=n-1;i>0;i--){
    k=(Y)zrand((long)i+1);
    u=idx[i];
    idx[i]=idx[k];
    idx[k]=u;
  }
}

template<class Z>
void dxhuffle(long n,Z *idx){ //Randomly shuffles an array type Z*.
  long i,k;
  Z xx;
  for(i=n-1;i>0;i--){
    k=zrand(i+1);
    xx=idx[i];
    idx[i]=idx[k];
    idx[k]=xx;
  }
}

}
#endif

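runn.h declares two in-place shuffles: shuffle() swaps every slot with a uniformly random slot, which is subtly biased (the n^n equally likely swap sequences cannot map evenly onto n! permutations), while dshuffle(), the "improved version suggested by Don Comeau", is the Fisher-Yates algorithm and produces each permutation with equal probability. A small Python illustration of the two strategies (not code from this upload):

import random

def shuffle_like_runn(idx):
    # Mirrors shuffle(): k = zrand(n) for every index i (biased).
    n = len(idx)
    for i in range(n):
        k = random.randrange(n)
        idx[i], idx[k] = idx[k], idx[i]

def dshuffle_like_runn(idx):
    # Mirrors dshuffle(): k = zrand(i + 1), i counting down (Fisher-Yates, uniform).
    for i in range(len(idx) - 1, 0, -1):
        k = random.randrange(i + 1)
        idx[i], idx[k] = idx[k], idx[i]
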
gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab1.txt
CHANGED
The diff for this file is too large to render. See raw diff.

gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab_ori.txt
CHANGED
The diff for this file is too large to render. See raw diff.

requirements-py310.txt
CHANGED
@@ -1,7 +1,7 @@
tensorflow==2.8
transformers==4.37.2
stanza==1.4.0
spacy==3.2.4
bioc==2.0.post4
spacy==3.2.4
protobuf==3.20.1

requirements.txt
CHANGED
@@ -1,76 +1,76 @@
absl-py
astunparse
attrs
bioc
blis
cachetools
catalogue
certifi
charset-normalizer
click
cymem
emoji
filelock
gast
google-auth
google-auth-oauthlib
google-pasta
grpcio
h5py
huggingface-hub
idna
importlib-metadata
intervaltree
Jinja2
joblib
jsonlines
Keras-Preprocessing
langcodes
lxml
Markdown
MarkupSafe
murmurhash
numpy
oauthlib
opt-einsum
packaging
pathy
preshed
protobuf
pyasn1
pyasn1-modules
pydantic
pyparsing
PyYAML
regex
requests
requests-oauthlib
rsa
sacremoses
scipy
six
smart-open
sortedcontainers
spacy
spacy-legacy
spacy-loggers
srsly
stanza
tensorboard
tensorboard-data-server
tensorboard-plugin-wit
tensorflow
tensorflow-estimator
termcolor
thinc
tokenizers
torch
tqdm
transformers
typer
typing_extensions
urllib3
wasabi
Werkzeug
wrapt
zipp

run_batches.py
CHANGED
@@ -1,12 +1,19 @@
 import argparse
 import logging
+import os
 import shutil
 import subprocess
-import time
-from datetime import timedelta
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
+from tqdm.contrib.concurrent import process_map
+
+
+def batch(iterable, n=1):
+    l = len(iterable)
+    for ndx in range(0, l, n):
+        yield iterable[ndx : min(ndx + n, l)]
+
 
 def main():
     logging.basicConfig(level=logging.INFO)
@@ -14,7 +21,8 @@ def main():
     parser.add_argument("--mode", type=str, default="gnorm2", help="mode to run in (gnorm2, gnormplus)")
     parser.add_argument("input_dir", type=str, help="directory containing files to process")
     parser.add_argument("output_dir", type=str, help="directory to write processed files to")
-    parser.add_argument("--batch_size", type=int, default=
+    parser.add_argument("--batch_size", type=int, default=8)
+    parser.add_argument("--max_workers", type=int, default=os.cpu_count() - 4)
     args = parser.parse_args()
 
     input_dir = Path(args.input_dir)
@@ -32,51 +40,57 @@ def main():
 
     logging.info(f"Processing {len(input_files)} files")
 
-… (the 45 removed lines, 35-79 of the previous version, are not rendered in this view)
+    input_files = sorted(input_files, key=lambda file: (input_dir / file).stat().st_size)
+
+    input_files_batches = list(batch(list(input_files), args.batch_size))
+    process_map(
+        run_batch,
+        input_files_batches,
+        [input_dir] * len(input_files_batches),
+        [output_dir] * len(input_files_batches),
+        [args.mode] * len(input_files_batches),
+        max_workers=args.max_workers,
+        chunksize=1,
+    )
+
+
+def run_batch(input_files_batch, input_dir, output_dir, mode):
+    with TemporaryDirectory() as temp_dir_SR, TemporaryDirectory() as temp_dir_GNR, TemporaryDirectory() as temp_dir_SA, TemporaryDirectory() as input_temp_dir, TemporaryDirectory() as output_temp_dir:
+        input_temp_dir = Path(input_temp_dir)
+        output_temp_dir = Path(output_temp_dir)
+        for file in input_files_batch:
+            logging.info(f"cp {input_dir / file} {input_temp_dir}")
+            shutil.copy(input_dir / file, input_temp_dir)
+
+        if mode == "gnorm2":
+            command_SR = (
+                f"java -Xmx32G -Xms16G -jar GNormPlus.jar {str(input_temp_dir)} {str(temp_dir_SR)} setup.SR.txt"
+            )
+            command_GNR_SA = f"python GeneNER_SpeAss_run.py -i {str(temp_dir_SR)} -r {str(temp_dir_GNR)} -a {str(temp_dir_SA)} -n gnorm_trained_models/geneNER/GeneNER-Bioformer.h5 -s gnorm_trained_models/SpeAss/SpeAss-Bioformer.h5"
+            command_GN = (
+                f"java -Xmx32G -Xms16G -jar GNormPlus.jar {str(temp_dir_SA)} {str(output_temp_dir)} setup.GN.txt"
+            )
+            commands = [command_SR, command_GNR_SA, command_GN]
+        elif mode == "gnormplus":
+            commands = [
+                f"java -Xmx32G -Xms16G -jar GNormPlus.jar {str(input_temp_dir)} {str(output_temp_dir)} setup.txt"
+            ]
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+        for command in commands:
+            try:
+                logging.info(command)
+                subprocess.run([command], check=True, shell=True)
+            except subprocess.CalledProcessError as e:
+                logging.exception(f"Error running command: {command}")
+                raise e
+
+        output_paths = list(output_temp_dir.rglob("*"))
+        for output_path in output_paths:
+            logging.info(f"cp {output_path} {output_dir}")
+            shutil.copy(output_path, output_dir)
+            output_file = output_path.name
 
 
 if __name__ == "__main__":

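The rewritten run_batches.py sorts the inputs by file size so workers receive comparably sized batches, then fans the batches out through tqdm's process_map with chunksize=1 (one batch per task, so the progress bar advances batch by batch). For reference, a quick demonstration of what the batch() helper yields (illustration only):

def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx : min(ndx + n, l)]

files = ["a.xml", "b.xml", "c.xml", "d.xml", "e.xml"]
print(list(batch(files, 2)))  # [['a.xml', 'b.xml'], ['c.xml', 'd.xml'], ['e.xml']]
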
src_Java/GNormPluslib/BioCDoc.java
CHANGED
@@ -1,1344 +1,1344 @@
/**
 * Project: GNormPlus
 * Function: Data storage in BioC format
 */

package GNormPluslib;

import bioc.BioCAnnotation;
import bioc.BioCCollection;
import bioc.BioCDocument;
import bioc.BioCLocation;
import bioc.BioCPassage;

import bioc.io.BioCDocumentWriter;
import bioc.io.BioCFactory;
import bioc.io.woodstox.ConnectorWoodstox;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.time.LocalDate;
import java.time.ZoneId;

import javax.xml.stream.XMLStreamException;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class BioCDoc
{
	/*
	 * Contexts in BioC file
	 */
	public ArrayList<String> PMIDs=new ArrayList<String>(); // Type: PMIDs
	public ArrayList<ArrayList<String>> PassageNames = new ArrayList(); // PassageName
	public ArrayList<ArrayList<Integer>> PassageOffsets = new ArrayList(); // PassageOffset
	public ArrayList<ArrayList<String>> PassageContexts = new ArrayList(); // PassageContext
	public ArrayList<ArrayList<ArrayList<String>>> Annotations = new ArrayList(); // Annotation - GNormPlus

	public String BioCFormatCheck(String InputFile) throws IOException
	{

		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		try
		{
			collection = connector.startRead(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
		}
		catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e)
		{
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
			String line="";
			String status="";
			String Pmid = "";
			boolean tiabs=false;
			Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$");
			while ((line = br.readLine()) != null)
			{
				Matcher mat = patt.matcher(line);
				if(mat.find()) //Title|Abstract
				{
					if(Pmid.equals(""))
					{
						Pmid = mat.group(1);
					}
					else if(!Pmid.equals(mat.group(1)))
					{
						return "[Error]: "+InputFile+" - A blank is needed between "+Pmid+" and "+mat.group(1)+".";
					}
					status = "tiabs";
					tiabs = true;
				}
				else if (line.contains("\t")) //Annotation
				{
				}
				else if(line.length()==0) //Processing
				{
					if(status.equals(""))
					{
						if(Pmid.equals(""))
						{
							return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format. PMID is empty.";
						}
						else
						{
							return "[Error]: "+InputFile+" - A redundant blank is after "+Pmid+".";
						}
					}
					Pmid="";
					status="";
				}
			}
			br.close();
			if(tiabs == false)
			{
				return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format.";
			}
			if(status.equals(""))
			{
				return "PubTator";
			}
			else
			{
				return "[Error]: "+InputFile+" - The last column missed a blank.";
			}
		}
		return "BioC";
	}
	public void PubTator2BioC(String input,String output) throws IOException, XMLStreamException // Input
	{
		/*
		 * PubTator2BioC
		 */
		String parser = BioCFactory.WOODSTOX;
		BioCFactory factory = BioCFactory.newFactory(parser);
		BioCDocumentWriter BioCOutputFormat = factory.createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		BioCCollection biocCollection = new BioCCollection();

		//time
		ZoneId zonedId = ZoneId.of( "America/Montreal" );
		LocalDate today = LocalDate.now( zonedId );
		biocCollection.setDate(today.toString());

		biocCollection.setKey("BioC.key");//key
		biocCollection.setSource("GNormPlus");//source

		BioCOutputFormat.writeCollectionInfo(biocCollection);
		BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
		ArrayList<String> ParagraphType=new ArrayList<String>(); // Type: Title|Abstract
		ArrayList<String> ParagraphContent = new ArrayList<String>(); // Text
		ArrayList<String> annotations = new ArrayList<String>(); // Annotation
		String line;
		String Pmid="";
		while ((line = inputfile.readLine()) != null)
		{
			if(line.contains("|") && !line.contains("\t")) //Title|Abstract
			{
				String str[]=line.split("\\|",-1);
				Pmid=str[0];
				if(str[1].equals("t"))
				{
					str[1]="title";
				}
				if(str[1].equals("a"))
				{
					str[1]="abstract";
				}
				ParagraphType.add(str[1]);
				if(str.length==3)
				{
					String txt = str[2];
					txt = txt.replaceAll("ω","w");
					txt = txt.replaceAll("μ","u");
					txt = txt.replaceAll("κ","k");
					txt = txt.replaceAll("α","a");
					txt = txt.replaceAll("γ","g");
					txt = txt.replaceAll("ɣ","g");
					txt = txt.replaceAll("β","b");
					txt = txt.replaceAll("×","x");
					txt = txt.replaceAll("‑","-");
					txt = txt.replaceAll("¹","1");
					txt = txt.replaceAll("²","2");
					txt = txt.replaceAll("°","o");
					txt = txt.replaceAll("ö","o");
					txt = txt.replaceAll("é","e");
					txt = txt.replaceAll("à","a");
					txt = txt.replaceAll("Á","A");
					txt = txt.replaceAll("ε","e");
					txt = txt.replaceAll("θ","O");
					txt = txt.replaceAll("•",".");
					txt = txt.replaceAll("µ","u");
					txt = txt.replaceAll("λ","r");
					txt = txt.replaceAll("⁺","+");
					txt = txt.replaceAll("ν","v");
					txt = txt.replaceAll("ï","i");
					txt = txt.replaceAll("ã","a");
					txt = txt.replaceAll("≡","=");
					txt = txt.replaceAll("ó","o");
					txt = txt.replaceAll("³","3");
					txt = txt.replaceAll("〖","[");
					txt = txt.replaceAll("〗","]");
					txt = txt.replaceAll("Å","A");
					txt = txt.replaceAll("ρ","p");
					txt = txt.replaceAll("ü","u");
					txt = txt.replaceAll("ɛ","e");
					txt = txt.replaceAll("č","c");
					txt = txt.replaceAll("š","s");
					txt = txt.replaceAll("ß","b");
					txt = txt.replaceAll("═","=");
					txt = txt.replaceAll("£","L");
					txt = txt.replaceAll("Ł","L");
					txt = txt.replaceAll("ƒ","f");
					txt = txt.replaceAll("ä","a");
					txt = txt.replaceAll("–","-");
					txt = txt.replaceAll("⁻","-");
					txt = txt.replaceAll("〈","<");
					txt = txt.replaceAll("〉",">");
					txt = txt.replaceAll("χ","X");
					txt = txt.replaceAll("Đ","D");
					txt = txt.replaceAll("‰","%");
					txt = txt.replaceAll("·",".");
					txt = txt.replaceAll("→",">");
					txt = txt.replaceAll("←","<");
					txt = txt.replaceAll("ζ","z");
					txt = txt.replaceAll("π","p");
					txt = txt.replaceAll("τ","t");
					txt = txt.replaceAll("ξ","X");
					txt = txt.replaceAll("η","h");
					txt = txt.replaceAll("ø","0");
					txt = txt.replaceAll("Δ","D");
					txt = txt.replaceAll("∆","D");
					txt = txt.replaceAll("∑","S");
					txt = txt.replaceAll("Ω","O");
					txt = txt.replaceAll("δ","d");
					txt = txt.replaceAll("σ","s");
					txt = txt.replaceAll("Φ","F");
					txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
					ParagraphContent.add(txt);
				}
				else
				{
					ParagraphContent.add("- No text -");
				}
			}
			else if (line.contains("\t")) //Annotation
			{
				String anno[]=line.split("\t");
				if(anno.length==6)
				{
					annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]+"\t"+anno[5]);
				}
				else if(anno.length==5)
				{
					annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]);
				}
			}
			else if(line.length()==0) //Processing
			{
				BioCDocument biocDocument = new BioCDocument();
				biocDocument.setID(Pmid);
				int startoffset=0;
				for(int i=0;i<ParagraphType.size();i++)
				{
					BioCPassage biocPassage = new BioCPassage();
					Map<String, String> Infons = new HashMap<String, String>();
					Infons.put("type", ParagraphType.get(i));
					biocPassage.setInfons(Infons);
					biocPassage.setText(ParagraphContent.get(i));
					biocPassage.setOffset(startoffset);
					startoffset=startoffset+ParagraphContent.get(i).length()+1;
					for(int j=0;j<annotations.size();j++)
					{
						String anno[]=annotations.get(j).split("\t");
						if(Integer.parseInt(anno[0])<startoffset && Integer.parseInt(anno[0])>=startoffset-ParagraphContent.get(i).length()-1)
						{
							BioCAnnotation biocAnnotation = new BioCAnnotation();
							Map<String, String> AnnoInfons = new HashMap<String, String>();
							if(anno.length==5)
							{
								AnnoInfons.put("Identifier", anno[4]);
							}
							AnnoInfons.put("type", anno[3]);
							biocAnnotation.setInfons(AnnoInfons);
							BioCLocation location = new BioCLocation();
							location.setOffset(Integer.parseInt(anno[0]));
							location.setLength(Integer.parseInt(anno[1])-Integer.parseInt(anno[0]));
							biocAnnotation.setLocation(location);
							biocAnnotation.setText(anno[2]);
							biocPassage.addAnnotation(biocAnnotation);
						}
					}
					biocDocument.addPassage(biocPassage);
				}
				biocCollection.addDocument(biocDocument);
				ParagraphType.clear();
				ParagraphContent.clear();
				annotations.clear();
				BioCOutputFormat.writeDocument(biocDocument);
			}
		}
		BioCOutputFormat.close();
		inputfile.close();
	}
	public void BioC2PubTator(String input,String output) throws IOException, XMLStreamException //Output
	{
		/*
		 * BioC2PubTator
		 */
		HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
		boolean duplicate = false;
		BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
		while (connector.hasNext())
		{
			BioCDocument document = connector.next();
			String PMID = document.getID();
			if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
			else{pmidlist.put(PMID,"");}
			String Anno="";
			for (BioCPassage passage : document.getPassages())
			{
				if(passage.getInfon("type").equals("title"))
				{
					PubTatorOutputFormat.write(PMID+"|t|"+passage.getText()+"\n");
				}
				else if(passage.getInfon("type").equals("abstract"))
				{
					PubTatorOutputFormat.write(PMID+"|a|"+passage.getText()+"\n");
				}
				else
				{
					PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
				}

				for (BioCAnnotation annotation : passage.getAnnotations())
				{
					String Annotype = annotation.getInfon("type");
					String Annoid="";
					String Proteinid="";
					if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
					{
						if(annotation.getInfons().containsKey("NCBI Gene"))
						{
							Annoid = annotation.getInfon("NCBI Gene");
							String Annoidlist[]=Annoid.split(";");
							Annoid="";
							for(int x=0;x<Annoidlist.length;x++)
							{
								//Normalization2Protein
								String proteinid="";
								String homoid="";

								if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
								{
									proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
								}
								if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
								{
									homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
								}

								if((!proteinid.equals("")) || (!homoid.equals("")))
								{
									if(Annoid.equals(""))
									{
										Annoid=Annoidlist[x]+"(";
										if(!proteinid.equals(""))
										{
											Annoid=Annoid+"UniProt:"+proteinid;
										}
										if(!homoid.equals(""))
										{
											if(!proteinid.equals(""))
											{
												Annoid=Annoid+";";
											}
											Annoid=Annoid+"Homoid:"+homoid;
										}
										Annoid=Annoid+")";
									}
									else
									{
										Annoid=Annoid+";"+Annoidlist[x]+"(";
										if(!proteinid.equals(""))
										{
											Annoid=Annoid+"UniProt:"+proteinid;
										}
										if(!homoid.equals(""))
										{
											if(!proteinid.equals(""))
											{
												Annoid=Annoid+";";
											}
											Annoid=Annoid+"Homoid:"+homoid;
										}
										Annoid=Annoid+")";
									}
								}
								else
								{
									if(Annoid.equals(""))
									{
										Annoid=Annoidlist[x];
									}
									else
									{
										Annoid=Annoid+";"+Annoidlist[x];
									}
								}
							}
						}
						//else if(annotation.getInfons().containsKey("NCBI Homologene"))
						//{
						//	Annoid = annotation.getInfon("NCBI Homologene");
						//}
						//else if(!annotation.getInfons().containsKey("FocusSpecies"))
						//{
						//	Annoid = annotation.getInfon("FocusSpecies");
						//}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
					{
						if(annotation.getInfons().containsKey("NCBI Taxonomy"))
						{
							Annoid = annotation.getInfon("NCBI Taxonomy");
						}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else if(Annotype.equals("CellLine"))
					{
						if(annotation.getInfons().containsKey("NCBI Taxonomy"))
						{
							Annoid = annotation.getInfon("NCBI Taxonomy");
						}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else
					{
						Annoid = annotation.getInfon("Identifier");
					}
					int start = annotation.getLocations().get(0).getOffset();
					int last = start + annotation.getLocations().get(0).getLength();
					String AnnoMention=annotation.getText();
					if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
					{
						Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
					}
					else
					{
						Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
					}
				}
			}
			PubTatorOutputFormat.write(Anno+"\n");
		}
		PubTatorOutputFormat.close();
		if(duplicate == true){System.exit(0);}
	}
	public void BioC2PubTator(String original_input,String input,String output) throws IOException, XMLStreamException //Output
	{
		/* original tiabs*/
		BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(original_input), "UTF-8"));
		HashMap<String,String> ParagraphContent = new HashMap<String,String>(); // [PMID,0] -> title
		HashMap<String,String> annotations = new HashMap<String,String>(); // PMID ->Annotation
		String line;
		String Pmid="";
		int count_paragraph=0;
		while ((line = inputfile.readLine()) != null)
		{
			if(line.contains("|") && !line.contains("\t")) //Title|Abstract
			{
				String str[]=line.split("\\|",-1);
				Pmid=str[0];
				ParagraphContent.put(Pmid+"\t"+str[1],str[2]);
				count_paragraph++;
			}
			else if (line.contains("\t")) //Annotation
			{
				annotations.put(Pmid, annotations.get(Pmid)+line);
			}
			else if(line.length()==0) //Processing
			{
				count_paragraph=0;
			}
		}
		inputfile.close();

		/*
		 * BioC2PubTator
		 */
		HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
		boolean duplicate = false;
		BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
		while (connector.hasNext())
		{
			BioCDocument document = connector.next();
			String PMID = document.getID();
			if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
			else{pmidlist.put(PMID,"");}
			String Anno="";
			for (BioCPassage passage : document.getPassages())
			{
				if(passage.getInfon("type").equals("title") || passage.getInfon("type").equals("t"))
				{
					PubTatorOutputFormat.write(PMID+"|t|"+ParagraphContent.get(PMID+"\tt")+"\n");
				}
				else if(passage.getInfon("type").equals("abstract") || passage.getInfon("type").equals("a"))
				{
					PubTatorOutputFormat.write(PMID+"|a|"+ParagraphContent.get(PMID+"\ta")+"\n");
				}
				else
				{
					PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
				}

				for (BioCAnnotation annotation : passage.getAnnotations())
				{
					String Annotype = annotation.getInfon("type");
					String Annoid="";
					String Proteinid="";
					if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
					{
						if(annotation.getInfons().containsKey("NCBI Gene"))
						{
							Annoid = annotation.getInfon("NCBI Gene");
							String Annoidlist[]=Annoid.split(";");
							Annoid="";
							for(int x=0;x<Annoidlist.length;x++)
							{
								//Normalization2Protein
								String proteinid="";
								String homoid="";

								if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
								{
									proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
								}
								if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
								{
									homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
|
548 |
-
|
549 |
-
if((!proteinid.equals("")) || (!homoid.equals("")))
|
550 |
-
{
|
551 |
-
if(Annoid.equals(""))
|
552 |
-
{
|
553 |
-
Annoid=Annoidlist[x]+"(";
|
554 |
-
if(!proteinid.equals(""))
|
555 |
-
{
|
556 |
-
Annoid=Annoid+"UniProt:"+proteinid;
|
557 |
-
}
|
558 |
-
if(!homoid.equals(""))
|
559 |
-
{
|
560 |
-
if(!proteinid.equals(""))
|
561 |
-
{
|
562 |
-
Annoid=Annoid+";";
|
563 |
-
}
|
564 |
-
Annoid=Annoid+"Homoid:"+homoid;
|
565 |
-
}
|
566 |
-
Annoid=Annoid+")";
|
567 |
-
}
|
568 |
-
else
|
569 |
-
{
|
570 |
-
Annoid=Annoid+";"+Annoidlist[x]+"(";
|
571 |
-
if(!proteinid.equals(""))
|
572 |
-
{
|
573 |
-
Annoid=Annoid+"UniProt:"+proteinid;
|
574 |
-
}
|
575 |
-
if(!homoid.equals(""))
|
576 |
-
{
|
577 |
-
if(!proteinid.equals(""))
|
578 |
-
{
|
579 |
-
Annoid=Annoid+";";
|
580 |
-
}
|
581 |
-
Annoid=Annoid+"Homoid:"+homoid;
|
582 |
-
}
|
583 |
-
Annoid=Annoid+")";
|
584 |
-
}
|
585 |
-
}
|
586 |
-
else
|
587 |
-
{
|
588 |
-
if(Annoid.equals(""))
|
589 |
-
{
|
590 |
-
Annoid=Annoidlist[x];
|
591 |
-
}
|
592 |
-
else
|
593 |
-
{
|
594 |
-
Annoid=Annoid+";"+Annoidlist[x];
|
595 |
-
}
|
596 |
-
}
|
597 |
-
}
|
598 |
-
}
|
599 |
-
//else if(annotation.getInfons().containsKey("NCBI Homologene"))
|
600 |
-
//{
|
601 |
-
// Annoid = annotation.getInfon("NCBI Homologene");
|
602 |
-
//}
|
603 |
-
//else if(annotation.getInfons().containsKey("FocusSpecies"))
|
604 |
-
//{
|
605 |
-
// Annoid = annotation.getInfon("FocusSpecies");
|
606 |
-
//}
|
607 |
-
else
|
608 |
-
{
|
609 |
-
Annoid = annotation.getInfon("Identifier");
|
610 |
-
}
|
611 |
-
}
|
612 |
-
else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
|
613 |
-
{
|
614 |
-
if(annotation.getInfons().containsKey("NCBI Taxonomy"))
|
615 |
-
{
|
616 |
-
Annoid = annotation.getInfon("NCBI Taxonomy");
|
617 |
-
}
|
618 |
-
else
|
619 |
-
{
|
620 |
-
Annoid = annotation.getInfon("Identifier");
|
621 |
-
}
|
622 |
-
}
|
623 |
-
else if(Annotype.equals("CellLine"))
|
624 |
-
{
|
625 |
-
if(annotation.getInfons().containsKey("NCBI Taxonomy"))
|
626 |
-
{
|
627 |
-
Annoid = annotation.getInfon("NCBI Taxonomy");
|
628 |
-
}
|
629 |
-
else
|
630 |
-
{
|
631 |
-
Annoid = annotation.getInfon("Identifier");
|
632 |
-
}
|
633 |
-
}
|
634 |
-
else
|
635 |
-
{
|
636 |
-
if(annotation.getInfons().containsKey("Identifier"))
|
637 |
-
{
|
638 |
-
Annoid = annotation.getInfon("Identifier");
|
639 |
-
}
|
640 |
-
else
|
641 |
-
{
|
642 |
-
Annoid = "";
|
643 |
-
}
|
644 |
-
}
|
645 |
-
int start = annotation.getLocations().get(0).getOffset();
|
646 |
-
int last = start + annotation.getLocations().get(0).getLength();
|
647 |
-
String AnnoMention=annotation.getText();
|
648 |
-
if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
|
649 |
-
{
|
650 |
-
Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
|
651 |
-
}
|
652 |
-
else
|
653 |
-
{
|
654 |
-
Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
|
655 |
-
}
|
656 |
-
}
|
657 |
-
}
|
658 |
-
PubTatorOutputFormat.write(Anno+"\n");
|
659 |
-
}
|
660 |
-
PubTatorOutputFormat.close();
|
661 |
-
if(duplicate == true){System.exit(0);}
|
662 |
-
}
|
663 |
-
public void BioCReader(String input) throws IOException, XMLStreamException
|
664 |
-
{
|
665 |
-
ConnectorWoodstox connector = new ConnectorWoodstox();
|
666 |
-
BioCCollection collection = new BioCCollection();
|
667 |
-
collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
668 |
-
|
669 |
-
/*
|
670 |
-
* Per document
|
671 |
-
*/
|
672 |
-
while (connector.hasNext())
|
673 |
-
{
|
674 |
-
BioCDocument document = connector.next();
|
675 |
-
PMIDs.add(document.getID());
|
676 |
-
|
677 |
-
ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
|
678 |
-
ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
|
679 |
-
ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
|
680 |
-
ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
|
681 |
-
|
682 |
-
/*
|
683 |
-
* Per Passage
|
684 |
-
*/
|
685 |
-
for (BioCPassage passage : document.getPassages())
|
686 |
-
{
|
687 |
-
PassageName.add(passage.getInfon("type")); //Paragraph
|
688 |
-
String txt = passage.getText();
|
689 |
-
if(txt.matches("[\t ]+"))
|
690 |
-
{
|
691 |
-
txt = txt.replaceAll(".","@");
|
692 |
-
}
|
693 |
-
else
|
694 |
-
{
|
695 |
-
//if(passage.getInfon("type").toLowerCase().equals("table"))
|
696 |
-
//{
|
697 |
-
// txt=txt.replaceAll(" ", "|");
|
698 |
-
//}
|
699 |
-
txt = txt.replaceAll("ω","w");
|
700 |
-
txt = txt.replaceAll("μ","u");
|
701 |
-
txt = txt.replaceAll("κ","k");
|
702 |
-
txt = txt.replaceAll("α","a");
|
703 |
-
txt = txt.replaceAll("γ","g");
|
704 |
-
txt = txt.replaceAll("ɣ","g");
|
705 |
-
txt = txt.replaceAll("β","b");
|
706 |
-
txt = txt.replaceAll("×","x");
|
707 |
-
txt = txt.replaceAll("‑","-");
|
708 |
-
txt = txt.replaceAll("¹","1");
|
709 |
-
txt = txt.replaceAll("²","2");
|
710 |
-
txt = txt.replaceAll("°","o");
|
711 |
-
txt = txt.replaceAll("ö","o");
|
712 |
-
txt = txt.replaceAll("é","e");
|
713 |
-
txt = txt.replaceAll("à","a");
|
714 |
-
txt = txt.replaceAll("Á","A");
|
715 |
-
txt = txt.replaceAll("ε","e");
|
716 |
-
txt = txt.replaceAll("θ","O");
|
717 |
-
txt = txt.replaceAll("•",".");
|
718 |
-
txt = txt.replaceAll("µ","u");
|
719 |
-
txt = txt.replaceAll("λ","r");
|
720 |
-
txt = txt.replaceAll("⁺","+");
|
721 |
-
txt = txt.replaceAll("ν","v");
|
722 |
-
txt = txt.replaceAll("ï","i");
|
723 |
-
txt = txt.replaceAll("ã","a");
|
724 |
-
txt = txt.replaceAll("≡","=");
|
725 |
-
txt = txt.replaceAll("ó","o");
|
726 |
-
txt = txt.replaceAll("³","3");
|
727 |
-
txt = txt.replaceAll("〖","[");
|
728 |
-
txt = txt.replaceAll("〗","]");
|
729 |
-
txt = txt.replaceAll("Å","A");
|
730 |
-
txt = txt.replaceAll("ρ","p");
|
731 |
-
txt = txt.replaceAll("ü","u");
|
732 |
-
txt = txt.replaceAll("ɛ","e");
|
733 |
-
txt = txt.replaceAll("č","c");
|
734 |
-
txt = txt.replaceAll("š","s");
|
735 |
-
txt = txt.replaceAll("ß","b");
|
736 |
-
txt = txt.replaceAll("═","=");
|
737 |
-
txt = txt.replaceAll("£","L");
|
738 |
-
txt = txt.replaceAll("Ł","L");
|
739 |
-
txt = txt.replaceAll("ƒ","f");
|
740 |
-
txt = txt.replaceAll("ä","a");
|
741 |
-
txt = txt.replaceAll("–","-");
|
742 |
-
txt = txt.replaceAll("⁻","-");
|
743 |
-
txt = txt.replaceAll("〈","<");
|
744 |
-
txt = txt.replaceAll("〉",">");
|
745 |
-
txt = txt.replaceAll("χ","X");
|
746 |
-
txt = txt.replaceAll("Đ","D");
|
747 |
-
txt = txt.replaceAll("‰","%");
|
748 |
-
txt = txt.replaceAll("·",".");
|
749 |
-
txt = txt.replaceAll("→",">");
|
750 |
-
txt = txt.replaceAll("←","<");
|
751 |
-
txt = txt.replaceAll("ζ","z");
|
752 |
-
txt = txt.replaceAll("π","p");
|
753 |
-
txt = txt.replaceAll("τ","t");
|
754 |
-
txt = txt.replaceAll("ξ","X");
|
755 |
-
txt = txt.replaceAll("η","h");
|
756 |
-
txt = txt.replaceAll("ø","0");
|
757 |
-
txt = txt.replaceAll("Δ","D");
|
758 |
-
txt = txt.replaceAll("∆","D");
|
759 |
-
txt = txt.replaceAll("∑","S");
|
760 |
-
txt = txt.replaceAll("Ω","O");
|
761 |
-
txt = txt.replaceAll("δ","d");
|
762 |
-
txt = txt.replaceAll("σ","s");
|
763 |
-
txt = txt.replaceAll("Φ","F");
|
764 |
-
//txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
|
765 |
-
}
|
766 |
-
if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
|
767 |
-
{
|
768 |
-
PassageContext.add("-notext-"); //Context
|
769 |
-
}
|
770 |
-
else
|
771 |
-
{
|
772 |
-
PassageContext.add(txt); //Context
|
773 |
-
}
|
774 |
-
PassageOffset.add(passage.getOffset()); //Offset
|
775 |
-
ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
|
776 |
-
AnnotationInPMID.add(AnnotationInPassage);
|
777 |
-
}
|
778 |
-
PassageNames.add(PassageName);
|
779 |
-
PassageContexts.add(PassageContext);
|
780 |
-
PassageOffsets.add(PassageOffset);
|
781 |
-
Annotations.add(AnnotationInPMID);
|
782 |
-
}
|
783 |
-
}
|
784 |
-
public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException
|
785 |
-
{
|
786 |
-
ConnectorWoodstox connector = new ConnectorWoodstox();
|
787 |
-
BioCCollection collection = new BioCCollection();
|
788 |
-
collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
789 |
-
|
790 |
-
/*
|
791 |
-
* Per document
|
792 |
-
*/
|
793 |
-
while (connector.hasNext())
|
794 |
-
{
|
795 |
-
BioCDocument document = connector.next();
|
796 |
-
PMIDs.add(document.getID());
|
797 |
-
|
798 |
-
ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
|
799 |
-
ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
|
800 |
-
ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
|
801 |
-
ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
|
802 |
-
|
803 |
-
/*
|
804 |
-
* Per Passage
|
805 |
-
*/
|
806 |
-
for (BioCPassage passage : document.getPassages())
|
807 |
-
{
|
808 |
-
PassageName.add(passage.getInfon("type")); //Paragraph
|
809 |
-
|
810 |
-
String txt = passage.getText();
|
811 |
-
if(txt.matches("[\t ]+"))
|
812 |
-
{
|
813 |
-
txt = txt.replaceAll(".","@");
|
814 |
-
}
|
815 |
-
else
|
816 |
-
{
|
817 |
-
//if(passage.getInfon("type").toLowerCase().equals("table"))
|
818 |
-
//{
|
819 |
-
// txt=txt.replaceAll(" ", "|");
|
820 |
-
//}
|
821 |
-
txt = txt.replaceAll("ω","w");
|
822 |
-
txt = txt.replaceAll("μ","u");
|
823 |
-
txt = txt.replaceAll("κ","k");
|
824 |
-
txt = txt.replaceAll("α","a");
|
825 |
-
txt = txt.replaceAll("γ","g");
|
826 |
-
txt = txt.replaceAll("ɣ","g");
|
827 |
-
txt = txt.replaceAll("β","b");
|
828 |
-
txt = txt.replaceAll("×","x");
|
829 |
-
txt = txt.replaceAll("‑","-");
|
830 |
-
txt = txt.replaceAll("¹","1");
|
831 |
-
txt = txt.replaceAll("²","2");
|
832 |
-
txt = txt.replaceAll("°","o");
|
833 |
-
txt = txt.replaceAll("ö","o");
|
834 |
-
txt = txt.replaceAll("é","e");
|
835 |
-
txt = txt.replaceAll("à","a");
|
836 |
-
txt = txt.replaceAll("Á","A");
|
837 |
-
txt = txt.replaceAll("ε","e");
|
838 |
-
txt = txt.replaceAll("θ","O");
|
839 |
-
txt = txt.replaceAll("•",".");
|
840 |
-
txt = txt.replaceAll("µ","u");
|
841 |
-
txt = txt.replaceAll("λ","r");
|
842 |
-
txt = txt.replaceAll("⁺","+");
|
843 |
-
txt = txt.replaceAll("ν","v");
|
844 |
-
txt = txt.replaceAll("ï","i");
|
845 |
-
txt = txt.replaceAll("ã","a");
|
846 |
-
txt = txt.replaceAll("≡","=");
|
847 |
-
txt = txt.replaceAll("ó","o");
|
848 |
-
txt = txt.replaceAll("³","3");
|
849 |
-
txt = txt.replaceAll("〖","[");
|
850 |
-
txt = txt.replaceAll("〗","]");
|
851 |
-
txt = txt.replaceAll("Å","A");
|
852 |
-
txt = txt.replaceAll("ρ","p");
|
853 |
-
txt = txt.replaceAll("ü","u");
|
854 |
-
txt = txt.replaceAll("ɛ","e");
|
855 |
-
txt = txt.replaceAll("č","c");
|
856 |
-
txt = txt.replaceAll("š","s");
|
857 |
-
txt = txt.replaceAll("ß","b");
|
858 |
-
txt = txt.replaceAll("═","=");
|
859 |
-
txt = txt.replaceAll("£","L");
|
860 |
-
txt = txt.replaceAll("Ł","L");
|
861 |
-
txt = txt.replaceAll("ƒ","f");
|
862 |
-
txt = txt.replaceAll("ä","a");
|
863 |
-
txt = txt.replaceAll("–","-");
|
864 |
-
txt = txt.replaceAll("⁻","-");
|
865 |
-
txt = txt.replaceAll("〈","<");
|
866 |
-
txt = txt.replaceAll("〉",">");
|
867 |
-
txt = txt.replaceAll("χ","X");
|
868 |
-
txt = txt.replaceAll("Đ","D");
|
869 |
-
txt = txt.replaceAll("‰","%");
|
870 |
-
txt = txt.replaceAll("·",".");
|
871 |
-
txt = txt.replaceAll("→",">");
|
872 |
-
txt = txt.replaceAll("←","<");
|
873 |
-
txt = txt.replaceAll("ζ","z");
|
874 |
-
txt = txt.replaceAll("π","p");
|
875 |
-
txt = txt.replaceAll("τ","t");
|
876 |
-
txt = txt.replaceAll("ξ","X");
|
877 |
-
txt = txt.replaceAll("η","h");
|
878 |
-
txt = txt.replaceAll("ø","0");
|
879 |
-
txt = txt.replaceAll("Δ","D");
|
880 |
-
txt = txt.replaceAll("∆","D");
|
881 |
-
txt = txt.replaceAll("∑","S");
|
882 |
-
txt = txt.replaceAll("Ω","O");
|
883 |
-
txt = txt.replaceAll("δ","d");
|
884 |
-
txt = txt.replaceAll("σ","s");
|
885 |
-
txt = txt.replaceAll("Φ","F");
|
886 |
-
//txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
|
887 |
-
}
|
888 |
-
if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
|
889 |
-
{
|
890 |
-
PassageContext.add("-notext-"); //Context
|
891 |
-
}
|
892 |
-
else
|
893 |
-
{
|
894 |
-
PassageContext.add(txt); //Context
|
895 |
-
}
|
896 |
-
PassageOffset.add(passage.getOffset()); //Offset
|
897 |
-
ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
|
898 |
-
|
899 |
-
/*
|
900 |
-
* Per Annotation :
|
901 |
-
* start
|
902 |
-
* last
|
903 |
-
* mention
|
904 |
-
* type
|
905 |
-
* id
|
906 |
-
*/
|
907 |
-
for (BioCAnnotation Anno : passage.getAnnotations())
|
908 |
-
{
|
909 |
-
int start = Anno.getLocations().get(0).getOffset()-passage.getOffset(); // start
|
910 |
-
int last = start + Anno.getLocations().get(0).getLength(); // last
|
911 |
-
String AnnoMention=Anno.getText(); // mention
|
912 |
-
String Annotype = Anno.getInfon("type"); // type
|
913 |
-
String Annoid = Anno.getInfon("Identifier"); // identifier | MESH
|
914 |
-
if(Annoid == null)
|
915 |
-
{
|
916 |
-
Annoid = Anno.getInfon("Identifier"); // identifier | MESH
|
917 |
-
}
|
918 |
-
if(Annoid == null || Annoid.equals("null"))
|
919 |
-
{
|
920 |
-
AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype); //paragraph
|
921 |
-
}
|
922 |
-
else
|
923 |
-
{
|
924 |
-
AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid); //paragraph
|
925 |
-
}
|
926 |
-
}
|
927 |
-
AnnotationInPMID.add(AnnotationInPassage);
|
928 |
-
}
|
929 |
-
PassageNames.add(PassageName);
|
930 |
-
PassageContexts.add(PassageContext);
|
931 |
-
PassageOffsets.add(PassageOffset);
|
932 |
-
Annotations.add(AnnotationInPMID);
|
933 |
-
}
|
934 |
-
}
|
935 |
-
public void BioCOutput(String input,String output, ArrayList<ArrayList<ArrayList<String>>> Annotations,boolean Final,boolean RemovePreviousAnno) throws IOException, XMLStreamException
|
936 |
-
{
|
937 |
-
boolean ShowUnNormalizedMention = false;
|
938 |
-
if(GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True"))
|
939 |
-
{
|
940 |
-
ShowUnNormalizedMention = true;
|
941 |
-
}
|
942 |
-
|
943 |
-
BioCDocumentWriter BioCOutputFormat = BioCFactory.newFactory(BioCFactory.WOODSTOX).createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
|
944 |
-
BioCCollection biocCollection_input = new BioCCollection();
|
945 |
-
BioCCollection biocCollection_output = new BioCCollection();
|
946 |
-
|
947 |
-
//input: BioC
|
948 |
-
ConnectorWoodstox connector = new ConnectorWoodstox();
|
949 |
-
biocCollection_input = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
|
950 |
-
BioCOutputFormat.writeCollectionInfo(biocCollection_input);
|
951 |
-
int i=0; //count for pmid
|
952 |
-
while (connector.hasNext())
|
953 |
-
{
|
954 |
-
BioCDocument document_output = new BioCDocument();
|
955 |
-
BioCDocument document_input = connector.next();
|
956 |
-
String PMID=document_input.getID();
|
957 |
-
document_output.setID(PMID);
|
958 |
-
int annotation_count=0;
|
959 |
-
int j=0; //count for paragraph
|
960 |
-
for (BioCPassage passage_input : document_input.getPassages())
|
961 |
-
{
|
962 |
-
BioCPassage passage_output = passage_input;
|
963 |
-
|
964 |
-
if(RemovePreviousAnno == true) //clean the previous annotation, if the NER result is provided
|
965 |
-
{
|
966 |
-
passage_output.clearAnnotations();
|
967 |
-
}
|
968 |
-
else
|
969 |
-
{
|
970 |
-
for (BioCAnnotation annotation : passage_output.getAnnotations())
|
971 |
-
{
|
972 |
-
annotation.setID(""+annotation_count);
|
973 |
-
annotation_count++;
|
974 |
-
}
|
975 |
-
}
|
976 |
-
|
977 |
-
int passage_Offset = passage_input.getOffset();
|
978 |
-
String passage_Text = passage_input.getText();
|
979 |
-
ArrayList<String> AnnotationInPassage = new ArrayList<String>();
|
980 |
-
//ArrayList<String> AnnotationInPassage = Annotations.get(i).get(j);
|
981 |
-
if(Annotations.size()>i && Annotations.get(i).size()>j)
|
982 |
-
{
|
983 |
-
for(int a=0;a<Annotations.get(i).get(j).size();a++)
|
984 |
-
{
|
985 |
-
String Anno[]=Annotations.get(i).get(j).get(a).split("\\t");
|
986 |
-
int start = Integer.parseInt(Anno[0]);
|
987 |
-
int last = Integer.parseInt(Anno[1]);
|
988 |
-
boolean found = false;
|
989 |
-
if(passage_Text.length()>last)
|
990 |
-
{
|
991 |
-
String mention = Anno[2];
|
992 |
-
if(Final == true && passage_Text.length()>=last)
|
993 |
-
{
|
994 |
-
mention = passage_Text.substring(start, last);
|
995 |
-
}
|
996 |
-
if(mention.matches(".*\t.*"))
|
997 |
-
{
|
998 |
-
Anno[3]=Anno[4];
|
999 |
-
if(Anno.length>=6)
|
1000 |
-
{
|
1001 |
-
Anno[4]=Anno[5];
|
1002 |
-
}
|
1003 |
-
}
|
1004 |
-
String type = Anno[3];
|
1005 |
-
String id = ""; // optional
|
1006 |
-
if(Anno.length>=5){id = Anno[4];}
|
1007 |
-
if(Final == true)
|
1008 |
-
{
|
1009 |
-
for(int b=0;b<AnnotationInPassage.size();b++)
|
1010 |
-
{
|
1011 |
-
String Annob[]=AnnotationInPassage.get(b).split("\\t");
|
1012 |
-
int startb = Integer.parseInt(Annob[0]);
|
1013 |
-
int lastb = Integer.parseInt(Annob[1]);
|
1014 |
-
String mentionb = Annob[2];
|
1015 |
-
if(Final == true && passage_Text.length()>=lastb)
|
1016 |
-
{
|
1017 |
-
mentionb = passage_Text.substring(startb, lastb);
|
1018 |
-
}
|
1019 |
-
if(mentionb.matches(".*\t.*"))
|
1020 |
-
{
|
1021 |
-
Annob[3]=Annob[4];
|
1022 |
-
if(Annob.length>=6)
|
1023 |
-
{
|
1024 |
-
Annob[4]=Annob[5];
|
1025 |
-
}
|
1026 |
-
}
|
1027 |
-
String typeb = Annob[3];
|
1028 |
-
String idb = ""; // optional
|
1029 |
-
if(Annob.length>=5){idb = Annob[4];}
|
1030 |
-
|
1031 |
-
if(start == startb && last == lastb && type.equals(typeb))
|
1032 |
-
{
|
1033 |
-
found = true;
|
1034 |
-
if(id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals("")))
|
1035 |
-
{
|
1036 |
-
}
|
1037 |
-
else if(idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals("")))
|
1038 |
-
{
|
1039 |
-
AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id);
|
1040 |
-
}
|
1041 |
-
else
|
1042 |
-
{
|
1043 |
-
if(id.equals(""))
|
1044 |
-
{
|
1045 |
-
}
|
1046 |
-
else
|
1047 |
-
{
|
1048 |
-
AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+idb+";"+id);
|
1049 |
-
}
|
1050 |
-
|
1051 |
-
}
|
1052 |
-
break;
|
1053 |
-
}
|
1054 |
-
}
|
1055 |
-
}
|
1056 |
-
}
|
1057 |
-
if(found == false)
|
1058 |
-
{
|
1059 |
-
AnnotationInPassage.add(Annotations.get(i).get(j).get(a));
|
1060 |
-
}
|
1061 |
-
}
|
1062 |
-
}
|
1063 |
-
for(int a=0;a<AnnotationInPassage.size();a++)
|
1064 |
-
{
|
1065 |
-
String Anno[]=AnnotationInPassage.get(a).split("\\t");
|
1066 |
-
HashMap <String,String> id_hash = new HashMap <String,String>();
|
1067 |
-
if(Anno.length>=5)
|
1068 |
-
{
|
1069 |
-
int start = Integer.parseInt(Anno[0]);
|
1070 |
-
int last = Integer.parseInt(Anno[1]);
|
1071 |
-
String mention = Anno[2];
|
1072 |
-
if(Final == true && passage_Text.length()>=last)
|
1073 |
-
{
|
1074 |
-
mention = passage_Text.substring(start, last);
|
1075 |
-
}
|
1076 |
-
if(mention.matches(".*\t.*"))
|
1077 |
-
{
|
1078 |
-
Anno[3]=Anno[4];
|
1079 |
-
if(Anno.length>=6)
|
1080 |
-
{
|
1081 |
-
Anno[4]=Anno[5];
|
1082 |
-
}
|
1083 |
-
}
|
1084 |
-
String ids = Anno[4];
|
1085 |
-
String idlist[]=ids.split(",");
|
1086 |
-
for(int b=0;b<idlist.length;b++)
|
1087 |
-
{
|
1088 |
-
id_hash.put(idlist[b], "");
|
1089 |
-
}
|
1090 |
-
ids = "";
|
1091 |
-
for(String id :id_hash.keySet())
|
1092 |
-
{
|
1093 |
-
if(ids.equals(""))
|
1094 |
-
{
|
1095 |
-
ids = id;
|
1096 |
-
}
|
1097 |
-
else
|
1098 |
-
{
|
1099 |
-
ids = ids + ";" + id;
|
1100 |
-
}
|
1101 |
-
}
|
1102 |
-
AnnotationInPassage.set(a, Anno[0]+"\t"+Anno[1]+"\t"+Anno[2]+"\t"+Anno[3]+"\t"+ids);
|
1103 |
-
}
|
1104 |
-
}
|
1105 |
-
|
1106 |
-
for(int a=0;a<AnnotationInPassage.size();a++)
|
1107 |
-
{
|
1108 |
-
String Anno[]=AnnotationInPassage.get(a).split("\\t");
|
1109 |
-
int start = Integer.parseInt(Anno[0]);
|
1110 |
-
int last = Integer.parseInt(Anno[1]);
|
1111 |
-
if(passage_Text.length()>last)
|
1112 |
-
{
|
1113 |
-
String mention = Anno[2];
|
1114 |
-
if(Final == true && passage_Text.length()>=last)
|
1115 |
-
{
|
1116 |
-
mention = passage_Text.substring(start, last);
|
1117 |
-
}
|
1118 |
-
if(mention.matches(".*\t.*"))
|
1119 |
-
{
|
1120 |
-
Anno[3]=Anno[4];
|
1121 |
-
if(Anno.length>=6)
|
1122 |
-
{
|
1123 |
-
Anno[4]=Anno[5];
|
1124 |
-
}
|
1125 |
-
}
|
1126 |
-
String type = Anno[3];
|
1127 |
-
if(type.equals("GeneID")){type="Gene";}
|
1128 |
-
BioCAnnotation biocAnnotation = new BioCAnnotation();
|
1129 |
-
Map<String, String> AnnoInfons = new HashMap<String, String>();
|
1130 |
-
AnnoInfons.put("type", type);
|
1131 |
-
if(Anno.length>=5)
|
1132 |
-
{
|
1133 |
-
String identifier = Anno[4];
|
1134 |
-
if(Final == true && ShowUnNormalizedMention==false)
|
1135 |
-
{
|
1136 |
-
if(type.matches("(FamilyName|Domain|Gene)"))
|
1137 |
-
{
|
1138 |
-
Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
|
1139 |
-
Matcher mtmp0 = ptmp0.matcher(identifier);
|
1140 |
-
Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
|
1141 |
-
Matcher mtmp1 = ptmp1.matcher(identifier);
|
1142 |
-
Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$");
|
1143 |
-
Matcher mtmp2 = ptmp2.matcher(identifier);
|
1144 |
-
Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$");
|
1145 |
-
Matcher mtmp3 = ptmp3.matcher(identifier);
|
1146 |
-
if(mtmp0.find())
|
1147 |
-
{
|
1148 |
-
String Method_SA = mtmp0.group(1);
|
1149 |
-
String TaxonomyID = mtmp0.group(2);
|
1150 |
-
String NCBIGeneID = mtmp0.group(3);
|
1151 |
-
if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
|
1152 |
-
{
|
1153 |
-
AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
|
1154 |
-
}
|
1155 |
-
if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
|
1156 |
-
{
|
1157 |
-
AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
|
1158 |
-
}
|
1159 |
-
AnnoInfons.put("NCBI Gene", NCBIGeneID);
|
1160 |
-
}
|
1161 |
-
else if(mtmp1.find())
|
1162 |
-
{
|
1163 |
-
String Method_SA = mtmp1.group(1);
|
1164 |
-
String TaxonomyID = mtmp1.group(2);
|
1165 |
-
String NCBIGeneID = mtmp1.group(3);
|
1166 |
-
String HomoID = mtmp1.group(4);
|
1167 |
-
if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
|
1168 |
-
{
|
1169 |
-
AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
|
1170 |
-
}
|
1171 |
-
if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
|
1172 |
-
{
|
1173 |
-
AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
|
1174 |
-
}
|
1175 |
-
AnnoInfons.put("NCBI Gene", NCBIGeneID);
|
1176 |
-
}
|
1177 |
-
else if(mtmp2.find())
|
1178 |
-
{
|
1179 |
-
String Method_SA = mtmp2.group(1);
|
1180 |
-
String TaxonomyID = mtmp2.group(2);
|
1181 |
-
AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:"+TaxonomyID);
|
1182 |
-
}
|
1183 |
-
else if(mtmp3.find())
|
1184 |
-
{
|
1185 |
-
String Method_SA = mtmp3.group(1);
|
1186 |
-
String HomoID = mtmp3.group(2);
|
1187 |
-
AnnoInfons.put("NCBI Homologene", HomoID);
|
1188 |
-
}
|
1189 |
-
else
|
1190 |
-
{
|
1191 |
-
String identifiers[] = identifier.split(";");
|
1192 |
-
if(identifiers.length>1)
|
1193 |
-
{
|
1194 |
-
ArrayList<String> identifierSTR = new ArrayList<String>();
|
1195 |
-
ArrayList<String> ProteinidSTR = new ArrayList<String>();
|
1196 |
-
ArrayList<String> HomoidSTR = new ArrayList<String>();
|
1197 |
-
for(int idi=0;idi<identifiers.length;idi++)
|
1198 |
-
{
|
1199 |
-
Pattern ptmp4 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
|
1200 |
-
Matcher mtmp4 = ptmp4.matcher(identifiers[idi]);
|
1201 |
-
Pattern ptmp5 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
|
1202 |
-
Matcher mtmp5 = ptmp5.matcher(identifiers[idi]);
|
1203 |
-
if(mtmp4.find())
|
1204 |
-
{
|
1205 |
-
String Method_SA = mtmp4.group(1);
|
1206 |
-
String TaxonomyID = mtmp4.group(2);
|
1207 |
-
String NCBIGeneID = mtmp4.group(3);
|
1208 |
-
String HomoID = mtmp4.group(4);
|
1209 |
-
if(!identifierSTR.contains(NCBIGeneID))
|
1210 |
-
{
|
1211 |
-
identifierSTR.add(NCBIGeneID);
|
1212 |
-
}
|
1213 |
-
if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
|
1214 |
-
{
|
1215 |
-
if(!ProteinidSTR.contains(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)))
|
1216 |
-
{
|
1217 |
-
ProteinidSTR.add(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
|
1218 |
-
}
|
1219 |
-
}
|
1220 |
-
if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
|
1221 |
-
{
|
1222 |
-
if(!HomoidSTR.contains(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)))
|
1223 |
-
{
|
1224 |
-
HomoidSTR.add(GNormPlus.HomologeneID_hash.get(NCBIGeneID));
|
1225 |
-
}
|
1226 |
-
}
|
1227 |
-
|
1228 |
-
}
|
1229 |
-
else if(mtmp5.find())
|
1230 |
-
{
|
1231 |
-
String Method_SA = mtmp5.group(1);
|
1232 |
-
String TaxonomyID = mtmp5.group(2);
|
1233 |
-
String NCBIGeneID = mtmp5.group(3);
|
1234 |
-
if(!identifierSTR.contains(NCBIGeneID))
|
1235 |
-
{
|
1236 |
-
identifierSTR.add(NCBIGeneID);
|
1237 |
-
}
|
1238 |
-
}
|
1239 |
-
}
|
1240 |
-
String idSTR="";
|
1241 |
-
for(int x=0;x<identifierSTR.size();x++)
|
1242 |
-
{
|
1243 |
-
if(idSTR.equals(""))
|
1244 |
-
{
|
1245 |
-
idSTR = identifierSTR.get(x);
|
1246 |
-
}
|
1247 |
-
else
|
1248 |
-
{
|
1249 |
-
idSTR = idSTR+";"+identifierSTR.get(x);
|
1250 |
-
}
|
1251 |
-
}
|
1252 |
-
AnnoInfons.put("NCBI Gene", idSTR);
|
1253 |
-
|
1254 |
-
String pidSTR="";
|
1255 |
-
for(int x=0;x<ProteinidSTR.size();x++)
|
1256 |
-
{
|
1257 |
-
if(pidSTR.equals(""))
|
1258 |
-
{
|
1259 |
-
pidSTR = ProteinidSTR.get(x);
|
1260 |
-
}
|
1261 |
-
else
|
1262 |
-
{
|
1263 |
-
pidSTR = pidSTR+";"+ProteinidSTR.get(x);
|
1264 |
-
}
|
1265 |
-
}
|
1266 |
-
if(!pidSTR.equals(""))
|
1267 |
-
{
|
1268 |
-
AnnoInfons.put("UniProt", pidSTR);
|
1269 |
-
}
|
1270 |
-
|
1271 |
-
String hidSTR="";
|
1272 |
-
for(int x=0;x<HomoidSTR.size();x++)
|
1273 |
-
{
|
1274 |
-
if(hidSTR.equals(""))
|
1275 |
-
{
|
1276 |
-
hidSTR = HomoidSTR.get(x);
|
1277 |
-
}
|
1278 |
-
else
|
1279 |
-
{
|
1280 |
-
hidSTR = hidSTR+";"+HomoidSTR.get(x);
|
1281 |
-
}
|
1282 |
-
}
|
1283 |
-
if(!hidSTR.equals(""))
|
1284 |
-
{
|
1285 |
-
AnnoInfons.put("NCBI Homologene", hidSTR);
|
1286 |
-
}
|
1287 |
-
}
|
1288 |
-
//else
|
1289 |
-
//{
|
1290 |
-
// AnnoInfons.put("Identifier", identifier);
|
1291 |
-
//}
|
1292 |
-
}
|
1293 |
-
}
|
1294 |
-
else if (type.matches("(Species|Genus|Strain)"))
|
1295 |
-
{
|
1296 |
-
AnnoInfons.put("type", type);
|
1297 |
-
AnnoInfons.put("NCBI Taxonomy", identifier);
|
1298 |
-
}
|
1299 |
-
else if (type.matches("Cell"))
|
1300 |
-
{
|
1301 |
-
AnnoInfons.put("type", "CellLine");
|
1302 |
-
AnnoInfons.put("NCBI Taxonomy", identifier);
|
1303 |
-
}
|
1304 |
-
else
|
1305 |
-
{
|
1306 |
-
AnnoInfons.put("Identifier", identifier);
|
1307 |
-
}
|
1308 |
-
}
|
1309 |
-
else
|
1310 |
-
{
|
1311 |
-
AnnoInfons.put("Identifier", identifier);
|
1312 |
-
}
|
1313 |
-
}
|
1314 |
-
biocAnnotation.setInfons(AnnoInfons);
|
1315 |
-
BioCLocation location = new BioCLocation();
|
1316 |
-
location.setOffset(start+passage_Offset);
|
1317 |
-
location.setLength(last-start);
|
1318 |
-
biocAnnotation.setLocation(location);
|
1319 |
-
biocAnnotation.setText(mention);
|
1320 |
-
biocAnnotation.setID(""+annotation_count);
|
1321 |
-
annotation_count++;
|
1322 |
-
if(Final == true)
|
1323 |
-
{
|
1324 |
-
if(AnnoInfons.containsKey("Identifier") || AnnoInfons.containsKey("NCBI Homologene") || AnnoInfons.containsKey("NCBI Gene") || AnnoInfons.containsKey("NCBI Taxonomy"))
|
1325 |
-
{
|
1326 |
-
passage_output.addAnnotation(biocAnnotation);
|
1327 |
-
}
|
1328 |
-
}
|
1329 |
-
else
|
1330 |
-
{
|
1331 |
-
passage_output.addAnnotation(biocAnnotation);
|
1332 |
-
}
|
1333 |
-
}
|
1334 |
-
}
|
1335 |
-
document_output.addPassage(passage_output);
|
1336 |
-
j++;
|
1337 |
-
}
|
1338 |
-
biocCollection_output.addDocument(document_output);
|
1339 |
-
BioCOutputFormat.writeDocument(document_output);
|
1340 |
-
i++;
|
1341 |
-
}
|
1342 |
-
BioCOutputFormat.close();
|
1343 |
-
}
|
1344 |
}
|
|
|
/**
 * Project: GNormPlus
 * Function: Data storage in BioC format
 */

package GNormPluslib;

import bioc.BioCAnnotation;
import bioc.BioCCollection;
import bioc.BioCDocument;
import bioc.BioCLocation;
import bioc.BioCPassage;

import bioc.io.BioCDocumentWriter;
import bioc.io.BioCFactory;
import bioc.io.woodstox.ConnectorWoodstox;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.time.LocalDate;
import java.time.ZoneId;

import javax.xml.stream.XMLStreamException;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class BioCDoc
{
	/*
	 * Contexts in BioC file
	 */
	public ArrayList<String> PMIDs=new ArrayList<String>(); // Type: PMIDs
	public ArrayList<ArrayList<String>> PassageNames = new ArrayList(); // PassageName
	public ArrayList<ArrayList<Integer>> PassageOffsets = new ArrayList(); // PassageOffset
	public ArrayList<ArrayList<String>> PassageContexts = new ArrayList(); // PassageContext
	public ArrayList<ArrayList<ArrayList<String>>> Annotations = new ArrayList(); // Annotation - GNormPlus
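	/*
	 * The five lists above are parallel: index i addresses the i-th document
	 * read from a collection, so PassageContexts.get(i).get(j) is the text of
	 * passage j in document i, and Annotations.get(i).get(j) holds its
	 * tab-separated annotation records ("start\tlast\tmention\ttype[\tid]").
	 */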
	public String BioCFormatCheck(String InputFile) throws IOException
	{

		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		try
		{
			collection = connector.startRead(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
		}
		catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e)
		{
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
			String line="";
			String status="";
			String Pmid = "";
			boolean tiabs=false;
			Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$");
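			// Matches one "PMID|field|text" line of PubTator input; group(1) is
			// the PMID, group(2) the field tag (e.g. "t" or "a"), group(3) the text.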
			while ((line = br.readLine()) != null)
			{
				Matcher mat = patt.matcher(line);
				if(mat.find()) //Title|Abstract
				{
					if(Pmid.equals(""))
					{
						Pmid = mat.group(1);
					}
					else if(!Pmid.equals(mat.group(1)))
					{
						return "[Error]: "+InputFile+" - A blank is needed between "+Pmid+" and "+mat.group(1)+".";
					}
					status = "tiabs";
					tiabs = true;
				}
				else if (line.contains("\t")) //Annotation
				{
				}
				else if(line.length()==0) //Processing
				{
					if(status.equals(""))
					{
						if(Pmid.equals(""))
						{
							return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format. PMID is empty.";
						}
						else
						{
							return "[Error]: "+InputFile+" - A redundant blank is after "+Pmid+".";
						}
					}
					Pmid="";
					status="";
				}
			}
			br.close();
			if(tiabs == false)
			{
				return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format.";
			}
			if(status.equals(""))
			{
				return "PubTator";
			}
			else
			{
				return "[Error]: "+InputFile+" - The last column missed a blank.";
			}
		}
		return "BioC";
	}
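	/*
	 * Converts PubTator input to BioC. A PubTator document (the PMID and
	 * values below are a made-up sketch of the layout) looks like:
	 *
	 *   10021369|t|<title text>
	 *   10021369|a|<abstract text>
	 *   10021369<TAB>start<TAB>last<TAB>mention<TAB>type[<TAB>identifier]
	 *   <blank line terminates the document>
	 */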
	public void PubTator2BioC(String input,String output) throws IOException, XMLStreamException // Input
	{
		/*
		 * PubTator2BioC
		 */
		String parser = BioCFactory.WOODSTOX;
		BioCFactory factory = BioCFactory.newFactory(parser);
		BioCDocumentWriter BioCOutputFormat = factory.createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		BioCCollection biocCollection = new BioCCollection();

		//time
		ZoneId zonedId = ZoneId.of( "America/Montreal" );
		LocalDate today = LocalDate.now( zonedId );
		biocCollection.setDate(today.toString());

		biocCollection.setKey("BioC.key");//key
		biocCollection.setSource("GNormPlus");//source

		BioCOutputFormat.writeCollectionInfo(biocCollection);
		BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
		ArrayList<String> ParagraphType=new ArrayList<String>(); // Type: Title|Abstract
		ArrayList<String> ParagraphContent = new ArrayList<String>(); // Text
		ArrayList<String> annotations = new ArrayList<String>(); // Annotation
		String line;
		String Pmid="";
		while ((line = inputfile.readLine()) != null)
		{
			if(line.contains("|") && !line.contains("\t")) //Title|Abstract
			{
				String str[]=line.split("\\|",-1);
				Pmid=str[0];
				if(str[1].equals("t"))
				{
					str[1]="title";
				}
				if(str[1].equals("a"))
				{
					str[1]="abstract";
				}
				ParagraphType.add(str[1]);
				if(str.length==3)
				{
					String txt = str[2];
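					// Map Greek letters and other non-ASCII symbols to single ASCII
					// stand-ins; one-to-one replacements keep annotation offsets valid.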
					txt = txt.replaceAll("ω","w");
					txt = txt.replaceAll("μ","u");
					txt = txt.replaceAll("κ","k");
					txt = txt.replaceAll("α","a");
					txt = txt.replaceAll("γ","g");
					txt = txt.replaceAll("ɣ","g");
					txt = txt.replaceAll("β","b");
					txt = txt.replaceAll("×","x");
					txt = txt.replaceAll("‑","-");
					txt = txt.replaceAll("¹","1");
					txt = txt.replaceAll("²","2");
					txt = txt.replaceAll("°","o");
					txt = txt.replaceAll("ö","o");
					txt = txt.replaceAll("é","e");
					txt = txt.replaceAll("à","a");
					txt = txt.replaceAll("Á","A");
					txt = txt.replaceAll("ε","e");
					txt = txt.replaceAll("θ","O");
					txt = txt.replaceAll("•",".");
					txt = txt.replaceAll("µ","u");
					txt = txt.replaceAll("λ","r");
					txt = txt.replaceAll("⁺","+");
					txt = txt.replaceAll("ν","v");
					txt = txt.replaceAll("ï","i");
					txt = txt.replaceAll("ã","a");
					txt = txt.replaceAll("≡","=");
					txt = txt.replaceAll("ó","o");
					txt = txt.replaceAll("³","3");
					txt = txt.replaceAll("〖","[");
					txt = txt.replaceAll("〗","]");
					txt = txt.replaceAll("Å","A");
					txt = txt.replaceAll("ρ","p");
					txt = txt.replaceAll("ü","u");
					txt = txt.replaceAll("ɛ","e");
					txt = txt.replaceAll("č","c");
					txt = txt.replaceAll("š","s");
					txt = txt.replaceAll("ß","b");
					txt = txt.replaceAll("═","=");
					txt = txt.replaceAll("£","L");
					txt = txt.replaceAll("Ł","L");
					txt = txt.replaceAll("ƒ","f");
					txt = txt.replaceAll("ä","a");
					txt = txt.replaceAll("–","-");
					txt = txt.replaceAll("⁻","-");
					txt = txt.replaceAll("〈","<");
					txt = txt.replaceAll("〉",">");
					txt = txt.replaceAll("χ","X");
					txt = txt.replaceAll("Đ","D");
					txt = txt.replaceAll("‰","%");
					txt = txt.replaceAll("·",".");
					txt = txt.replaceAll("→",">");
					txt = txt.replaceAll("←","<");
					txt = txt.replaceAll("ζ","z");
					txt = txt.replaceAll("π","p");
					txt = txt.replaceAll("τ","t");
					txt = txt.replaceAll("ξ","X");
					txt = txt.replaceAll("η","h");
					txt = txt.replaceAll("ø","0");
					txt = txt.replaceAll("Δ","D");
					txt = txt.replaceAll("∆","D");
					txt = txt.replaceAll("∑","S");
					txt = txt.replaceAll("Ω","O");
					txt = txt.replaceAll("δ","d");
					txt = txt.replaceAll("σ","s");
					txt = txt.replaceAll("Φ","F");
					txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
					ParagraphContent.add(txt);
				}
				else
				{
					ParagraphContent.add("- No text -");
				}
			}
			else if (line.contains("\t")) //Annotation
			{
				String anno[]=line.split("\t");
				if(anno.length==6)
				{
					annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]+"\t"+anno[5]);
				}
				else if(anno.length==5)
				{
					annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]);
				}
			}
			else if(line.length()==0) //Processing
			{
				BioCDocument biocDocument = new BioCDocument();
				biocDocument.setID(Pmid);
				int startoffset=0;
				for(int i=0;i<ParagraphType.size();i++)
				{
					BioCPassage biocPassage = new BioCPassage();
					Map<String, String> Infons = new HashMap<String, String>();
					Infons.put("type", ParagraphType.get(i));
					biocPassage.setInfons(Infons);
					biocPassage.setText(ParagraphContent.get(i));
					biocPassage.setOffset(startoffset);
					startoffset=startoffset+ParagraphContent.get(i).length()+1;
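					// startoffset now points just past this passage (the +1 is the
					// newline separator), so an annotation belongs here when its
					// start falls in [startoffset - length - 1, startoffset).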
					for(int j=0;j<annotations.size();j++)
					{
						String anno[]=annotations.get(j).split("\t");
						if(Integer.parseInt(anno[0])<startoffset && Integer.parseInt(anno[0])>=startoffset-ParagraphContent.get(i).length()-1)
						{
							BioCAnnotation biocAnnotation = new BioCAnnotation();
							Map<String, String> AnnoInfons = new HashMap<String, String>();
							if(anno.length==5)
							{
								AnnoInfons.put("Identifier", anno[4]);
							}
							AnnoInfons.put("type", anno[3]);
							biocAnnotation.setInfons(AnnoInfons);
							BioCLocation location = new BioCLocation();
							location.setOffset(Integer.parseInt(anno[0]));
							location.setLength(Integer.parseInt(anno[1])-Integer.parseInt(anno[0]));
							biocAnnotation.setLocation(location);
							biocAnnotation.setText(anno[2]);
							biocPassage.addAnnotation(biocAnnotation);
						}
					}
					biocDocument.addPassage(biocPassage);
				}
				biocCollection.addDocument(biocDocument);
				ParagraphType.clear();
				ParagraphContent.clear();
				annotations.clear();
				BioCOutputFormat.writeDocument(biocDocument);
			}
		}
		BioCOutputFormat.close();
		inputfile.close();
	}
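	/*
	 * The reverse conversion: each passage becomes a "PMID|t|...",
	 * "PMID|a|...", or "PMID|<type>|..." line and each annotation a
	 * tab-separated line; the run aborts if a PMID appears twice.
	 */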
	public void BioC2PubTator(String input,String output) throws IOException, XMLStreamException //Output
	{
		/*
		 * BioC2PubTator
		 */
		HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
		boolean duplicate = false;
		BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
		while (connector.hasNext())
		{
			BioCDocument document = connector.next();
			String PMID = document.getID();
			if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
			else{pmidlist.put(PMID,"");}
			String Anno="";
			for (BioCPassage passage : document.getPassages())
			{
				if(passage.getInfon("type").equals("title"))
				{
					PubTatorOutputFormat.write(PMID+"|t|"+passage.getText()+"\n");
				}
				else if(passage.getInfon("type").equals("abstract"))
				{
					PubTatorOutputFormat.write(PMID+"|a|"+passage.getText()+"\n");
				}
				else
				{
					PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
				}

				for (BioCAnnotation annotation : passage.getAnnotations())
				{
					String Annotype = annotation.getInfon("type");
					String Annoid="";
					String Proteinid="";
					if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
					{
						if(annotation.getInfons().containsKey("NCBI Gene"))
						{
							Annoid = annotation.getInfon("NCBI Gene");
							String Annoidlist[]=Annoid.split(";");
							Annoid="";
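							// Decorate each gene id with its cross-references when known;
							// e.g. a hypothetical gene id 101 with UniProt P12345 and
							// Homologene 77 is emitted as "101(UniProt:P12345;Homoid:77)".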
							for(int x=0;x<Annoidlist.length;x++)
							{
								//Normalization2Protein
								String proteinid="";
								String homoid="";

								if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
								{
									proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
								}
								if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
								{
									homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
								}

								if((!proteinid.equals("")) || (!homoid.equals("")))
								{
									if(Annoid.equals(""))
									{
										Annoid=Annoidlist[x]+"(";
										if(!proteinid.equals(""))
										{
											Annoid=Annoid+"UniProt:"+proteinid;
										}
										if(!homoid.equals(""))
										{
											if(!proteinid.equals(""))
											{
												Annoid=Annoid+";";
											}
											Annoid=Annoid+"Homoid:"+homoid;
										}
										Annoid=Annoid+")";
									}
									else
									{
										Annoid=Annoid+";"+Annoidlist[x]+"(";
										if(!proteinid.equals(""))
										{
											Annoid=Annoid+"UniProt:"+proteinid;
										}
										if(!homoid.equals(""))
										{
											if(!proteinid.equals(""))
											{
												Annoid=Annoid+";";
											}
											Annoid=Annoid+"Homoid:"+homoid;
										}
										Annoid=Annoid+")";
									}
								}
								else
								{
									if(Annoid.equals(""))
									{
										Annoid=Annoidlist[x];
									}
									else
									{
										Annoid=Annoid+";"+Annoidlist[x];
									}
								}
							}
						}
						//else if(annotation.getInfons().containsKey("NCBI Homologene"))
						//{
						//	Annoid = annotation.getInfon("NCBI Homologene");
						//}
						//else if(!annotation.getInfons().containsKey("FocusSpecies"))
						//{
						//	Annoid = annotation.getInfon("FocusSpecies");
						//}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
					{
						if(annotation.getInfons().containsKey("NCBI Taxonomy"))
						{
							Annoid = annotation.getInfon("NCBI Taxonomy");
						}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else if(Annotype.equals("CellLine"))
					{
						if(annotation.getInfons().containsKey("NCBI Taxonomy"))
						{
							Annoid = annotation.getInfon("NCBI Taxonomy");
						}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else
					{
						Annoid = annotation.getInfon("Identifier");
					}
					int start = annotation.getLocations().get(0).getOffset();
					int last = start + annotation.getLocations().get(0).getLength();
					String AnnoMention=annotation.getText();
					if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
					{
						Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
					}
					else
					{
						Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
					}
				}
			}
			PubTatorOutputFormat.write(Anno+"\n");
		}
		PubTatorOutputFormat.close();
		if(duplicate == true){System.exit(0);}
	}
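	/*
	 * Overload that also takes the original title/abstract file, so the
	 * untouched text (rather than the normalized passage text) is written
	 * back for title and abstract lines.
	 */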
	public void BioC2PubTator(String original_input,String input,String output) throws IOException, XMLStreamException //Output
	{
		/* original tiabs*/
		BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(original_input), "UTF-8"));
		HashMap<String,String> ParagraphContent = new HashMap<String,String>(); // [PMID,0] -> title
		HashMap<String,String> annotations = new HashMap<String,String>(); // PMID ->Annotation
		String line;
		String Pmid="";
		int count_paragraph=0;
		while ((line = inputfile.readLine()) != null)
		{
			if(line.contains("|") && !line.contains("\t")) //Title|Abstract
			{
				String str[]=line.split("\\|",-1);
				Pmid=str[0];
				ParagraphContent.put(Pmid+"\t"+str[1],str[2]);
				count_paragraph++;
			}
			else if (line.contains("\t")) //Annotation
			{
				annotations.put(Pmid, annotations.get(Pmid)+line);
			}
			else if(line.length()==0) //Processing
			{
				count_paragraph=0;
			}
		}
		inputfile.close();

		/*
		 * BioC2PubTator
		 */
		HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
		boolean duplicate = false;
		BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
		while (connector.hasNext())
		{
			BioCDocument document = connector.next();
			String PMID = document.getID();
			if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
			else{pmidlist.put(PMID,"");}
			String Anno="";
			for (BioCPassage passage : document.getPassages())
			{
				if(passage.getInfon("type").equals("title") || passage.getInfon("type").equals("t"))
				{
					PubTatorOutputFormat.write(PMID+"|t|"+ParagraphContent.get(PMID+"\tt")+"\n");
				}
				else if(passage.getInfon("type").equals("abstract") || passage.getInfon("type").equals("a"))
				{
					PubTatorOutputFormat.write(PMID+"|a|"+ParagraphContent.get(PMID+"\ta")+"\n");
				}
				else
				{
					PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
				}

				for (BioCAnnotation annotation : passage.getAnnotations())
				{
					String Annotype = annotation.getInfon("type");
					String Annoid="";
					String Proteinid="";
					if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
					{
						if(annotation.getInfons().containsKey("NCBI Gene"))
						{
							Annoid = annotation.getInfon("NCBI Gene");
							String Annoidlist[]=Annoid.split(";");
							Annoid="";
							for(int x=0;x<Annoidlist.length;x++)
							{
								//Normalization2Protein
								String proteinid="";
								String homoid="";

								if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
								{
									proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
								}
								if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
								{
									homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
								}

								if((!proteinid.equals("")) || (!homoid.equals("")))
								{
									if(Annoid.equals(""))
									{
										Annoid=Annoidlist[x]+"(";
										if(!proteinid.equals(""))
										{
											Annoid=Annoid+"UniProt:"+proteinid;
										}
										if(!homoid.equals(""))
										{
											if(!proteinid.equals(""))
											{
												Annoid=Annoid+";";
											}
											Annoid=Annoid+"Homoid:"+homoid;
										}
										Annoid=Annoid+")";
									}
									else
									{
										Annoid=Annoid+";"+Annoidlist[x]+"(";
										if(!proteinid.equals(""))
										{
											Annoid=Annoid+"UniProt:"+proteinid;
										}
										if(!homoid.equals(""))
										{
											if(!proteinid.equals(""))
											{
												Annoid=Annoid+";";
											}
											Annoid=Annoid+"Homoid:"+homoid;
										}
										Annoid=Annoid+")";
									}
								}
								else
								{
									if(Annoid.equals(""))
									{
										Annoid=Annoidlist[x];
									}
									else
									{
										Annoid=Annoid+";"+Annoidlist[x];
									}
								}
							}
						}
						//else if(annotation.getInfons().containsKey("NCBI Homologene"))
						//{
						//	Annoid = annotation.getInfon("NCBI Homologene");
						//}
						//else if(annotation.getInfons().containsKey("FocusSpecies"))
						//{
						//	Annoid = annotation.getInfon("FocusSpecies");
						//}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
					{
						if(annotation.getInfons().containsKey("NCBI Taxonomy"))
						{
							Annoid = annotation.getInfon("NCBI Taxonomy");
						}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else if(Annotype.equals("CellLine"))
					{
						if(annotation.getInfons().containsKey("NCBI Taxonomy"))
						{
							Annoid = annotation.getInfon("NCBI Taxonomy");
						}
						else
						{
							Annoid = annotation.getInfon("Identifier");
						}
					}
					else
					{
						if(annotation.getInfons().containsKey("Identifier"))
						{
							Annoid = annotation.getInfon("Identifier");
						}
						else
						{
							Annoid = "";
						}
					}
					int start = annotation.getLocations().get(0).getOffset();
					int last = start + annotation.getLocations().get(0).getLength();
					String AnnoMention=annotation.getText();
					if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
					{
						Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
					}
					else
					{
						Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
					}
				}
			}
			PubTatorOutputFormat.write(Anno+"\n");
		}
		PubTatorOutputFormat.close();
		if(duplicate == true){System.exit(0);}
	}
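	/*
	 * Loads a BioC collection into the parallel lists declared at the top of
	 * the class, applying the same character normalization as PubTator2BioC.
	 */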
	public void BioCReader(String input) throws IOException, XMLStreamException
	{
		ConnectorWoodstox connector = new ConnectorWoodstox();
		BioCCollection collection = new BioCCollection();
		collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));

		/*
		 * Per document
		 */
		while (connector.hasNext())
		{
			BioCDocument document = connector.next();
			PMIDs.add(document.getID());

			ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
			ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
			ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
			ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName

			/*
			 * Per Passage
			 */
			for (BioCPassage passage : document.getPassages())
			{
				PassageName.add(passage.getInfon("type")); //Paragraph
				String txt = passage.getText();
689 |
+
if(txt.matches("[\t ]+"))
|
690 |
+
{
|
691 |
+
txt = txt.replaceAll(".","@");
|
692 |
+
}
|
693 |
+
else
|
694 |
+
{
|
695 |
+
//if(passage.getInfon("type").toLowerCase().equals("table"))
|
696 |
+
//{
|
697 |
+
// txt=txt.replaceAll(" ", "|");
|
698 |
+
//}
|
699 |
+
txt = txt.replaceAll("ω","w");
|
700 |
+
txt = txt.replaceAll("μ","u");
|
701 |
+
txt = txt.replaceAll("κ","k");
|
702 |
+
txt = txt.replaceAll("α","a");
|
703 |
+
txt = txt.replaceAll("γ","g");
|
704 |
+
txt = txt.replaceAll("ɣ","g");
|
705 |
+
txt = txt.replaceAll("β","b");
|
706 |
+
txt = txt.replaceAll("×","x");
|
707 |
+
txt = txt.replaceAll("‑","-");
|
708 |
+
txt = txt.replaceAll("¹","1");
|
709 |
+
txt = txt.replaceAll("²","2");
|
710 |
+
txt = txt.replaceAll("°","o");
|
711 |
+
txt = txt.replaceAll("ö","o");
|
712 |
+
txt = txt.replaceAll("é","e");
|
713 |
+
txt = txt.replaceAll("à","a");
|
714 |
+
txt = txt.replaceAll("Á","A");
|
715 |
+
txt = txt.replaceAll("ε","e");
|
716 |
+
txt = txt.replaceAll("θ","O");
|
717 |
+
txt = txt.replaceAll("•",".");
|
718 |
+
txt = txt.replaceAll("µ","u");
|
719 |
+
txt = txt.replaceAll("λ","r");
|
720 |
+
txt = txt.replaceAll("⁺","+");
|
721 |
+
txt = txt.replaceAll("ν","v");
|
722 |
+
txt = txt.replaceAll("ï","i");
|
723 |
+
txt = txt.replaceAll("ã","a");
|
724 |
+
txt = txt.replaceAll("≡","=");
|
725 |
+
txt = txt.replaceAll("ó","o");
|
726 |
+
txt = txt.replaceAll("³","3");
|
727 |
+
txt = txt.replaceAll("〖","[");
|
728 |
+
txt = txt.replaceAll("〗","]");
|
729 |
+
txt = txt.replaceAll("Å","A");
|
730 |
+
txt = txt.replaceAll("ρ","p");
|
731 |
+
txt = txt.replaceAll("ü","u");
|
732 |
+
txt = txt.replaceAll("ɛ","e");
|
733 |
+
txt = txt.replaceAll("č","c");
|
734 |
+
txt = txt.replaceAll("š","s");
|
735 |
+
txt = txt.replaceAll("ß","b");
|
736 |
+
txt = txt.replaceAll("═","=");
|
737 |
+
txt = txt.replaceAll("£","L");
|
738 |
+
txt = txt.replaceAll("Ł","L");
|
739 |
+
txt = txt.replaceAll("ƒ","f");
|
740 |
+
txt = txt.replaceAll("ä","a");
|
741 |
+
txt = txt.replaceAll("–","-");
|
742 |
+
txt = txt.replaceAll("⁻","-");
|
743 |
+
txt = txt.replaceAll("〈","<");
|
744 |
+
txt = txt.replaceAll("〉",">");
|
745 |
+
txt = txt.replaceAll("χ","X");
|
746 |
+
txt = txt.replaceAll("Đ","D");
|
747 |
+
txt = txt.replaceAll("‰","%");
|
748 |
+
txt = txt.replaceAll("·",".");
|
749 |
+
txt = txt.replaceAll("→",">");
|
750 |
+
txt = txt.replaceAll("←","<");
|
751 |
+
txt = txt.replaceAll("ζ","z");
|
752 |
+
txt = txt.replaceAll("π","p");
|
753 |
+
txt = txt.replaceAll("τ","t");
|
754 |
+
txt = txt.replaceAll("ξ","X");
|
755 |
+
txt = txt.replaceAll("η","h");
|
756 |
+
txt = txt.replaceAll("ø","0");
|
757 |
+
txt = txt.replaceAll("Δ","D");
|
758 |
+
txt = txt.replaceAll("∆","D");
|
759 |
+
txt = txt.replaceAll("∑","S");
|
760 |
+
txt = txt.replaceAll("Ω","O");
|
761 |
+
txt = txt.replaceAll("δ","d");
|
762 |
+
txt = txt.replaceAll("σ","s");
|
763 |
+
txt = txt.replaceAll("Φ","F");
|
764 |
+
//txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
|
765 |
+
}
|
766 |
+
if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
|
767 |
+
{
|
768 |
+
PassageContext.add("-notext-"); //Context
|
769 |
+
}
|
770 |
+
else
|
771 |
+
{
|
772 |
+
PassageContext.add(txt); //Context
|
773 |
+
}
|
774 |
+
PassageOffset.add(passage.getOffset()); //Offset
|
775 |
+
ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
|
776 |
+
AnnotationInPMID.add(AnnotationInPassage);
|
777 |
+
}
|
778 |
+
PassageNames.add(PassageName);
|
779 |
+
PassageContexts.add(PassageContext);
|
780 |
+
PassageOffsets.add(PassageOffset);
|
781 |
+
Annotations.add(AnnotationInPMID);
|
782 |
+
}
|
783 |
+
}
|
784 |
+
+  public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException
+  {
+    ConnectorWoodstox connector = new ConnectorWoodstox();
+    BioCCollection collection = new BioCCollection();
+    collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
+
+    /*
+     * Per document
+     */
+    while (connector.hasNext())
+    {
+      BioCDocument document = connector.next();
+      PMIDs.add(document.getID());
+
+      ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
+      ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
+      ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
+      ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
+
+      /*
+       * Per Passage
+       */
+      for (BioCPassage passage : document.getPassages())
+      {
+        PassageName.add(passage.getInfon("type")); //Paragraph
+
+        String txt = passage.getText();
+        if(txt.matches("[\t ]+"))
+        {
+          txt = txt.replaceAll(".","@");
+        }
+        else
+        {
+          //if(passage.getInfon("type").toLowerCase().equals("table"))
+          //{
+          //  txt=txt.replaceAll(" ", "|");
+          //}
+          txt = txt.replaceAll("ω","w");
+          txt = txt.replaceAll("μ","u");
+          txt = txt.replaceAll("κ","k");
+          txt = txt.replaceAll("α","a");
+          txt = txt.replaceAll("γ","g");
+          txt = txt.replaceAll("ɣ","g");
+          txt = txt.replaceAll("β","b");
+          txt = txt.replaceAll("×","x");
+          txt = txt.replaceAll("‑","-");
+          txt = txt.replaceAll("¹","1");
+          txt = txt.replaceAll("²","2");
+          txt = txt.replaceAll("°","o");
+          txt = txt.replaceAll("ö","o");
+          txt = txt.replaceAll("é","e");
+          txt = txt.replaceAll("à","a");
+          txt = txt.replaceAll("Á","A");
+          txt = txt.replaceAll("ε","e");
+          txt = txt.replaceAll("θ","O");
+          txt = txt.replaceAll("•",".");
+          txt = txt.replaceAll("µ","u");
+          txt = txt.replaceAll("λ","r");
+          txt = txt.replaceAll("⁺","+");
+          txt = txt.replaceAll("ν","v");
+          txt = txt.replaceAll("ï","i");
+          txt = txt.replaceAll("ã","a");
+          txt = txt.replaceAll("≡","=");
+          txt = txt.replaceAll("ó","o");
+          txt = txt.replaceAll("³","3");
+          txt = txt.replaceAll("〖","[");
+          txt = txt.replaceAll("〗","]");
+          txt = txt.replaceAll("Å","A");
+          txt = txt.replaceAll("ρ","p");
+          txt = txt.replaceAll("ü","u");
+          txt = txt.replaceAll("ɛ","e");
+          txt = txt.replaceAll("č","c");
+          txt = txt.replaceAll("š","s");
+          txt = txt.replaceAll("ß","b");
+          txt = txt.replaceAll("═","=");
+          txt = txt.replaceAll("£","L");
+          txt = txt.replaceAll("Ł","L");
+          txt = txt.replaceAll("ƒ","f");
+          txt = txt.replaceAll("ä","a");
+          txt = txt.replaceAll("–","-");
+          txt = txt.replaceAll("⁻","-");
+          txt = txt.replaceAll("〈","<");
+          txt = txt.replaceAll("〉",">");
+          txt = txt.replaceAll("χ","X");
+          txt = txt.replaceAll("Đ","D");
+          txt = txt.replaceAll("‰","%");
+          txt = txt.replaceAll("·",".");
+          txt = txt.replaceAll("→",">");
+          txt = txt.replaceAll("←","<");
+          txt = txt.replaceAll("ζ","z");
+          txt = txt.replaceAll("π","p");
+          txt = txt.replaceAll("τ","t");
+          txt = txt.replaceAll("ξ","X");
+          txt = txt.replaceAll("η","h");
+          txt = txt.replaceAll("ø","0");
+          txt = txt.replaceAll("Δ","D");
+          txt = txt.replaceAll("∆","D");
+          txt = txt.replaceAll("∑","S");
+          txt = txt.replaceAll("Ω","O");
+          txt = txt.replaceAll("δ","d");
+          txt = txt.replaceAll("σ","s");
+          txt = txt.replaceAll("Φ","F");
+          //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
+        }
+        if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
+        {
+          PassageContext.add("-notext-"); //Context
+        }
+        else
+        {
+          PassageContext.add(txt); //Context
+        }
+        PassageOffset.add(passage.getOffset()); //Offset
+        ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
+
+        /*
+         * Per Annotation :
+         * start
+         * last
+         * mention
+         * type
+         * id
+         */
+        for (BioCAnnotation Anno : passage.getAnnotations())
+        {
+          int start = Anno.getLocations().get(0).getOffset()-passage.getOffset(); // start
+          int last = start + Anno.getLocations().get(0).getLength(); // last
+          String AnnoMention=Anno.getText(); // mention
+          String Annotype = Anno.getInfon("type"); // type
+          String Annoid = Anno.getInfon("Identifier"); // identifier | MESH
+          if(Annoid == null)
+          {
+            Annoid = Anno.getInfon("Identifier"); // identifier | MESH
+          }
+          if(Annoid == null || Annoid.equals("null"))
+          {
+            AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype); //paragraph
+          }
+          else
+          {
+            AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid); //paragraph
+          }
+        }
+        AnnotationInPMID.add(AnnotationInPassage);
+      }
+      PassageNames.add(PassageName);
+      PassageContexts.add(PassageContext);
+      PassageOffsets.add(PassageOffset);
+      Annotations.add(AnnotationInPMID);
+    }
+  }
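Both BioCReader and BioCReaderWithAnnotation normalize passage text through the same chain of replaceAll calls, mapping each non-ASCII symbol to a single ASCII stand-in so that character offsets stay aligned with downstream annotations. Each replaceAll recompiles its pattern on every call; the sketch below shows a table-driven single-pass alternative. It is only an illustration of the design trade-off, not GNormPlus code, and the map repeats just a few of the pairs:

    import java.util.LinkedHashMap;
    import java.util.Map;

    // Sketch of a table-driven version of the character normalization above.
    // Only a handful of the mappings are repeated here; the full list is in the readers.
    public class CharNormalizationSketch {
        private static final Map<Character, Character> MAP = new LinkedHashMap<>();
        static {
            MAP.put('ω', 'w'); MAP.put('μ', 'u'); MAP.put('κ', 'k');
            MAP.put('α', 'a'); MAP.put('β', 'b'); MAP.put('γ', 'g');
            MAP.put('–', '-'); MAP.put('·', '.');
        }

        // One pass over the string instead of one regex scan per symbol;
        // one-char-to-one-char replacement keeps offsets stable, as above.
        public static String normalize(String txt) {
            StringBuilder sb = new StringBuilder(txt.length());
            for (int i = 0; i < txt.length(); i++) {
                char c = txt.charAt(i);
                sb.append(MAP.getOrDefault(c, c));
            }
            return sb.toString();
        }

        public static void main(String[] args) {
            System.out.println(normalize("TNF-α and IFN–γ")); // -> "TNF-a and IFN-g"
        }
    }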
+  public void BioCOutput(String input,String output, ArrayList<ArrayList<ArrayList<String>>> Annotations,boolean Final,boolean RemovePreviousAnno) throws IOException, XMLStreamException
+  {
+    boolean ShowUnNormalizedMention = false;
+    if(GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True"))
+    {
+      ShowUnNormalizedMention = true;
+    }
+
+    BioCDocumentWriter BioCOutputFormat = BioCFactory.newFactory(BioCFactory.WOODSTOX).createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
+    BioCCollection biocCollection_input = new BioCCollection();
+    BioCCollection biocCollection_output = new BioCCollection();
+
+    //input: BioC
+    ConnectorWoodstox connector = new ConnectorWoodstox();
+    biocCollection_input = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
+    BioCOutputFormat.writeCollectionInfo(biocCollection_input);
+    int i=0; //count for pmid
+    while (connector.hasNext())
+    {
+      BioCDocument document_output = new BioCDocument();
+      BioCDocument document_input = connector.next();
+      String PMID=document_input.getID();
+      document_output.setID(PMID);
+      int annotation_count=0;
+      int j=0; //count for paragraph
+      for (BioCPassage passage_input : document_input.getPassages())
+      {
+        BioCPassage passage_output = passage_input;
+
+        if(RemovePreviousAnno == true) //clean the previous annotation, if the NER result is provided
+        {
+          passage_output.clearAnnotations();
+        }
+        else
+        {
+          for (BioCAnnotation annotation : passage_output.getAnnotations())
+          {
+            annotation.setID(""+annotation_count);
+            annotation_count++;
+          }
+        }
+
+        int passage_Offset = passage_input.getOffset();
+        String passage_Text = passage_input.getText();
+        ArrayList<String> AnnotationInPassage = new ArrayList<String>();
+        //ArrayList<String> AnnotationInPassage = Annotations.get(i).get(j);
+        if(Annotations.size()>i && Annotations.get(i).size()>j)
+        {
+          for(int a=0;a<Annotations.get(i).get(j).size();a++)
+          {
+            String Anno[]=Annotations.get(i).get(j).get(a).split("\\t");
+            int start = Integer.parseInt(Anno[0]);
+            int last = Integer.parseInt(Anno[1]);
+            boolean found = false;
+            if(passage_Text.length()>last)
+            {
+              String mention = Anno[2];
+              if(Final == true && passage_Text.length()>=last)
+              {
+                mention = passage_Text.substring(start, last);
+              }
+              if(mention.matches(".*\t.*"))
+              {
+                Anno[3]=Anno[4];
+                if(Anno.length>=6)
+                {
+                  Anno[4]=Anno[5];
+                }
+              }
+              String type = Anno[3];
+              String id = ""; // optional
+              if(Anno.length>=5){id = Anno[4];}
+              if(Final == true)
+              {
+                for(int b=0;b<AnnotationInPassage.size();b++)
+                {
+                  String Annob[]=AnnotationInPassage.get(b).split("\\t");
+                  int startb = Integer.parseInt(Annob[0]);
+                  int lastb = Integer.parseInt(Annob[1]);
+                  String mentionb = Annob[2];
+                  if(Final == true && passage_Text.length()>=lastb)
+                  {
+                    mentionb = passage_Text.substring(startb, lastb);
+                  }
+                  if(mentionb.matches(".*\t.*"))
+                  {
+                    Annob[3]=Annob[4];
+                    if(Annob.length>=6)
+                    {
+                      Annob[4]=Annob[5];
+                    }
+                  }
+                  String typeb = Annob[3];
+                  String idb = ""; // optional
+                  if(Annob.length>=5){idb = Annob[4];}
+
+                  if(start == startb && last == lastb && type.equals(typeb))
+                  {
+                    found = true;
+                    if(id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals("")))
+                    {
+                    }
+                    else if(idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals("")))
+                    {
+                      AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id);
+                    }
+                    else
+                    {
+                      if(id.equals(""))
+                      {
+                      }
+                      else
+                      {
+                        AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+idb+";"+id);
+                      }
+
+                    }
+                    break;
+                  }
+                }
+              }
+            }
+            if(found == false)
+            {
+              AnnotationInPassage.add(Annotations.get(i).get(j).get(a));
+            }
+          }
+        }
+        for(int a=0;a<AnnotationInPassage.size();a++)
+        {
+          String Anno[]=AnnotationInPassage.get(a).split("\\t");
+          HashMap <String,String> id_hash = new HashMap <String,String>();
+          if(Anno.length>=5)
+          {
+            int start = Integer.parseInt(Anno[0]);
+            int last = Integer.parseInt(Anno[1]);
+            String mention = Anno[2];
+            if(Final == true && passage_Text.length()>=last)
+            {
+              mention = passage_Text.substring(start, last);
+            }
+            if(mention.matches(".*\t.*"))
+            {
+              Anno[3]=Anno[4];
+              if(Anno.length>=6)
+              {
+                Anno[4]=Anno[5];
+              }
+            }
+            String ids = Anno[4];
+            String idlist[]=ids.split(",");
+            for(int b=0;b<idlist.length;b++)
+            {
+              id_hash.put(idlist[b], "");
+            }
+            ids = "";
+            for(String id :id_hash.keySet())
+            {
+              if(ids.equals(""))
+              {
+                ids = id;
+              }
+              else
+              {
+                ids = ids + ";" + id;
+              }
+            }
+            AnnotationInPassage.set(a, Anno[0]+"\t"+Anno[1]+"\t"+Anno[2]+"\t"+Anno[3]+"\t"+ids);
+          }
+        }
+
+        for(int a=0;a<AnnotationInPassage.size();a++)
+        {
+          String Anno[]=AnnotationInPassage.get(a).split("\\t");
+          int start = Integer.parseInt(Anno[0]);
+          int last = Integer.parseInt(Anno[1]);
+          if(passage_Text.length()>last)
+          {
+            String mention = Anno[2];
+            if(Final == true && passage_Text.length()>=last)
+            {
+              mention = passage_Text.substring(start, last);
+            }
+            if(mention.matches(".*\t.*"))
+            {
+              Anno[3]=Anno[4];
+              if(Anno.length>=6)
+              {
+                Anno[4]=Anno[5];
+              }
+            }
+            String type = Anno[3];
+            if(type.equals("GeneID")){type="Gene";}
+            BioCAnnotation biocAnnotation = new BioCAnnotation();
+            Map<String, String> AnnoInfons = new HashMap<String, String>();
+            AnnoInfons.put("type", type);
+            if(Anno.length>=5)
+            {
+              String identifier = Anno[4];
+              if(Final == true && ShowUnNormalizedMention==false)
+              {
+                if(type.matches("(FamilyName|Domain|Gene)"))
+                {
+                  Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
+                  Matcher mtmp0 = ptmp0.matcher(identifier);
+                  Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
+                  Matcher mtmp1 = ptmp1.matcher(identifier);
+                  Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$");
+                  Matcher mtmp2 = ptmp2.matcher(identifier);
+                  Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$");
+                  Matcher mtmp3 = ptmp3.matcher(identifier);
+                  if(mtmp0.find())
+                  {
+                    String Method_SA = mtmp0.group(1);
+                    String TaxonomyID = mtmp0.group(2);
+                    String NCBIGeneID = mtmp0.group(3);
+                    if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
+                    {
+                      AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
+                    }
+                    if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
+                    {
+                      AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
+                    }
+                    AnnoInfons.put("NCBI Gene", NCBIGeneID);
+                  }
+                  else if(mtmp1.find())
+                  {
+                    String Method_SA = mtmp1.group(1);
+                    String TaxonomyID = mtmp1.group(2);
+                    String NCBIGeneID = mtmp1.group(3);
+                    String HomoID = mtmp1.group(4);
+                    if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
+                    {
+                      AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
+                    }
+                    if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
+                    {
+                      AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
+                    }
+                    AnnoInfons.put("NCBI Gene", NCBIGeneID);
+                  }
+                  else if(mtmp2.find())
+                  {
+                    String Method_SA = mtmp2.group(1);
+                    String TaxonomyID = mtmp2.group(2);
+                    AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:"+TaxonomyID);
+                  }
+                  else if(mtmp3.find())
+                  {
+                    String Method_SA = mtmp3.group(1);
+                    String HomoID = mtmp3.group(2);
+                    AnnoInfons.put("NCBI Homologene", HomoID);
+                  }
+                  else
+                  {
+                    String identifiers[] = identifier.split(";");
+                    if(identifiers.length>1)
+                    {
+                      ArrayList<String> identifierSTR = new ArrayList<String>();
+                      ArrayList<String> ProteinidSTR = new ArrayList<String>();
+                      ArrayList<String> HomoidSTR = new ArrayList<String>();
+                      for(int idi=0;idi<identifiers.length;idi++)
+                      {
+                        Pattern ptmp4 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
+                        Matcher mtmp4 = ptmp4.matcher(identifiers[idi]);
+                        Pattern ptmp5 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
+                        Matcher mtmp5 = ptmp5.matcher(identifiers[idi]);
+                        if(mtmp4.find())
+                        {
+                          String Method_SA = mtmp4.group(1);
+                          String TaxonomyID = mtmp4.group(2);
+                          String NCBIGeneID = mtmp4.group(3);
+                          String HomoID = mtmp4.group(4);
+                          if(!identifierSTR.contains(NCBIGeneID))
+                          {
+                            identifierSTR.add(NCBIGeneID);
+                          }
+                          if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
+                          {
+                            if(!ProteinidSTR.contains(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)))
+                            {
+                              ProteinidSTR.add(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
+                            }
+                          }
+                          if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
+                          {
+                            if(!HomoidSTR.contains(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)))
+                            {
+                              HomoidSTR.add(GNormPlus.HomologeneID_hash.get(NCBIGeneID));
+                            }
+                          }
+
+                        }
+                        else if(mtmp5.find())
+                        {
+                          String Method_SA = mtmp5.group(1);
+                          String TaxonomyID = mtmp5.group(2);
+                          String NCBIGeneID = mtmp5.group(3);
+                          if(!identifierSTR.contains(NCBIGeneID))
+                          {
+                            identifierSTR.add(NCBIGeneID);
+                          }
+                        }
+                      }
+                      String idSTR="";
+                      for(int x=0;x<identifierSTR.size();x++)
+                      {
+                        if(idSTR.equals(""))
+                        {
+                          idSTR = identifierSTR.get(x);
+                        }
+                        else
+                        {
+                          idSTR = idSTR+";"+identifierSTR.get(x);
+                        }
+                      }
+                      AnnoInfons.put("NCBI Gene", idSTR);
+
+                      String pidSTR="";
+                      for(int x=0;x<ProteinidSTR.size();x++)
+                      {
+                        if(pidSTR.equals(""))
+                        {
+                          pidSTR = ProteinidSTR.get(x);
+                        }
+                        else
+                        {
+                          pidSTR = pidSTR+";"+ProteinidSTR.get(x);
+                        }
+                      }
+                      if(!pidSTR.equals(""))
+                      {
+                        AnnoInfons.put("UniProt", pidSTR);
+                      }
+
+                      String hidSTR="";
+                      for(int x=0;x<HomoidSTR.size();x++)
+                      {
+                        if(hidSTR.equals(""))
+                        {
+                          hidSTR = HomoidSTR.get(x);
+                        }
+                        else
+                        {
+                          hidSTR = hidSTR+";"+HomoidSTR.get(x);
+                        }
+                      }
+                      if(!hidSTR.equals(""))
+                      {
+                        AnnoInfons.put("NCBI Homologene", hidSTR);
+                      }
+                    }
+                    //else
+                    //{
+                    //  AnnoInfons.put("Identifier", identifier);
+                    //}
+                  }
+                }
+                else if (type.matches("(Species|Genus|Strain)"))
+                {
+                  AnnoInfons.put("type", type);
+                  AnnoInfons.put("NCBI Taxonomy", identifier);
+                }
+                else if (type.matches("Cell"))
+                {
+                  AnnoInfons.put("type", "CellLine");
+                  AnnoInfons.put("NCBI Taxonomy", identifier);
+                }
+                else
+                {
+                  AnnoInfons.put("Identifier", identifier);
+                }
+              }
+              else
+              {
+                AnnoInfons.put("Identifier", identifier);
+              }
+            }
+            biocAnnotation.setInfons(AnnoInfons);
+            BioCLocation location = new BioCLocation();
+            location.setOffset(start+passage_Offset);
+            location.setLength(last-start);
+            biocAnnotation.setLocation(location);
+            biocAnnotation.setText(mention);
+            biocAnnotation.setID(""+annotation_count);
+            annotation_count++;
+            if(Final == true)
+            {
+              if(AnnoInfons.containsKey("Identifier") || AnnoInfons.containsKey("NCBI Homologene") || AnnoInfons.containsKey("NCBI Gene") || AnnoInfons.containsKey("NCBI Taxonomy"))
+              {
+                passage_output.addAnnotation(biocAnnotation);
+              }
+            }
+            else
+            {
+              passage_output.addAnnotation(biocAnnotation);
+            }
+          }
+        }
+        document_output.addPassage(passage_output);
+        j++;
+      }
+      biocCollection_output.addDocument(document_output);
+      BioCOutputFormat.writeDocument(document_output);
+      i++;
+    }
+    BioCOutputFormat.close();
+  }
+}
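BioCOutput above unpacks composite identifier strings of the form Method:TaxID|GeneID or Method:TaxID|GeneID-HomologeneID, where Method is one of Focus, Right, Left, Prefix, GeneID, or Tax, into separate infons. A small self-contained sketch of that regex split; the identifier value below is made up for illustration:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Sketch of the composite-identifier parsing used in BioCOutput above.
    // The identifier is a hypothetical value; the patterns mirror ptmp1/ptmp0.
    public class IdentifierParseSketch {
        public static void main(String[] args) {
            String identifier = "Focus:9606|7157-74577"; // Method:TaxID|GeneID-HomologeneID
            Pattern withHomo = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax):([0-9]+)\\|([0-9]+)-([0-9]+)$");
            Pattern plain = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax):([0-9]+)\\|([0-9;]+)$");
            Matcher m = withHomo.matcher(identifier);
            if (m.find()) {
                System.out.println("method     = " + m.group(1)); // how the species was assigned
                System.out.println("taxonomy   = " + m.group(2)); // NCBI Taxonomy ID
                System.out.println("gene       = " + m.group(3)); // NCBI Gene ID
                System.out.println("homologene = " + m.group(4)); // HomoloGene ID
            } else if ((m = plain.matcher(identifier)).find()) {
                System.out.println("gene = " + m.group(3));
            }
        }
    }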
src_Java/GNormPluslib/GN.java
CHANGED
@@ -1,1084 +1,1084 @@
-/**
- * Project: GNormPlus
- * Function: Gene Normalization
- */
-
-package GNormPluslib;
-
-import bioc.BioCAnnotation;
-import bioc.BioCCollection;
-import bioc.BioCDocument;
-import bioc.BioCLocation;
-import bioc.BioCPassage;
-
-import bioc.io.BioCDocumentWriter;
-import bioc.io.BioCFactory;
-import bioc.io.woodstox.ConnectorWoodstox;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.text.BreakIterator;
-import java.time.LocalDate;
-import java.time.ZoneId;
-import java.text.DecimalFormat;
-import java.math.RoundingMode;
-
-import javax.xml.stream.XMLStreamException;
-
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-
-public class GN
-{
-  public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>();
-  private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF)
-  {
-    /*
-     * define gene/homo id
-     */
-
-    //LF
-    LF = LF.toLowerCase();
-    LF = LF.replaceAll("([0-9])([a-z])", "$1 $2");
-    LF = LF.replaceAll("([a-z])([0-9])", "$1 $2");
-    LF = LF.replaceAll("([\\W\\-\\_])", " ");
-    LF = LF.replaceAll("[ ]+", " ");
-    String LF_tkn[]=LF.split(" ");
-    int LF_ParticalMatch = 0;
-
-    Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)");
-    Matcher mtmp = ptmp.matcher(geneid);
-    Pattern ptmp2 = Pattern.compile("([0-9]+)");
-    Matcher mtmp2 = ptmp.matcher(geneid);
-    if(mtmp.find())
-    {
-      geneid = "Homo:"+mtmp.group(1);
-    }
-    else
-    {
-      geneid = "Gene:"+geneid;
-    }
-
-    if(GNormPlus.GeneScoring_hash.containsKey(geneid))
-    {
-      HashMap<String,Double> TF = new HashMap<String,Double>(); // token i in gene j
-      HashMap<String,Double> TermFrequency = new HashMap<String,Double>();
-
-      /*
-       * Tokens in Query (Gene id lexicon)
-       */
-      String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
-      String tkns_Gene[] = l[0].split(",");
-      for(int i=0;i<tkns_Gene.length;i++)
-      {
-        String Tkn_Freq[] = tkns_Gene[i].split("-");
-        TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
-      }
-      Double Cj = Double.parseDouble(l[1]);
-      Double AllTknNum = Double.parseDouble(l[2]);
-      //Double Cj_max = Double.parseDouble(l[3]);
-      //Double MaxTknNum = Double.parseDouble(l[4]);
-      Double Norm = Double.parseDouble(l[5]);
-      if(Norm == 0.0){Norm=1.0;}
-
-      /*
-       * Tokens in Document (recognized mentions)
-       */
-      for(String Mention : Mention_hash.keySet())
-      {
-        Mention = Mention.toLowerCase();
-        Mention = Mention.replaceAll("([0-9])([a-z])", "$1 $2");
-        Mention = Mention.replaceAll("([a-z])([0-9])", "$1 $2");
-        Mention = Mention.replaceAll("([\\W\\-\\_])", " ");
-        Mention = Mention.replaceAll("[ ]+", " ");
-        String tkns_Mention[]=Mention.split(" ");
-        for(int i=0;i<tkns_Mention.length;i++)
-        {
-          if(TermFrequency.containsKey(tkns_Mention[i]))
-          {
-            TF.put(tkns_Mention[i], TermFrequency.get(tkns_Mention[i]));
-          }
-        }
-      }
-
-      Double score=0.0;
-      for(String Tkn : TF.keySet())
-      {
-        //LF
-        for(int t=0;t<LF_tkn.length;t++)
-        {
-          if(LF_tkn[t].equals(Tkn))
-          {
-            LF_ParticalMatch++;
-          }
-        }
-
-        double TFij = TF.get(Tkn)/AllTknNum;
-        double IDFi=GNormPlus.GeneScoringDF_hash.get(Tkn);
-        score=score+TFij*IDFi*(1/(1-TFij));
-      }
-      //score = Cj * (1/Norm) *score;
-      if(LF_ParticalMatch>0){score = score + LF_ParticalMatch;/*System.out.println(geneid+"\t"+LF+"\t"+score);*/}
-      return score;
-    }
-    else
-    {
-      //System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash");
-      return 0.0;
-    }
-  }
-
143 |
-
{
|
144 |
-
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
|
145 |
-
{
|
146 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
|
147 |
-
{
|
148 |
-
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
|
149 |
-
{
|
150 |
-
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
151 |
-
String start=anno[0];
|
152 |
-
String last=anno[1];
|
153 |
-
String mentions=anno[2];
|
154 |
-
String type=anno[3];
|
155 |
-
String id="";
|
156 |
-
if(anno.length>=5)
|
157 |
-
{
|
158 |
-
id=anno[4];
|
159 |
-
}
|
160 |
-
|
161 |
-
if(type.equals("Gene"))
|
162 |
-
{
|
163 |
-
String mentionArr[] = mentions.split("\\|");
|
164 |
-
boolean update=false;
|
165 |
-
for(int m=0;m<mentionArr.length;m++)
|
166 |
-
{
|
167 |
-
Pattern ptmp = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
|
168 |
-
Matcher mtmp = ptmp.matcher(mentionArr[m]);
|
169 |
-
Pattern ptmp2 = Pattern.compile("^(.+)nu$");
|
170 |
-
Matcher mtmp2 = ptmp2.matcher(mentionArr[m]);
|
171 |
-
Pattern ptmp3 = Pattern.compile("^(.*)alpha(.*)$");
|
172 |
-
Matcher mtmp3 = ptmp3.matcher(mentionArr[m]);
|
173 |
-
Pattern ptmp4 = Pattern.compile("^(.*)beta(.*)$");
|
174 |
-
Matcher mtmp4 = ptmp4.matcher(mentionArr[m]);
|
175 |
-
Pattern ptmp5 = Pattern.compile("^(.+[0-9])a$");
|
176 |
-
Matcher mtmp5 = ptmp5.matcher(mentionArr[m]);
|
177 |
-
Pattern ptmp6 = Pattern.compile("^(.+[0-9])b$");
|
178 |
-
Matcher mtmp6 = ptmp6.matcher(mentionArr[m]);
|
179 |
-
Pattern ptmp7 = Pattern.compile("^(.+)II([a-z])$");
|
180 |
-
Matcher mtmp7 = ptmp7.matcher(mentionArr[m]);
|
181 |
-
Pattern ptmp8 = Pattern.compile("^(.+)III([a-z])$");
|
182 |
-
Matcher mtmp8 = ptmp8.matcher(mentionArr[m]);
|
183 |
-
if(mtmp.find())
|
184 |
-
{
|
185 |
-
mentions=mentions+"|"+mtmp.group(1);
|
186 |
-
update=true;
|
187 |
-
}
|
188 |
-
if(mtmp2.find())
|
189 |
-
{
|
190 |
-
mentions=mentions+"|"+mtmp2.group(1);
|
191 |
-
update=true;
|
192 |
-
}
|
193 |
-
if(mtmp3.find())
|
194 |
-
{
|
195 |
-
mentions=mentions+"|"+mtmp3.group(1)+"a"+mtmp3.group(2);
|
196 |
-
update=true;
|
197 |
-
}
|
198 |
-
if(mtmp4.find())
|
199 |
-
{
|
200 |
-
mentions=mentions+"|"+mtmp4.group(1)+"b"+mtmp4.group(2);
|
201 |
-
update=true;
|
202 |
-
}
|
203 |
-
if(mtmp5.find())
|
204 |
-
{
|
205 |
-
mentions=mentions+"|"+mtmp5.group(1)+"alpha";
|
206 |
-
update=true;
|
207 |
-
}
|
208 |
-
if(mtmp6.find())
|
209 |
-
{
|
210 |
-
mentions=mentions+"|"+mtmp6.group(1)+"beta";
|
211 |
-
update=true;
|
212 |
-
}
|
213 |
-
if(mtmp7.find())
|
214 |
-
{
|
215 |
-
mentions=mentions+"|"+mtmp7.group(1)+"2"+mtmp7.group(2);
|
216 |
-
update=true;
|
217 |
-
}
|
218 |
-
if(mtmp8.find())
|
219 |
-
{
|
220 |
-
mentions=mentions+"|"+mtmp8.group(1)+"3"+mtmp8.group(2);
|
221 |
-
update=true;
|
222 |
-
}
|
223 |
-
}
|
224 |
-
if(update == true)
|
225 |
-
{
|
226 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id );
|
227 |
-
}
|
228 |
-
}
|
229 |
-
}
|
230 |
-
}
|
231 |
-
}
|
232 |
-
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
233 |
-
}
|
234 |
-
|
235 |
-
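PreProcessing4GN above widens each gene mention into a pipe-separated list of spelling variants: a trailing p or nu is dropped, alpha and beta swap with a and b, and Roman numeral suffixes II/III become 2/3, so later dictionary lookups can match any spelling. A short sketch of one such rewrite on a hypothetical mention:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Sketch of the alpha -> a variant expansion from PreProcessing4GN above.
    // The mention string is hypothetical.
    public class VariantSketch {
        public static void main(String[] args) {
            String mentions = "TGF alpha";
            Pattern alpha = Pattern.compile("^(.*)alpha(.*)$"); // same as ptmp3 above
            Matcher m = alpha.matcher(mentions);
            if (m.find()) {
                // Variants stay pipe-separated, exactly as the annotation format expects.
                mentions = mentions + "|" + m.group(1) + "a" + m.group(2);
            }
            System.out.println(mentions); // "TGF alpha|TGF a"
        }
    }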
-  public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
-  {
-    for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
-    {
-      String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
-      for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
-      {
-        String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
-
-        /** Chromosome recognition */
-        ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation");
-        for (int k = 0 ; k < locations.size() ; k++)
-        {
-          String anno[]=locations.get(k).split("\t");
-          //int start= Integer.parseInt(anno[0]);
-          //int last= Integer.parseInt(anno[1]);
-          //String mention = anno[2];
-          String ids = anno[3];
-          //GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tChromosomeLocation\t"+ids); //paragraph
-          String IDs[] = ids.split("[\\|,]");
-          for(int idcount=0;idcount<IDs.length;idcount++)
-          {
-            //IDs[idcount] = IDs[idcount].replaceAll("\\-[0-9]+", "");
-            GNormPlus.Pmid2ChromosomeGene_hash.put(Pmid+"\t"+IDs[idcount],"");
-          }
-        }
-      }
-    }
-    //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
-  }
-
-  public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException
-  {
-    final DecimalFormat df = new DecimalFormat("0.####");
-    df.setRoundingMode(RoundingMode.HALF_UP);
-
-    //Tokenization
-    for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
-    {
-      String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
-
-      /** Species */
-      HashMap<String,String> Species_hash = new HashMap<String,String>();
-      for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
-      {
-        for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          String mentions=anno[2];
-          String type=anno[3];
-          if(type.matches("(Species|Genus|Strain|CellLine|Cell)"))
-          {
-            Species_hash.put(mentions,"");
-          }
-        }
-      }
-
-
-      /*
-       * Collect Gene mentions :
-       *
-       * GeneMention-taxid -> "ID" : geneid
-       *                   -> "type" : "Gene"
-       *                   -> start1-last1 : ""
-       *                   -> start2-last2 : ""
-       *                   -> start3-last3 : ""
-       */
-
-      String tiabs="";
-      for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
-      {
-        tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase();
-      }
-      HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>();
-      HashMap<String,String> Mention_hash = new HashMap<String,String>();
-      for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
-      {
-        for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          String start=anno[0];
-          String last=anno[1];
-          String mentions=anno[2];
-          String type=anno[3];
-          String taxids="Tax:9606";
-
-          if(anno.length>=5)
-          {
-            taxids=anno[4];
-          }
-          String mentions_tmp=mentions.toLowerCase();
-          mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
-          mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
-          taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
-          if(taxids.equals(""))
-          {
-            taxids="9606";
-          }
-          /** Filtering */
-          boolean found_filter = false;
-          if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering
-          {
-            found_filter=true;
-          }
-
-          if(found_filter==false) //abbreviation
-          {
-            for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet())
-            {
-              if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") ||
-                  GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*")
-                )
-              {
-                String lf=GNormPlus.Filtering_WithLongForm_hash.get(f);
-                if(tiabs.matches(".*"+lf+".*"))
-                {
-                  found_filter=true;
-                  break;
-                }
-              }
-            }
-          }
-
-          if(found_filter==false)
-          {
-            if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
-                GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's
-              )
-            {
-              found_filter=true;
-
-            }
-          }
-
-          if(found_filter == false)
-          {
-            if(type.matches("Gene"))
-            {
-              if(GeneMention_hash.containsKey(mentions+"\t"+taxids))
-              {
-                GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,"");
-              }
-              else
-              {
-                HashMap<String,String> offset_hash = new HashMap<String,String>();
-                offset_hash.put(start+"\t"+last,"");
-                GeneMention_hash.put(mentions+"\t"+taxids, offset_hash);
-                GeneMention_hash.get(mentions+"\t"+taxids).put("type", type);
-                Mention_hash.put(mentions,"Gene");
-              }
-            }
-            else if(type.matches("(FamilyName|DomainMotif)"))
-            {
-              String GMs[]=mentions.split("\\|");
-              for(int g=0;g<GMs.length;g++)
-              {
-                String mention = GMs[g];
-                Mention_hash.put(mention,"FamilyDomain");
-              }
-            }
-          }
-
-        }
-      }
-
-      /*
-       * Gene id refinement:
-       * 1. Official name
-       * 2. only one gene
-       */
-      HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>();
-      HashMap<String,String> MultiGene2ID = new HashMap<String,String>();
-      for(String GeneMentionTax : GeneMention_hash.keySet())
-      {
-        String GT[]=GeneMentionTax.split("\\t");
-        String mentions=GT[0];
-        String taxids=GT[1];
-        String GMs[]=mentions.split("\\|");
-
-        HashMap<String,String> taxids_hash = new HashMap<String,String>();
-        String taxids_arr[]=taxids.split(",");
-        for(int t=0;t<taxids_arr.length;t++)
-        {
-          taxids_hash.put(taxids_arr[t], "");
-        }
-
-        for(int ms=0;ms<GMs.length;ms++)
-        {
-          String mention = GMs[ms];
-          String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
-          String IDs[]=IDstr.split("\\|");
-
-          /*
-           * printing the ambiguous gene mentions and candidates
-           */
-          //String IDs_s[]=IDstr.split(",");
-          //if(IDs_s.length>1)
-          //{
-          //  System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
-          //}
-
-          for(int c=0;c<IDs.length;c++)
-          {
-            String tax2ID[]=IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
-            if(taxids_hash.containsKey(tax2ID[0]))
-            {
-              String geneid=tax2ID[1];
-              String TargetTax=tax2ID[0];
-              GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
-              GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
-              break;
-            }
-          }
-
-          //geneid refinement
-          if(GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
-          {
-            Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
-            Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
-
-            if(mtmp.find()) // 1. Official Name
-            {
-              GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1));
-              GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1));
-            }
-            else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
-            {
-              GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID"));
-            }
-            else
-            {
-              String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
-              boolean FoundByChroLoca=false;
-              for(int idcount=0;idcount<ID.length;idcount++)
-              {
-                if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) // 3. Chromosome location
-                {
-                  GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]);
-                  FoundByChroLoca=true;
-                  break;
-                }
-              }
-              if(FoundByChroLoca == false)
-              {
-                MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
-              }
-            }
-          }
-          if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3")))
-          {
-            break;
-          }
-        }
-      }
-
-      /*
-       * Gene id refinement:
-       * 3. multiple genes but can be inferred by 1. and 2.
-       */
-      for(String GeneMentionTax_M : MultiGene2ID.keySet())
-      {
-        for(String GeneMentionTax_G : GuaranteedGene2ID.keySet())
-        {
-          String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
-          for(int m=0;m<MG.length;m++)
-          {
-            if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G)))
-            {
-              GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]);
-            }
-          }
-        }
-      }
-
-      /*
-       * Gene id refinement:
-       * 4. FullName -> Abbreviation
-       */
-      for(String GeneMentionTax : GeneMention_hash.keySet())
-      {
-        String MT[] = GeneMentionTax.split("\\t");
-        if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0]))
-        {
-          String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
-          if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
-          {
-            GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
-          }
-        }
-      }
-
-      /*
-       * Gene id refinement:
-       * 5. Ranking by scoring function (inference network)
-       */
-      for(String GeneMentionTax : GeneMention_hash.keySet())
-      {
-        if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+"))
-        {
-          String geneids=GeneMention_hash.get(GeneMentionTax).get("ID");
-          String geneid[] = geneids.split(",");
-
-          String OutputStyle="Top1";
-          if(OutputStyle.equals("Top1"))
-          {
-            //only return the best one
-            double max_score=0.0;
-            String target_geneid="";
-            for(int g=0;g<geneid.length;g++)
-            {
-              String MT[] = GeneMentionTax.split("\\t");
-              String LF="";
-              if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
-              {
-                LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
-              }
-              double score = ScoringFunction(geneid[g],Mention_hash,LF);
-              if(score>max_score)
-              {
-                max_score=score;
-                target_geneid=geneid[g];
-              }
-              else if(score == 0.0)
-              {
-                //System.out.println(GeneMentionTax);
-              }
-            }
-            GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
-          }
-          else // "All"
-          {
-            //return all geneids
-            String geneSTR="";
-            for(int g=0;g<geneid.length;g++)
-            {
-              String MT[] = GeneMentionTax.split("\\t");
-              String LF="";
-              if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
-              {
-                LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
-              }
-              double score = ScoringFunction(geneid[g],Mention_hash,LF);
-              String hoge = df.format(score);
-              score=Double.parseDouble(hoge);
-
-              if(geneSTR.equals(""))
-              {
-                geneSTR=geneid[g]+"-"+score;
-              }
-              else
-              {
-                geneSTR=geneSTR+","+geneid[g]+"-"+score;
-              }
-            }
-            GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
-          }
-        }
-      }
-
-      /*
-       * Gene id refinement: - removed (Reason: cause too much False Positive)
-       * 6. Abbreviation -> FullName
-       *
-       */
-      for(String GeneMentionTax : GeneMention_hash.keySet())
-      {
-        String MT[] = GeneMentionTax.split("\\t");
-        if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
-        {
-          String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
-          if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
-          {
-            GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
-          }
-        }
-      }
-
-      /*
-       * Gene id refinement:
-       * 7. The inference network tokens of Abbreviation.ID should contain at least LF tokens
-       * 8. The short mention should be filtered if not long form support
-       */
-      ArrayList<String> removeGMT = new ArrayList<String>();
-      for(String GeneMentionTax : GeneMention_hash.keySet())
-      {
-        String GT[]=GeneMentionTax.split("\\t");
-        String mentions=GT[0];
-        String tax=GT[1];
-        if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
-        {
-          String type = GeneMention_hash.get(GeneMentionTax).get("type");
-          String id = GeneMention_hash.get(GeneMentionTax).get("ID");
-          String geneid="";
-          Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
-          Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
-          Matcher mtmp1 = ptmp1.matcher(id);
-          Matcher mtmp2 = ptmp2.matcher(id);
-          //System.out.println(id);
-          if(mtmp1.find())
-          {
-            geneid = "Homo:"+mtmp1.group(2);
-          }
-          else if(mtmp2.find())
-          {
-            geneid = "Gene:"+mtmp2.group(1);
-          }
-
-          boolean LongFormTknMatch= false;
-          boolean LongFormExist= true;
-          if(GNormPlus.GeneScoring_hash.containsKey(geneid))
-          {
-            if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase()))
-            {
-              /*
-               * token in lexicon : tkn_lexicon
-               * token in mention : tkn_mention
-               */
-              String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
-              String tkns_Gene[] = l[0].split(",");
-              ArrayList<String> tkn_lexicon = new ArrayList<String>();
-              for(int ti=0;ti<tkns_Gene.length;ti++)
-              {
-                String Tkn_Freq[] = tkns_Gene[ti].split("-");
-                tkn_lexicon.add(Tkn_Freq[0]);
-              }
-
-              String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase());
-              LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
-              LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
-              String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
-              for(int tl=0;tl<tkn_lexicon.size();tl++)
-              {
-                for(int tm=0;tm<tkn_mention.length;tm++)
-                {
-                  if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+")))
-                  {
-                    LongFormTknMatch = true;
-                  }
-                }
-              }
-            }
-            else{LongFormExist = false;}
-          }
-          else{LongFormTknMatch = true;} // exception
-
-          if(LongFormTknMatch == false && LongFormExist == true) // 7.
-          {
-            removeGMT.add(GeneMentionTax); //remove short form
-            removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); //remove long form
-          }
-          else if(mentions.length()<=2 && LongFormExist == false) // 8.
-          {
-            removeGMT.add(GeneMentionTax);
-          }
-        }
-      }
-
-      for(int gmti=0;gmti<removeGMT.size();gmti++) // remove
-      {
-        GeneMention_hash.remove(removeGMT.get(gmti));
-      }
-
-      // Append gene ids
-      for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) // Paragraphs : j
-      {
-        for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          String start=anno[0];
-          String last=anno[1];
-          String mentions=anno[2];
-          String type=anno[3];
-          String taxid_org="Tax:9606";
-          if(anno.length>=5)
-          {
-            taxid_org=anno[4];
-          }
-          String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
-          String GMs[]=mentions.split("\\|");
-
-          if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax"))
-          {
-            String taxtype=taxid_org.replaceAll(":([0-9,]+)","");
-            String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax");
-            GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid);
-          }
-
-          if(type.equals("Gene"))
-          {
-            GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|");
-
-
-            if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID"))
-            {
-              GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," );
-            }
-            else // cannot find appropriate species
-            {
-              //System.out.println(mention+"\t"+taxid);
-            }
-            GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$"
-          }
-        }
-      }
-
-      //Extend to all gene mentions
-      HashMap<String,String> GeneMentions = new HashMap<String,String>(); // Extending Gene mentions
-      HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); // Extending Gene mentions
-      for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
-      {
-        for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          int start = Integer.parseInt(anno[0]);
-          int last = Integer.parseInt(anno[1]);
-          String mentions=anno[2];
-          String type=anno[3];
-          String id="Tax:9606";
-          if(anno.length>=5)
-          {
-            id=anno[4];
-          }
-          if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)"))
-          {
-            GeneMentions.put(mentions.toLowerCase(), id);
-            for (int s=start ;s<=last;s++)
-            {
-              GeneMentionLocation.put(j+"\t"+s,"");
-            }
-          }
-          else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)"))
-          {
-            GeneMentions.put(mentions.toLowerCase(), id);
-            for (int s=start ;s<=last;s++)
-            {
-              GeneMentionLocation.put(j+"\t"+s,"");
-            }
-          }
-        }
-      }
-      for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
-      {
-        if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
-        {
-          String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
-          String PassageContexts_tmp = PassageContexts.toLowerCase();
-          for(String gm : GeneMentions.keySet())
-          {
-            String id = GeneMentions.get(gm);
-            if(gm.length()>=3)
-            {
-              gm = gm.replaceAll("[ ]*[\\|]*$", "");
-              gm = gm.replaceAll("^[\\|]*[ ]*", "");
-              gm = gm.replaceAll("[\\|][\\|]+", "\\|");
-              if(!gm.matches("[\\W\\-\\_]*"))
-              {
-                gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
-                Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
-                Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
-                while(mtmp.find())
-                {
-                  String pre = mtmp.group(1);
-                  String gmtmp = mtmp.group(2);
-                  String post = mtmp.group(3);
-
-                  int start = pre.length()-1;
-                  int last = start+gmtmp.length();
-                  if(PassageContexts.length()>=last+1)
-                  {
-                    String mention = PassageContexts.substring(start+1,last+1);
-                    if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last))
-                    {
-                      GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id);
-                    }
-                  }
-                  gmtmp = gmtmp.replaceAll(".", "\\@");
-                  PassageContexts_tmp=pre+""+gmtmp+""+post;
-                  mtmp = ptmp.matcher(PassageContexts_tmp);
-                }
-              }
-            }
-          }
-        }
-      }
-
-      //Apply to FamilyNames
-      HashMap<String,String> geneids = new HashMap<String,String>(); // Extending Gene mentions
-      for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
-      {
-        for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          String type=anno[3];
-          if(type.equals("Gene"))
-          {
-            String id="Tax:9606";
-            if(anno.length>=5)
-            {
-              id=anno[4];
-            }
-            Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
-            Matcher mtmp0 = ptmp0.matcher(id);
-            Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
-            Matcher mtmp1 = ptmp1.matcher(id);
-            if(mtmp0.find())
-            {
-              geneids.put(mtmp0.group(3), "");
-            }
-            if(mtmp1.find())
-            {
-              geneids.put(mtmp1.group(3), "");
-            }
-          }
-        }
-      }
-      for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
-      {
-        for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          String mention=anno[2];
-          String type=anno[3];
-          if(type.matches("(FamilyName|DomainMotif)"))
-          {
-            String id="Tax:9606";
-            if(anno.length>=5)
-            {
-              id=anno[4];
-            }
-            String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
-            String IDstr[]=IDstrs.split("\\|");
-            String ids="";
-            for(int id_i=0;id_i<IDstr.length;id_i++)
-            {
-              if(geneids.containsKey(IDstr[id_i]))
-              {
-                if(ids.equals(""))
-                {
-                  ids=IDstr[id_i];
-                }
-                else
-                {
-                  ids=ids+";"+IDstr[id_i];
-                }
-              }
-            }
-            if(!ids.equals(""))
-            {
-              if(type.equals("FamilyName")){type="Gene";}
-              String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606";
-              if(anno.length>=5)
-              {
-                Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4];
-              }
-              GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids);
-            }
-            else
-            {
-              GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
-            }
-          }
-        }
-      }
-      //Species "*" and "(anti)" removed.
-      for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
-      {
-        for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          String type=anno[3];
-          if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell"))
-          {
-            String id=anno[4];
-            id=id.replaceAll("\\*", "");
-            id=id.replaceAll("\\(anti\\)", "");
-            String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id;
-            GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k);
-          }
-        }
-      }
-
-      for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
-      {
-
-        for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
-        {
-          String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
-          int start = Integer.parseInt(anno[0]);
-          int last = Integer.parseInt(anno[1]);
-          String mention = anno[2];
-          String type = anno[3];
-          String id = anno[4];
-          if(type.equals("Gene") && Species_hash.containsKey(mention))
-          {
-            GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
-          }
-          else if(type.equals("Gene") && id.equals(""))
-          {
-            GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
-          }
-          else
-          {
-            for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k
|
938 |
-
{
|
939 |
-
if(k1 != k)
|
940 |
-
{
|
941 |
-
String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t");
|
942 |
-
int start1 = Integer.parseInt(anno1[0]);
|
943 |
-
int last1 = Integer.parseInt(anno1[1]);
|
944 |
-
if((start1<start && last1>=last) || (start1<=start && last1>last))
|
945 |
-
{
|
946 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
|
947 |
-
break;
|
948 |
-
}
|
949 |
-
}
|
950 |
-
}
|
951 |
-
}
|
952 |
-
}
|
953 |
-
}
|
954 |
-
}
|
955 |
-
if(GeneIDMatch == true)
|
956 |
-
{
|
957 |
-
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
958 |
-
}
|
959 |
-
else
|
960 |
-
{
|
961 |
-
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
|
962 |
-
}
|
963 |
-
}
|
964 |
-
/*
|
965 |
-
* Search Potential GeneID in the Prefix Tree
|
966 |
-
*/
|
967 |
-
public ArrayList<String> SearchGeneIDLocation(String Doc)
|
968 |
-
{
|
969 |
-
ArrayList<String> location = new ArrayList<String>();
|
970 |
-
|
971 |
-
String Doc_tmp=" "+Doc+" ";
|
972 |
-
Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
|
973 |
-
Matcher mtmp = ptmp.matcher(Doc_tmp);
|
974 |
-
while(mtmp.find())
|
975 |
-
{
|
976 |
-
String str1=mtmp.group(1);
|
977 |
-
String str2=mtmp.group(2);
|
978 |
-
String str3=mtmp.group(3);
|
979 |
-
for(int m=str1.length();m<=(str1.length()+str2.length());m++)
|
980 |
-
{
|
981 |
-
int start = str1.length()-1;
|
982 |
-
int last = start+str2.length();
|
983 |
-
String mention = Doc.substring(start, last);
|
984 |
-
if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*"))
|
985 |
-
{
|
986 |
-
if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) )
|
987 |
-
{
|
988 |
-
Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
|
989 |
-
Matcher mtmp1 = ptmp1.matcher(mention);
|
990 |
-
Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");
|
991 |
-
Matcher mtmp2 = ptmp2.matcher(mention);
|
992 |
-
if(mtmp1.find())
|
993 |
-
{
|
994 |
-
String S1 = mtmp1.group(1);
|
995 |
-
if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6)
|
996 |
-
{
|
997 |
-
int Num1 = Integer.parseInt(mtmp1.group(2));
|
998 |
-
int Num2 = Integer.parseInt(mtmp1.group(3));
|
999 |
-
String prefix = "";
|
1000 |
-
Pattern ptmp3 = Pattern.compile("^([0]+)");
|
1001 |
-
Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2));
|
1002 |
-
if(mtmp3.find())
|
1003 |
-
{
|
1004 |
-
prefix = mtmp3.group(1);
|
1005 |
-
}
|
1006 |
-
if(Num2-Num1>0 && (Num2-Num1<=20))
|
1007 |
-
{
|
1008 |
-
for(int n=Num1;n<=Num2;n++)
|
1009 |
-
{
|
1010 |
-
String StrNum=S1+prefix+n;
|
1011 |
-
if(StrNum.length()>=5)
|
1012 |
-
{
|
1013 |
-
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
|
1014 |
-
}
|
1015 |
-
}
|
1016 |
-
}
|
1017 |
-
}
|
1018 |
-
}
|
1019 |
-
else if(mtmp2.find())
|
1020 |
-
{
|
1021 |
-
if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6)
|
1022 |
-
{
|
1023 |
-
String S1 = mtmp2.group(1);
|
1024 |
-
int Num1 = Integer.parseInt(mtmp2.group(2));
|
1025 |
-
String S2 = mtmp2.group(3);
|
1026 |
-
int Num2 = Integer.parseInt(mtmp2.group(4));
|
1027 |
-
if(S1.equals(S2))
|
1028 |
-
{
|
1029 |
-
String prefix = "";
|
1030 |
-
Pattern ptmp3 = Pattern.compile("^([0]+)");
|
1031 |
-
Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2));
|
1032 |
-
if(mtmp3.find())
|
1033 |
-
{
|
1034 |
-
prefix = mtmp3.group(1);
|
1035 |
-
}
|
1036 |
-
if(Num2-Num1>0 && (Num2-Num1<=20))
|
1037 |
-
{
|
1038 |
-
for(int n=Num1;n<=Num2;n++)
|
1039 |
-
{
|
1040 |
-
String StrNum=S1+prefix+n;
|
1041 |
-
if(StrNum.length()>=5)
|
1042 |
-
{
|
1043 |
-
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
|
1044 |
-
}
|
1045 |
-
}
|
1046 |
-
}
|
1047 |
-
}
|
1048 |
-
}
|
1049 |
-
}
|
1050 |
-
}
|
1051 |
-
location.add(start+"\t"+last+"\t"+mention+"\tGeneID");
|
1052 |
-
}
|
1053 |
-
}
|
1054 |
-
String men="";
|
1055 |
-
for(int m=0;m<str2.length();m++){men=men+"@";}
|
1056 |
-
Doc_tmp=str1+men+str3;
|
1057 |
-
mtmp = ptmp.matcher(Doc_tmp);
|
1058 |
-
}
|
1059 |
-
return location;
|
1060 |
-
}
|
1061 |
-
public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
|
1062 |
-
{
|
1063 |
-
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
|
1064 |
-
{
|
1065 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
1066 |
-
{
|
1067 |
-
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
1068 |
-
/** GeneID recognition by pattern match */
|
1069 |
-
ArrayList<String> locations = SearchGeneIDLocation(PassageContext);
|
1070 |
-
for (int k = 0 ; k < locations.size() ; k++)
|
1071 |
-
{
|
1072 |
-
String anno[]=locations.get(k).split("\t");
|
1073 |
-
String mention = anno[2].toLowerCase();
|
1074 |
-
mention = mention.replaceAll("[\\W\\-\\_]+", "");
|
1075 |
-
if(GNormPlus.GeneIDs_hash.containsKey(mention))
|
1076 |
-
{
|
1077 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); //paragraph
|
1078 |
-
}
|
1079 |
-
}
|
1080 |
-
}
|
1081 |
-
}
|
1082 |
-
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
|
1083 |
-
}
|
1084 |
}
|
|
|
1 |
+
/**
|
2 |
+
* Project: GNormPlus
|
3 |
+
* Function: Gene Normalization
|
4 |
+
*/
|
5 |
+
|
6 |
+
package GNormPluslib;
|
7 |
+
|
8 |
+
import bioc.BioCAnnotation;
|
9 |
+
import bioc.BioCCollection;
|
10 |
+
import bioc.BioCDocument;
|
11 |
+
import bioc.BioCLocation;
|
12 |
+
import bioc.BioCPassage;
|
13 |
+
|
14 |
+
import bioc.io.BioCDocumentWriter;
|
15 |
+
import bioc.io.BioCFactory;
|
16 |
+
import bioc.io.woodstox.ConnectorWoodstox;
|
17 |
+
import java.io.BufferedReader;
|
18 |
+
import java.io.BufferedWriter;
|
19 |
+
import java.io.FileInputStream;
|
20 |
+
import java.io.FileOutputStream;
|
21 |
+
import java.io.FileReader;
|
22 |
+
import java.io.FileWriter;
|
23 |
+
import java.io.IOException;
|
24 |
+
import java.io.InputStreamReader;
|
25 |
+
import java.io.OutputStreamWriter;
|
26 |
+
import java.text.BreakIterator;
|
27 |
+
import java.time.LocalDate;
|
28 |
+
import java.time.ZoneId;
|
29 |
+
import java.text.DecimalFormat;
|
30 |
+
import java.math.RoundingMode;
|
31 |
+
|
32 |
+
import javax.xml.stream.XMLStreamException;
|
33 |
+
|
34 |
+
import java.util.Map;
|
35 |
+
import java.util.regex.Matcher;
|
36 |
+
import java.util.regex.Pattern;
|
37 |
+
import java.util.ArrayList;
|
38 |
+
import java.util.HashMap;
|
39 |
+
import java.util.List;
|
40 |
+
import java.util.Locale;
|
41 |
+
|
42 |
+
public class GN
|
43 |
+
{
|
44 |
+
public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>();
|
45 |
+
private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF)
|
46 |
+
{
|
47 |
+
/*
|
48 |
+
* decide whether the id denotes a Homologene cluster (Homo:) or a single gene (Gene:)
|
49 |
+
*/
|
50 |
+
|
51 |
+
//LF
|
52 |
+
LF = LF.toLowerCase();
|
53 |
+
LF = LF.replaceAll("([0-9])([a-z])", "$1 $2");
|
54 |
+
LF = LF.replaceAll("([a-z])([0-9])", "$1 $2");
|
55 |
+
LF = LF.replaceAll("([\\W\\-\\_])", " ");
|
56 |
+
LF = LF.replaceAll("[ ]+", " ");
|
57 |
+
String LF_tkn[]=LF.split(" ");
|
58 |
+
int LF_PartialMatch = 0;
|
59 |
+
|
60 |
+
Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)");
|
61 |
+
Matcher mtmp = ptmp.matcher(geneid);
|
62 |
+
Pattern ptmp2 = Pattern.compile("([0-9]+)");
|
63 |
+
Matcher mtmp2 = ptmp2.matcher(geneid);
|
64 |
+
if(mtmp.find())
|
65 |
+
{
|
66 |
+
geneid = "Homo:"+mtmp.group(1);
|
67 |
+
}
|
68 |
+
else
|
69 |
+
{
|
70 |
+
geneid = "Gene:"+geneid;
|
71 |
+
}
|
72 |
+
|
73 |
+
if(GNormPlus.GeneScoring_hash.containsKey(geneid))
|
74 |
+
{
|
75 |
+
HashMap<String,Double> TF = new HashMap<String,Double>(); // token i in gene j
|
76 |
+
HashMap<String,Double> TermFrequency = new HashMap<String,Double>();
|
77 |
+
|
78 |
+
/*
|
79 |
+
* Tokens in Query (Gene id lexicon)
|
80 |
+
*/
|
81 |
+
String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
|
82 |
+
String tkns_Gene[] = l[0].split(",");
|
83 |
+
for(int i=0;i<tkns_Gene.length;i++)
|
84 |
+
{
|
85 |
+
String Tkn_Freq[] = tkns_Gene[i].split("-");
|
86 |
+
TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
|
87 |
+
}
|
88 |
+
Double Cj = Double.parseDouble(l[1]);
|
89 |
+
Double AllTknNum = Double.parseDouble(l[2]);
|
90 |
+
//Double Cj_max = Double.parseDouble(l[3]);
|
91 |
+
//Double MaxTknNum = Double.parseDouble(l[4]);
|
92 |
+
Double Norm = Double.parseDouble(l[5]);
|
93 |
+
if(Norm == 0.0){Norm=1.0;}
|
94 |
+
|
95 |
+
/*
|
96 |
+
* Tokens in Document (recognized mentions)
|
97 |
+
*/
|
98 |
+
for(String Mention : Mention_hash.keySet())
|
99 |
+
{
|
100 |
+
Mention = Mention.toLowerCase();
|
101 |
+
Mention = Mention.replaceAll("([0-9])([a-z])", "$1 $2");
|
102 |
+
Mention = Mention.replaceAll("([a-z])([0-9])", "$1 $2");
|
103 |
+
Mention = Mention.replaceAll("([\\W\\-\\_])", " ");
|
104 |
+
Mention = Mention.replaceAll("[ ]+", " ");
|
105 |
+
String tkns_Mention[]=Mention.split(" ");
|
106 |
+
for(int i=0;i<tkns_Mention.length;i++)
|
107 |
+
{
|
108 |
+
if(TermFrequency.containsKey(tkns_Mention[i]))
|
109 |
+
{
|
110 |
+
TF.put(tkns_Mention[i], TermFrequency.get(tkns_Mention[i]));
|
111 |
+
}
|
112 |
+
}
|
113 |
+
}
|
114 |
+
|
115 |
+
Double score=0.0;
|
116 |
+
for(String Tkn : TF.keySet())
|
117 |
+
{
|
118 |
+
//LF
|
119 |
+
for(int t=0;t<LF_tkn.length;t++)
|
120 |
+
{
|
121 |
+
if(LF_tkn[t].equals(Tkn))
|
122 |
+
{
|
123 |
+
LF_PartialMatch++;
|
124 |
+
}
|
125 |
+
}
|
126 |
+
|
127 |
+
double TFij = TF.get(Tkn)/AllTknNum;
|
128 |
+
double IDFi=GNormPlus.GeneScoringDF_hash.get(Tkn);
|
129 |
+
score=score+TFij*IDFi*(1/(1-TFij));
|
130 |
+
}
|
131 |
+
//score = Cj * (1/Norm) *score;
|
132 |
+
if(LF_PartialMatch>0){score = score + LF_PartialMatch;/*System.out.println(geneid+"\t"+LF+"\t"+score);*/}
|
133 |
+
return score;
|
134 |
+
}
|
135 |
+
else
|
136 |
+
{
|
137 |
+
//System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash");
|
138 |
+
return 0.0;
|
139 |
+
}
|
140 |
+
}
|
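For reference, the loop above computes a TF-IDF-style inference-network score: each lexicon token i of candidate gene j that also appears in a recognized mention contributes TFij * IDFi * 1/(1 - TFij), and overlap with the abbreviation's long form adds a flat bonus. A minimal self-contained sketch of the core sum (the token weights below are hypothetical, not taken from the shipped GeneScoring lexicon):

import java.util.HashMap;
import java.util.Map;

class ScoringSketch {
    public static void main(String[] args) {
        // Hypothetical lexicon entry for one candidate gene: token -> raw frequency.
        Map<String, Double> termFrequency = new HashMap<>();
        termFrequency.put("cmk", 1.0);
        termFrequency.put("kinase", 1.0);
        // Hypothetical document-frequency weights (stand-in for GeneScoringDF_hash).
        Map<String, Double> idf = new HashMap<>();
        idf.put("cmk", 2.3);
        idf.put("kinase", 0.4);
        double allTknNum = 4.0; // total token count of the gene entry
        double score = 0.0;
        for (Map.Entry<String, Double> e : termFrequency.entrySet()) {
            double tfij = e.getValue() / allTknNum;      // TFij
            double idfi = idf.get(e.getKey());           // IDFi
            score += tfij * idfi * (1.0 / (1.0 - tfij)); // same form as ScoringFunction
        }
        System.out.printf("score = %.4f%n", score);
    }
}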
141 |
+
|
142 |
+
public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException
|
143 |
+
{
|
144 |
+
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
|
145 |
+
{
|
146 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
|
147 |
+
{
|
148 |
+
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
|
149 |
+
{
|
150 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
151 |
+
String start=anno[0];
|
152 |
+
String last=anno[1];
|
153 |
+
String mentions=anno[2];
|
154 |
+
String type=anno[3];
|
155 |
+
String id="";
|
156 |
+
if(anno.length>=5)
|
157 |
+
{
|
158 |
+
id=anno[4];
|
159 |
+
}
|
160 |
+
|
161 |
+
if(type.equals("Gene"))
|
162 |
+
{
|
163 |
+
String mentionArr[] = mentions.split("\\|");
|
164 |
+
boolean update=false;
|
165 |
+
for(int m=0;m<mentionArr.length;m++)
|
166 |
+
{
|
167 |
+
Pattern ptmp = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
|
168 |
+
Matcher mtmp = ptmp.matcher(mentionArr[m]);
|
169 |
+
Pattern ptmp2 = Pattern.compile("^(.+)nu$");
|
170 |
+
Matcher mtmp2 = ptmp2.matcher(mentionArr[m]);
|
171 |
+
Pattern ptmp3 = Pattern.compile("^(.*)alpha(.*)$");
|
172 |
+
Matcher mtmp3 = ptmp3.matcher(mentionArr[m]);
|
173 |
+
Pattern ptmp4 = Pattern.compile("^(.*)beta(.*)$");
|
174 |
+
Matcher mtmp4 = ptmp4.matcher(mentionArr[m]);
|
175 |
+
Pattern ptmp5 = Pattern.compile("^(.+[0-9])a$");
|
176 |
+
Matcher mtmp5 = ptmp5.matcher(mentionArr[m]);
|
177 |
+
Pattern ptmp6 = Pattern.compile("^(.+[0-9])b$");
|
178 |
+
Matcher mtmp6 = ptmp6.matcher(mentionArr[m]);
|
179 |
+
Pattern ptmp7 = Pattern.compile("^(.+)II([a-z])$");
|
180 |
+
Matcher mtmp7 = ptmp7.matcher(mentionArr[m]);
|
181 |
+
Pattern ptmp8 = Pattern.compile("^(.+)III([a-z])$");
|
182 |
+
Matcher mtmp8 = ptmp8.matcher(mentionArr[m]);
|
183 |
+
if(mtmp.find())
|
184 |
+
{
|
185 |
+
mentions=mentions+"|"+mtmp.group(1);
|
186 |
+
update=true;
|
187 |
+
}
|
188 |
+
if(mtmp2.find())
|
189 |
+
{
|
190 |
+
mentions=mentions+"|"+mtmp2.group(1);
|
191 |
+
update=true;
|
192 |
+
}
|
193 |
+
if(mtmp3.find())
|
194 |
+
{
|
195 |
+
mentions=mentions+"|"+mtmp3.group(1)+"a"+mtmp3.group(2);
|
196 |
+
update=true;
|
197 |
+
}
|
198 |
+
if(mtmp4.find())
|
199 |
+
{
|
200 |
+
mentions=mentions+"|"+mtmp4.group(1)+"b"+mtmp4.group(2);
|
201 |
+
update=true;
|
202 |
+
}
|
203 |
+
if(mtmp5.find())
|
204 |
+
{
|
205 |
+
mentions=mentions+"|"+mtmp5.group(1)+"alpha";
|
206 |
+
update=true;
|
207 |
+
}
|
208 |
+
if(mtmp6.find())
|
209 |
+
{
|
210 |
+
mentions=mentions+"|"+mtmp6.group(1)+"beta";
|
211 |
+
update=true;
|
212 |
+
}
|
213 |
+
if(mtmp7.find())
|
214 |
+
{
|
215 |
+
mentions=mentions+"|"+mtmp7.group(1)+"2"+mtmp7.group(2);
|
216 |
+
update=true;
|
217 |
+
}
|
218 |
+
if(mtmp8.find())
|
219 |
+
{
|
220 |
+
mentions=mentions+"|"+mtmp8.group(1)+"3"+mtmp8.group(2);
|
221 |
+
update=true;
|
222 |
+
}
|
223 |
+
}
|
224 |
+
if(update == true)
|
225 |
+
{
|
226 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id );
|
227 |
+
}
|
228 |
+
}
|
229 |
+
}
|
230 |
+
}
|
231 |
+
}
|
232 |
+
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
233 |
+
}
|
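As a concrete illustration of the suffix/spelling rules above, here is the alpha -> a case in isolation; the mention string is a made-up example, and the pipe-appending mirrors what PreProcessing4GN does to the mention field:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

class MentionVariantSketch {
    public static void main(String[] args) {
        String mentions = "TNFalpha"; // hypothetical gene mention
        Matcher m = Pattern.compile("^(.*)alpha(.*)$").matcher(mentions);
        if (m.find()) {
            // Append the spelling variant, pipe-separated, as the method above does.
            mentions = mentions + "|" + m.group(1) + "a" + m.group(2);
        }
        System.out.println(mentions); // prints: TNFalpha|TNFa
    }
}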
234 |
+
|
235 |
+
public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
|
236 |
+
{
|
237 |
+
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
|
238 |
+
{
|
239 |
+
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
240 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
241 |
+
{
|
242 |
+
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
243 |
+
|
244 |
+
/** Chromosome recognition */
|
245 |
+
ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation");
|
246 |
+
for (int k = 0 ; k < locations.size() ; k++)
|
247 |
+
{
|
248 |
+
String anno[]=locations.get(k).split("\t");
|
249 |
+
//int start= Integer.parseInt(anno[0]);
|
250 |
+
//int last= Integer.parseInt(anno[1]);
|
251 |
+
//String mention = anno[2];
|
252 |
+
String ids = anno[3];
|
253 |
+
//GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tChromosomeLocation\t"+ids); //paragraph
|
254 |
+
String IDs[] = ids.split("[\\|,]");
|
255 |
+
for(int idcount=0;idcount<IDs.length;idcount++)
|
256 |
+
{
|
257 |
+
//IDs[idcount] = IDs[idcount].replaceAll("\\-[0-9]+", "");
|
258 |
+
GNormPlus.Pmid2ChromosomeGene_hash.put(Pmid+"\t"+IDs[idcount],"");
|
259 |
+
}
|
260 |
+
}
|
261 |
+
}
|
262 |
+
}
|
263 |
+
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
264 |
+
}
|
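// Illustrative effect of this pass (values hypothetical): if a chromosome-location
// hit in the passage returns the id string "672|675" for PMID 123456, the loop above
// records the keys "123456\t672" and "123456\t675" in Pmid2ChromosomeGene_hash,
// which GeneNormalization's refinement step 3 consults when a mention is ambiguous.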
265 |
+
|
266 |
+
public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException
|
267 |
+
{
|
268 |
+
final DecimalFormat df = new DecimalFormat("0.####");
|
269 |
+
df.setRoundingMode(RoundingMode.HALF_UP);
|
270 |
+
|
271 |
+
//Tokenization
|
272 |
+
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
|
273 |
+
{
|
274 |
+
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
275 |
+
|
276 |
+
/** Species */
|
277 |
+
HashMap<String,String> Species_hash = new HashMap<String,String>();
|
278 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
|
279 |
+
{
|
280 |
+
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
|
281 |
+
{
|
282 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
283 |
+
String mentions=anno[2];
|
284 |
+
String type=anno[3];
|
285 |
+
if(type.matches("(Species|Genus|Strain|CellLine|Cell)"))
|
286 |
+
{
|
287 |
+
Species_hash.put(mentions,"");
|
288 |
+
}
|
289 |
+
}
|
290 |
+
}
|
291 |
+
|
292 |
+
|
293 |
+
/*
|
294 |
+
* Collect Gene mentions :
|
295 |
+
*
|
296 |
+
* GeneMention-taxid -> "ID" : geneid
|
297 |
+
* -> "type" : "Gene"
|
298 |
+
* -> start1-last1 : ""
|
299 |
+
* -> start2-last2 : ""
|
300 |
+
* -> start3-last3 : ""
|
301 |
+
*/
|
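// Illustrative shape of the map sketched above (values hypothetical):
//   GeneMention_hash.get("p53\t9606") ->
//     { "type":"Gene", "ID":"7157", "10\t13":"", "87\t90":"" }
// i.e. one entry per distinct mention+taxid carrying its type, its resolved id once
// known, and every start\tlast offset at which the mention occurs.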
302 |
+
|
303 |
+
String tiabs="";
|
304 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
|
305 |
+
{
|
306 |
+
tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase();
|
307 |
+
}
|
308 |
+
HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>();
|
309 |
+
HashMap<String,String> Mention_hash = new HashMap<String,String>();
|
310 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
|
311 |
+
{
|
312 |
+
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
|
313 |
+
{
|
314 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
315 |
+
String start=anno[0];
|
316 |
+
String last=anno[1];
|
317 |
+
String mentions=anno[2];
|
318 |
+
String type=anno[3];
|
319 |
+
String taxids="Tax:9606";
|
320 |
+
|
321 |
+
if(anno.length>=5)
|
322 |
+
{
|
323 |
+
taxids=anno[4];
|
324 |
+
}
|
325 |
+
String mentions_tmp=mentions.toLowerCase();
|
326 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
327 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
328 |
+
taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
|
329 |
+
if(taxids.equals(""))
|
330 |
+
{
|
331 |
+
taxids="9606";
|
332 |
+
}
|
333 |
+
/** Filtering */
|
334 |
+
boolean found_filter = false;
|
335 |
+
if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering
|
336 |
+
{
|
337 |
+
found_filter=true;
|
338 |
+
}
|
339 |
+
|
340 |
+
if(found_filter==false) //abbreviation
|
341 |
+
{
|
342 |
+
for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet())
|
343 |
+
{
|
344 |
+
if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") ||
|
345 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*")
|
346 |
+
)
|
347 |
+
{
|
348 |
+
String lf=GNormPlus.Filtering_WithLongForm_hash.get(f);
|
349 |
+
if(tiabs.matches(".*"+lf+".*"))
|
350 |
+
{
|
351 |
+
found_filter=true;
|
352 |
+
break;
|
353 |
+
}
|
354 |
+
}
|
355 |
+
}
|
356 |
+
}
|
357 |
+
|
358 |
+
if(found_filter==false)
|
359 |
+
{
|
360 |
+
if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
|
361 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's
|
362 |
+
)
|
363 |
+
{
|
364 |
+
found_filter=true;
|
365 |
+
|
366 |
+
}
|
367 |
+
}
|
368 |
+
|
369 |
+
if(found_filter == false)
|
370 |
+
{
|
371 |
+
if(type.matches("Gene"))
|
372 |
+
{
|
373 |
+
if(GeneMention_hash.containsKey(mentions+"\t"+taxids))
|
374 |
+
{
|
375 |
+
GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,"");
|
376 |
+
}
|
377 |
+
else
|
378 |
+
{
|
379 |
+
HashMap<String,String> offset_hash = new HashMap<String,String>();
|
380 |
+
offset_hash.put(start+"\t"+last,"");
|
381 |
+
GeneMention_hash.put(mentions+"\t"+taxids, offset_hash);
|
382 |
+
GeneMention_hash.get(mentions+"\t"+taxids).put("type", type);
|
383 |
+
Mention_hash.put(mentions,"Gene");
|
384 |
+
}
|
385 |
+
}
|
386 |
+
else if(type.matches("(FamilyName|DomainMotif)"))
|
387 |
+
{
|
388 |
+
String GMs[]=mentions.split("\\|");
|
389 |
+
for(int g=0;g<GMs.length;g++)
|
390 |
+
{
|
391 |
+
String mention = GMs[g];
|
392 |
+
Mention_hash.put(mention,"FamilyDomain");
|
393 |
+
}
|
394 |
+
}
|
395 |
+
}
|
396 |
+
|
397 |
+
}
|
398 |
+
}
|
399 |
+
|
400 |
+
/*
|
401 |
+
* Gene id refinement:
|
402 |
+
* 1. Official name
|
403 |
+
* 2. only one gene
|
404 |
+
*/
|
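// Illustrative example (ids hypothetical): if MentionMatch returns "2064,*7157",
// the "*" marks 7157 as the official-name hit, so the pattern below strips the
// marker and records the mention in GuaranteedGene2ID; a lone candidate such as
// "7157" is guaranteed as well (case 2).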
405 |
+
HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>();
|
406 |
+
HashMap<String,String> MultiGene2ID = new HashMap<String,String>();
|
407 |
+
for(String GeneMentionTax : GeneMention_hash.keySet())
|
408 |
+
{
|
409 |
+
String GT[]=GeneMentionTax.split("\\t");
|
410 |
+
String mentions=GT[0];
|
411 |
+
String taxids=GT[1];
|
412 |
+
String GMs[]=mentions.split("\\|");
|
413 |
+
|
414 |
+
HashMap<String,String> taxids_hash = new HashMap<String,String>();
|
415 |
+
String taxids_arr[]=taxids.split(",");
|
416 |
+
for(int t=0;t<taxids_arr.length;t++)
|
417 |
+
{
|
418 |
+
taxids_hash.put(taxids_arr[t], "");
|
419 |
+
}
|
420 |
+
|
421 |
+
for(int ms=0;ms<GMs.length;ms++)
|
422 |
+
{
|
423 |
+
String mention = GMs[ms];
|
424 |
+
String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
|
425 |
+
String IDs[]=IDstr.split("\\|");
|
426 |
+
|
427 |
+
/*
|
428 |
+
* printing the ambiguous gene mentions and candidates
|
429 |
+
*/
|
430 |
+
//String IDs_s[]=IDstr.split(",");
|
431 |
+
//if(IDs_s.length>1)
|
432 |
+
//{
|
433 |
+
// System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
|
434 |
+
//}
|
435 |
+
|
436 |
+
for(int c=0;c<IDs.length;c++)
|
437 |
+
{
|
438 |
+
String tax2ID[]=IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
|
439 |
+
if(taxids_hash.containsKey(tax2ID[0]))
|
440 |
+
{
|
441 |
+
String geneid=tax2ID[1];
|
442 |
+
String TargetTax=tax2ID[0];
|
443 |
+
GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
|
444 |
+
GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
|
445 |
+
break;
|
446 |
+
}
|
447 |
+
}
|
448 |
+
|
449 |
+
//geneid refinement
|
450 |
+
if(GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
|
451 |
+
{
|
452 |
+
Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
|
453 |
+
Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
|
454 |
+
|
455 |
+
if(mtmp.find()) // 1. Official Name
|
456 |
+
{
|
457 |
+
GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1));
|
458 |
+
GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1));
|
459 |
+
}
|
460 |
+
else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
|
461 |
+
{
|
462 |
+
GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID"));
|
463 |
+
}
|
464 |
+
else
|
465 |
+
{
|
466 |
+
String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
|
467 |
+
boolean FoundByChroLoca=false;
|
468 |
+
for(int idcount=0;idcount<ID.length;idcount++)
|
469 |
+
{
|
470 |
+
if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) // 3. Chromosome location
|
471 |
+
{
|
472 |
+
GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]);
|
473 |
+
FoundByChroLoca=true;
|
474 |
+
break;
|
475 |
+
}
|
476 |
+
}
|
477 |
+
if(FoundByChroLoca == false)
|
478 |
+
{
|
479 |
+
MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
|
480 |
+
}
|
481 |
+
}
|
482 |
+
}
|
483 |
+
if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3")))
|
484 |
+
{
|
485 |
+
break;
|
486 |
+
}
|
487 |
+
}
|
488 |
+
}
|
489 |
+
|
490 |
+
/*
|
491 |
+
* Gene id refinement:
|
492 |
+
* 3. multiple genes but can be inferred by 1. and 2.
|
493 |
+
*/
|
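// Illustrative example (ids hypothetical): if "p53" was guaranteed as 7157 by
// case 1 or 2 above, another mention whose candidate list is "7157,22059" is
// narrowed to 7157 by the loop below.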
494 |
+
for(String GeneMentionTax_M : MultiGene2ID.keySet())
|
495 |
+
{
|
496 |
+
for(String GeneMentionTax_G : GuaranteedGene2ID.keySet())
|
497 |
+
{
|
498 |
+
String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
|
499 |
+
for(int m=0;m<MG.length;m++)
|
500 |
+
{
|
501 |
+
if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G)))
|
502 |
+
{
|
503 |
+
GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]);
|
504 |
+
}
|
505 |
+
}
|
506 |
+
}
|
507 |
+
}
|
508 |
+
|
509 |
+
/*
|
510 |
+
* Gene id refinement:
|
511 |
+
* 4. FullName -> Abbreviation
|
512 |
+
*/
|
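// Illustrative example: if PmidLF2Abb_hash maps "tumor protein p53" -> "TP53" for
// this PMID, the loop below copies the long form's resolved ID onto the "TP53"
// mention with the same taxid.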
513 |
+
for(String GeneMentionTax : GeneMention_hash.keySet())
|
514 |
+
{
|
515 |
+
String MT[] = GeneMentionTax.split("\\t");
|
516 |
+
if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0]))
|
517 |
+
{
|
518 |
+
String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
|
519 |
+
if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
|
520 |
+
{
|
521 |
+
GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
|
522 |
+
}
|
523 |
+
}
|
524 |
+
}
|
525 |
+
|
526 |
+
/*
|
527 |
+
* Gene id refinement:
|
528 |
+
* 5. Ranking by scoring function (inference network)
|
529 |
+
*/
|
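// Remaining comma-separated candidates (e.g. "ID" = "2064,2065") are ranked with
// ScoringFunction above; in the default "Top1" mode only the best-scoring GeneID
// survives, while the "All" branch keeps every candidate with its rounded score.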
530 |
+
for(String GeneMentionTax : GeneMention_hash.keySet())
|
531 |
+
{
|
532 |
+
if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+"))
|
533 |
+
{
|
534 |
+
String geneids=GeneMention_hash.get(GeneMentionTax).get("ID");
|
535 |
+
String geneid[] = geneids.split(",");
|
536 |
+
|
537 |
+
String OutputStyle="Top1";
|
538 |
+
if(OutputStyle.equals("Top1"))
|
539 |
+
{
|
540 |
+
//only return the best one
|
541 |
+
double max_score=0.0;
|
542 |
+
String target_geneid="";
|
543 |
+
for(int g=0;g<geneid.length;g++)
|
544 |
+
{
|
545 |
+
String MT[] = GeneMentionTax.split("\\t");
|
546 |
+
String LF="";
|
547 |
+
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
|
548 |
+
{
|
549 |
+
LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
|
550 |
+
}
|
551 |
+
double score = ScoringFunction(geneid[g],Mention_hash,LF);
|
552 |
+
if(score>max_score)
|
553 |
+
{
|
554 |
+
max_score=score;
|
555 |
+
target_geneid=geneid[g];
|
556 |
+
}
|
557 |
+
else if(score == 0.0)
|
558 |
+
{
|
559 |
+
//System.out.println(GeneMentionTax);
|
560 |
+
}
|
561 |
+
}
|
562 |
+
GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
|
563 |
+
}
|
564 |
+
else // "All"
|
565 |
+
{
|
566 |
+
//return all geneids
|
567 |
+
String geneSTR="";
|
568 |
+
for(int g=0;g<geneid.length;g++)
|
569 |
+
{
|
570 |
+
String MT[] = GeneMentionTax.split("\\t");
|
571 |
+
String LF="";
|
572 |
+
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
|
573 |
+
{
|
574 |
+
LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
|
575 |
+
}
|
576 |
+
double score = ScoringFunction(geneid[g],Mention_hash,LF);
|
577 |
+
String rounded = df.format(score);
|
578 |
+
score=Double.parseDouble(rounded);
|
579 |
+
|
580 |
+
if(geneSTR.equals(""))
|
581 |
+
{
|
582 |
+
geneSTR=geneid[g]+"-"+score;
|
583 |
+
}
|
584 |
+
else
|
585 |
+
{
|
586 |
+
geneSTR=geneSTR+","+geneid[g]+"-"+score;
|
587 |
+
}
|
588 |
+
}
|
589 |
+
GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
|
590 |
+
}
|
591 |
+
}
|
592 |
+
}
|
593 |
+
|
594 |
+
/*
|
595 |
+
* Gene id refinement: - removed (reason: caused too many false positives)
|
596 |
+
* 6. Abbreviation -> FullName
|
597 |
+
*
|
598 |
+
*/
|
599 |
+
for(String GeneMentionTax : GeneMention_hash.keySet())
|
600 |
+
{
|
601 |
+
String MT[] = GeneMentionTax.split("\\t");
|
602 |
+
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
|
603 |
+
{
|
604 |
+
String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
|
605 |
+
if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
|
606 |
+
{
|
607 |
+
GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
|
608 |
+
}
|
609 |
+
}
|
610 |
+
}
|
611 |
+
|
612 |
+
/*
|
613 |
+
* Gene id refinement:
|
614 |
+
* 7. The inference-network tokens of the abbreviation's gene ID must share at least one non-numeric token with its long form (LF)
|
615 |
+
* 8. A short mention (<= 2 characters) is filtered out if it has no long-form support
|
616 |
+
*/
|
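// Illustrative example: an abbreviation "CK" with long form "creatine kinase" is
// kept only if some non-numeric lexicon token of its GeneID (e.g. "kinase") also
// occurs in the long form (rule 7); a mention of <= 2 characters with no long form
// at all is dropped (rule 8).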
617 |
+
ArrayList<String> removeGMT = new ArrayList<String>();
|
618 |
+
for(String GeneMentionTax : GeneMention_hash.keySet())
|
619 |
+
{
|
620 |
+
String GT[]=GeneMentionTax.split("\\t");
|
621 |
+
String mentions=GT[0];
|
622 |
+
String tax=GT[1];
|
623 |
+
if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
|
624 |
+
{
|
625 |
+
String type = GeneMention_hash.get(GeneMentionTax).get("type");
|
626 |
+
String id = GeneMention_hash.get(GeneMentionTax).get("ID");
|
627 |
+
String geneid="";
|
628 |
+
Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
|
629 |
+
Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
|
630 |
+
Matcher mtmp1 = ptmp1.matcher(id);
|
631 |
+
Matcher mtmp2 = ptmp2.matcher(id);
|
632 |
+
//System.out.println(id);
|
633 |
+
if(mtmp1.find())
|
634 |
+
{
|
635 |
+
geneid = "Homo:"+mtmp1.group(2);
|
636 |
+
}
|
637 |
+
else if(mtmp2.find())
|
638 |
+
{
|
639 |
+
geneid = "Gene:"+mtmp2.group(1);
|
640 |
+
}
|
641 |
+
|
642 |
+
boolean LongFormTknMatch= false;
|
643 |
+
boolean LongFormExist= true;
|
644 |
+
if(GNormPlus.GeneScoring_hash.containsKey(geneid))
|
645 |
+
{
|
646 |
+
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase()))
|
647 |
+
{
|
648 |
+
/*
|
649 |
+
* token in lexicon : tkn_lexicon
|
650 |
+
* token in mention : tkn_mention
|
651 |
+
*/
|
652 |
+
String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
|
653 |
+
String tkns_Gene[] = l[0].split(",");
|
654 |
+
ArrayList<String> tkn_lexicon = new ArrayList<String>();
|
655 |
+
for(int ti=0;ti<tkns_Gene.length;ti++)
|
656 |
+
{
|
657 |
+
String Tkn_Freq[] = tkns_Gene[ti].split("-");
|
658 |
+
tkn_lexicon.add(Tkn_Freq[0]);
|
659 |
+
}
|
660 |
+
|
661 |
+
String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase());
|
662 |
+
LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
|
663 |
+
LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
|
664 |
+
String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
|
665 |
+
for(int tl=0;tl<tkn_lexicon.size();tl++)
|
666 |
+
{
|
667 |
+
for(int tm=0;tm<tkn_mention.length;tm++)
|
668 |
+
{
|
669 |
+
if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+")))
|
670 |
+
{
|
671 |
+
LongFormTknMatch = true;
|
672 |
+
}
|
673 |
+
}
|
674 |
+
}
|
675 |
+
}
|
676 |
+
else{LongFormExist = false;}
|
677 |
+
}
|
678 |
+
else{LongFormTknMatch = true;} // exception
|
679 |
+
|
680 |
+
if(LongFormTknMatch == false && LongFormExist == true) // 7.
|
681 |
+
{
|
682 |
+
removeGMT.add(GeneMentionTax); //remove short form
|
683 |
+
removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); //remove long form
|
684 |
+
}
|
685 |
+
else if(mentions.length()<=2 && LongFormExist == false) // 8.
|
686 |
+
{
|
687 |
+
removeGMT.add(GeneMentionTax);
|
688 |
+
}
|
689 |
+
}
|
690 |
+
}
|
691 |
+
|
692 |
+
for(int gmti=0;gmti<removeGMT.size();gmti++) // remove
|
693 |
+
{
|
694 |
+
GeneMention_hash.remove(removeGMT.get(gmti));
|
695 |
+
}
|
696 |
+
|
697 |
+
// Append gene ids
|
698 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) // Paragraphs : j
|
699 |
+
{
|
700 |
+
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
|
701 |
+
{
|
702 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
703 |
+
String start=anno[0];
|
704 |
+
String last=anno[1];
|
705 |
+
String mentions=anno[2];
|
706 |
+
String type=anno[3];
|
707 |
+
String taxid_org="Tax:9606";
|
708 |
+
if(anno.length>=5)
|
709 |
+
{
|
710 |
+
taxid_org=anno[4];
|
711 |
+
}
|
712 |
+
String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
|
713 |
+
String GMs[]=mentions.split("\\|");
|
714 |
+
|
715 |
+
if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax"))
|
716 |
+
{
|
717 |
+
String taxtype=taxid_org.replaceAll(":([0-9,]+)","");
|
718 |
+
String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax");
|
719 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid);
|
720 |
+
}
|
721 |
+
|
722 |
+
if(type.equals("Gene"))
|
723 |
+
{
|
724 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|");
|
725 |
+
|
726 |
+
|
727 |
+
if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID"))
|
728 |
+
{
|
729 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," );
|
730 |
+
}
|
731 |
+
else // cannot find appropriate species
|
732 |
+
{
|
733 |
+
//System.out.println(mention+"\t"+taxid);
|
734 |
+
}
|
735 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$"
|
736 |
+
}
|
737 |
+
}
|
738 |
+
}
|
739 |
+
|
740 |
+
//Extend to all gene mentions
|
741 |
+
HashMap<String,String> GeneMentions = new HashMap<String,String>(); // Extending Gene mentions
|
742 |
+
HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); // Extending Gene mentions
|
743 |
+
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
|
744 |
+
{
|
745 |
+
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
|
746 |
+
{
|
747 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
748 |
+
int start = Integer.parseInt(anno[0]);
|
749 |
+
int last = Integer.parseInt(anno[1]);
|
750 |
+
String mentions=anno[2];
|
751 |
+
String type=anno[3];
|
752 |
+
String id="Tax:9606";
|
753 |
+
if(anno.length>=5)
|
754 |
+
{
|
755 |
+
id=anno[4];
|
756 |
+
}
|
757 |
+
if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)"))
|
758 |
+
{
|
759 |
+
GeneMentions.put(mentions.toLowerCase(), id);
|
760 |
+
for (int s=start ;s<=last;s++)
|
761 |
+
{
|
762 |
+
GeneMentionLocation.put(j+"\t"+s,"");
|
763 |
+
}
|
764 |
+
}
|
765 |
+
else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)"))
|
766 |
+
{
|
767 |
+
GeneMentions.put(mentions.toLowerCase(), id);
|
768 |
+
for (int s=start ;s<=last;s++)
|
769 |
+
{
|
770 |
+
GeneMentionLocation.put(j+"\t"+s,"");
|
771 |
+
}
|
772 |
+
}
|
773 |
+
}
|
774 |
+
}
|
775 |
+
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
|
776 |
+
{
|
777 |
+
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
|
778 |
+
{
|
779 |
+
String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
|
780 |
+
String PassageContexts_tmp = PassageContexts.toLowerCase();
|
781 |
+
for(String gm : GeneMentions.keySet())
|
782 |
+
{
|
783 |
+
String id = GeneMentions.get(gm);
|
784 |
+
if(gm.length()>=3)
|
785 |
+
{
|
786 |
+
gm = gm.replaceAll("[ ]*[\\|]*$", "");
|
787 |
+
gm = gm.replaceAll("^[\\|]*[ ]*", "");
|
788 |
+
gm = gm.replaceAll("[\\|][\\|]+", "\\|");
|
789 |
+
if(!gm.matches("[\\W\\-\\_]*"))
|
790 |
+
{
|
791 |
+
gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
|
792 |
+
Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
|
793 |
+
Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
|
794 |
+
while(mtmp.find())
|
795 |
+
{
|
796 |
+
String pre = mtmp.group(1);
|
797 |
+
String gmtmp = mtmp.group(2);
|
798 |
+
String post = mtmp.group(3);
|
799 |
+
|
800 |
+
int start = pre.length()-1;
|
801 |
+
int last = start+gmtmp.length();
|
802 |
+
if(PassageContexts.length()>=last+1)
|
803 |
+
{
|
804 |
+
String mention = PassageContexts.substring(start+1,last+1);
|
805 |
+
if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last))
|
806 |
+
{
|
807 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id);
|
808 |
+
}
|
809 |
+
}
|
810 |
+
gmtmp = gmtmp.replaceAll(".", "\\@");
|
811 |
+
PassageContexts_tmp=pre+""+gmtmp+""+post;
|
812 |
+
mtmp = ptmp.matcher(PassageContexts_tmp);
|
813 |
+
}
|
814 |
+
}
|
815 |
+
}
|
816 |
+
}
|
817 |
+
}
|
818 |
+
}
|
819 |
+
|
820 |
+
//Apply to FamilyNames
|
821 |
+
HashMap<String,String> geneids = new HashMap<String,String>(); // Extending Gene mentions
|
822 |
+
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
|
823 |
+
{
|
824 |
+
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
|
825 |
+
{
|
826 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
827 |
+
String type=anno[3];
|
828 |
+
if(type.equals("Gene"))
|
829 |
+
{
|
830 |
+
String id="Tax:9606";
|
831 |
+
if(anno.length>=5)
|
832 |
+
{
|
833 |
+
id=anno[4];
|
834 |
+
}
|
835 |
+
Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
|
836 |
+
Matcher mtmp0 = ptmp0.matcher(id);
|
837 |
+
Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
|
838 |
+
Matcher mtmp1 = ptmp1.matcher(id);
|
839 |
+
if(mtmp0.find())
|
840 |
+
{
|
841 |
+
geneids.put(mtmp0.group(3), "");
|
842 |
+
}
|
843 |
+
if(mtmp1.find())
|
844 |
+
{
|
845 |
+
geneids.put(mtmp1.group(3), "");
|
846 |
+
}
|
847 |
+
}
|
848 |
+
}
|
849 |
+
}
|
850 |
+
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
|
851 |
+
{
|
852 |
+
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
|
853 |
+
{
|
854 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
855 |
+
String mention=anno[2];
|
856 |
+
String type=anno[3];
|
857 |
+
if(type.matches("(FamilyName|DomainMotif)"))
|
858 |
+
{
|
859 |
+
String id="Tax:9606";
|
860 |
+
if(anno.length>=5)
|
861 |
+
{
|
862 |
+
id=anno[4];
|
863 |
+
}
|
864 |
+
String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
|
865 |
+
String IDstr[]=IDstrs.split("\\|");
|
866 |
+
String ids="";
|
867 |
+
for(int id_i=0;id_i<IDstr.length;id_i++)
|
868 |
+
{
|
869 |
+
if(geneids.containsKey(IDstr[id_i]))
|
870 |
+
{
|
871 |
+
if(ids.equals(""))
|
872 |
+
{
|
873 |
+
ids=IDstr[id_i];
|
874 |
+
}
|
875 |
+
else
|
876 |
+
{
|
877 |
+
ids=ids+";"+IDstr[id_i];
|
878 |
+
}
|
879 |
+
}
|
880 |
+
}
|
881 |
+
if(!ids.equals(""))
|
882 |
+
{
|
883 |
+
if(type.equals("FamilyName")){type="Gene";}
|
884 |
+
String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606";
|
885 |
+
if(anno.length>=5)
|
886 |
+
{
|
887 |
+
Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4];
|
888 |
+
}
|
889 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids);
|
890 |
+
}
|
891 |
+
else
|
892 |
+
{
|
893 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
|
894 |
+
}
|
895 |
+
}
|
896 |
+
}
|
897 |
+
}
|
898 |
+
//Remove "*" and "(anti)" markers from Species/Genus/Strain/CellLine/Cell ids.
|
899 |
+
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
|
900 |
+
{
|
901 |
+
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
|
902 |
+
{
|
903 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
904 |
+
String type=anno[3];
|
905 |
+
if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell"))
|
906 |
+
{
|
907 |
+
String id=anno[4];
|
908 |
+
id=id.replaceAll("\\*", "");
|
909 |
+
id=id.replaceAll("\\(anti\\)", "");
|
910 |
+
String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id;
|
911 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k);
|
912 |
+
}
|
913 |
+
}
|
914 |
+
}
|
915 |
+
|
916 |
+
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
|
917 |
+
{
|
918 |
+
|
919 |
+
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
|
920 |
+
{
|
921 |
+
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
922 |
+
int start = Integer.parseInt(anno[0]);
|
923 |
+
int last = Integer.parseInt(anno[1]);
|
924 |
+
String mention = anno[2];
|
925 |
+
String type = anno[3];
|
926 |
+
String id = anno[4];
|
927 |
+
if(type.equals("Gene") && Species_hash.containsKey(mention))
|
928 |
+
{
|
929 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
|
930 |
+
}
|
931 |
+
else if(type.equals("Gene") && id.equals(""))
|
932 |
+
{
|
933 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
|
934 |
+
}
|
935 |
+
else
|
936 |
+
{
|
937 |
+
for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k
|
938 |
+
{
|
939 |
+
if(k1 != k)
|
940 |
+
{
|
941 |
+
String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t");
|
942 |
+
int start1 = Integer.parseInt(anno1[0]);
|
943 |
+
int last1 = Integer.parseInt(anno1[1]);
|
944 |
+
if((start1<start && last1>=last) || (start1<=start && last1>last))
|
945 |
+
{
|
946 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
|
947 |
+
break;
|
948 |
+
}
|
949 |
+
}
|
950 |
+
}
|
951 |
+
}
|
952 |
+
}
|
953 |
+
}
|
954 |
+
}
|
955 |
+
if(GeneIDMatch == true)
|
956 |
+
{
|
957 |
+
//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
958 |
+
}
|
959 |
+
else
|
960 |
+
{
|
961 |
+
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
|
962 |
+
}
|
963 |
+
}
|
964 |
+
/*
|
965 |
+
* Search Potential GeneID in the Prefix Tree
|
966 |
+
*/
|
967 |
+
public ArrayList<String> SearchGeneIDLocation(String Doc)
|
968 |
+
{
|
969 |
+
ArrayList<String> location = new ArrayList<String>();
|
970 |
+
|
971 |
+
String Doc_tmp=" "+Doc+" ";
|
972 |
+
Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
|
973 |
+
Matcher mtmp = ptmp.matcher(Doc_tmp);
|
974 |
+
while(mtmp.find())
|
975 |
+
{
|
976 |
+
String str1=mtmp.group(1);
|
977 |
+
String str2=mtmp.group(2);
|
978 |
+
String str3=mtmp.group(3);
|
979 |
+
for(int m=str1.length();m<=(str1.length()+str2.length());m++)
|
980 |
+
{
|
981 |
+
int start = str1.length()-1;
|
982 |
+
int last = start+str2.length();
|
983 |
+
String mention = Doc.substring(start, last);
|
984 |
+
if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*"))
|
985 |
+
{
|
986 |
+
if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) )
|
987 |
+
{
|
988 |
+
Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
|
989 |
+
Matcher mtmp1 = ptmp1.matcher(mention);
|
990 |
+
Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");
|
991 |
+
Matcher mtmp2 = ptmp2.matcher(mention);
|
992 |
+
if(mtmp1.find())
|
993 |
+
{
|
994 |
+
String S1 = mtmp1.group(1);
|
995 |
+
if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6)
|
996 |
+
{
|
997 |
+
int Num1 = Integer.parseInt(mtmp1.group(2));
|
998 |
+
int Num2 = Integer.parseInt(mtmp1.group(3));
|
999 |
+
String prefix = "";
|
1000 |
+
Pattern ptmp3 = Pattern.compile("^([0]+)");
|
1001 |
+
Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2));
|
1002 |
+
if(mtmp3.find())
|
1003 |
+
{
|
1004 |
+
prefix = mtmp3.group(1);
|
1005 |
+
}
|
1006 |
+
if(Num2-Num1>0 && (Num2-Num1<=20))
|
1007 |
+
{
|
1008 |
+
for(int n=Num1;n<=Num2;n++)
|
1009 |
+
{
|
1010 |
+
String StrNum=S1+prefix+n;
|
1011 |
+
if(StrNum.length()>=5)
|
1012 |
+
{
|
1013 |
+
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
|
1014 |
+
}
|
1015 |
+
}
|
1016 |
+
}
|
1017 |
+
}
|
1018 |
+
}
|
1019 |
+
else if(mtmp2.find())
|
1020 |
+
{
|
1021 |
+
if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6)
|
1022 |
+
{
|
1023 |
+
String S1 = mtmp2.group(1);
|
1024 |
+
int Num1 = Integer.parseInt(mtmp2.group(2));
|
1025 |
+
String S2 = mtmp2.group(3);
|
1026 |
+
int Num2 = Integer.parseInt(mtmp2.group(4));
|
1027 |
+
if(S1.equals(S2))
|
1028 |
+
{
|
1029 |
+
String prefix = "";
|
1030 |
+
Pattern ptmp3 = Pattern.compile("^([0]+)");
|
1031 |
+
Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2));
|
1032 |
+
if(mtmp3.find())
|
1033 |
+
{
|
1034 |
+
prefix = mtmp3.group(1);
|
1035 |
+
}
|
1036 |
+
if(Num2-Num1>0 && (Num2-Num1<=20))
|
1037 |
+
{
|
1038 |
+
for(int n=Num1;n<=Num2;n++)
|
1039 |
+
{
|
1040 |
+
String StrNum=S1+prefix+n;
|
1041 |
+
if(StrNum.length()>=5)
|
1042 |
+
{
|
1043 |
+
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
|
1044 |
+
}
|
1045 |
+
}
|
1046 |
+
}
|
1047 |
+
}
|
1048 |
+
}
|
1049 |
+
}
|
1050 |
+
}
|
1051 |
+
location.add(start+"\t"+last+"\t"+mention+"\tGeneID");
|
1052 |
+
}
|
1053 |
+
}
|
1054 |
+
String men="";
|
1055 |
+
for(int m=0;m<str2.length();m++){men=men+"@";}
|
1056 |
+
Doc_tmp=str1+men+str3;
|
1057 |
+
mtmp = ptmp.matcher(Doc_tmp);
|
1058 |
+
}
|
1059 |
+
return location;
|
1060 |
+
}
|
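A small self-contained sketch of the numeric-range expansion performed above; the token "AF123-125" is a made-up example, and the leading-zero prefix handling is left out for brevity:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class GeneIdRangeSketch {
    public static void main(String[] args) {
        String mention = "AF123-125"; // hypothetical accession-like token
        Matcher m = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$").matcher(mention);
        List<String> expanded = new ArrayList<>();
        if (m.find()) {
            int num1 = Integer.parseInt(m.group(2));
            int num2 = Integer.parseInt(m.group(3));
            if (num2 - num1 > 0 && num2 - num1 <= 20) {  // same range bound as above
                for (int n = num1; n <= num2; n++) {
                    String strNum = m.group(1) + n;
                    if (strNum.length() >= 5) {          // same length floor as above
                        expanded.add(strNum);
                    }
                }
            }
        }
        System.out.println(expanded); // prints: [AF123, AF124, AF125]
    }
}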
1061 |
+
public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
|
1062 |
+
{
|
1063 |
+
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
|
1064 |
+
{
|
1065 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
1066 |
+
{
|
1067 |
+
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
1068 |
+
/** GeneID recognition by pattern match */
|
1069 |
+
ArrayList<String> locations = SearchGeneIDLocation(PassageContext);
|
1070 |
+
for (int k = 0 ; k < locations.size() ; k++)
|
1071 |
+
{
|
1072 |
+
String anno[]=locations.get(k).split("\t");
|
1073 |
+
String mention = anno[2].toLowerCase();
|
1074 |
+
mention = mention.replaceAll("[\\W\\-\\_]+", "");
|
1075 |
+
if(GNormPlus.GeneIDs_hash.containsKey(mention))
|
1076 |
+
{
|
1077 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); //paragraph
|
1078 |
+
}
|
1079 |
+
}
|
1080 |
+
}
|
1081 |
+
}
|
1082 |
+
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
|
1083 |
+
}
|
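// End-to-end behaviour: SearchGeneIDLocation proposes accession-like candidates;
// each is lower-cased and stripped of non-alphanumerics, and only those present in
// GNormPlus.GeneIDs_hash are appended to the passage as "GeneID:<id>" annotations.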
1084 |
}
|
src_Java/GNormPluslib/GNR.java
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src_Java/GNormPluslib/GNormPlus.java
CHANGED
@@ -1,696 +1,696 @@
|
|
1 |
-
package GNormPluslib;
|
2 |
-
|
3 |
-
import java.io.BufferedReader;
|
4 |
-
import java.io.BufferedWriter;
|
5 |
-
import java.io.File;
|
6 |
-
import java.io.FileOutputStream;
|
7 |
-
import java.io.FileReader;
|
8 |
-
import java.io.IOException;
|
9 |
-
import java.io.OutputStreamWriter;
|
10 |
-
import java.sql.SQLException;
|
11 |
-
import java.util.ArrayList;
|
12 |
-
import java.util.HashMap;
|
13 |
-
import java.util.regex.Matcher;
|
14 |
-
import java.util.regex.Pattern;
|
15 |
-
|
16 |
-
import javax.xml.stream.XMLStreamException;
|
17 |
-
|
18 |
-
import GNormPluslib.PrefixTree;
|
19 |
-
import GNormPluslib.GNR;
|
20 |
-
import GNormPluslib.SR;
|
21 |
-
|
22 |
-
public class GNormPlus
|
23 |
-
{
|
24 |
-
public static BioCDoc BioCDocobj = new BioCDoc();
|
25 |
-
public static PrefixTree PT_Species = new PrefixTree();
|
26 |
-
public static PrefixTree PT_Cell = new PrefixTree();
|
27 |
-
public static PrefixTree PT_CTDGene = new PrefixTree();
|
28 |
-
public static PrefixTree PT_Gene = new PrefixTree();
|
29 |
-
public static PrefixTree PT_GeneChromosome = new PrefixTree();
|
30 |
-
public static PrefixTree PT_FamilyName = new PrefixTree();
|
31 |
-
public static HashMap<String, String> ent_hash = new HashMap<String, String>();
|
32 |
-
public static HashMap<String, String> GenusID_hash = new HashMap<String, String>();
|
33 |
-
public static HashMap<String, String> PrefixID_hash = new HashMap<String, String>();
|
34 |
-
public static HashMap<String, Double> TaxFreq_hash = new HashMap<String, Double>();
|
35 |
-
public static HashMap<String, String> GeneScoring_hash = new HashMap<String, String>();
|
36 |
-
public static HashMap<String, Double> GeneScoringDF_hash = new HashMap<String, Double>();
|
37 |
-
public static HashMap<String, String> GeneIDs_hash = new HashMap<String, String>();
|
38 |
-
public static HashMap<String, String> Normalization2Protein_hash = new HashMap<String, String>();
|
39 |
-
public static HashMap<String, String> HomologeneID_hash = new HashMap<String, String>();
|
40 |
-
public static HashMap<String,String> SuffixTranslationMap_hash = new HashMap<String,String>();
|
41 |
-
public static HashMap<String,String> SuffixTranslationMap2_hash = new HashMap<String,String>();
|
42 |
-
public static HashMap<String, String> Pmid2Abb_hash = new HashMap<String, String>();
|
43 |
-
public static HashMap<String, String> PmidAbb2LF_lc_hash = new HashMap<String, String>();
|
44 |
-
public static HashMap<String, String> PmidLF2Abb_lc_hash = new HashMap<String, String>();
|
45 |
-
public static HashMap<String, String> PmidAbb2LF_hash = new HashMap<String, String>();
|
46 |
-
public static HashMap<String, String> PmidLF2Abb_hash = new HashMap<String, String>();
|
47 |
-
public static HashMap<String, String> Pmid2ChromosomeGene_hash = new HashMap<String, String>();
|
48 |
-
public static HashMap<String, String> SimConceptMention2Type_hash = new HashMap<String, String>();
|
49 |
-
public static HashMap<String, String> Filtering_hash = new HashMap<String, String>();
|
50 |
-
public static HashMap<String, String> Filtering_WithLongForm_hash = new HashMap<String, String>();
|
51 |
-
public static HashMap<String, String> SP_Virus2Human_hash = new HashMap<String, String>();
|
52 |
-
public static HashMap<String, String> GeneWithoutSPPrefix_hash = new HashMap<String, String>();
|
53 |
-
public static ArrayList <String> taxid4gene = new ArrayList <String>();
|
54 |
-
public static HashMap<String, String> setup_hash = new HashMap<String, String>();
|
55 |
-
public static HashMap<String, String> suffixprefix_orig2modified = new HashMap<String, String>();
|
56 |
-
public static HashMap<String, String> Abb2Longformtok_hash = new HashMap<String, String>();
|
57 |
-
public static HashMap<String, String> StrainID_ancestor2tax_hash = new HashMap<String, String>();
|
58 |
-
public static HashMap<String, String> StrainID_taxid2names_hash = new HashMap<String, String>();
|
59 |
-
|
60 |
-
public static String SetupFile = "setup.txt";
|
61 |
-
public static void main(String [] args) throws IOException, InterruptedException, XMLStreamException, SQLException
|
62 |
-
{
|
63 |
-
String InputFolder="input";
|
64 |
-
String OutputFolder="output";
|
65 |
-
String tmpFolder="tmp";
|
66 |
-
String FocusSpecies = "";
|
67 |
-
if(args.length<2)
|
68 |
-
{
|
69 |
-
System.out.println("\n$ java -Xmx30G -Xms10G -jar GNormPlus.jar [InputFolder] [OutputFolder] [SetupFile]");
|
70 |
-
System.out.println("[InputFolder] Default : input");
|
71 |
-
System.out.println("[OutputFolder] Default : output");
|
72 |
-
System.out.println("[SetupFile] Default : setup.txt\n\n");
|
73 |
-
}
|
74 |
-
else
|
75 |
-
{
|
76 |
-
/*
|
77 |
-
* Parameters
|
78 |
-
*/
|
79 |
-
InputFolder=args[0];
|
80 |
-
OutputFolder=args[1];
|
81 |
-
if(args.length>=3)
|
82 |
-
{
|
83 |
-
SetupFile = args[2];
|
84 |
-
}
|
85 |
-
if(args.length>=4)
|
86 |
-
{
|
87 |
-
FocusSpecies=args[3];
|
88 |
-
}
|
89 |
-
}
|
90 |
-
|
91 |
-
BufferedReader br = new BufferedReader(new FileReader(SetupFile));
|
92 |
-
String line="";
|
93 |
-
Pattern ptmp = Pattern.compile("^ ([A-Za-z0-9]+) = ([^ \\t\\n\\r]+)$");
|
94 |
-
while ((line = br.readLine()) != null)
|
95 |
-
{
|
96 |
-
Matcher mtmp = ptmp.matcher(line);
|
97 |
-
if(mtmp.find())
|
98 |
-
{
|
99 |
-
setup_hash.put(mtmp.group(1), mtmp.group(2));
|
100 |
-
}
|
101 |
-
}
|
102 |
-
br.close();
|
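// Example of a setup line the pattern above accepts (value hypothetical; the
// regex as rendered requires the leading space and single spaces around '='):
//  DictionaryFolder = Dictionary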
103 |
-
if(!setup_hash.containsKey("GeneIDMatch"))
|
104 |
-
{
|
105 |
-
setup_hash.put("GeneIDMatch","True");
|
106 |
-
}
|
107 |
-
if(!setup_hash.containsKey("HomologeneID"))
|
108 |
-
{
|
109 |
-
setup_hash.put("HomologeneID","False");
|
110 |
-
}
|
111 |
-
if(!FocusSpecies.equals(""))
|
112 |
-
{
|
113 |
-
setup_hash.put("FocusSpecies",FocusSpecies);
|
114 |
-
}
|
115 |
-
if(!setup_hash.containsKey("ShowUnNormalizedMention"))
|
116 |
-
{
|
117 |
-
setup_hash.put("ShowUnNormalizedMention","False");
|
118 |
-
}
|
119 |
-
if(setup_hash.containsKey("tmpFolder"))
|
120 |
-
{
|
121 |
-
tmpFolder=setup_hash.get("tmpFolder");
|
122 |
-
}
|
123 |
-
|
124 |
-
		/*
		 * Time stamp - start : All
		 */
		double startTime,endTime,totTime;
		startTime = System.currentTimeMillis(); // start time

		int NumFiles=0;
		File folder = new File(InputFolder);
		File[] listOfFiles = folder.listFiles();
		for (int i = 0; i < listOfFiles.length; i++)
		{
			if (listOfFiles[i].isFile())
			{
				String InputFile = listOfFiles[i].getName();
				File f = new File(OutputFolder+"/"+InputFile);
				if(f.exists() && !f.isDirectory())
				{
					// output already exists; this file is skipped
				}
				else
				{
					NumFiles++;
				}
			}
		}

		System.out.println("Total "+NumFiles+" file(s) waiting to be processed.");

		if(NumFiles>0)
		{
			/*
			 * Start & Load Dictionary
			 */
			String TrainTest = "Test";
			if(setup_hash.containsKey("TrainTest"))
			{
				TrainTest = setup_hash.get("TrainTest");
			}

			/** Load Dictionary */
			if(setup_hash.containsKey("GeneRecognition") && setup_hash.get("GeneRecognition").toLowerCase().equals("true"))
			{
				System.out.print("Loading Gene NER Dictionary : Processing ... \r");
				/** CTDGene */
				if(setup_hash.containsKey("IgnoreNER") && setup_hash.get("IgnoreNER").toLowerCase().equals("true")){} // skip NER (entities are pre-annotated)
				else if(setup_hash.containsKey("SpeciesAssignmentOnly") && setup_hash.get("SpeciesAssignmentOnly").toLowerCase().equals("true")) {} // species assignment only
				else
				{
					PT_CTDGene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_CTDGene.txt");
				}
				/** ent */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/ent.rev.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t"); //Α Alpha
					ent_hash.put(l[0], l[1]);
				}
				br.close();

				/** FamilyName */
				if((!setup_hash.containsKey("IgnoreNER")) || !setup_hash.get("IgnoreNER").toLowerCase().equals("true"))
				{
					PT_FamilyName.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_FamilyName.txt");
				}

				/** GeneChromosome */
				//PT_GeneChromosome.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_GeneChromosome.txt");
				System.out.println("Loading Gene NER Dictionary : Processing ... done.");
			}

			if(setup_hash.containsKey("SpeciesRecognition") && setup_hash.get("SpeciesRecognition").toLowerCase().equals("true"))
			{
				System.out.print("Loading Species NER Dictionary : Processing ... \r");
				/** Species */
				PT_Species.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Species.txt");

				/** Cell */
				PT_Cell.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Cell.txt");

				/** Genus */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SPGenus.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					GenusID_hash.put(l[0], l[1]); // tax id -> genus
				}
				br.close();

				/** taxid4gene */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/tax4gene.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					taxid4gene.add(line); // one tax id per line
				}
				br.close();
				System.out.println("Loading Species NER Dictionary : Processing ... done.");
			}

			if(setup_hash.containsKey("SpeciesAssignment") && setup_hash.get("SpeciesAssignment").toLowerCase().equals("true"))
			{
				System.out.print("Loading Species Assignment Dictionary : Processing ... \r");
				/** GeneWithoutSPPrefix */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/GeneWithoutSPPrefix.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					GeneWithoutSPPrefix_hash.put(line, "");
				}
				br.close();

				/** Prefix */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SPPrefix.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					PrefixID_hash.put(l[0], l[1]); // tax id -> prefix
				}
				br.close();
				PrefixID_hash.put("9606", "h");
				PrefixID_hash.put("10090", "m");
				PrefixID_hash.put("10116", "r");
				PrefixID_hash.put("4932", "y");
				PrefixID_hash.put("7227", "d");
				PrefixID_hash.put("7955", "z|dr|Dr|Zf|zf");
				PrefixID_hash.put("3702", "at|At");
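				// Hedged reading: the values above look like regex alternations of
				// species prefixes; e.g. under the zebrafish entry, a mention such
				// as "zfSHH" could be read as a tax-7955-prefixed gene name
				// (illustrative example, not taken from the source).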

				/** Frequency */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/taxonomy_freq.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					TaxFreq_hash.put(l[0], Double.parseDouble(l[1])/200000000); // tax id -> normalized frequency
				}
				br.close();

				/** SP_Virus2Human_hash */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SP_Virus2HumanList.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					SP_Virus2Human_hash.put(line,"9606");
				}
				br.close();

				/** SPStrain */
				/*
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SPStrain.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					String ancestor_id = l[0];
					String tax_id = l[1];
					String tax_names = l[2];
					StrainID_ancestor2tax_hash.put(ancestor_id, tax_id); // ancestor -> tax_id
					StrainID_taxid2names_hash.put(tax_id, tax_names); // tax id -> strain
				}
				br.close();
				*/
				System.out.println("Loading Species Assignment Dictionary : Processing ... done.");
			}

			if(setup_hash.containsKey("GeneNormalization") && setup_hash.get("GeneNormalization").toLowerCase().equals("true"))
			{
				System.out.print("Loading Gene normalization Dictionary : Processing ... \r");
				/** gene_prefix & gene_suffix */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/PrefixSuffix.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					String org=l[0];
					String mod=l[1];
					suffixprefix_orig2modified.put(org,mod);
				}
				br.close();

				/** non-gene abbreviations */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/NonGeneAbbr.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					String shortform=l[0];
					String longform_toks=l[1];
					Abb2Longformtok_hash.put(shortform,longform_toks);
				}
				br.close();

				/** SimConcept.MentionType */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SimConcept.MentionType.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					SimConceptMention2Type_hash.put(l[0], l[1]);
				}
				br.close();

				/** Filtering */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Filtering.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					Filtering_hash.put(line, "");
				}
				br.close();

				/** Filtering_WithLongForm.txt */
				br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Filtering_WithLongForm.txt"));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					Filtering_WithLongForm_hash.put(l[0], l[1]);
				}
				br.close();

				/** Gene Dictionary */
				if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
				{
					PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Gene."+setup_hash.get("FocusSpecies")+".txt");
				}
				else if((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
				{
					PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Gene."+FocusSpecies+".txt");
				}
				else
				{
					PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Gene.txt");
				}

				/** GeneScoring */
				String FileName=setup_hash.get("DictionaryFolder")+"/GeneScoring.txt";

				if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
				{
					FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring."+setup_hash.get("FocusSpecies")+".txt";
				}
				else if((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
				{
					FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring."+FocusSpecies+".txt";
				}
				br = new BufferedReader(new FileReader(FileName));
				line="";
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					GeneScoring_hash.put(l[0], l[1]+"\t"+l[2]+"\t"+l[3]+"\t"+l[4]+"\t"+l[5]+"\t"+l[6]);
				}
				br.close();

				/** GeneScoring.DF */
				FileName=setup_hash.get("DictionaryFolder")+"/GeneScoring.DF.txt";
				if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
				{
					FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring.DF."+setup_hash.get("FocusSpecies")+".txt";
				}
				else if((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
				{
					FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring.DF."+FocusSpecies+".txt";
				}
				br = new BufferedReader(new FileReader(FileName));
				double Sum = Double.parseDouble(br.readLine());
				while ((line = br.readLine()) != null)
				{
					String l[]=line.split("\t");
					// token -> idf
					GeneScoringDF_hash.put(l[0], Math.log10(Sum/Double.parseDouble(l[1])));
				}
				br.close();
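				// Note: with Sum read from the file's first line, the stored weight
				// is an inverse document frequency, idf(t) = log10(Sum / df(t));
				// e.g. df = 10 with Sum = 1000 gives idf = 2.0 (illustrative numbers).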

				/** Suffix Translation */
				SuffixTranslationMap_hash.put("alpha","a");
				SuffixTranslationMap_hash.put("a","alpha");
				SuffixTranslationMap_hash.put("beta","b");
				SuffixTranslationMap_hash.put("b","beta");
				SuffixTranslationMap_hash.put("delta","d");
				SuffixTranslationMap_hash.put("d","delta");
				SuffixTranslationMap_hash.put("z","zeta");
				SuffixTranslationMap_hash.put("zeta","z");
				SuffixTranslationMap_hash.put("gamma","g");
				SuffixTranslationMap_hash.put("g","gamma");
				SuffixTranslationMap_hash.put("r","gamma");
				SuffixTranslationMap_hash.put("y","gamma");

				SuffixTranslationMap2_hash.put("2","ii");
				SuffixTranslationMap2_hash.put("ii","2");
				SuffixTranslationMap2_hash.put("II","2");
				SuffixTranslationMap2_hash.put("1","i");
				SuffixTranslationMap2_hash.put("i","1");
				SuffixTranslationMap2_hash.put("I","1");
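				// Hedged reading: these bidirectional maps allow suffix variants to
				// be unified during candidate matching, e.g. "... a" <-> "... alpha"
				// and "... ii" <-> "... 2" (illustrative pairs).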

				/** GeneID */
				if(setup_hash.containsKey("GeneIDMatch") && setup_hash.get("GeneIDMatch").toLowerCase().equals("true"))
				{
					br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/GeneIDs.txt"));
					line="";
					while ((line = br.readLine()) != null)
					{
						String l[]=line.split("\t");
						GeneIDs_hash.put(l[0],l[1]);
					}
					br.close();
				}

				/** Normalization2Protein */
				if(setup_hash.containsKey("Normalization2Protein") && setup_hash.get("Normalization2Protein").toLowerCase().equals("true"))
				{
					br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Gene2Protein.txt"));
					line="";
					while ((line = br.readLine()) != null)
					{
						String l[]=line.split("\t");
						Normalization2Protein_hash.put(l[0],l[1]);
					}
					br.close();
				}

				/** HomologeneID */
				if(setup_hash.containsKey("HomologeneID") && setup_hash.get("HomologeneID").toLowerCase().equals("true"))
				{
					br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Gene2Homoid.txt"));
					line="";
					while ((line = br.readLine()) != null)
					{
						String l[]=line.split("\t");
						HomologeneID_hash.put(l[0],l[1]);
					}
					br.close();
				}
				System.out.println("Loading Gene normalization Dictionary : Processing ... done.");
			}

			endTime = System.currentTimeMillis();
			totTime = endTime - startTime;
			System.out.println("Loading Dictionary : Processing Time:"+totTime/1000+"sec");

			folder = new File(InputFolder);
			listOfFiles = folder.listFiles();
			for (int i = 0; i < listOfFiles.length; i++)
			{
				if (listOfFiles[i].isFile())
				{
					String InputFile = listOfFiles[i].getName();
					File f = new File(OutputFolder+"/"+InputFile);
					if(f.exists() && !f.isDirectory())
					{
						System.out.println(InputFolder+"/"+InputFile+" - Done. (The output file exists in output folder)");
					}
					else
					{
						String path=tmpFolder;
						File file = new File(path);
						File[] files = file.listFiles();
						for (File ftmp:files)
						{
							if (ftmp.isFile() && ftmp.exists())
							{
								if(ftmp.toString().matches(tmpFolder+"/"+InputFile+".*"))
								{
									ftmp.delete();
								}
							}
						}

						BioCDocobj = new BioCDoc();

						/*
						 * Format Check
						 */
						String Format = "";
						String checkR = BioCDocobj.BioCFormatCheck(InputFolder+"/"+InputFile);
						if(checkR.equals("BioC"))
						{
							Format = "BioC";
						}
						else if(checkR.equals("PubTator"))
						{
							Format = "PubTator";
						}
						else
						{
							System.out.println(checkR);
							System.exit(0);
						}

						System.out.print(InputFolder+"/"+InputFile+" - ("+Format+" format) : Processing ... \r");

						/** PubTator2BioC */
						if(Format.equals("PubTator"))
						{
							BioCDocobj.PubTator2BioC(InputFolder+"/"+InputFile,tmpFolder+"/"+InputFile);
						}
						else
						{
							br = new BufferedReader(new FileReader(InputFolder+"/"+InputFile));
							BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpFolder+"/"+InputFile), "UTF-8"));
							line="";
							while ((line = br.readLine()) != null)
							{
								fr.write(line);
							}
							br.close();
							fr.close();
						}

						/** load file */
						GNR GNRobj = new GNR();
						GNRobj.LoadInputFile(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".Abb",TrainTest);
						SR SRobj = new SR();
						SimConcept SCobj = new SimConcept();
						GN GNobj = new GN();
						String FinalStep="";

						/** SpeciesRecognition */
						if(setup_hash.containsKey("SpeciesRecognition") && setup_hash.get("SpeciesRecognition").toLowerCase().equals("true"))
						{
							SRobj.SpeciesRecognition(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SR.xml",setup_hash.get("DictionaryFolder")+"/SPStrain.txt",setup_hash.get("FilterAntibody"));
							FinalStep="SpeciesRecognition";
						}

						/** GeneRecognition */
						if(setup_hash.containsKey("GeneRecognition") && setup_hash.get("GeneRecognition").toLowerCase().equals("true"))
						{
							GNRobj.FeatureExtraction(tmpFolder+"/"+InputFile+".data",tmpFolder+"/"+InputFile+".loca",TrainTest);
							GNRobj.CRF_test(setup_hash.get("GNRModel"),tmpFolder+"/"+InputFile+".data",tmpFolder+"/"+InputFile+".output","top3"); //top3
							GNRobj.ReadCRFresult(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".loca",tmpFolder+"/"+InputFile+".output",tmpFolder+"/"+InputFile+".GNR.xml",0.005,0.05); //0.005,0.05
							f = new File(tmpFolder+"/"+InputFile+".SR.xml");
							if(f.exists())
							{
								GNRobj.PostProcessing(tmpFolder+"/"+InputFile+".SR.xml",tmpFolder+"/"+InputFile+".GNR.xml");
							}
							else
							{
								GNRobj.PostProcessing(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GNR.xml");
							}
							FinalStep="GeneRecognition";
						}

						/** SpeciesAssignment */
						if(setup_hash.containsKey("SpeciesAssignment") && setup_hash.get("SpeciesAssignment").toLowerCase().equals("true"))
						{
							if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All")) // FocusSpecies
							{
								f = new File(tmpFolder+"/"+InputFile+".GNR.xml");
								if(f.exists())
								{
									SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile+".GNR.xml",tmpFolder+"/"+InputFile+".SA.xml",setup_hash.get("FocusSpecies"));
								}
								else
								{
									SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SA.xml",setup_hash.get("FocusSpecies"));
								}
							}
							else // All Species
							{
								f = new File(tmpFolder+"/"+InputFile+".GNR.xml");
								if(f.exists())
								{
									SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile+".GNR.xml",tmpFolder+"/"+InputFile+".SA.xml");
								}
								else
								{
									SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SA.xml");
								}
							}
							FinalStep="SpeciesAssignment";
						}

						/** GeneNormalization */
						if(setup_hash.containsKey("GeneNormalization") && setup_hash.get("GeneNormalization").toLowerCase().equals("true"))
						{
							/** SimConcept */
							{
								SCobj.FeatureExtraction_Test(tmpFolder+"/"+InputFile+".SC.data");
								SCobj.CRF_test(setup_hash.get("SCModel"),tmpFolder+"/"+InputFile+".SC.data",tmpFolder+"/"+InputFile+".SC.output");
								SCobj.ReadCRFresult(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SC.output",tmpFolder+"/"+InputFile+".SC.xml");
							}

							/** GeneNormalization */
							{
								GNobj.PreProcessing4GN(InputFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".PreProcessing4GN.xml");
								GNobj.ChromosomeRecognition(InputFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml");
								if(setup_hash.containsKey("GeneIDMatch") && setup_hash.get("GeneIDMatch").toLowerCase().equals("true"))
								{
									GNobj.GeneNormalization(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml",true);
									GNobj.GeneIDRecognition(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml");
								}
								else
								{
									GNobj.GeneNormalization(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml",false);
								}
							}
							FinalStep="GeneNormalization";
						}

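						/*
						 * The last pipeline stage that ran decides which tmp artifact
						 * (.GN.xml, .SA.xml, .SR.xml or .GNR.xml) is selected below
						 * as the final output.
						 */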
						/** BioC2PubTator */
						String final_output="";
						if(FinalStep.equals("GeneNormalization"))
						{
							final_output=tmpFolder+"/"+InputFile+".GN.xml";
						}
						else if(FinalStep.equals("SpeciesAssignment"))
						{
							final_output=tmpFolder+"/"+InputFile+".SA.xml";
						}
						else if(FinalStep.equals("SpeciesRecognition"))
						{
							final_output=tmpFolder+"/"+InputFile+".SR.xml";
						}
						else if(FinalStep.equals("GeneRecognition"))
						{
							final_output=tmpFolder+"/"+InputFile+".GNR.xml";
						}

						if(Format.equals("PubTator"))
						{
							BioCDocobj.BioC2PubTator(final_output,OutputFolder+"/"+InputFile);
						}
						else
						{
							br = new BufferedReader(new FileReader(final_output));
							BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(OutputFolder+"/"+InputFile), "UTF-8"));
							line="";
							while ((line = br.readLine()) != null)
							{
								fr.write(line);
							}
							br.close();
							fr.close();
						}

						/*
						 * remove tmp files
						 */
						if((!setup_hash.containsKey("DeleteTmp")) || setup_hash.get("DeleteTmp").toLowerCase().equals("true"))
						{
							path=tmpFolder;
							file = new File(path);
							files = file.listFiles();
							for (File ftmp:files)
							{
								if (ftmp.isFile() && ftmp.exists())
								{
									if(ftmp.toString().matches(tmpFolder+"/"+InputFile+".*"))
									{
										ftmp.delete();
									}
								}
							}
						}

						/*
						 * Time stamp - last
						 */
						endTime = System.currentTimeMillis();
						totTime = endTime - startTime;
						System.out.println(InputFolder+"/"+InputFile+" - ("+Format+" format) : Processing Time:"+totTime/1000+"sec");
					}
				}
			}
		}
	}
}
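A minimal programmatic invocation, equivalent to the command line printed in main()'s usage message (the folder names are the documented defaults; that the dictionaries and models referenced by setup.txt are in place is an assumption):

	GNormPlus.main(new String[]{"input", "output", "setup.txt"}); // hypothetical driver snippet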
src_Java/GNormPluslib/PrefixTree.java
CHANGED
@@ -1,893 +1,893 @@
/**
 * Project: GNormPlus
 * Function: Dictionary lookup by Prefix Tree
 */

package GNormPluslib;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PrefixTree
{
	private Tree Tr=new Tree();

	/*
	 * Read Dictionary and insert Mention into the Prefix Tree
	 */
	public static HashMap<String, String> StopWord_hash = new HashMap<String, String>();

	public void Hash2Tree(HashMap<String, String> ID2Names)
	{
		for(String ID : ID2Names.keySet())
		{
			String NameColumn[]=ID2Names.get(ID).split("\\|");
			for(int i=0;i<NameColumn.length;i++)
			{
				Tr.insertMention(NameColumn[i],ID);
			}
		}
	}
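	/*
	 * Usage sketch (illustrative, not from the source): names are pipe-separated
	 * per ID, e.g.
	 *   HashMap<String, String> id2names = new HashMap<String, String>();
	 *   id2names.put("9606", "human|Homo sapiens");
	 *   new PrefixTree().Hash2Tree(id2names);
	 */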
	public void Dictionary2Tree_Combine(String Filename,String StopWords,String MentionType)
	{
		try
		{
			//System.out.println("Dictionary2Tree_Combine : " + Filename);

			/** Stop Word */
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
			String line="";
			while ((line = br.readLine()) != null)
			{
				StopWord_hash.put(line, "StopWord");
			}
			br.close();

			BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
			line="";
			//int count=0;
			while ((line = inputfile.readLine()) != null)
			{
				//count++;
				//if(count%10000==0){ System.out.println(count); }
				String Column[]=line.split("\t");
				if(Column.length>1)
				{
					Column[0]=Column[0].replace("species:ncbi:","");
					Column[1]=Column[1].replaceAll(" strain=", " ");
					Column[1]=Column[1].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
					Column[1]=Column[1].replaceAll("[\\(\\)]", " ");
					String SpNameColumn[]=Column[1].split("\\|");
					for(int i=0;i<SpNameColumn.length;i++)
					{
						String tmp = SpNameColumn[i];
						tmp=tmp.replaceAll("[\\W\\-\\_]", "");

						/*
						 * Criteria for Species
						 */
						if( MentionType.equals("Species") &&
							(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
							(!SpNameColumn[i].matches("a[\\W\\-\\_].*")) &&
							tmp.length()>=3
						)
						{
							boolean stopword_boolean=false;
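							// Note: each stop-list entry is compiled as a regular
							// expression anchored with ^...$ and tested against the
							// lower-cased name, so entries may be patterns rather
							// than literal words.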
							for(String stopword_RegEx : StopWord_hash.keySet())
							{
								Pattern ptmp = Pattern.compile("^"+stopword_RegEx+"$");
								Matcher mtmp = ptmp.matcher(SpNameColumn[i].toLowerCase());
								if(mtmp.find())
								{
									stopword_boolean=true;
								}
							}
							if(stopword_boolean == false)
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
						/*
						 * Criteria for Gene
						 */
						else if (MentionType.equals("Gene") &&
							(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
							tmp.length()>=3
						)
						{
							if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
						/*
						 * Criteria for Cell
						 */
						else if (MentionType.equals("Cell") &&
							(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
							tmp.length()>=3
						)
						{
							if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
						/*
						 * others
						 */
						else if ((!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
							tmp.length()>=3
						)
						{
							if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
					}
				}
			}
			inputfile.close();
		}
		catch(IOException e1){ System.out.println("[Dictionary2Tree_Combine]: Input file does not exist."); }
	}
	public void Dictionary2Tree_UniqueGene(String Filename,String StopWords,String Preifx)
	{
		try
		{
			//System.out.println("Dictionary2Tree_UniqueGene : " + Filename);

			/** Stop Word */
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
			String line="";
			while ((line = br.readLine()) != null)
			{
				StopWord_hash.put(line, "StopWord");
			}
			br.close();

			BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
			line="";
			//int count=0;
			while ((line = inputfile.readLine()) != null)
			{
				//count++;
				//if(count%10000==0){ System.out.println(count); }
				String Column[]=line.split("\t");
				if(Column.length>1)
				{
					if(!StopWord_hash.containsKey(Column[0].toLowerCase()))
					{
						if(Preifx.equals(""))
						{
							Tr.insertMention(Column[0],Column[1]);
						}
						else if(Preifx.equals("Num") && Column[0].matches("[0-9].*"))
						{
							Tr.insertMention(Column[0],Column[1]);
						}
						else if(Preifx.equals("AZNum") && Column[0].matches("[a-z][0-9].*"))
						{
							Tr.insertMention(Column[0],Column[1]);
						}
						else if(Preifx.equals("lo") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
						{
							if( ! Column[0].matches("loc[0-9]+"))
							{
								Tr.insertMention(Column[0],Column[1]);
							}
						}
						else if(Preifx.equals("un") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
						{
							if(Column[0].length()>=6 && Column[0].substring(0,6).equals("unchar"))
							{
								// skip "uncharacterized ..." entries
							}
							else
							{
								Tr.insertMention(Column[0],Column[1]);
							}
						}
						else if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
						{
							Tr.insertMention(Column[0],Column[1]);
						}
					}
				}
			}
			inputfile.close();
		}
		catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file does not exist."); }
	}
public void Dictionary2Tree_UniqueSpecies(String Filename,String StopWords,String Preifx)
|
206 |
-
{
|
207 |
-
try
|
208 |
-
{
|
209 |
-
//System.out.println("Dictionary2Tree_UniqueGene : " + Filename);
|
210 |
-
|
211 |
-
/** Stop Word */
|
212 |
-
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
|
213 |
-
String line="";
|
214 |
-
while ((line = br.readLine()) != null)
|
215 |
-
{
|
216 |
-
StopWord_hash.put(line, "StopWord");
|
217 |
-
}
|
218 |
-
br.close();
|
219 |
-
|
220 |
-
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
|
221 |
-
line="";
|
222 |
-
while ((line = inputfile.readLine()) != null)
|
223 |
-
{
|
224 |
-
//count++;
|
225 |
-
//if(count%10000==0){ System.out.println(count); }
|
226 |
-
String Column[]=line.split("\t");
|
227 |
-
if(Column.length>1)
|
228 |
-
{
|
229 |
-
if(!StopWord_hash.containsKey(Column[0].toLowerCase()))
|
230 |
-
{
|
231 |
-
if(Preifx.equals("")) //all
|
232 |
-
{
|
233 |
-
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
|
234 |
-
{
|
235 |
-
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
|
236 |
-
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
|
237 |
-
if(mention_tmp.length()>=10)
|
238 |
-
{
|
239 |
-
Tr.insertMention(mention_rev,Column[1]);
|
240 |
-
}
|
241 |
-
}
|
242 |
-
else
|
243 |
-
{
|
244 |
-
Tr.insertMention(Column[0],Column[1]); // mention, id
|
245 |
-
}
|
246 |
-
|
247 |
-
}
|
248 |
-
else if(Column[0].matches("[0-9][0-9].*"))
|
249 |
-
{
|
250 |
-
if(Preifx.equals("Num"))
|
251 |
-
{
|
252 |
-
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
|
253 |
-
{
|
254 |
-
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
|
255 |
-
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
|
256 |
-
if(mention_tmp.length()>=10)
|
257 |
-
{
|
258 |
-
Tr.insertMention(mention_rev,Column[1]);
|
259 |
-
}
|
260 |
-
}
|
261 |
-
else
|
262 |
-
{
|
263 |
-
Tr.insertMention(Column[0],Column[1]); // mention, id
|
264 |
-
}
|
265 |
-
}
|
266 |
-
}
|
267 |
-
/*
|
268 |
-
else if(Column[0].matches("[a-z][0-9].*"))
|
269 |
-
{
|
270 |
-
if(Preifx.equals("AZNum"))
|
271 |
-
{
|
272 |
-
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
|
273 |
-
{
|
274 |
-
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
|
275 |
-
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
|
276 |
-
if(mention_tmp.length()>=10)
|
277 |
-
{
|
278 |
-
Tr.insertMention(mention_rev,Column[1]);
|
279 |
-
}
|
280 |
-
}
|
281 |
-
else
|
282 |
-
{
|
283 |
-
Tr.insertMention(Column[0],Column[1]); // mention, id
|
284 |
-
}
|
285 |
-
}
|
286 |
-
}
|
287 |
-
*/
|
288 |
-
else if(Column[0].matches("[a-z][a-z].*"))
|
289 |
-
{
|
290 |
-
if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
|
291 |
-
{
|
292 |
-
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
|
293 |
-
{
|
294 |
-
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
|
295 |
-
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
|
296 |
-
if(mention_tmp.length()>=10)
|
297 |
-
{
|
298 |
-
Tr.insertMention(mention_rev,Column[1]);
|
299 |
-
}
|
300 |
-
}
|
301 |
-
else
|
302 |
-
{
|
303 |
-
Tr.insertMention(Column[0],Column[1]); // mention, id
|
304 |
-
}
|
305 |
-
}
|
306 |
-
}
|
307 |
-
else if(Preifx.equals("Others"))
|
308 |
-
{
|
309 |
-
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
|
310 |
-
{
|
311 |
-
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
|
312 |
-
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
|
313 |
-
if(mention_tmp.length()>=10)
|
314 |
-
{
|
315 |
-
Tr.insertMention(mention_rev,Column[1]);
|
316 |
-
}
|
317 |
-
}
|
318 |
-
else
|
319 |
-
{
|
320 |
-
Tr.insertMention(Column[0],Column[1]); // mention, id
|
321 |
-
}
|
322 |
-
}
|
323 |
-
}
|
324 |
-
}
|
325 |
-
}
|
326 |
-
inputfile.close();
|
327 |
-
}
|
328 |
-
catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");}
|
329 |
-
}
|
330 |
-
public void TreeFile2Tree(String Filename)
|
331 |
-
{
|
332 |
-
try
|
333 |
-
{
|
334 |
-
//System.out.println("TreeFile2Tree : " + Filename);
|
335 |
-
|
336 |
-
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
|
337 |
-
String line="";
|
338 |
-
int count=0;
|
339 |
-
while ((line = inputfile.readLine()) != null)
|
340 |
-
{
|
341 |
-
String Anno[]=line.split("\t");
|
342 |
-
if(Anno.length<2){System.out.println(count+"\t"+line);} //check error
|
343 |
-
String LocationInTree = Anno[0];
|
344 |
-
String token = Anno[1];
|
345 |
-
String identifier="";
|
346 |
-
if(Anno.length==3)
|
347 |
-
{
|
348 |
-
identifier = Anno[2];
|
349 |
-
}
|
350 |
-
String LocationsInTree[]=LocationInTree.split("-");
|
351 |
-
TreeNode tmp = Tr.root;
|
352 |
-
for(int i=0;i<LocationsInTree.length-1;i++)
|
353 |
-
{
|
354 |
-
tmp=tmp.links.get(Integer.parseInt(LocationsInTree[i])-1);
|
355 |
-
}
|
356 |
-
tmp.InsertToken(token,identifier);
|
357 |
-
//if(count%10000==0){System.out.println(count);}
|
358 |
-
count++;
|
359 |
-
}
|
360 |
-
inputfile.close();
|
361 |
-
}
|
362 |
-
catch(IOException e1){ System.out.println("[TreeFile2Tee]: Input file: "+ Filename +" is not exist.");}
|
363 |
-
}
|
364 |
-
|
365 |
-
/*
|
366 |
-
* Search target mention in the Prefix Tree
|
367 |
-
*/
|
368 |
-
public String MentionMatch(String Mentions)
|
369 |
-
{
|
370 |
-
ArrayList<String> location = new ArrayList<String>();
|
371 |
-
String Menlist[]=Mentions.split("\\|");
|
372 |
-
for(int m=0;m<Menlist.length;m++)
|
373 |
-
{
|
374 |
-
String Mention=Menlist[m];
|
375 |
-
String Mention_lc=Mention.toLowerCase();
|
376 |
-
Mention_lc = Mention_lc.replaceAll("[\\W\\-\\_]+", "");
|
377 |
-
Mention_lc = Mention_lc.replaceAll("([0-9])([a-z])", "$1 $2");
|
378 |
-
Mention_lc = Mention_lc.replaceAll("([a-z])([0-9])", "$1 $2");
|
379 |
-
String Tkns[]=Mention_lc.split(" ");
|
380 |
-
|
381 |
-
int PrefixTranslation=0;
|
382 |
-
int i=0;
|
383 |
-
boolean find=false;
|
384 |
-
TreeNode tmp = Tr.root;
|
385 |
-
|
386 |
-
while( i<Tkns.length && tmp.CheckChild(Tkns[i],PrefixTranslation)>=0) //Find Tokens in the links
|
387 |
-
{
|
388 |
-
if(i == Tkns.length-1){PrefixTranslation = 1;}
|
389 |
-
tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
|
390 |
-
find=true;
|
391 |
-
i++;
|
392 |
-
}
|
393 |
-
if(find == true)
|
394 |
-
{
|
395 |
-
if(i==Tkns.length)
|
396 |
-
{
|
397 |
-
if(!tmp.Concept.equals(""))
|
398 |
-
{
|
399 |
-
return tmp.Concept;
|
400 |
-
}
|
401 |
-
else
|
402 |
-
{
|
403 |
-
return "-1";
|
404 |
-
//gene id is not found.
|
405 |
-
}
|
406 |
-
}
|
407 |
-
else
|
408 |
-
{
|
409 |
-
return "-2";
|
410 |
-
//the gene mention matched a substring in PrefixTree.
|
411 |
-
}
|
412 |
-
}
|
413 |
-
else
|
414 |
-
{
|
415 |
-
return "-3";
|
416 |
-
//mention is not found
|
417 |
-
}
|
418 |
-
}
|
419 |
-
return "-3"; //mention is not found
|
420 |
-
}
|
421 |
-
|
422 |
-
/*
|
423 |
-
* Search target mention in the Prefix Tree
|
424 |
-
*/
|
425 |
-
public String MentionMatch_species(String Mentions)
|
426 |
-
{
|
427 |
-
ArrayList<String> location = new ArrayList<String>();
|
428 |
-
String Menlist[]=Mentions.split("\\|");
|
429 |
-
for(int m=0;m<Menlist.length;m++)
|
430 |
-
{
|
431 |
-
String Mention=Menlist[m];
|
432 |
-
String Mention_lc=Mention.toLowerCase();
|
433 |
-
Mention_lc = Mention_lc.replaceAll("[\\W\\-\\_]+", " ");
|
434 |
-
Mention_lc = Mention_lc.replaceAll("([0-9])([a-z])", "$1 $2");
|
435 |
-
Mention_lc = Mention_lc.replaceAll("([a-z])([0-9])", "$1 $2");
|
436 |
-
Mention_lc = Mention_lc.replaceAll("^[ ]+", "");
|
437 |
-
Mention_lc = Mention_lc.replaceAll("[ ]+$", "");
|
438 |
-
String Tkns[]=Mention_lc.split(" ");
|
439 |
-
|
440 |
-
int PrefixTranslation=0;
|
441 |
-
int i=0;
|
442 |
-
boolean find=false;
|
443 |
-
TreeNode tmp = Tr.root;
|
444 |
-
|
445 |
-
while( i<Tkns.length && tmp.CheckChild(Tkns[i],PrefixTranslation)>=0) //Find Tokens in the links
|
446 |
-
{
|
447 |
-
if(i == Tkns.length-1){PrefixTranslation = 1;}
|
448 |
-
tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
|
449 |
-
find=true;
|
450 |
-
i++;
|
451 |
-
}
|
452 |
-
if(find == true)
|
453 |
-
{
|
454 |
-
if(i==Tkns.length)
|
455 |
-
{
|
456 |
-
if(!tmp.Concept.equals(""))
|
457 |
-
{
|
458 |
-
return tmp.Concept;
|
459 |
-
}
|
460 |
-
else
|
461 |
-
{
|
462 |
-
return "-1";
|
463 |
-
//gene id is not found.
|
464 |
-
}
|
465 |
-
}
|
466 |
-
else
|
467 |
-
{
|
468 |
-
return "-2";
|
469 |
-
//the gene mention matched a substring in PrefixTree.
|
470 |
-
}
|
471 |
-
}
|
472 |
-
else
|
473 |
-
{
|
474 |
-
return "-3";
|
475 |
-
//mention is not found
|
476 |
-
}
|
477 |
-
}
|
478 |
-
return "-3"; //mention is not found
|
479 |
-
}
|
480 |
-
|
481 |
-
/*
|
482 |
-
* Search target mention in the Prefix Tree
|
483 |
-
* ConceptType: Species|Genus|Cell|CTDGene
|
484 |
-
*/
|
485 |
-
public ArrayList<String> SearchMentionLocation(String Doc,String ConceptType)
|
486 |
-
{
|
487 |
-
ArrayList<String> location = new ArrayList<String>();
|
488 |
-
Doc=Doc+" XXXX XXXX";
|
489 |
-
String Doc_org=Doc;
|
490 |
-
Doc=Doc.toLowerCase();
|
491 |
-
String Doc_lc=Doc;
|
492 |
-
Doc = Doc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
|
493 |
-
Doc = Doc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
|
494 |
-
Doc = Doc.replaceAll("[\\W^;:,]+", " ");
|
495 |
-
|
496 |
-
/* = keep special characters =
|
497 |
-
*
|
498 |
-
String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
|
499 |
-
String DocTkns[]=Doc.split(regex);
|
500 |
-
*/
|
501 |
-
|
502 |
-
String DocTkns[]=Doc.split(" ");
|
503 |
-
int Offset=0;
|
504 |
-
int Start=0;
|
505 |
-
int Last=0;
|
506 |
-
int FirstTime=0;
|
507 |
-
|
508 |
-
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
|
509 |
-
{
|
510 |
-
Doc_lc=Doc_lc.substring(1);
|
511 |
-
Offset++;
|
512 |
-
}
|
513 |
-
|
514 |
-
for(int i=0;i<DocTkns.length;i++)
|
515 |
-
{
|
516 |
-
//System.out.println(i+"\t"+Start+"\t"+Last+"\t"+Offset+"\t"+Doc_lc);
|
517 |
-
|
518 |
-
int pre_i=i;
|
519 |
-
int pre_Start=Start;
|
520 |
-
int pre_Last=Last;
|
521 |
-
String pre_Doc_lc=Doc_lc;
|
522 |
-
int pre_Offset=Offset;
|
523 |
-
|
524 |
-
TreeNode tmp = Tr.root;
|
525 |
-
boolean find=false;
|
526 |
-
int PrefixTranslation=2;
|
527 |
-
if(ConceptType.equals("Species"))
|
528 |
-
{
|
529 |
-
PrefixTranslation=3;
|
530 |
-
}
|
531 |
-
int ConceptFound=i; //Keep found concept
|
532 |
-
String ConceptFound_STR="";//Keep found concept
|
533 |
-
int FirstTime_while = -1;
|
534 |
-
|
535 |
-
while( tmp.CheckChild(DocTkns[i],PrefixTranslation)>=0 ) //Find Tokens in the links
|
536 |
-
{
|
537 |
-
FirstTime_while++;
|
538 |
-
tmp=tmp.links.get(tmp.CheckChild(DocTkns[i],PrefixTranslation)); //move point to the link
|
539 |
-
if(Start==0 && FirstTime>0){Start = Offset;} //Start <- Offset
|
540 |
-
if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
|
541 |
-
{
|
542 |
-
if(DocTkns[i].length()>0)
|
543 |
-
{
|
544 |
-
Doc_lc=Doc_lc.substring(DocTkns[i].length());
|
545 |
-
Offset=Offset+DocTkns[i].length();
|
546 |
-
}
|
547 |
-
}
|
548 |
-
Last = Offset;
|
549 |
-
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
|
550 |
-
{
|
551 |
-
Doc_lc=Doc_lc.substring(1);
|
552 |
-
Offset++;
|
553 |
-
}
|
554 |
-
i++;
|
555 |
-
|
556 |
-
if(ConceptType.equals("Species"))
|
557 |
-
{
|
558 |
-
if(i<DocTkns.length-3 && DocTkns[i].matches("(str|strain|substr|substrain|subspecies|subsp|var|variant|pathovars|pv|biovar|bv)"))
|
559 |
-
{
|
560 |
-
Doc_lc=Doc_lc.substring(DocTkns[i].length());
|
561 |
-
Offset=Offset+DocTkns[i].length();
|
562 |
-
Last = Offset;
|
563 |
-
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
|
564 |
-
{
|
565 |
-
Doc_lc=Doc_lc.substring(1);
|
566 |
-
Offset++;
|
567 |
-
}
|
568 |
-
i++;
|
569 |
-
}
|
570 |
-
}
|
571 |
-
|
572 |
-
if(!tmp.Concept.equals("") && (Last-Start>0)) //Keep found concept
|
573 |
-
{
|
574 |
-
if(Last<Doc_org.length())
|
575 |
-
{
|
576 |
-
ConceptFound=i;
|
577 |
-
ConceptFound_STR=Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept;
|
578 |
-
//System.out.println(ConceptFound_STR);
|
579 |
-
}
|
580 |
-
}
|
581 |
-
|
582 |
-
find=true;
|
583 |
-
if(i>=DocTkns.length){break;}
|
584 |
-
else if(i==DocTkns.length-1){PrefixTranslation=2;}
|
585 |
-
|
586 |
-
//System.out.println(i+"\t"+Start+"\t"+Last+"\t("+FirstTime_while+")\t"+Offset+"\t"+Doc_lc);
|
587 |
-
|
588 |
-
if(FirstTime_while==0) // first matched token
|
589 |
-
{
|
590 |
-
pre_i=i;
|
591 |
-
pre_Start=Start;
|
592 |
-
pre_Last=Last;
|
593 |
-
pre_Doc_lc=Doc_lc;
|
594 |
-
pre_Offset=Offset;
|
595 |
-
}
|
596 |
-
}
|
597 |
-
|
598 |
-
if(find == true)
|
599 |
-
{
|
600 |
-
//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
|
601 |
-
if(!tmp.Concept.equals("")) //the last matched token has concept id
|
602 |
-
{
|
603 |
-
if(Last<Doc_org.length() && Last>Start)
|
604 |
-
{
|
605 |
-
location.add(Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
|
606 |
-
}
|
607 |
-
}
|
608 |
-
else
|
609 |
-
{
|
610 |
-
if(!ConceptFound_STR.equals("")) //Keep found concept
|
611 |
-
{
|
612 |
-
location.add(ConceptFound_STR);
|
613 |
-
i = ConceptFound + 1;
|
614 |
-
}
|
615 |
-
|
616 |
-
if(FirstTime_while>=1)
|
617 |
-
{
|
618 |
-
i=pre_i;
|
619 |
-
Start=pre_Start;
|
620 |
-
Last=pre_Last;
|
621 |
-
Doc_lc=pre_Doc_lc;
|
622 |
-
Offset=pre_Offset;
|
623 |
-
}
|
624 |
-
}
|
625 |
-
Start=0;
|
626 |
-
Last=0;
|
627 |
-
if(i>0){i--;}
|
628 |
-
ConceptFound=i; //Keep found concept
|
629 |
-
ConceptFound_STR="";//Keep found concept
|
630 |
-
}
|
631 |
-
else //if(find == false)
|
632 |
-
{
|
633 |
-
//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
|
634 |
-
|
635 |
-
if(FirstTime_while>=1 && tmp.Concept.equals(""))
|
636 |
-
{
|
637 |
-
i=pre_i;
|
638 |
-
Start=pre_Start;
|
639 |
-
Last=pre_Last;
|
640 |
-
Doc_lc=pre_Doc_lc;
|
641 |
-
Offset=pre_Offset;
|
642 |
-
}
|
643 |
-
|
644 |
-
if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
|
645 |
-
{
|
646 |
-
if(DocTkns[i].length()>0)
|
647 |
-
{
|
648 |
-
Doc_lc=Doc_lc.substring(DocTkns[i].length());
|
649 |
-
Offset=Offset+DocTkns[i].length();
|
650 |
-
}
|
651 |
-
}
|
652 |
-
}
|
653 |
-
|
654 |
-
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
|
655 |
-
{
|
656 |
-
Doc_lc=Doc_lc.substring(1);
|
657 |
-
Offset++;
|
658 |
-
}
|
659 |
-
FirstTime++;
|
660 |
-
|
661 |
-
//System.out.println();
|
662 |
-
}
|
663 |
-
return location;
|
664 |
-
}
|
665 |
-
|
666 |
-
/*
|
667 |
-
* Print out the Prefix Tree
|
668 |
-
*/
|
669 |
-
public String PrintTree()
|
670 |
-
{
|
671 |
-
return Tr.PrintTree_preorder(Tr.root,"");
|
672 |
-
}
|
673 |
-
|
674 |
-
public void SaveTree(String outputfile) throws IOException
|
675 |
-
{
|
676 |
-
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8"));
|
677 |
-
Tr.SaveTree_preorder(Tr.root,"",fr);
|
678 |
-
fr.close();
|
679 |
-
}
|
680 |
-
|
681 |
-
|
682 |
-
public void insertMention(String Mention, String Identifier)
|
683 |
-
{
|
684 |
-
Tr.insertMention(Mention,Identifier);
|
685 |
-
}
|
686 |
-
}
|
687 |
-
|
688 |
-
class Tree
|
689 |
-
{
|
690 |
-
/*
|
691 |
-
* Prefix Tree - root node
|
692 |
-
*/
|
693 |
-
public TreeNode root;
|
694 |
-
|
695 |
-
public Tree()
|
696 |
-
{
|
697 |
-
root = new TreeNode("-ROOT-");
|
698 |
-
}
|
699 |
-
|
700 |
-
/*
|
701 |
-
* Insert mention into the tree
|
702 |
-
*/
|
703 |
-
public void insertMention(String Mention, String Identifier)
|
704 |
-
{
|
705 |
-
Mention=Mention.toLowerCase();
|
706 |
-
|
707 |
-
Mention = Mention.replaceAll("([0-9])([A-Za-z])", "$1 $2");
|
708 |
-
Mention = Mention.replaceAll("([A-Za-z])([0-9])", "$1 $2");
|
709 |
-
Mention = Mention.replaceAll("[\\W\\-\\_]+", " ");
|
710 |
-
/* = keep special characters =
|
711 |
-
*
|
712 |
-
String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
|
713 |
-
String Tokens[]=Mention.split(regex);
|
714 |
-
*/
|
715 |
-
String Tokens[]=Mention.split(" ");
|
716 |
-
TreeNode tmp = root;
|
717 |
-
for(int i=0;i<Tokens.length;i++)
|
718 |
-
{
|
719 |
-
if(tmp.CheckChild(Tokens[i],0)>=0)
|
720 |
-
{
|
721 |
-
tmp=tmp.links.get( tmp.CheckChild(Tokens[i],0) ); //go through next generation (exist node)
|
722 |
-
if(i == Tokens.length-1)
|
723 |
-
{
|
724 |
-
tmp.Concept=Identifier;
|
725 |
-
}
|
726 |
-
}
|
727 |
-
else //not exist
|
728 |
-
{
|
729 |
-
if(i == Tokens.length-1)
|
730 |
-
{
|
731 |
-
tmp.InsertToken(Tokens[i],Identifier);
|
732 |
-
}
|
733 |
-
else
|
734 |
-
{
|
735 |
-
tmp.InsertToken(Tokens[i]);
|
736 |
-
}
|
737 |
-
tmp=tmp.links.get(tmp.NumOflinks-1); //go to the next generation (new node)
|
738 |
-
}
|
739 |
-
}
|
740 |
-
}
|
741 |
-
|
742 |
-
/*
|
743 |
-
* Print the tree by pre-order
|
744 |
-
*/
|
745 |
-
public String PrintTree_preorder(TreeNode node, String LocationInTree)
|
746 |
-
{
|
747 |
-
String opt="";
|
748 |
-
if(!node.token.equals("-ROOT-"))//Ignore root
|
749 |
-
{
|
750 |
-
if(node.Concept.equals(""))
|
751 |
-
{
|
752 |
-
opt=opt+LocationInTree+"\t"+node.token+"\n";
|
753 |
-
}
|
754 |
-
else
|
755 |
-
{
|
756 |
-
opt=opt+LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n";
|
757 |
-
}
|
758 |
-
}
|
759 |
-
if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";}
|
760 |
-
for(int i=0;i<node.NumOflinks;i++)
|
761 |
-
{
|
762 |
-
opt=opt+PrintTree_preorder(node.links.get(i),LocationInTree+(i+1));
|
763 |
-
}
|
764 |
-
return opt;
|
765 |
-
}
|
766 |
-
|
767 |
-
/*
|
768 |
-
* Print the tree by pre-order
|
769 |
-
*/
|
770 |
-
public void SaveTree_preorder(TreeNode node, String LocationInTree, BufferedWriter fr) throws IOException
|
771 |
-
{
|
772 |
-
if(!node.token.equals("-ROOT-"))//Ignore root
|
773 |
-
{
|
774 |
-
if(node.Concept.equals(""))
|
775 |
-
{
|
776 |
-
fr.write(LocationInTree+"\t"+node.token+"\n");
|
777 |
-
}
|
778 |
-
else
|
779 |
-
{
|
780 |
-
fr.write(LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n");
|
781 |
-
}
|
782 |
-
}
|
783 |
-
if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";}
|
784 |
-
for(int i=0;i<node.NumOflinks;i++)
|
785 |
-
{
|
786 |
-
SaveTree_preorder(node.links.get(i),LocationInTree+(i+1),fr);
|
787 |
-
}
|
788 |
-
}
|
789 |
-
}
|
790 |
-
|
791 |
-
class TreeNode
|
792 |
-
{
|
793 |
-
String token; //token of the node
|
794 |
-
int NumOflinks; //Number of links
|
795 |
-
public String Concept;
|
796 |
-
HashMap<String,Integer> Hashs;
|
797 |
-
ArrayList<TreeNode> links;
|
798 |
-
|
799 |
-
public TreeNode(String Tok,String ID)
|
800 |
-
{
|
801 |
-
token = Tok;
|
802 |
-
NumOflinks = 0;
|
803 |
-
Concept = ID;
|
804 |
-
links = new ArrayList<TreeNode>();/*link*/
|
805 |
-
Hashs = new HashMap<String,Integer>();/*hash*/
|
806 |
-
}
|
807 |
-
public TreeNode(String Tok)
|
808 |
-
{
|
809 |
-
token = Tok;
|
810 |
-
NumOflinks = 0;
|
811 |
-
Concept = "";
|
812 |
-
links = new ArrayList<TreeNode>();/*link*/
|
813 |
-
Hashs = new HashMap<String,Integer>();/*hash*/
|
814 |
-
}
|
815 |
-
public TreeNode()
|
816 |
-
{
|
817 |
-
token = "";
|
818 |
-
NumOflinks = 0;
|
819 |
-
Concept = "";
|
820 |
-
links = new ArrayList<TreeNode>();/*link*/
|
821 |
-
Hashs = new HashMap<String,Integer>();/*hash*/
|
822 |
-
}
|
823 |
-
|
824 |
-
public String toString()
|
825 |
-
{
|
826 |
-
return (token+"\t"+Concept);
|
827 |
-
}
|
828 |
-
|
829 |
-
/*
|
830 |
-
* Insert an new node under the target node
|
831 |
-
*/
|
832 |
-
public void InsertToken(String Tok)
|
833 |
-
{
|
834 |
-
TreeNode NewNode = new TreeNode(Tok);
|
835 |
-
|
836 |
-
/*link*/
|
837 |
-
links.add(NewNode);
|
838 |
-
|
839 |
-
/*hash*/
|
840 |
-
Hashs.put(Tok, NumOflinks);
|
841 |
-
|
842 |
-
NumOflinks++;
|
843 |
-
}
|
844 |
-
public void InsertToken(String Tok,String ID)
|
845 |
-
{
|
846 |
-
TreeNode NewNode = new TreeNode(Tok,ID);
|
847 |
-
/*link*/
|
848 |
-
links.add(NewNode);
|
849 |
-
|
850 |
-
/*hash*/
|
851 |
-
Hashs.put(Tok, NumOflinks);
|
852 |
-
|
853 |
-
NumOflinks++;
|
854 |
-
}
|
855 |
-
|
856 |
-
/*
|
857 |
-
* Check the tokens of children
|
858 |
-
*/
|
859 |
-
public int CheckChild(String Tok, Integer PrefixTranslation)
|
860 |
-
{
|
861 |
-
if(Hashs.containsKey(Tok))
|
862 |
-
{
|
863 |
-
return(Hashs.get(Tok));
|
864 |
-
}
|
865 |
-
|
866 |
-
if(PrefixTranslation == 1 && Tok.matches("(alpha|beta|gamam|[abg]|[12])")) // SuffixTranslationMap
|
867 |
-
{
|
868 |
-
if(Hashs.containsKey(GNormPlus.SuffixTranslationMap_hash.get(Tok)))
|
869 |
-
{
|
870 |
-
return(Hashs.get(GNormPlus.SuffixTranslationMap_hash.get(Tok)));
|
871 |
-
}
|
872 |
-
|
873 |
-
}
|
874 |
-
else if(PrefixTranslation == 2 && Tok.matches("[1-5]")) // for CTDGene feature
|
875 |
-
{
|
876 |
-
for(int i=0;i<links.size();i++)
|
877 |
-
{
|
878 |
-
if(links.get(i).token.matches("[1-5]"))
|
879 |
-
{
|
880 |
-
return(i);
|
881 |
-
}
|
882 |
-
}
|
883 |
-
|
884 |
-
for(int i=1;i<=5;i++)
|
885 |
-
{
|
886 |
-
if(Hashs.containsKey(i)){return(Hashs.get(i));}
|
887 |
-
}
|
888 |
-
}
|
889 |
-
|
890 |
-
return(-1);
|
891 |
-
}
|
892 |
-
}
|
893 |
|
|
|
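The dictionary loaders and matchers above all push strings through the same normalization chain before touching the tree: lowercase, split letter/digit boundaries into separate tokens, and collapse punctuation, hyphens, and underscores into token separators. Below is a minimal, standalone sketch of that chain; the class and method names are illustrative only and are not part of the GNormPlus code base.

import java.util.Arrays;

public class NormalizeDemo
{
    // Mirrors the replaceAll chain used by insertMention and the matching methods:
    // lowercase, split digit/letter boundaries, then collapse punctuation into spaces.
    static String[] normalize(String mention)
    {
        String m = mention.toLowerCase();
        m = m.replaceAll("([0-9])([a-z])", "$1 $2");
        m = m.replaceAll("([a-z])([0-9])", "$1 $2");
        m = m.replaceAll("[\\W\\-\\_]+", " ").trim();
        return m.split(" ");
    }

    public static void main(String[] args)
    {
        // Both spellings normalize to the same token sequence, which is what
        // lets the prefix tree match surface variants of one name.
        System.out.println(Arrays.toString(normalize("IL2-R alpha")));  // [il, 2, r, alpha]
        System.out.println(Arrays.toString(normalize("il 2 r alpha"))); // [il, 2, r, alpha]
    }
}

Because the dictionary side (insertMention) and the query side (MentionMatch, SearchMentionLocation) apply the same chain, hyphenation and attached digits never prevent a lookup from reaching the right path in the tree.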
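The Tree/TreeNode pair above implements a token-level trie: each node keys its children by whole tokens and carries a concept identifier only when it terminates a complete dictionary name. The sketch below reproduces that lookup logic, including the -1/-2/-3 result convention of MentionMatch; it is a simplified illustration written against the JDK only, not the GNormPlus API.

import java.util.HashMap;

public class TrieDemo
{
    static class Node
    {
        String concept = ""; // non-empty only on nodes that end a full dictionary name
        HashMap<String, Node> children = new HashMap<String, Node>();
    }

    static Node root = new Node();

    static void insert(String[] tokens, String id)
    {
        Node cur = root;
        for (String t : tokens)
        {
            cur.children.putIfAbsent(t, new Node()); // create the child on first sight
            cur = cur.children.get(t);
        }
        cur.concept = id; // the last token carries the identifier
    }

    static String match(String[] tokens)
    {
        Node cur = root;
        boolean moved = false;
        for (String t : tokens)
        {
            Node next = cur.children.get(t);
            if (next == null) { return moved ? "-2" : "-3"; } // diverged mid-path vs. no path at all
            cur = next;
            moved = true;
        }
        return cur.concept.equals("") ? "-1" : cur.concept; // full path but no id => -1
    }

    public static void main(String[] args)
    {
        insert(new String[]{"escherichia", "coli"}, "562");
        System.out.println(match(new String[]{"escherichia", "coli"}));       // 562
        System.out.println(match(new String[]{"escherichia"}));               // -1 (prefix of a longer name)
        System.out.println(match(new String[]{"escherichia", "fergusonii"})); // -2 (matched a substring only)
        System.out.println(match(new String[]{"homo", "sapiens"}));           // -3 (not found)
    }
}

Keying children by whole tokens rather than characters keeps a multi-word species name to one hop per word, and it is the hook that lets CheckChild substitute suffix variants (alpha/a/1) or digit tokens at match time.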
src_Java/GNormPluslib/SR.java
CHANGED
@@ -1,1044 +1,1044 @@
|
|
1 |
-
/**
|
2 |
-
* Project: GNormPlus
|
3 |
-
* Function: Species recognition and Species assignment
|
4 |
-
*/
|
5 |
-
|
6 |
-
package GNormPluslib;
|
7 |
-
|
8 |
-
import bioc.BioCAnnotation;
|
9 |
-
import bioc.BioCCollection;
|
10 |
-
import bioc.BioCDocument;
|
11 |
-
import bioc.BioCLocation;
|
12 |
-
import bioc.BioCPassage;
|
13 |
-
|
14 |
-
import bioc.io.BioCDocumentWriter;
|
15 |
-
import bioc.io.BioCFactory;
|
16 |
-
import bioc.io.woodstox.ConnectorWoodstox;
|
17 |
-
import java.io.BufferedReader;
|
18 |
-
import java.io.BufferedWriter;
|
19 |
-
import java.io.FileInputStream;
|
20 |
-
import java.io.FileOutputStream;
|
21 |
-
import java.io.FileReader;
|
22 |
-
import java.io.FileWriter;
|
23 |
-
import java.io.IOException;
|
24 |
-
import java.io.InputStreamReader;
|
25 |
-
import java.io.OutputStreamWriter;
|
26 |
-
import java.text.BreakIterator;
|
27 |
-
import java.time.LocalDate;
|
28 |
-
import java.time.ZoneId;
|
29 |
-
|
30 |
-
import javax.xml.stream.XMLStreamException;
|
31 |
-
|
32 |
-
import org.tartarus.snowball.SnowballStemmer;
|
33 |
-
import org.tartarus.snowball.ext.englishStemmer;
|
34 |
-
|
35 |
-
import java.util.Map;
|
36 |
-
import java.util.regex.Matcher;
|
37 |
-
import java.util.regex.Pattern;
|
38 |
-
import java.util.ArrayList;
|
39 |
-
import java.util.HashMap;
|
40 |
-
import java.util.List;
|
41 |
-
import java.util.Locale;
|
42 |
-
import java.util.Collections;
|
43 |
-
|
44 |
-
public class SR
|
45 |
-
{
|
46 |
-
@SuppressWarnings("null")
|
47 |
-
public void SpeciesRecognition(String Filename,String FilenameBioC,String StrainFilename,String FilterAntibody) throws IOException, XMLStreamException
|
48 |
-
{
|
49 |
-
/** Recognizing Species Names: SP */
|
50 |
-
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
|
51 |
-
{
|
52 |
-
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
53 |
-
PrefixTree PT_Genus = new PrefixTree();
|
54 |
-
HashMap<String, String> SPID_hash = new HashMap<String, String>();
|
55 |
-
ArrayList<String> TargetedLocation = new ArrayList<String>();
|
56 |
-
HashMap<String, String> GenusNames = new HashMap<String, String>();
|
57 |
-
HashMap<String, String> Mention2ID_lc = new HashMap<String, String>();
|
58 |
-
ArrayList<String> IDset = new ArrayList<String>();
|
59 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
60 |
-
{
|
61 |
-
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
62 |
-
|
63 |
-
/** Species recognition */
|
64 |
-
ArrayList<String> locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species"); /** PT_Species */
|
65 |
-
for (int k = 0 ; k < locations.size() ; k++)
|
66 |
-
{
|
67 |
-
String anno[]=locations.get(k).split("\t");
|
68 |
-
int start= Integer.parseInt(anno[0]);
|
69 |
-
int last= Integer.parseInt(anno[1]);
|
70 |
-
|
71 |
-
// For anti-serum filtering
|
72 |
-
String ForwardSTR="";
|
73 |
-
String BackwardSTR="";
|
74 |
-
if(start>21)
|
75 |
-
{
|
76 |
-
ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
|
77 |
-
}
|
78 |
-
else
|
79 |
-
{
|
80 |
-
ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
|
81 |
-
}
|
82 |
-
if(PassageContext.length()>last+21)
|
83 |
-
{
|
84 |
-
BackwardSTR = PassageContext.substring(start,last+21);
|
85 |
-
}
|
86 |
-
else
|
87 |
-
{
|
88 |
-
BackwardSTR = PassageContext.substring(start,PassageContext.length());
|
89 |
-
}
|
90 |
-
|
91 |
-
String mention = anno[2];
|
92 |
-
String id = anno[3];
|
93 |
-
String mention_tmp=mention.toLowerCase();
|
94 |
-
mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
95 |
-
String antibody="";
|
96 |
-
if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";}//filtering : antibody
|
97 |
-
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
|
98 |
-
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
|
99 |
-
|
100 |
-
if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") )
|
101 |
-
{
|
102 |
-
last=last+1;
|
103 |
-
mention=mention+")";
|
104 |
-
}
|
105 |
-
|
106 |
-
if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){} // filtered: Bee1p
|
107 |
-
else if((mention.matches(".*[;:,].*")) && mention.length()<=10){} // filtered : x, XXX
|
108 |
-
else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){} // to 7
|
109 |
-
else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){} // s). Major
|
110 |
-
else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){} // s). Major
|
111 |
-
else if(!id.equals("NA"))
|
112 |
-
{
|
113 |
-
if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
114 |
-
{
|
115 |
-
if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3)) // invalid species: "a group/a GAL4/a strain"
|
116 |
-
{
|
117 |
-
if(FilterAntibody.equals("False") || (!antibody.equals("(anti)")))
|
118 |
-
{
|
119 |
-
String patt="^(.+?) [sS]train";
|
120 |
-
Pattern ptmp = Pattern.compile(patt);
|
121 |
-
Matcher mtmp = ptmp.matcher(mention);
|
122 |
-
if(mtmp.find())
|
123 |
-
{
|
124 |
-
mention=mtmp.group(1);
|
125 |
-
last=last-7;
|
126 |
-
}
|
127 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); //+antibody
|
128 |
-
String mentions_tmp=mention.toLowerCase();
|
129 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
130 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
131 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
132 |
-
Mention2ID_lc.put(mention.toLowerCase(), id); //+antibody
|
133 |
-
|
134 |
-
String mention_genus = "";
|
135 |
-
patt="^([A-Za-z]+) ";
|
136 |
-
ptmp = Pattern.compile(patt);
|
137 |
-
mtmp = ptmp.matcher(mention);
|
138 |
-
if(mtmp.find())
|
139 |
-
{
|
140 |
-
mention_genus=mtmp.group(1); // get genus
|
141 |
-
}
|
142 |
-
|
143 |
-
IDset.add(id);
|
144 |
-
for(int s=start;s<last;s++)
|
145 |
-
{
|
146 |
-
TargetedLocation.add(j+"\t"+s);
|
147 |
-
}
|
148 |
-
String ids[]=id.split(";");
|
149 |
-
for(int x=0;x<ids.length;x++)
|
150 |
-
{
|
151 |
-
patt="^\\**([0-9]+)";
|
152 |
-
ptmp = Pattern.compile(patt);
|
153 |
-
mtmp = ptmp.matcher(ids[x]);
|
154 |
-
if(mtmp.find())
|
155 |
-
{
|
156 |
-
SPID_hash.put(mtmp.group(1), mention_genus);
|
157 |
-
}
|
158 |
-
}
|
159 |
-
}
|
160 |
-
}
|
161 |
-
}
|
162 |
-
}
|
163 |
-
}
|
164 |
-
|
165 |
-
/** Cell Line recognition */
|
166 |
-
locations = GNormPlus.PT_Cell.SearchMentionLocation(PassageContext,"Cell"); /** PT_Cell */
|
167 |
-
for (int k = 0 ; k < locations.size() ; k++)
|
168 |
-
{
|
169 |
-
String anno[]=locations.get(k).split("\t");
|
170 |
-
int start= Integer.parseInt(anno[0]);
|
171 |
-
int last= Integer.parseInt(anno[1]);
|
172 |
-
String mention = anno[2];
|
173 |
-
String id = anno[3];
|
174 |
-
if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
175 |
-
{
|
176 |
-
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
177 |
-
{
|
178 |
-
int last40=0;
|
179 |
-
if(PassageContext.length()>=last+40)
|
180 |
-
{
|
181 |
-
last40=last+40;
|
182 |
-
}
|
183 |
-
else
|
184 |
-
{
|
185 |
-
last40=PassageContext.length();
|
186 |
-
}
|
187 |
-
|
188 |
-
// For anti-serum filtering
|
189 |
-
String ForwardSTR="";
|
190 |
-
String BackwardSTR="";
|
191 |
-
if(start>21)
|
192 |
-
{
|
193 |
-
ForwardSTR = PassageContext.substring(start-21,last);
|
194 |
-
}
|
195 |
-
else
|
196 |
-
{
|
197 |
-
ForwardSTR = PassageContext.substring(0,last);
|
198 |
-
}
|
199 |
-
if(PassageContext.length()>last+21)
|
200 |
-
{
|
201 |
-
BackwardSTR = PassageContext.substring(start,last+21);
|
202 |
-
}
|
203 |
-
else
|
204 |
-
{
|
205 |
-
BackwardSTR = PassageContext.substring(start,PassageContext.length());
|
206 |
-
}
|
207 |
-
String mention_tmp=mention.toLowerCase();
|
208 |
-
mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
209 |
-
if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){}
|
210 |
-
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){} // filtered: Bee1p
|
211 |
-
else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){} // filtered: IL-22RA1
|
212 |
-
else
|
213 |
-
{
|
214 |
-
String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]";
|
215 |
-
Pattern ptmp = Pattern.compile(patt);
|
216 |
-
Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase());
|
217 |
-
if(mtmp.find())
|
218 |
-
{
|
219 |
-
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
220 |
-
{
|
221 |
-
id="*"+id;
|
222 |
-
}
|
223 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id);
|
224 |
-
String mentions_tmp=mention.toLowerCase();
|
225 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
226 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
227 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
228 |
-
IDset.add(id);
|
229 |
-
for(int s=start;s<last;s++)
|
230 |
-
{
|
231 |
-
TargetedLocation.add(j+"\t"+s);
|
232 |
-
}
|
233 |
-
}
|
234 |
-
}
|
235 |
-
}
|
236 |
-
}
|
237 |
-
}
|
238 |
-
|
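The 40-character look-ahead above only keeps a cell-line dictionary hit when a word such as "cell", "cell line", or "cells" follows it. The cue pattern can be checked in isolation (the test strings here are made up for illustration):

import java.util.regex.Pattern;

public class CellCuePatternDemo
{
	public static void main(String[] args)
	{
		// Same cue pattern as above: "cell", "cells", "cell line", "cell-line", ...
		Pattern cue = Pattern.compile("[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]");
		System.out.println(cue.matcher(" grown as hela cells in culture ").find()); // true
		System.out.println(cue.matcher(" encodes a membrane protein ").find());     // false
	}
}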
239 |
-
/** Genus names*/
|
240 |
-
for(String ID: SPID_hash.keySet())
|
241 |
-
{
|
242 |
-
if(GNormPlus.GenusID_hash.containsKey(ID))
|
243 |
-
{
|
244 |
-
GenusNames.put(ID,GNormPlus.GenusID_hash.get(ID));
|
245 |
-
}
|
246 |
-
if(SPID_hash.get(ID).length()>=7)
|
247 |
-
{
|
248 |
-
GenusNames.put(ID,SPID_hash.get(ID));
|
249 |
-
}
|
250 |
-
}
|
251 |
-
}
|
252 |
-
|
253 |
-
GenusNames.put("3702", "arabidopsis");
|
254 |
-
GenusNames.put("4932", "saccharomyces");
|
255 |
-
GenusNames.put("562", "escherichia");
|
256 |
-
GenusNames.put("7227", "drosophila");
|
257 |
-
GenusNames.put("8355", "xenopus");
|
258 |
-
|
259 |
-
PT_Genus.Hash2Tree(GenusNames);
|
260 |
-
|
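GenusNames maps a taxonomy ID to a lowercase genus token, taken either from the genus lexicon or from the first word of a recognized binomial (the "^([A-Za-z]+) " match above), with a few common model-organism genera seeded by hand. Extracting that token on its own looks like this (a sketch; the class and method names are illustrative):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GenusTokenSketch
{
	// First alphabetic token of a binomial name, mirroring "^([A-Za-z]+) " above.
	static String genusOf(String mention)
	{
		Matcher m = Pattern.compile("^([A-Za-z]+) ").matcher(mention);
		return m.find() ? m.group(1) : "";
	}

	public static void main(String[] args)
	{
		System.out.println(genusOf("Escherichia coli K-12")); // Escherichia
		System.out.println(genusOf("yeast"));                 // empty: no genus token
	}
}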
261 |
-
/** Genus recognition */
|
262 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
263 |
-
{
|
264 |
-
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
265 |
-
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
266 |
-
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
267 |
-
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
268 |
-
)
|
269 |
-
{
|
270 |
-
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
271 |
-
ArrayList<String> locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus"); /** PT_Genus*/
|
272 |
-
for (int k = 0 ; k < locations_Genus.size() ; k++)
|
273 |
-
{
|
274 |
-
String anno[]=locations_Genus.get(k).split("\t");
|
275 |
-
String start= anno[0];
|
276 |
-
String last= anno[1];
|
277 |
-
String mention = anno[2];
|
278 |
-
String id = anno[3];
|
279 |
-
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
280 |
-
{
|
281 |
-
String patt="^\\**([0-9]+)$";
|
282 |
-
Pattern ptmp = Pattern.compile(patt);
|
283 |
-
Matcher mtmp = ptmp.matcher(id);
|
284 |
-
if(mtmp.find())
|
285 |
-
{
|
286 |
-
id = mtmp.group(1);
|
287 |
-
}
|
288 |
-
|
289 |
-
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
290 |
-
{
|
291 |
-
id="*"+id;
|
292 |
-
}
|
293 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id);
|
294 |
-
String mentions_tmp=mention.toLowerCase();
|
295 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
296 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
297 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
298 |
-
IDset.add(id);
|
299 |
-
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
300 |
-
{
|
301 |
-
TargetedLocation.add(j+"\t"+s);
|
302 |
-
}
|
303 |
-
}
|
304 |
-
}
|
305 |
-
}
|
306 |
-
}
|
307 |
-
|
308 |
-
/** Strain Tree */
|
309 |
-
PrefixTree PT_Strain = new PrefixTree();
|
310 |
-
HashMap<String, String> StrainID_hash = new HashMap<String, String>();
|
311 |
-
BufferedReader br = new BufferedReader(new FileReader(StrainFilename));
|
312 |
-
String line="";
|
313 |
-
while ((line = br.readLine()) != null)
|
314 |
-
{
|
315 |
-
String l[]=line.split("\t");
|
316 |
-
String ancestor = l[0];
|
317 |
-
String tax_id = l[1];
|
318 |
-
String tax_names = l[2];
|
319 |
-
if(SPID_hash.containsKey(ancestor))
|
320 |
-
{
|
321 |
-
StrainID_hash.put(tax_id, tax_names); // tax id -> strain
|
322 |
-
}
|
323 |
-
else if(SPID_hash.containsKey(tax_id))
|
324 |
-
{
|
325 |
-
StrainID_hash.put(tax_id, tax_names); // tax id -> strain
|
326 |
-
}
|
327 |
-
}
|
328 |
-
br.close();
|
329 |
-
HashMap<String, String> StrainNames = new HashMap<String, String>();
|
330 |
-
for(String ID: StrainID_hash.keySet())
|
331 |
-
{
|
332 |
-
StrainNames.put(ID,StrainID_hash.get(ID));
|
333 |
-
}
|
334 |
-
|
335 |
-
PT_Strain.Hash2Tree(StrainNames);
|
336 |
-
|
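The strain lexicon is loaded from a three-column tab-separated file (ancestor tax ID, strain tax ID, strain names), and an entry is kept only when its ancestor, or the strain itself, was already recognized as a species in the document. A minimal reader for that format, with the file path hypothetical and the column meaning inferred from the loop above:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

public class StrainFileSketch
{
	public static void main(String[] args) throws IOException
	{
		HashMap<String, String> strains = new HashMap<String, String>();
		BufferedReader br = new BufferedReader(new FileReader("strain.txt")); // hypothetical path
		String line;
		while ((line = br.readLine()) != null)
		{
			String[] l = line.split("\t"); // ancestor \t tax_id \t tax_names
			if (l.length >= 3)
			{
				strains.put(l[1], l[2]); // strain tax id -> names
			}
		}
		br.close();
		System.out.println(strains.size());
	}
}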
337 |
-
/** Strain recognition */
|
338 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
339 |
-
{
|
340 |
-
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
341 |
-
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
342 |
-
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
343 |
-
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
344 |
-
)
|
345 |
-
{
|
346 |
-
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
347 |
-
ArrayList<String> locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain"); /** PT_Strain*/
|
348 |
-
for (int k = 0 ; k < locations_Strain.size() ; k++)
|
349 |
-
{
|
350 |
-
String anno[]=locations_Strain.get(k).split("\t");
|
351 |
-
String start= anno[0];
|
352 |
-
String last= anno[1];
|
353 |
-
String mention = anno[2];
|
354 |
-
String id = anno[3];
|
355 |
-
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
356 |
-
{
|
357 |
-
if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}"))
|
358 |
-
{
|
359 |
-
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
360 |
-
{
|
361 |
-
id="*"+id;
|
362 |
-
}
|
363 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id);
|
364 |
-
String mentions_tmp=mention.toLowerCase();
|
365 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
366 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
367 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
368 |
-
IDset.add(id);
|
369 |
-
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
370 |
-
{
|
371 |
-
TargetedLocation.add(j+"\t"+s);
|
372 |
-
}
|
373 |
-
}
|
374 |
-
}
|
375 |
-
}
|
376 |
-
}
|
377 |
-
}
|
378 |
-
|
379 |
-
HashMap<String, String> OtherNames = new HashMap<String, String>();
|
380 |
-
for(String men : Mention2ID_lc.keySet())
|
381 |
-
{
|
382 |
-
String men_id= Mention2ID_lc.get(men);
|
383 |
-
if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(Pmid+"\t"+men))
|
384 |
-
{
|
385 |
-
String Abb = GNormPlus.PmidLF2Abb_lc_hash.get(Pmid+"\t"+men);
|
386 |
-
// Abbreviation
|
387 |
-
if(OtherNames.containsKey(men_id))
|
388 |
-
{
|
389 |
-
OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb);
|
390 |
-
}
|
391 |
-
else
|
392 |
-
{
|
393 |
-
OtherNames.put(men_id,Abb);
|
394 |
-
}
|
395 |
-
}
|
396 |
-
String men_nospace=men.replaceAll(" ", "");
|
397 |
-
// no space
|
398 |
-
if(OtherNames.containsKey(men_id))
|
399 |
-
{
|
400 |
-
OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace);
|
401 |
-
}
|
402 |
-
else
|
403 |
-
{
|
404 |
-
OtherNames.put(men_id,men_nospace);
|
405 |
-
}
|
406 |
-
}
|
407 |
-
PrefixTree PT_Others = new PrefixTree();
|
408 |
-
PT_Others.Hash2Tree(OtherNames);
|
409 |
-
|
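PT_Others widens the dictionary with two derived surface forms per recognized species: the abbreviation registered for its long form, and the mention with spaces removed (so space-free spellings still match). A compact sketch of the variant generation, on toy data, using Map.merge for the "|"-joined values:

import java.util.HashMap;
import java.util.Map;

public class VariantNameSketch
{
	public static void main(String[] args)
	{
		// Lowercased mention -> tax id, as collected during species recognition.
		Map<String, String> mention2id = new HashMap<String, String>();
		mention2id.put("escherichia coli", "562");

		// tax id -> "|"-joined extra surface forms (here: the space-free variant).
		Map<String, String> otherNames = new HashMap<String, String>();
		for (Map.Entry<String, String> e : mention2id.entrySet())
		{
			String noSpace = e.getKey().replaceAll(" ", "");
			otherNames.merge(e.getValue(), noSpace, (a, b) -> a + "|" + b);
		}
		System.out.println(otherNames); // {562=escherichiacoli}
	}
}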
410 |
-
/**
|
411 |
-
*
|
412 |
-
* Others:
|
413 |
-
* 1) Abbreviation
|
414 |
-
* 2) no space
|
415 |
-
*
|
416 |
-
* */
|
417 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
418 |
-
{
|
419 |
-
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
420 |
-
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
421 |
-
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
422 |
-
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
423 |
-
)
|
424 |
-
{
|
425 |
-
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
426 |
-
ArrayList<String> locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species"); /** PT_Abb*/
|
427 |
-
for (int k = 0 ; k < locations_Abb.size() ; k++)
|
428 |
-
{
|
429 |
-
String anno[]=locations_Abb.get(k).split("\t");
|
430 |
-
String start= anno[0];
|
431 |
-
String last= anno[1];
|
432 |
-
String mention = anno[2];
|
433 |
-
String id = anno[3];
|
434 |
-
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
435 |
-
{
|
436 |
-
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
437 |
-
{
|
438 |
-
id="*"+id;
|
439 |
-
}
|
440 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
|
441 |
-
String mentions_tmp=mention.toLowerCase();
|
442 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
443 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
444 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
445 |
-
Mention2ID_lc.put(mention.toLowerCase(), id);
|
446 |
-
IDset.add(id);
|
447 |
-
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
448 |
-
{
|
449 |
-
TargetedLocation.add(j+"\t"+s);
|
450 |
-
}
|
451 |
-
}
|
452 |
-
}
|
453 |
-
}
|
454 |
-
}
|
455 |
-
|
456 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
457 |
-
{
|
458 |
-
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
459 |
-
{
|
460 |
-
ArrayList <Integer> remove_anno = new ArrayList <Integer>();
|
461 |
-
for (int a = 0; a < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); a++) /** Annotations : a */
|
462 |
-
{
|
463 |
-
String SpAnno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a).split("\t");
|
464 |
-
String start= SpAnno[0];
|
465 |
-
String last= SpAnno[1];
|
466 |
-
String mention = SpAnno[2];
|
467 |
-
String type = SpAnno[3];
|
468 |
-
|
469 |
-
if(type.matches("Gene|FamilyName"))
|
470 |
-
{
|
471 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\t"+type);
|
472 |
-
}
|
473 |
-
else if(type.matches("Species|Genus|Strain|Cell") && SpAnno.length==5)
|
474 |
-
{
|
475 |
-
//System.out.println(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a));
|
476 |
-
/** Abbreviation solution */
|
477 |
-
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase()))) // look the long form up with get(), not containsKey()
|
478 |
-
{
|
479 |
-
String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase());
|
480 |
-
if(Mention2ID_lc.containsKey(LF_lc))
|
481 |
-
{
|
482 |
-
String LF_ID=Mention2ID_lc.get(LF_lc);
|
483 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID);
|
484 |
-
String mentions_tmp=mention.toLowerCase();
|
485 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
486 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
487 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
488 |
-
}
|
489 |
-
}
|
490 |
-
else if (SpAnno.length>4)
|
491 |
-
{
|
492 |
-
String id = SpAnno[4];
|
493 |
-
String id_split[]=id.split(";");
|
494 |
-
if(id_split.length>=2)
|
495 |
-
{
|
496 |
-
/** Smallest set of tax ids */
|
497 |
-
boolean found=false;
|
498 |
-
for(int x=0;x<IDset.size();x++)
|
499 |
-
{
|
500 |
-
String id_tmp= IDset.get(x);
|
501 |
-
for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
|
502 |
-
{
|
503 |
-
if(id_split[y].equals(id_tmp))
|
504 |
-
{
|
505 |
-
found=true;
|
506 |
-
}
|
507 |
-
}
|
508 |
-
if(found == true)
|
509 |
-
{
|
510 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id_tmp);
|
511 |
-
String mentions_tmp=mention.toLowerCase();
|
512 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
513 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
514 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
515 |
-
break; // stop scanning IDset once a candidate id matched
|
516 |
-
}
|
517 |
-
}
|
518 |
-
|
519 |
-
/** smallest tax id number */
|
520 |
-
if(found == false)
|
521 |
-
{
|
522 |
-
int min=10000000;
|
523 |
-
String min_id="";
|
524 |
-
for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
|
525 |
-
{
|
526 |
-
String id_tmp = id_split[y];
|
527 |
-
String patt="^\\**([0-9]+)";
|
528 |
-
Pattern ptmp = Pattern.compile(patt);
|
529 |
-
Matcher mtmp = ptmp.matcher(id_tmp);
|
530 |
-
if(mtmp.find())
|
531 |
-
{
|
532 |
-
id_tmp = mtmp.group(1);
|
533 |
-
}
|
534 |
-
|
535 |
-
if(y==0)
|
536 |
-
{
|
537 |
-
min_id=id_split[y];
|
538 |
-
min=Integer.parseInt(id_tmp);
|
539 |
-
}
|
540 |
-
else if(Integer.parseInt(id_tmp)<min)
|
541 |
-
{
|
542 |
-
min=Integer.parseInt(id_tmp);
|
543 |
-
min_id=id_tmp;
|
544 |
-
}
|
545 |
-
}
|
546 |
-
if(GNormPlus.taxid4gene.contains(min_id)) // for gene
|
547 |
-
{
|
548 |
-
min_id="*"+min_id;
|
549 |
-
}
|
550 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\tSpecies\t"+min_id);
|
551 |
-
String mentions_tmp=mention.toLowerCase();
|
552 |
-
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
553 |
-
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
554 |
-
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
555 |
-
}
|
556 |
-
}
|
557 |
-
}
|
558 |
-
}
|
559 |
-
else //disease, and other concepts
|
560 |
-
{
|
561 |
-
remove_anno.add(a);
|
562 |
-
}
|
563 |
-
}
|
564 |
-
|
565 |
-
Collections.sort(remove_anno);
|
566 |
-
for (int counter = remove_anno.size()-1; counter >= 0 ; counter--)
|
567 |
-
{
|
568 |
-
int ai=remove_anno.get(counter);
|
569 |
-
//System.out.println("\n"+ai+"\t"+GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(ai));
|
570 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(ai);
|
571 |
-
}
|
572 |
-
}
|
573 |
-
}
|
574 |
-
}
|
575 |
-
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); //save in BioC file
|
576 |
-
}
|
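When a mention carries several candidate tax IDs ("id1;id2;..."), the method above first prefers any candidate already seen elsewhere in the document (IDset) and otherwise keeps the numerically smallest ID, ignoring a leading "*" gene-taxid marker. The numeric fallback in isolation, as a sketch:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SmallestTaxIdSketch
{
	static String smallest(String ids)
	{
		Pattern num = Pattern.compile("^\\**([0-9]+)");
		String minId = "";
		int min = Integer.MAX_VALUE;
		for (String id : ids.split(";"))
		{
			Matcher m = num.matcher(id);
			if (m.find() && Integer.parseInt(m.group(1)) < min)
			{
				min = Integer.parseInt(m.group(1));
				minId = m.group(1);
			}
		}
		return minId;
	}

	public static void main(String[] args)
	{
		System.out.println(smallest("511145;*562;83333")); // 562
	}
}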
577 |
-
public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOException, XMLStreamException
|
578 |
-
{
|
579 |
-
GNormPlus.BioCDocobj.Annotations = new ArrayList();
|
580 |
-
GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename);
|
581 |
-
|
582 |
-
BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
|
583 |
-
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
|
584 |
-
{
|
585 |
-
HashMap<String, String> PrefixIDTarget_hash = new HashMap<String, String>();
|
586 |
-
PrefixIDTarget_hash.put("9606", "h");
|
587 |
-
PrefixIDTarget_hash.put("10090", "m");
|
588 |
-
PrefixIDTarget_hash.put("10116", "r");
|
589 |
-
PrefixIDTarget_hash.put("4932", "y");
|
590 |
-
PrefixIDTarget_hash.put("7227", "d");
|
591 |
-
PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr");
|
592 |
-
PrefixIDTarget_hash.put("3702", "at|At");
|
593 |
-
|
594 |
-
HashMap<String, Double> SP2Num_hash = new HashMap<String, Double>();
|
595 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
|
596 |
-
{
|
597 |
-
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
|
598 |
-
{
|
599 |
-
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
600 |
-
if(anno.length==5) //Species
|
601 |
-
{
|
602 |
-
String patt="^\\**([0-9]+)$";
|
603 |
-
Pattern ptmp = Pattern.compile(patt);
|
604 |
-
Matcher mtmp = ptmp.matcher(anno[4]);
|
605 |
-
if(mtmp.find())
|
606 |
-
{
|
607 |
-
String id = mtmp.group(1);
|
608 |
-
|
609 |
-
if(!PrefixIDTarget_hash.containsKey(id))
|
610 |
-
{
|
611 |
-
PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id)); // taxid -> prefix
|
612 |
-
}
|
613 |
-
if(j == 0)//title
|
614 |
-
{
|
615 |
-
if(SP2Num_hash.containsKey(id))
|
616 |
-
{
|
617 |
-
SP2Num_hash.put(id, SP2Num_hash.get(id)+2);
|
618 |
-
}
|
619 |
-
else
|
620 |
-
{
|
621 |
-
if(GNormPlus.TaxFreq_hash.containsKey(id))
|
622 |
-
{
|
623 |
-
SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2);
|
624 |
-
}
|
625 |
-
else
|
626 |
-
{
|
627 |
-
SP2Num_hash.put(id, 2.0);
|
628 |
-
}
|
629 |
-
}
|
630 |
-
// Virus -> Human (not to double weight human to virus)
|
631 |
-
/*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
|
632 |
-
{
|
633 |
-
if(SP2Num_hash.containsKey("9606"))
|
634 |
-
{
|
635 |
-
SP2Num_hash.put("9606", SP2Num_hash.get("9606")+2);
|
636 |
-
}
|
637 |
-
else
|
638 |
-
{
|
639 |
-
SP2Num_hash.put("9606", 2 + GNormPlus.TaxFreq_hash.get("9606")+1);
|
640 |
-
}
|
641 |
-
}*/
|
642 |
-
}
|
643 |
-
else
|
644 |
-
{
|
645 |
-
if(SP2Num_hash.containsKey(id))
|
646 |
-
{
|
647 |
-
SP2Num_hash.put(id, SP2Num_hash.get(id)+1);
|
648 |
-
}
|
649 |
-
else
|
650 |
-
{
|
651 |
-
if(GNormPlus.TaxFreq_hash.containsKey(id))
|
652 |
-
{
|
653 |
-
SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id));
|
654 |
-
}
|
655 |
-
else
|
656 |
-
{
|
657 |
-
SP2Num_hash.put(id, 1.0);
|
658 |
-
}
|
659 |
-
}
|
660 |
-
// Virus -> Human
|
661 |
-
/*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
|
662 |
-
{
|
663 |
-
if(SP2Num_hash.containsKey("9606"))
|
664 |
-
{
|
665 |
-
SP2Num_hash.put("9606", SP2Num_hash.get("9606")+1);
|
666 |
-
}
|
667 |
-
else
|
668 |
-
{
|
669 |
-
SP2Num_hash.put("9606", GNormPlus.TaxFreq_hash.get("9606")+1);
|
670 |
-
}
|
671 |
-
}*/
|
672 |
-
}
|
673 |
-
}
|
674 |
-
}
|
675 |
-
}
|
676 |
-
}
|
677 |
-
String MajorSP="9606";
|
678 |
-
double MaxSP=0;
|
679 |
-
for(String tid : SP2Num_hash.keySet())
|
680 |
-
{
|
681 |
-
if(SP2Num_hash.get(tid)>MaxSP)
|
682 |
-
{
|
683 |
-
MajorSP=tid;
|
684 |
-
MaxSP=SP2Num_hash.get(tid);
|
685 |
-
}
|
686 |
-
}
|
687 |
-
|
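MajorSP defaults to human (9606) and is replaced by whichever tax ID accumulated the highest weighted count above (title hits count double, plus the TaxFreq_hash corpus-frequency prior). The selection itself is a plain argmax over the hash:

import java.util.HashMap;
import java.util.Map;

public class MajorSpeciesSketch
{
	public static void main(String[] args)
	{
		Map<String, Double> sp2num = new HashMap<String, Double>();
		sp2num.put("9606", 3.0);  // human
		sp2num.put("10090", 5.5); // mouse
		String major = "9606";    // default, as above
		double max = 0;
		for (Map.Entry<String, Double> e : sp2num.entrySet())
		{
			if (e.getValue() > max)
			{
				major = e.getKey();
				max = e.getValue();
			}
		}
		System.out.println(major); // 10090
	}
}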
688 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
|
689 |
-
{
|
690 |
-
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
691 |
-
//int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset
|
692 |
-
iterator.setText(PassageContext);
|
693 |
-
ArrayList<Integer> Sentence_offsets = new ArrayList<Integer>();
|
694 |
-
int Sent_start = iterator.first();
|
695 |
-
for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next())
|
696 |
-
{
|
697 |
-
Sentence_offsets.add(Sent_start);
|
698 |
-
}
|
699 |
-
|
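Sentence boundaries come from java.text.BreakIterator; the loop above records the start offset of every sentence so a gene mention can later be paired with species mentions in the same sentence. The same offset collection on a toy passage:

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Locale;

public class SentenceOffsetSketch
{
	public static void main(String[] args)
	{
		BreakIterator it = BreakIterator.getSentenceInstance(Locale.US);
		String passage = "BRCA1 is a human gene. Brca1 is studied in mice.";
		it.setText(passage);
		ArrayList<Integer> offsets = new ArrayList<Integer>();
		for (int start = it.first(), end = it.next(); end != BreakIterator.DONE; start = end, end = it.next())
		{
			offsets.add(start);
		}
		System.out.println(offsets); // [0, 23]
	}
}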
700 |
-
HashMap<Integer,String> Annotations_Gene_hash = new HashMap<Integer,String>();
|
701 |
-
ArrayList<String> Annotations_Species = new ArrayList<String>();
|
702 |
-
if(GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
703 |
-
{
|
704 |
-
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
|
705 |
-
{
|
706 |
-
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
707 |
-
if(anno.length==5) //Species
|
708 |
-
{
|
709 |
-
Annotations_Species.add(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k));
|
710 |
-
}
|
711 |
-
else //Gene : if(anno.length==3)
|
712 |
-
{
|
713 |
-
//String mention = PassageContext.substring(Integer.parseInt(anno[0]), Integer.parseInt(anno[1]));
|
714 |
-
Annotations_Gene_hash.put(k,GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)); // k -> Gene Annotation
|
715 |
-
}
|
716 |
-
}
|
717 |
-
|
718 |
-
//Gene --> Species Inference (PMID:28777492)
|
719 |
-
HashMap<String,HashMap<Integer,String>> mention2Location2Species_hash = new HashMap<String,HashMap<Integer,String>>();
|
720 |
-
HashMap<Integer,String> Location2Species_hash = new HashMap<Integer,String>();
|
721 |
-
for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
|
722 |
-
{
|
723 |
-
boolean SPfound = false;
|
724 |
-
String anno[] = Annotations_Gene_hash.get(k).split("\t");
|
725 |
-
int G_Start= Integer.parseInt(anno[0]);
|
726 |
-
int G_Last= Integer.parseInt(anno[1]);
|
727 |
-
String G_mentions = anno[2];
|
728 |
-
/**
|
729 |
-
* 2. Co-occurring word
|
730 |
-
* boundary :
|
731 |
-
* Sentence Start: Sentence_offsets.get(Target_Sentence)
|
732 |
-
* Sentence Last: Sentence_offsets.get(Target_Sentence+1)
|
733 |
-
*/
|
734 |
-
//Find the target sentence
|
735 |
-
int Target_Sentence=0;
|
736 |
-
if(SPfound == false) // locate the sentence containing the gene mention
|
737 |
-
{
|
738 |
-
for(int s=0;s<Sentence_offsets.size();s++)
|
739 |
-
|
740 |
-
{
|
741 |
-
int Sentence_last=1000000;
|
742 |
-
if(s<Sentence_offsets.size()-1)
|
743 |
-
{
|
744 |
-
Sentence_last=Sentence_offsets.get(s+1);
|
745 |
-
}
|
746 |
-
if(G_Start<Sentence_last)
|
747 |
-
{
|
748 |
-
Target_Sentence=s;
|
749 |
-
break;
|
750 |
-
}
|
751 |
-
}
|
752 |
-
}
|
753 |
-
int Sentence_Start = Sentence_offsets.get(Target_Sentence);
|
754 |
-
int Sentence_Last = 1000000;
|
755 |
-
if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
|
756 |
-
if(SPfound == false) // 1. left : closest species to the start of the gene mention
|
757 |
-
{
|
758 |
-
int closet_Sp_Start=0;
|
759 |
-
for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
|
760 |
-
{
|
761 |
-
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
762 |
-
int Sp_Start = Integer.parseInt(AnnoSp[0]);
|
763 |
-
String patt="^\\**([0-9]+)$";
|
764 |
-
Pattern ptmp = Pattern.compile(patt);
|
765 |
-
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
766 |
-
if(mtmp.find())
|
767 |
-
{
|
768 |
-
String taxid = mtmp.group(1);
|
769 |
-
Location2Species_hash.put(Sp_Start,taxid);
|
770 |
-
if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
|
771 |
-
{
|
772 |
-
closet_Sp_Start=Sp_Start;
|
773 |
-
Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
|
774 |
-
|
775 |
-
if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
|
776 |
-
{
|
777 |
-
mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
|
778 |
-
}
|
779 |
-
else
|
780 |
-
{
|
781 |
-
mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
|
782 |
-
}
|
783 |
-
|
784 |
-
SPfound=true;
|
785 |
-
}
|
786 |
-
}
|
787 |
-
}
|
788 |
-
}
|
789 |
-
if(SPfound == false) // 2. right : closest species to the end of the gene mention
|
790 |
-
{
|
791 |
-
int closet_Sp_Last=1000000;
|
792 |
-
for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
|
793 |
-
{
|
794 |
-
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
795 |
-
int Sp_Last = Integer.parseInt(AnnoSp[1]);
|
796 |
-
String patt="^\\**([0-9]+)$";
|
797 |
-
Pattern ptmp = Pattern.compile(patt);
|
798 |
-
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
799 |
-
if(mtmp.find())
|
800 |
-
{
|
801 |
-
String taxid = mtmp.group(1);
|
802 |
-
if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
|
803 |
-
{
|
804 |
-
closet_Sp_Last=Sp_Last;
|
805 |
-
Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
|
806 |
-
|
807 |
-
if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
|
808 |
-
{
|
809 |
-
mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
|
810 |
-
}
|
811 |
-
else
|
812 |
-
{
|
813 |
-
mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
|
814 |
-
}
|
815 |
-
|
816 |
-
SPfound=true;
|
817 |
-
}
|
818 |
-
}
|
819 |
-
}
|
820 |
-
}
|
821 |
-
}
|
822 |
-
|
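This pre-pass records, for each gene surface form, the tax ID of the nearest species mention: first the closest one to the left of the gene within the sentence, then, failing that, the closest one to the right. A simplified sketch of that nearest-neighbour rule (arrays and offsets are toy inputs; the real code additionally tracks per-mention hashes):

public class NearestSpeciesSketch
{
	static String assign(int geneStart, int geneLast, int[] spStarts, String[] spTaxIds, int sentStart, int sentLast)
	{
		String best = null;
		int bestPos = -1;
		for (int i = 0; i < spStarts.length; i++) // 1. closest species on the left
		{
			if (spStarts[i] <= geneStart && spStarts[i] >= sentStart && spStarts[i] > bestPos)
			{
				bestPos = spStarts[i];
				best = spTaxIds[i];
			}
		}
		if (best != null) { return best; }
		bestPos = Integer.MAX_VALUE;
		for (int i = 0; i < spStarts.length; i++) // 2. closest species on the right
		{
			if (spStarts[i] >= geneLast && spStarts[i] <= sentLast && spStarts[i] < bestPos)
			{
				bestPos = spStarts[i];
				best = spTaxIds[i];
			}
		}
		return best; // null -> fall back to the focus species
	}

	public static void main(String[] args)
	{
		System.out.println(assign(30, 35, new int[]{ 5, 50 }, new String[]{ "9606", "10090" }, 0, 80)); // 9606
	}
}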
823 |
-
for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
|
824 |
-
{
|
825 |
-
String anno[] = Annotations_Gene_hash.get(k).split("\t");
|
826 |
-
int G_Start= Integer.parseInt(anno[0]);
|
827 |
-
int G_Last= Integer.parseInt(anno[1]);
|
828 |
-
String G_mentions = anno[2];
|
829 |
-
String G_type = anno[3];
|
830 |
-
String G_mention_list[]=G_mentions.split("\\|");
|
831 |
-
String G_mention=G_mention_list[0]; // only use the first term to detect the species; should be updated after SimConcept
|
832 |
-
|
833 |
-
/** 1. prefix */
|
834 |
-
boolean SPfound = false;
|
835 |
-
for(String taxid: PrefixIDTarget_hash.keySet())
|
836 |
-
{
|
837 |
-
if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase()))
|
838 |
-
{
|
839 |
-
//special case, and no need for prefix - SA
|
840 |
-
}
|
841 |
-
else
|
842 |
-
{
|
843 |
-
Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$");
|
844 |
-
Matcher mtmp = ptmp.matcher(G_mention);
|
845 |
-
if(mtmp.find())
|
846 |
-
{
|
847 |
-
String MentionWoPrefix=mtmp.group(2);
|
848 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid);
|
849 |
-
SPfound=true;
|
850 |
-
break;
|
851 |
-
}
|
852 |
-
}
|
853 |
-
}
|
854 |
-
|
855 |
-
/**
|
856 |
-
* 2. Co-occurring word
|
857 |
-
* boundary :
|
858 |
-
* Sentence Start: Sentence_offsets.get(Target_Sentence)
|
859 |
-
* Sentence Last: Sentence_offsets.get(Target_Sentence+1)
|
860 |
-
*/
|
861 |
-
//Find the target sentence
|
862 |
-
int Target_Sentence=0;
|
863 |
-
if(SPfound == false) // locate the sentence containing the gene mention
|
864 |
-
{
|
865 |
-
for(int s=0;s<Sentence_offsets.size();s++)
|
866 |
-
|
867 |
-
{
|
868 |
-
int Sentence_last=1000000;
|
869 |
-
if(s<Sentence_offsets.size()-1)
|
870 |
-
{
|
871 |
-
Sentence_last=Sentence_offsets.get(s+1);
|
872 |
-
}
|
873 |
-
if(G_Start<Sentence_last)
|
874 |
-
{
|
875 |
-
Target_Sentence=s;
|
876 |
-
break;
|
877 |
-
}
|
878 |
-
}
|
879 |
-
}
|
880 |
-
int Sentence_Start = Sentence_offsets.get(Target_Sentence);
|
881 |
-
int Sentence_Last = 1000000;
|
882 |
-
if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
|
883 |
-
if(SPfound == false) // 1. left : closest species to the start of the gene mention
|
884 |
-
{
|
885 |
-
int closet_Sp_Start=0;
|
886 |
-
for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
|
887 |
-
{
|
888 |
-
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
889 |
-
int Sp_Start = Integer.parseInt(AnnoSp[0]);
|
890 |
-
String patt="^\\**([0-9]+)$";
|
891 |
-
Pattern ptmp = Pattern.compile(patt);
|
892 |
-
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
893 |
-
if(mtmp.find())
|
894 |
-
{
|
895 |
-
String taxid = mtmp.group(1);
|
896 |
-
if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
|
897 |
-
{
|
898 |
-
closet_Sp_Start=Sp_Start;
|
899 |
-
if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
|
900 |
-
{
|
901 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606");
|
902 |
-
}
|
903 |
-
else
|
904 |
-
{
|
905 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid);
|
906 |
-
}
|
907 |
-
SPfound=true;
|
908 |
-
}
|
909 |
-
}
|
910 |
-
}
|
911 |
-
}
|
912 |
-
if(SPfound == false) // 2. right : closest species to the end of the gene mention
|
913 |
-
{
|
914 |
-
int closet_Sp_Last=1000000;
|
915 |
-
for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
|
916 |
-
{
|
917 |
-
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
918 |
-
int Sp_Last = Integer.parseInt(AnnoSp[1]);
|
919 |
-
String patt="^\\**([0-9]+)$";
|
920 |
-
Pattern ptmp = Pattern.compile(patt);
|
921 |
-
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
922 |
-
if(mtmp.find())
|
923 |
-
{
|
924 |
-
String taxid = mtmp.group(1);
|
925 |
-
if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
|
926 |
-
{
|
927 |
-
closet_Sp_Last=Sp_Last;
|
928 |
-
if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
|
929 |
-
{
|
930 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606");
|
931 |
-
}
|
932 |
-
else
|
933 |
-
{
|
934 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid);
|
935 |
-
}
|
936 |
-
SPfound=true;
|
937 |
-
}
|
938 |
-
}
|
939 |
-
}
|
940 |
-
}
|
941 |
-
|
942 |
-
/** 3. Focus species */
|
943 |
-
if(SPfound == false) // 3. fall back to the focus species
|
944 |
-
{
|
945 |
-
// 1. only mentions that appeared earlier in the text are inferred
|
946 |
-
//
|
947 |
-
if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
|
948 |
-
{
|
949 |
-
int closed_loca=0;
|
950 |
-
for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet())
|
951 |
-
{
|
952 |
-
if(loca_start<G_Start)
|
953 |
-
{
|
954 |
-
if(loca_start>closed_loca)
|
955 |
-
{
|
956 |
-
closed_loca=loca_start;
|
957 |
-
}
|
958 |
-
}
|
959 |
-
}
|
960 |
-
if(closed_loca>0)
|
961 |
-
{
|
962 |
-
if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca)))
|
963 |
-
{
|
964 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606");
|
965 |
-
}
|
966 |
-
else
|
967 |
-
{
|
968 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca));
|
969 |
-
}
|
970 |
-
}
|
971 |
-
else
|
972 |
-
{
|
973 |
-
if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
|
974 |
-
{
|
975 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
|
976 |
-
}
|
977 |
-
else
|
978 |
-
{
|
979 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
|
980 |
-
}
|
981 |
-
}
|
982 |
-
}
|
983 |
-
else
|
984 |
-
{
|
985 |
-
if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
|
986 |
-
{
|
987 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
|
988 |
-
}
|
989 |
-
else
|
990 |
-
{
|
991 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
|
992 |
-
}
|
993 |
-
}
|
994 |
-
}
|
995 |
-
}
|
996 |
-
}
|
997 |
-
}
|
998 |
-
}
|
999 |
-
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
1000 |
-
}
|
1001 |
-
public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException
|
1002 |
-
{
|
1003 |
-
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
|
1004 |
-
{
|
1005 |
-
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
|
1006 |
-
{
|
1007 |
-
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
|
1008 |
-
{
|
1009 |
-
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
1010 |
-
if(anno.length==5) //Species
|
1011 |
-
{
|
1012 |
-
String id=anno[4].replaceAll("\\*", "");
|
1013 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id);
|
1014 |
-
}
|
1015 |
-
else //Gene : if(anno.length==3)
|
1016 |
-
{
|
1017 |
-
/** 1. prefix */
|
1018 |
-
boolean SPfound = false;
|
1019 |
-
if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase()))
|
1020 |
-
{
|
1021 |
-
//special case, and no need for prefix - SA
|
1022 |
-
}
|
1023 |
-
else
|
1024 |
-
{
|
1025 |
-
Pattern ptmp = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$");
|
1026 |
-
Matcher mtmp = ptmp.matcher(anno[2]);
|
1027 |
-
if(mtmp.find())
|
1028 |
-
{
|
1029 |
-
String MentionWoPrefix=mtmp.group(2);
|
1030 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies);
|
1031 |
-
SPfound=true;
|
1032 |
-
}
|
1033 |
-
}
|
1034 |
-
if(SPfound == false)
|
1035 |
-
{
|
1036 |
-
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)+"\tFocus:"+FocusSpecies);
|
1037 |
-
}
|
1038 |
-
}
|
1039 |
-
}
|
1040 |
-
}
|
1041 |
-
}
|
1042 |
-
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
1043 |
-
}
|
1044 |
}
|
1 |
+
/**
|
2 |
+
* Project: GNormPlus
|
3 |
+
* Function: Species recognition and Species assignment
|
4 |
+
*/
|
5 |
+
|
6 |
+
package GNormPluslib;
|
7 |
+
|
8 |
+
import bioc.BioCAnnotation;
|
9 |
+
import bioc.BioCCollection;
|
10 |
+
import bioc.BioCDocument;
|
11 |
+
import bioc.BioCLocation;
|
12 |
+
import bioc.BioCPassage;
|
13 |
+
|
14 |
+
import bioc.io.BioCDocumentWriter;
|
15 |
+
import bioc.io.BioCFactory;
|
16 |
+
import bioc.io.woodstox.ConnectorWoodstox;
|
17 |
+
import java.io.BufferedReader;
|
18 |
+
import java.io.BufferedWriter;
|
19 |
+
import java.io.FileInputStream;
|
20 |
+
import java.io.FileOutputStream;
|
21 |
+
import java.io.FileReader;
|
22 |
+
import java.io.FileWriter;
|
23 |
+
import java.io.IOException;
|
24 |
+
import java.io.InputStreamReader;
|
25 |
+
import java.io.OutputStreamWriter;
|
26 |
+
import java.text.BreakIterator;
|
27 |
+
import java.time.LocalDate;
|
28 |
+
import java.time.ZoneId;
|
29 |
+
|
30 |
+
import javax.xml.stream.XMLStreamException;
|
31 |
+
|
32 |
+
import org.tartarus.snowball.SnowballStemmer;
|
33 |
+
import org.tartarus.snowball.ext.englishStemmer;
|
34 |
+
|
35 |
+
import java.util.Map;
|
36 |
+
import java.util.regex.Matcher;
|
37 |
+
import java.util.regex.Pattern;
|
38 |
+
import java.util.ArrayList;
|
39 |
+
import java.util.HashMap;
|
40 |
+
import java.util.List;
|
41 |
+
import java.util.Locale;
|
42 |
+
import java.util.Collections;
|
43 |
+
|
44 |
+
public class SR
|
45 |
+
{
|
46 |
+
@SuppressWarnings("null")
|
47 |
+
public void SpeciesRecognition(String Filename,String FilenameBioC,String StrainFilename,String FilterAntibody) throws IOException, XMLStreamException
|
48 |
+
{
|
49 |
+
/** Recognizing Species Names: SP */
|
50 |
+
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
|
51 |
+
{
|
52 |
+
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
53 |
+
PrefixTree PT_Genus = new PrefixTree();
|
54 |
+
HashMap<String, String> SPID_hash = new HashMap<String, String>();
|
55 |
+
ArrayList<String> TargetedLocation = new ArrayList<String>();
|
56 |
+
HashMap<String, String> GenusNames = new HashMap<String, String>();
|
57 |
+
HashMap<String, String> Mention2ID_lc = new HashMap<String, String>();
|
58 |
+
ArrayList<String> IDset = new ArrayList<String>();
|
59 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
60 |
+
{
|
61 |
+
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
62 |
+
|
63 |
+
/** Species recognition */
|
64 |
+
ArrayList<String> locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species"); /** PT_Species */
|
65 |
+
for (int k = 0 ; k < locations.size() ; k++)
|
66 |
+
{
|
67 |
+
String anno[]=locations.get(k).split("\t");
|
68 |
+
int start= Integer.parseInt(anno[0]);
|
69 |
+
int last= Integer.parseInt(anno[1]);
|
70 |
+
|
71 |
+
// For anti-serum filtering
|
72 |
+
String ForwardSTR="";
|
73 |
+
String BackwardSTR="";
|
74 |
+
if(start>21)
|
75 |
+
{
|
76 |
+
ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
|
77 |
+
}
|
78 |
+
else
|
79 |
+
{
|
80 |
+
ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
|
81 |
+
}
|
82 |
+
if(PassageContext.length()>last+21)
|
83 |
+
{
|
84 |
+
BackwardSTR = PassageContext.substring(start,last+21);
|
85 |
+
}
|
86 |
+
else
|
87 |
+
{
|
88 |
+
BackwardSTR = PassageContext.substring(start,PassageContext.length());
|
89 |
+
}
|
90 |
+
|
91 |
+
String mention = anno[2];
|
92 |
+
String id = anno[3];
|
93 |
+
String mention_tmp=mention.toLowerCase();
|
94 |
+
mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
95 |
+
String antibody="";
|
96 |
+
if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";}//filtering : antibody
|
97 |
+
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
|
98 |
+
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
|
99 |
+
|
100 |
+
if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") )
|
101 |
+
{
|
102 |
+
last=last+1;
|
103 |
+
mention=mention+")";
|
104 |
+
}
|
105 |
+
|
106 |
+
if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){} // filtered: Bee1p
|
107 |
+
else if((mention.matches(".*[;:,].*")) && mention.length()<=10){} // filtered : x, XXX
|
108 |
+
else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){} // to 7
|
109 |
+
else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){} // s). Major
|
110 |
+
else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){} // s). Major
|
111 |
+
else if(!id.equals("NA"))
|
112 |
+
{
|
113 |
+
if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
114 |
+
{
|
115 |
+
if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3)) // invalid species: "a group/a GAL4/a strain"
|
116 |
+
{
|
117 |
+
if(FilterAntibody.equals("False") || (!antibody.equals("(anti)")))
|
118 |
+
{
|
119 |
+
String patt="^(.+?) [sS]train";
|
120 |
+
Pattern ptmp = Pattern.compile(patt);
|
121 |
+
Matcher mtmp = ptmp.matcher(mention);
|
122 |
+
if(mtmp.find())
|
123 |
+
{
|
124 |
+
mention=mtmp.group(1);
|
125 |
+
last=last-7;
|
126 |
+
}
|
127 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); //+antibody
|
128 |
+
String mentions_tmp=mention.toLowerCase();
|
129 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
130 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
131 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
132 |
+
Mention2ID_lc.put(mention.toLowerCase(), id); //+antibody
|
133 |
+
|
134 |
+
String mention_genus = "";
|
135 |
+
patt="^([A-Za-z]+) ";
|
136 |
+
ptmp = Pattern.compile(patt);
|
137 |
+
mtmp = ptmp.matcher(mention);
|
138 |
+
if(mtmp.find())
|
139 |
+
{
|
140 |
+
mention_genus=mtmp.group(1); // get genus
|
141 |
+
}
|
142 |
+
|
143 |
+
IDset.add(id);
|
144 |
+
for(int s=start;s<last;s++)
|
145 |
+
{
|
146 |
+
TargetedLocation.add(j+"\t"+s);
|
147 |
+
}
|
148 |
+
String ids[]=id.split(";");
|
149 |
+
for(int x=0;x<ids.length;x++)
|
150 |
+
{
|
151 |
+
patt="^\\**([0-9]+)";
|
152 |
+
ptmp = Pattern.compile(patt);
|
153 |
+
mtmp = ptmp.matcher(ids[x]);
|
154 |
+
if(mtmp.find())
|
155 |
+
{
|
156 |
+
SPID_hash.put(mtmp.group(1), mention_genus);
|
157 |
+
}
|
158 |
+
}
|
159 |
+
}
|
160 |
+
}
|
161 |
+
}
|
162 |
+
}
|
163 |
+
}
|
164 |
+
|
165 |
+
/** Cell Line recognition */
|
166 |
+
locations = GNormPlus.PT_Cell.SearchMentionLocation(PassageContext,"Cell"); /** PT_Cell */
|
167 |
+
for (int k = 0 ; k < locations.size() ; k++)
|
168 |
+
{
|
169 |
+
String anno[]=locations.get(k).split("\t");
|
170 |
+
int start= Integer.parseInt(anno[0]);
|
171 |
+
int last= Integer.parseInt(anno[1]);
|
172 |
+
String mention = anno[2];
|
173 |
+
String id = anno[3];
|
174 |
+
if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
175 |
+
{
|
176 |
+
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
177 |
+
{
|
178 |
+
int last40=0;
|
179 |
+
if(PassageContext.length()>=last+40)
|
180 |
+
{
|
181 |
+
last40=last+40;
|
182 |
+
}
|
183 |
+
else
|
184 |
+
{
|
185 |
+
last40=PassageContext.length();
|
186 |
+
}
|
187 |
+
|
188 |
+
// For anti-serum filtering
|
189 |
+
String ForwardSTR="";
|
190 |
+
String BackwardSTR="";
|
191 |
+
if(start>21)
|
192 |
+
{
|
193 |
+
ForwardSTR = PassageContext.substring(start-21,last);
|
194 |
+
}
|
195 |
+
else
|
196 |
+
{
|
197 |
+
ForwardSTR = PassageContext.substring(0,last);
|
198 |
+
}
|
199 |
+
if(PassageContext.length()>last+21)
|
200 |
+
{
|
201 |
+
BackwardSTR = PassageContext.substring(start,last+21);
|
202 |
+
}
|
203 |
+
else
|
204 |
+
{
|
205 |
+
BackwardSTR = PassageContext.substring(start,PassageContext.length());
|
206 |
+
}
|
207 |
+
String mention_tmp=mention.toLowerCase();
|
208 |
+
mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
209 |
+
if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){}
|
210 |
+
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){} // filtered: Bee1p
|
211 |
+
else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){} // filtered: IL-22RA1
|
212 |
+
else
|
213 |
+
{
|
214 |
+
String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]";
|
215 |
+
Pattern ptmp = Pattern.compile(patt);
|
216 |
+
Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase());
|
217 |
+
if(mtmp.find())
|
218 |
+
{
|
219 |
+
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
220 |
+
{
|
221 |
+
id="*"+id;
|
222 |
+
}
|
223 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id);
|
224 |
+
String mentions_tmp=mention.toLowerCase();
|
225 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
226 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
227 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
228 |
+
IDset.add(id);
|
229 |
+
for(int s=start;s<last;s++)
|
230 |
+
{
|
231 |
+
TargetedLocation.add(j+"\t"+s);
|
232 |
+
}
|
233 |
+
}
|
234 |
+
}
|
235 |
+
}
|
236 |
+
}
|
237 |
+
}
|
238 |
+
|
239 |
+
/** Genus names*/
|
240 |
+
for(String ID: SPID_hash.keySet())
|
241 |
+
{
|
242 |
+
if(GNormPlus.GenusID_hash.containsKey(ID))
|
243 |
+
{
|
244 |
+
GenusNames.put(ID,GNormPlus.GenusID_hash.get(ID));
|
245 |
+
}
|
246 |
+
if(SPID_hash.get(ID).length()>=7)
|
247 |
+
{
|
248 |
+
GenusNames.put(ID,SPID_hash.get(ID));
|
249 |
+
}
|
250 |
+
}
|
251 |
+
}
|
252 |
+
|
253 |
+
GenusNames.put("3702", "arabidopsis");
|
254 |
+
GenusNames.put("4932", "saccharomyces");
|
255 |
+
GenusNames.put("562", "escherichia");
|
256 |
+
GenusNames.put("7227", "drosophila");
|
257 |
+
GenusNames.put("8355", "xenopus");
|
258 |
+
|
259 |
+
PT_Genus.Hash2Tree(GenusNames);
|
260 |
+
|
261 |
+
/** Genus recognition */
|
262 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
263 |
+
{
|
264 |
+
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
265 |
+
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
266 |
+
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
267 |
+
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
268 |
+
)
|
269 |
+
{
|
270 |
+
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
271 |
+
ArrayList<String> locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus"); /** PT_Genus*/
|
272 |
+
for (int k = 0 ; k < locations_Genus.size() ; k++)
|
273 |
+
{
|
274 |
+
String anno[]=locations_Genus.get(k).split("\t");
|
275 |
+
String start= anno[0];
|
276 |
+
String last= anno[1];
|
277 |
+
String mention = anno[2];
|
278 |
+
String id = anno[3];
|
279 |
+
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
280 |
+
{
|
281 |
+
String patt="^\\**([0-9]+)$";
|
282 |
+
Pattern ptmp = Pattern.compile(patt);
|
283 |
+
Matcher mtmp = ptmp.matcher(id);
|
284 |
+
if(mtmp.find())
|
285 |
+
{
|
286 |
+
id = mtmp.group(1);
|
287 |
+
}
|
288 |
+
|
289 |
+
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
290 |
+
{
|
291 |
+
id="*"+id;
|
292 |
+
}
|
293 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id);
|
294 |
+
String mentions_tmp=mention.toLowerCase();
|
295 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
296 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
297 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
298 |
+
IDset.add(id);
|
299 |
+
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
300 |
+
{
|
301 |
+
TargetedLocation.add(j+"\t"+s);
|
302 |
+
}
|
303 |
+
}
|
304 |
+
}
|
305 |
+
}
|
306 |
+
}
|
307 |
+
|
308 |
+
/** Strain Tree */
|
309 |
+
PrefixTree PT_Strain = new PrefixTree();
|
310 |
+
HashMap<String, String> StrainID_hash = new HashMap<String, String>();
|
311 |
+
BufferedReader br = new BufferedReader(new FileReader(StrainFilename));
|
312 |
+
String line="";
|
313 |
+
while ((line = br.readLine()) != null)
|
314 |
+
{
|
315 |
+
String l[]=line.split("\t");
|
316 |
+
String ancestor = l[0];
|
317 |
+
String tax_id = l[1];
|
318 |
+
String tax_names = l[2];
|
319 |
+
if(SPID_hash.containsKey(ancestor))
|
320 |
+
{
|
321 |
+
StrainID_hash.put(tax_id, tax_names); // tax id -> strain
|
322 |
+
}
|
323 |
+
else if(SPID_hash.containsKey(tax_id))
|
324 |
+
{
|
325 |
+
StrainID_hash.put(tax_id, tax_names); // tax id -> strain
|
326 |
+
}
|
327 |
+
}
|
328 |
+
br.close();
|
329 |
+
HashMap<String, String> StrainNames = new HashMap<String, String>();
|
330 |
+
for(String ID: StrainID_hash.keySet())
|
331 |
+
{
|
332 |
+
StrainNames.put(ID,StrainID_hash.get(ID));
|
333 |
+
}
|
334 |
+
|
335 |
+
PT_Strain.Hash2Tree(StrainNames);
|
336 |
+
|
337 |
+
/** Strain recognition */
|
338 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
339 |
+
{
|
340 |
+
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
341 |
+
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
342 |
+
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
343 |
+
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
344 |
+
)
|
345 |
+
{
|
346 |
+
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
347 |
+
ArrayList<String> locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain"); /** PT_Strain*/
|
348 |
+
for (int k = 0 ; k < locations_Strain.size() ; k++)
|
349 |
+
{
|
350 |
+
String anno[]=locations_Strain.get(k).split("\t");
|
351 |
+
String start= anno[0];
|
352 |
+
String last= anno[1];
|
353 |
+
String mention = anno[2];
|
354 |
+
String id = anno[3];
|
355 |
+
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
356 |
+
{
|
357 |
+
if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}"))
|
358 |
+
{
|
359 |
+
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
360 |
+
{
|
361 |
+
id="*"+id;
|
362 |
+
}
|
363 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id);
|
364 |
+
String mentions_tmp=mention.toLowerCase();
|
365 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
366 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
367 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
368 |
+
IDset.add(id);
|
369 |
+
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
370 |
+
{
|
371 |
+
TargetedLocation.add(j+"\t"+s);
|
372 |
+
}
|
373 |
+
}
|
374 |
+
}
|
375 |
+
}
|
376 |
+
}
|
377 |
+
}
|
378 |
+
|
379 |
+
HashMap<String, String> OtherNames = new HashMap<String, String>();
|
380 |
+
for(String men : Mention2ID_lc.keySet())
|
381 |
+
{
|
382 |
+
String men_id= Mention2ID_lc.get(men);
|
383 |
+
if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(Pmid+"\t"+men))
|
384 |
+
{
|
385 |
+
String Abb = GNormPlus.PmidLF2Abb_lc_hash.get(Pmid+"\t"+men);
|
386 |
+
// Abbreviation
|
387 |
+
if(OtherNames.containsKey(men_id))
|
388 |
+
{
|
389 |
+
OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb);
|
390 |
+
}
|
391 |
+
else
|
392 |
+
{
|
393 |
+
OtherNames.put(men_id,Abb);
|
394 |
+
}
|
395 |
+
}
|
396 |
+
String men_nospace=men.replaceAll(" ", "");
|
397 |
+
// no space
|
398 |
+
if(OtherNames.containsKey(men_id))
|
399 |
+
{
|
400 |
+
OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace);
|
401 |
+
}
|
402 |
+
else
|
403 |
+
{
|
404 |
+
OtherNames.put(men_id,men_nospace);
|
405 |
+
}
|
406 |
+
}
|
407 |
+
PrefixTree PT_Others = new PrefixTree();
|
408 |
+
PT_Others.Hash2Tree(OtherNames);
|
409 |
+
|
410 |
+
/**
|
411 |
+
*
|
412 |
+
* Others:
|
413 |
+
* 1) Abbreviation
|
414 |
+
* 2) no space
|
415 |
+
*
|
416 |
+
* */
|
417 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
418 |
+
{
|
419 |
+
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
420 |
+
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
421 |
+
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
422 |
+
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
423 |
+
)
|
424 |
+
{
|
425 |
+
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
|
426 |
+
ArrayList<String> locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species"); /** PT_Abb*/
|
427 |
+
for (int k = 0 ; k < locations_Abb.size() ; k++)
|
428 |
+
{
|
429 |
+
String anno[]=locations_Abb.get(k).split("\t");
|
430 |
+
String start= anno[0];
|
431 |
+
String last= anno[1];
|
432 |
+
String mention = anno[2];
|
433 |
+
String id = anno[3];
|
434 |
+
if(!TargetedLocation.contains(j+"\t"+start)) // skip locations that are already annotated
|
435 |
+
{
|
436 |
+
if(GNormPlus.taxid4gene.contains(id)) // for gene
|
437 |
+
{
|
438 |
+
id="*"+id;
|
439 |
+
}
|
440 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
|
441 |
+
String mentions_tmp=mention.toLowerCase();
|
442 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
443 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
444 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
445 |
+
Mention2ID_lc.put(mention.toLowerCase(), id);
|
446 |
+
IDset.add(id);
|
447 |
+
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
448 |
+
{
|
449 |
+
TargetedLocation.add(j+"\t"+s);
|
450 |
+
}
|
451 |
+
}
|
452 |
+
}
|
453 |
+
}
|
454 |
+
}
|
455 |
+
|
456 |
+
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
|
457 |
+
{
|
458 |
+
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
459 |
+
{
|
460 |
+
ArrayList <Integer> remove_anno = new ArrayList <Integer>();
|
461 |
+
for (int a = 0; a < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); a++) /** Annotations : a */
|
462 |
+
{
|
463 |
+
String SpAnno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a).split("\t");
|
464 |
+
String start= SpAnno[0];
|
465 |
+
String last= SpAnno[1];
|
466 |
+
String mention = SpAnno[2];
|
467 |
+
String type = SpAnno[3];
|
468 |
+
|
469 |
+
if(type.matches("Gene|FamilyName"))
|
470 |
+
{
|
471 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\t"+type);
|
472 |
+
}
|
473 |
+
else if(type.matches("Species|Genus|Strain|Cell") && SpAnno.length==5)
|
474 |
+
{
|
475 |
+
//System.out.println(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a));
|
476 |
+
/** Abbreviation solution */
|
477 |
+
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase()))) // look the long form up with get(), not containsKey()
|
478 |
+
{
|
479 |
+
String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase());
|
480 |
+
if(Mention2ID_lc.containsKey(LF_lc))
|
481 |
+
{
|
482 |
+
String LF_ID=Mention2ID_lc.get(LF_lc);
|
483 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID);
|
484 |
+
String mentions_tmp=mention.toLowerCase();
|
485 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
486 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
487 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
488 |
+
}
|
489 |
+
}
|
490 |
+
else if (SpAnno.length>4)
|
491 |
+
{
|
492 |
+
String id = SpAnno[4];
|
493 |
+
String id_split[]=id.split(";");
|
494 |
+
if(id_split.length>=2)
|
495 |
+
{
|
496 |
+
/** Smallest set of tax ids */
|
497 |
+
boolean found=false;
|
498 |
+
for(int x=0;x<IDset.size();x++)
|
499 |
+
{
|
500 |
+
String id_tmp= IDset.get(x);
|
501 |
+
for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
|
502 |
+
{
|
503 |
+
if(id_split[y].equals(id_tmp))
|
504 |
+
{
|
505 |
+
found=true;
|
506 |
+
}
|
507 |
+
}
|
508 |
+
if(found == true)
|
509 |
+
{
|
510 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id_tmp);
|
511 |
+
String mentions_tmp=mention.toLowerCase();
|
512 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
513 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
514 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
515 |
+
x=1000000;
|
516 |
+
}
|
517 |
+
}
|
518 |
+
|
519 |
+
/** smallest tax id number */
|
520 |
+
if(found == false)
|
521 |
+
{
|
522 |
+
int min=10000000;
|
523 |
+
String min_id="";
|
524 |
+
for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
|
525 |
+
{
|
526 |
+
String id_tmp = id_split[y];
|
527 |
+
String patt="^\\**([0-9]+)";
|
528 |
+
Pattern ptmp = Pattern.compile(patt);
|
529 |
+
Matcher mtmp = ptmp.matcher(id_tmp);
|
530 |
+
if(mtmp.find())
|
531 |
+
{
|
532 |
+
id_tmp = mtmp.group(1);
|
533 |
+
}
|
534 |
+
|
535 |
+
if(y==0)
|
536 |
+
{
|
537 |
+
min_id=id_split[y];
|
538 |
+
min=Integer.parseInt(id_tmp);
|
539 |
+
}
|
540 |
+
else if(Integer.parseInt(id_tmp)<min)
|
541 |
+
{
|
542 |
+
min=Integer.parseInt(id_tmp);
|
543 |
+
min_id=id_tmp;
|
544 |
+
}
|
545 |
+
}
|
546 |
+
if(GNormPlus.taxid4gene.contains(min_id)) // for gene
|
547 |
+
{
|
548 |
+
min_id="*"+min_id;
|
549 |
+
}
|
550 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\tSpecies\t"+min_id);
|
551 |
+
String mentions_tmp=mention.toLowerCase();
|
552 |
+
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
553 |
+
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
554 |
+
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
555 |
+
}
|
556 |
+
}
|
557 |
+
}
|
558 |
+
}
|
559 |
+
else //disease, and other concepts
|
560 |
+
{
|
561 |
+
remove_anno.add(a);
|
562 |
+
}
|
563 |
+
}
|
564 |
+
|
565 |
+
Collections.sort(remove_anno);
|
566 |
+
for (int counter = remove_anno.size()-1; counter >= 0 ; counter--)
|
567 |
+
{
|
568 |
+
int ai=remove_anno.get(counter);
|
569 |
+
//System.out.println("\n"+ai+"\t"+GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(ai));
|
570 |
+
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(ai);
|
571 |
+
}
|
572 |
+
}
|
573 |
+
}
|
574 |
+
}
|
575 |
+
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); //save in BioC file
|
576 |
+
}
|
577 |
+
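Throughout the block above, every accepted species mention is reduced to a normalized key (lowercased, non-word characters removed, digits collapsed to 0) before it is stored in GNormPlus.Filtering_hash. A minimal Python sketch of that normalization; the function name normalize_mention is ours, for illustration only:

import re

def normalize_mention(mention):
    # lowercase, drop non-word characters (including '-' and '_'), map every digit to '0'
    key = mention.lower()
    key = re.sub(r"[\W_]", "", key)
    key = re.sub(r"[0-9]", "0", key)
    return key

print(normalize_mention("E. coli K-12"))  # -> 'ecolik00'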
	public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOException, XMLStreamException
	{
		GNormPlus.BioCDocobj.Annotations = new ArrayList();
		GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename);

		BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
		for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
		{
			HashMap<String, String> PrefixIDTarget_hash = new HashMap<String, String>();
			PrefixIDTarget_hash.put("9606", "h");
			PrefixIDTarget_hash.put("10090", "m");
			PrefixIDTarget_hash.put("10116", "r");
			PrefixIDTarget_hash.put("4932", "y");
			PrefixIDTarget_hash.put("7227", "d");
			PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr");
			PrefixIDTarget_hash.put("3702", "at|At");

			HashMap<String, Double> SP2Num_hash = new HashMap<String, Double>();
			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					if(anno.length==5) //Species
					{
						String patt="^\\**([0-9]+)$";
						Pattern ptmp = Pattern.compile(patt);
						Matcher mtmp = ptmp.matcher(anno[4]);
						if(mtmp.find())
						{
							String id = mtmp.group(1);

							if(!PrefixIDTarget_hash.containsKey(id))
							{
								PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id)); // taxid -> prefix
							}
							if(j == 0)//title
							{
								if(SP2Num_hash.containsKey(id))
								{
									SP2Num_hash.put(id, SP2Num_hash.get(id)+2);
								}
								else
								{
									if(GNormPlus.TaxFreq_hash.containsKey(id))
									{
										SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2);
									}
									else
									{
										SP2Num_hash.put(id, 2.0);
									}
								}
								// Virus -> Human (not to double weight human to virus)
								/*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
								{
									if(SP2Num_hash.containsKey("9606"))
									{
										SP2Num_hash.put("9606", SP2Num_hash.get("9606")+2);
									}
									else
									{
										SP2Num_hash.put("9606", 2 + GNormPlus.TaxFreq_hash.get("9606")+1);
									}
								}*/
							}
							else
							{
								if(SP2Num_hash.containsKey(id))
								{
									SP2Num_hash.put(id, SP2Num_hash.get(id)+1);
								}
								else
								{
									if(GNormPlus.TaxFreq_hash.containsKey(id))
									{
										SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id));
									}
									else
									{
										SP2Num_hash.put(id, 1.0);
									}
								}
								// Virus -> Human
								/*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
								{
									if(SP2Num_hash.containsKey("9606"))
									{
										SP2Num_hash.put("9606", SP2Num_hash.get("9606")+1);
									}
									else
									{
										SP2Num_hash.put("9606", GNormPlus.TaxFreq_hash.get("9606")+1);
									}
								}*/
							}
						}
					}
				}
			}
			String MajorSP="9606";
			double MaxSP=0;
			for(String tid : SP2Num_hash.keySet())
			{
				if(SP2Num_hash.get(tid)>MaxSP)
				{
					MajorSP=tid;
					MaxSP=SP2Num_hash.get(tid);
				}
			}

			for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
			{
				String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
				//int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset
				iterator.setText(PassageContext);
				ArrayList<Integer> Sentence_offsets = new ArrayList<Integer>();
				int Sent_start = iterator.first();
				for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next())
				{
					Sentence_offsets.add(Sent_start);
				}

				HashMap<Integer,String> Annotations_Gene_hash = new HashMap<Integer,String>();
				ArrayList<String> Annotations_Species = new ArrayList<String>();
				if(GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
				{
					for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
					{
						String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
						if(anno.length==5) //Species
						{
							Annotations_Species.add(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k));
						}
						else //Gene : if(anno.length==3)
						{
							//String mention = PassageContext.substring(Integer.parseInt(anno[0]), Integer.parseInt(anno[1]));
							Annotations_Gene_hash.put(k,GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)); // k -> Gene Annotation
						}
					}

					//Gene --> Species Inference (PMID:28777492)
					HashMap<String,HashMap<Integer,String>> mention2Location2Species_hash = new HashMap<String,HashMap<Integer,String>>();
					HashMap<Integer,String> Location2Species_hash = new HashMap<Integer,String>();
					for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
					{
						boolean SPfound = false;
						String anno[] = Annotations_Gene_hash.get(k).split("\t");
						int G_Start= Integer.parseInt(anno[0]);
						int G_Last= Integer.parseInt(anno[1]);
						String G_mentions = anno[2];
						/**
						 * 2. Co-occurring word
						 * boundary :
						 * Sentence Start: Sentence_offsets.get(Target_Sentence)
						 * Sentence Last: Sentence_offsets.get(Target_Sentence+1)
						 */
						//Find the target sentence
						int Target_Sentence=0;
						if(SPfound == false) // locate the sentence containing the gene mention
						{
							for(int s=0;s<Sentence_offsets.size();s++)
							{
								int Sentence_last=1000000;
								if(s<Sentence_offsets.size()-1)
								{
									Sentence_last=Sentence_offsets.get(s+1);
								}
								if(G_Start<Sentence_last)
								{
									Target_Sentence=s;
									break;
								}
							}
						}
						int Sentence_Start = Sentence_offsets.get(Target_Sentence);
						int Sentence_Last = 1000000;
						if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
						if(SPfound == false) // 1. left : closest species before the start of the gene mention
						{
							int closet_Sp_Start=0;
							for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
							{
								String AnnoSp[]=Annotations_Species.get(sp).split("\t");
								int Sp_Start = Integer.parseInt(AnnoSp[0]);
								String patt="^\\**([0-9]+)$";
								Pattern ptmp = Pattern.compile(patt);
								Matcher mtmp = ptmp.matcher(AnnoSp[4]);
								if(mtmp.find())
								{
									String taxid = mtmp.group(1);
									Location2Species_hash.put(Sp_Start,taxid);
									if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
									{
										closet_Sp_Start=Sp_Start;
										Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);

										if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
										{
											mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
										}
										else
										{
											mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
										}

										SPfound=true;
									}
								}
							}
						}
						if(SPfound == false) // 2. right : closest species after the end of the gene mention
						{
							int closet_Sp_Last=1000000;
							for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
							{
								String AnnoSp[]=Annotations_Species.get(sp).split("\t");
								int Sp_Last = Integer.parseInt(AnnoSp[1]);
								String patt="^\\**([0-9]+)$";
								Pattern ptmp = Pattern.compile(patt);
								Matcher mtmp = ptmp.matcher(AnnoSp[4]);
								if(mtmp.find())
								{
									String taxid = mtmp.group(1);
									if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
									{
										closet_Sp_Last=Sp_Last;
										Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);

										if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
										{
											mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
										}
										else
										{
											mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
										}

										SPfound=true;
									}
								}
							}
						}
					}

					for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
					{
						String anno[] = Annotations_Gene_hash.get(k).split("\t");
						int G_Start= Integer.parseInt(anno[0]);
						int G_Last= Integer.parseInt(anno[1]);
						String G_mentions = anno[2];
						String G_type = anno[3];
						String G_mention_list[]=G_mentions.split("\\|");
						String G_mention=G_mention_list[0]; // only use the first term to detect species ; should be updated after SimConcept

						/** 1. prefix */
						boolean SPfound = false;
						for(String taxid: PrefixIDTarget_hash.keySet())
						{
							if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase()))
							{
								//special case, and no need for prefix - SA
							}
							else
							{
								Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$");
								Matcher mtmp = ptmp.matcher(G_mention);
								if(mtmp.find())
								{
									String MentionWoPrefix=mtmp.group(2);
									GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid);
									SPfound=true;
									break;
								}
							}
						}

						/**
						 * 2. Co-occurring word
						 * boundary :
						 * Sentence Start: Sentence_offsets.get(Target_Sentence)
						 * Sentence Last: Sentence_offsets.get(Target_Sentence+1)
						 */
						//Find the target sentence
						int Target_Sentence=0;
						if(SPfound == false) // locate the sentence containing the gene mention
						{
							for(int s=0;s<Sentence_offsets.size();s++)
							{
								int Sentence_last=1000000;
								if(s<Sentence_offsets.size()-1)
								{
									Sentence_last=Sentence_offsets.get(s+1);
								}
								if(G_Start<Sentence_last)
								{
									Target_Sentence=s;
									break;
								}
							}
						}
						int Sentence_Start = Sentence_offsets.get(Target_Sentence);
						int Sentence_Last = 1000000;
						if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
						if(SPfound == false) // 2a. left : closest species before the start of the gene mention
						{
							int closet_Sp_Start=0;
							for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
							{
								String AnnoSp[]=Annotations_Species.get(sp).split("\t");
								int Sp_Start = Integer.parseInt(AnnoSp[0]);
								String patt="^\\**([0-9]+)$";
								Pattern ptmp = Pattern.compile(patt);
								Matcher mtmp = ptmp.matcher(AnnoSp[4]);
								if(mtmp.find())
								{
									String taxid = mtmp.group(1);
									if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
									{
										closet_Sp_Start=Sp_Start;
										if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
										{
											GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606");
										}
										else
										{
											GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid);
										}
										SPfound=true;
									}
								}
							}
						}
						if(SPfound == false) // 2b. right : closest species after the end of the gene mention
						{
							int closet_Sp_Last=1000000;
							for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closest species
							{
								String AnnoSp[]=Annotations_Species.get(sp).split("\t");
								int Sp_Last = Integer.parseInt(AnnoSp[1]);
								String patt="^\\**([0-9]+)$";
								Pattern ptmp = Pattern.compile(patt);
								Matcher mtmp = ptmp.matcher(AnnoSp[4]);
								if(mtmp.find())
								{
									String taxid = mtmp.group(1);
									if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
									{
										closet_Sp_Last=Sp_Last;
										if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
										{
											GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606");
										}
										else
										{
											GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid);
										}
										SPfound=true;
									}
								}
							}
						}

						/** 3. Focus species */
						if(SPfound == false) // 3. fall back to the inferred or major species
						{
							// only the mentions that appeared earlier are inferred
							if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
							{
								int closed_loca=0;
								for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet())
								{
									if(loca_start<G_Start)
									{
										if(loca_start>closed_loca)
										{
											closed_loca=loca_start;
										}
									}
								}
								if(closed_loca>0)
								{
									if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca)))
									{
										GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606");
									}
									else
									{
										GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca));
									}
								}
								else
								{
									if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
									{
										GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
									}
									else
									{
										GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
									}
								}
							}
							else
							{
								if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
								{
									GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
								}
								else
								{
									GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
								}
							}
						}
					}
				}
			}
		}
		GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
	}
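The document-level focus species above is picked by a weighted vote: each species mention in the title adds 2, each mention in the body adds 1, seeded with a corpus-level prior (TaxFreq_hash) the first time a taxonomy id is seen; the highest-scoring id becomes MajorSP, defaulting to human (9606). A small Python sketch of the same vote; the function name and the prior table are illustrative, not part of the source:

def major_species(title_ids, body_ids, tax_prior, default="9606"):
    # Weighted vote: title mentions count 2, body mentions 1,
    # plus a one-time corpus prior per taxonomy id.
    score = {}
    for ids, w in ((title_ids, 2.0), (body_ids, 1.0)):
        for taxid in ids:
            if taxid in score:
                score[taxid] += w
            else:
                score[taxid] = tax_prior.get(taxid, 0.0) + w
    return max(score, key=score.get) if score else default

# one human mention in the title outweighs two mouse mentions in the body
print(major_species(["9606"], ["10090", "10090"], {"9606": 0.5, "10090": 0.2}))  # -> '9606'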
	public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException
	{
		for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
		{
			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					if(anno.length==5) //Species
					{
						String id=anno[4].replaceAll("\\*", "");
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id);
					}
					else //Gene : if(anno.length==3)
					{
						/** 1. prefix */
						boolean SPfound = false;
						if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase()))
						{
							//special case, and no need for prefix - SA
						}
						else
						{
							Pattern ptmp = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$");
							Matcher mtmp = ptmp.matcher(anno[2]);
							if(mtmp.find())
							{
								String MentionWoPrefix=mtmp.group(2);
								GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies);
								SPfound=true;
							}
						}
						if(SPfound == false)
						{
							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)+"\tFocus:"+FocusSpecies);
						}
					}
				}
			}
		}
		GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
	}
}
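Read together, the SpeciesAssignment overloads above implement a fixed back-off order for attaching a taxonomy id to each gene mention: (1) a species-specific prefix on the mention itself (h/m/r/...), (2) the closest species mention to the left within the same sentence, (3) the closest one to the right, (4) a species previously inferred for the same mention string earlier in the passage, and finally (5) the document's focus species; virus ids additionally carry "&9606". A compact sketch of that cascade, with hypothetical helper arguments standing in for the lookups the Java code performs:

def assign_species(prefix_hit, left_sp, right_sp, earlier_sp, major_sp):
    # Return (source, taxid) following the back-off order used above.
    if prefix_hit:
        return ("Prefix", prefix_hit)
    if left_sp:
        return ("Left", left_sp)
    if right_sp:
        return ("Right", right_sp)
    if earlier_sp:
        return ("Focus", earlier_sp)   # same-mention inference is also tagged Focus
    return ("Focus", major_sp)

print(assign_species(None, None, "10090", None, "9606"))  # -> ('Right', '10090')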
src_Java/GNormPluslib/SimConcept.java
CHANGED
The diff for this file is too large to render.
See raw diff
src_python/GeneNER/BIO_format.py
CHANGED
@@ -1,257 +1,257 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Sep  7 08:58:22 2022

@author: luol2
"""

# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 11:27:57 2022

@author: luol2
"""


import stanza
import sys
import os
import io
import json
import re

# sort entities by position in the text
def pubtator_entitysort(infile):

    fin=open(infile,'r',encoding='utf-8')
    # fout=open(path+'LitCoin/sort/Train_sort.PubTator','w',encoding='utf-8')
    fout=io.StringIO()
    all_in=fin.read().strip().split('\n\n')
    fin.close()
    error_dict={} # used to debug errors
    for doc in all_in:
        entity_dict={}
        lines=doc.split('\n')
        fout.write(lines[0]+'\n'+lines[1]+'\n')
        for i in range(2,len(lines)):
            segs=lines[i].split('\t')
            if len(segs)>=5:
                if lines[i] not in entity_dict.keys():
                    entity_dict[lines[i]]=int(segs[1])
                else:
                    print('entity already exists:',lines[i])
                    if segs[0] not in error_dict.keys():
                        error_dict[segs[0]]=[lines[i]]
                    else:
                        if lines[i] not in error_dict[segs[0]]:
                            error_dict[segs[0]].append(lines[i])

        entity_sort=sorted(entity_dict.items(), key=lambda kv:(kv[1]), reverse=False)
        for ele in entity_sort:
            fout.write(ele[0]+'\n')
        fout.write('\n')
    return fout

def filter_overlap(infile): # no nesting: keep the longest entity of each overlapping group

    fin=io.StringIO(infile.getvalue())
    fout=io.StringIO()

    documents=fin.read().strip().split('\n\n')
    fin.close()
    total_entity=0
    over_entity=0
    nest_entity=0
    for doc in documents:
        lines=doc.split('\n')
        entity_list=[]
        if len(lines)>2:
            first_entity=lines[2].split('\t')
            nest_list=[first_entity]
            max_eid=int(first_entity[2])
            total_entity+=len(lines)-2
            for i in range(3,len(lines)):
                segs=lines[i].split('\t')
                if int(segs[1])> max_eid:
                    if len(nest_list)==1:
                        entity_list.append(nest_list[0])
                        nest_list=[]
                        nest_list.append(segs)
                        if int(segs[2])>max_eid:
                            max_eid=int(segs[2])
                    else:
                        # print(nest_list)
                        nest_entity+=len(nest_list)-1
                        tem=find_max_entity(nest_list) # find max entity
                        # if len(tem)>1:
                        #     print('max nest >1:',tem)
                        entity_list.extend(tem)
                        nest_list=[]
                        nest_list.append(segs)
                        if int(segs[2])>max_eid:
                            max_eid=int(segs[2])

                else:
                    nest_list.append(segs)
                    if int(segs[2])>max_eid:
                        max_eid=int(segs[2])
            if nest_list!=[]:
                if len(nest_list)==1:
                    entity_list.append(nest_list[0])

                else:
                    tem=find_max_entity(nest_list) # find max entity
                    # if len(tem)>1:
                    #     print('max nest >1:',tem)
                    entity_list.extend(tem)
        fout.write(lines[0]+'\n'+lines[1]+'\n')
        for ele in entity_list:
            fout.write('\t'.join(ele)+'\n')
        fout.write('\n')
    # print(total_entity,over_entity, nest_entity)
    return fout

def find_max_entity(nest_list): # longest entity
    max_len=0
    final_tem=[]
    max_index=0
    for i in range(0, len(nest_list)):
        cur_len=int(nest_list[i][2])-int(nest_list[i][1])
        if cur_len>max_len:
            max_len=cur_len
            max_index=i

    final_tem.append(nest_list[max_index])
    return final_tem

# convert original PubTator format to labeled text; an entity begins with ' ssss<type>' and ends with ' eeee<type>'
def pubtator_to_labeltext(infile):

    fin=io.StringIO(infile.getvalue())
    all_context=fin.read().strip().split('\n\n')
    fin.close()
    fout=io.StringIO()
    label_dic={}

    for doc in all_context:
        lines=doc.split('\n')
        ori_text=lines[0].split('|t|')[1]+' '+lines[1].split('|a|')[1]
        pmid=lines[0].split('|t|')[0]
        s_index=0
        e_index=0
        new_text=''
        for i in range(2,len(lines)):
            segs=lines[i].split('\t')
            label_dic[segs[4].lower()]=segs[4]
            if len(segs)==6:
                e_index=int(segs[1])
                new_text+=ori_text[s_index:e_index]+' ssss'+segs[4].lower()+' '+ori_text[int(segs[1]):int(segs[2])]+' eeee'+segs[4].lower()+' '
                s_index=int(segs[2])
                # if ori_text[int(segs[1]):int(segs[2])]!=segs[3]:
                #     print('error(ori,label):',ori_text[int(segs[1]):int(segs[2])],segs[3])

        new_text+=ori_text[s_index:]
        fout.write(pmid+'\t'+' '.join(new_text.strip().split())+'\n')
    return fout, label_dic


def pre_token(sentence):
    sentence=re.sub("([\=\/\(\)\<\>\+\-\_])"," \\1 ",sentence)
    sentence=re.sub("[ ]+"," ",sentence)
    return sentence

# labeled text to CoNLL format (BIO), one token (including features) per line; sentences are separated by a blank line
def labeltext_to_conll_fasttoken(infile,label_dic):

    fin=io.StringIO(infile.getvalue())
    all_context=fin.read().strip().split('\n')
    fin.close()
    fout=io.StringIO()

    # nlp = stanza.Pipeline(lang='en', processors='tokenize',package='craft') #package='craft'
    nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'},package='None') #package='craft'

    doc_i=0
    for doc in all_context:
        doc_text=doc.split('\t')[1]
        doc_text=pre_token(doc_text)
        doc_stanza = nlp(doc_text)
        doc_i+=1
        #print(doc_i)
        inentity_flag=0
        last_label='O'
        for sent in doc_stanza.sentences:
            temp_sent=[]
            word_num=0
            for word in sent.words:
                word_num+=1
                # print(word.text)
                if word.text.strip()=='':
                    continue
                temp_sent.append(word.text)
                if word.text.startswith('ssss')==True:
                    last_label=word.text
                    inentity_flag=1
                elif word.text.startswith('eeee')==True:
                    last_label=word.text
                    inentity_flag=0
                else:
                    if last_label=='O':
                        now_label='O'
                    elif last_label.startswith('ssss')==True:
                        now_label='B-'+label_dic[last_label[4:]]

                    elif last_label.startswith('B-')==True:
                        now_label='I-'+last_label[2:]
                    elif last_label.startswith('I-')==True:
                        now_label='I-'+last_label[2:]
                    elif last_label.startswith('eeee')==True:
                        now_label='O'

                    fout.write(word.text+'\t'+now_label+'\n')
                    last_label=now_label
            if inentity_flag==1: # if an entity is split across sentences, skip the break so the sentence is concatenated with the next one
                # print('sentence error!!!')
                # print(word.text,word_num)
                # print(temp_sent)
                pass
            else:
                fout.write('\n')
    return fout

def pubtator_to_conll(infile):

    #1. entity sort
    input_sort=pubtator_entitysort(infile)
    #print(input_sort.getvalue())

    #2. no overlap; if entities overlap, keep the longest one
    input_nonest=filter_overlap(input_sort)
    # print('......sort.....\n',input_sort.getvalue())

    #3. pubtator to label text
    input_labtext,label_dic=pubtator_to_labeltext(input_nonest)
    # print('......label.....\n',input_labtext.getvalue())
    #print(label_dic)

    #4. label text to conll
    output = labeltext_to_conll_fasttoken(input_labtext,label_dic)
    # print('......output.....\n',output.getvalue())
    # fout=open(outfile,'w',encoding='utf-8')
    # fout.write(input_nonest.getvalue())
    # fout.close()
    return output

if __name__=='__main__':

    infile='../../TrainingSet/No100/NER.Train.txt'
    output=pubtator_to_conll(infile)
    fout=open('../../TrainingSet/No100/NER.Train.conll','w',encoding='utf-8')
    fout.write(output.getvalue())
    fout.close()
    output.close()
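The __main__ block above shows the intended entry point; a minimal sketch of driving the same conversion from another script (assuming the repository root is on the import path; file names are placeholders):

from src_python.GeneNER.BIO_format import pubtator_to_conll

output = pubtator_to_conll('NER.Train.txt')    # PubTator-format input file
with open('NER.Train.conll', 'w', encoding='utf-8') as fout:
    fout.write(output.getvalue())              # one token<TAB>BIO-label per line
output.close()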
src_python/GeneNER/Evaluation_ner.py
CHANGED
@@ -1,243 +1,243 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  1 15:33:54 2021

@author: luol2
"""
# from BIO format to entities
def BIO_tag(tokens):
    gold_entity={}
    pre_entity={}
    gold_start,gold_end=0,0
    pre_start,pre_end=0,0
    for i in range(0,len(tokens)):
        segs=tokens[i].split('\t')

        # generate gold entities
        if segs[1].startswith('B-')>0:
            gold_start=i
            gold_type=segs[1][2:]
            if i+1>=len(tokens): # the last word
                gold_end=i
                if gold_type in gold_entity.keys():
                    gold_entity[gold_type].append([gold_start,gold_end])
                else:
                    gold_entity[gold_type]=[[gold_start,gold_end]]
            else: # not the last word
                next_seg=tokens[i+1].split('\t')
                if next_seg[1].startswith('B-')>0 or next_seg[1]=='O':
                    gold_end=i
                    if gold_type in gold_entity.keys():
                        gold_entity[gold_type].append([gold_start,gold_end])
                    else:
                        gold_entity[gold_type]=[[gold_start,gold_end]]
                elif next_seg[1].startswith('I-')>0:
                    pass
        elif segs[1].startswith('I-')>0:
            if i+1>=len(tokens): # the last word
                gold_end=i
                if gold_type in gold_entity.keys():
                    gold_entity[gold_type].append([gold_start,gold_end])
                else:
                    gold_entity[gold_type]=[[gold_start,gold_end]]
            else: # not the last word
                next_seg=tokens[i+1].split('\t')
                if next_seg[1].startswith('B-')>0 or next_seg[1]=='O':
                    gold_end=i
                    if gold_type in gold_entity.keys():
                        gold_entity[gold_type].append([gold_start,gold_end])
                    else:
                        gold_entity[gold_type]=[[gold_start,gold_end]]
                elif next_seg[1].startswith('I-')>0:
                    pass
        elif segs[1]=='O':
            pass

        # generate prediction entities
        if segs[2].startswith('B-')>0:
            pre_start=i
            pre_type=segs[2][2:]
            if i+1>=len(tokens): # the last word
                pre_end=i
                if pre_type in pre_entity.keys():
                    pre_entity[pre_type].append([pre_start,pre_end])
                else:
                    pre_entity[pre_type]=[[pre_start,pre_end]]
            else: # not the last word
                next_seg=tokens[i+1].split('\t')
                if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
                    pre_end=i
                    if pre_type in pre_entity.keys():
                        pre_entity[pre_type].append([pre_start,pre_end])
                    else:
                        pre_entity[pre_type]=[[pre_start,pre_end]]
                elif next_seg[2].startswith('I-')>0:
                    pass
        elif segs[2].startswith('I-')>0:
            if i==0 and i+1<len(tokens): # the first word, and not the only word
                pre_start=i
                pre_type=segs[2][2:]
                next_seg=tokens[i+1].split('\t')
                if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
                    pre_end=i
                    if pre_type in pre_entity.keys():
                        pre_entity[pre_type].append([pre_start,pre_end])
                    else:
                        pre_entity[pre_type]=[[pre_start,pre_end]]
                elif next_seg[2].startswith('I-')>0:
                    pass
            elif i==0 and i+1==len(tokens): # only one word
                pre_start=i
                pre_type=segs[2][2:]
                pre_end=i
                if pre_type in pre_entity.keys():
                    pre_entity[pre_type].append([pre_start,pre_end])
                else:
                    pre_entity[pre_type]=[[pre_start,pre_end]]
            elif i+1>=len(tokens): # the last word
                last_seg=tokens[i-1].split('\t')
                if last_seg[2]=='O':
                    pre_start=i
                    pre_type=segs[2][2:]
                pre_end=i
                if pre_type in pre_entity.keys():
                    pre_entity[pre_type].append([pre_start,pre_end])
                else:
                    pre_entity[pre_type]=[[pre_start,pre_end]]
            elif i+1< len(tokens): # not the last word
                next_seg=tokens[i+1].split('\t')
                last_seg=tokens[i-1].split('\t')
                if last_seg[2]=='O':
                    pre_start=i
                    pre_type=segs[2][2:]
                if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
                    pre_end=i
                    if pre_type in pre_entity.keys():
                        pre_entity[pre_type].append([pre_start,pre_end])
                    else:
                        pre_entity[pre_type]=[[pre_start,pre_end]]
                elif next_seg[2].startswith('I-')>0:
                    pass
        elif segs[2]=='O':
            pass
    # print(tokens)
    # print(gold_entity)
    # print(pre_entity)
    return gold_entity,pre_entity

# input: token \t Gold \t Prediction\n, sentences are split by "\n"
def NER_Evaluation():
    path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/Kfold/BiLSTM-CRF/'
    fin=open(path+'dev_pre.conll_all','r',encoding='utf-8')
    all_sentence=fin.read().strip().split('\n\n')
    fin.close()
    Metrics={} #{'entity_type':[TP,gold_num,pre_num]}

    for sentence in all_sentence:
        tokens=sentence.split('\n')
        gold_entity,pre_entity=BIO_tag(tokens)
        # print(tokens)
        for entity_type in gold_entity.keys():
            if entity_type not in Metrics.keys():
                Metrics[entity_type]=[0,len(gold_entity[entity_type]),0]
            else:
                Metrics[entity_type][1]+=len(gold_entity[entity_type])
        for entity_type in pre_entity.keys():
            if entity_type not in Metrics.keys():
                Metrics[entity_type]=[0,0,len(pre_entity[entity_type])]
            else:
                Metrics[entity_type][2]+=len(pre_entity[entity_type])
            for mention in pre_entity[entity_type]:
                if entity_type in gold_entity.keys():
                    if mention in gold_entity[entity_type]:
                        Metrics[entity_type][0]+=1
    print(Metrics)
    TP,Gold_num,Pre_num=0,0,0
    for ele in Metrics.keys():
        if Metrics[ele][2]==0:
            p=0
        else:
            p=Metrics[ele][0]/Metrics[ele][2]
        if Metrics[ele][1]==0:
            r=0
        else:
            r=Metrics[ele][0]/Metrics[ele][1]
        if p+r==0:
            f1=0
        else:
            f1=2*p*r/(p+r)
        TP+=Metrics[ele][0]
        Gold_num+=Metrics[ele][1]
        Pre_num+=Metrics[ele][2]
        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))
        # break
    if Pre_num==0:
        P=0
    else:
        P=TP/Pre_num
    R=TP/Gold_num
    if P+R==0: # guard against division by zero, as in NER_Evaluation_fn
        F1=0
    else:
        F1=2*P*R/(P+R)
    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))

def NER_Evaluation_fn(file):

    fin=open(file,'r',encoding='utf-8')
    all_sentence=fin.read().strip().split('\n\n')
    fin.close()
    Metrics={} #{'entity_type':[TP,gold_num,pre_num]}
    breai=0
    for sentence in all_sentence:
        breai+=1
        if breai>5000:
            break
        tokens=sentence.split('\n')
        gold_entity,pre_entity=BIO_tag(tokens)
        # print(tokens)
        for entity_type in gold_entity.keys():
            if entity_type not in Metrics.keys():
                Metrics[entity_type]=[0,len(gold_entity[entity_type]),0]
            else:
                Metrics[entity_type][1]+=len(gold_entity[entity_type])
        for entity_type in pre_entity.keys():
            if entity_type not in Metrics.keys():
                Metrics[entity_type]=[0,0,len(pre_entity[entity_type])]
            else:
                Metrics[entity_type][2]+=len(pre_entity[entity_type])
            for mention in pre_entity[entity_type]:
                if entity_type in gold_entity.keys():
                    if mention in gold_entity[entity_type]:
                        Metrics[entity_type][0]+=1
    print(Metrics)
    TP,Gold_num,Pre_num=0,0,0
    for ele in Metrics.keys():
        if Metrics[ele][2]==0:
            p=0
        else:
            p=Metrics[ele][0]/Metrics[ele][2]
        if Metrics[ele][1]==0:
            r=0
        else:
            r=Metrics[ele][0]/Metrics[ele][1]
        if p+r==0:
            f1=0
        else:
            f1=2*p*r/(p+r)
        TP+=Metrics[ele][0]
        Gold_num+=Metrics[ele][1]
        Pre_num+=Metrics[ele][2]
        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))
        # break
    if Pre_num==0:
        P=0
    else:
        P=TP/Pre_num
    R=TP/Gold_num
    if P+R==0:
        F1=0
    else:
        F1=2*P*R/(P+R)
    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
    return F1

if __name__=='__main__':
    NER_Evaluation()
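As a sanity check on the metric bookkeeping above: with 8 predicted spans, 10 gold spans, and 6 exact matches, P = 6/8 = 0.75, R = 6/10 = 0.60, and F1 = 2*0.75*0.60/(0.75+0.60) ≈ 0.667. The same arithmetic in a few lines, with the zero guards used in NER_Evaluation_fn (the helper name prf1 is ours):

def prf1(tp, gold_num, pre_num):
    p = tp / pre_num if pre_num else 0.0
    r = tp / gold_num if gold_num else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1

print(prf1(6, 10, 8))  # (0.75, 0.6, 0.666...)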
src_python/GeneNER/model_ner.py
CHANGED
@@ -1,102 +1,102 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 10 09:08:09 2021

@author: luol2
"""
import tensorflow as tf
from src_python.GeneNER.represent_ner import Hugface_RepresentationLayer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad,Nadam
from transformers import TFBertModel, BertConfig,TFElectraModel,TFAutoModel
import numpy as np
import sys


class LRSchedule_LINEAR(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self,
        init_lr=5e-5,
        init_warmup_lr=0.0,
        final_lr=5e-7,
        warmup_steps=0,
        decay_steps=0,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.init_warmup_lr=init_warmup_lr
        self.final_lr = final_lr
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps

    def __call__(self, step):
        """ linear warm up - linear decay """
        if self.warmup_steps>0:
            warmup_lr = (self.init_lr - self.init_warmup_lr)/self.warmup_steps * step+self.init_warmup_lr
        else:
            warmup_lr=1000.0
        #print('\n.......warmup_lr:',warmup_lr)
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.init_lr - (step - self.warmup_steps)/self.decay_steps*(self.init_lr - self.final_lr)
        )
        #print('\n.....decay_lr:',decay_lr)
        return tf.math.minimum(warmup_lr,decay_lr)


class HUGFACE_NER(): #huggingface transformers
    def __init__(self, model_files):
        self.model_type='HUGFACE'
        self.maxlen = 256 #sent 256 doc-512,pretrain-sent 128
        self.checkpoint_path = model_files['checkpoint_path']
        self.label_file=model_files['labelfile']
        self.lowercase=model_files['lowercase']
        self.rep = Hugface_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)


    def build_encoder(self):
        print('...vocab len:',self.rep.vocab_len)
        plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
        # plm_model.resize_token_embeddings(self.rep.vocab_len)
        x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
        x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
        x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
        x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
        #dense = TimeDistributed(Dense(512, activation='relu'), name='dense1')(x)
        self.encoder = Model(inputs=[x1_in,x2_in,x3_in], outputs=x,name='hugface_encoder')
        self.encoder.summary()

    def build_softmax_decoder(self):

        x1_in = Input(shape=(self.maxlen,),dtype=tf.int32)
        x2_in = Input(shape=(self.maxlen,),dtype=tf.int32)
        x3_in = Input(shape=(self.maxlen,),dtype=tf.int32)
        features = self.encoder([x1_in,x2_in,x3_in])
        #features = Dropout(0.4)(features)
        features = TimeDistributed(Dense(128, activation='relu'), name='dense2')(features)
        features= Dropout(0.1)(features)
        output = TimeDistributed(Dense(self.rep.label_table_size, activation='softmax'), name='softmax')(features)
        self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=output, name="hugface_softmax")

        lr_schedule=LRSchedule_LINEAR(
            init_lr=1e-5,
            init_warmup_lr=1e-7,
            final_lr=5e-6,
            warmup_steps=0,
            decay_steps=1000)

        opt = Adam(learning_rate = lr_schedule)
        #opt = Adam(lr=5e-6)
        self.model.compile(
            optimizer=opt,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )
        self.model.summary()


    def load_model(self,model_file):
        self.model.load_weights(model_file)
        self.model.summary()
        print('load HUGFACE model done!')
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Wed Feb 10 09:08:09 2021
|
4 |
+
|
5 |
+
@author: luol2
|
6 |
+
"""
|
7 |
+
import tensorflow as tf
|
8 |
+
from src_python.GeneNER.represent_ner import Hugface_RepresentationLayer
|
9 |
+
from tensorflow.keras.layers import *
|
10 |
+
from tensorflow.keras.models import Model
|
11 |
+
from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad,Nadam
|
12 |
+
from transformers import TFBertModel, BertConfig,TFElectraModel,TFAutoModel
|
13 |
+
import numpy as np
|
14 |
+
import sys
|
15 |
+
|
16 |
+
|
17 |
+
class LRSchedule_LINEAR(tf.keras.optimizers.schedules.LearningRateSchedule):
|
18 |
+
def __init__(
|
19 |
+
self,
|
20 |
+
init_lr=5e-5,
|
21 |
+
init_warmup_lr=0.0,
|
22 |
+
final_lr=5e-7,
|
23 |
+
warmup_steps=0,
|
24 |
+
decay_steps=0,
|
25 |
+
):
|
26 |
+
super().__init__()
|
27 |
+
self.init_lr = init_lr
|
28 |
+
self.init_warmup_lr=init_warmup_lr
|
29 |
+
self.final_lr = final_lr
|
30 |
+
self.warmup_steps = warmup_steps
|
31 |
+
self.decay_steps = decay_steps
|
32 |
+
|
33 |
+
def __call__(self, step):
|
34 |
+
""" linear warm up - linear decay """
|
35 |
+
if self.warmup_steps>0:
|
36 |
+
warmup_lr = (self.init_lr - self.init_warmup_lr)/self.warmup_steps * step+self.init_warmup_lr
|
37 |
+
else:
|
38 |
+
warmup_lr=1000.0
|
39 |
+
#print('\n.......warmup_lr:',warmup_lr)
|
40 |
+
decay_lr = tf.math.maximum(
|
41 |
+
self.final_lr,
|
42 |
+
self.init_lr - (step - self.warmup_steps)/self.decay_steps*(self.init_lr - self.final_lr)
|
43 |
+
)
|
44 |
+
#print('\n.....decay_lr:',decay_lr)
|
45 |
+
return tf.math.minimum(warmup_lr,decay_lr)
|
46 |
+
|
47 |
+
|
48 |
+
class HUGFACE_NER(): #huggingface transformers
|
49 |
+
def __init__(self, model_files):
|
50 |
+
self.model_type='HUGFACE'
|
51 |
+
self.maxlen = 256 #sent 256 doc-512,pretrain-sent 128
|
52 |
+
self.checkpoint_path = model_files['checkpoint_path']
|
53 |
+
self.label_file=model_files['labelfile']
|
54 |
+
self.lowercase=model_files['lowercase']
|
55 |
+
self.rep = Hugface_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)
|
56 |
+
|
57 |
+
|
58 |
+
def build_encoder(self):
|
59 |
+
print('...vocab len:',self.rep.vocab_len)
|
60 |
+
plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
|
61 |
+
# plm_model.resize_token_embeddings(self.rep.vocab_len)
|
62 |
+
x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
|
63 |
+
x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
|
64 |
+
x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
|
65 |
+
x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
|
66 |
+
#dense = TimeDistributed(Dense(512, activation='relu'), name='dense1')(x)
|
67 |
+
self.encoder = Model (inputs=[x1_in,x2_in,x3_in], outputs=x,name='hugface_encoder')
|
68 |
+
self.encoder.summary()
|
69 |
+
|
70 |
+
def build_softmax_decoder(self):
|
71 |
+
|
72 |
+
x1_in = Input(shape=(self.maxlen,),dtype=tf.int32)
|
73 |
+
x2_in = Input(shape=(self.maxlen,),dtype=tf.int32)
|
74 |
+
x3_in = Input(shape=(self.maxlen,),dtype=tf.int32)
|
75 |
+
features = self.encoder([x1_in,x2_in,x3_in])
|
76 |
+
#features = Dropout(0.4)(features)
|
77 |
+
features = TimeDistributed(Dense(128, activation='relu'), name='dense2')(features)
|
78 |
+
features= Dropout(0.1)(features)
|
79 |
+
output = TimeDistributed(Dense(self.rep.label_table_size, activation='softmax'), name='softmax')(features)
|
80 |
+
self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=output, name="hugface_softmax")
|
81 |
+
|
82 |
+
lr_schedule=LRSchedule_LINEAR(
|
83 |
+
init_lr=1e-5,
|
84 |
+
init_warmup_lr=1e-7,
|
85 |
+
final_lr=5e-6,
|
86 |
+
warmup_steps=0,
|
87 |
+
decay_steps=1000)
|
88 |
+
|
89 |
+
opt = Adam(learning_rate = lr_schedule)
|
90 |
+
#opt = Adam(lr=5e-6)
|
91 |
+
self.model.compile(
|
92 |
+
optimizer=opt,
|
93 |
+
loss='sparse_categorical_crossentropy',
|
94 |
+
metrics=['accuracy'],
|
95 |
+
)
|
96 |
+
self.model.summary()
|
97 |
+
|
98 |
+
|
99 |
+
def load_model(self,model_file):
|
100 |
+
self.model.load_weights(model_file)
|
101 |
+
self.model.summary()
|
102 |
+
print('load HUGFACE model done!')
|
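LRSchedule_LINEAR returns the minimum of a linear warmup line and a linear decay line clipped at final_lr; with warmup_steps=0 the warmup branch is a large constant (1000.0), so the schedule degenerates to pure linear decay. A plain-Python re-computation of the same formula, for the hyperparameters passed in build_softmax_decoder above, just to show the shape of the curve:

# Re-computes LRSchedule_LINEAR.__call__ without TensorFlow for a few steps.
init_lr, init_warmup_lr, final_lr = 1e-5, 1e-7, 5e-6
warmup_steps, decay_steps = 0, 1000

def lr_at(step):
    if warmup_steps > 0:
        warmup_lr = (init_lr - init_warmup_lr)/warmup_steps * step + init_warmup_lr
    else:
        warmup_lr = 1000.0  # effectively disables the warmup branch
    decay_lr = max(final_lr, init_lr - (step - warmup_steps)/decay_steps*(init_lr - final_lr))
    return min(warmup_lr, decay_lr)

for step in (0, 250, 500, 1000, 2000):
    print(step, lr_at(step))
# 0 -> 1e-05, 250 -> 8.75e-06, 500 -> 7.5e-06, 1000 -> 5e-06, then held at final_lr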
src_python/GeneNER/ner_tag.py
CHANGED
@@ -1,85 +1,85 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 8 11:01:23 2022

@author: luol2
"""


import io
import re
from src_python.GeneNER.processing_data_ner import ml_intext_fn, out_BIO_BERT_softmax_fn
from src_python.GeneNER.restore_index_ner import NN_restore_index_fn
import tensorflow as tf
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
if len(gpu) > 0:
    tf.config.experimental.set_memory_growth(gpu[0], True)

def pre_token(sentence):
    sentence = re.sub("([\W\-\_])", " \\1 ", sentence)
    sentence = re.sub("[ ]+", " ", sentence)
    return sentence

def ssplit_token_pos_lemma(in_text, text_level, nlp_token, max_len=400):
    #print('max_len:', max_len)
    fout = io.StringIO()

    in_text = in_text.strip()
    in_text = pre_token(in_text)
    doc_stanza = nlp_token(in_text)
    strlen = 0
    for sent in doc_stanza.sentences:
        for word in sent.words:
            strlen += 1
            if word.text.strip() == '':
                pass
                #print('!!!!blank token text!!!')
            else:
                fout.write(word.text + '\tO\n')
            if strlen >= max_len:
                #print('long sentence:', strlen)
                fout.write('\n')
                strlen = 0
        if text_level == 'SENT':
            fout.write('\n')
            strlen = 0
    if text_level == 'DOC':
        fout.write('\n')

    return fout.getvalue()

def ml_tagging(ml_input, nn_model):
    test_list = ml_intext_fn(ml_input)
    test_x, test_y, test_bert_text_label = nn_model.rep.load_data_hugface(test_list, word_max_len=nn_model.maxlen, label_type='softmax')
    test_pre = nn_model.model.predict(test_x, batch_size=64)
    test_decode_temp = out_BIO_BERT_softmax_fn(test_pre, test_bert_text_label, nn_model.rep.index_2_label)

    return test_decode_temp

# only machine learning-based method
def ML_Tag(text, ml_model, nlp_token, text_level='SENT'):
    # startTime = time.time()
    ssplit_token = ssplit_token_pos_lemma(text, text_level, nlp_token, max_len=ml_model.maxlen)
    #print(ssplit_token)
    # print('ssplit token:', time.time() - startTime)

    # startTime = time.time()
    ml_tsv = ml_tagging(ssplit_token, ml_model)
    #print(ml_tsv)
    # print('ml ner:', time.time() - startTime)

    final_result = NN_restore_index_fn(text, ml_tsv)

    # print('final ner:', time.time() - startTime)

    return final_result
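ML_Tag ties the pipeline together: stanza sentence-splits and tokenizes, ml_tagging runs the BERT softmax tagger over the BIO stream, and NN_restore_index_fn maps predictions back to character offsets. A hedged sketch of how it could be wired up; the checkpoint and label-file paths below are placeholders, not paths shipped with this repo, and the real driver script may differ:

# Hypothetical end-to-end usage of ML_Tag (all paths are placeholders).
import stanza
from src_python.GeneNER.model_ner import HUGFACE_NER
from src_python.GeneNER.ner_tag import ML_Tag

# Assumes the English stanza models were fetched once via stanza.download('en').
nlp_token = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=False)

model_files = {
    'checkpoint_path': 'path/to/pubmedbert_checkpoint',  # placeholder
    'labelfile': 'path/to/label_vocab.txt',              # placeholder
    'lowercase': True,
}
nn_model = HUGFACE_NER(model_files)
nn_model.build_encoder()
nn_model.build_softmax_decoder()
nn_model.load_model('path/to/trained_weights.h5')        # placeholder

text = 'BRCA1 mutations increase the risk of breast cancer.'
# Returns [start_offset, end_offset, entity_type] triples into the raw text.
print(ML_Tag(text, nn_model, nlp_token))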
src_python/GeneNER/processing_data_ner.py
CHANGED
@@ -1,210 +1,210 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 10 16:34:12 2020

@author: luol2
"""
import numpy as np
import io
import sys

# read ner text (word\tlabel), generate the list [[[w1,label],[w2,label]]]
def ml_intext(file):
    fin=open(file,'r',encoding='utf-8')
    alltexts=fin.read().strip().split('\n\n')
    fin.close()
    data_list=[]

    for sents in alltexts:
        lines=sents.split('\n')
        temp_sentece=[]
        for i in range(0,len(lines)):
            seg=lines[i].split('\t')
            temp_sentece.append(seg[:])
        data_list.append(temp_sentece)
    #print(data_list)
    #print(label_list)
    return data_list

def ml_intext_fn(ml_input):
    fin=io.StringIO(ml_input)
    alltexts=fin.read().strip().split('\n\n')
    fin.close()
    data_list=[]

    for sents in alltexts:
        lines=sents.split('\n')
        temp_sentece=[]
        for i in range(0,len(lines)):
            seg=lines[i].split('\t')
            temp_sentece.append(seg[:])
        data_list.append(temp_sentece)
    #print(data_list)
    #print(label_list)
    return data_list

# model predict result to conll evaluate format [token answer predict]
def out_BIO_crf(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = raw_pre[i][j]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()

def out_BIO_crf_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = raw_pre[i][j]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()

def out_BIO_softmax(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    #print(raw_pre[0:2])
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = np.argmax(raw_pre[i][j])
                #print(label_id)
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()

def out_BIO_softmax_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    #print(raw_pre[0:2])
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = np.argmax(raw_pre[i][j])
                #print(label_id)
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()

def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if raw_input[i][j][-1]<len(raw_pre[i]):
                # label_id = raw_pre[i][j]
                label_id = np.argmax(raw_pre[i][raw_input[i][j][-1]])
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()

def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if raw_input[i][j][-1]<len(raw_pre[i]):
                #label_id = raw_pre[i][j]
                label_id = np.argmax(raw_pre[i][raw_input[i][j][-1]])
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()

def out_BIO_BERT_crf(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if raw_input[i][j][-1]<len(raw_pre[i]):
                label_id = raw_pre[i][raw_input[i][j][-1]]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()

def out_BIO_BERT_crf_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if raw_input[i][j][-1]<len(raw_pre[i]):
                label_id = raw_pre[i][raw_input[i][j][-1]]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()

def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):
        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                #label_id = raw_pre[i][j]
                label_id = np.argmax(raw_pre[i][j])
                label_score = round(raw_pre[i][j][label_id],4)
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
                label_score = 0.0
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
        fout.write('\n')
    return fout.getvalue()

# generate char vocab
def char_vocab(infile,outfile_char):
    fin=open(infile,'r',encoding='utf-8')
    #fout=open(outfile,'w',encoding='utf-8')
    fout_char=open(outfile_char,'w',encoding='utf-8')
    char_vocab=['oov_char']
    max_len=0
    for line in fin:
        if line.strip()!='':
            seg=line.split('\t')
            word_len=len(seg[0])
            #if word_len<1000:
            #    fout.write(line)
            if word_len>max_len:
                max_len=word_len
                print(seg[0])
            for i in range(word_len):
                if seg[0][i] not in char_vocab:
                    char_vocab.append(seg[0][i])
            #else:
            #    fout.write(line)
    fin.close()
    #fout.close()
    for ele in char_vocab:
        fout_char.write(ele+'\n')
    fout_char.close()
    print('max_len:',max_len)


if __name__=='__main__':
    # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
    # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
    # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
    # #processing_text(file)
    # char_vocab(infile,outfile_char)
    a=[1,2,3]
    print(a[:-1])
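The readers above expect blank-line-separated sentences, one token\tlabel pair per line. A tiny demo of that input format and the nested list ml_intext_fn produces from it:

# Demo of the BIO text format consumed by ml_intext_fn.
from src_python.GeneNER.processing_data_ner import ml_intext_fn

bio = 'BRCA1\tB-Gene\nmutations\tO\n\np53\tB-Gene\n'
print(ml_intext_fn(bio))
# [[['BRCA1', 'B-Gene'], ['mutations', 'O']], [['p53', 'B-Gene']]]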
src_python/GeneNER/represent_ner.py
CHANGED
@@ -1,183 +1,183 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 30 19:54:17 2021

@author: luol2
"""


import os, sys
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer


class Hugface_RepresentationLayer(object):

    def __init__(self, tokenizer_name_or_path, label_file, lowercase=True):

        # load vocab
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, do_lower_case=lowercase)
        self.label_2_index={}
        self.index_2_label={}
        self.label_table_size=0
        self.load_label_vocab(label_file,self.label_2_index,self.index_2_label)
        self.label_table_size=len(self.label_2_index)
        self.vocab_len=len(self.tokenizer)

    def load_label_vocab(self,fea_file,fea_index,index_2_label):
        fin=open(fea_file,'r',encoding='utf-8')
        all_text=fin.read().strip().split('\n')
        fin.close()
        for i in range(0,len(all_text)):
            fea_index[all_text[i]]=i
            index_2_label[str(i)]=all_text[i]

    def generate_label_list(self,ori_tokens,labels,word_index): # every subtoken gets the same label as the word's first subtoken
        label_list=['O']*len(word_index)

        label_list_index=[]
        old_new_token_map=[]
        ori_i=0
        for i in range(0,len(word_index)):
            if word_index[i]==None:
                label_list_index.append(self.label_2_index[label_list[i]])
            else:
                label_list[i]=labels[word_index[i]]
                label_list_index.append(self.label_2_index[label_list[i]])
                if word_index[i]==ori_i:
                    old_new_token_map.append(i)
                    ori_i+=1

        bert_text_label=[]
        for i in range(0,len(ori_tokens)):
            bert_text_label.append([ori_tokens[i],labels[i],old_new_token_map[i]])

        return label_list_index,bert_text_label

    def generate_label_list_B(self,ori_tokens,labels,word_index): # only the first subtoken keeps B-, the trailing subtokens get I-
        label_list=['O']*len(word_index)

        label_list_index=[]
        old_new_token_map=[]
        ori_i=0
        first_index=-1
        i=0
        while i<len(word_index):
            if word_index[i]==None:
                label_list_index.append(self.label_2_index[label_list[i]])
                i+=1
            else:
                first_index=word_index[i]
                if first_index==ori_i:
                    old_new_token_map.append(i)
                    ori_i+=1
                label_list[i]=labels[word_index[i]]
                label_list_index.append(self.label_2_index[label_list[i]])
                i+=1
                while word_index[i]==first_index and word_index[i]!=None:
                    #print(first_index)
                    if labels[first_index].startswith("B-"):
                        label_list[i]='I-'+labels[first_index][2:]
                        label_list_index.append(self.label_2_index[label_list[i]])
                    else:
                        label_list[i]=labels[word_index[i]]
                        label_list_index.append(self.label_2_index[label_list[i]])
                    i+=1

        bert_text_label=[]
        #print(len(old_new_token_map))
        for i in range(0,len(ori_tokens)):
            if i<len(old_new_token_map):
                bert_text_label.append([ori_tokens[i],labels[i],old_new_token_map[i]])
            else: # tokens beyond max len
                break
        return label_list_index,bert_text_label

    def load_data_hugface(self,instances, word_max_len=100, label_type='softmax'):
        x_index=[]
        x_seg=[]
        x_mask=[]
        y_list=[]
        bert_text_labels=[]
        max_len=0
        over_num=0
        maxT=word_max_len
        ave_len=0

        #print('instances:', instances)
        #print('labels:', labels)

        for sentence in instances:
            sentence_text_list=[]
            label_list=[]
            for j in range(0,len(sentence)):
                sentence_text_list.append(sentence[j][0])
                label_list.append(sentence[j][-1])

            token_result=self.tokenizer(
                sentence_text_list,
                max_length=word_max_len,
                truncation=True,is_split_into_words=True)

            bert_tokens=self.tokenizer.convert_ids_to_tokens(token_result['input_ids'])
            word_index=token_result.word_ids(batch_index=0)
            ave_len+=len(bert_tokens)
            if len(sentence_text_list)>max_len:
                max_len=len(sentence_text_list)
            if len(bert_tokens)==maxT:
                over_num+=1

            x_index.append(token_result['input_ids'])
            x_seg.append(token_result['token_type_ids'])
            x_mask.append(token_result['attention_mask'])

            #print('\nsentence_text_list:',len(sentence_text_list),sentence_text_list)
            #print('\nlabel:',len(label_list),label_list)
            #print('\nword_index:',len(word_index),word_index)
            #print('\nbert_tokens:',len(bert_tokens),bert_tokens)
            label_list,bert_text_label=self.generate_label_list_B(sentence_text_list,label_list,word_index) # the label list after bert tokenization; ori token/label/new index
            #print('\nlabel list:',len(label_list),label_list)
            #print('\nbert_text_label:',len(bert_text_label),bert_text_label)
            #sys.exit()
            y_list.append(label_list)
            #print(y_list)
            bert_text_labels.append(bert_text_label)

        x1_np = pad_sequences(x_index, word_max_len, value=0, padding='post',truncating='post') # right padding
        x2_np = pad_sequences(x_seg, word_max_len, value=0, padding='post',truncating='post')
        x3_np = pad_sequences(x_mask, word_max_len, value=0, padding='post',truncating='post')
        y_np = pad_sequences(y_list, word_max_len, value=0, padding='post',truncating='post')
        #print('x1_np:',x1_np)
        #print('\nx2_np:',x2_np)
        #print('\ny_np:',y_np)
        #print('\nbert_text:',bert_text_labels)
        # print('bert max len:',max_len,',Over',maxT,':',over_num,'ave len:',ave_len/len(instances),'total:',len(instances))

        if label_type=='softmax':
            y_np = np.expand_dims(y_np, 2)
        elif label_type=='crf':
            pass

        return [x1_np, x2_np, x3_np], y_np, bert_text_labels


if __name__ == '__main__':
    pass
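The key step in generate_label_list_B is projecting word-level labels onto subtokens: a word tagged B-X keeps B-X on its first subtoken and gets I-X on the rest, while special tokens stay O. A standalone illustration of that rule, written against a hand-made word_index so it runs without any tokenizer or checkpoint (the example tokens and labels are invented):

# Illustrates the 'first subtoken keeps B-, trailing subtokens get I-' rule.
# word_index mimics word_ids() from a fast tokenizer: None for [CLS]/[SEP],
# otherwise the index of the original word each subtoken came from.
labels = ['B-Gene', 'O']               # word-level labels for ['BRCA1', 'mutations']
word_index = [None, 0, 0, 0, 1, None]  # e.g. [CLS] br ##ca ##1 mutations [SEP]

aligned = []
for pos, w in enumerate(word_index):
    if w is None:
        aligned.append('O')                               # special tokens
    elif pos > 0 and word_index[pos - 1] == w and labels[w].startswith('B-'):
        aligned.append('I-' + labels[w][2:])              # trailing subtoken
    else:
        aligned.append(labels[w])                         # first subtoken of the word
print(aligned)  # ['O', 'B-Gene', 'I-Gene', 'I-Gene', 'O', 'O']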
src_python/GeneNER/restore_index_ner.py
CHANGED
@@ -1,447 +1,447 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
"""
|
3 |
-
Created on Fri Mar 5 10:40:08 2021
|
4 |
-
|
5 |
-
@author: luol2
|
6 |
-
"""
|
7 |
-
|
8 |
-
# -*- coding: utf-8 -*-
|
9 |
-
"""
|
10 |
-
Created on Sun Jun 14 17:19:02 2020
|
11 |
-
|
12 |
-
@author: luol2
|
13 |
-
"""
|
14 |
-
|
15 |
-
import io
|
16 |
-
import sys
|
17 |
-
|
18 |
-
# from BIO format to entity,list line is sentence, follwing the entity(start, end, text, entity, type)
|
19 |
-
def NN_BIO_tag_entity(pre_BIO):
|
20 |
-
sentences=pre_BIO.strip().split('\n\n')
|
21 |
-
|
22 |
-
pre_result=[]
|
23 |
-
#print(sentences)
|
24 |
-
for sent in sentences:
|
25 |
-
tokens=sent.split('\n')
|
26 |
-
pre_entity=[]
|
27 |
-
pre_start,pre_end=0,0
|
28 |
-
sent_text=''
|
29 |
-
for i in range(0,len(tokens)):
|
30 |
-
segs=tokens[i].split('\t')
|
31 |
-
sent_text+=segs[0]+' '
|
32 |
-
if len(segs)<3:
|
33 |
-
continue
|
34 |
-
#print(tokens)
|
35 |
-
# generate prediction entity
|
36 |
-
if segs[2].startswith('B-')>0:
|
37 |
-
pre_start=i
|
38 |
-
pre_type=segs[2][2:]
|
39 |
-
if i+1>=len(tokens): # the last word
|
40 |
-
pre_end=i
|
41 |
-
pre_entity.append([pre_start,pre_end,pre_type])
|
42 |
-
else: # non last word
|
43 |
-
next_seg=tokens[i+1].split('\t')
|
44 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
45 |
-
pre_end=i
|
46 |
-
pre_entity.append([pre_start,pre_end,pre_type])
|
47 |
-
elif next_seg[2].startswith('I-')>0:
|
48 |
-
pass
|
49 |
-
elif segs[2].startswith('I-')>0:
|
50 |
-
if i==0 and i+1<len(tokens): # the first word and not only a word
|
51 |
-
pre_start=i
|
52 |
-
pre_type=segs[2][2:]
|
53 |
-
next_seg=tokens[i+1].split('\t')
|
54 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
55 |
-
pre_end=i
|
56 |
-
pre_entity.append([pre_start,pre_end,pre_type])
|
57 |
-
elif next_seg[2].startswith('I-')>0:
|
58 |
-
pass
|
59 |
-
elif i==0 and i+1==len(tokens):# only one word:
|
60 |
-
pre_start=i
|
61 |
-
pre_type=segs[2][2:]
|
62 |
-
pre_end=i
|
63 |
-
pre_entity.append([pre_start,pre_end,pre_type])
|
64 |
-
elif i+1>=len(tokens): # the last word
|
65 |
-
last_seg=tokens[i-1].split('\t')
|
66 |
-
if last_seg[2]=='O':
|
67 |
-
pre_start=i
|
68 |
-
pre_type=segs[2][2:]
|
69 |
-
pre_end=i
|
70 |
-
pre_entity.append([pre_start,pre_end,pre_type])
|
71 |
-
elif i+1< len(tokens): # non last word
|
72 |
-
next_seg=tokens[i+1].split('\t')
|
73 |
-
last_seg=tokens[i-1].split('\t')
|
74 |
-
if last_seg[2]=='O':
|
75 |
-
pre_start=i
|
76 |
-
pre_type=segs[2][2:]
|
77 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
78 |
-
pre_end=i
|
79 |
-
pre_entity.append([pre_start,pre_end,pre_type])
|
80 |
-
elif next_seg[2].startswith('I-')>0:
|
81 |
-
pass
|
82 |
-
elif segs[2]=='O':
|
83 |
-
pass
|
84 |
-
pre_result.append([sent_text.rstrip(),pre_entity])
|
85 |
-
|
86 |
-
|
87 |
-
# print(pre_entity)
|
88 |
-
return pre_result
|
89 |
-
|
90 |
-
def NN_restore_index_fn(ori_text,file_pre):
|
91 |
-
|
92 |
-
input_result=NN_BIO_tag_entity(file_pre)
|
93 |
-
#print(input_result)
|
94 |
-
|
95 |
-
|
96 |
-
new_sentence=''
|
97 |
-
restore_result=[]
|
98 |
-
|
99 |
-
sentence_ori=ori_text.lower()
|
100 |
-
|
101 |
-
for sent_ele in input_result:
|
102 |
-
|
103 |
-
#print(pre_lines)
|
104 |
-
# print(sentence_ori)
|
105 |
-
if len(sent_ele[1])>0:
|
106 |
-
#print(pre_lines)
|
107 |
-
sentence_pre=sent_ele[0].lower()
|
108 |
-
sentence_pre=sentence_pre.split()
|
109 |
-
|
110 |
-
pre_result=sent_ele[1]
|
111 |
-
|
112 |
-
|
113 |
-
restore_sid=0
|
114 |
-
restore_eid=0
|
115 |
-
each_word_id=[]
|
116 |
-
|
117 |
-
for i in range(0,len(sentence_pre)):
|
118 |
-
|
119 |
-
temp_id=sentence_ori.find(sentence_pre[i])
|
120 |
-
if temp_id<0:
|
121 |
-
#print('ori:',sentence_ori)
|
122 |
-
print('resotr index error:',sentence_pre[i])
|
123 |
-
new_sentence+=sentence_ori[0:temp_id]
|
124 |
-
|
125 |
-
restore_sid=len(new_sentence)
|
126 |
-
restore_eid=len(new_sentence)+len(sentence_pre[i])
|
127 |
-
each_word_id.append([str(restore_sid),str(restore_eid)])
|
128 |
-
new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
|
129 |
-
sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
|
130 |
-
# print('each_word:',each_word_id)
|
131 |
-
for pre_ele in pre_result:
|
132 |
-
temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
|
133 |
-
if temp_pre_result not in restore_result:
|
134 |
-
restore_result.append(temp_pre_result)
|
135 |
-
else:
|
136 |
-
sentence_pre=sent_ele[0].lower()
|
137 |
-
sentence_pre=sentence_pre.split()
|
138 |
-
|
139 |
-
for i in range(0,len(sentence_pre)):
|
140 |
-
|
141 |
-
temp_id=sentence_ori.find(sentence_pre[i])
|
142 |
-
if temp_id<0:
|
143 |
-
print('resotr index error:',sentence_pre[i])
|
144 |
-
new_sentence+=sentence_ori[0:temp_id]
|
145 |
-
new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
|
146 |
-
sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
|
147 |
-
#print('resotre:',restore_result)
|
148 |
-
return restore_result
|
149 |
-
|
150 |
-
def BERT_BIO_tag_entity(pre_BIO):
|
151 |
-
sentences=pre_BIO.strip().split('\n\n')
|
152 |
-
|
153 |
-
pre_result=[]
|
154 |
-
for sent in sentences:
|
155 |
-
tokens=sent.split('\n')
|
156 |
-
pre_entity=[]
|
157 |
-
pre_start,pre_end=0,0
|
158 |
-
sent_text=''
|
159 |
-
for i in range(1,len(tokens)-1):
|
160 |
-
segs=tokens[i].split('\t')
|
161 |
-
sent_text+=segs[0]+' '
|
162 |
-
# generate prediction entity
|
163 |
-
if segs[2].startswith('B-')>0:
|
164 |
-
pre_start=i
|
165 |
-
pre_type=segs[2][2:]
|
166 |
-
if i+1>=len(tokens): # the last word
|
167 |
-
pre_end=i
|
168 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
169 |
-
else: # non last word
|
170 |
-
next_seg=tokens[i+1].split('\t')
|
171 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
172 |
-
pre_end=i
|
173 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
174 |
-
elif next_seg[2].startswith('I-')>0:
|
175 |
-
pass
|
176 |
-
elif segs[2].startswith('I-')>0:
|
177 |
-
if i==0 and i+1<len(tokens): # the first word and not only a word
|
178 |
-
pre_start=i
|
179 |
-
pre_type=segs[2][2:]
|
180 |
-
next_seg=tokens[i+1].split('\t')
|
181 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
182 |
-
pre_end=i
|
183 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
184 |
-
elif next_seg[2].startswith('I-')>0:
|
185 |
-
pass
|
186 |
-
elif i==0 and i+1==len(tokens):# only one word:
|
187 |
-
pre_start=i
|
188 |
-
pre_type=segs[2][2:]
|
189 |
-
pre_end=i
|
190 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
191 |
-
elif i+1>=len(tokens): # the last word
|
192 |
-
last_seg=tokens[i-1].split('\t')
|
193 |
-
if last_seg[2]=='O':
|
194 |
-
pre_start=i
|
195 |
-
pre_type=segs[2][2:]
|
196 |
-
pre_end=i
|
197 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
198 |
-
elif i+1< len(tokens): # non last word
|
199 |
-
next_seg=tokens[i+1].split('\t')
|
200 |
-
last_seg=tokens[i-1].split('\t')
|
201 |
-
if last_seg[2]=='O':
|
202 |
-
pre_start=i
|
203 |
-
pre_type=segs[2][2:]
|
204 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
205 |
-
pre_end=i
|
206 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
207 |
-
elif next_seg[2].startswith('I-')>0:
|
208 |
-
pass
|
209 |
-
elif segs[2]=='O':
|
210 |
-
pass
|
211 |
-
pre_result.append([sent_text.rstrip(),pre_entity])
|
212 |
-
|
213 |
-
|
214 |
-
#print(pre_result)
|
215 |
-
return pre_result
|
216 |
-
|
217 |
-
def BERT_BIO_tag_entity_revised(pre_BIO):
|
218 |
-
print('revised version')
|
219 |
-
sentences=pre_BIO.strip().split('\n\n')
|
220 |
-
|
221 |
-
pre_result=[]
|
222 |
-
for sent in sentences:
|
223 |
-
tokens=sent.split('\n')
|
224 |
-
pre_entity=[]
|
225 |
-
pre_start,pre_end=0,0
|
226 |
-
sent_text=''
|
227 |
-
for i in range(1,len(tokens)-1):
|
228 |
-
segs=tokens[i].split('\t')
|
229 |
-
sent_text+=segs[0]+' '
|
230 |
-
# generate prediction entity
|
231 |
-
if segs[2].startswith('B-')>0:
|
232 |
-
pre_start=i
|
233 |
-
pre_type=segs[2][2:]
|
234 |
-
if i+1>=len(tokens)-1: # the last word
|
235 |
-
pre_end=i
|
236 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
237 |
-
else: # non last word
|
238 |
-
next_seg=tokens[i+1].split('\t')
|
239 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
240 |
-
pre_end=i
|
241 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
242 |
-
elif next_seg[2].startswith('I-')>0:
|
243 |
-
pass
|
244 |
-
elif segs[2].startswith('I-')>0:
|
245 |
-
if i==1 and i+1<len(tokens)-1: # the first word and not only a word
|
246 |
-
pre_start=i
|
247 |
-
pre_type=segs[2][2:]
|
248 |
-
next_seg=tokens[i+1].split('\t')
|
249 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
250 |
-
pre_end=i
|
251 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
252 |
-
elif next_seg[2].startswith('I-')>0:
|
253 |
-
pass
|
254 |
-
elif i==1 and i+1==len(tokens)-1:# only one word:
|
255 |
-
pre_start=i
|
256 |
-
pre_type=segs[2][2:]
|
257 |
-
pre_end=i
|
258 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
259 |
-
elif i+1>=len(tokens)-1: # the last word
|
260 |
-
last_seg=tokens[i-1].split('\t')
|
261 |
-
if last_seg[2]=='O':
|
262 |
-
pre_start=i
|
263 |
-
pre_type=segs[2][2:]
|
264 |
-
pre_end=i
|
265 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
266 |
-
elif i+1< len(tokens)-1: # non last word
|
267 |
-
next_seg=tokens[i+1].split('\t')
|
268 |
-
last_seg=tokens[i-1].split('\t')
|
269 |
-
if last_seg[2]=='O':
|
270 |
-
pre_start=i
|
271 |
-
pre_type=segs[2][2:]
|
272 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
273 |
-
pre_end=i
|
274 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
275 |
-
elif next_seg[2].startswith('I-')>0:
|
276 |
-
pass
|
277 |
-
elif segs[2]=='O':
|
278 |
-
pass
|
279 |
-
pre_result.append([sent_text.rstrip(),pre_entity])
|
280 |
-
|
281 |
-
|
282 |
-
#print(pre_result)
|
283 |
-
return pre_result
|
284 |
-
|
285 |
-
# only predict on the first token of the ori word
|
286 |
-
def BERT_BIO_tag_entity_word(pre_BIO):
|
287 |
-
sentences=pre_BIO.strip().split('\n\n')
|
288 |
-
|
289 |
-
pre_result=[]
|
290 |
-
for sent in sentences:
|
291 |
-
tokens=sent.split('\n')
|
292 |
-
pre_entity=[]
|
293 |
-
pre_start,pre_end=0,0
|
294 |
-
sent_text=''
|
295 |
-
i=1
|
296 |
-
while i< len(tokens)-1:
|
297 |
-
# for i in range(1,len(tokens)-1):
|
298 |
-
segs=tokens[i].split('\t')
|
299 |
-
sent_text+=segs[0]+' '
|
300 |
-
# generate prediction entity
|
301 |
-
if segs[2].startswith('B-')>0:
|
302 |
-
pre_start=i
|
303 |
-
pre_type=segs[2][2:]
|
304 |
-
if i+1>=len(tokens)-1: # the last word
|
305 |
-
pre_end=i
|
306 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
307 |
-
else: # non last word
|
308 |
-
#pass a word
|
309 |
-
sub_segs=tokens[i+1].split('\t')
|
310 |
-
while(sub_segs[0].find('##')==0):
|
311 |
-
i+=1
|
312 |
-
sent_text+=sub_segs[0]+' '
|
313 |
-
sub_segs=tokens[i+1].split('\t')
|
314 |
-
|
315 |
-
|
316 |
-
next_seg=tokens[i+1].split('\t')
|
317 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
318 |
-
pre_end=i
|
319 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
320 |
-
elif next_seg[2].startswith('I-')>0:
|
321 |
-
pass
|
322 |
-
elif segs[2].startswith('I-')>0:
|
323 |
-
if i==1 and i+1<len(tokens)-1: # the first word and not only a word
|
324 |
-
pre_start=i
|
325 |
-
pre_type=segs[2][2:]
|
326 |
-
#pass a word
|
327 |
-
sub_segs=tokens[i+1].split('\t')
|
328 |
-
while(sub_segs[0].find('##')==0):
|
329 |
-
i+=1
|
330 |
-
sent_text+=sub_segs[0]+' '
|
331 |
-
sub_segs=tokens[i+1].split('\t')
|
332 |
-
|
333 |
-
next_seg=tokens[i+1].split('\t')
|
334 |
-
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
335 |
-
pre_end=i
|
336 |
-
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
337 |
-
elif next_seg[2].startswith('I-')>0:
|
338 |
-
pass
|
339 |
-
elif i==1 and i+1==len(tokens)-1:# only one word:
|
340 |
-
pre_start=i
|
341 |
-
pre_type=segs[2][2:]
|
342 |
-
pre_end=i
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Fri Mar 5 10:40:08 2021
|
4 |
+
|
5 |
+
@author: luol2
|
6 |
+
"""
|
7 |
+
|
8 |
+
# -*- coding: utf-8 -*-
|
9 |
+
"""
|
10 |
+
Created on Sun Jun 14 17:19:02 2020
|
11 |
+
|
12 |
+
@author: luol2
|
13 |
+
"""
|
14 |
+
|
15 |
+
import io
|
16 |
+
import sys
|
17 |
+
|
18 |
+
# from BIO format to entities: returns a list of [sentence, entities], where each entity is [token_start, token_end, type]
|
19 |
+
def NN_BIO_tag_entity(pre_BIO):
|
20 |
+
sentences=pre_BIO.strip().split('\n\n')
|
21 |
+
|
22 |
+
pre_result=[]
|
23 |
+
#print(sentences)
|
24 |
+
for sent in sentences:
|
25 |
+
tokens=sent.split('\n')
|
26 |
+
pre_entity=[]
|
27 |
+
pre_start,pre_end=0,0
|
28 |
+
sent_text=''
|
29 |
+
for i in range(0,len(tokens)):
|
30 |
+
segs=tokens[i].split('\t')
|
31 |
+
sent_text+=segs[0]+' '
|
32 |
+
if len(segs)<3:
|
33 |
+
continue
|
34 |
+
#print(tokens)
|
35 |
+
# generate prediction entity
|
36 |
+
if segs[2].startswith('B-')>0:
|
37 |
+
pre_start=i
|
38 |
+
pre_type=segs[2][2:]
|
39 |
+
if i+1>=len(tokens): # the last word
|
40 |
+
pre_end=i
|
41 |
+
pre_entity.append([pre_start,pre_end,pre_type])
|
42 |
+
else: # non last word
|
43 |
+
next_seg=tokens[i+1].split('\t')
|
44 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
45 |
+
pre_end=i
|
46 |
+
pre_entity.append([pre_start,pre_end,pre_type])
|
47 |
+
elif next_seg[2].startswith('I-')>0:
|
48 |
+
pass
|
49 |
+
elif segs[2].startswith('I-')>0:
|
50 |
+
if i==0 and i+1<len(tokens): # the first word and not only a word
|
51 |
+
pre_start=i
|
52 |
+
pre_type=segs[2][2:]
|
53 |
+
next_seg=tokens[i+1].split('\t')
|
54 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
55 |
+
pre_end=i
|
56 |
+
pre_entity.append([pre_start,pre_end,pre_type])
|
57 |
+
elif next_seg[2].startswith('I-')>0:
|
58 |
+
pass
|
59 |
+
elif i==0 and i+1==len(tokens):# only one word:
|
60 |
+
pre_start=i
|
61 |
+
pre_type=segs[2][2:]
|
62 |
+
pre_end=i
|
63 |
+
pre_entity.append([pre_start,pre_end,pre_type])
|
64 |
+
elif i+1>=len(tokens): # the last word
|
65 |
+
last_seg=tokens[i-1].split('\t')
|
66 |
+
if last_seg[2]=='O':
|
67 |
+
pre_start=i
|
68 |
+
pre_type=segs[2][2:]
|
69 |
+
pre_end=i
|
70 |
+
pre_entity.append([pre_start,pre_end,pre_type])
|
71 |
+
elif i+1< len(tokens): # non last word
|
72 |
+
next_seg=tokens[i+1].split('\t')
|
73 |
+
last_seg=tokens[i-1].split('\t')
|
74 |
+
if last_seg[2]=='O':
|
75 |
+
pre_start=i
|
76 |
+
pre_type=segs[2][2:]
|
77 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
78 |
+
pre_end=i
|
79 |
+
pre_entity.append([pre_start,pre_end,pre_type])
|
80 |
+
elif next_seg[2].startswith('I-')>0:
|
81 |
+
pass
|
82 |
+
elif segs[2]=='O':
|
83 |
+
pass
|
84 |
+
pre_result.append([sent_text.rstrip(),pre_entity])
|
85 |
+
|
86 |
+
|
87 |
+
# print(pre_entity)
|
88 |
+
return pre_result
|
89 |
+
|
90 |
+
def NN_restore_index_fn(ori_text,file_pre):
|
91 |
+
|
92 |
+
input_result=NN_BIO_tag_entity(file_pre)
|
93 |
+
#print(input_result)
|
94 |
+
|
95 |
+
|
96 |
+
new_sentence=''
|
97 |
+
restore_result=[]
|
98 |
+
|
99 |
+
sentence_ori=ori_text.lower()
|
100 |
+
|
101 |
+
for sent_ele in input_result:
|
102 |
+
|
103 |
+
#print(pre_lines)
|
104 |
+
# print(sentence_ori)
|
105 |
+
if len(sent_ele[1])>0:
|
106 |
+
#print(pre_lines)
|
107 |
+
sentence_pre=sent_ele[0].lower()
|
108 |
+
sentence_pre=sentence_pre.split()
|
109 |
+
|
110 |
+
pre_result=sent_ele[1]
|
111 |
+
|
112 |
+
|
113 |
+
restore_sid=0
|
114 |
+
restore_eid=0
|
115 |
+
each_word_id=[]
|
116 |
+
|
117 |
+
for i in range(0,len(sentence_pre)):
|
118 |
+
|
119 |
+
temp_id=sentence_ori.find(sentence_pre[i])
|
120 |
+
if temp_id<0:
|
121 |
+
#print('ori:',sentence_ori)
|
122 |
+
print('restore index error:',sentence_pre[i])
|
123 |
+
new_sentence+=sentence_ori[0:temp_id]
|
124 |
+
|
125 |
+
restore_sid=len(new_sentence)
|
126 |
+
restore_eid=len(new_sentence)+len(sentence_pre[i])
|
127 |
+
each_word_id.append([str(restore_sid),str(restore_eid)])
|
128 |
+
new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
|
129 |
+
sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
|
130 |
+
# print('each_word:',each_word_id)
|
131 |
+
for pre_ele in pre_result:
|
132 |
+
temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
|
133 |
+
if temp_pre_result not in restore_result:
|
134 |
+
restore_result.append(temp_pre_result)
|
135 |
+
else:
|
136 |
+
sentence_pre=sent_ele[0].lower()
|
137 |
+
sentence_pre=sentence_pre.split()
|
138 |
+
|
139 |
+
for i in range(0,len(sentence_pre)):
|
140 |
+
|
141 |
+
temp_id=sentence_ori.find(sentence_pre[i])
|
142 |
+
if temp_id<0:
|
143 |
+
print('restore index error:',sentence_pre[i])
|
144 |
+
new_sentence+=sentence_ori[0:temp_id]
|
145 |
+
new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
|
146 |
+
sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
|
147 |
+
#print('restore:',restore_result)
|
148 |
+
return restore_result
|
149 |
+
|
150 |
+
def BERT_BIO_tag_entity(pre_BIO):
|
151 |
+
sentences=pre_BIO.strip().split('\n\n')
|
152 |
+
|
153 |
+
pre_result=[]
|
154 |
+
for sent in sentences:
|
155 |
+
tokens=sent.split('\n')
|
156 |
+
pre_entity=[]
|
157 |
+
pre_start,pre_end=0,0
|
158 |
+
sent_text=''
|
159 |
+
for i in range(1,len(tokens)-1):
|
160 |
+
segs=tokens[i].split('\t')
|
161 |
+
sent_text+=segs[0]+' '
|
162 |
+
# generate prediction entity
|
163 |
+
if segs[2].startswith('B-')>0:
|
164 |
+
pre_start=i
|
165 |
+
pre_type=segs[2][2:]
|
166 |
+
if i+1>=len(tokens): # the last word
|
167 |
+
pre_end=i
|
168 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
169 |
+
else: # non last word
|
170 |
+
next_seg=tokens[i+1].split('\t')
|
171 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
172 |
+
pre_end=i
|
173 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
174 |
+
elif next_seg[2].startswith('I-')>0:
|
175 |
+
pass
|
176 |
+
elif segs[2].startswith('I-')>0:
|
177 |
+
if i==0 and i+1<len(tokens): # the first word and not only a word
|
178 |
+
pre_start=i
|
179 |
+
pre_type=segs[2][2:]
|
180 |
+
next_seg=tokens[i+1].split('\t')
|
181 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
182 |
+
pre_end=i
|
183 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
184 |
+
elif next_seg[2].startswith('I-')>0:
|
185 |
+
pass
|
186 |
+
elif i==0 and i+1==len(tokens):# only one word:
|
187 |
+
pre_start=i
|
188 |
+
pre_type=segs[2][2:]
|
189 |
+
pre_end=i
|
190 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
191 |
+
elif i+1>=len(tokens): # the last word
|
192 |
+
last_seg=tokens[i-1].split('\t')
|
193 |
+
if last_seg[2]=='O':
|
194 |
+
pre_start=i
|
195 |
+
pre_type=segs[2][2:]
|
196 |
+
pre_end=i
|
197 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
198 |
+
elif i+1< len(tokens): # non last word
|
199 |
+
next_seg=tokens[i+1].split('\t')
|
200 |
+
last_seg=tokens[i-1].split('\t')
|
201 |
+
if last_seg[2]=='O':
|
202 |
+
pre_start=i
|
203 |
+
pre_type=segs[2][2:]
|
204 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
205 |
+
pre_end=i
|
206 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
207 |
+
elif next_seg[2].startswith('I-')>0:
|
208 |
+
pass
|
209 |
+
elif segs[2]=='O':
|
210 |
+
pass
|
211 |
+
pre_result.append([sent_text.rstrip(),pre_entity])
|
212 |
+
|
213 |
+
|
214 |
+
#print(pre_result)
|
215 |
+
return pre_result
|
216 |
+
|
217 |
+
def BERT_BIO_tag_entity_revised(pre_BIO):
|
218 |
+
print('revised version')
|
219 |
+
sentences=pre_BIO.strip().split('\n\n')
|
220 |
+
|
221 |
+
pre_result=[]
|
222 |
+
for sent in sentences:
|
223 |
+
tokens=sent.split('\n')
|
224 |
+
pre_entity=[]
|
225 |
+
pre_start,pre_end=0,0
|
226 |
+
sent_text=''
|
227 |
+
for i in range(1,len(tokens)-1):
|
228 |
+
segs=tokens[i].split('\t')
|
229 |
+
sent_text+=segs[0]+' '
|
230 |
+
# generate prediction entity
|
231 |
+
if segs[2].startswith('B-')>0:
|
232 |
+
pre_start=i
|
233 |
+
pre_type=segs[2][2:]
|
234 |
+
if i+1>=len(tokens)-1: # the last word
|
235 |
+
pre_end=i
|
236 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
237 |
+
else: # non last word
|
238 |
+
next_seg=tokens[i+1].split('\t')
|
239 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
240 |
+
pre_end=i
|
241 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
242 |
+
elif next_seg[2].startswith('I-')>0:
|
243 |
+
pass
|
244 |
+
elif segs[2].startswith('I-')>0:
|
245 |
+
if i==1 and i+1<len(tokens)-1: # the first word and not only a word
|
246 |
+
pre_start=i
|
247 |
+
pre_type=segs[2][2:]
|
248 |
+
next_seg=tokens[i+1].split('\t')
|
249 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
250 |
+
pre_end=i
|
251 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
252 |
+
elif next_seg[2].startswith('I-')>0:
|
253 |
+
pass
|
254 |
+
elif i==1 and i+1==len(tokens)-1:# only one word:
|
255 |
+
pre_start=i
|
256 |
+
pre_type=segs[2][2:]
|
257 |
+
pre_end=i
|
258 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
259 |
+
elif i+1>=len(tokens)-1: # the last word
|
260 |
+
last_seg=tokens[i-1].split('\t')
|
261 |
+
if last_seg[2]=='O':
|
262 |
+
pre_start=i
|
263 |
+
pre_type=segs[2][2:]
|
264 |
+
pre_end=i
|
265 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
266 |
+
elif i+1< len(tokens)-1: # non last word
|
267 |
+
next_seg=tokens[i+1].split('\t')
|
268 |
+
last_seg=tokens[i-1].split('\t')
|
269 |
+
if last_seg[2]=='O':
|
270 |
+
pre_start=i
|
271 |
+
pre_type=segs[2][2:]
|
272 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
273 |
+
pre_end=i
|
274 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
275 |
+
elif next_seg[2].startswith('I-')>0:
|
276 |
+
pass
|
277 |
+
elif segs[2]=='O':
|
278 |
+
pass
|
279 |
+
pre_result.append([sent_text.rstrip(),pre_entity])
|
280 |
+
|
281 |
+
|
282 |
+
#print(pre_result)
|
283 |
+
return pre_result
|
284 |
+
|
285 |
+
# keep only the prediction on the first subtoken (wordpiece) of each original word
|
286 |
+
def BERT_BIO_tag_entity_word(pre_BIO):
|
287 |
+
sentences=pre_BIO.strip().split('\n\n')
|
288 |
+
|
289 |
+
pre_result=[]
|
290 |
+
for sent in sentences:
|
291 |
+
tokens=sent.split('\n')
|
292 |
+
pre_entity=[]
|
293 |
+
pre_start,pre_end=0,0
|
294 |
+
sent_text=''
|
295 |
+
i=1
|
296 |
+
while i< len(tokens)-1:
|
297 |
+
# for i in range(1,len(tokens)-1):
|
298 |
+
segs=tokens[i].split('\t')
|
299 |
+
sent_text+=segs[0]+' '
|
300 |
+
# generate prediction entity
|
301 |
+
if segs[2].startswith('B-')>0:
|
302 |
+
pre_start=i
|
303 |
+
pre_type=segs[2][2:]
|
304 |
+
if i+1>=len(tokens)-1: # the last word
|
305 |
+
pre_end=i
|
306 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
307 |
+
else: # non last word
|
308 |
+
#pass a word
|
309 |
+
sub_segs=tokens[i+1].split('\t')
|
310 |
+
while(sub_segs[0].find('##')==0):
|
311 |
+
i+=1
|
312 |
+
sent_text+=sub_segs[0]+' '
|
313 |
+
sub_segs=tokens[i+1].split('\t')
|
314 |
+
|
315 |
+
|
316 |
+
next_seg=tokens[i+1].split('\t')
|
317 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
318 |
+
pre_end=i
|
319 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
320 |
+
elif next_seg[2].startswith('I-')>0:
|
321 |
+
pass
|
322 |
+
elif segs[2].startswith('I-')>0:
|
323 |
+
if i==1 and i+1<len(tokens)-1: # the first word and not only a word
|
324 |
+
pre_start=i
|
325 |
+
pre_type=segs[2][2:]
|
326 |
+
#pass a word
|
327 |
+
sub_segs=tokens[i+1].split('\t')
|
328 |
+
while(sub_segs[0].find('##')==0):
|
329 |
+
i+=1
|
330 |
+
sent_text+=sub_segs[0]+' '
|
331 |
+
sub_segs=tokens[i+1].split('\t')
|
332 |
+
|
333 |
+
next_seg=tokens[i+1].split('\t')
|
334 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
335 |
+
pre_end=i
|
336 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
337 |
+
elif next_seg[2].startswith('I-')>0:
|
338 |
+
pass
|
339 |
+
elif i==1 and i+1==len(tokens)-1:# only one word:
|
340 |
+
pre_start=i
|
341 |
+
pre_type=segs[2][2:]
|
342 |
+
pre_end=i
|
343 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
344 |
+
elif i+1>=len(tokens)-1: # the last word
|
345 |
+
last_seg=tokens[i-1].split('\t')
|
346 |
+
if last_seg[2]=='O':
|
347 |
+
pre_start=i
|
348 |
+
pre_type=segs[2][2:]
|
349 |
+
pre_end=i
|
350 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
351 |
+
elif i+1< len(tokens)-1: # non last word
|
352 |
+
|
353 |
+
last_seg=tokens[i-1].split('\t')
|
354 |
+
if last_seg[2]=='O':
|
355 |
+
pre_start=i
|
356 |
+
pre_type=segs[2][2:]
|
357 |
+
#pass a word
|
358 |
+
sub_segs=tokens[i+1].split('\t')
|
359 |
+
while(sub_segs[0].find('##')==0):
|
360 |
+
i+=1
|
361 |
+
sent_text+=sub_segs[0]+' '
|
362 |
+
sub_segs=tokens[i+1].split('\t')
|
363 |
+
next_seg=tokens[i+1].split('\t')
|
364 |
+
if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
|
365 |
+
pre_end=i
|
366 |
+
pre_entity.append([pre_start-1,pre_end-1,pre_type])
|
367 |
+
elif next_seg[2].startswith('I-')>0:
|
368 |
+
pass
|
369 |
+
elif segs[2]=='O':
|
370 |
+
pass
|
371 |
+
i+=1
|
372 |
+
pre_result.append([sent_text.rstrip(),pre_entity])
|
373 |
+
|
374 |
+
|
375 |
+
#print(pre_result)
|
376 |
+
return pre_result
|
377 |
+
|
378 |
+
|
379 |
+
def BERT_restore_index_fn(ori_text,file_pre):
|
380 |
+
|
381 |
+
# input_result=BERT_BIO_tag_entity_revised(file_pre)
|
382 |
+
input_result=BERT_BIO_tag_entity_word(file_pre)
|
383 |
+
#print(input_result)
|
384 |
+
|
385 |
+
|
386 |
+
new_sentence=''
|
387 |
+
restore_result=[]
|
388 |
+
|
389 |
+
sentence_ori=ori_text.lower()
|
390 |
+
|
391 |
+
for sent_ele in input_result:
|
392 |
+
|
393 |
+
#print(pre_lines)
|
394 |
+
# print(sentence_ori)
|
395 |
+
if len(sent_ele[1])>0:
|
396 |
+
#print(pre_lines)
|
397 |
+
sentence_pre=sent_ele[0].lower()
|
398 |
+
sentence_pre=sentence_pre.split()
|
399 |
+
|
400 |
+
pre_result=sent_ele[1]
|
401 |
+
|
402 |
+
|
403 |
+
restore_sid=0
|
404 |
+
restore_eid=0
|
405 |
+
each_word_id=[]
|
406 |
+
|
407 |
+
|
408 |
+
for i in range(0,len(sentence_pre)):
|
409 |
+
if sentence_pre[i][0:2]=="##":
|
410 |
+
sentence_pre[i]=sentence_pre[i][2:]
|
411 |
+
temp_id=sentence_ori.find(sentence_pre[i])
|
412 |
+
if temp_id<0:
|
413 |
+
#print('ori:',sentence_ori)
|
414 |
+
print('restore index error:',sentence_pre[i])
|
415 |
+
new_sentence+=sentence_ori[0:temp_id]
|
416 |
+
|
417 |
+
restore_sid=len(new_sentence)
|
418 |
+
restore_eid=len(new_sentence)+len(sentence_pre[i])
|
419 |
+
each_word_id.append([str(restore_sid),str(restore_eid)])
|
420 |
+
new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
|
421 |
+
sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
|
422 |
+
# print('each_word:',each_word_id)
|
423 |
+
for pre_ele in pre_result:
|
424 |
+
temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
|
425 |
+
if temp_pre_result not in restore_result:
|
426 |
+
restore_result.append(temp_pre_result)
|
427 |
+
else:
|
428 |
+
sentence_pre=sent_ele[0].lower()
|
429 |
+
sentence_pre=sentence_pre.split()
|
430 |
+
|
431 |
+
for i in range(0,len(sentence_pre)):
|
432 |
+
if sentence_pre[i][0:2]=="##":
|
433 |
+
sentence_pre[i]=sentence_pre[i][2:]
|
434 |
+
temp_id=sentence_ori.find(sentence_pre[i])
|
435 |
+
if temp_id<0:
|
436 |
+
print('restore index error:',sentence_pre[i])
|
437 |
+
new_sentence+=sentence_ori[0:temp_id]
|
438 |
+
new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
|
439 |
+
sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
|
440 |
+
#print('restore:',restore_result)
|
441 |
+
return restore_result
|
442 |
+
if __name__=='__main__':
|
443 |
+
path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/'
|
444 |
+
fin=open(path+'devout_test.txt','r',encoding='utf-8')
|
445 |
+
file_pre=fin.read()
|
446 |
+
ori_text="D90A-SOD1 mediated amyotrophic lateral sclerosis: a single founder for all cases with evidence for a Cis-acting disease modifier in the recessive haplotype. More than 100 different heterozygous mutations in copper/zinc superoxide dismutase (SOD1) have been found in patients with amyotrophic lateral sclerosis (ALS), a fatal neurodegenerative disease. Uniquely, D90A-SOD1 has been identified in recessive, dominant and apparently sporadic pedigrees. The phenotype of homozygotes is stereotyped with an extended survival, whereas that of affected heterozygotes varies. The frequency of D90A-SOD1 is 50 times higher in Scandinavia (2.5%) than elsewhere, though ALS prevalence is not raised there. Our earlier study indicated separate founders for recessive and dominant/sporadic ALS and we proposed a disease-modifying factor linked to the recessive mutation. Here we have doubled our sample set and employed novel markers to characterise the mutation's origin and localise any modifying factor. Linkage disequilibrium analysis indicates that D90A homozygotes and heterozygotes share a rare haplotype and are all descended from a single ancient founder (alpha 0.974) c.895 generations ago. Homozygotes arose subsequently only c.63 generations ago (alpha 0.878). Recombination has reduced the region shared by recessive kindreds to 97-265 kb around SOD1, excluding all neighbouring genes. We propose that a cis-acting regulatory polymorphism has arisen close to D90A-SOD1 in the recessive founder, which decreases ALS susceptibility in heterozygotes and slows disease progression."
|
447 |
+
NN_restore_index_fn(ori_text,file_pre)
|
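A minimal usage sketch for the restore step above — a hedged example, not part of the original file: the input strings are hypothetical, and it assumes NN_restore_index_fn from the listing is in scope. The BIO input is tab-separated token/gold/prediction lines, one blank-line-separated block per sentence.
# hypothetical one-sentence example with a single predicted Gene mention
ori_text = 'BRCA1 is a human gene.'
file_pre = 'BRCA1\tB-Gene\tB-Gene\nis\tO\tO\na\tO\tO\nhuman\tO\tO\ngene\tO\tO\n.\tO\tO'
spans = NN_restore_index_fn(ori_text, file_pre)
print(spans)  # [['0', '5', 'Gene']] -- character offsets into ori_text, returned as strings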
src_python/SpeAss/Evaluation_sa.py
CHANGED
@@ -1,396 +1,396 @@
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Mon Mar 1 15:33:54 2021
|
4 |
+
|
5 |
+
@author: luol2
|
6 |
+
"""
|
7 |
+
# compute metrics from an IO-format prediction file
|
8 |
+
# arg1 is ignored
|
9 |
+
def Rel_Evaluation(prefile):
|
10 |
+
fin=open(prefile,'r',encoding='utf-8')
|
11 |
+
all_in=fin.read().strip().split('\n\n')
|
12 |
+
fin.close()
|
13 |
+
TP=0 #gold=pre=pos
|
14 |
+
FP=0 #gold=neg, pre=pos
|
15 |
+
FN=0 #gold=pos, pre=Neg
|
16 |
+
for sentence in all_in:
|
17 |
+
tokens=sentence.split('\n')
|
18 |
+
entity_id=0
|
19 |
+
token_id=0
|
20 |
+
temp_gold='O'
|
21 |
+
temp_pre='O'
|
22 |
+
while (token_id<len(tokens)):
|
23 |
+
seg=tokens[token_id].split('\t')
|
24 |
+
if seg[0]=='<GENE>':
|
25 |
+
if seg[1]=='O':
|
26 |
+
temp_gold=seg[1]
|
27 |
+
else:
|
28 |
+
temp_gold=seg[1][2:]
|
29 |
+
if seg[2]=='O':
|
30 |
+
temp_pre=seg[2]
|
31 |
+
else:
|
32 |
+
temp_pre=seg[2][2:]
|
33 |
+
token_id+=1
|
34 |
+
seg=tokens[token_id].split('\t')
|
35 |
+
while seg[0]!='</GENE>':
|
36 |
+
token_id+=1
|
37 |
+
seg=tokens[token_id].split('\t')
|
38 |
+
if seg[1]!='O' and temp_gold=='O':
|
39 |
+
temp_gold=seg[1][2:]
|
40 |
+
if seg[2]!='O' and temp_pre=='O':
|
41 |
+
temp_pre=seg[2][2:]
|
42 |
+
if temp_pre!='O' and temp_gold!='O' and temp_pre==temp_gold:
|
43 |
+
TP+=1
|
44 |
+
elif temp_pre!='O' and temp_gold!='O' and temp_pre!=temp_gold:
|
45 |
+
FP+=1
|
46 |
+
FN+=1
|
47 |
+
elif temp_pre!='O' and temp_gold=='O' :
|
48 |
+
FP+=1
|
49 |
+
elif temp_pre=='O' and temp_gold!='O' :
|
50 |
+
FN+=1
|
51 |
+
temp_pre='O'
|
52 |
+
temp_gold='O'
|
53 |
+
|
54 |
+
else:
|
55 |
+
pass
|
56 |
+
token_id+=1
|
57 |
+
# print('TP,FP,FN:',TP,FP,FN)
|
58 |
+
if TP+FP==0:
|
59 |
+
P=0
|
60 |
+
else:
|
61 |
+
P=TP/(TP+FP)
|
62 |
+
if TP+FN==0:
|
63 |
+
R=0
|
64 |
+
else:
|
65 |
+
R=TP/(TP+FN)
|
66 |
+
if P+R==0:
|
67 |
+
F1=0
|
68 |
+
else:
|
69 |
+
F1=2*P*R/(P+R)
|
70 |
+
print('TP,FP,FN:',TP,FP,FN)
|
71 |
+
print('P,R,F1:',P,R,F1)
|
72 |
+
|
73 |
+
|
74 |
+
def Rel_Evaluation_fn(prefile):
|
75 |
+
fin=open(prefile,'r',encoding='utf-8')
|
76 |
+
all_in=fin.read().strip().split('\n\n')
|
77 |
+
fin.close()
|
78 |
+
TP=0 #gold=pre=pos
|
79 |
+
FP=0 #gold=neg, pre=pos
|
80 |
+
FN=0 #gold=pos, pre=Neg
|
81 |
+
for sentence in all_in:
|
82 |
+
tokens=sentence.split('\n')
|
83 |
+
entity_id=0
|
84 |
+
token_id=0
|
85 |
+
temp_gold='O'
|
86 |
+
temp_pre='O'
|
87 |
+
while (token_id<len(tokens)):
|
88 |
+
seg=tokens[token_id].split('\t')
|
89 |
+
if seg[0]=='<GENE>':
|
90 |
+
if seg[1]=='O':
|
91 |
+
temp_gold=seg[1]
|
92 |
+
else:
|
93 |
+
temp_gold=seg[1][2:]
|
94 |
+
if seg[2]=='O':
|
95 |
+
temp_pre=seg[2]
|
96 |
+
else:
|
97 |
+
temp_pre=seg[2][2:]
|
98 |
+
token_id+=1
|
99 |
+
seg=tokens[token_id].split('\t')
|
100 |
+
while seg[0]!='</GENE>':
|
101 |
+
token_id+=1
|
102 |
+
seg=tokens[token_id].split('\t')
|
103 |
+
if seg[1]!='O' and temp_gold=='O':
|
104 |
+
temp_gold=seg[1][2:]
|
105 |
+
if seg[2]!='O' and temp_pre=='O':
|
106 |
+
temp_pre=seg[2][2:]
|
107 |
+
if temp_pre!='O' and temp_gold!='O' and temp_pre==temp_gold:
|
108 |
+
TP+=1
|
109 |
+
elif temp_pre!='O' and temp_gold!='O' and temp_pre!=temp_gold:
|
110 |
+
FP+=1
|
111 |
+
elif temp_pre!='O' and temp_gold=='O' :
|
112 |
+
FP+=1
|
113 |
+
elif temp_pre=='O' and temp_gold!='O' :
|
114 |
+
FN+=1
|
115 |
+
temp_pre='O'
|
116 |
+
temp_gold='O'
|
117 |
+
|
118 |
+
else:
|
119 |
+
pass
|
120 |
+
token_id+=1
|
121 |
+
print('TP,FP,FN:',TP,FP,FN)
|
122 |
+
if TP+FP==0:
|
123 |
+
P=0
|
124 |
+
else:
|
125 |
+
P=TP/(TP+FP)
|
126 |
+
if TP+FN==0:
|
127 |
+
R=0
|
128 |
+
else:
|
129 |
+
R=TP/(TP+FN)
|
130 |
+
if P+R==0:
|
131 |
+
F1=0
|
132 |
+
else:
|
133 |
+
F1=2*P*R/(P+R)
|
134 |
+
# print('TP,FP,FN:',TP,FP,FN)
|
135 |
+
print('P,R,F1:',P,R,F1)
|
136 |
+
return F1
|
137 |
+
|
138 |
+
def Rel_Evaluation_Hugface_fn(prefile,ARG2_label='gene1s'):
|
139 |
+
fin=open(prefile,'r',encoding='utf-8')
|
140 |
+
all_in=fin.read().strip().split('\n\n')
|
141 |
+
fin.close()
|
142 |
+
TP=0 #gold=pre=pos
|
143 |
+
FP=0 #gold=neg, pre=pos
|
144 |
+
FN=0 #gold=pos, pre=Neg
|
145 |
+
result_dict={}#{'rel type':[TP,FP,FN],...,}
|
146 |
+
for sentence in all_in:
|
147 |
+
tokens=sentence.split('\n')
|
148 |
+
for token in tokens:
|
149 |
+
seg=token.split('\t')
|
150 |
+
if seg[0]==ARG2_label:
|
151 |
+
if seg[1].find('ARG2')>=0:
|
152 |
+
if seg[2]==seg[1]:
|
153 |
+
if seg[1] not in result_dict.keys():
|
154 |
+
result_dict[seg[1]]=[1,0,0]
|
155 |
+
else:
|
156 |
+
result_dict[seg[1]][0]+=1
|
157 |
+
TP+=1
|
158 |
+
elif seg[2].find('ARG2')>=0:
|
159 |
+
if seg[1] not in result_dict.keys():
|
160 |
+
result_dict[seg[1]]=[0,0,1]
|
161 |
+
else:
|
162 |
+
result_dict[seg[1]][2]+=1
|
163 |
+
if seg[2] not in result_dict.keys():
|
164 |
+
result_dict[seg[2]]=[0,1,0]
|
165 |
+
else:
|
166 |
+
result_dict[seg[2]][1]+=1
|
167 |
+
FP+=1
|
168 |
+
FN+=1
|
169 |
+
else:
|
170 |
+
if seg[1] not in result_dict.keys():
|
171 |
+
result_dict[seg[1]]=[0,0,1]
|
172 |
+
else:
|
173 |
+
result_dict[seg[1]][2]+=1
|
174 |
+
FN+=1
|
175 |
+
|
176 |
+
else:
|
177 |
+
if seg[2].find('ARG2')>=0:
|
178 |
+
if seg[2] not in result_dict.keys():
|
179 |
+
result_dict[seg[2]]=[0,1,0]
|
180 |
+
else:
|
181 |
+
result_dict[seg[2]][1]+=1
|
182 |
+
FP+=1
|
183 |
+
# print('TP,FP,FN:',TP,FP,FN)
|
184 |
+
rel_metrics={}
|
185 |
+
for rel_type in result_dict.keys():
|
186 |
+
if result_dict[rel_type][0]+result_dict[rel_type][1]==0:
|
187 |
+
p=0
|
188 |
+
else:
|
189 |
+
p=result_dict[rel_type][0]/(result_dict[rel_type][0]+result_dict[rel_type][1])
|
190 |
+
if result_dict[rel_type][0]+result_dict[rel_type][2]==0:
|
191 |
+
r=0
|
192 |
+
else:
|
193 |
+
r=result_dict[rel_type][0]/(result_dict[rel_type][0]+result_dict[rel_type][2])
|
194 |
+
if p+r==0:
|
195 |
+
f1=0
|
196 |
+
else:
|
197 |
+
f1=2*p*r/(p+r)
|
198 |
+
rel_metrics[rel_type]=[round(p,4),round(r,4),round(f1,4)]
|
199 |
+
if TP+FP==0:
|
200 |
+
P=0
|
201 |
+
else:
|
202 |
+
P=TP/(TP+FP)
|
203 |
+
if TP+FN==0:
|
204 |
+
R=0
|
205 |
+
else:
|
206 |
+
R=TP/(TP+FN)
|
207 |
+
if P+R==0:
|
208 |
+
F1=0
|
209 |
+
else:
|
210 |
+
F1=2*P*R/(P+R)
|
211 |
+
P=round(P,4)
|
212 |
+
R=round(R,4)
|
213 |
+
F1=round(F1,4)
|
214 |
+
print('metrics:\n',rel_metrics)
|
215 |
+
print('\nTP,FP,FN:',TP,FP,FN)
|
216 |
+
print('Overall P,R,F1:',P,R,F1)
|
217 |
+
return [P,R,F1],rel_metrics
|
218 |
+
|
219 |
+
def Rel_Evaluation_AIO_fn(prefile):
|
220 |
+
fin=open(prefile,'r',encoding='utf-8')
|
221 |
+
all_in=fin.read().strip().split('\n\n')
|
222 |
+
fin.close()
|
223 |
+
TP=0 #gold=pre=pos
|
224 |
+
FP=0 #gold=neg, pre=pos
|
225 |
+
FN=0 #gold=pos, pre=Neg
|
226 |
+
for sentence in all_in:
|
227 |
+
tokens=sentence.split('\n')
|
228 |
+
for token in tokens:
|
229 |
+
seg=token.split('\t')
|
230 |
+
if seg[0]=='<GENE>':
|
231 |
+
if seg[1].find('ARG2-')>=0:
|
232 |
+
if seg[2]==seg[1]:
|
233 |
+
TP+=1
|
234 |
+
elif seg[2].find('ARG2-')>=0:
|
235 |
+
FP+=1
|
236 |
+
FN+=1
|
237 |
+
else:
|
238 |
+
FN+=1
|
239 |
+
|
240 |
+
else:
|
241 |
+
if seg[2].find('ARG2-')>=0:
|
242 |
+
FP+=1
|
243 |
+
# print('TP,FP,FN:',TP,FP,FN)
|
244 |
+
if TP+FP==0:
|
245 |
+
P=0
|
246 |
+
else:
|
247 |
+
P=TP/(TP+FP)
|
248 |
+
if TP+FN==0:
|
249 |
+
R=0
|
250 |
+
else:
|
251 |
+
R=TP/(TP+FN)
|
252 |
+
if P+R==0:
|
253 |
+
F1=0
|
254 |
+
else:
|
255 |
+
F1=2*P*R/(P+R)
|
256 |
+
P=round(P,4)
|
257 |
+
R=round(R,4)
|
258 |
+
F1=round(F1,4)
|
259 |
+
print('TP,FP,FN:',TP,FP,FN)
|
260 |
+
print('P,R,F1:',P,R,F1)
|
261 |
+
return [P,R,F1]
|
262 |
+
|
263 |
+
def Rel_Evaluation_AIO_GC_fn(prefile):
|
264 |
+
fin=open(prefile,'r',encoding='utf-8')
|
265 |
+
all_in=fin.read().strip().split('\n\n')
|
266 |
+
fin.close()
|
267 |
+
TP=0 #gold=pre=pos
|
268 |
+
FP=0 #gold=neg, pre=pos
|
269 |
+
FN=0 #gold=pos, pre=Neg
|
270 |
+
for sentence in all_in:
|
271 |
+
tokens=sentence.split('\n')
|
272 |
+
for token in tokens:
|
273 |
+
seg=token.split('\t')
|
274 |
+
if seg[0]=='<CHEMICAL>':
|
275 |
+
if seg[1].find('ARG2-')>=0:
|
276 |
+
if seg[2]==seg[1]:
|
277 |
+
TP+=1
|
278 |
+
elif seg[2].find('ARG2-')>=0:
|
279 |
+
FP+=1
|
280 |
+
FN+=1
|
281 |
+
else:
|
282 |
+
FN+=1
|
283 |
+
|
284 |
+
else:
|
285 |
+
if seg[2].find('ARG2-')>=0:
|
286 |
+
FP+=1
|
287 |
+
# print('TP,FP,FN:',TP,FP,FN)
|
288 |
+
if TP+FP==0:
|
289 |
+
P=0
|
290 |
+
else:
|
291 |
+
P=TP/(TP+FP)
|
292 |
+
if TP+FN==0:
|
293 |
+
R=0
|
294 |
+
else:
|
295 |
+
R=TP/(TP+FN)
|
296 |
+
if P+R==0:
|
297 |
+
F1=0
|
298 |
+
else:
|
299 |
+
F1=2*P*R/(P+R)
|
300 |
+
P=round(P,4)
|
301 |
+
R=round(R,4)
|
302 |
+
F1=round(F1,4)
|
303 |
+
print('TP,FP,FN:',TP,FP,FN)
|
304 |
+
print('P,R,F1:',P,R,F1)
|
305 |
+
return [P,R,F1]
|
306 |
+
|
307 |
+
def office_evaluation(goldfile,prefile):
|
308 |
+
fin_gold=open(goldfile,'r',encoding='utf-8')
|
309 |
+
all_gold=fin_gold.read().strip().split('\n')
|
310 |
+
fin_gold.close()
|
311 |
+
fin_pre=open(prefile,'r',encoding='utf-8')
|
312 |
+
all_pre=fin_pre.read().strip().split('\n')
|
313 |
+
fin_pre.close()
|
314 |
+
|
315 |
+
gold_result={}#{'relation type':set(line)}
|
316 |
+
pre_result={}
|
317 |
+
all_result={} #{'relation type':[tp,fp,fn]}
|
318 |
+
for line in all_gold:
|
319 |
+
seg=line.split('\t')
|
320 |
+
if seg[1] not in all_result.keys():
|
321 |
+
all_result[seg[1]]=[0,0,0]
|
322 |
+
if seg[1] not in gold_result.keys():
|
323 |
+
gold_result[seg[1]]=set()
|
324 |
+
gold_result[seg[1]].add(line)
|
325 |
+
else:
|
326 |
+
gold_result[seg[1]].add(line)
|
327 |
+
|
328 |
+
for line in all_pre:
|
329 |
+
seg=line.split('\t')
|
330 |
+
if seg[1] not in pre_result.keys():
|
331 |
+
pre_result[seg[1]]=set()
|
332 |
+
pre_result[seg[1]].add(line)
|
333 |
+
else:
|
334 |
+
pre_result[seg[1]].add(line)
|
335 |
+
|
336 |
+
for rel_type in gold_result.keys():
|
337 |
+
for gold_ele in gold_result[rel_type]:
|
338 |
+
if rel_type not in pre_result.keys():
|
339 |
+
all_result[rel_type][2]+=1
|
340 |
+
else:
|
341 |
+
if gold_ele in pre_result[rel_type]:
|
342 |
+
all_result[rel_type][0]+=1
|
343 |
+
else:
|
344 |
+
all_result[rel_type][2]+=1
|
345 |
+
if rel_type in pre_result.keys():
|
346 |
+
for pre_ele in pre_result[rel_type]:
|
347 |
+
if pre_ele not in gold_result[rel_type]:
|
348 |
+
all_result[rel_type][1]+=1
|
349 |
+
ave_f=0
|
350 |
+
TP,FP,FN=0,0,0
|
351 |
+
print(all_result)
|
352 |
+
for rel_type in all_result.keys():
|
353 |
+
TP+=all_result[rel_type][0]
|
354 |
+
FP+=all_result[rel_type][1]
|
355 |
+
FN+=all_result[rel_type][2]
|
356 |
+
tem_p,tem_r,tem_f=0,0,0
|
357 |
+
if all_result[rel_type][0]+all_result[rel_type][1]==0:
|
358 |
+
tem_p=0
|
359 |
+
else:
|
360 |
+
tem_p=all_result[rel_type][0]/(all_result[rel_type][0]+all_result[rel_type][1])
|
361 |
+
if all_result[rel_type][0]+all_result[rel_type][2]==0:
|
362 |
+
tem_r=0
|
363 |
+
else:
|
364 |
+
tem_r=all_result[rel_type][0]/(all_result[rel_type][0]+all_result[rel_type][2])
|
365 |
+
if tem_p+tem_r==0:
|
366 |
+
tem_f=0
|
367 |
+
else:
|
368 |
+
tem_f=2*tem_p*tem_r/(tem_p+tem_r)
|
369 |
+
ave_f+=tem_f
|
370 |
+
print('%s:p=%.4f,r=%.4f,f=%.4f' % (rel_type,tem_p,tem_r,tem_f))
|
371 |
+
|
372 |
+
if TP+FP==0:
|
373 |
+
P=0
|
374 |
+
else:
|
375 |
+
P=TP/(TP+FP)
|
376 |
+
if TP+FN==0:
|
377 |
+
R=0
|
378 |
+
else:
|
379 |
+
R=TP/(TP+FN)
|
380 |
+
if P+R==0:
|
381 |
+
F1=0
|
382 |
+
else:
|
383 |
+
F1=2*P*R/(P+R)
|
384 |
+
|
385 |
+
|
386 |
+
print('Overall:')
|
387 |
+
print('ave_f1:',ave_f/len(all_result))
|
388 |
+
print('TP=%d, FP=%d, FN=%d'%(TP,FP,FN))
|
389 |
+
print('P=%.4f, R=%.4f, F1=%.4f'%(P,R,F1))
|
390 |
+
|
391 |
+
|
392 |
+
if __name__=='__main__':
|
393 |
+
path='//panfs/pan1/bionlplab/luol2/BC7DrugProt/results/'
|
394 |
+
office_evaluation(path+'dev/dev_gold_relations.tsv',path+'drugprot_dev_LSTM-CRF-ES_pre.tsv')
|
395 |
+
print('............')
|
396 |
+
Rel_Evaluation('//panfs/pan1/bionlplab/luol2/BC7DrugProt/check/dev_pre_temp.conll')
|
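For reference, a small worked sketch of the precision/recall/F1 arithmetic that all of the evaluators above share (made-up counts; the conditionals mirror the zero-division guards in the functions):
# made-up counts: 8 true positives, 2 false positives, 4 false negatives
TP, FP, FN = 8, 2, 4
P = TP / (TP + FP) if TP + FP > 0 else 0      # precision = 8/10 = 0.8
R = TP / (TP + FN) if TP + FN > 0 else 0      # recall = 8/12 ~= 0.6667
F1 = 2 * P * R / (P + R) if P + R > 0 else 0  # F1 ~= 0.7273
print(round(P, 4), round(R, 4), round(F1, 4))  # 0.8 0.6667 0.7273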
src_python/SpeAss/SA_Pubtator_Conll.py
CHANGED
@@ -1,494 +1,494 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
|
3 |
-
import sys
|
4 |
-
import io
|
5 |
-
import stanza
|
6 |
-
# nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',package='craft') #package='craft'
|
7 |
-
nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'},package='None') #package='craft'
|
8 |
-
REL_ENT={'arg1':'Species',
|
9 |
-
'arg2':'Gene'}
|
10 |
-
|
11 |
-
ENTITY_TAG={'arg1':['arg1s','arg1e'],
|
12 |
-
'arg2':['arg2s','arg2e'],
|
13 |
-
'gene':['gene1s','gene1e'],
|
14 |
-
'species':['species1s','species1e']
|
15 |
-
}
|
16 |
-
|
17 |
-
# sentence-split and tokenize, then remap entity offsets onto the tokenized text
|
18 |
-
def ssplit_token(infile):
|
19 |
-
fin=open(infile,'r',encoding='utf-8')
|
20 |
-
fout=io.StringIO()
|
21 |
-
all_in=fin.read().strip().split('\n\n')
|
22 |
-
fin.close()
|
23 |
-
for doc_text in all_in:
|
24 |
-
lines=doc_text.split('\n')
|
25 |
-
ori_text=lines[0].split('|t|')[1]+' '+lines[1].split('|a|')[1]
|
26 |
-
pmid=lines[0].split('|t|')[0]
|
27 |
-
# print(pmid)
|
28 |
-
entity_all=[] #[[seg0,seg1,...,],[]]
|
29 |
-
for i in range(2,len(lines)):
|
30 |
-
seg=lines[i].split('\t')
|
31 |
-
entity_all.append(seg)
|
32 |
-
|
33 |
-
#ssplit token
|
34 |
-
doc_stanza = nlp(ori_text)
|
35 |
-
token_text=''
|
36 |
-
for sent in doc_stanza.sentences:
|
37 |
-
for word in sent.words:
|
38 |
-
if word.text==' ':
|
39 |
-
pass
|
40 |
-
# print('token is blank!')
|
41 |
-
else:
|
42 |
-
token_text+=word.text+' '
|
43 |
-
#token_text=token_text+' ' #sentence split by four blank
|
44 |
-
|
45 |
-
# map each ori_text character index to its token_text index
|
46 |
-
index_map=[-1]*len(ori_text)
|
47 |
-
j=0
|
48 |
-
space_list=[' ',chr(160),chr(8201),chr(8194),chr(8197),chr(8202)] # several space variants: the ordinary space (32) plus no-break/thin/en/four-per-em/hair spaces (160, 8201, 8194, 8197, 8202)
|
49 |
-
for i in range(0,len(ori_text)):
|
50 |
-
if ori_text[i] in space_list:
|
51 |
-
pass
|
52 |
-
elif ori_text[i]==token_text[j]:
|
53 |
-
#if i>0 and i<285:
|
54 |
-
# print('=i,j:',i,j,ori_text[i-1:i+1],token_text[j-1:j+1])
|
55 |
-
index_map[i]=j
|
56 |
-
j+=1
|
57 |
-
else:
|
58 |
-
#if i==283:
|
59 |
-
# print('!i,j:',i,j,ori_text[i-1:i+1],token_text[j-1:j+1])
|
60 |
-
j+=1
|
61 |
-
temp_log=j
|
62 |
-
try:
|
63 |
-
while(ori_text[i]!=token_text[j]):
|
64 |
-
j+=1
|
65 |
-
except:
|
66 |
-
print('doc',doc_text)
|
67 |
-
print('token_text:',token_text)
|
68 |
-
print('error:',ori_text[i-10:i+10],'i:',ori_text[i],'j:',token_text[temp_log],',',token_text[temp_log-10:temp_log+10])
|
69 |
-
print(ord(ori_text[i]),ord(' '))
|
70 |
-
sys.exit()
|
71 |
-
index_map[i]=j
|
72 |
-
j+=1
|
73 |
-
# print(index_map)
|
74 |
-
# token_text=token_text.replace(' ','<EOS>')
|
75 |
-
# print(token_text)
|
76 |
-
fout.write(token_text+'\n')
|
77 |
-
for ele in entity_all:
|
78 |
-
if index_map[int(ele[1])]==-1:
|
79 |
-
new_ents=index_map[int(ele[1])+1]
|
80 |
-
else:
|
81 |
-
new_ents=index_map[int(ele[1])]
|
82 |
-
if index_map[int(ele[2])-1]==-1:
|
83 |
-
new_ente=index_map[int(ele[2])-1-1]+1
|
84 |
-
else:
|
85 |
-
new_ente=index_map[int(ele[2])-1]+1
|
86 |
-
new_ent=token_text[new_ents:new_ente]
|
87 |
-
if ele[4]=='Species' or ele[4]=='Gene':
|
88 |
-
fout.write(ele[0]+'\t'+str(new_ents)+'\t'+str(new_ente)+'\t'+new_ent+'\t'+ele[4]+'\t'+ele[5]+'\n')
|
89 |
-
else:
|
90 |
-
# print(ele[4])
|
91 |
-
fout.write(ele[0]+'\t'+str(new_ents)+'\t'+str(new_ente)+'\t'+new_ent+'\t'+'Gene'+'\t'+ele[5]+'\n')
|
92 |
-
fout.write('\n')
|
93 |
-
return fout.getvalue()
|
94 |
-
|
95 |
-
|
96 |
-
def corpus_noNest(token_input):
|
97 |
-
|
98 |
-
fin=io.StringIO(token_input)
|
99 |
-
fout=io.StringIO()
|
100 |
-
|
101 |
-
documents=fin.read().strip().split('\n\n')
|
102 |
-
fin.close()
|
103 |
-
total_entity=0
|
104 |
-
over_entity=0
|
105 |
-
nest_entity=0
|
106 |
-
for doc in documents:
|
107 |
-
lines=doc.split('\n')
|
108 |
-
context=lines[0]
|
109 |
-
entity_list=[]
|
110 |
-
if len(lines)>1:
|
111 |
-
doc_result={}
|
112 |
-
for i in range(1,len(lines)):
|
113 |
-
segs=lines[i].split('\t')
|
114 |
-
doc_result[lines[i]]=[int(segs[1]),int(segs[2])]
|
115 |
-
doc_result=sorted(doc_result.items(), key=lambda kv:(kv[1]), reverse=False)
|
116 |
-
doc_result_sort=[]
|
117 |
-
for ele in doc_result:
|
118 |
-
doc_result_sort.append(ele[0])
|
119 |
-
|
120 |
-
first_entity=doc_result_sort[0].split('\t')
|
121 |
-
nest_list=[first_entity]
|
122 |
-
max_eid=int(first_entity[2])
|
123 |
-
total_entity+=len(lines)-2
|
124 |
-
for i in range(1,len(doc_result_sort)):
|
125 |
-
segs=doc_result_sort[i].split('\t')
|
126 |
-
if int(segs[1])> max_eid:
|
127 |
-
if len(nest_list)==1:
|
128 |
-
entity_list.append(nest_list[0])
|
129 |
-
nest_list=[]
|
130 |
-
nest_list.append(segs)
|
131 |
-
if int(segs[2])>max_eid:
|
132 |
-
max_eid=int(segs[2])
|
133 |
-
else:
|
134 |
-
# print(nest_list)
|
135 |
-
nest_entity+=len(nest_list)-1
|
136 |
-
tem=find_max_entity(nest_list,context)#find max entity
|
137 |
-
# if len(tem)>1:
|
138 |
-
# print('max nest >1:',tem)
|
139 |
-
entity_list.extend(tem)
|
140 |
-
nest_list=[]
|
141 |
-
nest_list.append(segs)
|
142 |
-
if int(segs[2])>max_eid:
|
143 |
-
max_eid=int(segs[2])
|
144 |
-
|
145 |
-
else:
|
146 |
-
nest_list.append(segs)
|
147 |
-
over_entity+=1
|
148 |
-
if int(segs[2])>max_eid:
|
149 |
-
max_eid=int(segs[2])
|
150 |
-
if nest_list!=[]:
|
151 |
-
if len(nest_list)==1:
|
152 |
-
entity_list.append(nest_list[0])
|
153 |
-
|
154 |
-
else:
|
155 |
-
tem=find_max_entity(nest_list,context)#find max entity
|
156 |
-
# if len(tem)>1:
|
157 |
-
# print('max nest >1:',tem)
|
158 |
-
entity_list.extend(tem)
|
159 |
-
fout.write(context+'\n')
|
160 |
-
for ele in entity_list:
|
161 |
-
if ele[4]=='Gene':
|
162 |
-
temp_gene={}
|
163 |
-
gene_ids=ele[5].split(',')
|
164 |
-
for gene_id in gene_ids:
|
165 |
-
temp_id=gene_id[gene_id.find('Species:'):-1]
|
166 |
-
spe_id=temp_id[len('Species:'):]
|
167 |
-
temp_gene[temp_id]=int(spe_id)
|
168 |
-
temp_gene_sort=sorted(temp_gene.items(), key=lambda kv:(kv[1]), reverse=False)
|
169 |
-
final_gene_id=''
|
170 |
-
for temp_ele in temp_gene_sort:
|
171 |
-
final_gene_id+=temp_ele[0]+','
|
172 |
-
fout.write('\t'.join(ele[:-1])+'\t'+final_gene_id[:-1]+'\n')
|
173 |
-
else:
|
174 |
-
fout.write('\t'.join(ele)+'\n')
|
175 |
-
fout.write('\n')
|
176 |
-
# print(total_entity,over_entity, nest_entity)
|
177 |
-
return fout.getvalue()
|
178 |
-
|
179 |
-
def find_max_entity(nest_list,text):
|
180 |
-
max_len=0
|
181 |
-
final_tem=[]
|
182 |
-
max_index=0
|
183 |
-
for i in range(0, len(nest_list)):
|
184 |
-
if nest_list[i][4] =='Species':
|
185 |
-
final_tem.append(nest_list[i])
|
186 |
-
else:
|
187 |
-
cur_len=int(nest_list[i][2])-int(nest_list[i][1])
|
188 |
-
if cur_len>max_len:
|
189 |
-
max_len=cur_len
|
190 |
-
max_index=i
|
191 |
-
final_tem.append(nest_list[max_index])
|
192 |
-
return final_tem
|
193 |
-
|
194 |
-
|
195 |
-
def generate_seq_input(nonest_input,outfile):
|
196 |
-
|
197 |
-
fin=io.StringIO(nonest_input)
|
198 |
-
fout=open(outfile,'w',encoding='utf-8')
|
199 |
-
all_in=fin.read().strip().split('\n\n')
|
200 |
-
fin.close()
|
201 |
-
|
202 |
-
final_input=[]
|
203 |
-
|
204 |
-
for doc in all_in:
|
205 |
-
lines=doc.split('\n')
|
206 |
-
token_text=lines[0]
|
207 |
-
pmid=lines[1].split('\t')[0]
|
208 |
-
# print(pmid)
|
209 |
-
#read entity and relation
|
210 |
-
entity_arg1={} #only entity offset
|
211 |
-
entity_arg2={} #only entity offset
|
212 |
-
entity_all=[] #all entity infor
|
213 |
-
|
214 |
-
for i in range(1,len(lines)):
|
215 |
-
seg=lines[i].split('\t')
|
216 |
-
if seg[4]==REL_ENT['arg1']:
|
217 |
-
if seg[-1] in entity_arg1.keys():
|
218 |
-
entity_arg1[seg[-1]].append([seg[1],seg[2]])
|
219 |
-
else:
|
220 |
-
entity_arg1[seg[-1]]=[[seg[1],seg[2]]]
|
221 |
-
elif seg[4]==REL_ENT['arg2']:
|
222 |
-
temp_spes=seg[-1].split(',')
|
223 |
-
for ele in temp_spes:
|
224 |
-
gene_spe_id=ele
|
225 |
-
if gene_spe_id in entity_arg2.keys():
|
226 |
-
entity_arg2[gene_spe_id].append([seg[1],seg[2]])
|
227 |
-
else:
|
228 |
-
entity_arg2[gene_spe_id]=[[seg[1],seg[2]]]
|
229 |
-
|
230 |
-
entity_all.append(seg)
|
231 |
-
# print('\narg1:',entity_arg1)
|
232 |
-
# print('\narg2:',entity_arg2)
|
233 |
-
# print('\nall entity:',entity_all)
|
234 |
-
# for all arg1 to produce inst
|
235 |
-
for cur_ele in entity_arg1.keys():
|
236 |
-
|
237 |
-
#1. ner label text
|
238 |
-
#check cur_ele in relation?
|
239 |
-
# print(relation_all.keys())
|
240 |
-
if cur_ele in entity_arg2.keys(): #pos instance
|
241 |
-
rel_ent2=entity_arg2[cur_ele]
|
242 |
-
ner_text=''
|
243 |
-
text_sid=0
|
244 |
-
#print('nonest:',entity_nonest)
|
245 |
-
for ele_nonest in entity_all:
|
246 |
-
ent_id=[ele_nonest[1],ele_nonest[2]]
|
247 |
-
ent_sid=int(ele_nonest[1])
|
248 |
-
ent_eid=int(ele_nonest[2])
|
249 |
-
# print('sid,eid:',ent_sid,ent_eid)
|
250 |
-
ent_text=ele_nonest[3]
|
251 |
-
ent_type=ele_nonest[4]
|
252 |
-
if ent_sid>=text_sid:
|
253 |
-
if ent_id in entity_arg1[cur_ele]:
|
254 |
-
ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg1'][0]+' '+ent_text+ ' '+ENTITY_TAG['arg1'][1]+' '
|
255 |
-
else:
|
256 |
-
if ent_id in rel_ent2: #arg2 entity
|
257 |
-
if ent_type!=REL_ENT['arg2']:
|
258 |
-
pass
|
259 |
-
# print('arg2 is error! not ',REL_ENT['arg2'], ele_nonest)
|
260 |
-
ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg2'][0]+' '+ent_text+ ' '+ENTITY_TAG['arg2'][1]+' '
|
261 |
-
else:
|
262 |
-
ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG[ent_type.lower()][0]+' '+ent_text+ ' '+ENTITY_TAG[ent_type.lower()][1]+' '
|
263 |
-
text_sid=ent_eid
|
264 |
-
else:
|
265 |
-
pass
|
266 |
-
# print('ner entity error!!!',ele_nonest,text_sid)
|
267 |
-
ner_text+=token_text[text_sid:]
|
268 |
-
sen_tokens=ner_text.split()
|
269 |
-
# print('\nner_text:',ner_text)
|
270 |
-
|
271 |
-
#3 produce pos input
|
272 |
-
|
273 |
-
temp_input=[]
|
274 |
-
token_id=0
|
275 |
-
while token_id <len(sen_tokens):
|
276 |
-
if sen_tokens[token_id].find(ENTITY_TAG['arg1'][0])>=0:
|
277 |
-
temp_input.append(ENTITY_TAG['arg1'][0]+'\tO')
|
278 |
-
token_id+=1
|
279 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['arg1'][1]):
|
280 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
281 |
-
token_id+=1
|
282 |
-
temp_input.append(ENTITY_TAG['arg1'][1]+'\tO')
|
283 |
-
elif sen_tokens[token_id].find(ENTITY_TAG['arg2'][0])>=0:
|
284 |
-
temp_input.append(ENTITY_TAG[REL_ENT['arg2'].lower()][0]+'\tARG2')
|
285 |
-
token_id+=1
|
286 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['arg2'][1]):
|
287 |
-
temp_input.append(sen_tokens[token_id]+'\tARG2')
|
288 |
-
token_id+=1
|
289 |
-
temp_input.append(ENTITY_TAG[REL_ENT['arg2'].lower()][1]+'\tARG2')
|
290 |
-
elif sen_tokens[token_id].find(ENTITY_TAG['gene'][0])>=0:
|
291 |
-
temp_input.append(ENTITY_TAG['gene'][0]+'\tO')
|
292 |
-
token_id+=1
|
293 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['gene'][1]):
|
294 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
295 |
-
token_id+=1
|
296 |
-
temp_input.append(ENTITY_TAG['gene'][1]+'\tO')
|
297 |
-
elif sen_tokens[token_id].find(ENTITY_TAG['species'][0])>=0:
|
298 |
-
temp_input.append(ENTITY_TAG['species'][0]+'\tO')
|
299 |
-
token_id+=1
|
300 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['species'][1]):
|
301 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
302 |
-
token_id+=1
|
303 |
-
temp_input.append(ENTITY_TAG['species'][1]+'\tO')
|
304 |
-
else:
|
305 |
-
if sen_tokens[token_id]=='':
|
306 |
-
# print('token is none!error!')
|
307 |
-
pass
|
308 |
-
else:
|
309 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
310 |
-
token_id+=1
|
311 |
-
|
312 |
-
final_input.append('\n'.join(temp_input))
|
313 |
-
|
314 |
-
else: #neg instance
|
315 |
-
ner_text=''
|
316 |
-
text_sid=0
|
317 |
-
#print('nonest:',entity_nonest)
|
318 |
-
for ele_nonest in entity_all:
|
319 |
-
ent_id=[ele_nonest[1],ele_nonest[2]]
|
320 |
-
ent_sid=int(ele_nonest[1])
|
321 |
-
ent_eid=int(ele_nonest[2])
|
322 |
-
# print('sid,eid:',ent_sid,ent_eid)
|
323 |
-
ent_text=ele_nonest[3]
|
324 |
-
ent_type=ele_nonest[4]
|
325 |
-
if ent_sid>=text_sid:
|
326 |
-
if ent_id in entity_arg1[cur_ele]:
|
327 |
-
ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg1'][0]+' '+ent_text+ ' '+ENTITY_TAG['arg1'][1]+' '
|
328 |
-
else:
|
329 |
-
ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG[ent_type.lower()][0]+' '+ent_text+ ' '+ENTITY_TAG[ent_type.lower()][1]+' '
|
330 |
-
text_sid=ent_eid
|
331 |
-
else:
|
332 |
-
pass
|
333 |
-
# print('ner entity error!!!')
|
334 |
-
ner_text+=token_text[text_sid:]
|
335 |
-
sen_tokens=ner_text.split()
|
336 |
-
# print('\nner_text:',ner_text)
|
337 |
-
# print('ner_Text')
|
338 |
-
#3 produce NEG input
|
339 |
-
|
340 |
-
temp_input=[]
|
341 |
-
token_id=0
|
342 |
-
while token_id <len(sen_tokens):
|
343 |
-
if sen_tokens[token_id].find(ENTITY_TAG['arg1'][0])>=0:
|
344 |
-
temp_input.append(ENTITY_TAG['arg1'][0]+'\tO')
|
345 |
-
token_id+=1
|
346 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['arg1'][1]):
|
347 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
348 |
-
token_id+=1
|
349 |
-
temp_input.append(ENTITY_TAG['arg1'][1]+'\tO')
|
350 |
-
elif sen_tokens[token_id].find(ENTITY_TAG['gene'][0])>=0:
|
351 |
-
temp_input.append(ENTITY_TAG['gene'][0]+'\tO')
|
352 |
-
token_id+=1
|
353 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['gene'][1]):
|
354 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
355 |
-
token_id+=1
|
356 |
-
temp_input.append(ENTITY_TAG['gene'][1]+'\tO')
|
357 |
-
elif sen_tokens[token_id].find(ENTITY_TAG['species'][0])>=0:
|
358 |
-
temp_input.append(ENTITY_TAG['species'][0]+'\tO')
|
359 |
-
token_id+=1
|
360 |
-
while(sen_tokens[token_id]!=ENTITY_TAG['species'][1]):
|
361 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
362 |
-
token_id+=1
|
363 |
-
temp_input.append(ENTITY_TAG['species'][1]+'\tO')
|
364 |
-
else:
|
365 |
-
if sen_tokens[token_id]=='':
|
366 |
-
print('empty token! error!')
|
367 |
-
else:
|
368 |
-
temp_input.append(sen_tokens[token_id]+'\tO')
|
369 |
-
token_id+=1
|
370 |
-
|
371 |
-
final_input.append('\n'.join(temp_input))
|
372 |
-
# print(entity_nonest)
|
373 |
-
# sys.exit()
|
374 |
-
fout.write('\n\n'.join(final_input))
|
375 |
-
fout.write('\n')
|
376 |
-
fout.close()
|
377 |
-
|
378 |
-
def check_entity_pos(line,relations):
|
379 |
-
|
380 |
-
seg=line.split(' ')
|
381 |
-
stack_ent=[]
|
382 |
-
# print(seg)
|
383 |
-
entity_num={'arg1':0,'arg2':0, 'gene':0,'chemical':0}
|
384 |
-
|
385 |
-
temp_arg2=[]
|
386 |
-
for i in range(0,len(seg)):
|
387 |
-
if seg[i].find(ENTITY_TAG['gene'][0])>=0:
|
388 |
-
entity_num['gene']+=1
|
389 |
-
stack_ent.append(seg[i])
|
390 |
-
elif seg[i].find(ENTITY_TAG['chemical'][0])>=0:
|
391 |
-
entity_num['chemical']+=1
|
392 |
-
stack_ent.append(seg[i])
|
393 |
-
# print(stack_ent)
|
394 |
-
elif seg[i].find(ENTITY_TAG['arg1'][0])>=0:
|
395 |
-
entity_num['arg1']+=1
|
396 |
-
stack_ent.append(seg[i])
|
397 |
-
elif seg[i].find(ENTITY_TAG['arg2'][0])>=0:
|
398 |
-
entity_num['arg2']+=1
|
399 |
-
temp_arg2.append(seg[i].split('|')[0])
|
400 |
-
stack_ent.append(seg[i])
|
401 |
-
elif seg[i].find(ENTITY_TAG['arg1'][1])>=0 or seg[i].find(ENTITY_TAG['arg2'][1])>=0 or seg[i].find(ENTITY_TAG['gene'][1])>=0 or seg[i].find(ENTITY_TAG['chemical'][1])>=0:
|
402 |
-
stack_ent.pop()
|
403 |
-
if stack_ent!=[]:
|
404 |
-
# print('entity no match!',stack_ent)
|
405 |
-
return(-1,seg,entity_num)
|
406 |
-
|
407 |
-
else:
|
408 |
-
if entity_num['arg1']!=0:
|
409 |
-
for arg2_id in relations.keys():
|
410 |
-
if arg2_id not in temp_arg2:
|
411 |
-
# print('\ntemp_arg2:',temp_arg2)
|
412 |
-
# print('\narg2_id:',arg2_id)
|
413 |
-
return(0,seg,entity_num) #some arg2 not in sentence
|
414 |
-
if entity_num['arg2']!=0 and entity_num['arg1']==0:
|
415 |
-
return(0,seg,entity_num) #only arg2, but no arg1
|
416 |
-
return(1,seg,entity_num)
|
417 |
-
|
418 |
-
def check_entity_neg(line):
|
419 |
-
|
420 |
-
seg=line.split(' ')
|
421 |
-
stack_ent=[]
|
422 |
-
# print(seg)
|
423 |
-
entity_num={'arg1':0,'gene':0,'chemical':0}
|
424 |
-
for i in range(0,len(seg)):
|
425 |
-
if seg[i].find(ENTITY_TAG['gene'][0])>=0:
|
426 |
-
entity_num['gene']+=1
|
427 |
-
stack_ent.append(seg[i])
|
428 |
-
elif seg[i].find(ENTITY_TAG['chemical'][0])>=0:
|
429 |
-
entity_num['chemical']+=1
|
430 |
-
stack_ent.append(seg[i])
|
431 |
-
# print(stack_ent)
|
432 |
-
elif seg[i].find(ENTITY_TAG['arg1'][0])>=0:
|
433 |
-
entity_num['arg1']+=1
|
434 |
-
stack_ent.append(seg[i])
|
435 |
-
elif seg[i].find(ENTITY_TAG['arg1'][1])>=0 or seg[i].find(ENTITY_TAG['gene'][1])>=0 or seg[i].find(ENTITY_TAG['chemical'][1])>=0:
|
436 |
-
stack_ent.pop()
|
437 |
-
if stack_ent!=[]:
|
438 |
-
# print('entity no match!',stack_ent)
|
439 |
-
return(-1,seg,entity_num)
|
440 |
-
|
441 |
-
else:
|
442 |
-
return(1,seg,entity_num)
|
443 |
-
|
444 |
-
def get_one_entity(nest_list,cur_ent,rel_entity2_id):
|
445 |
-
max_len=0
|
446 |
-
max_entity=[]
|
447 |
-
final_entity=[]
|
448 |
-
for i in range(0, len(nest_list)):
|
449 |
-
if nest_list[i][1]==cur_ent:#current entity
|
450 |
-
final_entity=[]
|
451 |
-
max_entity=nest_list[i]
|
452 |
-
final_entity.append(nest_list[i])
|
453 |
-
return(final_entity)
|
454 |
-
if nest_list[i][1] in rel_entity2_id: #invole rel
|
455 |
-
final_entity.append(nest_list[i])
|
456 |
-
continue
|
457 |
-
length=int(nest_list[i][4])-int(nest_list[i][3])
|
458 |
-
if max_entity==[]: #first entity
|
459 |
-
max_len=length
|
460 |
-
max_entity=nest_list[i]
|
461 |
-
else:
|
462 |
-
if length>max_len:
|
463 |
-
if max_entity[2]==REL_ENT['arg1']:
|
464 |
-
max_len=length
|
465 |
-
max_entity=nest_list[i]
|
466 |
-
else:
|
467 |
-
if nest_list[i][2]==REL_ENT['arg2'] and max_entity[1] not in rel_entity2_id:
|
468 |
-
max_len=length
|
469 |
-
max_entity=nest_list[i]
|
470 |
-
|
471 |
-
else:
|
472 |
-
if nest_list[i][1] in rel_entity2_id:
|
473 |
-
max_len=length
|
474 |
-
max_entity=nest_list[i]
|
475 |
-
elif max_entity[2]==REL_ENT['arg1'] and nest_list[i][2]==REL_ENT['arg2']:
|
476 |
-
max_len=length
|
477 |
-
max_entity=nest_list[i]
|
478 |
-
if final_entity==[]:
|
479 |
-
final_entity.append(max_entity)
|
480 |
-
return final_entity
|
481 |
-
|
482 |
-
if __name__=='__main__':
|
483 |
-
|
484 |
-
infile='../../TrainingSet/No505/SA.Train.txt'
|
485 |
-
outfile='../../TrainingSet/No505/SA.Train.conll'
|
486 |
-
|
487 |
-
#tokenizer
|
488 |
-
token_input=ssplit_token(infile)
|
489 |
-
|
490 |
-
#filter nest entity
|
491 |
-
nonest_input=corpus_noNest(token_input)
|
492 |
-
|
493 |
-
# to conll
|
494 |
generate_seq_input(nonest_input,outfile)
|
|
|
1 |
+
# -*- coding: utf-8 -*-

import sys
import io
import stanza
# nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',package='craft') #package='craft'
nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'},package='None') #package='craft'
REL_ENT={'arg1':'Species',
         'arg2':'Gene'}

ENTITY_TAG={'arg1':['arg1s','arg1e'],
            'arg2':['arg2s','arg2e'],
            'gene':['gene1s','gene1e'],
            'species':['species1s','species1e']
            }

# sentence-split, tokenize, and revise the entity indexes accordingly
def ssplit_token(infile):
    fin=open(infile,'r',encoding='utf-8')
    fout=io.StringIO()
    all_in=fin.read().strip().split('\n\n')
    fin.close()
    for doc_text in all_in:
        lines=doc_text.split('\n')
        ori_text=lines[0].split('|t|')[1]+' '+lines[1].split('|a|')[1]
        pmid=lines[0].split('|t|')[0]
        # print(pmid)
        entity_all=[] #[[seg0,seg1,...,],[]]
        for i in range(2,len(lines)):
            seg=lines[i].split('\t')
            entity_all.append(seg)

        #ssplit token
        doc_stanza = nlp(ori_text)
        token_text=''
        for sent in doc_stanza.sentences:
            for word in sent.words:
                if word.text==' ':
                    pass
                    # print('token is blank!')
                else:
                    token_text+=word.text+' '
            #token_text=token_text+'    ' #sentences split by four blanks

        #map original indexes to token indexes
        index_map=[-1]*len(ori_text)
        j=0
        space_list=[' ',chr(160),chr(8201),chr(8194),chr(8197),chr(8202)] #several space-like characters: the ordinary space (32), then 160, 8201, 8194, 8197, 8202
        for i in range(0,len(ori_text)):
            if ori_text[i] in space_list:
                pass
            elif ori_text[i]==token_text[j]:
                #if i>0 and i<285:
                #    print('=i,j:',i,j,ori_text[i-1:i+1],token_text[j-1:j+1])
                index_map[i]=j
                j+=1
            else:
                #if i==283:
                #    print('!i,j:',i,j,ori_text[i-1:i+1],token_text[j-1:j+1])
                j+=1
                temp_log=j
                try:
                    while(ori_text[i]!=token_text[j]):
                        j+=1
                except:
                    print('doc',doc_text)
                    print('token_text:',token_text)
                    print('error:',ori_text[i-10:i+10],'i:',ori_text[i],'j:',token_text[temp_log],',',token_text[temp_log-10:temp_log+10])
                    print(ord(ori_text[i]),ord(' '))
                    sys.exit()
                index_map[i]=j
                j+=1
        # print(index_map)
        # token_text=token_text.replace('    ','<EOS>')
        # print(token_text)
        fout.write(token_text+'\n')
        for ele in entity_all:
            if index_map[int(ele[1])]==-1:
                new_ents=index_map[int(ele[1])+1]
            else:
                new_ents=index_map[int(ele[1])]
            if index_map[int(ele[2])-1]==-1:
                new_ente=index_map[int(ele[2])-1-1]+1
            else:
                new_ente=index_map[int(ele[2])-1]+1
            new_ent=token_text[new_ents:new_ente]
            if ele[4]=='Species' or ele[4]=='Gene':
                fout.write(ele[0]+'\t'+str(new_ents)+'\t'+str(new_ente)+'\t'+new_ent+'\t'+ele[4]+'\t'+ele[5]+'\n')
            else:
                # print(ele[4])
                fout.write(ele[0]+'\t'+str(new_ents)+'\t'+str(new_ente)+'\t'+new_ent+'\t'+'Gene'+'\t'+ele[5]+'\n')
        fout.write('\n')
    return fout.getvalue()


def corpus_noNest(token_input):

    fin=io.StringIO(token_input)
    fout=io.StringIO()

    documents=fin.read().strip().split('\n\n')
    fin.close()
    total_entity=0
    over_entity=0
    nest_entity=0
    for doc in documents:
        lines=doc.split('\n')
        context=lines[0]
        entity_list=[]
        if len(lines)>1:
            doc_result={}
            for i in range(1,len(lines)):
                segs=lines[i].split('\t')
                doc_result[lines[i]]=[int(segs[1]),int(segs[2])]
            doc_result=sorted(doc_result.items(), key=lambda kv:(kv[1]), reverse=False)
            doc_result_sort=[]
            for ele in doc_result:
                doc_result_sort.append(ele[0])

            first_entity=doc_result_sort[0].split('\t')
            nest_list=[first_entity]
            max_eid=int(first_entity[2])
            total_entity+=len(lines)-2
            for i in range(1,len(doc_result_sort)):
                segs=doc_result_sort[i].split('\t')
                if int(segs[1])> max_eid:
                    if len(nest_list)==1:
                        entity_list.append(nest_list[0])
                        nest_list=[]
                        nest_list.append(segs)
                        if int(segs[2])>max_eid:
                            max_eid=int(segs[2])
                    else:
                        # print(nest_list)
                        nest_entity+=len(nest_list)-1
                        tem=find_max_entity(nest_list,context)#find max entity
                        # if len(tem)>1:
                        #     print('max nest >1:',tem)
                        entity_list.extend(tem)
                        nest_list=[]
                        nest_list.append(segs)
                        if int(segs[2])>max_eid:
                            max_eid=int(segs[2])

                else:
                    nest_list.append(segs)
                    over_entity+=1
                    if int(segs[2])>max_eid:
                        max_eid=int(segs[2])
            if nest_list!=[]:
                if len(nest_list)==1:
                    entity_list.append(nest_list[0])

                else:
                    tem=find_max_entity(nest_list,context)#find max entity
                    # if len(tem)>1:
                    #     print('max nest >1:',tem)
                    entity_list.extend(tem)
        fout.write(context+'\n')
        for ele in entity_list:
            if ele[4]=='Gene':
                temp_gene={}
                gene_ids=ele[5].split(',')
                for gene_id in gene_ids:
                    temp_id=gene_id[gene_id.find('Species:'):-1]
                    spe_id=temp_id[len('Species:'):]
                    temp_gene[temp_id]=int(spe_id)
                temp_gene_sort=sorted(temp_gene.items(), key=lambda kv:(kv[1]), reverse=False)
                final_gene_id=''
                for temp_ele in temp_gene_sort:
                    final_gene_id+=temp_ele[0]+','
                fout.write('\t'.join(ele[:-1])+'\t'+final_gene_id[:-1]+'\n')
            else:
                fout.write('\t'.join(ele)+'\n')
        fout.write('\n')
    # print(total_entity,over_entity, nest_entity)
    return fout.getvalue()

def find_max_entity(nest_list,text):
    max_len=0
    final_tem=[]
    max_index=0
    for i in range(0, len(nest_list)):
        if nest_list[i][4] =='Species':
            final_tem.append(nest_list[i])
        else:
            cur_len=int(nest_list[i][2])-int(nest_list[i][1])
            if cur_len>max_len:
                max_len=cur_len
                max_index=i
    final_tem.append(nest_list[max_index])
    return final_tem


def generate_seq_input(nonest_input,outfile):

    fin=io.StringIO(nonest_input)
    fout=open(outfile,'w',encoding='utf-8')
    all_in=fin.read().strip().split('\n\n')
    fin.close()

    final_input=[]

    for doc in all_in:
        lines=doc.split('\n')
        token_text=lines[0]
        pmid=lines[1].split('\t')[0]
        # print(pmid)
        #read entities and relations
        entity_arg1={} #only entity offsets
        entity_arg2={} #only entity offsets
        entity_all=[] #all entity info

        for i in range(1,len(lines)):
            seg=lines[i].split('\t')
            if seg[4]==REL_ENT['arg1']:
                if seg[-1] in entity_arg1.keys():
                    entity_arg1[seg[-1]].append([seg[1],seg[2]])
                else:
                    entity_arg1[seg[-1]]=[[seg[1],seg[2]]]
            elif seg[4]==REL_ENT['arg2']:
                temp_spes=seg[-1].split(',')
                for ele in temp_spes:
                    gene_spe_id=ele
                    if gene_spe_id in entity_arg2.keys():
                        entity_arg2[gene_spe_id].append([seg[1],seg[2]])
                    else:
                        entity_arg2[gene_spe_id]=[[seg[1],seg[2]]]

            entity_all.append(seg)
        # print('\narg1:',entity_arg1)
        # print('\narg2:',entity_arg2)
        # print('\nall entity:',entity_all)
        # produce one instance for every arg1
        for cur_ele in entity_arg1.keys():

            #1. ner label text
            #check whether cur_ele is in a relation
            # print(relation_all.keys())
            if cur_ele in entity_arg2.keys(): #pos instance
                rel_ent2=entity_arg2[cur_ele]
                ner_text=''
                text_sid=0
                #print('nonest:',entity_nonest)
                for ele_nonest in entity_all:
                    ent_id=[ele_nonest[1],ele_nonest[2]]
                    ent_sid=int(ele_nonest[1])
                    ent_eid=int(ele_nonest[2])
                    # print('sid,eid:',ent_sid,ent_eid)
                    ent_text=ele_nonest[3]
                    ent_type=ele_nonest[4]
                    if ent_sid>=text_sid:
                        if ent_id in entity_arg1[cur_ele]:
                            ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg1'][0]+' '+ent_text+ ' '+ENTITY_TAG['arg1'][1]+' '
                        else:
                            if ent_id in rel_ent2: #arg2 entity
                                if ent_type!=REL_ENT['arg2']:
                                    pass
                                    # print('arg2 is error! not ',REL_ENT['arg2'], ele_nonest)
                                ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg2'][0]+' '+ent_text+ ' '+ENTITY_TAG['arg2'][1]+' '
                            else:
                                ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG[ent_type.lower()][0]+' '+ent_text+ ' '+ENTITY_TAG[ent_type.lower()][1]+' '
                        text_sid=ent_eid
                    else:
                        pass
                        # print('ner entity error!!!',ele_nonest,text_sid)
                ner_text+=token_text[text_sid:]
                sen_tokens=ner_text.split()
                # print('\nner_text:',ner_text)

                #3 produce pos input

                temp_input=[]
                token_id=0
                while token_id <len(sen_tokens):
                    if sen_tokens[token_id].find(ENTITY_TAG['arg1'][0])>=0:
                        temp_input.append(ENTITY_TAG['arg1'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['arg1'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['arg1'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['arg2'][0])>=0:
                        temp_input.append(ENTITY_TAG[REL_ENT['arg2'].lower()][0]+'\tARG2')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['arg2'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tARG2')
                            token_id+=1
                        temp_input.append(ENTITY_TAG[REL_ENT['arg2'].lower()][1]+'\tARG2')
                    elif sen_tokens[token_id].find(ENTITY_TAG['gene'][0])>=0:
                        temp_input.append(ENTITY_TAG['gene'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['gene'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['gene'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['species'][0])>=0:
                        temp_input.append(ENTITY_TAG['species'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['species'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['species'][1]+'\tO')
                    else:
                        if sen_tokens[token_id]=='':
                            # print('token is none!error!')
                            pass
                        else:
                            temp_input.append(sen_tokens[token_id]+'\tO')
                    token_id+=1

                final_input.append('\n'.join(temp_input))

            else: #neg instance
                ner_text=''
                text_sid=0
                #print('nonest:',entity_nonest)
                for ele_nonest in entity_all:
                    ent_id=[ele_nonest[1],ele_nonest[2]]
                    ent_sid=int(ele_nonest[1])
                    ent_eid=int(ele_nonest[2])
                    # print('sid,eid:',ent_sid,ent_eid)
                    ent_text=ele_nonest[3]
                    ent_type=ele_nonest[4]
                    if ent_sid>=text_sid:
                        if ent_id in entity_arg1[cur_ele]:
                            ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg1'][0]+' '+ent_text+ ' '+ENTITY_TAG['arg1'][1]+' '
                        else:
                            ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG[ent_type.lower()][0]+' '+ent_text+ ' '+ENTITY_TAG[ent_type.lower()][1]+' '
                        text_sid=ent_eid
                    else:
                        pass
                        # print('ner entity error!!!')
                ner_text+=token_text[text_sid:]
                sen_tokens=ner_text.split()
                # print('\nner_text:',ner_text)
                # print('ner_Text')
                #3 produce NEG input

                temp_input=[]
                token_id=0
                while token_id <len(sen_tokens):
                    if sen_tokens[token_id].find(ENTITY_TAG['arg1'][0])>=0:
                        temp_input.append(ENTITY_TAG['arg1'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['arg1'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['arg1'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['gene'][0])>=0:
                        temp_input.append(ENTITY_TAG['gene'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['gene'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['gene'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['species'][0])>=0:
                        temp_input.append(ENTITY_TAG['species'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['species'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['species'][1]+'\tO')
                    else:
                        if sen_tokens[token_id]=='':
                            print('token is none!error!')
                        else:
                            temp_input.append(sen_tokens[token_id]+'\tO')
                    token_id+=1

                final_input.append('\n'.join(temp_input))
        # print(entity_nonest)
        # sys.exit()
    fout.write('\n\n'.join(final_input))
    fout.write('\n')
    fout.close()

def check_entity_pos(line,relations):

    seg=line.split(' ')
    stack_ent=[]
    # print(seg)
    entity_num={'arg1':0,'arg2':0, 'gene':0,'chemical':0}

    temp_arg2=[]
    for i in range(0,len(seg)):
        if seg[i].find(ENTITY_TAG['gene'][0])>=0:
            entity_num['gene']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['chemical'][0])>=0:
            entity_num['chemical']+=1
            stack_ent.append(seg[i])
            # print(stack_ent)
        elif seg[i].find(ENTITY_TAG['arg1'][0])>=0:
            entity_num['arg1']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['arg2'][0])>=0:
            entity_num['arg2']+=1
            temp_arg2.append(seg[i].split('|')[0])
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['arg1'][1])>=0 or seg[i].find(ENTITY_TAG['arg2'][1])>=0 or seg[i].find(ENTITY_TAG['gene'][1])>=0 or seg[i].find(ENTITY_TAG['chemical'][1])>=0:
            stack_ent.pop()
    if stack_ent!=[]:
        # print('entity no match!',stack_ent)
        return(-1,seg,entity_num)

    else:
        if entity_num['arg1']!=0:
            for arg2_id in relations.keys():
                if arg2_id not in temp_arg2:
                    # print('\ntemp_arg2:',temp_arg2)
                    # print('\narg2_id:',arg2_id)
                    return(0,seg,entity_num) #some arg2 not in sentence
        if entity_num['arg2']!=0 and entity_num['arg1']==0:
            return(0,seg,entity_num) #only arg2, but no arg1
        return(1,seg,entity_num)

def check_entity_neg(line):

    seg=line.split(' ')
    stack_ent=[]
    # print(seg)
    entity_num={'arg1':0,'gene':0,'chemical':0}
    for i in range(0,len(seg)):
        if seg[i].find(ENTITY_TAG['gene'][0])>=0:
            entity_num['gene']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['chemical'][0])>=0:
            entity_num['chemical']+=1
            stack_ent.append(seg[i])
            # print(stack_ent)
        elif seg[i].find(ENTITY_TAG['arg1'][0])>=0:
            entity_num['arg1']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['arg1'][1])>=0 or seg[i].find(ENTITY_TAG['gene'][1])>=0 or seg[i].find(ENTITY_TAG['chemical'][1])>=0:
            stack_ent.pop()
    if stack_ent!=[]:
        # print('entity no match!',stack_ent)
        return(-1,seg,entity_num)

    else:
        return(1,seg,entity_num)

def get_one_entity(nest_list,cur_ent,rel_entity2_id):
    max_len=0
    max_entity=[]
    final_entity=[]
    for i in range(0, len(nest_list)):
        if nest_list[i][1]==cur_ent:#current entity
            final_entity=[]
            max_entity=nest_list[i]
            final_entity.append(nest_list[i])
            return(final_entity)
        if nest_list[i][1] in rel_entity2_id: #involved in a relation
            final_entity.append(nest_list[i])
            continue
        length=int(nest_list[i][4])-int(nest_list[i][3])
        if max_entity==[]: #first entity
            max_len=length
            max_entity=nest_list[i]
        else:
            if length>max_len:
                if max_entity[2]==REL_ENT['arg1']:
                    max_len=length
                    max_entity=nest_list[i]
                else:
                    if nest_list[i][2]==REL_ENT['arg2'] and max_entity[1] not in rel_entity2_id:
                        max_len=length
                        max_entity=nest_list[i]

            else:
                if nest_list[i][1] in rel_entity2_id:
                    max_len=length
                    max_entity=nest_list[i]
                elif max_entity[2]==REL_ENT['arg1'] and nest_list[i][2]==REL_ENT['arg2']:
                    max_len=length
                    max_entity=nest_list[i]
    if final_entity==[]:
        final_entity.append(max_entity)
    return final_entity

if __name__=='__main__':

    infile='../../TrainingSet/No505/SA.Train.txt'
    outfile='../../TrainingSet/No505/SA.Train.conll'

    #tokenize
    token_input=ssplit_token(infile)

    #filter nested entities
    nonest_input=corpus_noNest(token_input)

    # to conll
    generate_seq_input(nonest_input,outfile)
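For reference, the conversion script above expects PubTator-formatted input: a pmid|t|title line, a pmid|a|abstract line, and one tab-separated annotation row per entity (PMID, start offset, end offset, mention, type, concept ID), with documents separated by a blank line. A minimal hand-made sketch of one document follows; the PMID, offsets, and IDs are invented for illustration, and the exact concept-ID conventions of the actual training corpus are not shown here. The rows are tab-separated (shown with spaces for readability), and offsets count over the concatenated title + ' ' + abstract string, which is what ssplit_token re-maps after tokenization.

10000000|t|Cloning of mouse Abc1.
10000000|a|We cloned Abc1 from Mus musculus.
10000000  17  21  Abc1          Gene     10090
10000000  33  37  Abc1          Gene     10090
10000000  43  55  Mus musculus  Species  10090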
src_python/SpeAss/ml_tagging_score_sa.py
CHANGED
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 7 09:29:46 2022

@author: luol2

machine learning tagging

"""


import time
import io

from src_python.SpeAss.processing_data_sa import ml_intext_fn,out_BIO_BERT_softmax_score_fn
import tensorflow as tf
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
if len(gpu) > 0:
    tf.config.experimental.set_memory_growth(gpu[0], True)
#tf.compat.v1.disable_eager_execution()

REL_ENT={'arg1':'Species',
         'arg2':'Gene'}

entity_tag={'arg1':['arg1s','arg1e'],
            'gene':['gene1s','gene1e'],
            'species':['species1s','species1e']
            }

def input_preprocess_notoken(doc_text):
    final_input=[]
    final_id=[]

    lines=doc_text.split('\n')
    token_text=lines[0]
    pmid=lines[1].split('\t')[0]
    entity_arg1={} #{species_id:[[spe_sid1,spe_eid1],[...]]}
    entity_all=[]
    for i in range(1,len(lines)):
        seg=lines[i].split('\t')
        if seg[6]==REL_ENT['arg1']:
            if seg[-1] in entity_arg1.keys():
                entity_arg1[seg[-1]].append([seg[3],seg[4]])
            else:
                entity_arg1[seg[-1]]=[[seg[3],seg[4]]]
        entity_all.append(seg)

    #print(token_text)
    #print(entity_chemical)
    #generate input instances
    for cur_ele in entity_arg1:

        #2. ner label text
        ner_text=''
        text_sid=0
        #print('nonest:',entity_nonest)
        for ele_nonest in entity_all:
            ent_id=[ele_nonest[3],ele_nonest[4]]
            ent_spe_id=ele_nonest[-1]
            ent_sid=int(ele_nonest[3])
            ent_eid=int(ele_nonest[4])
            # print('sid,eid:',ent_sid,ent_eid)
            ent_text=ele_nonest[5]
            ent_type=ele_nonest[6]
            if ent_sid>=text_sid:
                # if token_text[ent_sid:ent_eid]!=ent_text:
                #     print('error!index_text,entext:',token_text[ent_sid:ent_eid],ent_text)
                if ent_id in entity_arg1[cur_ele]: #is species
                    ner_text+=token_text[text_sid:ent_sid]+' '+ent_spe_id+'|'+entity_tag['arg1'][0]+' '+ent_text+' '+entity_tag['arg1'][1]+' '
                else:
                    ner_text+=token_text[text_sid:ent_sid]+' '+str(ent_sid)+'-'+str(ent_eid)+'|'+entity_tag[ent_type.lower()][0]+' '+ent_text+' '+entity_tag[ent_type.lower()][1]+' '
                text_sid=ent_eid
        ner_text+=token_text[text_sid:]
        sen_tokens=ner_text.split()
        #print('\nner_text:',ner_text)

        #3. produce input
        temp_input=[]
        temp_id={'species':'','gene':[]}
        for sen_token in sen_tokens:
            if sen_token.find(entity_tag['arg1'][0])>=0:
                en_id=sen_token.split('|')[0]
                temp_id['species']=en_id
                temp_input.append(entity_tag['arg1'][0]+'\tO')
            elif sen_token.find(entity_tag['gene'][0])>=0:
                en_id=sen_token.split('|')[0]
                temp_id['gene'].append(en_id)
                temp_input.append(entity_tag['gene'][0]+'\tO')
            elif sen_token.find(entity_tag['species'][0])>=0:
                en_id=sen_token.split('|')[0]
                # temp_id.append(en_id)
                temp_input.append(entity_tag['species'][0]+'\tO')
            else:
                if sen_token=='':
                    # print('token is none!error!')
                    pass
                else:
                    temp_input.append(sen_token+'\tO')
        final_input.append('\n'.join(temp_input))
        final_id.append(temp_id)

    # print(entity_nonest)
    return final_input,final_id,entity_all,pmid


def ml_tagging(ml_input,nn_model):

    test_set,test_label = ml_intext_fn(ml_input)
    test_x,test_y, test_bert_text_label=nn_model.rep.load_data_hugface(test_set,test_label,word_max_len=nn_model.maxlen,label_type='softmax')
    test_pre = nn_model.model.predict(test_x)
    ml_out=out_BIO_BERT_softmax_score_fn(test_pre,test_bert_text_label,nn_model.rep.index_2_label)
    return ml_out

def output_rel(ml_output,entity_map,pmid):
    fin=io.StringIO(ml_output)
    alltexts=fin.read().strip().split('\n\n')
    fin.close()
    final_out={} #{'sid-eid':[species id]}
    for sen_id,sentence in enumerate(alltexts):
        tokens=sentence.split('\n')
        gene_entity_id=0
        token_id=0
        arg1=''
        arg2_list=[] #[[ID, score],[id,score]]
        while (token_id<len(tokens)):
            seg=tokens[token_id].split('\t')
            if seg[0]==entity_tag['arg1'][0]:
                arg1=entity_map[sen_id]['species']
                token_id+=1
                if token_id >=len(tokens):
                    break
                seg=tokens[token_id].split('\t')
                while seg[0]!=entity_tag['arg1'][1]:
                    token_id+=1
                    if token_id >=len(tokens):
                        break
                    seg=tokens[token_id].split('\t')
            elif seg[0]==entity_tag[REL_ENT['arg2'].lower()][0]:
                temp_rel=seg[-2]
                temp_score=seg[-1]
                arg2_id=entity_map[sen_id]['gene'][gene_entity_id]
                gene_entity_id+=1
                token_id+=1
                if token_id >=len(tokens):
                    break
                seg=tokens[token_id].split('\t')
                while seg[0]!=entity_tag[REL_ENT['arg2'].lower()][1]:
                    token_id+=1
                    if token_id >=len(tokens):
                        break
                    seg=tokens[token_id].split('\t')
                    if seg[-2].find('ARG2')>=0 and temp_rel.find('ARG2')<0:
                        temp_rel=seg[-2]
                        temp_score=seg[-1]
                if temp_rel.find('ARG2')>=0:
                    arg2_list.append([arg2_id,temp_score])
            elif seg[0]==entity_tag[REL_ENT['arg1'].lower()][0]:
                token_id+=1
                if token_id >=len(tokens):
                    break
                seg=tokens[token_id].split('\t')
                while seg[0]!=entity_tag[REL_ENT['arg1'].lower()][1]:
                    token_id+=1
                    if token_id >=len(tokens):
                        break
                    seg=tokens[token_id].split('\t')

            else:
                pass
            token_id+=1
        #print(arg1,arg2_list)
        if arg2_list!=[] and arg1!='':
            for arg2_ele in arg2_list:
                if arg2_ele[0] not in final_out.keys():
                    final_out[arg2_ele[0]]=[arg1+'|'+arg2_ele[1]]
                else:
                    final_out[arg2_ele[0]].append(arg1+'|'+arg2_ele[1])
    return(final_out)

def NER_Tag(doc_in,nn_model):

    #1. preprocess input; input_text: CoNLL-format blocks, input_entity: the corresponding entity list
    #print(doc_in)
    input_text,entity_index,entity_all,pmid=input_preprocess_notoken(doc_in)
    # print('pmid:',pmid)
    # print('\entity_index:',entity_index)


    #2. ml tagging
    if input_text!=[]:
        ml_pre=ml_tagging(input_text,nn_model)
        #print('\noutput:')
        #print(ml_pre)

        #3.generate output
        final_output=output_rel(ml_pre,entity_index,pmid)
    else:
        final_output={}
    return final_output,entity_all
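A minimal sketch of how NER_Tag above is presumably driven (the model paths, weight file, and the toy doc_in block are illustrative assumptions, not values taken from this repository): doc_in is one document block whose first line is the tokenized text and whose remaining lines are tab-separated entity rows, of which this module reads column 0 (PMID), columns 3-4 (offsets), column 5 (mention), column 6 (type), and the last column (species ID).

# Hypothetical driver; paths, file names, and doc_in are assumptions.
from src_python.SpeAss.model_sa import HUGFACE_NER
from src_python.SpeAss.ml_tagging_score_sa import NER_Tag

model_files = {'checkpoint_path': 'gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract',
               'labelfile': 'label.vocab',    # hypothetical label-vocabulary file
               'lowercase': True}
nn_model = HUGFACE_NER(model_files)
nn_model.build_encoder()
nn_model.build_softmax_decoder()
nn_model.load_model('speass_weights.h5')      # hypothetical trained weights

# '-' columns stand in for fields input_preprocess_notoken does not read.
doc_in = ('Mouse Abc1 binds DNA in Mus musculus .\n'
          '12345\t-\t-\t6\t10\tAbc1\tGene\t10090\n'
          '12345\t-\t-\t24\t36\tMus musculus\tSpecies\t10090')

final_output, entity_all = NER_Tag(doc_in, nn_model)
print(final_output)   # e.g. {'6-10': ['10090|0.9876']}: gene offsets -> 'species_id|score'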
src_python/SpeAss/model_sa.py
CHANGED
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 10 09:08:09 2021

@author: luol2

Model Architecture

"""
import tensorflow as tf
from src_python.SpeAss.represent_sa import Hugface_RepresentationLayer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad,Nadam
from transformers import TFAutoModel
import numpy as np
import sys


class LRSchedule_LINEAR(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self,
        init_lr=5e-5,
        init_warmup_lr=0.0,
        final_lr=5e-7,
        warmup_steps=0,
        decay_steps=0,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.init_warmup_lr=init_warmup_lr
        self.final_lr = final_lr
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps

    def __call__(self, step):
        """ linear warm up - linear decay """
        if self.warmup_steps>0:
            warmup_lr = (self.init_lr - self.init_warmup_lr)/self.warmup_steps * step+self.init_warmup_lr
        else:
            warmup_lr=1000.0
        #print('\n.......warmup_lr:',warmup_lr)
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.init_lr - (step - self.warmup_steps)/self.decay_steps*(self.init_lr - self.final_lr)
        )
        #print('\n.....decay_lr:',decay_lr)
        return tf.math.minimum(warmup_lr,decay_lr)



class HUGFACE_NER(): #huggingface transformers
    def __init__(self, model_files):
        self.model_type='HUGFACE'
        self.maxlen = 512
        self.checkpoint_path = model_files['checkpoint_path']
        self.label_file=model_files['labelfile']
        self.lowercase=model_files['lowercase']
        self.rep = Hugface_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)


    def build_encoder(self):
        print('...vocab len:',self.rep.vocab_len)
        plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
        plm_model.resize_token_embeddings(self.rep.vocab_len)
        x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
        x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
        x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
        x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
        #dense = TimeDistributed(Dense(512, activation='relu'), name='dense1')(x)
        self.encoder = Model (inputs=[x1_in,x2_in,x3_in], outputs=x,name='hugface_encoder')
        self.encoder.summary()

    def build_softmax_decoder(self):

        x1_in = Input(shape=(self.maxlen,),dtype=tf.int32)
        x2_in = Input(shape=(self.maxlen,),dtype=tf.int32)
        x3_in = Input(shape=(self.maxlen,),dtype=tf.int32)
        features = self.encoder([x1_in,x2_in,x3_in])
        #features = Dropout(0.4)(features)
        #features = TimeDistributed(Dense(128, activation='relu'), name='dense2')(features)
        features= Dropout(0.1)(features)
        output = TimeDistributed(Dense(self.rep.label_table_size, activation='softmax'), name='softmax')(features)
        self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=output, name="hugface_softmax")

        # lr_schedule=LRSchedule_LINEAR(
        #     init_lr=1e-5,
        #     init_warmup_lr=1e-7,
        #     final_lr=1e-6,
        #     warmup_steps=0,
        #     decay_steps=40000)

        opt = Adam(learning_rate = 5e-6)
        self.model.compile(
            optimizer=opt,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )
        self.model.summary()


    def load_model(self,model_file):
        self.model.load_weights(model_file)
        self.model.summary()
        print('load HUGFACE model done!')
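A quick numerical check of the LRSchedule_LINEAR class above; note that it is currently unused (the schedule construction is commented out in build_softmax_decoder, which instead compiles with a fixed Adam learning rate of 5e-6). The hyper-parameters below are invented for illustration, and the sketch assumes model_sa's definitions are in scope.

# Warm up linearly to init_lr over warmup_steps, then decay linearly to final_lr.
sched = LRSchedule_LINEAR(init_lr=1e-5, init_warmup_lr=1e-7, final_lr=1e-6,
                          warmup_steps=100, decay_steps=1000)
for step in [0, 50, 100, 500, 2000]:
    # rises to 1e-5 at step 100, then decays and clamps at 1e-6
    print(step, float(sched(step)))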
src_python/SpeAss/processing_data_sa.py
CHANGED
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 10 16:34:12 2020

@author: luol2
"""
import numpy as np
import io
import sys
#read ner text (word\tlabel), generate the list [[[w1,label],[w2,label]]]
def ml_intext(file):
    fin=open(file,'r',encoding='utf-8')
    alltexts=fin.read().strip().split('\n\n')
    fin.close()
    data_list=[]
    label_list=[]

    for sents in alltexts:
        lines=sents.split('\n')
        temp_sentece=[]
        for i in range(0,len(lines)):
            seg=lines[i].split('\t')
            temp_sentece.append(seg[:])
            label_list.append(seg[-1])

        data_list.append(temp_sentece)
    #print(data_list)
    #print(label_list)
    return data_list,label_list

def ml_intext_fn(alltexts):
    # fin=io.StringIO(ml_input)
    # alltexts=fin.read().strip().split('\n\n')
    # fin.close()
    data_list=[]
    label_list=[]

    for sents in alltexts:
        lines=sents.split('\n')
        temp_sentece=[]
        for i in range(0,len(lines)):
            seg=lines[i].split('\t')
            temp_sentece.append(seg[:])
            label_list.append(seg[-1])

        data_list.append(temp_sentece)
    #print(data_list)
    #print(label_list)
    return data_list,label_list

# model prediction results to conll evaluation format [token answer predict]
def out_BIO(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = raw_pre[i][j]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()

def out_BIO_softmax(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    #print(raw_pre[0:2])
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = np.argmax(raw_pre[i][j])
                #print(label_id)
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()

def out_BIO_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = raw_pre[i][j]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()
def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                # label_id = raw_pre[i][j]
                label_id = np.argmax(raw_pre[i][j])
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()
def out_BIO_BERT(file,raw_pre,raw_input,label_set):
    fout=open(file,'w',encoding='utf-8')
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = raw_pre[i][j]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    fout.close()
def out_BIO_BERT_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                label_id = raw_pre[i][j]
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()
def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                #label_id = raw_pre[i][j]
                label_id = np.argmax(raw_pre[i][j])
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
        fout.write('\n')
    return fout.getvalue()
def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
    fout=io.StringIO()
    for i in range(len(raw_input)):

        for j in range(len(raw_input[i])):
            if j<len(raw_pre[i]):
                #label_id = raw_pre[i][j]
                label_id = np.argmax(raw_pre[i][j])
                label_score = round(raw_pre[i][j][label_id],4)
                label_tag = label_set[str(label_id)]
            else:
                label_tag='O'
                label_score = 0.0
            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
        fout.write('\n')
    return fout.getvalue()
#generate the char vocab
def char_vocab(infile,outfile_char):
    fin=open(infile,'r',encoding='utf-8')
    #fout=open(outfile,'w',encoding='utf-8')
    fout_char=open(outfile_char,'w',encoding='utf-8')
    char_vocab=['oov_char']
    max_len=0
    for line in fin:
        if line.strip()!='':
            seg=line.split('\t')
            word_len=len(seg[0])
            #if word_len<1000:
            #    fout.write(line)
            if word_len>max_len:
                max_len=word_len
                print(seg[0])
            for i in range(word_len):
                if seg[0][i] not in char_vocab:
                    char_vocab.append(seg[0][i])
        #else:
        #    fout.write(line)
    fin.close()
    #fout.close()
    for ele in char_vocab:
        fout_char.write(ele+'\n')
    fout_char.close()
    print('max_len:',max_len)


if __name__=='__main__':
    # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
    # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
    # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
    # #processing_text(file)
    # char_vocab(infile,outfile_char)
    a=[1,2,3]
    print(a[:-1])
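A small round-trip sketch of the two helpers the tagging pipeline actually imports from this module (ml_intext_fn and out_BIO_BERT_softmax_score_fn); the toy blocks below are invented. ml_intext_fn parses a list of token\tlabel blocks into per-sentence token lists plus a flat label list, and the score writer emits token\tgold\tpredicted\tscore rows, one blank-line-separated block per sentence.

# Toy input: two sentence blocks, one "token\tlabel" pair per line.
blocks = ['Mouse\tO\nABC1\tARG2', 'arg1s\tO\nhuman\tO']
data_list, label_list = ml_intext_fn(blocks)
print(data_list)    # [[['Mouse', 'O'], ['ABC1', 'ARG2']], [['arg1s', 'O'], ['human', 'O']]]
print(label_list)   # ['O', 'ARG2', 'O', 'O']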
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Tue Mar 10 16:34:12 2020
|
4 |
+
|
5 |
+
@author: luol2
|
6 |
+
"""
|
7 |
+
import numpy as np
|
8 |
+
import io
|
9 |
+
import sys
|
10 |
+
#read ner text (word\tlabel), generate the list[[[w1,label],[w2,label]]]
|
11 |
+
def ml_intext(file):
|
12 |
+
fin=open(file,'r',encoding='utf-8')
|
13 |
+
alltexts=fin.read().strip().split('\n\n')
|
14 |
+
fin.close()
|
15 |
+
data_list=[]
|
16 |
+
label_list=[]
|
17 |
+
|
18 |
+
for sents in alltexts:
|
19 |
+
lines=sents.split('\n')
|
20 |
+
temp_sentece=[]
|
21 |
+
for i in range(0,len(lines)):
|
22 |
+
seg=lines[i].split('\t')
|
23 |
+
temp_sentece.append(seg[:])
|
24 |
+
label_list.append(seg[-1])
|
25 |
+
|
26 |
+
data_list.append(temp_sentece)
|
27 |
+
#print(data_list)
|
28 |
+
#print(label_list)
|
29 |
+
return data_list,label_list
|
30 |
+
|
31 |
+
def ml_intext_fn(alltexts):
|
32 |
+
# fin=io.StringIO(ml_input)
|
33 |
+
# alltexts=fin.read().strip().split('\n\n')
|
34 |
+
# fin.close()
|
35 |
+
data_list=[]
|
36 |
+
label_list=[]
|
37 |
+
|
38 |
+
for sents in alltexts:
|
39 |
+
lines=sents.split('\n')
|
40 |
+
temp_sentece=[]
|
41 |
+
for i in range(0,len(lines)):
|
42 |
+
seg=lines[i].split('\t')
|
43 |
+
temp_sentece.append(seg[:])
|
44 |
+
label_list.append(seg[-1])
|
45 |
+
|
46 |
+
data_list.append(temp_sentece)
|
47 |
+
#print(data_list)
|
48 |
+
#print(label_list)
|
49 |
+
return data_list,label_list
|
50 |
+
|
51 |
+
# model predict result to conll evalute format [token answer predict]
|
52 |
+
def out_BIO(file,raw_pre,raw_input,label_set):
|
53 |
+
fout=open(file,'w',encoding='utf-8')
|
54 |
+
for i in range(len(raw_input)):
|
55 |
+
|
56 |
+
for j in range(len(raw_input[i])):
|
57 |
+
if j<len(raw_pre[i]):
|
58 |
+
label_id = raw_pre[i][j]
|
59 |
+
label_tag = label_set[str(label_id)]
|
60 |
+
else:
|
61 |
+
label_tag='O'
|
62 |
+
fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
|
63 |
+
fout.write('\n')
|
64 |
+
fout.close()
|
65 |
+
|
66 |
+
def out_BIO_softmax(file,raw_pre,raw_input,label_set):
|
67 |
+
fout=open(file,'w',encoding='utf-8')
|
68 |
+
#print(raw_pre[0:2])
|
69 |
+
for i in range(len(raw_input)):
|
70 |
+
|
71 |
+
for j in range(len(raw_input[i])):
|
72 |
+
if j<len(raw_pre[i]):
|
73 |
+
label_id = np.argmax(raw_pre[i][j])
|
74 |
+
#print(label_id)
|
75 |
+
label_tag = label_set[str(label_id)]
|
76 |
+
else:
|
77 |
+
label_tag='O'
|
78 |
+
fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
|
79 |
+
fout.write('\n')
|
80 |
+
fout.close()
|
81 |
+
|
82 |
+
def out_BIO_fn(raw_pre,raw_input,label_set):
|
83 |
+
fout=io.StringIO()
|
84 |
+
for i in range(len(raw_input)):
|
85 |
+
|
86 |
+
for j in range(len(raw_input[i])):
|
87 |
+
if j<len(raw_pre[i]):
|
88 |
+
label_id = raw_pre[i][j]
|
89 |
+
label_tag = label_set[str(label_id)]
|
90 |
+
else:
|
91 |
+
label_tag='O'
|
92 |
+
fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
|
93 |
+
fout.write('\n')
|
94 |
+
return fout.getvalue()
|
+def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
+    # BERT-pipeline version of out_BIO_softmax: argmax over per-token softmax vectors.
+    fout=open(file,'w',encoding='utf-8')
+    for i in range(len(raw_input)):
+        for j in range(len(raw_input[i])):
+            if j<len(raw_pre[i]):
+                label_id = np.argmax(raw_pre[i][j])
+                label_tag = label_set[str(label_id)]
+            else:
+                label_tag='O'
+            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
+        fout.write('\n')
+    fout.close()
+
+def out_BIO_BERT(file,raw_pre,raw_input,label_set):
+    # BERT-pipeline version of out_BIO: raw_pre already holds label ids.
+    fout=open(file,'w',encoding='utf-8')
+    for i in range(len(raw_input)):
+        for j in range(len(raw_input[i])):
+            if j<len(raw_pre[i]):
+                label_id = raw_pre[i][j]
+                label_tag = label_set[str(label_id)]
+            else:
+                label_tag='O'
+            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
+        fout.write('\n')
+    fout.close()
+
+def out_BIO_BERT_fn(raw_pre,raw_input,label_set):
+    # String-returning version of out_BIO_BERT.
+    fout=io.StringIO()
+    for i in range(len(raw_input)):
+        for j in range(len(raw_input[i])):
+            if j<len(raw_pre[i]):
+                label_id = raw_pre[i][j]
+                label_tag = label_set[str(label_id)]
+            else:
+                label_tag='O'
+            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
+        fout.write('\n')
+    return fout.getvalue()
+
+def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
+    # String-returning version of out_BIO_BERT_softmax.
+    fout=io.StringIO()
+    for i in range(len(raw_input)):
+        for j in range(len(raw_input[i])):
+            if j<len(raw_pre[i]):
+                label_id = np.argmax(raw_pre[i][j])
+                label_tag = label_set[str(label_id)]
+            else:
+                label_tag='O'
+            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
+        fout.write('\n')
+    return fout.getvalue()
+
+def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
+    # As out_BIO_BERT_softmax_fn, plus a fourth column: the winning softmax
+    # probability, rounded to 4 digits, as a per-token confidence score.
+    fout=io.StringIO()
+    for i in range(len(raw_input)):
+        for j in range(len(raw_input[i])):
+            if j<len(raw_pre[i]):
+                label_id = np.argmax(raw_pre[i][j])
+                label_score = round(raw_pre[i][j][label_id],4)
+                label_tag = label_set[str(label_id)]
+            else:
+                label_tag='O'
+                label_score = 0.0
+            fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
+        fout.write('\n')
+    return fout.getvalue()
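The score variant expects one probability vector per token; a sketch with hypothetical values:

    label_set = {'0': 'O', '1': 'B-GENE'}
    raw_input = [[['BRCA1', 'B-GENE']]]
    raw_pre   = [[[0.1, 0.9]]]   # softmax over the two labels
    print(out_BIO_BERT_softmax_score_fn(raw_pre, raw_input, label_set))
    # BRCA1\tB-GENE\tB-GENE\t0.9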
+
+# Build a character vocabulary from the token column of a BIO file.
+# 'oov_char' is reserved at index 0; also reports the longest token length.
+def char_vocab(infile,outfile_char):
+    fin=open(infile,'r',encoding='utf-8')
+    fout_char=open(outfile_char,'w',encoding='utf-8')
+    char_vocab=['oov_char']
+    max_len=0
+    for line in fin:
+        if line.strip()!='':
+            seg=line.split('\t')
+            word_len=len(seg[0])
+            if word_len>max_len:
+                max_len=word_len
+                print(seg[0])  # show each new longest token
+            for i in range(word_len):
+                if seg[0][i] not in char_vocab:
+                    char_vocab.append(seg[0][i])
+    fin.close()
+    for ele in char_vocab:
+        fout_char.write(ele+'\n')
+    fout_char.close()
+    print('max_len:',max_len)
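Typical usage, with hypothetical paths (the input uses the same token\tlabel format as above):

    char_vocab('train.ner_BIO', 'vocab/char_vocab')
    # vocab/char_vocab then holds 'oov_char' followed by one character per line;
    # the longest token length is printed as max_len.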
+
+
+if __name__=='__main__':
+    # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
+    # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
+    # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
+    # #processing_text(file)
+    # char_vocab(infile,outfile_char)
+    a=[1,2,3]       # quick sanity check of list slicing
+    print(a[:-1])   # -> [1, 2]