steventango committed
Commit d5062c8
Parent: fbf0d37

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
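For context: the commit message above is the default message written by the huggingface_hub Python client when a whole directory is pushed to the Hub in one commit. A minimal sketch of how such a commit is typically produced is below; the local path and repo id are hypothetical placeholders, not taken from this page.

from huggingface_hub import HfApi

api = HfApi()  # picks up the access token from `huggingface-cli login` or HF_TOKEN

# Push every file under a local directory to the Hub in a single commit.
# Omitting commit_message yields the client's default,
# "Upload folder using huggingface_hub".
api.upload_folder(
    folder_path="./GNormPlus",        # hypothetical local checkout
    repo_id="steventango/GNormPlus",  # hypothetical repo id
    repo_type="model",
)

One plausible reason the per-file diffs below remove and re-add visually identical lines is a change the rendered view does not show, such as line-ending or encoding normalization during re-upload.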
Files changed (50)
  1. CRF/java/.am +27 -27
  2. CRF/java/org/chasen/crfpp/Model.java +51 -51
  3. CRF/perl/Makefile.old +931 -931
  4. CRF/ruby/Makefile +157 -157
  5. CRF/winmain.h +69 -69
  6. GeneNER_SpeAss_run.py +745 -745
  7. Library/Ab3P.C +110 -110
  8. Library/Ab3P.h +83 -83
  9. Library/AbbrStra.C +1426 -1426
  10. Library/AbbrStra.h +332 -332
  11. Library/AbbrvE.C +629 -629
  12. Library/AbbrvE.h +93 -93
  13. Library/Btree.C +1304 -1304
  14. Library/Btree.h +547 -547
  15. Library/FBase.C +600 -600
  16. Library/FBase.h +248 -248
  17. Library/Hash.C +733 -733
  18. Library/Hash.h +92 -92
  19. Library/MPtok.C +2036 -2036
  20. Library/MPtok.h +141 -141
  21. Library/Makefile +13 -13
  22. Library/WordData/Ab3P_prec.dat +144 -144
  23. Library/WordData/Lf1chSf +0 -0
  24. Library/WordData/stop +313 -313
  25. Library/runn.C +216 -216
  26. Library/runn.h +392 -392
  27. gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab1.txt +0 -0
  28. gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab_ori.txt +0 -0
  29. requirements-py310.txt +6 -6
  30. requirements.txt +76 -76
  31. run_batches.py +62 -48
  32. src_Java/GNormPluslib/BioCDoc.java +1343 -1343
  33. src_Java/GNormPluslib/GN.java +1083 -1083
  34. src_Java/GNormPluslib/GNR.java +0 -0
  35. src_Java/GNormPluslib/GNormPlus.java +696 -696
  36. src_Java/GNormPluslib/PrefixTree.java +892 -892
  37. src_Java/GNormPluslib/SR.java +1043 -1043
  38. src_Java/GNormPluslib/SimConcept.java +0 -0
  39. src_python/GeneNER/BIO_format.py +256 -256
  40. src_python/GeneNER/Evaluation_ner.py +243 -243
  41. src_python/GeneNER/model_ner.py +102 -102
  42. src_python/GeneNER/ner_tag.py +85 -85
  43. src_python/GeneNER/processing_data_ner.py +210 -210
  44. src_python/GeneNER/represent_ner.py +183 -183
  45. src_python/GeneNER/restore_index_ner.py +447 -447
  46. src_python/SpeAss/Evaluation_sa.py +396 -396
  47. src_python/SpeAss/SA_Pubtator_Conll.py +493 -493
  48. src_python/SpeAss/ml_tagging_score_sa.py +220 -220
  49. src_python/SpeAss/model_sa.py +105 -105
  50. src_python/SpeAss/processing_data_sa.py +201 -201
CRF/java/.am CHANGED
@@ -1,27 +1,27 @@ (all 27 lines removed and re-added unchanged; file content shown once)
TARGET=MeCab
JAVAC=javac
JAVA=java
JAR=jar
CXX=c++
INCLUDE=/usr/lib/jvm/java-6-openjdk/include

PACKAGE=org/chasen/mecab

LIBS=`mecab-config --libs`
INC=`mecab-config --cflags` -I$(INCLUDE) -I$(INCLUDE)/linux

all:
	$(CXX) -O3 -c -fpic $(TARGET)_wrap.cxx $(INC)
	$(CXX) -shared $(TARGET)_wrap.o -o lib$(TARGET).so $(LIBS)
	$(JAVAC) $(PACKAGE)/*.java
	$(JAVAC) test.java
	$(JAR) cfv $(TARGET).jar $(PACKAGE)/*.class

test:
	env LD_LIBRARY_PATH=. $(JAVA) test

clean:
	rm -fr *.jar *.o *.so *.class $(PACKAGE)/*.class

cleanall:
	rm -fr $(TARGET).java *.cxx
CRF/java/org/chasen/crfpp/Model.java CHANGED
@@ -1,51 +1,51 @@ (all 51 lines removed and re-added unchanged; file content shown once)
/* ----------------------------------------------------------------------------
 * This file was automatically generated by SWIG (http://www.swig.org).
 * Version 1.3.40
 *
 * Do not make changes to this file unless you know what you are doing--modify
 * the SWIG interface file instead.
 * ----------------------------------------------------------------------------- */

package org.chasen.crfpp;

public class Model {
  private long swigCPtr;
  protected boolean swigCMemOwn;

  protected Model(long cPtr, boolean cMemoryOwn) {
    swigCMemOwn = cMemoryOwn;
    swigCPtr = cPtr;
  }

  protected static long getCPtr(Model obj) {
    return (obj == null) ? 0 : obj.swigCPtr;
  }

  protected void finalize() {
    delete();
  }

  public synchronized void delete() {
    if (swigCPtr != 0) {
      if (swigCMemOwn) {
        swigCMemOwn = false;
        CRFPPJNI.delete_Model(swigCPtr);
      }
      swigCPtr = 0;
    }
  }

  public Tagger createTagger() {
    long cPtr = CRFPPJNI.Model_createTagger(swigCPtr, this);
    return (cPtr == 0) ? null : new Tagger(cPtr, false);
  }

  public String what() {
    return CRFPPJNI.Model_what(swigCPtr, this);
  }

  public Model(String arg) {
    this(CRFPPJNI.new_Model(arg), true);
  }

}
CRF/perl/Makefile.old CHANGED
@@ -1,931 +1,931 @@ (all 931 lines removed and re-added unchanged; file content shown once)
# This Makefile is for the CRFPP extension to perl.
#
# It was generated automatically by MakeMaker version
# 6.56 (Revision: 65600) from the contents of
# Makefile.PL. Don't edit this file, edit Makefile.PL instead.
#
# ANY CHANGES MADE HERE WILL BE LOST!
#
# MakeMaker ARGV: ()
#

# MakeMaker Parameters:

# BUILD_REQUIRES => { }
# CC => q[c++]
# INC => q[]
# LD => q[c++]
# LIBS => q[-lpthread -lcrfpp]
# NAME => q[CRFPP]
# OBJECT => q[CRFPP_wrap.o]
# PREREQ_PM => { }

# --- MakeMaker post_initialize section:


# --- MakeMaker const_config section:

# These definitions are from config.sh (via /usr/lib/perl/5.12/Config.pm).
# They may have been overridden via Makefile.PL or on the command line.
AR = ar
CC = c++
CCCDLFLAGS = -fPIC
CCDLFLAGS = -Wl,-E
DLEXT = so
DLSRC = dl_dlopen.xs
EXE_EXT =
FULL_AR = /usr/bin/ar
LD = c++
LDDLFLAGS = -shared -O2 -g -L/usr/local/lib -fstack-protector
LDFLAGS = -fstack-protector -L/usr/local/lib
LIBC =
LIB_EXT = .a
OBJ_EXT = .o
OSNAME = linux
OSVERS = 2.6.24-28-server
RANLIB = :
SITELIBEXP = /usr/local/share/perl/5.12.4
SITEARCHEXP = /usr/local/lib/perl/5.12.4
SO = so
VENDORARCHEXP = /usr/lib/perl5
VENDORLIBEXP = /usr/share/perl5


# --- MakeMaker constants section:
AR_STATIC_ARGS = cr
DIRFILESEP = /
DFSEP = $(DIRFILESEP)
NAME = CRFPP
NAME_SYM = CRFPP
VERSION =
VERSION_MACRO = VERSION
VERSION_SYM =
DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\"
XS_VERSION =
XS_VERSION_MACRO = XS_VERSION
XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\"
INST_ARCHLIB = blib/arch
INST_SCRIPT = blib/script
INST_BIN = blib/bin
INST_LIB = blib/lib
INST_MAN1DIR = blib/man1
INST_MAN3DIR = blib/man3
MAN1EXT = 1p
MAN3EXT = 3pm
INSTALLDIRS = site
DESTDIR =
PREFIX = /usr
PERLPREFIX = $(PREFIX)
SITEPREFIX = $(PREFIX)/local
VENDORPREFIX = $(PREFIX)
INSTALLPRIVLIB = $(PERLPREFIX)/share/perl/5.12
DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB)
INSTALLSITELIB = $(SITEPREFIX)/share/perl/5.12.4
DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB)
INSTALLVENDORLIB = $(VENDORPREFIX)/share/perl5
DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB)
INSTALLARCHLIB = $(PERLPREFIX)/lib/perl/5.12
DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB)
INSTALLSITEARCH = $(SITEPREFIX)/lib/perl/5.12.4
DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH)
INSTALLVENDORARCH = $(VENDORPREFIX)/lib/perl5
DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH)
INSTALLBIN = $(PERLPREFIX)/bin
DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN)
INSTALLSITEBIN = $(SITEPREFIX)/bin
DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN)
INSTALLVENDORBIN = $(VENDORPREFIX)/bin
DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN)
INSTALLSCRIPT = $(PERLPREFIX)/bin
DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT)
INSTALLSITESCRIPT = $(SITEPREFIX)/bin
DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT)
INSTALLVENDORSCRIPT = $(VENDORPREFIX)/bin
DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT)
INSTALLMAN1DIR = $(PERLPREFIX)/share/man/man1
DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR)
INSTALLSITEMAN1DIR = $(SITEPREFIX)/man/man1
DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR)
INSTALLVENDORMAN1DIR = $(VENDORPREFIX)/share/man/man1
DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR)
INSTALLMAN3DIR = $(PERLPREFIX)/share/man/man3
DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR)
INSTALLSITEMAN3DIR = $(SITEPREFIX)/man/man3
DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR)
INSTALLVENDORMAN3DIR = $(VENDORPREFIX)/share/man/man3
DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR)
PERL_LIB = /usr/share/perl/5.12
PERL_ARCHLIB = /usr/lib/perl/5.12
LIBPERL_A = libperl.a
FIRST_MAKEFILE = Makefile
MAKEFILE_OLD = Makefile.old
MAKE_APERL_FILE = Makefile.aperl
PERLMAINCC = $(CC)
PERL_INC = /usr/lib/perl/5.12/CORE
PERL = /usr/bin/perl
FULLPERL = /usr/bin/perl
ABSPERL = $(PERL)
PERLRUN = $(PERL)
FULLPERLRUN = $(FULLPERL)
ABSPERLRUN = $(ABSPERL)
PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
PERL_CORE = 0
PERM_DIR = 755
PERM_RW = 644
PERM_RWX = 755

MAKEMAKER = /usr/share/perl/5.12/ExtUtils/MakeMaker.pm
MM_VERSION = 6.56
MM_REVISION = 65600

# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle).
# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle)
# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar)
# DLBASE = Basename part of dynamic library. May be just equal BASEEXT.
MAKE = make
FULLEXT = CRFPP
BASEEXT = CRFPP
PARENT_NAME =
DLBASE = $(BASEEXT)
VERSION_FROM =
INC =
OBJECT = CRFPP_wrap$(OBJ_EXT)
LDFROM = $(OBJECT)
LINKTYPE = dynamic
BOOTDEP =

# Handy lists of source code files:
XS_FILES =
C_FILES = CRFPP_wrap.cxx
O_FILES = CRFPP_wrap.o
H_FILES =
MAN1PODS =
MAN3PODS =

# Where is the Config information that we are using/depend on
CONFIGDEP = $(PERL_ARCHLIB)$(DFSEP)Config.pm $(PERL_INC)$(DFSEP)config.h

# Where to build things
INST_LIBDIR = $(INST_LIB)
INST_ARCHLIBDIR = $(INST_ARCHLIB)

INST_AUTODIR = $(INST_LIB)/auto/$(FULLEXT)
INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT)

INST_STATIC = $(INST_ARCHAUTODIR)/$(BASEEXT)$(LIB_EXT)
INST_DYNAMIC = $(INST_ARCHAUTODIR)/$(DLBASE).$(DLEXT)
INST_BOOT = $(INST_ARCHAUTODIR)/$(BASEEXT).bs

# Extra linker info
EXPORT_LIST =
PERL_ARCHIVE =
PERL_ARCHIVE_AFTER =


TO_INST_PM = CRFPP.pm

PM_TO_BLIB = CRFPP.pm \
	$(INST_LIB)/CRFPP.pm


# --- MakeMaker platform_constants section:
MM_Unix_VERSION = 6.56
PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc


# --- MakeMaker tool_autosplit section:
# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto
AUTOSPLITFILE = $(ABSPERLRUN) -e 'use AutoSplit; autosplit($$ARGV[0], $$ARGV[1], 0, 1, 1)' --



# --- MakeMaker tool_xsubpp section:

XSUBPPDIR = /usr/share/perl/5.12/ExtUtils
XSUBPP = $(XSUBPPDIR)$(DFSEP)xsubpp
XSUBPPRUN = $(PERLRUN) $(XSUBPP)
XSPROTOARG =
XSUBPPDEPS = /usr/share/perl/5.12/ExtUtils/typemap $(XSUBPP)
XSUBPPARGS = -typemap /usr/share/perl/5.12/ExtUtils/typemap
XSUBPP_EXTRA_ARGS =


# --- MakeMaker tools_other section:
SHELL = /bin/sh
CHMOD = chmod
CP = cp
MV = mv
NOOP = $(TRUE)
NOECHO = @
RM_F = rm -f
RM_RF = rm -rf
TEST_F = test -f
TOUCH = touch
UMASK_NULL = umask 0
DEV_NULL = > /dev/null 2>&1
MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' --
EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' --
FALSE = false
TRUE = true
ECHO = echo
ECHO_N = echo -n
UNINST = 0
VERBINST = 0
MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' --
DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' --
UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' --
WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' --
MACROSTART =
MACROEND =
USEMAKEFILE = -f
FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' --


# --- MakeMaker makemakerdflt section:
makemakerdflt : all
	$(NOECHO) $(NOOP)


# --- MakeMaker dist section:
TAR = tar
TARFLAGS = cvf
ZIP = zip
ZIPFLAGS = -r
COMPRESS = gzip --best
SUFFIX = .gz
SHAR = shar
PREOP = $(NOECHO) $(NOOP)
POSTOP = $(NOECHO) $(NOOP)
TO_UNIX = $(NOECHO) $(NOOP)
CI = ci -u
RCS_LABEL = rcs -Nv$(VERSION_SYM): -q
DIST_CP = best
DIST_DEFAULT = tardist
DISTNAME = CRFPP
DISTVNAME = CRFPP-


# --- MakeMaker macro section:


# --- MakeMaker depend section:


# --- MakeMaker cflags section:

CCFLAGS = -D_REENTRANT -D_GNU_SOURCE -DDEBIAN -fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
OPTIMIZE = -O2 -g
PERLTYPE =
MPOLLUTE =


# --- MakeMaker const_loadlibs section:

# CRFPP might depend on some other libraries:
# See ExtUtils::Liblist for details
#
EXTRALIBS = -lcrfpp
LDLOADLIBS = -lpthread -lcrfpp
BSLOADLIBS =


# --- MakeMaker const_cccmd section:
CCCMD = $(CC) -c $(PASTHRU_INC) $(INC) \
	$(CCFLAGS) $(OPTIMIZE) \
	$(PERLTYPE) $(MPOLLUTE) $(DEFINE_VERSION) \
	$(XS_DEFINE_VERSION)

# --- MakeMaker post_constants section:


# --- MakeMaker pasthru section:

PASTHRU = LIBPERL_A="$(LIBPERL_A)"\
	LINKTYPE="$(LINKTYPE)"\
	OPTIMIZE="$(OPTIMIZE)"\
	PREFIX="$(PREFIX)"\
	PASTHRU_INC="$(PASTHRU_INC)"


# --- MakeMaker special_targets section:
.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT)

.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir



# --- MakeMaker c_o section:

.c.i:
	cc -E -c $(PASTHRU_INC) $(INC) \
	$(CCFLAGS) $(OPTIMIZE) \
	$(PERLTYPE) $(MPOLLUTE) $(DEFINE_VERSION) \
	$(XS_DEFINE_VERSION) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c > $*.i

.c.s:
	$(CCCMD) -S $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c

.c$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c

.cpp$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.cpp

.cxx$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.cxx

.cc$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.cc

.C$(OBJ_EXT):
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.C


# --- MakeMaker xs_c section:

.xs.c:
	$(XSUBPPRUN) $(XSPROTOARG) $(XSUBPPARGS) $(XSUBPP_EXTRA_ARGS) $*.xs > $*.xsc && $(MV) $*.xsc $*.c


# --- MakeMaker xs_o section:

.xs$(OBJ_EXT):
	$(XSUBPPRUN) $(XSPROTOARG) $(XSUBPPARGS) $*.xs > $*.xsc && $(MV) $*.xsc $*.c
	$(CCCMD) $(CCCDLFLAGS) "-I$(PERL_INC)" $(PASTHRU_DEFINE) $(DEFINE) $*.c


# --- MakeMaker top_targets section:
all :: pure_all manifypods
	$(NOECHO) $(NOOP)


pure_all :: config pm_to_blib subdirs linkext
	$(NOECHO) $(NOOP)

subdirs :: $(MYEXTLIB)
	$(NOECHO) $(NOOP)

config :: $(FIRST_MAKEFILE) blibdirs
	$(NOECHO) $(NOOP)

help :
	perldoc ExtUtils::MakeMaker


# --- MakeMaker blibdirs section:
blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists
	$(NOECHO) $(NOOP)

# Backwards compat with 6.18 through 6.25
blibdirs.ts : blibdirs
	$(NOECHO) $(NOOP)

$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_LIBDIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR)
	$(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists

$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_ARCHLIB)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB)
	$(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists

$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_AUTODIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR)
	$(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists

$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR)
	$(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists

$(INST_BIN)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_BIN)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN)
	$(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists

$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_SCRIPT)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT)
	$(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists

$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_MAN1DIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR)
	$(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists

$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL
	$(NOECHO) $(MKPATH) $(INST_MAN3DIR)
	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR)
	$(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists



# --- MakeMaker linkext section:

linkext :: $(LINKTYPE)
	$(NOECHO) $(NOOP)


# --- MakeMaker dlsyms section:


# --- MakeMaker dynamic section:

dynamic :: $(FIRST_MAKEFILE) $(INST_DYNAMIC) $(INST_BOOT)
	$(NOECHO) $(NOOP)


# --- MakeMaker dynamic_bs section:
BOOTSTRAP = $(BASEEXT).bs

# As Mkbootstrap might not write a file (if none is required)
# we use touch to prevent make continually trying to remake it.
# The DynaLoader only reads a non-empty file.
$(BOOTSTRAP) : $(FIRST_MAKEFILE) $(BOOTDEP) $(INST_ARCHAUTODIR)$(DFSEP).exists
	$(NOECHO) $(ECHO) "Running Mkbootstrap for $(NAME) ($(BSLOADLIBS))"
	$(NOECHO) $(PERLRUN) \
		"-MExtUtils::Mkbootstrap" \
		-e "Mkbootstrap('$(BASEEXT)','$(BSLOADLIBS)');"
	$(NOECHO) $(TOUCH) $@
	$(CHMOD) $(PERM_RW) $@

$(INST_BOOT) : $(BOOTSTRAP) $(INST_ARCHAUTODIR)$(DFSEP).exists
	$(NOECHO) $(RM_RF) $@
	- $(CP) $(BOOTSTRAP) $@
	$(CHMOD) $(PERM_RW) $@


# --- MakeMaker dynamic_lib section:

# This section creates the dynamically loadable $(INST_DYNAMIC)
# from $(OBJECT) and possibly $(MYEXTLIB).
ARMAYBE = :
OTHERLDFLAGS =
INST_DYNAMIC_DEP =
INST_DYNAMIC_FIX =

$(INST_DYNAMIC): $(OBJECT) $(MYEXTLIB) $(BOOTSTRAP) $(INST_ARCHAUTODIR)$(DFSEP).exists $(EXPORT_LIST) $(PERL_ARCHIVE) $(PERL_ARCHIVE_AFTER) $(INST_DYNAMIC_DEP)
	$(RM_F) $@
	$(LD) $(LDDLFLAGS) $(LDFROM) $(OTHERLDFLAGS) -o $@ $(MYEXTLIB) \
		$(PERL_ARCHIVE) $(LDLOADLIBS) $(PERL_ARCHIVE_AFTER) $(EXPORT_LIST) \
		$(INST_DYNAMIC_FIX)
	$(CHMOD) $(PERM_RWX) $@


# --- MakeMaker static section:

## $(INST_PM) has been moved to the all: target.
## It remains here for awhile to allow for old usage: "make static"
static :: $(FIRST_MAKEFILE) $(INST_STATIC)
	$(NOECHO) $(NOOP)


# --- MakeMaker static_lib section:

$(INST_STATIC) : $(OBJECT) $(MYEXTLIB) $(INST_ARCHAUTODIR)$(DFSEP).exists
	$(RM_RF) $@
	$(FULL_AR) $(AR_STATIC_ARGS) $@ $(OBJECT) && $(RANLIB) $@
	$(CHMOD) $(PERM_RWX) $@
	$(NOECHO) $(ECHO) "$(EXTRALIBS)" > $(INST_ARCHAUTODIR)/extralibs.ld


# --- MakeMaker manifypods section:

POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--"
POD2MAN = $(POD2MAN_EXE)


manifypods : pure_all
	$(NOECHO) $(NOOP)




# --- MakeMaker processPL section:


# --- MakeMaker installbin section:


# --- MakeMaker subdirs section:

# none

# --- MakeMaker clean_subdirs section:
clean_subdirs :
	$(NOECHO) $(NOOP)


# --- MakeMaker clean section:

# Delete temporary files but do not touch installed files. We don't delete
# the Makefile here so a later make realclean still has a makefile to use.

clean :: clean_subdirs
	- $(RM_F) \
		*$(LIB_EXT) core \
		core.[0-9] $(INST_ARCHAUTODIR)/extralibs.all \
		core.[0-9][0-9] $(BASEEXT).bso \
		pm_to_blib.ts core.[0-9][0-9][0-9][0-9] \
		$(BASEEXT).x $(BOOTSTRAP) \
		perl$(EXE_EXT) tmon.out \
		*$(OBJ_EXT) pm_to_blib \
		$(INST_ARCHAUTODIR)/extralibs.ld blibdirs.ts \
		core.[0-9][0-9][0-9][0-9][0-9] *perl.core \
		core.*perl.*.? $(MAKE_APERL_FILE) \
		perl $(BASEEXT).def \
		core.[0-9][0-9][0-9] mon.out \
		lib$(BASEEXT).def perlmain.c \
		perl.exe so_locations \
		$(BASEEXT).exp
	- $(RM_RF) \
		blib
	- $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL)


# --- MakeMaker realclean_subdirs section:
realclean_subdirs :
	$(NOECHO) $(NOOP)


# --- MakeMaker realclean section:
# Delete temporary files (via clean) and also delete dist files
realclean purge :: clean realclean_subdirs
	- $(RM_F) \
		$(OBJECT) $(MAKEFILE_OLD) \
		$(FIRST_MAKEFILE)
	- $(RM_RF) \
		$(DISTVNAME)


# --- MakeMaker metafile section:
metafile : create_distdir
	$(NOECHO) $(ECHO) Generating META.yml
	$(NOECHO) $(ECHO) '--- #YAML:1.0' > META_new.yml
	$(NOECHO) $(ECHO) 'name: CRFPP' >> META_new.yml
	$(NOECHO) $(ECHO) 'version: ' >> META_new.yml
	$(NOECHO) $(ECHO) 'abstract: ~' >> META_new.yml
	$(NOECHO) $(ECHO) 'author: []' >> META_new.yml
	$(NOECHO) $(ECHO) 'license: unknown' >> META_new.yml
	$(NOECHO) $(ECHO) 'distribution_type: module' >> META_new.yml
	$(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml
	$(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml
	$(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml
	$(NOECHO) $(ECHO) ' ExtUtils::MakeMaker: 0' >> META_new.yml
	$(NOECHO) $(ECHO) 'requires: {}' >> META_new.yml
	$(NOECHO) $(ECHO) 'no_index:' >> META_new.yml
	$(NOECHO) $(ECHO) ' directory:' >> META_new.yml
	$(NOECHO) $(ECHO) ' - t' >> META_new.yml
	$(NOECHO) $(ECHO) ' - inc' >> META_new.yml
	$(NOECHO) $(ECHO) 'generated_by: ExtUtils::MakeMaker version 6.56' >> META_new.yml
	$(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml
	$(NOECHO) $(ECHO) ' url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml
	$(NOECHO) $(ECHO) ' version: 1.4' >> META_new.yml
	-$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml


# --- MakeMaker signature section:
signature :
	cpansign -s


# --- MakeMaker dist_basics section:
distclean :: realclean distcheck
	$(NOECHO) $(NOOP)

distcheck :
	$(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck

skipcheck :
	$(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck

manifest :
	$(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest

veryclean : realclean
	$(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old



# --- MakeMaker dist_core section:

dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE)
	$(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \
		-e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' --

tardist : $(DISTVNAME).tar$(SUFFIX)
	$(NOECHO) $(NOOP)

uutardist : $(DISTVNAME).tar$(SUFFIX)
	uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu

$(DISTVNAME).tar$(SUFFIX) : distdir
	$(PREOP)
	$(TO_UNIX)
	$(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME)
	$(RM_RF) $(DISTVNAME)
	$(COMPRESS) $(DISTVNAME).tar
	$(POSTOP)

zipdist : $(DISTVNAME).zip
	$(NOECHO) $(NOOP)

$(DISTVNAME).zip : distdir
	$(PREOP)
	$(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME)
	$(RM_RF) $(DISTVNAME)
	$(POSTOP)

shdist : distdir
	$(PREOP)
	$(SHAR) $(DISTVNAME) > $(DISTVNAME).shar
	$(RM_RF) $(DISTVNAME)
	$(POSTOP)


# --- MakeMaker distdir section:
create_distdir :
	$(RM_RF) $(DISTVNAME)
	$(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \
		-e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');"

distdir : create_distdir distmeta
	$(NOECHO) $(NOOP)



# --- MakeMaker dist_test section:
disttest : distdir
	cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL
	cd $(DISTVNAME) && $(MAKE) $(PASTHRU)
	cd $(DISTVNAME) && $(MAKE) test $(PASTHRU)



# --- MakeMaker dist_ci section:

ci :
	$(PERLRUN) "-MExtUtils::Manifest=maniread" \
		-e "@all = keys %{ maniread() };" \
		-e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \
		-e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});"


# --- MakeMaker distmeta section:
distmeta : create_distdir metafile
	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{META.yml} => q{Module meta-data (added by MakeMaker)}}) } ' \
		-e ' or print "Could not add META.yml to MANIFEST: $${'\''@'\''}\n"' --



# --- MakeMaker distsignature section:
distsignature : create_distdir
	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) } ' \
		-e ' or print "Could not add SIGNATURE to MANIFEST: $${'\''@'\''}\n"' --
	$(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE
	cd $(DISTVNAME) && cpansign -s



# --- MakeMaker install section:

install :: pure_install doc_install
	$(NOECHO) $(NOOP)

install_perl :: pure_perl_install doc_perl_install
	$(NOECHO) $(NOOP)

install_site :: pure_site_install doc_site_install
	$(NOECHO) $(NOOP)

install_vendor :: pure_vendor_install doc_vendor_install
	$(NOECHO) $(NOOP)

pure_install :: pure_$(INSTALLDIRS)_install
	$(NOECHO) $(NOOP)

doc_install :: doc_$(INSTALLDIRS)_install
	$(NOECHO) $(NOOP)

pure__install : pure_site_install
	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site

doc__install : doc_site_install
	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site

pure_perl_install :: all
	$(NOECHO) umask 022; $(MOD_INSTALL) \
		$(INST_LIB) $(DESTINSTALLPRIVLIB) \
		$(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \
		$(INST_BIN) $(DESTINSTALLBIN) \
		$(INST_SCRIPT) $(DESTINSTALLSCRIPT) \
		$(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \
		$(INST_MAN3DIR) $(DESTINSTALLMAN3DIR)
	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
		$(SITEARCHEXP)/auto/$(FULLEXT)


pure_site_install :: all
	$(NOECHO) umask 02; $(MOD_INSTALL) \
		read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \
		write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \
		$(INST_LIB) $(DESTINSTALLSITELIB) \
		$(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \
		$(INST_BIN) $(DESTINSTALLSITEBIN) \
		$(INST_SCRIPT) $(DESTINSTALLSITESCRIPT) \
		$(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \
		$(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR)
	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
		$(PERL_ARCHLIB)/auto/$(FULLEXT)

pure_vendor_install :: all
	$(NOECHO) umask 022; $(MOD_INSTALL) \
		$(INST_LIB) $(DESTINSTALLVENDORLIB) \
		$(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \
		$(INST_BIN) $(DESTINSTALLVENDORBIN) \
		$(INST_SCRIPT) $(DESTINSTALLVENDORSCRIPT) \
		$(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \
		$(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR)

doc_perl_install :: all

doc_site_install :: all
	$(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLSITEARCH)/perllocal.pod
	-$(NOECHO) umask 02; $(MKPATH) $(DESTINSTALLSITEARCH)
	-$(NOECHO) umask 02; $(DOC_INSTALL) \
		"Module" "$(NAME)" \
		"installed into" "$(INSTALLSITELIB)" \
		LINKTYPE "$(LINKTYPE)" \
		VERSION "$(VERSION)" \
		EXE_FILES "$(EXE_FILES)" \
		>> $(DESTINSTALLSITEARCH)/perllocal.pod

doc_vendor_install :: all


uninstall :: uninstall_from_$(INSTALLDIRS)dirs
	$(NOECHO) $(NOOP)

uninstall_from_perldirs ::

uninstall_from_sitedirs ::
	$(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist

uninstall_from_vendordirs ::



# --- MakeMaker force section:
# Phony target to force checking subdirectories.
FORCE :
	$(NOECHO) $(NOOP)


# --- MakeMaker perldepend section:

PERL_HDRS = \
	$(PERL_INC)/EXTERN.h \
	$(PERL_INC)/INTERN.h \
	$(PERL_INC)/XSUB.h \
	$(PERL_INC)/av.h \
	$(PERL_INC)/cc_runtime.h \
	$(PERL_INC)/config.h \
	$(PERL_INC)/cop.h \
	$(PERL_INC)/cv.h \
	$(PERL_INC)/dosish.h \
	$(PERL_INC)/embed.h \
	$(PERL_INC)/embedvar.h \
	$(PERL_INC)/fakethr.h \
	$(PERL_INC)/form.h \
	$(PERL_INC)/gv.h \
	$(PERL_INC)/handy.h \
	$(PERL_INC)/hv.h \
	$(PERL_INC)/intrpvar.h \
	$(PERL_INC)/iperlsys.h \
	$(PERL_INC)/keywords.h \
	$(PERL_INC)/mg.h \
	$(PERL_INC)/nostdio.h \
	$(PERL_INC)/op.h \
	$(PERL_INC)/opcode.h \
	$(PERL_INC)/patchlevel.h \
	$(PERL_INC)/perl.h \
	$(PERL_INC)/perlio.h \
	$(PERL_INC)/perlsdio.h \
	$(PERL_INC)/perlsfio.h \
	$(PERL_INC)/perlvars.h \
	$(PERL_INC)/perly.h \
	$(PERL_INC)/pp.h \
	$(PERL_INC)/pp_proto.h \
	$(PERL_INC)/proto.h \
	$(PERL_INC)/regcomp.h \
	$(PERL_INC)/regexp.h \
	$(PERL_INC)/regnodes.h \
	$(PERL_INC)/scope.h \
	$(PERL_INC)/sv.h \
	$(PERL_INC)/thread.h \
	$(PERL_INC)/unixish.h \
	$(PERL_INC)/util.h

$(OBJECT) : $(PERL_HDRS)


# --- MakeMaker makefile section:

$(OBJECT) : $(FIRST_MAKEFILE)

# We take a very conservative approach here, but it's worth it.
# We move Makefile to Makefile.old here to avoid gnu make looping.
$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
	$(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
	$(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
	-$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
	-$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
	- $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
	$(PERLRUN) Makefile.PL
	$(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
	$(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. <=="
	$(FALSE)



# --- MakeMaker staticmake section:

# --- MakeMaker makeaperl section ---
MAP_TARGET = perl
FULLPERL = /usr/bin/perl

$(MAP_TARGET) :: static $(MAKE_APERL_FILE)
	$(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@

$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
	$(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
	$(NOECHO) $(PERLRUNINST) \
		Makefile.PL DIR= \
		MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
		MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=


# --- MakeMaker test section:

TEST_VERBOSE=0
TEST_TYPE=test_$(LINKTYPE)
TEST_FILE = test.pl
TEST_FILES =
TESTDB_SW = -d

testdb :: testdb_$(LINKTYPE)

test :: $(TEST_TYPE) subdirs-test

subdirs-test ::
	$(NOECHO) $(NOOP)


test_dynamic :: pure_all
	PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)

testdb_dynamic :: pure_all
	PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)

test_ : test_dynamic

test_static :: pure_all $(MAP_TARGET)
	PERL_DL_NONLAZY=1 ./$(MAP_TARGET) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)

testdb_static :: pure_all $(MAP_TARGET)
	PERL_DL_NONLAZY=1 ./$(MAP_TARGET) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)



# --- MakeMaker ppd section:
# Creates a PPD (Perl Package Description) for a binary distribution.
ppd :
	$(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="">' > $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <ABSTRACT></ABSTRACT>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <AUTHOR></AUTHOR>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <IMPLEMENTATION>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <ARCHITECTURE NAME="x86_64-linux-gnu-thread-multi-5.12" />' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' <CODEBASE HREF="" />' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) ' </IMPLEMENTATION>' >> $(DISTNAME).ppd
	$(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd


# --- MakeMaker pm_to_blib section:

pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
	$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
		CRFPP.pm $(INST_LIB)/CRFPP.pm
	$(NOECHO) $(TOUCH) pm_to_blib


# --- MakeMaker selfdocument section:


# --- MakeMaker postamble section:


# End.
699
+ install_perl :: pure_perl_install doc_perl_install
700
+ $(NOECHO) $(NOOP)
701
+
702
+ install_site :: pure_site_install doc_site_install
703
+ $(NOECHO) $(NOOP)
704
+
705
+ install_vendor :: pure_vendor_install doc_vendor_install
706
+ $(NOECHO) $(NOOP)
707
+
708
+ pure_install :: pure_$(INSTALLDIRS)_install
709
+ $(NOECHO) $(NOOP)
710
+
711
+ doc_install :: doc_$(INSTALLDIRS)_install
712
+ $(NOECHO) $(NOOP)
713
+
714
+ pure__install : pure_site_install
715
+ $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
716
+
717
+ doc__install : doc_site_install
718
+ $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
719
+
720
+ pure_perl_install :: all
721
+ $(NOECHO) umask 022; $(MOD_INSTALL) \
722
+ $(INST_LIB) $(DESTINSTALLPRIVLIB) \
723
+ $(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \
724
+ $(INST_BIN) $(DESTINSTALLBIN) \
725
+ $(INST_SCRIPT) $(DESTINSTALLSCRIPT) \
726
+ $(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \
727
+ $(INST_MAN3DIR) $(DESTINSTALLMAN3DIR)
728
+ $(NOECHO) $(WARN_IF_OLD_PACKLIST) \
729
+ $(SITEARCHEXP)/auto/$(FULLEXT)
730
+
731
+
732
+ pure_site_install :: all
733
+ $(NOECHO) umask 02; $(MOD_INSTALL) \
734
+ read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \
735
+ write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \
736
+ $(INST_LIB) $(DESTINSTALLSITELIB) \
737
+ $(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \
738
+ $(INST_BIN) $(DESTINSTALLSITEBIN) \
739
+ $(INST_SCRIPT) $(DESTINSTALLSITESCRIPT) \
740
+ $(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \
741
+ $(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR)
742
+ $(NOECHO) $(WARN_IF_OLD_PACKLIST) \
743
+ $(PERL_ARCHLIB)/auto/$(FULLEXT)
744
+
745
+ pure_vendor_install :: all
746
+ $(NOECHO) umask 022; $(MOD_INSTALL) \
747
+ $(INST_LIB) $(DESTINSTALLVENDORLIB) \
748
+ $(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \
749
+ $(INST_BIN) $(DESTINSTALLVENDORBIN) \
750
+ $(INST_SCRIPT) $(DESTINSTALLVENDORSCRIPT) \
751
+ $(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \
752
+ $(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR)
753
+
754
+ doc_perl_install :: all
755
+
756
+ doc_site_install :: all
757
+ $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLSITEARCH)/perllocal.pod
758
+ -$(NOECHO) umask 02; $(MKPATH) $(DESTINSTALLSITEARCH)
759
+ -$(NOECHO) umask 02; $(DOC_INSTALL) \
760
+ "Module" "$(NAME)" \
761
+ "installed into" "$(INSTALLSITELIB)" \
762
+ LINKTYPE "$(LINKTYPE)" \
763
+ VERSION "$(VERSION)" \
764
+ EXE_FILES "$(EXE_FILES)" \
765
+ >> $(DESTINSTALLSITEARCH)/perllocal.pod
766
+
767
+ doc_vendor_install :: all
768
+
769
+
770
+ uninstall :: uninstall_from_$(INSTALLDIRS)dirs
771
+ $(NOECHO) $(NOOP)
772
+
773
+ uninstall_from_perldirs ::
774
+
775
+ uninstall_from_sitedirs ::
776
+ $(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist
777
+
778
+ uninstall_from_vendordirs ::
779
+
780
+
781
+
782
+ # --- MakeMaker force section:
783
+ # Phony target to force checking subdirectories.
784
+ FORCE :
785
+ $(NOECHO) $(NOOP)
786
+
787
+
788
+ # --- MakeMaker perldepend section:
789
+
790
+ PERL_HDRS = \
791
+ $(PERL_INC)/EXTERN.h \
792
+ $(PERL_INC)/INTERN.h \
793
+ $(PERL_INC)/XSUB.h \
794
+ $(PERL_INC)/av.h \
795
+ $(PERL_INC)/cc_runtime.h \
796
+ $(PERL_INC)/config.h \
797
+ $(PERL_INC)/cop.h \
798
+ $(PERL_INC)/cv.h \
799
+ $(PERL_INC)/dosish.h \
800
+ $(PERL_INC)/embed.h \
801
+ $(PERL_INC)/embedvar.h \
802
+ $(PERL_INC)/fakethr.h \
803
+ $(PERL_INC)/form.h \
804
+ $(PERL_INC)/gv.h \
805
+ $(PERL_INC)/handy.h \
806
+ $(PERL_INC)/hv.h \
807
+ $(PERL_INC)/intrpvar.h \
808
+ $(PERL_INC)/iperlsys.h \
809
+ $(PERL_INC)/keywords.h \
810
+ $(PERL_INC)/mg.h \
811
+ $(PERL_INC)/nostdio.h \
812
+ $(PERL_INC)/op.h \
813
+ $(PERL_INC)/opcode.h \
814
+ $(PERL_INC)/patchlevel.h \
815
+ $(PERL_INC)/perl.h \
816
+ $(PERL_INC)/perlio.h \
817
+ $(PERL_INC)/perlsdio.h \
818
+ $(PERL_INC)/perlsfio.h \
819
+ $(PERL_INC)/perlvars.h \
820
+ $(PERL_INC)/perly.h \
821
+ $(PERL_INC)/pp.h \
822
+ $(PERL_INC)/pp_proto.h \
823
+ $(PERL_INC)/proto.h \
824
+ $(PERL_INC)/regcomp.h \
825
+ $(PERL_INC)/regexp.h \
826
+ $(PERL_INC)/regnodes.h \
827
+ $(PERL_INC)/scope.h \
828
+ $(PERL_INC)/sv.h \
829
+ $(PERL_INC)/thread.h \
830
+ $(PERL_INC)/unixish.h \
831
+ $(PERL_INC)/util.h
832
+
833
+ $(OBJECT) : $(PERL_HDRS)
834
+
835
+
836
+ # --- MakeMaker makefile section:
837
+
838
+ $(OBJECT) : $(FIRST_MAKEFILE)
839
+
840
+ # We take a very conservative approach here, but it's worth it.
841
+ # We move Makefile to Makefile.old here to avoid gnu make looping.
842
+ $(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
843
+ $(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
844
+ $(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
845
+ -$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
846
+ -$(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
847
+ - $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
848
+ $(PERLRUN) Makefile.PL
849
+ $(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
850
+ $(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command. <=="
851
+ $(FALSE)
852
+
853
+
854
+
855
+ # --- MakeMaker staticmake section:
856
+
857
+ # --- MakeMaker makeaperl section ---
858
+ MAP_TARGET = perl
859
+ FULLPERL = /usr/bin/perl
860
+
861
+ $(MAP_TARGET) :: static $(MAKE_APERL_FILE)
862
+ $(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@
863
+
864
+ $(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
865
+ $(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
866
+ $(NOECHO) $(PERLRUNINST) \
867
+ Makefile.PL DIR= \
868
+ MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
869
+ MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=
870
+
871
+
872
+ # --- MakeMaker test section:
873
+
874
+ TEST_VERBOSE=0
875
+ TEST_TYPE=test_$(LINKTYPE)
876
+ TEST_FILE = test.pl
877
+ TEST_FILES =
878
+ TESTDB_SW = -d
879
+
880
+ testdb :: testdb_$(LINKTYPE)
881
+
882
+ test :: $(TEST_TYPE) subdirs-test
883
+
884
+ subdirs-test ::
885
+ $(NOECHO) $(NOOP)
886
+
887
+
888
+ test_dynamic :: pure_all
889
+ PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
890
+
891
+ testdb_dynamic :: pure_all
892
+ PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
893
+
894
+ test_ : test_dynamic
895
+
896
+ test_static :: pure_all $(MAP_TARGET)
897
+ PERL_DL_NONLAZY=1 ./$(MAP_TARGET) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
898
+
899
+ testdb_static :: pure_all $(MAP_TARGET)
900
+ PERL_DL_NONLAZY=1 ./$(MAP_TARGET) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
901
+
902
+
903
+
904
+ # --- MakeMaker ppd section:
905
+ # Creates a PPD (Perl Package Description) for a binary distribution.
906
+ ppd :
907
+ $(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="">' > $(DISTNAME).ppd
908
+ $(NOECHO) $(ECHO) ' <ABSTRACT></ABSTRACT>' >> $(DISTNAME).ppd
909
+ $(NOECHO) $(ECHO) ' <AUTHOR></AUTHOR>' >> $(DISTNAME).ppd
910
+ $(NOECHO) $(ECHO) ' <IMPLEMENTATION>' >> $(DISTNAME).ppd
911
+ $(NOECHO) $(ECHO) ' <ARCHITECTURE NAME="x86_64-linux-gnu-thread-multi-5.12" />' >> $(DISTNAME).ppd
912
+ $(NOECHO) $(ECHO) ' <CODEBASE HREF="" />' >> $(DISTNAME).ppd
913
+ $(NOECHO) $(ECHO) ' </IMPLEMENTATION>' >> $(DISTNAME).ppd
914
+ $(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd
915
+
916
+
917
+ # --- MakeMaker pm_to_blib section:
918
+
919
+ pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
920
+ $(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
921
+ CRFPP.pm $(INST_LIB)/CRFPP.pm
922
+ $(NOECHO) $(TOUCH) pm_to_blib
923
+
924
+
925
+ # --- MakeMaker selfdocument section:
926
+
927
+
928
+ # --- MakeMaker postamble section:
929
+
930
+
931
+ # End.
CRF/ruby/Makefile CHANGED
@@ -1,157 +1,157 @@
1
-
2
- SHELL = /bin/sh
3
-
4
- #### Start of system configuration section. ####
5
-
6
- srcdir = .
7
- topdir = /usr/lib/ruby/1.8/x86_64-linux
8
- hdrdir = $(topdir)
9
- VPATH = $(srcdir):$(topdir):$(hdrdir)
10
- exec_prefix = $(prefix)
11
- prefix = $(DESTDIR)/usr
12
- sharedstatedir = $(prefix)/com
13
- mandir = $(prefix)/share/man
14
- psdir = $(docdir)
15
- oldincludedir = $(DESTDIR)/usr/include
16
- localedir = $(datarootdir)/locale
17
- bindir = $(exec_prefix)/bin
18
- libexecdir = $(prefix)/lib/ruby1.8
19
- sitedir = $(DESTDIR)/usr/local/lib/site_ruby
20
- htmldir = $(docdir)
21
- vendorarchdir = $(vendorlibdir)/$(sitearch)
22
- includedir = $(prefix)/include
23
- infodir = $(prefix)/share/info
24
- vendorlibdir = $(vendordir)/$(ruby_version)
25
- sysconfdir = $(DESTDIR)/etc
26
- libdir = $(exec_prefix)/lib
27
- sbindir = $(exec_prefix)/sbin
28
- rubylibdir = $(libdir)/ruby/$(ruby_version)
29
- docdir = $(datarootdir)/doc/$(PACKAGE)
30
- dvidir = $(docdir)
31
- vendordir = $(libdir)/ruby/vendor_ruby
32
- datarootdir = $(prefix)/share
33
- pdfdir = $(docdir)
34
- archdir = $(rubylibdir)/$(arch)
35
- sitearchdir = $(sitelibdir)/$(sitearch)
36
- datadir = $(datarootdir)
37
- localstatedir = $(DESTDIR)/var
38
- sitelibdir = $(sitedir)/$(ruby_version)
39
-
40
- CC = gcc
41
- LIBRUBY = $(LIBRUBY_SO)
42
- LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
43
- LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
44
- LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
45
-
46
- RUBY_EXTCONF_H =
47
- CFLAGS = -fPIC -fno-strict-aliasing -g -g -O2 -fPIC $(cflags)
48
- INCFLAGS = -I. -I. -I/usr/lib/ruby/1.8/x86_64-linux -I.
49
- DEFS =
50
- CPPFLAGS = -DHAVE_CRFPP_H
51
- CXXFLAGS = $(CFLAGS)
52
- ldflags = -L. -Wl,-Bsymbolic-functions -rdynamic -Wl,-export-dynamic
53
- dldflags =
54
- archflag =
55
- DLDFLAGS = $(ldflags) $(dldflags) $(archflag)
56
- LDSHARED = $(CC) -shared
57
- AR = ar
58
- EXEEXT =
59
-
60
- RUBY_INSTALL_NAME = ruby1.8
61
- RUBY_SO_NAME = ruby1.8
62
- arch = x86_64-linux
63
- sitearch = x86_64-linux
64
- ruby_version = 1.8
65
- ruby = /usr/bin/ruby1.8
66
- RUBY = $(ruby)
67
- RM = rm -f
68
- MAKEDIRS = mkdir -p
69
- INSTALL = /usr/bin/install -c
70
- INSTALL_PROG = $(INSTALL) -m 0755
71
- INSTALL_DATA = $(INSTALL) -m 644
72
- COPY = cp
73
-
74
- #### End of system configuration section. ####
75
-
76
- preload =
77
-
78
- libpath = . $(libdir)
79
- LIBPATH = -L. -L$(libdir)
80
- DEFFILE =
81
-
82
- CLEANFILES = mkmf.log
83
- DISTCLEANFILES =
84
-
85
- extout =
86
- extout_prefix =
87
- target_prefix =
88
- LOCAL_LIBS =
89
- LIBS = $(LIBRUBYARG_SHARED) -lpthread -lcrfpp -lpthread -lrt -ldl -lcrypt -lm -lc
90
- SRCS = CRFPP_wrap.cpp
91
- OBJS = CRFPP_wrap.o
92
- TARGET = CRFPP
93
- DLLIB = $(TARGET).so
94
- EXTSTATIC =
95
- STATIC_LIB =
96
-
97
- BINDIR = $(bindir)
98
- RUBYCOMMONDIR = $(sitedir)$(target_prefix)
99
- RUBYLIBDIR = $(sitelibdir)$(target_prefix)
100
- RUBYARCHDIR = $(sitearchdir)$(target_prefix)
101
-
102
- TARGET_SO = $(DLLIB)
103
- CLEANLIBS = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
104
- CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
105
-
106
- all: $(DLLIB)
107
- static: $(STATIC_LIB)
108
-
109
- clean:
110
- @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
111
-
112
- distclean: clean
113
- @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
114
- @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
115
-
116
- realclean: distclean
117
- install: install-so install-rb
118
-
119
- install-so: $(RUBYARCHDIR)
120
- install-so: $(RUBYARCHDIR)/$(DLLIB)
121
- $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
122
- $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
123
- install-rb: pre-install-rb install-rb-default
124
- install-rb-default: pre-install-rb-default
125
- pre-install-rb: Makefile
126
- pre-install-rb-default: Makefile
127
- $(RUBYARCHDIR):
128
- $(MAKEDIRS) $@
129
-
130
- site-install: site-install-so site-install-rb
131
- site-install-so: install-so
132
- site-install-rb: install-rb
133
-
134
- .SUFFIXES: .c .m .cc .cxx .cpp .C .o
135
-
136
- .cc.o:
137
- $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
138
-
139
- .cxx.o:
140
- $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
141
-
142
- .cpp.o:
143
- $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
144
-
145
- .C.o:
146
- $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
147
-
148
- .c.o:
149
- $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<
150
-
151
- $(DLLIB): $(OBJS) Makefile
152
- @-$(RM) $@
153
- $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
154
-
155
-
156
-
157
- $(OBJS): ruby.h defines.h
 
CRF/winmain.h CHANGED
@@ -1,69 +1,69 @@
1
- //
2
- // CRF++ -- Yet Another CRF toolkit
3
- //
4
- // $Id: common.h 1588 2007-02-12 09:03:39Z taku $;
5
- //
6
- // Copyright(C) 2005-2007 Taku Kudo <taku@chasen.org>
7
- //
8
- #if defined(_WIN32) || defined(__CYGWIN__)
9
-
10
- #include <windows.h>
11
- #include <string>
12
-
13
- namespace {
14
- class CommandLine {
15
- public:
16
- CommandLine(int argc, wchar_t **argv) : argc_(argc), argv_(0) {
17
- argv_ = new char * [argc_];
18
- for (int i = 0; i < argc_; ++i) {
19
- const std::string arg = WideToUtf8(argv[i]);
20
- argv_[i] = new char[arg.size() + 1];
21
- ::memcpy(argv_[i], arg.data(), arg.size());
22
- argv_[i][arg.size()] = '\0';
23
- }
24
- }
25
- ~CommandLine() {
26
- for (int i = 0; i < argc_; ++i) {
27
- delete [] argv_[i];
28
- }
29
- delete [] argv_;
30
- }
31
-
32
- int argc() const { return argc_; }
33
- char **argv() const { return argv_; }
34
-
35
- private:
36
- static std::string WideToUtf8(const std::wstring &input) {
37
- const int output_length = ::WideCharToMultiByte(CP_UTF8, 0,
38
- input.c_str(), -1, NULL, 0,
39
- NULL, NULL);
40
- if (output_length == 0) {
41
- return "";
42
- }
43
-
44
- char *input_encoded = new char[output_length + 1];
45
- const int result = ::WideCharToMultiByte(CP_UTF8, 0, input.c_str(), -1,
46
- input_encoded,
47
- output_length + 1, NULL, NULL);
48
- std::string output;
49
- if (result > 0) {
50
- output.assign(input_encoded);
51
- }
52
- delete [] input_encoded;
53
- return output;
54
- }
55
-
56
- int argc_;
57
- char **argv_;
58
- };
59
- } // namespace
60
-
61
- #define main(argc, argv) wmain_to_main_wrapper(argc, argv)
62
-
63
- int wmain_to_main_wrapper(int argc, char **argv);
64
-
65
- int wmain(int argc, wchar_t **argv) {
66
- CommandLine cmd(argc, argv);
67
- return wmain_to_main_wrapper(cmd.argc(), cmd.argv());
68
- }
69
- #endif
 
GeneNER_SpeAss_run.py CHANGED
@@ -1,746 +1,746 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Wed Jun 8 09:26:57 2022
4
-
5
- @author: luol2
6
-
7
- Pipeline: first gene NER, then species assignment
8
- input: species NER results in BioC XML (or PubTator) format
9
- output: gene NER and species assignment results in the same format
10
- """
11
- import argparse
12
- import os
13
- import io
14
- import time
15
- import sys
16
- import re
17
- import shutil
18
- from src_python.GeneNER import model_ner,ner_tag
19
- from src_python.SpeAss import model_sa,sa_tag
20
-
21
- import tensorflow as tf
22
-
23
- import bioc
24
- import stanza
25
- nlp_token = stanza.Pipeline(model_dir='gnorm_trained_models/stanza', lang='en', processors={'tokenize': 'spacy'},package='None', download_method=None) #package='craft' ;./gnorm_trained_models/stanza
26
-
27
- def NER_BioC(infolder,infile,outpath,nn_model):
28
-
29
- with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
30
- with open(outpath+"/"+infile,'w', encoding='utf8') as fout:
31
- collection = bioc.load(fin)
32
-
33
- Total_n=len(collection.documents)
34
- print('Total number of sub-documents:', Total_n)
35
- pmid_n=0
36
- for document in collection.documents:
37
- print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
38
- pmid_n+=1
39
- # print(document.id)
40
- mention_num_new=0
41
- for passage in document.passages:
42
- if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref': # have text and is not ref
43
- passage_offset=passage.offset
44
- tag_result=ner_tag.ML_Tag(passage.text,nn_model,nlp_token)
45
- mention_num=0
46
- for ele in tag_result:
47
- bioc_note = bioc.BioCAnnotation()
48
- bioc_note.id = str(mention_num)
49
- mention_num+=1
50
- bioc_note.infons['type'] = ele[2]
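- # ele is a (start, end, type) triple with offsets relative to passage.text;
- # BioC stores document-level offsets, so passage_offset is added back below
- # (e.g. a passage at offset 100 with a mention at [5,9) is written with
- # offset "105" and length "4" -- numbers illustrative)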
51
- start = int(ele[0])
52
- last = int(ele[1])
53
- loc = bioc.BioCLocation(offset=str(passage_offset+start), length= str(last-start))
54
- bioc_note.locations.append(loc)
55
- bioc_note.text = passage.text[start:last]
56
- passage.annotations.append(bioc_note)
57
- #update id
58
- for temp_annotation in passage.annotations:
59
- temp_annotation.id=str(mention_num_new)
60
- mention_num_new+=1
61
- bioc.dump(collection, fout, pretty_print=True)
62
-
63
- def NER_PubTator(infolder,infile,outpath,nn_model):
64
- with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
65
- with open(outpath+"/"+infile,'w', encoding='utf-8') as fout:
66
- title=''
67
- abstract=''
68
- all_text=fin.read().strip().split('\n\n')
69
- Total_n=len(all_text)
70
- print('Total number of sub-documents:', Total_n)
71
- pmid_n=0
72
- for doc in all_text:
73
- print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
74
- pmid_n+=1
75
- lines = doc.split('\n')
76
- seg=lines[0].split('|t|')
77
- pmid=seg[0]
78
- title=""
79
- if len(seg)>1:
80
- title=seg[1]
81
- abstract=""
82
- if len(lines)>1:
83
- seg=lines[1].split('|a|')
84
- if len(seg)>1:
86
- abstract=seg[1]
87
-
88
- intext=title+' '+abstract
89
- tag_result=ner_tag.ML_Tag(intext,nn_model,nlp_token)
90
- fout.write(doc+'\n')
91
- for ele in tag_result:
92
- ent_start = ele[0]
93
- ent_last = ele[1]
94
- ent_mention = intext[int(ele[0]):int(ele[1])]
95
- ent_type=ele[2]
96
- fout.write(pmid+"\t"+ent_start+"\t"+ent_last+"\t"+ent_mention+"\t"+ent_type+"\n")
97
- fout.write('\n')
98
- title=''
99
- abstract=''
100
-
101
- def geneNER(infolder, outpath, modelfile):
102
-
103
- print('loading NER models........')
104
-
105
- if modelfile.lower().find('bioformer')>=0:
106
- vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
107
- 'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/', #bioformer-cased-v1.0
108
- 'lowercase':False,
109
- }
110
- else:
111
- vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
112
- 'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
113
- 'lowercase':True,
114
- }
115
-
116
- nn_model=model_ner.HUGFACE_NER(vocabfiles)
117
- nn_model.build_encoder()
118
- nn_model.build_softmax_decoder()
119
- nn_model.load_model(modelfile)
120
-
121
- #tagging text
122
- print("begin GeneNER tagging........")
123
- start_time=time.time()
124
-
125
- for infile in os.listdir(infolder):
126
- if os.path.isfile(outpath+"/"+infile):
127
- print(infile+' already exists.')
128
- else:
129
- print('processing:',infile)
130
- fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
131
- input_format=""
132
- for line in fin:
133
- pattern_bioc = re.compile('.*<collection>.*')
134
- pattern_pubtator = re.compile('^([^\|]+)\|[^\|]+\|(.*)')
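- # e.g. a '<collection>' tag (illustrative) marks BioC XML, while a line
- # like '12345|t|Some title' matches the PubTator pmid|t|title pattern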
135
- if pattern_bioc.search(line):
136
- input_format="BioC"
137
- break
138
- elif pattern_pubtator.search(line):
139
- input_format="PubTator"
140
- break
141
- fin.close()
142
- if(input_format == "PubTator"):
143
- NER_PubTator(infolder,infile,outpath,nn_model)
144
- elif(input_format == "BioC"):
145
- NER_BioC(infolder,infile,outpath,nn_model)
146
-
147
- print('tag done:',time.time()-start_time)
148
-
149
-
150
- #SA for bioc format
151
- def SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
152
-
153
- #BioC xml to pubtator
154
- # pmid|t|text1
155
- #pmid|a|text2
156
- #pmid ann_id start end entity_txt entity_type species_id (the last column is absent for genes)
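- # An illustrative (hypothetical) record in this intermediate format:
- # 12345|t|BRCA1 mutations in human breast cancer.
- # 12345|a|We analysed BRCA1 in human and mouse samples.
- # 12345 0 0 5 BRCA1 Gene
- # 12345 1 19 24 human Species *9606
- # (annotation columns are tab-separated; '*' marks a focus-species candidate)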
157
- fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
158
- # fout_pubtator=open(outpath+'tmp/input_xml.pubtator','w', encoding='utf-8')
159
- fin_pubtator0=io.StringIO() #none *species
160
- fin_pubtator1=io.StringIO() #one *species
161
- fin_pubtator2=io.StringIO() #two or more species
162
- collection = bioc.load(fin)
163
- fin.close()
164
- ori_ann_index={} #{'pmid':{'ent.id':'ent_s-ent_e'}}
165
- species_count={} #{pmid:{speid:num}}
166
- gene_set=['Gene','FamilyName']
167
- final_sa_results={} #{'pmid':{'entity_id':species_id}}
168
- for document in collection.documents:
169
- doc_pmid=document.id
170
- doc_title=''
171
- doc_abstract=''
172
- doc_annotation=[]
173
- _ann_index={}
174
- _species_num={} #{*speciesid:num}
175
- _gene_num=0
176
- _passage_num=0
177
- if len(document.passages)<=2: #abstract xml, or a PMC article that only has a title
178
- for passage in document.passages:
179
- passage_offset=passage.offset
180
- _passage_num+=1
181
- #print(passage_offset,type(passage_offset))
182
- #if passage.infons['type']=='title' or passage.infons['type']=='front':
183
- if _passage_num==1:
184
- doc_title=passage.text
185
- for temp_annotation in passage.annotations:
186
- if temp_annotation.infons['type'] in gene_set:
187
- _gene_num+=1
188
- ent_start=temp_annotation.locations[0].offset-passage_offset
189
- ent_end=ent_start+temp_annotation.locations[0].length
190
- #print(ent_start,ent_end)
191
- _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
192
- # print(temp_annotation.infons)
193
- if 'Identifier' in temp_annotation.infons.keys():
194
- # print(temp_annotation.infons.keys['Identifier'])
195
- species_ID=temp_annotation.infons['Identifier']
196
- if species_ID.find('*')>=0:
197
- if species_ID not in _species_num.keys():
198
- _species_num[species_ID]=1
199
- else:
200
- _species_num[species_ID]+=1
201
- doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
202
- else:
203
- doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])
204
-
205
- #elif passage.infons['type']=='abstract' or passage.infons['type']=='paragraph':
206
- else:
207
- doc_abstract=passage.text
208
- for temp_annotation in passage.annotations:
209
- if temp_annotation.infons['type'] in gene_set:
210
- _gene_num+=1
211
- ent_start=len(doc_title)+1+temp_annotation.locations[0].offset-passage_offset
212
- ent_end=ent_start+temp_annotation.locations[0].length
213
- #print(ent_start,ent_end)
214
- _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
215
- if 'Identifier' in temp_annotation.infons.keys():
216
- # print(temp_annotation.infons.keys['Identifier'])
217
- species_ID=temp_annotation.infons['Identifier']
218
- if species_ID.find('*')>=0:
219
- if species_ID not in _species_num.keys():
220
- _species_num[species_ID]=1
221
- else:
222
- _species_num[species_ID]+=1
223
- doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
224
- else:
225
- doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])
226
-
227
- if len(_species_num)>=2 and _gene_num>0:
228
- fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
229
- fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
230
- for ele in doc_annotation:
231
- fin_pubtator2.write(ele+'\n')
232
- fin_pubtator2.write('\n')
233
- elif len(_species_num)==1 and _gene_num>0: # the single species can be assigned directly
234
- fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
235
- fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
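- # single-element unpacking: the trailing comma grabs the only '*taxid' key
- # of the one-entry dict; the leading '*' is stripped when written out below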
236
- major_speicesid,=_species_num
237
- fin_pubtator1.write(major_speicesid[1:]+'\n')
238
- for ele in doc_annotation:
239
- fin_pubtator1.write(ele+'\n')
240
- fin_pubtator1.write('\n')
241
- elif len(_species_num)==0 and _gene_num>0:
242
- fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
243
- fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
244
- for ele in doc_annotation:
245
- fin_pubtator0.write(ele+'\n')
246
- fin_pubtator0.write('\n')
247
-
248
- else: # full text xml
249
- for passage in document.passages:
250
- passage_annotation=[]
251
- _species_num_passage={}
252
- _gene_num_passage=0
253
- passage_offset=passage.offset
254
- #print(passage_offset,type(passage_offset))
255
- if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':
256
- doc_title=passage.text
257
- for temp_annotation in passage.annotations:
258
- if temp_annotation.infons['type'] in gene_set:
259
- _gene_num_passage+=1
260
- ent_start=temp_annotation.locations[0].offset-passage_offset
261
- ent_end=ent_start+temp_annotation.locations[0].length
262
- #print(ent_start,ent_end)
263
- _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
264
- # print(temp_annotation.infons)
265
- if 'Identifier' in temp_annotation.infons.keys():
266
- # print(temp_annotation.infons.keys['Identifier'])
267
- species_ID=temp_annotation.infons['Identifier']
268
- if species_ID.find('*')>=0:
269
- if species_ID not in _species_num.keys():
270
- _species_num[species_ID]=1
271
- else:
272
- _species_num[species_ID]+=1
273
- if species_ID not in _species_num_passage.keys():
274
- _species_num_passage[species_ID]=1
275
- else:
276
- _species_num_passage[species_ID]+=1
277
- passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
278
- else:
279
- passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])
280
-
281
-
282
- if len(_species_num_passage)>=2 and _gene_num_passage>0:
283
- fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
284
- fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
285
- for ele in passage_annotation:
286
- fin_pubtator2.write(ele+'\n')
287
- fin_pubtator2.write('\n')
288
- elif len(_species_num_passage)==1 and _gene_num_passage>0: # the single species can be assigned directly
289
- fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
290
- fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
291
- major_speicesid,=_species_num_passage
292
- fin_pubtator1.write(major_speicesid[1:]+'\n')
293
- for ele in passage_annotation:
294
- fin_pubtator1.write(ele+'\n')
295
- fin_pubtator1.write('\n')
296
- elif len(_species_num_passage)==0 and _gene_num_passage>0:
297
- fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
298
- fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
299
- for ele in passage_annotation:
300
- fin_pubtator0.write(ele+'\n')
301
- fin_pubtator0.write('\n')
302
- # print(ori_ann_index)
303
-
304
- ori_ann_index[doc_pmid]=_ann_index
305
- species_count[doc_pmid]=_species_num
306
-
307
-
308
- cache_geneid={} #{pmid:{gene1:{id1:num,id2:num}}}
309
-
310
- if fin_pubtator2.getvalue()!='':
311
- #pubtator format ML tagging
312
- # print(fin_pubtator2.getvalue())
313
- ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
314
- #print(ml_out.getvalue())
315
- fin_result=io.StringIO(ml_out.getvalue())
316
- all_in=fin_result.read().strip().split('\n\n')
317
- #print('+2 species:',len(all_in))
318
- fin_result.close()
319
-
320
- prefix_speid_allset=set(prefix_dict.keys())
321
-
322
- for doc in all_in:
323
- lines=doc.split('\n')
324
- pmid=lines[0].split('|t|')[0]
325
- _prefix_str2id_dict={}
326
- doc_species=list(species_count[pmid].keys())
327
- for _spe_ele in doc_species:
328
- if _spe_ele[1:] in prefix_speid_allset:
329
- for ele in prefix_dict[_spe_ele[1:]]:
330
- _prefix_str2id_dict[ele]=_spe_ele[1:]
331
-
332
- for i in range(2,len(lines)):
333
- segs=lines[i].split('\t')
334
- if pmid not in final_sa_results.keys():
335
- final_sa_results[pmid]={segs[1]:'Focus:'+segs[-1]}
336
- else:
337
- final_sa_results[pmid][segs[1]]='Focus:'+segs[-1]
338
-
339
- if segs[5] in gene_set:
340
- if segs[4][0:2] in _prefix_str2id_dict: #prefix rule
341
- #print('prefix rule:', pmid)
342
- # print(_prefix_str2id_dict)
343
- if pmid not in final_sa_results.keys():
344
- final_sa_results[pmid]={segs[1]:'Focus:'+_prefix_str2id_dict[segs[4][0:2]]}
345
- else:
346
- final_sa_results[pmid][segs[1]]='Focus:'+_prefix_str2id_dict[segs[4][0:2]]
347
- if pmid not in cache_geneid.keys():
348
- cache_geneid[pmid]={segs[4]:{'Focus:'+segs[-1]:1}}
349
- else:
350
- if segs[4] not in cache_geneid[pmid].keys():
351
- cache_geneid[pmid][segs[4]]={'Focus:'+segs[-1]:1}
352
- else:
353
- if segs[-1] not in cache_geneid[pmid][segs[4]].keys():
354
- cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]=1
355
- else:
356
- cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]+=1
357
-
358
- #print(final_sa_results)
359
-
360
- #one species
361
- if fin_pubtator1.getvalue()!='':
362
- fin_result=io.StringIO(fin_pubtator1.getvalue())
363
- all_in=fin_result.read().strip().split('\n\n')
364
- fin_result.close()
365
- #print('1 species:',len(all_in))
366
- for doc in all_in:
367
- lines=doc.split('\n')
368
- pmid=lines[0].split('|t|')[0]
369
- major_speicesid=lines[2]
370
- for i in range(3,len(lines)):
371
- segs=lines[i].split('\t')
372
- if len(segs)>=7:#species
373
- if pmid not in final_sa_results.keys():
374
- final_sa_results[pmid]={segs[1]:segs[-1]}
375
- else:
376
- final_sa_results[pmid][segs[1]]=segs[-1]
377
- else:#gene
378
- marjor_species='Focus:'+major_speicesid
379
- if pmid not in final_sa_results.keys():
380
- final_sa_results[pmid]={segs[1]:marjor_species}
381
- else:
382
- final_sa_results[pmid][segs[1]]=marjor_species
383
- if pmid not in cache_geneid.keys():
384
- cache_geneid[pmid]={segs[4]:{marjor_species:1}}
385
- else:
386
- if segs[4] not in cache_geneid[pmid].keys():
387
- cache_geneid[pmid][segs[4]]={marjor_species:1}
388
- else:
389
- if segs[-1] not in cache_geneid[pmid][segs[4]].keys():
390
- cache_geneid[pmid][segs[4]][marjor_species]=1
391
- else:
392
- cache_geneid[pmid][segs[4]][marjor_species]+=1
393
-
394
-
395
- #no species
396
- fin_result=io.StringIO(fin_pubtator0.getvalue())
397
- all_in=fin_result.read().strip().split('\n\n')
398
- fin_result.close()
399
- #print('no species:',len(all_in))
400
- for doc in all_in:
401
- lines=doc.split('\n')
402
- pmid=lines[0].split('|t|')[0]
403
-
404
- for i in range(2,len(lines)):
405
- segs=lines[i].split('\t')
406
- if (pmid in cache_geneid.keys()) and (segs[4] in cache_geneid[pmid].keys()):#same gene in doc
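- # majority vote: cache_geneid[pmid][mention] maps 'Focus:<taxid>' -> count,
- # and max over (count, id) pairs picks the species most often assigned to
- # this exact mention earlier in the document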
407
- marjor_species = max(zip(cache_geneid[pmid][segs[4]].values(), cache_geneid[pmid][segs[4]].keys()))
408
- if pmid not in final_sa_results.keys():
409
- final_sa_results[pmid]={segs[1]:marjor_species[1]}
410
- else:
411
- final_sa_results[pmid][segs[1]]=marjor_species[1]
412
- else: #major species in doc
413
- if (pmid in species_count.keys()) and len(species_count[pmid])>0:#major species in doc
414
- marjor_species = max(zip(species_count[pmid].values(), species_count[pmid].keys()))
415
-
416
- if pmid not in final_sa_results.keys():
417
- final_sa_results[pmid]={segs[1]:'Focus:'+marjor_species[1][1:]}
418
- else:
419
- final_sa_results[pmid][segs[1]]='Focus:'+marjor_species[1][1:]
420
- else:#no any species in doc,assign human
421
- if pmid not in final_sa_results.keys():
422
- final_sa_results[pmid]={segs[1]:'Focus:9606'}
423
- else:
424
- final_sa_results[pmid][segs[1]]='Focus:9606'
425
-
426
-
427
-
428
- # print(final_sa_results)
429
- fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
430
- fout_xml=open(outpath+"/"+infile,'w', encoding='utf8')
431
- collection = bioc.load(fin)
432
- for document in collection.documents:
433
- doc_pmid=document.id
434
- # print(final_sa_results[doc_pmid])
435
- # print(doc_pmid)
436
- for passage in document.passages:
437
- for temp_annotation in passage.annotations:
438
- if 'Identifier' not in temp_annotation.infons.keys():
439
- if temp_annotation.id in final_sa_results[doc_pmid].keys():
440
- if final_sa_results[doc_pmid][temp_annotation.id][6:] in virus_set:
441
- temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]+',9606'
442
- # print('!!! virus:', doc_pmid)
443
- else:
444
- temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]
445
- else: # fallback for annotation ids missing from the results: vote by identical mention text
446
- if (doc_pmid in cache_geneid.keys()) and (temp_annotation.text in cache_geneid[doc_pmid].keys()):#same gene in doc
447
- marjor_species = max(zip(cache_geneid[doc_pmid][temp_annotation.text].values(), cache_geneid[doc_pmid][temp_annotation.text].keys()))
448
- temp_annotation.infons['Identifier']=marjor_species[1]
449
- else:
450
-
451
- temp_annotation.infons['Identifier']='Focus:9606'
452
- bioc.dump(collection, fout_xml, pretty_print=True)
453
- fin.close()
454
- fout_xml.close()
455
-
456
-
457
- #SA for PubTator format
458
- def SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
459
-
460
-
461
- # pmid|t|text1
462
- #pmid|a|text2
463
- #pmid entity_id sid eid entity_txt entity_type (gene is blank)
464
- fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
465
- # fout_pubtator=open(outpath+'tmp/input_xml.pubtator','w', encoding='utf-8')
466
- fin_pubtator2=io.StringIO() #two or more species
467
- all_in_ori=fin.read().strip().split('\n\n')
468
- fin.close()
469
- species_gene_count={} #{pmid:{'spec':_species_num;'gene':_gene_num}}
470
- gene_set=['Gene','FamilyName']
471
- ML_results={} #{'pmid':{'sid-eid':species_id}}
472
-
473
- prefix_speid_allset=set(prefix_dict.keys())
474
-
475
- for document in all_in_ori:
476
- lines=document.split('\n')
477
- doc_pmid=lines[0].split('|t|')[0]
478
- doc_title=lines[0].split('|t|')[1]
479
- doc_abstract=lines[1].split('|a|')[1]
480
- doc_annotation=[]
481
- _species_num=set() #(*speciesid)
482
- _gene_num=0
483
- _ML_gene_num=0
484
- _entity_num=0
485
- _prefix_str2id_dict={} #{prestr:id}
486
- for i in range(2,len(lines)):
487
- segs=lines[i].split('\t')
488
- if segs[4] in gene_set:
489
- _gene_num+=1
490
- if len(segs)>=6: #species
491
- doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
492
- species_ID=segs[-1]
493
- if species_ID.find('*')>=0:
494
- _species_num.add(species_ID)
495
- if species_ID[1:] in prefix_speid_allset:
496
- for ele in prefix_dict[species_ID[1:]]:
497
- _prefix_str2id_dict[ele]=species_ID[1:]
498
- else: #gene
499
- if segs[3][0:2] in _prefix_str2id_dict:#prefix rule
500
- if _prefix_str2id_dict[segs[3][0:2]] in virus_set:
501
- doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]]+',9606')
502
- if doc_pmid not in ML_results.keys():
503
- ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]+',9606'}
504
- else:
505
- ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]+',9606'
506
-
507
- # print('!!! prefixr and virus:', doc_pmid)
508
- else:
509
- doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]])
510
- if doc_pmid not in ML_results.keys():
511
- ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]}
512
- else:
513
- ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]
514
- # print('prefix rule!!',_prefix_str2id_dict)
515
- # print(doc_pmid)
516
- else:
517
- doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
518
- if segs[4] in gene_set:
519
- _ML_gene_num+=1
520
- _entity_num+=1
521
-
522
- if len(_species_num)>=2 and _ML_gene_num>0:
523
- fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
524
- fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
525
- for ele in doc_annotation:
526
- fin_pubtator2.write(ele+'\n')
527
- fin_pubtator2.write('\n')
528
-
529
- species_gene_count[doc_pmid]={'spec':_species_num,'gene':_gene_num}
530
-
531
- if fin_pubtator2.getvalue()!='':
532
- #pubtator format ML tagging
533
- #print(fin_pubtator2.getvalue())
534
- ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
535
- #print(ml_out.getvalue())
536
- fin_result=io.StringIO(ml_out.getvalue())
537
- all_in=fin_result.read().strip().split('\n\n')
538
- #print('+2 species:',len(all_in))
539
- fin_result.close()
540
- for doc in all_in:
541
- lines=doc.split('\n')
542
- pmid=lines[0].split('|t|')[0]
543
-
544
- for i in range(2,len(lines)):
545
- segs=lines[i].split('\t')
546
- if pmid not in ML_results.keys():
547
- ML_results[pmid]={segs[2]+'-'+segs[3]:segs[-1]}
548
- else:
549
- ML_results[pmid][segs[2]+'-'+segs[3]]=segs[-1]
550
-
551
- #output
552
- fout_pubtator=open(outpath+"/"+infile,'w', encoding='utf8')
553
- for doc in all_in_ori:
554
- lines=doc.split('\n')
555
- pmid=lines[0].split('|t|')[0]
556
- fout_pubtator.write(lines[0]+'\n'+lines[1]+'\n')
557
- if len(species_gene_count[pmid]['spec'])>1 and species_gene_count[pmid]['gene']>0: # ML
558
- for i in range(2,len(lines)):
559
- segs=lines[i].split('\t')
560
- if len(segs)>=6: #species
561
- fout_pubtator.write(lines[i]+'\n')
562
- else:#gene
563
- if ML_results[pmid][segs[1]+'-'+segs[2]] in virus_set:
564
- fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+',9606'+'\n')
565
- # print('!!! virus:', pmid)
566
- else:
567
- fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+'\n')
568
- fout_pubtator.write('\n')
569
-
570
- elif len(species_gene_count[pmid]['spec'])==1 and species_gene_count[pmid]['gene']>0: #only one species
571
- for i in range(2,len(lines)):
572
- segs=lines[i].split('\t')
573
- if len(segs)>=6: #species
574
- fout_pubtator.write(lines[i]+'\n')
575
- else:#gene
576
- major_species,=species_gene_count[pmid]['spec']
577
- if major_species[1:] in virus_set:
578
- fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+',9606'+'\n')
579
- # print('!!! virus:', pmid)
580
- else:
- fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+'\n')
581
- fout_pubtator.write('\n')
582
-
583
- elif len(species_gene_count[pmid]['spec'])==0 and species_gene_count[pmid]['gene']>0:#no species
584
- for i in range(2,len(lines)):
585
- segs=lines[i].split('\t')
586
- if len(segs)>=6: #species
587
- fout_pubtator.write(lines[i]+'\n')
588
- else:#gene
589
- fout_pubtator.write(lines[i]+'\tFocus:9606'+'\n')
590
- fout_pubtator.write('\n')
591
-
592
- else:
593
- for i in range(2,len(lines)):
594
- fout_pubtator.write(lines[i]+'\n')
595
- fout_pubtator.write('\n')
596
- fout_pubtator.close()
597
-
598
-
599
- #SA main
600
- def speciesAss(infolder,outpath, modelfile):
601
-
602
- if modelfile.lower().find('bioformer')>=0:
603
- model_type='bioformer'
604
- else:
605
- model_type='pubmedbert'
606
-
607
- print('loading SA models........')
608
- if model_type=='bioformer':
609
-
610
- vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
611
- 'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
612
- 'lowercase':False,
613
- }
614
- else:
615
- vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
616
- 'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
617
- 'lowercase':True,
618
- }
619
-
620
- nn_model=model_sa.HUGFACE_NER(vocabfiles)
621
- nn_model.build_encoder()
622
- nn_model.build_softmax_decoder()
623
- nn_model.load_model(modelfile)
624
-
625
- dict_filename={'prefix':'./Dictionary/SPPrefix.txt',
626
- 'virus':'./Dictionary/SP_Virus2HumanList.txt'}
627
- fin=open(dict_filename['virus'],'r',encoding='utf-8')
628
- virus_set=set(fin.read().strip().split('\n'))
629
- fin.close()
630
-
631
- prefix_dict={}#{id:[prefix1,prefix2]}
632
- fin=open(dict_filename['prefix'],'r',encoding='utf-8')
633
- for line in fin:
634
- seg= line.strip().split('\t')
635
- if seg[0] not in prefix_dict.keys():
636
- prefix_dict[seg[0]]=seg[1].split('|')
637
- else:
638
- prefix_dict[seg[0]].extend(seg[1].split('|'))
639
- fin.close()
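- # sketch of the prefix rule (the exact SPPrefix.txt contents are an
- # assumption here): a line "7227\tdm" would map the two-character mention
- # prefix "dm" to taxid 7227, so a gene name starting with "dm" is later
- # assigned species 7227 in SA_BioC/SA_PubTator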
640
-
641
-
642
-
643
- print("begin species assignment........")
644
- start_time=time.time()
645
-
646
- for infile in os.listdir(infolder):
647
- if os.path.isfile(outpath+"/"+infile):
648
- print(infile+' already exists.')
649
- else:
650
- print('Processing:',infile)
651
- fin=open(infolder+"/"+infile, 'r',encoding='utf-8')
652
- file_format=""
653
- for line in fin:
654
- pattern_bioc = re.compile('.*<collection>.*')
655
- pattern_pubtator = re.compile('^([^\|]+)\|[^\|]+\|(.*)')
656
- if pattern_bioc.search(line):
657
- file_format="BioC"
658
- break
659
- elif pattern_pubtator.search(line):
660
- file_format="PubTator"
661
- break
662
- fin.close()
663
- if(file_format == "PubTator"):
664
- SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
665
- elif(file_format == "BioC"):
666
- SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
667
-
668
-
669
- print('species assignment done:',time.time()-start_time)
670
-
671
- if __name__=='__main__':
672
-
673
- parser = argparse.ArgumentParser(description='run GeneNER and species assignment, python GeneNER_SpeAss_run.py -i input -n NERmodel -s SAmodel -r neroutput -a saoutput')
674
- parser.add_argument('--infolder', '-i', help="input folder",default='./example/input/')
675
- parser.add_argument('--NERmodel', '-n', help="trained deep learning NER model file",default='')
676
- parser.add_argument('--SAmodel', '-s', help="trained deep learning species assignment model file",default='')
677
- parser.add_argument('--NERoutpath', '-r', help="output folder to save the NER tagged results",default='./example/ner_output/')
678
- parser.add_argument('--SAoutpath', '-a', help="output folder to save the SA tagged results",default='./example/sa_output/')
679
- parser.add_argument('--NUM_THREADS', '-t', help="Number of threads",default='3')
680
- args = parser.parse_args()
681
-
682
-
683
- if args.NUM_THREADS.isdigit() == False:
684
- args.NUM_THREADS='3'
685
-
686
- tf.config.threading.set_inter_op_parallelism_threads(int(args.NUM_THREADS))
687
- tf.config.threading.set_intra_op_parallelism_threads(int(args.NUM_THREADS))
688
-
689
- if args.NERmodel!='' and args.SAmodel!='':
690
-
691
- #pipeline
692
- print('==============\n| GeneNER and SpeAss |\n==============')
693
-
694
- #create output folder
695
-
696
- if args.infolder[-1]!='/':
697
- args.infolder+='/'
698
- if not os.path.exists(args.infolder):
699
- os.makedirs(args.infolder)
700
-
701
- if args.NERoutpath[-1]!='/':
702
- args.NERoutpath+='/'
703
- if not os.path.exists(args.NERoutpath):
704
- os.makedirs(args.NERoutpath)
705
-
706
- if args.SAoutpath[-1]!='/':
707
- args.SAoutpath+='/'
708
- if not os.path.exists(args.SAoutpath):
709
- os.makedirs(args.SAoutpath)
710
-
711
- #1. gene NER, the results are saved in outpath/ner_tmp/
712
- geneNER(args.infolder,args.NERoutpath, args.NERmodel)
713
-
714
-
715
- #2. species assignment, the results are saved in outpath/sa_tmp/
716
- speciesAss(args.NERoutpath,args.SAoutpath, args.SAmodel)
717
-
718
- elif args.NERmodel!='' and args.SAmodel=='':
719
- if args.infolder[-1]!='/':
720
- args.infolder+='/'
721
- if not os.path.exists(args.infolder):
722
- os.makedirs(args.infolder)
723
-
724
- # only geneNER
725
- if args.NERoutpath[-1]!='/':
726
- args.NERoutpath+='/'
727
- if not os.path.exists(args.NERoutpath):
728
- os.makedirs(args.NERoutpath)
729
-
730
- print('==============\n| GeneNER |\n==============')
731
- geneNER(args.infolder,args.NERoutpath,args.NERmodel)
732
-
733
- elif args.NERmodel=='' and args.SAmodel!='':
734
- # only speass
735
- if args.SAoutpath[-1]!='/':
736
- args.SAoutpath+='/'
737
- if not os.path.exists(args.SAoutpath):
738
- os.makedirs(args.SAoutpath)
739
-
740
- print('==============\n| SpeAss |\n==============')
741
- speciesAss(args.infolder,args.SAoutpath,args.SAmodel)
742
- else:
743
- print('Please provide models!')
744
-
745
-
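- # A typical end-to-end invocation (model file names are placeholders;
- # pass your actual trained GeneNER and SpeAss model files):
- #   python GeneNER_SpeAss_run.py -i ./example/input/ -r ./example/ner_output/ \
- #       -a ./example/sa_output/ -n <GeneNER model> -s <SpeAss model>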
746
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Jun 8 09:26:57 2022
4
+
5
+ @author: luol2
6
+
7
+ Pipeline: first gene NER, then species assignment
8
+ input: species NER bioc xml file
9
+ output: gene ner and species assignment results bioc xml file
10
+ """
11
+ import argparse
12
+ import os
13
+ import io
14
+ import time
15
+ import sys
16
+ import re
17
+ import shutil
18
+ from src_python.GeneNER import model_ner,ner_tag
19
+ from src_python.SpeAss import model_sa,sa_tag
20
+
21
+ import tensorflow as tf
22
+
23
+ import bioc
24
+ import stanza
25
+ nlp_token = stanza.Pipeline(model_dir='gnorm_trained_models/stanza', lang='en', processors={'tokenize': 'spacy'},package='None', download_method=None) #package='craft' ;./gnorm_trained_models/stanza
26
+
27
+ def NER_BioC(infolder,infile,outpath,nn_model):
28
+
29
+ with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
30
+ with open(outpath+"/"+infile,'w', encoding='utf8') as fout:
31
+ collection = bioc.load(fin)
32
+
33
+ Total_n=len(collection.documents)
34
+ print('Total number of sub-documents:', Total_n)
35
+ pmid_n=0
36
+ for document in collection.documents:
37
+ print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
38
+ pmid_n+=1
39
+ # print(document.id)
40
+ mention_num_new=0
41
+ for passage in document.passages:
42
+ if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref': # have text and is not ref
43
+ passage_offset=passage.offset
44
+ tag_result=ner_tag.ML_Tag(passage.text,nn_model,nlp_token)
45
+ mention_num=0
46
+ for ele in tag_result:
47
+ bioc_note = bioc.BioCAnnotation()
48
+ bioc_note.id = str(mention_num)
49
+ mention_num+=1
50
+ bioc_note.infons['type'] = ele[2]
51
+ start = int(ele[0])
52
+ last = int(ele[1])
53
+ loc = bioc.BioCLocation(offset=str(passage_offset+start), length= str(last-start))
54
+ bioc_note.locations.append(loc)
55
+ bioc_note.text = passage.text[start:last]
56
+ passage.annotations.append(bioc_note)
57
+ #update id
58
+ for temp_annotation in passage.annotations:
59
+ temp_annotation.id=str(mention_num_new)
60
+ mention_num_new+=1
61
+ bioc.dump(collection, fout, pretty_print=True)
62
+
63
+ def NER_PubTator(infolder,infile,outpath,nn_model):
64
+ with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
65
+ with open(outpath+"/"+infile,'w', encoding='utf-8') as fout:
66
+ title=''
67
+ abstract=''
68
+ all_text=fin.read().strip().split('\n\n')
69
+ Total_n=len(all_text)
70
+ print('Total number of sub-documents:', Total_n)
71
+ pmid_n=0
72
+ for doc in all_text:
73
+ print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
74
+ pmid_n+=1
75
+ lines = doc.split('\n')
76
+ seg=lines[0].split('|t|')
77
+ pmid=seg[0]
78
+ title=""
79
+ if len(seg)>1:
80
+ title=seg[1]
81
+ abstract=""
82
+ if len(lines)>1:
83
+ seg=lines[1].split('|a|')
84
+ abstract=seg[1]
85
+ if len(seg)>1:
86
+ abstract=seg[1]
87
+
88
+ intext=title+' '+abstract
89
+ tag_result=ner_tag.ML_Tag(intext,nn_model,nlp_token)
90
+ fout.write(doc+'\n')
91
+ for ele in tag_result:
92
+ ent_start = ele[0]
93
+ ent_last = ele[1]
94
+ ent_mention = intext[int(ele[0]):int(ele[1])]
95
+ ent_type=ele[2]
96
+ fout.write(pmid+"\t"+ent_start+"\t"+ent_last+"\t"+ent_mention+"\t"+ent_type+"\n")
97
+ fout.write('\n')
98
+ title=''
99
+ abstract=''
100
+
101
+ def geneNER(infolder, outpath, modelfile):
102
+
103
+ print('loading NER models........')
104
+
105
+ if modelfile.lower().find('bioformer')>=0:
106
+ vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
107
+ 'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/', #bioformer-cased-v1.0
108
+ 'lowercase':False,
109
+ }
110
+ else:
111
+ vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
112
+ 'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
113
+ 'lowercase':True,
114
+ }
115
+
116
+ nn_model=model_ner.HUGFACE_NER(vocabfiles)
117
+ nn_model.build_encoder()
118
+ nn_model.build_softmax_decoder()
119
+ nn_model.load_model(modelfile)
120
+
121
+ #tagging text
122
+ print("begin GeneNER tagging........")
123
+ start_time=time.time()
124
+
125
+ for infile in os.listdir(infolder):
126
+ if os.path.isfile(outpath+"/"+infile):
127
+ print(infile+' already exists.')
128
+ else:
129
+ print('processing:',infile)
130
+ fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
131
+ input_format=""
132
+ for line in fin:
133
+ pattern_bioc = re.compile(r'.*<collection>.*')
134
+ pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')
135
+ if pattern_bioc.search(line):
136
+ input_format="BioC"
137
+ break
138
+ elif pattern_pubtator.search(line):
139
+ input_format="PubTator"
140
+ break
141
+ fin.close()
142
+ if(input_format == "PubTator"):
143
+ NER_PubTator(infolder,infile,outpath,nn_model)
144
+ elif(input_format == "BioC"):
145
+ NER_BioC(infolder,infile,outpath,nn_model)
146
+
147
+ print('tag done:',time.time()-start_time)
148
+
149
+
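Both geneNER above and speciesAss below sniff the input format from the first matching line: a <collection> tag means BioC XML, a pmid|...|... line means PubTator. A minimal sketch of that check:

import re

pattern_bioc = re.compile(r'.*<collection>.*')
pattern_pubtator = re.compile(r'^([^|]+)\|[^|]+\|(.*)')

assert pattern_bioc.search('<collection>') is not None             # -> BioC
assert pattern_pubtator.search('123456|t|Some title') is not None  # -> PubTator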
150
+ #SA for bioc format
151
+ def SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
152
+
153
+ #BioC xml to pubtator
154
+ # pmid|t|text1
155
+ #pmid|a|text2
156
+ #pmid sid eid entity_txt entity_type entity_id (gene is blank)
157
+ fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
158
+ # fout_pubtator=open(outpath+'tmp/input_xml.pubtator','w', encoding='utf-8')
159
+ fin_pubtator0=io.StringIO() #none *species
160
+ fin_pubtator1=io.StringIO() #one *species
161
+ fin_pubtator2=io.StringIO() #two or more species
162
+ collection = bioc.load(fin)
163
+ fin.close()
164
+ ori_ann_index={} #{'pmid':{'ent.id':'ent_s-ent_e'}}
165
+ species_count={} #{pmid:{speid:num}}
166
+ gene_set=['Gene','FamilyName']
167
+ final_sa_results={} #{'pmid':{'entity_id':species_id}}
168
+ for document in collection.documents:
169
+ doc_pmid=document.id
170
+ doc_title=''
171
+ doc_abstract=''
172
+ doc_annotation=[]
173
+ _ann_index={}
174
+ _species_num={} #{*speciesid:num}
175
+ _gene_num=0
176
+ _passage_num=0
177
+ if len(document.passages)<=2: #abstract xml or PMC only have title
178
+ for passage in document.passages:
179
+ passage_offset=passage.offset
180
+ _passage_num+=1
181
+ #print(passage_offset,type(passage_offset))
182
+ #if passage.infons['type']=='title' or passage.infons['type']=='front':
183
+ if _passage_num==1:
184
+ doc_title=passage.text
185
+ for temp_annotation in passage.annotations:
186
+ if temp_annotation.infons['type'] in gene_set:
187
+ _gene_num+=1
188
+ ent_start=temp_annotation.locations[0].offset-passage_offset
189
+ ent_end=ent_start+temp_annotation.locations[0].length
190
+ #print(ent_start,ent_end)
191
+ _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
192
+ # print(temp_annotation.infons)
193
+ if 'Identifier' in temp_annotation.infons.keys():
194
+ # print(temp_annotation.infons.keys['Identifier'])
195
+ species_ID=temp_annotation.infons['Identifier']
196
+ if species_ID.find('*')>=0:
197
+ if species_ID not in _species_num.keys():
198
+ _species_num[species_ID]=1
199
+ else:
200
+ _species_num[species_ID]+=1
201
+ doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
202
+ else:
203
+ doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])
204
+
205
+ #elif passage.infons['type']=='abstract' or passage.infons['type']=='paragraph':
206
+ else:
207
+ doc_abstract=passage.text
208
+ for temp_annotation in passage.annotations:
209
+ if temp_annotation.infons['type'] in gene_set:
210
+ _gene_num+=1
211
+ ent_start=len(doc_title)+1+temp_annotation.locations[0].offset-passage_offset
212
+ ent_end=ent_start+temp_annotation.locations[0].length
213
+ #print(ent_start,ent_end)
214
+ _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
215
+ if 'Identifier' in temp_annotation.infons.keys():
216
+ # print(temp_annotation.infons.keys['Identifier'])
217
+ species_ID=temp_annotation.infons['Identifier']
218
+ if species_ID.find('*')>=0:
219
+ if species_ID not in _species_num.keys():
220
+ _species_num[species_ID]=1
221
+ else:
222
+ _species_num[species_ID]+=1
223
+ doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
224
+ else:
225
+ doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])
226
+
227
+ if len(_species_num)>=2 and _gene_num>0:
228
+ fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
229
+ fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
230
+ for ele in doc_annotation:
231
+ fin_pubtator2.write(ele+'\n')
232
+ fin_pubtator2.write('\n')
233
+ elif len(_species_num)==1 and _gene_num>0: #only one species: output the result directly
234
+ fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
235
+ fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
236
+ major_speicesid,=_species_num
237
+ fin_pubtator1.write(major_speicesid[1:]+'\n')
238
+ for ele in doc_annotation:
239
+ fin_pubtator1.write(ele+'\n')
240
+ fin_pubtator1.write('\n')
241
+ elif len(_species_num)==0 and _gene_num>0:
242
+ fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
243
+ fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
244
+ for ele in doc_annotation:
245
+ fin_pubtator0.write(ele+'\n')
246
+ fin_pubtator0.write('\n')
247
+
248
+ else: # full text xml
249
+ for passage in document.passages:
250
+ passage_annotation=[]
251
+ _species_num_passage={}
252
+ _gene_num_passage=0
253
+ passage_offset=passage.offset
254
+ #print(passage_offset,type(passage_offset))
255
+ if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':
256
+ doc_title=passage.text
257
+ for temp_annotation in passage.annotations:
258
+ if temp_annotation.infons['type'] in gene_set:
259
+ _gene_num_passage+=1
260
+ ent_start=temp_annotation.locations[0].offset-passage_offset
261
+ ent_end=ent_start+temp_annotation.locations[0].length
262
+ #print(ent_start,ent_end)
263
+ _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
264
+ # print(temp_annotation.infons)
265
+ if 'Identifier' in temp_annotation.infons.keys():
266
+ # print(temp_annotation.infons.keys['Identifier'])
267
+ species_ID=temp_annotation.infons['Identifier']
268
+ if species_ID.find('*')>=0:
269
+ if species_ID not in _species_num.keys():
270
+ _species_num[species_ID]=1
271
+ else:
272
+ _species_num[species_ID]+=1
273
+ if species_ID not in _species_num_passage.keys():
274
+ _species_num_passage[species_ID]=1
275
+ else:
276
+ _species_num_passage[species_ID]+=1
277
+ passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
278
+ else:
279
+ passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])
280
+
281
+
282
+ if len(_species_num_passage)>=2 and _gene_num_passage>0:
283
+ fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
284
+ fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
285
+ for ele in passage_annotation:
286
+ fin_pubtator2.write(ele+'\n')
287
+ fin_pubtator2.write('\n')
288
+ elif len(_species_num_passage)==1 and _gene_num_passage>0: #only one species: output the result directly
289
+ fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
290
+ fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
291
+ major_speicesid,=_species_num_passage
292
+ fin_pubtator1.write(major_speicesid[1:]+'\n')
293
+ for ele in passage_annotation:
294
+ fin_pubtator1.write(ele+'\n')
295
+ fin_pubtator1.write('\n')
296
+ elif len(_species_num_passage)==0 and _gene_num_passage>0:
297
+ fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
298
+ fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
299
+ for ele in passage_annotation:
300
+ fin_pubtator0.write(ele+'\n')
301
+ fin_pubtator0.write('\n')
302
+ # print(ori_ann_index)
303
+
304
+ ori_ann_index[doc_pmid]=_ann_index
305
+ species_count[doc_pmid]=_species_num
306
+
307
+
308
+ cache_geneid={} #{pmid:{gene1:{id1:num,id2:num}}}
309
+
310
+ if fin_pubtator2.getvalue()!='':
311
+ #pubtator format ML tagging
312
+ # print(fin_pubtator2.getvalue())
313
+ ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
314
+ #print(ml_out.getvalue())
315
+ fin_result=io.StringIO(ml_out.getvalue())
316
+ all_in=fin_result.read().strip().split('\n\n')
317
+ #print('+2 species:',len(all_in))
318
+ fin_result.close()
319
+
320
+ prefix_speid_allset=set(prefix_dict.keys())
321
+
322
+ for doc in all_in:
323
+ lines=doc.split('\n')
324
+ pmid=lines[0].split('|t|')[0]
325
+ _prefix_str2id_dict={}
326
+ doc_species=list(species_count[pmid].keys())
327
+ for _spe_ele in doc_species:
328
+ if _spe_ele[1:] in prefix_speid_allset:
329
+ for ele in prefix_dict[_spe_ele[1:]]:
330
+ _prefix_str2id_dict[ele]=_spe_ele[1:]
331
+
332
+ for i in range(2,len(lines)):
333
+ segs=lines[i].split('\t')
334
+ if pmid not in final_sa_results.keys():
335
+ final_sa_results[pmid]={segs[1]:'Focus:'+segs[-1]}
336
+ else:
337
+ final_sa_results[pmid][segs[1]]='Focus:'+segs[-1]
338
+
339
+ if segs[5] in gene_set:
340
+ if segs[4][0:2] in _prefix_str2id_dict: #prefix rule
341
+ #print('prefix rule:', pmid)
342
+ # print(_prefix_str2id_dict)
343
+ if pmid not in final_sa_results.keys():
344
+ final_sa_results[pmid]={segs[1]:'Focus:'+_prefix_str2id_dict[segs[4][0:2]]}
345
+ else:
346
+ final_sa_results[pmid][segs[1]]='Focus:'+_prefix_str2id_dict[segs[4][0:2]]
347
+ if pmid not in cache_geneid.keys():
348
+ cache_geneid[pmid]={segs[4]:{'Focus:'+segs[-1]:1}}
349
+ else:
350
+ if segs[4] not in cache_geneid[pmid].keys():
351
+ cache_geneid[pmid][segs[4]]={'Focus:'+segs[-1]:1}
352
+ else:
353
+ if 'Focus:'+segs[-1] not in cache_geneid[pmid][segs[4]].keys():
354
+ cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]=1
355
+ else:
356
+ cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]+=1
357
+
358
+ #print(final_sa_results)
359
+
360
+ #one species
361
+ if fin_pubtator1.getvalue()!='':
362
+ fin_result=io.StringIO(fin_pubtator1.getvalue())
363
+ all_in=fin_result.read().strip().split('\n\n')
364
+ fin_result.close()
365
+ #print('1 species:',len(all_in))
366
+ for doc in all_in:
367
+ lines=doc.split('\n')
368
+ pmid=lines[0].split('|t|')[0]
369
+ major_speicesid=lines[2]
370
+ for i in range(3,len(lines)):
371
+ segs=lines[i].split('\t')
372
+ if len(segs)>=7:#species
373
+ if pmid not in final_sa_results.keys():
374
+ final_sa_results[pmid]={segs[1]:segs[-1]}
375
+ else:
376
+ final_sa_results[pmid][segs[1]]=segs[-1]
377
+ else:#gene
378
+ marjor_species='Focus:'+major_speicesid
379
+ if pmid not in final_sa_results.keys():
380
+ final_sa_results[pmid]={segs[1]:marjor_species}
381
+ else:
382
+ final_sa_results[pmid][segs[1]]=marjor_species
383
+ if pmid not in cache_geneid.keys():
384
+ cache_geneid[pmid]={segs[4]:{marjor_species:1}}
385
+ else:
386
+ if segs[4] not in cache_geneid[pmid].keys():
387
+ cache_geneid[pmid][segs[4]]={marjor_species:1}
388
+ else:
389
+ if marjor_species not in cache_geneid[pmid][segs[4]].keys():
390
+ cache_geneid[pmid][segs[4]][marjor_species]=1
391
+ else:
392
+ cache_geneid[pmid][segs[4]][marjor_species]+=1
393
+
394
+
395
+ #no species
396
+ fin_result=io.StringIO(fin_pubtator0.getvalue())
397
+ all_in=fin_result.read().strip().split('\n\n')
398
+ fin_result.close()
399
+ #print('no species:',len(all_in))
400
+ for doc in all_in:
401
+ lines=doc.split('\n')
402
+ pmid=lines[0].split('|t|')[0]
403
+
404
+ for i in range(2,len(lines)):
405
+ segs=lines[i].split('\t')
406
+ if (pmid in cache_geneid.keys()) and (segs[4] in cache_geneid[pmid].keys()):#same gene in doc
407
+ marjor_species = max(zip(cache_geneid[pmid][segs[4]].values(), cache_geneid[pmid][segs[4]].keys()))
408
+ if pmid not in final_sa_results.keys():
409
+ final_sa_results[pmid]={segs[1]:marjor_species[1]}
410
+ else:
411
+ final_sa_results[pmid][segs[1]]=marjor_species[1]
412
+ else: #major species in doc
413
+ if (pmid in species_count.keys()) and len(species_count[pmid])>0: #major species in doc
414
+ marjor_species = max(zip(species_count[pmid].values(), species_count[pmid].keys()))
415
+
416
+ if pmid not in final_sa_results.keys():
417
+ final_sa_results[pmid]={segs[1]:'Focus:'+marjor_species[1][1:]}
418
+ else:
419
+ final_sa_results[pmid][segs[1]]='Focus:'+marjor_species[1][1:]
420
+ else: #no species in doc, assign human (9606)
421
+ if pmid not in final_sa_results.keys():
422
+ final_sa_results[pmid]={segs[1]:'Focus:9606'}
423
+ else:
424
+ final_sa_results[pmid][segs[1]]='Focus:9606'
425
+
426
+
427
+
428
+ # print(final_sa_results)
429
+ fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
430
+ fout_xml=open(outpath+"/"+infile,'w', encoding='utf8')
431
+ collection = bioc.load(fin)
432
+ for document in collection.documents:
433
+ doc_pmid=document.id
434
+ # print(final_sa_results[doc_pmid])
435
+ # print(doc_pmid)
436
+ for passage in document.passages:
437
+ for temp_annotation in passage.annotations:
438
+ if 'Identifier' not in temp_annotation.infons.keys():
439
+ if temp_annotation.id in final_sa_results[doc_pmid].keys():
440
+ if final_sa_results[doc_pmid][temp_annotation.id][6:] in virus_set:
441
+ temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]+',9606'
442
+ # print('!!! virus:', doc_pmid)
443
+ else:
444
+ temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]
445
+ else: #fallback for ids missed due to identical mention text
446
+ if (doc_pmid in cache_geneid.keys()) and (temp_annotation.text in cache_geneid[doc_pmid].keys()):#same gene in doc
447
+ marjor_species = max(zip(cache_geneid[doc_pmid][temp_annotation.text].values(), cache_geneid[doc_pmid][temp_annotation.text].keys()))
448
+ temp_annotation.infons['Identifier']=marjor_species[1]
449
+ else:
450
+
451
+ temp_annotation.infons['Identifier']='Focus:9606'
452
+ bioc.dump(collection, fout_xml, pretty_print=True)
453
+ fin.close()
454
+ fout_xml.close()
455
+
456
+
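SA_BioC repeatedly uses max(zip(counts.values(), counts.keys())) to pick a majority species. Since tuples compare element-wise, this returns the (count, key) pair with the highest count, breaking ties by key; a minimal sketch with invented counts:

species_count = {'*9606': 3, '*10090': 1}   # illustrative counts
count, species = max(zip(species_count.values(), species_count.keys()))
assert (count, species) == (3, '*9606')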
457
+ #SA for PubTator format
458
+ def SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
459
+
460
+
461
+ # pmid|t|text1
462
+ #pmid|a|text2
463
+ #pmid entity_id sid eid entity_txt entity_type (gene is blank)
464
+ fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
465
+ # fout_pubtator=open(outpath+'tmp/input_xml.pubtator','w', encoding='utf-8')
466
+ fin_pubtator2=io.StringIO() #two or more species
467
+ all_in_ori=fin.read().strip().split('\n\n')
468
+ fin.close()
469
+ species_gene_count={} #{pmid:{'spec':_species_num;'gene':_gene_num}}
470
+ gene_set=['Gene','FamilyName']
471
+ ML_results={} #{'pmid':{'sid-eid':species_id}}
472
+
473
+ prefix_speid_allset=set(prefix_dict.keys())
474
+
475
+ for document in all_in_ori:
476
+ lines=document.split('\n')
477
+ doc_pmid=lines[0].split('|t|')[0]
478
+ doc_title=lines[0].split('|t|')[1]
479
+ doc_abstract=lines[1].split('|a|')[1]
480
+ doc_annotation=[]
481
+ _species_num=set() #(*speciesid)
482
+ _gene_num=0
483
+ _ML_gene_num=0
484
+ _entity_num=0
485
+ _prefix_str2id_dict={} #{prestr:id}
486
+ for i in range(2,len(lines)):
487
+ segs=lines[i].split('\t')
488
+ if segs[4] in gene_set:
489
+ _gene_num+=1
490
+ if len(segs)>=6: #species
491
+ doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
492
+ species_ID=segs[-1]
493
+ if species_ID.find('*')>=0:
494
+ _species_num.add(species_ID)
495
+ if species_ID[1:] in prefix_speid_allset:
496
+ for ele in prefix_dict[species_ID[1:]]:
497
+ _prefix_str2id_dict[ele]=species_ID[1:]
498
+ else: #gene
499
+ if segs[3][0:2] in _prefix_str2id_dict:#prefix rule
500
+ if _prefix_str2id_dict[segs[3][0:2]] in virus_set:
501
+ doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]]+',9606')
502
+ if doc_pmid not in ML_results.keys():
503
+ ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]+',9606'}
504
+ else:
505
+ ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]+',9606'
506
+
507
+ # print('!!! prefixr and virus:', doc_pmid)
508
+ else:
509
+ doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]])
510
+ if doc_pmid not in ML_results.keys():
511
+ ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]}
512
+ else:
513
+ ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]
514
+ # print('prefix rule!!',_prefix_str2id_dict)
515
+ # print(doc_pmid)
516
+ else:
517
+ doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
518
+ if segs[4] in gene_set:
519
+ _ML_gene_num+=1
520
+ _entity_num+=1
521
+
522
+ if len(_species_num)>=2 and _ML_gene_num>0:
523
+ fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
524
+ fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
525
+ for ele in doc_annotation:
526
+ fin_pubtator2.write(ele+'\n')
527
+ fin_pubtator2.write('\n')
528
+
529
+ species_gene_count[doc_pmid]={'spec':_species_num,'gene':_gene_num}
530
+
531
+ if fin_pubtator2.getvalue()!='':
532
+ #pubtator format ML tagging
533
+ #print(fin_pubtator2.getvalue())
534
+ ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)
535
+ #print(ml_out.getvalue())
536
+ fin_result=io.StringIO(ml_out.getvalue())
537
+ all_in=fin_result.read().strip().split('\n\n')
538
+ #print('+2 species:',len(all_in))
539
+ fin_result.close()
540
+ for doc in all_in:
541
+ lines=doc.split('\n')
542
+ pmid=lines[0].split('|t|')[0]
543
+
544
+ for i in range(2,len(lines)):
545
+ segs=lines[i].split('\t')
546
+ if pmid not in ML_results.keys():
547
+ ML_results[pmid]={segs[2]+'-'+segs[3]:segs[-1]}
548
+ else:
549
+ ML_results[pmid][segs[2]+'-'+segs[3]]=segs[-1]
550
+
551
+ #output
552
+ fout_pubtator=open(outpath+"/"+infile,'w', encoding='utf8')
553
+ for doc in all_in_ori:
554
+ lines=doc.split('\n')
555
+ pmid=lines[0].split('|t|')[0]
556
+ fout_pubtator.write(lines[0]+'\n'+lines[1]+'\n')
557
+ if len(species_gene_count[pmid]['spec'])>1 and species_gene_count[pmid]['gene']>0: # ML
558
+ for i in range(2,len(lines)):
559
+ segs=lines[i].split('\t')
560
+ if len(segs)>=6: #species
561
+ fout_pubtator.write(lines[i]+'\n')
562
+ else:#gene
563
+ if ML_results[pmid][segs[1]+'-'+segs[2]] in virus_set:
564
+ fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+',9606'+'\n')
565
+ # print('!!! virus:', pmid)
566
+ else:
567
+ fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+'\n')
568
+ fout_pubtator.write('\n')
569
+
570
+ elif len(species_gene_count[pmid]['spec'])==1 and species_gene_count[pmid]['gene']>0: #only one species
571
+ for i in range(2,len(lines)):
572
+ segs=lines[i].split('\t')
573
+ if len(segs)>=6: #species
574
+ fout_pubtator.write(lines[i]+'\n')
575
+ else:#gene
576
+ major_species,=species_gene_count[pmid]['spec']
577
+ if major_species[1:] in virus_set:
578
+ fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+',9606'+'\n')
579
+ # print('!!! virus:', pmid)
580
+ else:
+ fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+'\n')
581
+ fout_pubtator.write('\n')
582
+
583
+ elif len(species_gene_count[pmid]['spec'])==0 and species_gene_count[pmid]['gene']>0:#no species
584
+ for i in range(2,len(lines)):
585
+ segs=lines[i].split('\t')
586
+ if len(segs)>=6: #species
587
+ fout_pubtator.write(lines[i]+'\n')
588
+ else:#gene
589
+ fout_pubtator.write(lines[i]+'\tFocus:9606'+'\n')
590
+ fout_pubtator.write('\n')
591
+
592
+ else:
593
+ for i in range(2,len(lines)):
594
+ fout_pubtator.write(lines[i]+'\n')
595
+ fout_pubtator.write('\n')
596
+ fout_pubtator.close()
597
+
598
+
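Both SA functions apply the same prefix rule before falling back to the model: if the first two characters of a gene mention match a prefix registered for a species already seen in the document, that species is assigned directly. A minimal sketch (the prefix strings are invented; real ones come from Dictionary/SPPrefix.txt):

prefix_dict = {'7227': ['d', 'dm']}                      # species id -> prefixes (illustrative)
_prefix_str2id_dict = {p: sid for sid, ps in prefix_dict.items() for p in ps}
mention = 'dmOR67d'
species = _prefix_str2id_dict.get(mention[0:2])          # -> '7227' via the 'dm' prefix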
599
+ #SA main
600
+ def speciesAss(infolder,outpath, modelfile):
601
+
602
+ if modelfile.lower().find('bioformer')>=0:
603
+ model_type='bioformer'
604
+ else:
605
+ model_type='pubmedbert'
606
+
607
+ print('loading SA models........')
608
+ if model_type=='bioformer':
609
+
610
+ vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
611
+ 'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
612
+ 'lowercase':False,
613
+ }
614
+ else:
615
+ vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
616
+ 'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
617
+ 'lowercase':True,
618
+ }
619
+
620
+ nn_model=model_sa.HUGFACE_NER(vocabfiles)
621
+ nn_model.build_encoder()
622
+ nn_model.build_softmax_decoder()
623
+ nn_model.load_model(modelfile)
624
+
625
+ dict_filename={'prefix':'./Dictionary/SPPrefix.txt',
626
+ 'virus':'./Dictionary/SP_Virus2HumanList.txt'}
627
+ fin=open(dict_filename['virus'],'r',encoding='utf-8')
628
+ virus_set=set(fin.read().strip().split('\n'))
629
+ fin.close()
630
+
631
+ prefix_dict={}#{id:[prefix1,prefix2]}
632
+ fin=open(dict_filename['prefix'],'r',encoding='utf-8')
633
+ for line in fin:
634
+ seg= line.strip().split('\t')
635
+ if seg[0] not in prefix_dict.keys():
636
+ prefix_dict[seg[0]]=seg[1].split('|')
637
+ else:
638
+ prefix_dict[seg[0]].extend(seg[1].split('|'))
639
+ fin.close()
640
+
641
+
642
+
643
+ print("begin species assignment........")
644
+ start_time=time.time()
645
+
646
+ for infile in os.listdir(infolder):
647
+ if os.path.isfile(outpath+"/"+infile):
648
+ print(infile+' already exists.')
649
+ else:
650
+ print('Processing:',infile)
651
+ fin=open(infolder+"/"+infile, 'r',encoding='utf-8')
652
+ file_format=""
653
+ for line in fin:
654
+ pattern_bioc = re.compile(r'.*<collection>.*')
655
+ pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')
656
+ if pattern_bioc.search(line):
657
+ file_format="BioC"
658
+ break
659
+ elif pattern_pubtator.search(line):
660
+ file_format="PubTator"
661
+ break
662
+ fin.close()
663
+ if(file_format == "PubTator"):
664
+ SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
665
+ elif(file_format == "BioC"):
666
+ SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
667
+
668
+
669
+ print('species assignment done:',time.time()-start_time)
670
+
671
+ if __name__=='__main__':
672
+
673
+ parser = argparse.ArgumentParser(description='run GeneNER and species assignment, python GeneNER_SpeAss_run.py -i input -n NERmodel -s SAmodel -r neroutput -a saoutput')
674
+ parser.add_argument('--infolder', '-i', help="input folder",default='./example/input/')
675
+ parser.add_argument('--NERmodel', '-n', help="trained deep learning NER model file",default='')
676
+ parser.add_argument('--SAmodel', '-s', help="trained deep learning species assignment model file",default='')
677
+ parser.add_argument('--NERoutpath', '-r', help="output folder to save the NER tagged results",default='./example/ner_output/')
678
+ parser.add_argument('--SAoutpath', '-a', help="output folder to save the SA tagged results",default='./example/sa_output/')
679
+ parser.add_argument('--NUM_THREADS', '-t', help="Number of threads",default='3')
680
+ args = parser.parse_args()
681
+
682
+
683
+ if not args.NUM_THREADS.isdigit():
684
+ args.NUM_THREADS='3'
685
+
686
+ tf.config.threading.set_inter_op_parallelism_threads(int(args.NUM_THREADS))
687
+ tf.config.threading.set_intra_op_parallelism_threads(int(args.NUM_THREADS))
688
+
689
+ if args.NERmodel!='' and args.SAmodel!='':
690
+
691
+ #pipeline
692
+ print('==============\n| GeneNER and SpeAss |\n==============')
693
+
694
+ #create output folder
695
+
696
+ if args.infolder[-1]!='/':
697
+ args.infolder+='/'
698
+ if not os.path.exists(args.infolder):
699
+ os.makedirs(args.infolder)
700
+
701
+ if args.NERoutpath[-1]!='/':
702
+ args.NERoutpath+='/'
703
+ if not os.path.exists(args.NERoutpath):
704
+ os.makedirs(args.NERoutpath)
705
+
706
+ if args.SAoutpath[-1]!='/':
707
+ args.SAoutpath+='/'
708
+ if not os.path.exists(args.SAoutpath):
709
+ os.makedirs(args.SAoutpath)
710
+
711
+ #1. gene NER, the results are saved in outpath/ner_tmp/
712
+ geneNER(args.infolder,args.NERoutpath, args.NERmodel)
713
+
714
+
715
+ #2. species assignment, the results are saved in outpath/sa_tmp/
716
+ speciesAss(args.NERoutpath,args.SAoutpath, args.SAmodel)
717
+
718
+ elif args.NERmodel!='' and args.SAmodel=='':
719
+ if args.infolder[-1]!='/':
720
+ args.infolder+='/'
721
+ if not os.path.exists(args.infolder):
722
+ os.makedirs(args.infolder)
723
+
724
+ # only geneNER
725
+ if args.NERoutpath[-1]!='/':
726
+ args.NERoutpath+='/'
727
+ if not os.path.exists(args.NERoutpath):
728
+ os.makedirs(args.NERoutpath)
729
+
730
+ print('==============\n| GeneNER |\n==============')
731
+ geneNER(args.infolder,args.NERoutpath,args.NERmodel)
732
+
733
+ elif args.NERmodel=='' and args.SAmodel!='':
734
+ # only species assignment
735
+ if args.SAoutpath[-1]!='/':
736
+ args.SAoutpath+='/'
737
+ if not os.path.exists(args.SAoutpath):
738
+ os.makedirs(args.SAoutpath)
739
+
740
+ print('==============\n| SpeAss |\n==============')
741
+ speciesAss(args.infolder,args.SAoutpath,args.SAmodel)
742
+ else:
743
+ print('Please provide models!')
744
+
745
+
746
 
Library/Ab3P.C CHANGED
@@ -1,110 +1,110 @@
1
- #include "Ab3P.h"
2
-
3
- Ab3P::Ab3P ( void ) :
4
- buffer(""),
5
- wrdData( new WordData )
6
- {
7
-
8
- string sf_grp, sf_nchr, strat;
9
- double value;
10
-
11
- char file_name[1000];
12
- get_pathw( file_name, "Ab3P", "prec", "dat" );
13
- ifstream fin(file_name);
14
- if(!fin) {
15
- cout << "Cannot open Ab3P_prec.dat\n";
16
- exit(1);
17
- }
18
- //get precision of a given #-ch SF's strategy
19
- while(fin>>sf_grp>>sf_nchr>>strat) {
20
- fin>>value; //precision
21
- stratPrec.insert(pair<string, double>(sf_grp+sf_nchr+strat, value));
22
- util.push_back_strat(sf_grp+sf_nchr, strat); //set strategy sequence
23
- }
24
- }
25
-
26
- void Ab3P::get_abbrs( char * text, vector<AbbrOut> & abbrs ) {
27
- abbrs.clear();
28
-
29
- if( ! text[0] ) return; // skip empty line
30
-
31
- ab.Proc(text); //extract potential SF & LF pairs
32
-
33
- for(int i=0; i<ab.numa; i++) {
34
- AbbrOut result;
35
-
36
- try_pair( ab.abbs[i], ab.abbl[i], result );
37
-
38
- // preserve results
39
- if ( result.prec > 0 ) {
40
- abbrs.push_back( result );
41
- }
42
- }
43
- ab.cleara();
44
-
45
- }
46
-
47
-
48
- void Ab3P::try_pair( char * sf, char * lf, AbbrOut & result ) {
49
-
50
- //process i) lf (sf)
51
- try_strats( sf, lf, false, result );
52
-
53
- //process ii) sf (lf)
54
- ab.token(lf);
55
- try_strats( ab.lst[ab.num-1], sf, true, result );
56
- }
57
-
58
-
59
- /**
60
- psf -- pointer short form
61
- plf -- pointer long form
62
- **/
63
- void Ab3P::try_strats ( char * psf, char * plf, bool swap,
64
- AbbrOut & result ) {
65
-
66
- string sfg; //SF group eg) Al1, Num2, Spec3
67
- //false if sf is not ok, sfg will be assigned
68
-
69
- if(!util.group_sf(psf,plf,sfg)) return;
70
- if (swap) if(!util.exist_upperal(psf)) return;
71
-
72
- char sf[1000], sfl[1000];
73
-
74
- //strategy sequence for a given #-ch SF group
75
- vector<string> strats = util.get_strats(sfg);
76
- util.remove_nonAlnum(psf,sf); //sf will be w/o non-alnum
77
-
78
- //go through strategies
79
- for( int j=0; j<strats.size(); j++) {
80
- AbbrStra * strat =
81
- util.strat_factory(strats[j]); //set a particular strategy
82
- strat->wData = wrdData; //set wordset, stopword
83
- if(strat->strategy(sf,plf)) { //case sensitive
84
- strat->str_tolower(sf,sfl);
85
-
86
- if( strat->lf_ok(psf,strat->lf) ) {
87
-
88
- map<string, double>::iterator p =
89
- stratPrec.find(sfg+strats[j]);
90
- if(p==stratPrec.end()) {
91
- cout << "No precision assigned" << endl;
92
- exit(1);
93
- }
94
-
95
- //add outputs
96
- if( p->second>result.prec ) {
97
- result.sf = psf;
98
- result.lf = strat->lf;
99
- result.prec = p->second;
100
- result.strat = strats[j];
101
- }
102
-
103
- delete strat;
104
- return;
105
- }
106
- }
107
- delete strat;
108
- }
109
-
110
- }
 
1
+ #include "Ab3P.h"
2
+
3
+ Ab3P::Ab3P ( void ) :
4
+ buffer(""),
5
+ wrdData( new WordData )
6
+ {
7
+
8
+ string sf_grp, sf_nchr, strat;
9
+ double value;
10
+
11
+ char file_name[1000];
12
+ get_pathw( file_name, "Ab3P", "prec", "dat" );
13
+ ifstream fin(file_name);
14
+ if(!fin) {
15
+ cout << "Cannot open Ab3P_prec.dat\n";
16
+ exit(1);
17
+ }
18
+ //get precision of a given #-ch SF's strategy
19
+ while(fin>>sf_grp>>sf_nchr>>strat) {
20
+ fin>>value; //precision
21
+ stratPrec.insert(pair<string, double>(sf_grp+sf_nchr+strat, value));
22
+ util.push_back_strat(sf_grp+sf_nchr, strat); //set strategy sequence
23
+ }
24
+ }
25
+
26
+ void Ab3P::get_abbrs( char * text, vector<AbbrOut> & abbrs ) {
27
+ abbrs.clear();
28
+
29
+ if( ! text[0] ) return; // skip empty line
30
+
31
+ ab.Proc(text); //extract potential SF & LF pairs
32
+
33
+ for(int i=0; i<ab.numa; i++) {
34
+ AbbrOut result;
35
+
36
+ try_pair( ab.abbs[i], ab.abbl[i], result );
37
+
38
+ // preserve results
39
+ if ( result.prec > 0 ) {
40
+ abbrs.push_back( result );
41
+ }
42
+ }
43
+ ab.cleara();
44
+
45
+ }
46
+
47
+
48
+ void Ab3P::try_pair( char * sf, char * lf, AbbrOut & result ) {
49
+
50
+ //process i) lf (sf)
51
+ try_strats( sf, lf, false, result );
52
+
53
+ //process ii) sf (lf)
54
+ ab.token(lf);
55
+ try_strats( ab.lst[ab.num-1], sf, true, result );
56
+ }
57
+
58
+
59
+ /**
60
+ psf -- pointer short form
61
+ plf -- pointer long form
62
+ **/
63
+ void Ab3P::try_strats ( char * psf, char * plf, bool swap,
64
+ AbbrOut & result ) {
65
+
66
+ string sfg; //SF group eg) Al1, Num2, Spec3
67
+ //false if sf is not ok, sfg will be assigned
68
+
69
+ if(!util.group_sf(psf,plf,sfg)) return;
70
+ if (swap) if(!util.exist_upperal(psf)) return;
71
+
72
+ char sf[1000], sfl[1000];
73
+
74
+ //strategy sequence for a given #-ch SF group
75
+ vector<string> strats = util.get_strats(sfg);
76
+ util.remove_nonAlnum(psf,sf); //sf will be w/o non-alnum
77
+
78
+ //go through strategies
79
+ for( int j=0; j<strats.size(); j++) {
80
+ AbbrStra * strat =
81
+ util.strat_factory(strats[j]); //set a particular strategy
82
+ strat->wData = wrdData; //set wordset, stopword
83
+ if(strat->strategy(sf,plf)) { //case sensitive
84
+ strat->str_tolower(sf,sfl);
85
+
86
+ if( strat->lf_ok(psf,strat->lf) ) {
87
+
88
+ map<string, double>::iterator p =
89
+ stratPrec.find(sfg+strats[j]);
90
+ if(p==stratPrec.end()) {
91
+ cout << "No precision assigned" << endl;
92
+ exit(1);
93
+ }
94
+
95
+ //add outputs
96
+ if( p->second>result.prec ) {
97
+ result.sf = psf;
98
+ result.lf = strat->lf;
99
+ result.prec = p->second;
100
+ result.strat = strats[j];
101
+ }
102
+
103
+ delete strat;
104
+ return;
105
+ }
106
+ }
107
+ delete strat;
108
+ }
109
+
110
+ }
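For reference, the precision table the Ab3P constructor above loads is plain whitespace-separated text, one (SF group, #chars, strategy, precision) record per line; a Python sketch of the equivalent parse, assuming that layout:

strat_prec = {}
with open('Library/WordData/Ab3P_prec.dat') as fin:
    for row in fin:
        sf_grp, sf_nchr, strat, value = row.split()
        strat_prec[sf_grp + sf_nchr + strat] = float(value)   # same key as stratPrec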
Library/Ab3P.h CHANGED
@@ -1,83 +1,83 @@
1
- /*
2
- Identify sf & lf pairs from free text using multi-stage algorithm
3
- process one line at a time and print out:
4
- line
5
- sf|lf|P-precision|strategy
6
- */
7
-
8
- #include "AbbrvE.h"
9
- #include "AbbrStra.h"
10
- #include <vector>
11
- #include <map>
12
- #include <string>
13
-
14
- using namespace std;
15
- using namespace iret;
16
-
17
- namespace iret {
18
-
19
- class AbbrOut {
20
- public:
21
- string sf, lf, strat;
22
- double prec;
23
-
24
- AbbrOut( void ) : sf(""), lf(""), strat(""), prec(0)
25
- {}
26
-
27
- void print ( ostream & out ) {
28
- out << " " << sf << "|" << lf << "|" << prec;
29
- }
30
-
31
- };
32
-
33
-
34
- class Ab3P {
35
- public:
36
- Ab3P( void );
37
- ~Ab3P(void) { delete wrdData; }
38
-
39
- /** Collect text for later abbreviation finding. **/
40
- void add_text( const string & text ) {
41
- buffer += text;
42
- }
43
- void add_text( char * text ) {
44
- buffer += text;
45
- }
46
-
47
- /** Sets abbrs to the abbreviations found in previous calls to add_text.
48
- Afterwards, resets the text buffer. **/
49
- void get_abbrs( vector<AbbrOut> & abbrs ) {
50
- get_abbrs( buffer, abbrs );
51
- buffer = "";
52
- }
53
-
54
- /** Sets abbrs to the abbreviations found in text
55
- Does not interfere with the add_text buffer. **/
56
- void get_abbrs( const string & text, vector<AbbrOut> & abbrs ) {
57
- abbrs.clear();
58
-
59
- if(text.empty()) return; // skip empty line
60
- // const_cast needed so the correct get_abbrs gets called,
61
- // otherwise, infinite loop
62
- get_abbrs( const_cast<char*>(text.c_str()), abbrs );
63
- }
64
- void get_abbrs( char * text, vector<AbbrOut> & abbrs );
65
-
66
- /** Try a potential sf-lf form to find proper lf, strategy used,
67
- and pseudo-precision of result **/
68
- void try_pair( char * sf, char * lf, AbbrOut & abbr );
69
-
70
- /**
71
- psf -- pointer short form
72
- plf -- pointer long form
73
- **/
74
- void try_strats ( char * psf, char * plf, bool swap, AbbrOut & result );
75
-
76
- AbbrvE ab; //default # pairs = 10,000
77
- map<string, double> stratPrec;
78
- StratUtil util;
79
- WordData *wrdData; //set data needed for AbbrStra
80
- string buffer; // collect text for later use
81
- };
82
-
83
- }
 
1
+ /*
2
+ Identify sf & lf pairs from free text using multi-stage algorithm
3
+ process one line at a time and print out:
4
+ line
5
+ sf|lf|P-precision|strategy
6
+ */
7
+
8
+ #include "AbbrvE.h"
9
+ #include "AbbrStra.h"
10
+ #include <vector>
11
+ #include <map>
12
+ #include <string>
13
+
14
+ using namespace std;
15
+ using namespace iret;
16
+
17
+ namespace iret {
18
+
19
+ class AbbrOut {
20
+ public:
21
+ string sf, lf, strat;
22
+ double prec;
23
+
24
+ AbbrOut( void ) : sf(""), lf(""), strat(""), prec(0)
25
+ {}
26
+
27
+ void print ( ostream & out ) {
28
+ out << " " << sf << "|" << lf << "|" << prec;
29
+ }
30
+
31
+ };
32
+
33
+
34
+ class Ab3P {
35
+ public:
36
+ Ab3P( void );
37
+ ~Ab3P(void) { delete wrdData; }
38
+
39
+ /** Collect text for later abbreviation finding. **/
40
+ void add_text( const string & text ) {
41
+ buffer += text;
42
+ }
43
+ void add_text( char * text ) {
44
+ buffer += text;
45
+ }
46
+
47
+ /** Sets abbrs to the abbreviations found in previous calls to add_text.
48
+ Afterwards, resets the text buffer. **/
49
+ void get_abbrs( vector<AbbrOut> & abbrs ) {
50
+ get_abbrs( buffer, abbrs );
51
+ buffer = "";
52
+ }
53
+
54
+ /** Sets abbrs to the abbreviations found in text
55
+ Does not interfere with the add_text buffer. **/
56
+ void get_abbrs( const string & text, vector<AbbrOut> & abbrs ) {
57
+ abbrs.clear();
58
+
59
+ if(text.empty()) return; // skip empty line
60
+ // const_cast needed so the correct get_abbrs gets called,
61
+ // otherwise, infinite loop
62
+ get_abbrs( const_cast<char*>(text.c_str()), abbrs );
63
+ }
64
+ void get_abbrs( char * text, vector<AbbrOut> & abbrs );
65
+
66
+ /** Try a potential sf-lf form to find proper lf, strategy used,
67
+ and pseudo-precision of result **/
68
+ void try_pair( char * sf, char * lf, AbbrOut & abbr );
69
+
70
+ /**
71
+ psf -- pointer short form
72
+ plf -- pointer long form
73
+ **/
74
+ void try_strats ( char * psf, char * plf, bool swap, AbbrOut & result );
75
+
76
+ AbbrvE ab; //default # pairs = 10,000
77
+ map<string, double> stratPrec;
78
+ StratUtil util;
79
+ WordData *wrdData; //set data needed for AbbrStra
80
+ string buffer; // collect text for later use
81
+ };
82
+
83
+ }
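The header comment above documents Ab3P's line-oriented output: the input line, then one indented sf|lf|precision row per detected pair (see AbbrOut::print). A small Python sketch of a consumer, assuming that layout:

def parse_ab3p(lines):
    pairs = []
    for line in lines:
        if line.startswith(' '):                # indented rows hold the pairs
            sf, lf, prec = line.strip().split('|')[:3]
            pairs.append((sf, lf, float(prec)))
    return pairs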
Library/AbbrStra.C CHANGED
@@ -1,1426 +1,1426 @@
1
- #include "AbbrStra.h"
2
- #include <runn.h>
3
- #include <vector>
4
- #include <fstream>
5
- #include <iostream>
6
-
7
-
8
- WordData::WordData(const char *wrdnam, const char *stpnam,
9
- const char *lfsnam) :
10
- wrdset(wrdnam), stp(stpnam), lfs(lfsnam)
11
- {
12
- wrdset.set_path_name("Ab3P");
13
- wrdset.gopen_ctable_map();
14
- stp.set_path_name("Ab3P");
15
- stp.gopen_htable_map();
16
- lfs.set_path_name("Ab3P");
17
- lfs.gopen_htable_map();
18
- }
19
-
20
- WordData::~WordData()
21
- {
22
- wrdset.gclose_ctable_map();
23
- stp.gclose_htable_map();
24
- lfs.gclose_htable_map();
25
- }
26
-
27
-
28
- AbbrStra::AbbrStra()
29
- {
30
- npairs = tpairs = nsfs = nmatchs = amatchs = 0;
31
- }
32
-
33
-
34
- AbbrStra::~AbbrStra()
35
- {
36
- }
37
-
38
-
39
- void AbbrStra::token(const char *str, char lst[1000][1000])
40
- {
41
- long i,j=0,k=0;
42
- long n=strlen(str)-1;
43
-
44
- while(isblank(str[n])) n--;
45
-
46
- while(str[j]){
47
- while(isblank(str[j]))j++;
48
- i=j;
49
- while((str[j])&&(!isblank(str[j])))j++;
50
- strncpy(lst[k],str+i,j-i);
51
- lst[k][j-i]='\0';
52
- if(str[j]){
53
- k++;
54
- j++;
55
- }
56
- }
57
- if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
58
- ntk=k+1; //# tokens, ntk is data member
59
- }
60
-
61
-
62
- long AbbrStra::tokenize(const char *str, char lst[1000][1000])
63
- {
64
- long i,j=0,k=0;
65
- long n=strlen(str)-1;
66
-
67
- while(isblank(str[n])) n--;
68
-
69
- while(str[j]){
70
- while(isblank(str[j]))j++;
71
- i=j;
72
- while((str[j])&&(!isblank(str[j])))j++;
73
- strncpy(lst[k],str+i,j-i);
74
- lst[k][j-i]='\0';
75
- if(str[j]){
76
- k++;
77
- j++;
78
- }
79
- }
80
- if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
81
- return k+1; //# tokens
82
- }
83
-
84
-
85
- long AbbrStra::num_token(const char *str)
86
- {
87
- long i,j=0,k=0;
88
- long n=strlen(str)-1;
89
-
90
- while(isblank(str[n])) n--;
91
-
92
- while(str[j]){
93
- while(isblank(str[j]))j++;
94
- i=j;
95
- while((str[j])&&(!isblank(str[j])))j++;
96
- if(str[j]){
97
- k++;
98
- j++;
99
- }
100
- }
101
- if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
102
- return k+1; //# tokens
103
- }
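token/tokenize/num_token above all implement the same blank-delimited scan, with the Jan-17-08 fix discounting a trailing blank. In Python terms the token count behaves like str.split(); a minimal equivalence sketch:

def num_token(s: str) -> int:
    return len(s.split())        # split() already ignores trailing blanks

assert num_token('ab cd ') == 2  # the "ab cd " case from the comment above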
104
-
105
-
106
- // fch is 1st char of str token from backward
107
- long AbbrStra::first_ch(const char *str, char *fch, long num)
108
- {
109
- long i, j, numtk;
110
- char tk[1000][1000];
111
-
112
- numtk = tokenize(str,tk);
113
- if(num>numtk) return 0;
114
-
115
- for(i=0; i<num; i++)
116
- fch[i] = tk[numtk-num+i][0];
117
-
118
- return 1;
119
- }
120
-
121
- long AbbrStra::is_upperal(const char *str)
122
- {
123
- for(long i=strlen(str)-1; i>=0; i--)
124
- if(!isupper(str[i]) || !isalpha(str[i]))
125
- return 0;
126
- return 1;
127
- }
128
-
129
- long AbbrStra::is_alpha(const char *str)
130
- {
131
- for(long i=strlen(str)-1; i>=0; i--)
132
- if(!isalpha(str[i]))
133
- return 0;
134
- return 1;
135
- }
136
-
137
-
138
- // str2 will lower-case of str1
139
- void AbbrStra::str_tolower(const char *str1, char *str2)
140
- {
141
- long i=0;
142
-
143
- while(str1[i]) {
144
- str2[i] = tolower(str1[i]);
145
- i++;
146
- }
147
- str2[i] = '\0';
148
- }
149
-
150
- //copy num tokens from back of str1 to str2
151
- long AbbrStra::get_str(const char *str1, char *str2, long num)
152
- {
153
- char ch, tk[1000][1000];
154
- long i, j, numtk;
155
-
156
- if(num<0) { cout<<"num<0\n"; exit(1); }
157
- numtk = tokenize(str1,tk);
158
- if(numtk<num) return 0;
159
-
160
- strcpy(str2,tk[numtk-num]);
161
- for(i=1; i<num; i++) {
162
- strcat(str2," ");
163
- strcat(str2,tk[numtk-num+i]);
164
- }
165
-
166
- return 1;
167
- }
168
-
169
- bool AbbrStra::isupper_str(const char *str)
170
- {
171
- long i, len=strlen(str);
172
-
173
- for(i=0; i<len; i++)
174
- if(isalpha(str[i]) && !isupper(str[i]))
175
- return false;
176
-
177
- return true;
178
- }
179
-
180
- bool AbbrStra::is_onealpha(const char *str)
181
- {
182
- long i, j=0, len=strlen(str);
183
-
184
- for(i=0; i<len; i++)
185
- if(isalpha(str[i])) j++;
186
-
187
- if(j==1) return true;
188
- else return false;
189
- }
190
-
191
- long AbbrStra::count_upperstr(const char *str)
192
- {
193
- long i, j, k, numtk;
194
- char tk[1000][1000];
195
-
196
- numtk = tokenize(str,tk);
197
-
198
- j = 0;
199
- for(i=numtk-1; i>=0; i--) {
200
- if(isupper(tk[i][0])) j++;
201
- else return j;
202
- }
203
-
204
- return j;
205
- }
206
-
207
- void AbbrStra::get_alpha(const char *str1, char *str2)
208
- {
209
- long i = 0, j = 0;
210
- long len = strlen(str1);
211
-
212
- while(i<len) {
213
- if(isalpha(str1[i])) {
214
- str2[j] = str1[i];
215
- j++;
216
- }
217
- i++;
218
- }
219
- str2[j] = '\0';
220
- }
221
-
222
-
223
- bool AbbrStra::lf_ok(const char *shrtf, const char *longf)
224
- {
225
- long i;
226
- long paren=0, sbrac=0;
227
- string s, l;
228
-
229
- //false for one parenthesis or square bracket
230
- for(i=strlen(longf)-1; i>=0; i--) {
231
- if(longf[i]=='(') paren++;
232
- if(longf[i]==')') paren--;
233
- if(longf[i]=='[') sbrac++;
234
- if(longf[i]==']') sbrac--;
235
- }
236
- if(paren!=0 || sbrac!=0) return false;
237
-
238
- s.assign(shrtf);
239
- l.assign(longf);
240
-
241
- for(i=0; i<s.length(); i++) s[i]=tolower(s[i]);
242
- for(i=0; i<l.length(); i++) l[i]=tolower(l[i]);
243
-
244
- //false if LF words contain SF
245
- if( (" "+l+" ").find(" "+s+" ")!=string::npos ) return false;
246
-
247
- return true;
248
- }
249
-
250
-
251
- //first=true: allow 1-ahpha, 0 don't allow
252
- long AbbrStra::search_backward(long sloc, long tinx, long tloc, const char *abbr, bool first)
253
- {
254
- long sfloc=sloc, tkinx=tinx, tkloc=tloc;
255
-
256
- while(sfloc>=0) {
257
- loop1: while((tkloc>=0)&&(tok[tkinx][tkloc]!=abbr[sfloc])) tkloc--;
258
- if(tkloc<0) {
259
- tkinx--;
260
- if(tkinx<0) return 0; //moved to here (Sep-14-07)
261
- tkloc=strlen(tok[tkinx])-1;
262
- }
263
- else {
264
- if(sfloc==0) {
265
- if(tkloc!=0) {
266
- if(!first) { tkloc--; goto loop1; }
267
- else if(isalnum(tok[tkinx][tkloc-1])) { tkloc--; goto loop1; }
268
- }
269
- }
270
- mod[sfloc][0]=tkinx;
271
- mod[sfloc][1]=tkloc;
272
- sfloc--; tkloc--;
273
- }
274
- }
275
-
276
- return 1;
277
- }
278
-
279
- long AbbrStra::search_backward_adv(const char *abbr, bool flag)
280
- {
281
- long i;
282
- long lna=strlen(abbr);
283
-
284
- i=0;
285
- while(i<lna){
286
- if(search_backward(i,mod[i][0],mod[i][1]-1,abbr,flag)) return 1;
287
- i++;
288
- }
289
- return 0;
290
- }
291
-
292
- void AbbrStra::extract_lf(long begin, long end)
293
- {
294
- strcpy(lf,tok[begin]);
295
- for(long i=begin+1; i<=end; i++) {
296
- strcat(lf," ");
297
- strcat(lf,tok[i]);
298
- }
299
- }
300
-
301
-
302
- void AbbrStra::extract_lf(long begin, long end, const char *str)
303
- {
304
- token(str,tok);
305
- strcpy(lf,tok[begin]);
306
- for(long i=begin+1; i<=end; i++) {
307
- strcat(lf," ");
308
- strcat(lf,tok[i]);
309
- }
310
- }
311
-
312
- //---
313
- bool AbbrStra::exist_skipword(long nsf)
314
- {
315
- long i=0, j=0, k;
316
-
317
- while(i<nsf) {
318
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
319
- else k=mod[i+1][0]-mod[i][0]-1;
320
- if(k>0) j+=k;
321
- i++;
322
- }
323
-
324
- if(j>0) return true;
325
- else return false;
326
- }
327
-
328
-
329
- bool AbbrStra::exist_n_skipwords(long nsf, long n)
330
- {
331
- long i=0, j, k;
332
- bool flag=false;
333
-
334
- //k: # skip words
335
- while(i<nsf) {
336
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
337
- else k=mod[i+1][0]-mod[i][0]-1;
338
- if(k>n) return false;
339
- if(k==n) flag=true;
340
- i++;
341
- }
342
-
343
- if(flag) return true;
344
- else return false;
345
- }
346
-
347
- //exists n consecutive skip stopwords between tokens
348
- bool AbbrStra::exist_n_stopwords(long nsf, long n)
349
- {
350
- long i=0, j, k;
351
- bool flag=false;
352
-
353
- while(i<nsf) {
354
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
355
- else k=mod[i+1][0]-mod[i][0]-1;
356
- if(k>n) return false;
357
- if(k==n) flag=true;
358
- if(k>0) { //skip word exists
359
- while(k) {
360
- if(!wData->stp.find(tok[mod[i][0]+k])) return false;
361
- k--;
362
- }
363
- }
364
- i++;
365
- }
366
-
367
- if(flag) return true;
368
- else return false;
369
- }
370
-
371
-
372
- bool AbbrStra::stopword_ok(long nsf, long nsw)
373
- {
374
- long i=0, j, k;
375
-
376
- while(i<nsf) {
377
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
378
- else k=mod[i+1][0]-mod[i][0]-1;
379
- if(k>nsw) return false;
380
- if(k>0) { //skip word exists
381
- while(k) {
382
- if(!wData->stp.find(tok[mod[i][0]+k])) return false;
383
- k--;
384
- }
385
- }
386
- i++;
387
- }
388
-
389
- return true;
390
- }
391
-
392
- bool AbbrStra::skip_stop_ok(long nsf, long nsw, long n)
393
- {
394
- long i=0, j, k, nstp;
395
-
396
- while(i<nsf) {
397
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
398
- else k=mod[i+1][0]-mod[i][0]-1;
399
- if(k>nsw) return false;
400
- //if(k>0) { //skip word exists
401
- if(k>(nsw-n)) {
402
- nstp=0; //# skipped stopwords between tokens
403
- while(k) {
404
- if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
405
- k--;
406
- }
407
- if(nstp<n) return false;
408
- }
409
- i++;
410
- }
411
-
412
- return true;
413
- }
414
-
415
-
416
- bool AbbrStra::skip_stop_ok2(long nsf, long nsw, long n)
417
- {
418
- long i=0, j, k, nstp;
419
-
420
- while(i<nsf) {
421
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
422
- else k=mod[i+1][0]-mod[i][0]-1;
423
- if((k>0)&&(k!=nsw)) return false;
424
- if(k>0) { //skip word exists
425
- nstp=0; //# skipped stopwords between tokens
426
- while(k) {
427
- if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
428
- k--;
429
- }
430
- if(nstp<n) return false;
431
- }
432
-
433
- i++;
434
- }
435
-
436
- return true;
437
- }
438
-
439
-
440
- bool AbbrStra::skipword_ok(long nsf, long nsw)
441
- {
442
- long i=0, j, k;
443
-
444
- while(i<nsf) {
445
- if(i==(nsf-1)) k=ntk-mod[i][0]-1;
446
- else k=mod[i+1][0]-mod[i][0]-1;
447
- if(k>nsw) return false;
448
- i++;
449
- }
450
-
451
- return true;
452
- }
453
-
454
-
455
- bool AbbrStra::is_subword(long nsf)
456
- {
457
- long i=0;
458
- char word[1000];
459
-
460
- while(i<nsf) {
461
- if(mod[i][1]!=0) {
462
- strcpy(word,tok[mod[i][0]]+mod[i][1]);
463
- if(wData->wrdset.count(word)==0) return false;
464
- }
465
- i++;
466
- }
467
-
468
- return true;
469
- }
470
-
471
-
472
- bool AbbrStra::is_BeginWrdMatch(long nsf, bool general)
473
- {
474
- long i=0, j;
475
- bool *bwm = new bool [ntk]; //BeginWrdMatch of a given tok
476
-
477
- for(j=0; j<ntk; j++) bwm[j] = false;
478
-
479
- while(i<nsf) {
480
- if(mod[i][1]==0)
481
- bwm[mod[i][0]] = true;
482
- else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) )
483
- bwm[mod[i][0]] = true;
484
- i++;
485
- }
486
-
487
- for(j=0; j<nsf; j++)
488
- if(!bwm[mod[j][0]]) {
489
- delete [] bwm;
490
- return false;
491
- }
492
-
493
- delete [] bwm;
494
-
495
- return true;
496
- }
497
-
498
-
499
- bool AbbrStra::is_WithinWrdMatch(long nsf, bool general)
500
- {
501
- long i=0, wwm=0;
502
-
503
- while(i<nsf) {
504
- if(!general) {
505
- if(mod[i][1]>0) wwm++;
506
- }
507
- else {
508
- if(mod[i][1]>0 && isalnum(tok[mod[i][0]][mod[i][1]-1])) wwm++;
509
- }
510
- i++;
511
- }
512
-
513
- if(wwm>0) return true;
514
- else return false;
515
- }
516
-
517
-
518
- bool AbbrStra::is_FirstLetMatch(long nsf, bool general)
519
- {
520
- long i=0, flm=0, flm2=0;
521
-
522
- while(i<nsf) {
523
- if(mod[i][1]==0) flm++;
524
- else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
525
- flm++; flm2++;
526
- }
527
- i++;
528
- }
529
-
530
- if(flm==nsf) return true;
531
- else return false;
532
- }
533
-
534
-
535
- bool AbbrStra::is_FirstLetMatch2(long nsf, bool general)
536
- {
537
- long i=0, flm=0, flm2=0;
538
-
539
- while(i<nsf) {
540
- if(mod[i][1]==0) flm++;
541
- else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
542
- flm++; flm2++;
543
- }
544
- i++;
545
- }
546
-
547
- if( (flm==nsf) && (flm2>=1) ) return true;
548
- else return false;
549
- }
550
-
551
-
552
- bool AbbrStra::is_FirstLetSMatch(const char *abbr, bool general)
553
- {
554
- long i=0, j=strlen(abbr)-1, flm=0, lsm=0;
555
-
556
- while(i<j) {
557
- if(mod[i][1]==0) flm++;
558
- else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) flm++;
559
- i++;
560
- }
561
-
562
- if( (tok[mod[j][0]][mod[j][1]]=='s') &&
563
- (mod[j][1]==(strlen(tok[mod[j][0]])-1)) &&
564
- mod[j][0]==mod[j-1][0] ) lsm++;
565
-
566
- if((flm==j) && (lsm==1)) return true;
567
- else return false;
568
- }
569
-
570
-
571
- bool AbbrStra::is_ContLetMatch(long nsf)
572
- {
573
- long i=0, cl=1;
574
-
575
- while(i<(nsf-1)) {
576
- if( mod[i][0]==mod[i+1][0] &&
577
- (mod[i][1]+1)==mod[i+1][1] ) cl++;
578
- i++;
579
- }
580
-
581
- if(cl>=2) return true;
582
- else return false;
583
- }
584
- //----
585
-
586
-
587
- //---1st ch must be alnum & at least one alphabet for all
588
- //str1: sf
589
- bool AbbrStra::set_condition(const char *str1)
590
- {
591
- int n=0, m=0, o=0;
592
-
593
- switch(setCondition) {
594
- case 1: //all alphabet SFs
595
- for(long i=strlen(str1)-1; i>=0; i--)
596
- if(!isalpha(str1[i]))
597
- return false;
598
- return true;
599
- break;
600
- case 2: //at least one non-alphabet
601
- if(!isalnum(str1[0])) return false;
602
- for(long i=strlen(str1)-1; i>=0; i--) {
603
- if(isalpha(str1[i])) n++;
604
- else m++;
605
- }
606
- if( (n>0) && (m>0) ) return true;
607
- else return false;
608
- break;
609
- case 3: //only alnum & at least one num
610
- for(long i=strlen(str1)-1; i>=0; i--) {
611
- if(!isalnum(str1[i])) return false;
612
- if(isalpha(str1[i])) n++;
613
- if(isdigit(str1[i])) m++;
614
- }
615
- if( (n>0) && (m>0) ) return true;
616
- else return false;
617
- break;
618
- case 4: //only alpha and non-alnum & at least one non-alnum
619
- if(!isalpha(str1[0])) return false;
620
- for(long i=strlen(str1)-1; i>=0; i--) {
621
- if(isdigit(str1[i])) return false;
622
- if(!isalnum(str1[i])) n++;
623
- }
624
- if(n>0) return true;
625
- else return false;
626
- break;
627
- case 5: //at least one non-alnum
628
- if(!isalnum(str1[0])) return false;
629
- for(long i=strlen(str1)-1; i>0; i--) {
630
- if(!isalnum(str1[i])) return true;
631
- }
632
- return false;
633
- break;
634
- case 6: //at least one num and non-alnum
635
- if(!isalnum(str1[0])) return false;
636
- for(long i=strlen(str1)-1; i>=0; i--) {
637
- if(isalpha(str1[i])) n++;
638
- if(isdigit(str1[i])) m++;
639
- if(!isalnum(str1[i])) o++;
640
- }
641
- if( (n>0) && (m>0) && (o>0) ) return true;
642
- else return false;
643
- break;
644
- case 7: //1+2 (SH algorithm)
645
- if(!isalnum(str1[0])) return false;
646
- for(long i=strlen(str1)-1; i>=0; i--)
647
- if(isalpha(str1[i])) return true;
648
- return false;
649
- break;
650
- default:
651
- cout << "Not defined set condition\n";
652
- exit(1);
653
- }
654
- }
655
-
656
- //---
657
- //same as FirstLet::set_condition
658
- //but requires extra set conditions
659
- bool FirstLetOneChSF::set_condition(const char *shrtf, const char *longf, char *str)
660
- {
661
- long i=0, len=strlen(shrtf), numtk;
662
- char tk[1000][1000];
663
-
664
- //sf conditions: all alphabet
665
- while(i<len && isalpha(shrtf[i])) i++;
666
- if(i!=len) return false;
667
-
668
- //lf conditions: #tok>=|SF|, 1st ch of words must be alphabet
669
- numtk = tokenize(longf,tk);
670
- if(len>numtk) return false;
671
-
672
- for(i=0; i<len; i++)
673
- str[i] = tk[numtk-len+i][0];
674
- str[i] = '\0';
675
-
676
- if(!is_alpha(str)) return false;
677
-
678
- return true;
679
- }
680
-
681
-
682
- long FirstLetOneChSF::strategy(const char *sf_, const char *str_) {
683
- long lna,lnt,flag;
684
- bool genFL=false; //1:allow 1-alpha for 1st ch of SF match, 0:don't
685
- char phr[10000], phrl[10000];
686
-
687
- str_tolower(sf_,sf);
688
- str_tolower(str_,text);
689
-
690
- get_str(str_,phr,1); //phr: 1st token of str from back
691
- str_tolower(phr,phrl);
692
- //conditions
693
- if(is_onealpha(phr)) return 0; //last token includes 1 alphabet
694
- if(isupper_str(phr)) return 0; //last token is all upper-case alphabet
695
- if(wData->stp.find(phrl)) return 0; //last token is stopword
696
- if(!wData->lfs.find(phrl)) return 0; //lfs (1-ch sf) for FirstLet match cases < 2
697
-
698
- token(text,tok);
699
- lna = strlen(sf);
700
- lnt = strlen(tok[ntk-1]);
701
-
702
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
703
- if(!flag) return 0;
704
-
705
- do {
706
- if(!skipword_ok(lna,0)) continue;
707
- if(!is_FirstLetMatch(lna,genFL)) continue; //not allow 1-alpha
708
-
709
- extract_lf(mod[0][0],ntk-1,str_);
710
- return 1;
711
- } while(search_backward_adv(sf,genFL));
712
-
713
- return 0;
714
- }
715
- //---
716
-
717
- bool FirstLet::set_condition(const char *shrtf, const char *longf, char *str)
718
- {
719
- long i=0, len=strlen(shrtf), numtk;
720
- char tk[1000][1000];
721
-
722
- //sf conditions
723
- while(i<len && isalpha(shrtf[i])) i++;
724
- if(i!=len) return false;
725
-
726
- //lf conditions
727
- numtk = tokenize(longf,tk);
728
- if(len>numtk) return false;
729
-
730
- for(i=0; i<len; i++)
731
- str[i] = tk[numtk-len+i][0];
732
- str[i] = '\0';
733
-
734
- if(!is_alpha(str)) return false;
735
-
736
- return true;
737
- }
738
-
739
-
740
- long FirstLet::strategy(const char *sf_, const char *str_) {
741
- long lna,lnt,flag;
742
- bool genFL=false; //1:allow 1-alpha for 1st ch of SF match, 0:don't
743
-
744
- str_tolower(sf_,sf);
745
- str_tolower(str_,text);
746
-
747
- token(text,tok);
748
- lna = strlen(sf);
749
- lnt = strlen(tok[ntk-1]);
750
-
751
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
752
- if(!flag) return 0;
753
-
754
- do {
755
- if(!skipword_ok(lna,0)) continue;
756
- if(!is_FirstLetMatch(lna,genFL)) continue; //not allow 1-alpha
757
-
758
- extract_lf(mod[0][0],ntk-1,str_);
759
- return 1;
760
- } while(search_backward_adv(sf,genFL));
761
-
762
- return 0;
763
- }
764
-
765
-
766
- long FirstLetGen::strategy(const char *sf_, const char *str_)
767
- {
768
- long lna,lnt,flag;
769
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
770
-
771
- str_tolower(sf_,sf);
772
- str_tolower(str_,text);
773
-
774
- token(text,tok);
775
- lna = strlen(sf);
776
- lnt = strlen(tok[ntk-1]);
777
-
778
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
779
- if(!flag) return 0;
780
-
781
- do {
782
- if(!skipword_ok(lna,0)) continue;
783
- if(!is_FirstLetMatch2(lna,genFL)) continue; //at least 1-alpha
784
-
785
- extract_lf(mod[0][0],ntk-1,str_);
786
- return 1;
787
- } while(search_backward_adv(sf,genFL));
788
-
789
- return 0;
790
- }
791
-
792
-
793
- long FirstLetGen2::strategy(const char *sf_, const char *str_)
794
- {
795
- long lna,lnt,flag;
796
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
797
-
798
- str_tolower(sf_,sf);
799
- str_tolower(str_,text);
800
-
801
- token(text,tok);
802
- lna = strlen(sf);
803
- lnt = strlen(tok[ntk-1]);
804
-
805
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
806
- if(!flag) return 0;
807
-
808
- do {
809
- if(!skipword_ok(lna,0)) continue;
810
- if(!is_FirstLetMatch(lna,genFL)) continue;
811
-
812
- extract_lf(mod[0][0],ntk-1,str_);
813
- return 1;
814
- } while(search_backward_adv(sf,genFL));
815
-
816
- return 0;
817
- }
818
-
819
-
820
- bool FirstLetGenS::set_condition(const char *str)
821
- {
822
- if(str[strlen(str)-1]!='s') return false;
823
-
824
- for(long i=strlen(str)-2; i>=0; i--) {
825
- if(!isupper(str[i])) return false;
826
- if(!isalpha(str[i])) return false; //necessary?
827
- }
828
-
829
- return true;
830
- }
831
-
832
-
833
- long FirstLetGenS::strategy(const char *sf_, const char *str_)
834
- {
835
- long lna,lnt,flag;
836
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
837
-
838
- if(!set_condition(sf_)) return 0;
839
-
840
- str_tolower(sf_,sf);
841
- str_tolower(str_,text);
842
-
843
- token(text,tok);
844
- lna = strlen(sf);
845
- lnt = strlen(tok[ntk-1]);
846
-
847
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
848
- if(!flag) return 0;
849
-
850
- do {
851
- if(!skipword_ok(lna,0)) continue;
852
- if(!is_FirstLetSMatch(sf,genFL)) continue;
853
-
854
- extract_lf(mod[0][0],ntk-1,str_);
855
- return 1;
856
- } while(search_backward_adv(sf,genFL));
857
-
858
- return 0;
859
- }
860
-
861
-
862
- long FirstLetGenStp::strategy(const char *sf_, const char *str_)
863
- {
864
- long lna,lnt,flag;
865
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
866
-
867
- str_tolower(sf_,sf);
868
- str_tolower(str_,text);
869
-
870
- token(text,tok);
871
- lna = strlen(sf);
872
- lnt = strlen(tok[ntk-1]);
873
-
874
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
875
- if(!flag) return 0;
876
-
877
- do {
878
- if(!exist_skipword(lna)) continue;
879
- if(!stopword_ok(lna,1)) continue;
880
- if(!is_FirstLetMatch(lna,genFL)) continue;
881
-
882
- extract_lf(mod[0][0],ntk-1,str_);
883
- return 1;
884
- } while(search_backward_adv(sf,genFL));
885
-
886
- return 0;
887
- }
888
-
889
-
890
- long FirstLetGenStp2::strategy(const char *sf_, const char *str_)
891
- {
892
- long lna,lnt,flag;
893
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
894
-
895
- str_tolower(sf_,sf);
896
- str_tolower(str_,text);
897
-
898
- token(text,tok);
899
- lna = strlen(sf);
900
- lnt = strlen(tok[ntk-1]);
901
-
902
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
903
- if(!flag) return 0;
904
-
905
- do {
906
- if(!exist_n_stopwords(lna,2)) continue;
907
- if(!is_FirstLetMatch(lna,genFL)) continue;
908
-
909
- extract_lf(mod[0][0],ntk-1,str_);
910
- return 1;
911
- } while(search_backward_adv(sf,genFL));
912
-
913
- return 0;
914
- }
915
-
916
-
917
- long FirstLetGenSkp::strategy(const char *sf_, const char *str_)
918
- {
919
- long lna,lnt,flag;
920
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
921
-
922
- str_tolower(sf_,sf);
923
- str_tolower(str_,text);
924
-
925
- token(text,tok);
926
- lna = strlen(sf);
927
- lnt = strlen(tok[ntk-1]);
928
-
929
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
930
- if(!flag) return 0;
931
-
932
- do {
933
- if(!exist_skipword(lna)) continue;
934
- if(!skipword_ok(lna,1)) continue;
935
- if(!is_FirstLetMatch(lna,genFL)) continue;
936
-
937
- extract_lf(mod[0][0],ntk-1,str_);
938
- return 1;
939
- } while(search_backward_adv(sf,genFL));
940
-
941
- return 0;
942
- }
943
-
944
-
945
- long WithinWrdWrd::strategy(const char *sf_, const char *str_)
946
- {
947
- long lna,lnt,flag;
948
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
949
-
950
- str_tolower(sf_,sf);
951
- str_tolower(str_,text);
952
-
953
- token(text,tok);
954
- lna = strlen(sf);
955
- lnt = strlen(tok[ntk-1]);
956
-
957
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
958
- if(!flag) return 0;
959
-
960
- do {
961
- if(!skipword_ok(lna,0)) continue;
962
- if(!is_subword(lna)) continue;
963
- if(!is_WithinWrdMatch(lna,genFL)) continue;
964
-
965
- extract_lf(mod[0][0],ntk-1,str_);
966
- return 1;
967
- } while(search_backward_adv(sf,genFL));
968
-
969
- return 0;
970
- }
971
-
972
-
973
- long WithinWrdFWrd::strategy(const char *sf_, const char *str_)
974
- {
975
- long lna,lnt,flag;
976
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
977
-
978
- str_tolower(sf_,sf);
979
- str_tolower(str_,text);
980
-
981
- token(text,tok);
982
- lna = strlen(sf);
983
- lnt = strlen(tok[ntk-1]);
984
-
985
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
986
- if(!flag) return 0;
987
-
988
- do {
989
- if(!skipword_ok(lna,0)) continue;
990
- if(!is_subword(lna)) continue;
991
- if(!is_BeginWrdMatch(lna,genFL)) continue;
992
- if(!is_WithinWrdMatch(lna,genFL)) continue;
993
-
994
- extract_lf(mod[0][0],ntk-1,str_);
995
- return 1;
996
- } while(search_backward_adv(sf,genFL));
997
-
998
- return 0;
999
- }
1000
-
1001
-
1002
- long WithinWrdFWrdSkp::strategy(const char *sf_, const char *str_)
1003
- {
1004
- long lna,lnt,flag;
1005
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1006
-
1007
- str_tolower(sf_,sf);
1008
- str_tolower(str_,text);
1009
-
1010
- token(text,tok);
1011
- lna = strlen(sf);
1012
- lnt = strlen(tok[ntk-1]);
1013
-
1014
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1015
- if(!flag) return 0;
1016
-
1017
- do {
1018
- if(!exist_skipword(lna)) continue;
1019
- if(!skipword_ok(lna,1)) continue;
1020
- if(!is_subword(lna)) continue;
1021
- if(!is_BeginWrdMatch(lna,genFL)) continue;
1022
- if(!is_WithinWrdMatch(lna,genFL)) continue;
1023
-
1024
- extract_lf(mod[0][0],ntk-1,str_);
1025
- return 1;
1026
- } while(search_backward_adv(sf,genFL));
1027
-
1028
- return 0;
1029
- }
1030
-
1031
-
1032
- long WithinWrdLet::strategy(const char *sf_, const char *str_)
1033
- {
1034
- long lna,lnt,flag;
1035
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1036
-
1037
- str_tolower(sf_,sf);
1038
- str_tolower(str_,text);
1039
-
1040
- token(text,tok);
1041
- lna = strlen(sf);
1042
- lnt = strlen(tok[ntk-1]);
1043
-
1044
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1045
- if(!flag) return 0;
1046
-
1047
- do {
1048
- if(!skipword_ok(lna,0)) continue;
1049
- if(!is_WithinWrdMatch(lna,genFL)) continue;
1050
-
1051
- extract_lf(mod[0][0],ntk-1,str_);
1052
- return 1;
1053
- } while(search_backward_adv(sf,genFL));
1054
-
1055
- return 0;
1056
- }
1057
-
1058
-
1059
- long WithinWrdFLet::strategy(const char *sf_, const char *str_)
1060
- {
1061
- long lna,lnt,flag;
1062
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1063
-
1064
- str_tolower(sf_,sf);
1065
- str_tolower(str_,text);
1066
-
1067
- token(text,tok);
1068
- lna = strlen(sf);
1069
- lnt = strlen(tok[ntk-1]);
1070
-
1071
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1072
- if(!flag) return 0;
1073
-
1074
- do {
1075
- if(!skipword_ok(lna,0)) continue;
1076
- if(!is_BeginWrdMatch(lna,genFL)) continue;
1077
- if(!is_WithinWrdMatch(lna,genFL)) continue;
1078
-
1079
- extract_lf(mod[0][0],ntk-1,str_);
1080
- return 1;
1081
- } while(search_backward_adv(sf,genFL));
1082
-
1083
- return 0;
1084
- }
1085
-
1086
-
1087
- long WithinWrdFLetSkp::strategy(const char *sf_, const char *str_)
1088
- {
1089
- long lna,lnt,flag;
1090
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1091
-
1092
- str_tolower(sf_,sf);
1093
- str_tolower(str_,text);
1094
-
1095
- token(text,tok);
1096
- lna = strlen(sf);
1097
- lnt = strlen(tok[ntk-1]);
1098
-
1099
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1100
- if(!flag) return 0;
1101
-
1102
- do {
1103
- if(!exist_skipword(lna)) continue;
1104
- if(!skipword_ok(lna,1)) continue;
1105
- if(!is_BeginWrdMatch(lna,genFL)) continue;
1106
- if(!is_WithinWrdMatch(lna,genFL)) continue;
1107
-
1108
- extract_lf(mod[0][0],ntk-1,str_);
1109
- return 1;
1110
- } while(search_backward_adv(sf,genFL));
1111
-
1112
- return 0;
1113
- }
1114
-
1115
-
1116
- long ContLet::strategy(const char *sf_, const char *str_)
1117
- {
1118
- long lna,lnt,flag;
1119
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1120
-
1121
- str_tolower(sf_,sf);
1122
- str_tolower(str_,text);
1123
-
1124
- token(text,tok);
1125
- lna = strlen(sf);
1126
- lnt = strlen(tok[ntk-1]);
1127
-
1128
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1129
- if(!flag) return 0;
1130
-
1131
- do {
1132
- if(!skipword_ok(lna,0)) continue;
1133
- if(!is_BeginWrdMatch(lna,genFL)) continue;
1134
- if(!is_ContLetMatch(lna)) continue;
1135
-
1136
- extract_lf(mod[0][0],ntk-1,str_);
1137
- return 1;
1138
- } while(search_backward_adv(sf,genFL));
1139
-
1140
- return 0;
1141
- }
1142
-
1143
-
1144
- long ContLetSkp::strategy(const char *sf_, const char *str_)
1145
- {
1146
- long lna,lnt,flag;
1147
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1148
-
1149
- str_tolower(sf_,sf);
1150
- str_tolower(str_,text);
1151
-
1152
- token(text,tok);
1153
- lna = strlen(sf);
1154
- lnt = strlen(tok[ntk-1]);
1155
-
1156
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1157
- if(!flag) return 0;
1158
-
1159
- do {
1160
- if(!exist_skipword(lna)) continue;
1161
- if(!skipword_ok(lna,1)) continue;
1162
- if(!is_BeginWrdMatch(lna,genFL)) continue;
1163
- if(!is_ContLetMatch(lna)) continue;
1164
-
1165
- extract_lf(mod[0][0],ntk-1,str_);
1166
- return 1;
1167
- } while(search_backward_adv(sf,genFL));
1168
-
1169
- return 0;
1170
- }
1171
-
1172
-
1173
- long AnyLet::strategy(const char *sf_, const char *str_)
1174
- {
1175
- long lna,lnt,flag;
1176
- bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1177
-
1178
- str_tolower(sf_,sf);
1179
- str_tolower(str_,text);
1180
-
1181
- token(text,tok);
1182
- lna = strlen(sf);
1183
- lnt = strlen(tok[ntk-1]);
1184
-
1185
- flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1186
- if(!flag) return 0;
1187
-
1188
- do {
1189
- if(!skipword_ok(lna,1)) continue;
1190
-
1191
- extract_lf(mod[0][0],ntk-1,str_);
1192
- return 1;
1193
- } while(search_backward_adv(sf,genFL));
1194
-
1195
- return 0;
1196
- }
1197
-
1198
-
1199
-
1200
- //-----
1201
- AbbrStra * StratUtil::strat_factory(string name)
1202
- {
1203
- if(name=="FirstLetOneChSF") return new FirstLetOneChSF;
1204
- else if(name=="FirstLet") return new FirstLet;
1205
- else if(name=="FirstLetGen") return new FirstLetGen;
1206
- else if(name=="FirstLetGen2") return new FirstLetGen2;
1207
- else if(name=="FirstLetGenS") return new FirstLetGenS;
1208
- else if(name=="FirstLetGenStp") return new FirstLetGenStp;
1209
- else if(name=="FirstLetGenStp2") return new FirstLetGenStp2;
1210
- else if(name=="FirstLetGenSkp") return new FirstLetGenSkp;
1211
- else if(name=="WithinWrdWrd") return new WithinWrdWrd;
1212
- else if(name=="WithinWrdFWrd") return new WithinWrdFWrd;
1213
- else if(name=="WithinWrdFWrdSkp") return new WithinWrdFWrdSkp;
1214
- else if(name=="WithinWrdLet") return new WithinWrdLet;
1215
- else if(name=="WithinWrdFLet") return new WithinWrdFLet;
1216
- else if(name=="WithinWrdFLetSkp") return new WithinWrdFLetSkp;
1217
- else if(name=="ContLet") return new ContLet;
1218
- else if(name=="ContLetSkp") return new ContLetSkp;
1219
- else if(name=="AnyLet") return new AnyLet;
1220
- else { cout << "Fail strat_factory\n"; exit(1); }
1221
- }
1222
-
1223
-
1224
- //check if sf is ok and assign a group
1225
- //if sf length > 5, use 5!!
1226
- //grp will be Al+#ChInSF, Num+#ChInSF, or Spec+#ChInSF
1227
- bool StratUtil::group_sf(const char *sf, string &grp)
1228
- {
1229
- long i, j, len=strlen(sf);
1230
- long al=0, num=0, nonalnum=0;
1231
- long paren=0, sbrac=0;
1232
-
1233
- grp = ""; // if failure, no group
1234
-
1235
- if(!isalnum(sf[0])) return false; //1st ch of sf must be alnum
1236
- for(i=0; i<len; i++) {
1237
- if(isalpha(sf[i])) al++;
1238
- else if(isdigit(sf[i])) num++;
1239
- else nonalnum++;
1240
- }
1241
- if(al<1) return false; //at least one alphabet
1242
-
1243
- //false for unbalanced parentheses or square brackets
1244
- for(i=len-1; i>=0; i--) {
1245
- if(sf[i]=='(') paren++;
1246
- if(sf[i]==')') paren--;
1247
- if(sf[i]=='[') sbrac++;
1248
- if(sf[i]==']') sbrac--;
1249
- }
1250
- if(paren!=0 || sbrac!=0) return false;
1251
-
1252
- if(al==len) grp.assign("Al");
1253
- else if(num>0) grp.assign("Num");
1254
- else if(nonalnum>0) grp.assign("Spec");
1255
- else { cout << "No sf group\n"; exit(1); }
1256
-
1257
- //append sf length
1258
- len = len>5 ? 5 : len;
1259
-
1260
- switch(len) {
1261
- case 1:
1262
- grp.append("1");
1263
- break;
1264
- case 2:
1265
- grp.append("2");
1266
- break;
1267
- case 3:
1268
- grp.append("3");
1269
- break;
1270
- case 4:
1271
- grp.append("4");
1272
- break;
1273
- case 5:
1274
- grp.append("5");
1275
- break;
1276
- default:
1277
- cout << "Not defined #-ch SF" << endl;
1278
- exit(1);
1279
- }
1280
-
1281
- return true;
1282
- }
1283
-
1284
- //add the condition |lf|>|sf|
1285
- bool StratUtil::group_sf(const char *sf, const char *lf, string &grp)
1286
- {
1287
- long i, j, len=strlen(sf);
1288
- long al=0, num=0, nonalnum=0;
1289
- long paren=0, sbrac=0;
1290
-
1291
- if(strlen(lf)<len) return false; //|lf|>|sf|
1292
- if(!isalnum(sf[0])) return false; //1st ch of sf must be alnum
1293
- for(i=0; i<len; i++) {
1294
- if(isalpha(sf[i])) al++;
1295
- else if(isdigit(sf[i])) num++;
1296
- else nonalnum++;
1297
- }
1298
- if(al<1) return false; //at least one alphabet
1299
- if(al>10) return false; //|alpha sf| is at most 10
1300
- if(num_token(sf)>2) return false; //added Feb-21-08
1301
-
1302
- //false for unbalanced parentheses or square brackets
1303
- for(i=len-1; i>=0; i--) {
1304
- if(sf[i]=='(') paren++;
1305
- if(sf[i]==')') paren--;
1306
- if(sf[i]=='[') sbrac++;
1307
- if(sf[i]==']') sbrac--;
1308
- }
1309
- if(paren!=0 || sbrac!=0) return false;
1310
-
1311
- if(al==len) grp.assign("Al");
1312
- else if(num>0) grp.assign("Num");
1313
- else if(nonalnum>0) grp.assign("Spec");
1314
- else { cout << "No sf group\n"; exit(1); }
1315
-
1316
- //append sf length
1317
- len = len>5 ? 5 : len;
1318
-
1319
- switch(len) {
1320
- case 1:
1321
- grp.append("1");
1322
- break;
1323
- case 2:
1324
- grp.append("2");
1325
- break;
1326
- case 3:
1327
- grp.append("3");
1328
- break;
1329
- case 4:
1330
- grp.append("4");
1331
- break;
1332
- case 5:
1333
- grp.append("5");
1334
- break;
1335
- default:
1336
- cout << "Not defined #-ch SF" << endl;
1337
- exit(1);
1338
- }
1339
-
1340
- return true;
1341
- }
1342
-
1343
-
1344
- //remove non-alnum in str1 and save it to str2
1345
- void StratUtil::remove_nonAlnum(const char *str1, char *str2)
1346
- {
1347
- long i=0, j=0;
1348
-
1349
- while(str1[i]) {
1350
- if(isalnum(str1[i])) {
1351
- str2[j] = str1[i];
1352
- j++;
1353
- }
1354
- i++;
1355
- }
1356
- str2[j] = '\0';
1357
- }
1358
-
1359
-
1360
- vector<string> StratUtil::get_strats(string s)
1361
- {
1362
- if(s=="Al1") return Al1;
1363
- else if(s=="Al2") return Al2;
1364
- else if(s=="Al3") return Al3;
1365
- else if(s=="Al4") return Al4;
1366
- else if(s=="Al5") return Al5;
1367
- else if(s=="Num2") return Num2;
1368
- else if(s=="Num3") return Num3;
1369
- else if(s=="Num4") return Num4;
1370
- else if(s=="Num5") return Num5;
1371
- else if(s=="Spec2") return Spec2;
1372
- else if(s=="Spec3") return Spec3;
1373
- else if(s=="Spec4") return Spec4;
1374
- else if(s=="Spec5") return Spec5;
1375
- else { cout << "Incorrect name\n"; exit(1); }
1376
- }
1377
-
1378
-
1379
- void StratUtil::push_back_strat(string sgp, string strat)
1380
- {
1381
- if(sgp=="Al1") Al1.push_back(strat);
1382
- else if(sgp=="Al2") Al2.push_back(strat);
1383
- else if(sgp=="Al3") Al3.push_back(strat);
1384
- else if(sgp=="Al4") Al4.push_back(strat);
1385
- else if(sgp=="Al5") Al5.push_back(strat);
1386
- else if(sgp=="Num2") Num2.push_back(strat);
1387
- else if(sgp=="Num3") Num3.push_back(strat);
1388
- else if(sgp=="Num4") Num4.push_back(strat);
1389
- else if(sgp=="Num5") Num5.push_back(strat);
1390
- else if(sgp=="Spec2") Spec2.push_back(strat);
1391
- else if(sgp=="Spec3") Spec3.push_back(strat);
1392
- else if(sgp=="Spec4") Spec4.push_back(strat);
1393
- else if(sgp=="Spec5") Spec5.push_back(strat);
1394
- }
1395
-
1396
-
1397
- long StratUtil::exist_upperal(const char *str)
1398
- {
1399
- long i, len=strlen(str);
1400
-
1401
- for(i=0; i<len; i++)
1402
- if(isupper(str[i]))
1403
- return 1;
1404
- return 0;
1405
- }
1406
-
1407
- long StratUtil::num_token(const char *str)
1408
- {
1409
- long i,j=0,k=0;
1410
- long n=strlen(str)-1;
1411
-
1412
- while(isblank(str[n])) n--;
1413
-
1414
- while(str[j]){
1415
- while(isblank(str[j]))j++;
1416
- i=j;
1417
- while((str[j])&&(!isblank(str[j])))j++;
1418
- if(str[j]){
1419
- k++;
1420
- j++;
1421
- }
1422
- }
1423
- if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
1424
- return k+1; //# tokens
1425
- }
1426
- //-----
 
1
+ #include "AbbrStra.h"
2
+ #include <runn.h>
3
+ #include <vector>
4
+ #include <fstream>
5
+ #include <iostream>
6
+
7
+
8
+ WordData::WordData(const char *wrdnam, const char *stpnam,
9
+ const char *lfsnam) :
10
+ wrdset(wrdnam), stp(stpnam), lfs(lfsnam)
11
+ {
12
+ wrdset.set_path_name("Ab3P");
13
+ wrdset.gopen_ctable_map();
14
+ stp.set_path_name("Ab3P");
15
+ stp.gopen_htable_map();
16
+ lfs.set_path_name("Ab3P");
17
+ lfs.gopen_htable_map();
18
+ }
19
+
20
+ WordData::~WordData()
21
+ {
22
+ wrdset.gclose_ctable_map();
23
+ stp.gclose_htable_map();
24
+ lfs.gclose_htable_map();
25
+ }
26
+
27
+
28
+ AbbrStra::AbbrStra()
29
+ {
30
+ npairs = tpairs = nsfs = nmatchs = amatchs = 0;
31
+ }
32
+
33
+
34
+ AbbrStra::~AbbrStra()
35
+ {
36
+ }
37
+
38
+
39
+ void AbbrStra::token(const char *str, char lst[1000][1000])
40
+ {
41
+ long i,j=0,k=0;
42
+ long n=strlen(str)-1;
43
+
44
+ while(isblank(str[n])) n--;
45
+
46
+ while(str[j]){
47
+ while(isblank(str[j]))j++;
48
+ i=j;
49
+ while((str[j])&&(!isblank(str[j])))j++;
50
+ strncpy(lst[k],str+i,j-i);
51
+ lst[k][j-i]='\0';
52
+ if(str[j]){
53
+ k++;
54
+ j++;
55
+ }
56
+ }
57
+ if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
58
+ ntk=k+1; //# tokens, ntk is data member
59
+ }
60
+
61
+
62
+ long AbbrStra::tokenize(const char *str, char lst[1000][1000])
63
+ {
64
+ long i,j=0,k=0;
65
+ long n=strlen(str)-1;
66
+
67
+ while(isblank(str[n])) n--;
68
+
69
+ while(str[j]){
70
+ while(isblank(str[j]))j++;
71
+ i=j;
72
+ while((str[j])&&(!isblank(str[j])))j++;
73
+ strncpy(lst[k],str+i,j-i);
74
+ lst[k][j-i]='\0';
75
+ if(str[j]){
76
+ k++;
77
+ j++;
78
+ }
79
+ }
80
+ if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
81
+ return k+1; //# tokens
82
+ }
83
+
84
+
85
+ long AbbrStra::num_token(const char *str)
86
+ {
87
+ long i,j=0,k=0;
88
+ long n=strlen(str)-1;
89
+
90
+ while(isblank(str[n])) n--;
91
+
92
+ while(str[j]){
93
+ while(isblank(str[j]))j++;
94
+ i=j;
95
+ while((str[j])&&(!isblank(str[j])))j++;
96
+ if(str[j]){
97
+ k++;
98
+ j++;
99
+ }
100
+ }
101
+ if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
102
+ return k+1; //# tokens
103
+ }
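A minimal sketch (editorial illustration, not from the original source; `st` is an assumed instance of a concrete strategy class) of the tokenizer contract shared by token, tokenize and num_token, including the trailing-blank fix noted in the Jan-17-08 comments:
    //   char tk[1000][1000];
    //   long n = st.tokenize("ab cd ", tk);  // n == 2; trailing blanks add no token
    //   // tk[0] == "ab", tk[1] == "cd"; st.token("ab cd ", tk) stores the count in st.ntk instead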
104
+
105
+
106
+ // fch gets the 1st char of each of the last num tokens of str
107
+ long AbbrStra::first_ch(const char *str, char *fch, long num)
108
+ {
109
+ long i, j, numtk;
110
+ char tk[1000][1000];
111
+
112
+ numtk = tokenize(str,tk);
113
+ if(num>numtk) return 0;
114
+
115
+ for(i=0; i<num; i++)
116
+ fch[i] = tk[numtk-num+i][0];
117
+
118
+ return 1;
119
+ }
120
+
121
+ long AbbrStra::is_upperal(const char *str)
122
+ {
123
+ for(long i=strlen(str)-1; i>=0; i--)
124
+ if(!isupper(str[i]) || !isalpha(str[i]))
125
+ return 0;
126
+ return 1;
127
+ }
128
+
129
+ long AbbrStra::is_alpha(const char *str)
130
+ {
131
+ for(long i=strlen(str)-1; i>=0; i--)
132
+ if(!isalpha(str[i]))
133
+ return 0;
134
+ return 1;
135
+ }
136
+
137
+
138
+ // str2 will be the lower-cased copy of str1
139
+ void AbbrStra::str_tolower(const char *str1, char *str2)
140
+ {
141
+ long i=0;
142
+
143
+ while(str1[i]) {
144
+ str2[i] = tolower(str1[i]);
145
+ i++;
146
+ }
147
+ str2[i] = '\0';
148
+ }
149
+
150
+ //copy num tokens from back of str1 to str2
151
+ long AbbrStra::get_str(const char *str1, char *str2, long num)
152
+ {
153
+ char ch, tk[1000][1000];
154
+ long i, j, numtk;
155
+
156
+ if(num<0) { cout<<"num<0\n"; exit(1); }
157
+ numtk = tokenize(str1,tk);
158
+ if(numtk<num) return 0;
159
+
160
+ strcpy(str2,tk[numtk-num]);
161
+ for(i=1; i<num; i++) {
162
+ strcat(str2," ");
163
+ strcat(str2,tk[numtk-num+i]);
164
+ }
165
+
166
+ return 1;
167
+ }
168
+
169
+ bool AbbrStra::isupper_str(const char *str)
170
+ {
171
+ long i, len=strlen(str);
172
+
173
+ for(i=0; i<len; i++)
174
+ if(isalpha(str[i]) && !isupper(str[i]))
175
+ return false;
176
+
177
+ return true;
178
+ }
179
+
180
+ bool AbbrStra::is_onealpha(const char *str)
181
+ {
182
+ long i, j=0, len=strlen(str);
183
+
184
+ for(i=0; i<len; i++)
185
+ if(isalpha(str[i])) j++;
186
+
187
+ if(j==1) return true;
188
+ else return false;
189
+ }
190
+
191
+ long AbbrStra::count_upperstr(const char *str)
192
+ {
193
+ long i, j, k, numtk;
194
+ char tk[1000][1000];
195
+
196
+ numtk = tokenize(str,tk);
197
+
198
+ j = 0;
199
+ for(i=numtk-1; i>=0; i--) {
200
+ if(isupper(tk[i][0])) j++;
201
+ else return j;
202
+ }
203
+
204
+ return j;
205
+ }
206
+
207
+ void AbbrStra::get_alpha(const char *str1, char *str2)
208
+ {
209
+ long i = 0, j = 0;
210
+ long len = strlen(str1);
211
+
212
+ while(i<len) {
213
+ if(isalpha(str1[i])) {
214
+ str2[j] = str1[i];
215
+ j++;
216
+ }
217
+ i++;
218
+ }
219
+ str2[j] = '\0';
220
+ }
221
+
222
+
223
+ bool AbbrStra::lf_ok(const char *shrtf, const char *longf)
224
+ {
225
+ long i;
226
+ long paren=0, sbrac=0;
227
+ string s, l;
228
+
229
+ //false for unbalanced parentheses or square brackets
230
+ for(i=strlen(longf)-1; i>=0; i--) {
231
+ if(longf[i]=='(') paren++;
232
+ if(longf[i]==')') paren--;
233
+ if(longf[i]=='[') sbrac++;
234
+ if(longf[i]==']') sbrac--;
235
+ }
236
+ if(paren!=0 || sbrac!=0) return false;
237
+
238
+ s.assign(shrtf);
239
+ l.assign(longf);
240
+
241
+ for(i=0; i<s.length(); i++) s[i]=tolower(s[i]);
242
+ for(i=0; i<l.length(); i++) l[i]=tolower(l[i]);
243
+
244
+ //false if LF words contain SF
245
+ if( (" "+l+" ").find(" "+s+" ")!=string::npos ) return false;
246
+
247
+ return true;
248
+ }
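For concreteness, the two rejections lf_ok encodes, shown on literals of our own choosing (not from the source):
    //   lf_ok("abg", "alpha beta (gamma")  -> false: unbalanced '('
    //   lf_ok("ab",  "ab initio")          -> false: " ab initio " contains " ab "
    //   lf_ok("abg", "alpha beta gamma")   -> true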
249
+
250
+
251
+ //first=true: allow 1-alpha; false: don't allow
252
+ long AbbrStra::search_backward(long sloc, long tinx, long tloc, const char *abbr, bool first)
253
+ {
254
+ long sfloc=sloc, tkinx=tinx, tkloc=tloc;
255
+
256
+ while(sfloc>=0) {
257
+ loop1: while((tkloc>=0)&&(tok[tkinx][tkloc]!=abbr[sfloc])) tkloc--;
258
+ if(tkloc<0) {
259
+ tkinx--;
260
+ if(tkinx<0) return 0; //moved to here (Sep-14-07)
261
+ tkloc=strlen(tok[tkinx])-1;
262
+ }
263
+ else {
264
+ if(sfloc==0) {
265
+ if(tkloc!=0) {
266
+ if(!first) { tkloc--; goto loop1; }
267
+ else if(isalnum(tok[tkinx][tkloc-1])) { tkloc--; goto loop1; }
268
+ }
269
+ }
270
+ mod[sfloc][0]=tkinx;
271
+ mod[sfloc][1]=tkloc;
272
+ sfloc--; tkloc--;
273
+ }
274
+ }
275
+
276
+ return 1;
277
+ }
278
+
279
+ long AbbrStra::search_backward_adv(const char *abbr, bool flag)
280
+ {
281
+ long i;
282
+ long lna=strlen(abbr);
283
+
284
+ i=0;
285
+ while(i<lna){
286
+ if(search_backward(i,mod[i][0],mod[i][1]-1,abbr,flag)) return 1;
287
+ i++;
288
+ }
289
+ return 0;
290
+ }
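A worked example (ours, not from the source) of the bookkeeping both search functions share: for SF character i, mod[i][0] is the matched token index and mod[i][1] the character offset inside that token.
    //   sf = "ab", text = "alpha beta"  =>  tok[0]="alpha", tok[1]="beta", ntk=2
    //   search_backward(1, 1, 3, "ab", true) returns 1 with
    //     mod[0] = {0,0}   // 'a' at the start of "alpha"
    //     mod[1] = {1,0}   // 'b' at the start of "beta"
    //   search_backward_adv(sf, true) then resumes the scan to enumerate alternative matches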
291
+
292
+ void AbbrStra::extract_lf(long begin, long end)
293
+ {
294
+ strcpy(lf,tok[begin]);
295
+ for(long i=begin+1; i<=end; i++) {
296
+ strcat(lf," ");
297
+ strcat(lf,tok[i]);
298
+ }
299
+ }
300
+
301
+
302
+ void AbbrStra::extract_lf(long begin, long end, const char *str)
303
+ {
304
+ token(str,tok);
305
+ strcpy(lf,tok[begin]);
306
+ for(long i=begin+1; i<=end; i++) {
307
+ strcat(lf," ");
308
+ strcat(lf,tok[i]);
309
+ }
310
+ }
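Continuing that example, the call every strategy below makes on success, extract_lf(mod[0][0], ntk-1, str_), re-tokenizes the original-case string and joins the matched span into lf (our illustration):
    //   extract_lf(0, 1, "Alpha Beta")  =>  lf == "Alpha Beta" (original case preserved)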
311
+
312
+ //---
313
+ bool AbbrStra::exist_skipword(long nsf)
314
+ {
315
+ long i=0, j=0, k;
316
+
317
+ while(i<nsf) {
318
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
319
+ else k=mod[i+1][0]-mod[i][0]-1;
320
+ if(k>0) j+=k;
321
+ i++;
322
+ }
323
+
324
+ if(j>0) return true;
325
+ else return false;
326
+ }
327
+
328
+
329
+ bool AbbrStra::exist_n_skipwords(long nsf, long n)
330
+ {
331
+ long i=0, j, k;
332
+ bool flag=false;
333
+
334
+ //k: # skip words
335
+ while(i<nsf) {
336
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
337
+ else k=mod[i+1][0]-mod[i][0]-1;
338
+ if(k>n) return false;
339
+ if(k==n) flag=true;
340
+ i++;
341
+ }
342
+
343
+ if(flag) return true;
344
+ else return false;
345
+ }
346
+
347
+ //true if n consecutive skipped stopwords exist between tokens (no gap may exceed n)
348
+ bool AbbrStra::exist_n_stopwords(long nsf, long n)
349
+ {
350
+ long i=0, j, k;
351
+ bool flag=false;
352
+
353
+ while(i<nsf) {
354
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
355
+ else k=mod[i+1][0]-mod[i][0]-1;
356
+ if(k>n) return false;
357
+ if(k==n) flag=true;
358
+ if(k>0) { //skip word exists
359
+ while(k) {
360
+ if(!wData->stp.find(tok[mod[i][0]+k])) return false;
361
+ k--;
362
+ }
363
+ }
364
+ i++;
365
+ }
366
+
367
+ if(flag) return true;
368
+ else return false;
369
+ }
370
+
371
+
372
+ bool AbbrStra::stopword_ok(long nsf, long nsw)
373
+ {
374
+ long i=0, j, k;
375
+
376
+ while(i<nsf) {
377
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
378
+ else k=mod[i+1][0]-mod[i][0]-1;
379
+ if(k>nsw) return false;
380
+ if(k>0) { //skip word exists
381
+ while(k) {
382
+ if(!wData->stp.find(tok[mod[i][0]+k])) return false;
383
+ k--;
384
+ }
385
+ }
386
+ i++;
387
+ }
388
+
389
+ return true;
390
+ }
391
+
392
+ bool AbbrStra::skip_stop_ok(long nsf, long nsw, long n)
393
+ {
394
+ long i=0, j, k, nstp;
395
+
396
+ while(i<nsf) {
397
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
398
+ else k=mod[i+1][0]-mod[i][0]-1;
399
+ if(k>nsw) return false;
400
+ //if(k>0) { //skip word exists
401
+ if(k>(nsw-n)) {
402
+ nstp=0; //# skipped stopwords between tokens
403
+ while(k) {
404
+ if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
405
+ k--;
406
+ }
407
+ if(nstp<n) return false;
408
+ }
409
+ i++;
410
+ }
411
+
412
+ return true;
413
+ }
414
+
415
+
416
+ bool AbbrStra::skip_stop_ok2(long nsf, long nsw, long n)
417
+ {
418
+ long i=0, j, k, nstp;
419
+
420
+ while(i<nsf) {
421
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
422
+ else k=mod[i+1][0]-mod[i][0]-1;
423
+ if((k>0)&&(k!=nsw)) return false;
424
+ if(k>0) { //skip word exists
425
+ nstp=0; //# skipped stopwords between tokens
426
+ while(k) {
427
+ if(wData->stp.find(tok[mod[i][0]+k])) nstp++;
428
+ k--;
429
+ }
430
+ if(nstp<n) return false;
431
+ }
432
+
433
+ i++;
434
+ }
435
+
436
+ return true;
437
+ }
438
+
439
+
440
+ bool AbbrStra::skipword_ok(long nsf, long nsw)
441
+ {
442
+ long i=0, j, k;
443
+
444
+ while(i<nsf) {
445
+ if(i==(nsf-1)) k=ntk-mod[i][0]-1;
446
+ else k=mod[i+1][0]-mod[i][0]-1;
447
+ if(k>nsw) return false;
448
+ i++;
449
+ }
450
+
451
+ return true;
452
+ }
453
+
454
+
455
+ bool AbbrStra::is_subword(long nsf)
456
+ {
457
+ long i=0;
458
+ char word[1000];
459
+
460
+ while(i<nsf) {
461
+ if(mod[i][1]!=0) {
462
+ strcpy(word,tok[mod[i][0]]+mod[i][1]);
463
+ if(wData->wrdset.count(word)==0) return false;
464
+ }
465
+ i++;
466
+ }
467
+
468
+ return true;
469
+ }
470
+
471
+
472
+ bool AbbrStra::is_BeginWrdMatch(long nsf, bool general)
473
+ {
474
+ long i=0, j;
475
+ bool *bwm = new bool [ntk]; //BeginWrdMatch of a given tok
476
+
477
+ for(j=0; j<ntk; j++) bwm[j] = false;
478
+
479
+ while(i<nsf) {
480
+ if(mod[i][1]==0)
481
+ bwm[mod[i][0]] = true;
482
+ else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) )
483
+ bwm[mod[i][0]] = true;
484
+ i++;
485
+ }
486
+
487
+ for(j=0; j<nsf; j++)
488
+ if(!bwm[mod[j][0]]) {
489
+ delete [] bwm;
490
+ return false;
491
+ }
492
+
493
+ delete [] bwm;
494
+
495
+ return true;
496
+ }
497
+
498
+
499
+ bool AbbrStra::is_WithinWrdMatch(long nsf, bool general)
500
+ {
501
+ long i=0, wwm=0;
502
+
503
+ while(i<nsf) {
504
+ if(!general) {
505
+ if(mod[i][1]>0) wwm++;
506
+ }
507
+ else {
508
+ if(mod[i][1]>0 && isalnum(tok[mod[i][0]][mod[i][1]-1])) wwm++;
509
+ }
510
+ i++;
511
+ }
512
+
513
+ if(wwm>0) return true;
514
+ else return false;
515
+ }
516
+
517
+
518
+ bool AbbrStra::is_FirstLetMatch(long nsf, bool general)
519
+ {
520
+ long i=0, flm=0, flm2=0;
521
+
522
+ while(i<nsf) {
523
+ if(mod[i][1]==0) flm++;
524
+ else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
525
+ flm++; flm2++;
526
+ }
527
+ i++;
528
+ }
529
+
530
+ if(flm==nsf) return true;
531
+ else return false;
532
+ }
533
+
534
+
535
+ bool AbbrStra::is_FirstLetMatch2(long nsf, bool general)
536
+ {
537
+ long i=0, flm=0, flm2=0;
538
+
539
+ while(i<nsf) {
540
+ if(mod[i][1]==0) flm++;
541
+ else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) {
542
+ flm++; flm2++;
543
+ }
544
+ i++;
545
+ }
546
+
547
+ if( (flm==nsf) && (flm2>=1) ) return true;
548
+ else return false;
549
+ }
550
+
551
+
552
+ bool AbbrStra::is_FirstLetSMatch(const char *abbr, bool general)
553
+ {
554
+ long i=0, j=strlen(abbr)-1, flm=0, lsm=0;
555
+
556
+ while(i<j) {
557
+ if(mod[i][1]==0) flm++;
558
+ else if( general && (!isalnum(tok[mod[i][0]][mod[i][1]-1])) ) flm++;
559
+ i++;
560
+ }
561
+
562
+ if( (tok[mod[j][0]][mod[j][1]]=='s') &&
563
+ (mod[j][1]==(strlen(tok[mod[j][0]])-1)) &&
564
+ mod[j][0]==mod[j-1][0] ) lsm++;
565
+
566
+ if((flm==j) && (lsm==1)) return true;
567
+ else return false;
568
+ }
569
+
570
+
571
+ bool AbbrStra::is_ContLetMatch(long nsf)
572
+ {
573
+ long i=0, cl=1;
574
+
575
+ while(i<(nsf-1)) {
576
+ if( mod[i][0]==mod[i+1][0] &&
577
+ (mod[i][1]+1)==mod[i+1][1] ) cl++;
578
+ i++;
579
+ }
580
+
581
+ if(cl>=2) return true;
582
+ else return false;
583
+ }
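A worked instance (ours) of the consecutive-letter test, matching the prolactin (PRL) example documented in AbbrStra.h:
    //   sf "prl" vs tok[0] = "prolactin"  =>  mod = {0,0}, {0,1}, {0,3}
    //   mod[0] and mod[1] are adjacent offsets in the same token, so cl == 2  =>  true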
584
+ //----
585
+
586
+
587
+ //---1st ch must be alnum & at least one alphabet for all
588
+ //str1: sf
589
+ bool AbbrStra::set_condition(const char *str1)
590
+ {
591
+ int n=0, m=0, o=0;
592
+
593
+ switch(setCondition) {
594
+ case 1: //all alphabet SFs
595
+ for(long i=strlen(str1)-1; i>=0; i--)
596
+ if(!isalpha(str1[i]))
597
+ return false;
598
+ return true;
599
+ break;
600
+ case 2: //at least one non-alphabet
601
+ if(!isalnum(str1[0])) return false;
602
+ for(long i=strlen(str1)-1; i>=0; i--) {
603
+ if(isalpha(str1[i])) n++;
604
+ else m++;
605
+ }
606
+ if( (n>0) && (m>0) ) return true;
607
+ else return false;
608
+ break;
609
+ case 3: //only alnum & at least one num
610
+ for(long i=strlen(str1)-1; i>=0; i--) {
611
+ if(!isalnum(str1[i])) return false;
612
+ if(isalpha(str1[i])) n++;
613
+ if(isdigit(str1[i])) m++;
614
+ }
615
+ if( (n>0) && (m>0) ) return true;
616
+ else return false;
617
+ break;
618
+ case 4: //only alpha and non-alnum & at least one non-alnum
619
+ if(!isalpha(str1[0])) return false;
620
+ for(long i=strlen(str1)-1; i>=0; i--) {
621
+ if(isdigit(str1[i])) return false;
622
+ if(!isalnum(str1[i])) n++;
623
+ }
624
+ if(n>0) return true;
625
+ else return false;
626
+ break;
627
+ case 5: //at least one non-alnum
628
+ if(!isalnum(str1[0])) return false;
629
+ for(long i=strlen(str1)-1; i>0; i--) {
630
+ if(!isalnum(str1[i])) return true;
631
+ }
632
+ return false;
633
+ break;
634
+ case 6: //at least one num and non-alnum
635
+ if(!isalnum(str1[0])) return false;
636
+ for(long i=strlen(str1)-1; i>=0; i--) {
637
+ if(isalpha(str1[i])) n++;
638
+ if(isdigit(str1[i])) m++;
639
+ if(!isalnum(str1[i])) o++;
640
+ }
641
+ if( (n>0) && (m>0) && (o>0) ) return true;
642
+ else return false;
643
+ break;
644
+ case 7: //1+2 (SH algorithm)
645
+ if(!isalnum(str1[0])) return false;
646
+ for(long i=strlen(str1)-1; i>=0; i--)
647
+ if(isalpha(str1[i])) return true;
648
+ return false;
649
+ break;
650
+ default:
651
+ cout << "Not defined set condition\n";
652
+ exit(1);
653
+ }
654
+ }
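A few concrete SFs per setCondition case (our examples, assuming plain ASCII ctype behavior):
    //   setCondition=1: "ABG"  -> true;   "IL2"  -> false (contains a digit)
    //   setCondition=3: "IL2"  -> true;   "IL-2" -> false ('-' is not alnum)
    //   setCondition=5: "IL-2" -> true;   "IL2"  -> false (no non-alnum ch)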
655
+
656
+ //---
657
+ //same as FirstLet::set_condition
658
+ //but requires extra set conditions
659
+ bool FirstLetOneChSF::set_condition(const char *shrtf, const char *longf, char *str)
660
+ {
661
+ long i=0, len=strlen(shrtf), numtk;
662
+ char tk[1000][1000];
663
+
664
+ //sf conditions: all alphabet
665
+ while(i<len && isalpha(shrtf[i])) i++;
666
+ if(i!=len) return false;
667
+
668
+ //lf conditions: #tok>=|SF|, 1st ch of words must be alphabet
669
+ numtk = tokenize(longf,tk);
670
+ if(len>numtk) return false;
671
+
672
+ for(i=0; i<len; i++)
673
+ str[i] = tk[numtk-len+i][0];
674
+ str[i] = '\0';
675
+
676
+ if(!is_alpha(str)) return false;
677
+
678
+ return true;
679
+ }
680
+
681
+
682
+ long FirstLetOneChSF::strategy(const char *sf_, const char *str_) {
683
+ long lna,lnt,flag;
684
+ bool genFL=false; //1:allow 1-alpha for 1st ch of SF match, 0:don't
685
+ char phr[10000], phrl[10000];
686
+
687
+ str_tolower(sf_,sf);
688
+ str_tolower(str_,text);
689
+
690
+ get_str(str_,phr,1); //phr: 1st token of str from back
691
+ str_tolower(phr,phrl);
692
+ //conditions
693
+ if(is_onealpha(phr)) return 0; //last token contains exactly 1 alphabetic ch
694
+ if(isupper_str(phr)) return 0; //last token is all upper-case alphabet
695
+ if(wData->stp.find(phrl)) return 0; //last token is stopword
696
+ if(!wData->lfs.find(phrl)) return 0; //reject unless last token is in lfs (LF words of 1-ch SFs with FirstLet match cases >= 2)
697
+
698
+ token(text,tok);
699
+ lna = strlen(sf);
700
+ lnt = strlen(tok[ntk-1]);
701
+
702
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
703
+ if(!flag) return 0;
704
+
705
+ do {
706
+ if(!skipword_ok(lna,0)) continue;
707
+ if(!is_FirstLetMatch(lna,genFL)) continue; //not allow 1-alpha
708
+
709
+ extract_lf(mod[0][0],ntk-1,str_);
710
+ return 1;
711
+ } while(search_backward_adv(sf,genFL));
712
+
713
+ return 0;
714
+ }
715
+ //---
716
+
717
+ bool FirstLet::set_condition(const char *shrtf, const char *longf, char *str)
718
+ {
719
+ long i=0, len=strlen(shrtf), numtk;
720
+ char tk[1000][1000];
721
+
722
+ //sf conditions
723
+ while(i<len && isalpha(shrtf[i])) i++;
724
+ if(i!=len) return false;
725
+
726
+ //lf conditions
727
+ numtk = tokenize(longf,tk);
728
+ if(len>numtk) return false;
729
+
730
+ for(i=0; i<len; i++)
731
+ str[i] = tk[numtk-len+i][0];
732
+ str[i] = '\0';
733
+
734
+ if(!is_alpha(str)) return false;
735
+
736
+ return true;
737
+ }
738
+
739
+
740
+ long FirstLet::strategy(const char *sf_, const char *str_) {
741
+ long lna,lnt,flag;
742
+ bool genFL=false; //1:allow 1-alpha for 1st ch of SF match, 0:don't
743
+
744
+ str_tolower(sf_,sf);
745
+ str_tolower(str_,text);
746
+
747
+ token(text,tok);
748
+ lna = strlen(sf);
749
+ lnt = strlen(tok[ntk-1]);
750
+
751
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
752
+ if(!flag) return 0;
753
+
754
+ do {
755
+ if(!skipword_ok(lna,0)) continue;
756
+ if(!is_FirstLetMatch(lna,genFL)) continue; //not allow 1-alpha
757
+
758
+ extract_lf(mod[0][0],ntk-1,str_);
759
+ return 1;
760
+ } while(search_backward_adv(sf,genFL));
761
+
762
+ return 0;
763
+ }
764
+
765
+
766
+ long FirstLetGen::strategy(const char *sf_, const char *str_)
767
+ {
768
+ long lna,lnt,flag;
769
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
770
+
771
+ str_tolower(sf_,sf);
772
+ str_tolower(str_,text);
773
+
774
+ token(text,tok);
775
+ lna = strlen(sf);
776
+ lnt = strlen(tok[ntk-1]);
777
+
778
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
779
+ if(!flag) return 0;
780
+
781
+ do {
782
+ if(!skipword_ok(lna,0)) continue;
783
+ if(!is_FirstLetMatch2(lna,genFL)) continue; //at least 1-alpha
784
+
785
+ extract_lf(mod[0][0],ntk-1,str_);
786
+ return 1;
787
+ } while(search_backward_adv(sf,genFL));
788
+
789
+ return 0;
790
+ }
791
+
792
+
793
+ long FirstLetGen2::strategy(const char *sf_, const char *str_)
794
+ {
795
+ long lna,lnt,flag;
796
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
797
+
798
+ str_tolower(sf_,sf);
799
+ str_tolower(str_,text);
800
+
801
+ token(text,tok);
802
+ lna = strlen(sf);
803
+ lnt = strlen(tok[ntk-1]);
804
+
805
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
806
+ if(!flag) return 0;
807
+
808
+ do {
809
+ if(!skipword_ok(lna,0)) continue;
810
+ if(!is_FirstLetMatch(lna,genFL)) continue;
811
+
812
+ extract_lf(mod[0][0],ntk-1,str_);
813
+ return 1;
814
+ } while(search_backward_adv(sf,genFL));
815
+
816
+ return 0;
817
+ }
818
+
819
+
820
+ bool FirstLetGenS::set_condition(const char *str)
821
+ {
822
+ if(str[strlen(str)-1]!='s') return false;
823
+
824
+ for(long i=strlen(str)-2; i>=0; i--) {
825
+ if(!isupper(str[i])) return false;
826
+ if(!isalpha(str[i])) return false; //necessary?
827
+ }
828
+
829
+ return true;
830
+ }
831
+
832
+
833
+ long FirstLetGenS::strategy(const char *sf_, const char *str_)
834
+ {
835
+ long lna,lnt,flag;
836
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
837
+
838
+ if(!set_condition(sf_)) return 0;
839
+
840
+ str_tolower(sf_,sf);
841
+ str_tolower(str_,text);
842
+
843
+ token(text,tok);
844
+ lna = strlen(sf);
845
+ lnt = strlen(tok[ntk-1]);
846
+
847
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
848
+ if(!flag) return 0;
849
+
850
+ do {
851
+ if(!skipword_ok(lna,0)) continue;
852
+ if(!is_FirstLetSMatch(sf,genFL)) continue;
853
+
854
+ extract_lf(mod[0][0],ntk-1,str_);
855
+ return 1;
856
+ } while(search_backward_adv(sf,genFL));
857
+
858
+ return 0;
859
+ }
860
+
861
+
862
+ long FirstLetGenStp::strategy(const char *sf_, const char *str_)
863
+ {
864
+ long lna,lnt,flag;
865
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
866
+
867
+ str_tolower(sf_,sf);
868
+ str_tolower(str_,text);
869
+
870
+ token(text,tok);
871
+ lna = strlen(sf);
872
+ lnt = strlen(tok[ntk-1]);
873
+
874
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
875
+ if(!flag) return 0;
876
+
877
+ do {
878
+ if(!exist_skipword(lna)) continue;
879
+ if(!stopword_ok(lna,1)) continue;
880
+ if(!is_FirstLetMatch(lna,genFL)) continue;
881
+
882
+ extract_lf(mod[0][0],ntk-1,str_);
883
+ return 1;
884
+ } while(search_backward_adv(sf,genFL));
885
+
886
+ return 0;
887
+ }
888
+
889
+
890
+ long FirstLetGenStp2::strategy(const char *sf_, const char *str_)
891
+ {
892
+ long lna,lnt,flag;
893
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
894
+
895
+ str_tolower(sf_,sf);
896
+ str_tolower(str_,text);
897
+
898
+ token(text,tok);
899
+ lna = strlen(sf);
900
+ lnt = strlen(tok[ntk-1]);
901
+
902
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
903
+ if(!flag) return 0;
904
+
905
+ do {
906
+ if(!exist_n_stopwords(lna,2)) continue;
907
+ if(!is_FirstLetMatch(lna,genFL)) continue;
908
+
909
+ extract_lf(mod[0][0],ntk-1,str_);
910
+ return 1;
911
+ } while(search_backward_adv(sf,genFL));
912
+
913
+ return 0;
914
+ }
915
+
916
+
917
+ long FirstLetGenSkp::strategy(const char *sf_, const char *str_)
918
+ {
919
+ long lna,lnt,flag;
920
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
921
+
922
+ str_tolower(sf_,sf);
923
+ str_tolower(str_,text);
924
+
925
+ token(text,tok);
926
+ lna = strlen(sf);
927
+ lnt = strlen(tok[ntk-1]);
928
+
929
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
930
+ if(!flag) return 0;
931
+
932
+ do {
933
+ if(!exist_skipword(lna)) continue;
934
+ if(!skipword_ok(lna,1)) continue;
935
+ if(!is_FirstLetMatch(lna,genFL)) continue;
936
+
937
+ extract_lf(mod[0][0],ntk-1,str_);
938
+ return 1;
939
+ } while(search_backward_adv(sf,genFL));
940
+
941
+ return 0;
942
+ }
943
+
944
+
945
+ long WithinWrdWrd::strategy(const char *sf_, const char *str_)
946
+ {
947
+ long lna,lnt,flag;
948
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
949
+
950
+ str_tolower(sf_,sf);
951
+ str_tolower(str_,text);
952
+
953
+ token(text,tok);
954
+ lna = strlen(sf);
955
+ lnt = strlen(tok[ntk-1]);
956
+
957
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
958
+ if(!flag) return 0;
959
+
960
+ do {
961
+ if(!skipword_ok(lna,0)) continue;
962
+ if(!is_subword(lna)) continue;
963
+ if(!is_WithinWrdMatch(lna,genFL)) continue;
964
+
965
+ extract_lf(mod[0][0],ntk-1,str_);
966
+ return 1;
967
+ } while(search_backward_adv(sf,genFL));
968
+
969
+ return 0;
970
+ }
971
+
972
+
973
+ long WithinWrdFWrd::strategy(const char *sf_, const char *str_)
974
+ {
975
+ long lna,lnt,flag;
976
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
977
+
978
+ str_tolower(sf_,sf);
979
+ str_tolower(str_,text);
980
+
981
+ token(text,tok);
982
+ lna = strlen(sf);
983
+ lnt = strlen(tok[ntk-1]);
984
+
985
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
986
+ if(!flag) return 0;
987
+
988
+ do {
989
+ if(!skipword_ok(lna,0)) continue;
990
+ if(!is_subword(lna)) continue;
991
+ if(!is_BeginWrdMatch(lna,genFL)) continue;
992
+ if(!is_WithinWrdMatch(lna,genFL)) continue;
993
+
994
+ extract_lf(mod[0][0],ntk-1,str_);
995
+ return 1;
996
+ } while(search_backward_adv(sf,genFL));
997
+
998
+ return 0;
999
+ }
1000
+
1001
+
1002
+ long WithinWrdFWrdSkp::strategy(const char *sf_, const char *str_)
1003
+ {
1004
+ long lna,lnt,flag;
1005
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1006
+
1007
+ str_tolower(sf_,sf);
1008
+ str_tolower(str_,text);
1009
+
1010
+ token(text,tok);
1011
+ lna = strlen(sf);
1012
+ lnt = strlen(tok[ntk-1]);
1013
+
1014
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1015
+ if(!flag) return 0;
1016
+
1017
+ do {
1018
+ if(!exist_skipword(lna)) continue;
1019
+ if(!skipword_ok(lna,1)) continue;
1020
+ if(!is_subword(lna)) continue;
1021
+ if(!is_BeginWrdMatch(lna,genFL)) continue;
1022
+ if(!is_WithinWrdMatch(lna,genFL)) continue;
1023
+
1024
+ extract_lf(mod[0][0],ntk-1,str_);
1025
+ return 1;
1026
+ } while(search_backward_adv(sf,genFL));
1027
+
1028
+ return 0;
1029
+ }
1030
+
1031
+
1032
+ long WithinWrdLet::strategy(const char *sf_, const char *str_)
1033
+ {
1034
+ long lna,lnt,flag;
1035
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1036
+
1037
+ str_tolower(sf_,sf);
1038
+ str_tolower(str_,text);
1039
+
1040
+ token(text,tok);
1041
+ lna = strlen(sf);
1042
+ lnt = strlen(tok[ntk-1]);
1043
+
1044
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1045
+ if(!flag) return 0;
1046
+
1047
+ do {
1048
+ if(!skipword_ok(lna,0)) continue;
1049
+ if(!is_WithinWrdMatch(lna,genFL)) continue;
1050
+
1051
+ extract_lf(mod[0][0],ntk-1,str_);
1052
+ return 1;
1053
+ } while(search_backward_adv(sf,genFL));
1054
+
1055
+ return 0;
1056
+ }
1057
+
1058
+
1059
+ long WithinWrdFLet::strategy(const char *sf_, const char *str_)
1060
+ {
1061
+ long lna,lnt,flag;
1062
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1063
+
1064
+ str_tolower(sf_,sf);
1065
+ str_tolower(str_,text);
1066
+
1067
+ token(text,tok);
1068
+ lna = strlen(sf);
1069
+ lnt = strlen(tok[ntk-1]);
1070
+
1071
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1072
+ if(!flag) return 0;
1073
+
1074
+ do {
1075
+ if(!skipword_ok(lna,0)) continue;
1076
+ if(!is_BeginWrdMatch(lna,genFL)) continue;
1077
+ if(!is_WithinWrdMatch(lna,genFL)) continue;
1078
+
1079
+ extract_lf(mod[0][0],ntk-1,str_);
1080
+ return 1;
1081
+ } while(search_backward_adv(sf,genFL));
1082
+
1083
+ return 0;
1084
+ }
1085
+
1086
+
1087
+ long WithinWrdFLetSkp::strategy(const char *sf_, const char *str_)
1088
+ {
1089
+ long lna,lnt,flag;
1090
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1091
+
1092
+ str_tolower(sf_,sf);
1093
+ str_tolower(str_,text);
1094
+
1095
+ token(text,tok);
1096
+ lna = strlen(sf);
1097
+ lnt = strlen(tok[ntk-1]);
1098
+
1099
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1100
+ if(!flag) return 0;
1101
+
1102
+ do {
1103
+ if(!exist_skipword(lna)) continue;
1104
+ if(!skipword_ok(lna,1)) continue;
1105
+ if(!is_BeginWrdMatch(lna,genFL)) continue;
1106
+ if(!is_WithinWrdMatch(lna,genFL)) continue;
1107
+
1108
+ extract_lf(mod[0][0],ntk-1,str_);
1109
+ return 1;
1110
+ } while(search_backward_adv(sf,genFL));
1111
+
1112
+ return 0;
1113
+ }
1114
+
1115
+
1116
+ long ContLet::strategy(const char *sf_, const char *str_)
1117
+ {
1118
+ long lna,lnt,flag;
1119
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1120
+
1121
+ str_tolower(sf_,sf);
1122
+ str_tolower(str_,text);
1123
+
1124
+ token(text,tok);
1125
+ lna = strlen(sf);
1126
+ lnt = strlen(tok[ntk-1]);
1127
+
1128
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1129
+ if(!flag) return 0;
1130
+
1131
+ do {
1132
+ if(!skipword_ok(lna,0)) continue;
1133
+ if(!is_BeginWrdMatch(lna,genFL)) continue;
1134
+ if(!is_ContLetMatch(lna)) continue;
1135
+
1136
+ extract_lf(mod[0][0],ntk-1,str_);
1137
+ return 1;
1138
+ } while(search_backward_adv(sf,genFL));
1139
+
1140
+ return 0;
1141
+ }
1142
+
1143
+
1144
+ long ContLetSkp::strategy(const char *sf_, const char *str_)
1145
+ {
1146
+ long lna,lnt,flag;
1147
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1148
+
1149
+ str_tolower(sf_,sf);
1150
+ str_tolower(str_,text);
1151
+
1152
+ token(text,tok);
1153
+ lna = strlen(sf);
1154
+ lnt = strlen(tok[ntk-1]);
1155
+
1156
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1157
+ if(!flag) return 0;
1158
+
1159
+ do {
1160
+ if(!exist_skipword(lna)) continue;
1161
+ if(!skipword_ok(lna,1)) continue;
1162
+ if(!is_BeginWrdMatch(lna,genFL)) continue;
1163
+ if(!is_ContLetMatch(lna)) continue;
1164
+
1165
+ extract_lf(mod[0][0],ntk-1,str_);
1166
+ return 1;
1167
+ } while(search_backward_adv(sf,genFL));
1168
+
1169
+ return 0;
1170
+ }
1171
+
1172
+
1173
+ long AnyLet::strategy(const char *sf_, const char *str_)
1174
+ {
1175
+ long lna,lnt,flag;
1176
+ bool genFL=true; //1:allow 1-alpha for 1st ch of SF match, 0:don't
1177
+
1178
+ str_tolower(sf_,sf);
1179
+ str_tolower(str_,text);
1180
+
1181
+ token(text,tok);
1182
+ lna = strlen(sf);
1183
+ lnt = strlen(tok[ntk-1]);
1184
+
1185
+ flag = search_backward(lna-1,ntk-1,lnt-1,sf,genFL);
1186
+ if(!flag) return 0;
1187
+
1188
+ do {
1189
+ if(!skipword_ok(lna,1)) continue;
1190
+
1191
+ extract_lf(mod[0][0],ntk-1,str_);
1192
+ return 1;
1193
+ } while(search_backward_adv(sf,genFL));
1194
+
1195
+ return 0;
1196
+ }
1197
+
1198
+
1199
+
1200
+ //-----
1201
+ AbbrStra * StratUtil::strat_factory(string name)
1202
+ {
1203
+ if(name=="FirstLetOneChSF") return new FirstLetOneChSF;
1204
+ else if(name=="FirstLet") return new FirstLet;
1205
+ else if(name=="FirstLetGen") return new FirstLetGen;
1206
+ else if(name=="FirstLetGen2") return new FirstLetGen2;
1207
+ else if(name=="FirstLetGenS") return new FirstLetGenS;
1208
+ else if(name=="FirstLetGenStp") return new FirstLetGenStp;
1209
+ else if(name=="FirstLetGenStp2") return new FirstLetGenStp2;
1210
+ else if(name=="FirstLetGenSkp") return new FirstLetGenSkp;
1211
+ else if(name=="WithinWrdWrd") return new WithinWrdWrd;
1212
+ else if(name=="WithinWrdFWrd") return new WithinWrdFWrd;
1213
+ else if(name=="WithinWrdFWrdSkp") return new WithinWrdFWrdSkp;
1214
+ else if(name=="WithinWrdLet") return new WithinWrdLet;
1215
+ else if(name=="WithinWrdFLet") return new WithinWrdFLet;
1216
+ else if(name=="WithinWrdFLetSkp") return new WithinWrdFLetSkp;
1217
+ else if(name=="ContLet") return new ContLet;
1218
+ else if(name=="ContLetSkp") return new ContLetSkp;
1219
+ else if(name=="AnyLet") return new AnyLet;
1220
+ else { cout << "Fail strat_factory\n"; exit(1); }
1221
+ }
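A minimal driver sketch (ours; `wordData` is an assumed, already-constructed WordData*, and the caller owns the returned strategy object):
    //   StratUtil util;
    //   AbbrStra *st = util.strat_factory("FirstLet");
    //   st->wData = wordData;          // used by strategies that consult the word/stop/lfs tables
    //   if(st->strategy("abg", "alpha beta gamma"))
    //       cout << st->lf << endl;    // prints "alpha beta gamma"
    //   delete st;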
1222
+
1223
+
1224
+ //check if sf is ok and assign a group
1225
+ //if sf length > 5, use 5!!
1226
+ //grp will be Al+#ChInSF, Num+#ChInSF, or Spec+#ChInSF
1227
+ bool StratUtil::group_sf(const char *sf, string &grp)
1228
+ {
1229
+ long i, j, len=strlen(sf);
1230
+ long al=0, num=0, nonalnum=0;
1231
+ long paren=0, sbrac=0;
1232
+
1233
+ grp = ""; // if failure, no group
1234
+
1235
+ if(!isalnum(sf[0])) return false; //1st ch of sf must be alnum
1236
+ for(i=0; i<len; i++) {
1237
+ if(isalpha(sf[i])) al++;
1238
+ else if(isdigit(sf[i])) num++;
1239
+ else nonalnum++;
1240
+ }
1241
+ if(al<1) return false; //at least one alphabet
1242
+
1243
+ //false for unbalanced parentheses or square brackets
1244
+ for(i=len-1; i>=0; i--) {
1245
+ if(sf[i]=='(') paren++;
1246
+ if(sf[i]==')') paren--;
1247
+ if(sf[i]=='[') sbrac++;
1248
+ if(sf[i]==']') sbrac--;
1249
+ }
1250
+ if(paren!=0 || sbrac!=0) return false;
1251
+
1252
+ if(al==len) grp.assign("Al");
1253
+ else if(num>0) grp.assign("Num");
1254
+ else if(nonalnum>0) grp.assign("Spec");
1255
+ else { cout << "No sf group\n"; exit(1); }
1256
+
1257
+ //append sf length
1258
+ len = len>5 ? 5 : len;
1259
+
1260
+ switch(len) {
1261
+ case 1:
1262
+ grp.append("1");
1263
+ break;
1264
+ case 2:
1265
+ grp.append("2");
1266
+ break;
1267
+ case 3:
1268
+ grp.append("3");
1269
+ break;
1270
+ case 4:
1271
+ grp.append("4");
1272
+ break;
1273
+ case 5:
1274
+ grp.append("5");
1275
+ break;
1276
+ default:
1277
+ cout << "Not defined #-ch SF" << endl;
1278
+ exit(1);
1279
+ }
1280
+
1281
+ return true;
1282
+ }
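Concrete groupings under this scheme (our examples): an all-alphabetic SF maps to Al, any digit wins Num over Spec, and the capped length is appended:
    //   group_sf("TNF",  grp)  -> true,  grp == "Al3"
    //   group_sf("IL-6", grp)  -> true,  grp == "Num4"
    //   group_sf("t-PA", grp)  -> true,  grp == "Spec4"
    //   group_sf("(AB",  grp)  -> false  (1st ch not alnum)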
1283
+
1284
+ //add the condition |lf|>|sf|
1285
+ bool StratUtil::group_sf(const char *sf, const char *lf, string &grp)
1286
+ {
1287
+ long i, j, len=strlen(sf);
1288
+ long al=0, num=0, nonalnum=0;
1289
+ long paren=0, sbrac=0;
1290
+
1291
+ if(strlen(lf)<len) return false; //|lf|>|sf|
1292
+ if(!isalnum(sf[0])) return false; //1st ch of sf must be alnum
1293
+ for(i=0; i<len; i++) {
1294
+ if(isalpha(sf[i])) al++;
1295
+ else if(isdigit(sf[i])) num++;
1296
+ else nonalnum++;
1297
+ }
1298
+ if(al<1) return false; //at least one alphabet
1299
+ if(al>10) return false; //|alpha sf| is at most 10
1300
+ if(num_token(sf)>2) return false; //added Feb-21-08
1301
+
1302
+ //false for unbalanced parentheses or square brackets
1303
+ for(i=len-1; i>=0; i--) {
1304
+ if(sf[i]=='(') paren++;
1305
+ if(sf[i]==')') paren--;
1306
+ if(sf[i]=='[') sbrac++;
1307
+ if(sf[i]==']') sbrac--;
1308
+ }
1309
+ if(paren!=0 || sbrac!=0) return false;
1310
+
1311
+ if(al==len) grp.assign("Al");
1312
+ else if(num>0) grp.assign("Num");
1313
+ else if(nonalnum>0) grp.assign("Spec");
1314
+ else { cout << "No sf group\n"; exit(1); }
1315
+
1316
+ //append sf length
1317
+ len = len>5 ? 5 : len;
1318
+
1319
+ switch(len) {
1320
+ case 1:
1321
+ grp.append("1");
1322
+ break;
1323
+ case 2:
1324
+ grp.append("2");
1325
+ break;
1326
+ case 3:
1327
+ grp.append("3");
1328
+ break;
1329
+ case 4:
1330
+ grp.append("4");
1331
+ break;
1332
+ case 5:
1333
+ grp.append("5");
1334
+ break;
1335
+ default:
1336
+ cout << "Not defined #-ch SF" << endl;
1337
+ exit(1);
1338
+ }
1339
+
1340
+ return true;
1341
+ }
1342
+
1343
+
1344
+ //remove non-alnum in str1 and save it to str2
1345
+ void StratUtil::remove_nonAlnum(const char *str1, char *str2)
1346
+ {
1347
+ long i=0, j=0;
1348
+
1349
+ while(str1[i]) {
1350
+ if(isalnum(str1[i])) {
1351
+ str2[j] = str1[i];
1352
+ j++;
1353
+ }
1354
+ i++;
1355
+ }
1356
+ str2[j] = '\0';
1357
+ }
1358
+
1359
+
1360
+ vector<string> StratUtil::get_strats(string s)
1361
+ {
1362
+ if(s=="Al1") return Al1;
1363
+ else if(s=="Al2") return Al2;
1364
+ else if(s=="Al3") return Al3;
1365
+ else if(s=="Al4") return Al4;
1366
+ else if(s=="Al5") return Al5;
1367
+ else if(s=="Num2") return Num2;
1368
+ else if(s=="Num3") return Num3;
1369
+ else if(s=="Num4") return Num4;
1370
+ else if(s=="Num5") return Num5;
1371
+ else if(s=="Spec2") return Spec2;
1372
+ else if(s=="Spec3") return Spec3;
1373
+ else if(s=="Spec4") return Spec4;
1374
+ else if(s=="Spec5") return Spec5;
1375
+ else { cout << "Incorrect name\n"; exit(1); }
1376
+ }
1377
+
1378
+
1379
+ void StratUtil::push_back_strat(string sgp, string strat)
1380
+ {
1381
+ if(sgp=="Al1") Al1.push_back(strat);
1382
+ else if(sgp=="Al2") Al2.push_back(strat);
1383
+ else if(sgp=="Al3") Al3.push_back(strat);
1384
+ else if(sgp=="Al4") Al4.push_back(strat);
1385
+ else if(sgp=="Al5") Al5.push_back(strat);
1386
+ else if(sgp=="Num2") Num2.push_back(strat);
1387
+ else if(sgp=="Num3") Num3.push_back(strat);
1388
+ else if(sgp=="Num4") Num4.push_back(strat);
1389
+ else if(sgp=="Num5") Num5.push_back(strat);
1390
+ else if(sgp=="Spec2") Spec2.push_back(strat);
1391
+ else if(sgp=="Spec3") Spec3.push_back(strat);
1392
+ else if(sgp=="Spec4") Spec4.push_back(strat);
1393
+ else if(sgp=="Spec5") Spec5.push_back(strat);
1394
+ }
1395
+
1396
+
1397
+ long StratUtil::exist_upperal(const char *str)
1398
+ {
1399
+ long i, len=strlen(str);
1400
+
1401
+ for(i=0; i<len; i++)
1402
+ if(isupper(str[i]))
1403
+ return 1;
1404
+ return 0;
1405
+ }
1406
+
1407
+ long StratUtil::num_token(const char *str)
1408
+ {
1409
+ long i,j=0,k=0;
1410
+ long n=strlen(str)-1;
1411
+
1412
+ while(isblank(str[n])) n--;
1413
+
1414
+ while(str[j]){
1415
+ while(isblank(str[j]))j++;
1416
+ i=j;
1417
+ while((str[j])&&(!isblank(str[j])))j++;
1418
+ if(str[j]){
1419
+ k++;
1420
+ j++;
1421
+ }
1422
+ }
1423
+ if((j-1)>n) k--; //added by Sohn (Jan-17-08): "ab cd " -> 2 tokens
1424
+ return k+1; //# tokens
1425
+ }
1426
+ //-----
Library/AbbrStra.h CHANGED
@@ -1,332 +1,332 @@
1
- #ifndef ABBRSTRA_H
2
- #define ABBRSTRA_H
3
-
4
- #include <vector>
5
- #include <string>
6
- #include <Hash.h>
7
-
8
- using namespace std;
9
- using namespace iret;
10
-
11
-
12
- class WordData {
13
- public:
14
- WordData(const char *wrdname="wrdset3", const char *stpname="stop",
15
- const char *lfsname="Lf1chSf");
16
-
17
- ~WordData();
18
-
19
- Chash wrdset; //single words in MEDLINE
20
- Hash stp; //stopword
21
- Hash lfs; //lfs (1-ch sf) for FirstLet match cases >=2
22
- };
23
-
24
-
25
- class AbbrStra {
26
- public:
27
- AbbrStra();
28
- ~AbbrStra();
29
- void token(const char *str, char lst[1000][1000]); // tokenize & set ntk
30
- long tokenize(const char *str, char lst[1000][1000]); //tokenize & return # tokens
31
- long num_token(const char *str); //return # tokens
32
- long first_ch(const char *str, char *fch, long num);
33
- long is_upperal(const char *str);
34
- long is_alpha(const char *str);
35
- void str_tolower(const char *str1, char *str2);
36
- long get_str(const char *str1, char *str2, long num);
37
- bool isupper_str(const char *str);
38
- bool is_onealpha(const char *str);
39
- long count_upperstr(const char *str);
40
- //return # upper-case 1st letter of consecutive tokens (backward)
41
- void get_alpha(const char *str1, char *str2);
42
- //set str2 with only alphabet of str1
43
- bool lf_ok(const char *shrtf, const char *longf);
44
-
45
- virtual bool set_condition(const char *sf);
46
- //must set nonAlphaSF=true if want to use SF containing non-alphabet
47
- virtual long strategy(const char *sf, const char *str) = 0;
48
- //sf & str will be lower-cased (Oct-25-2007)
49
- long search_backward(long sloc, long tnum, long tloc, const char *sf, bool first);
50
- //search backward to find match starting from sf[sloc]
51
- //Returns 1 if matches. sf[0] must match with begin word
52
- long search_backward_adv(const char *sf, bool first);
53
- //Searches for next model setting. Returns 1 if finds one.
54
- void extract_lf(long begin, long end);
55
- //save strings from begin to end of tok to lf
56
- void extract_lf(long begin, long end, const char *str);
57
- //save strings from begin to end of str's tok to lf
58
-
59
- //---after set mod check conditions
60
- //nsf:# ch in sf, nsw:# allowed skipword, general:true allow 1st ch match after non-alnum
61
- bool exist_skipword(long nsf);
62
- //true if at least one skip word exists
63
- bool exist_n_skipwords(long nsf, long n);
64
- //true if exist n consecutive skip words between tokens but cannot be more than n
65
- bool exist_n_stopwords(long nsf, long n);
66
- //true if exist n consecutive skip stopwords between tokens but cannot be more than n
67
- bool stopword_ok(long nsf, long nsw);
68
- //true if at most (can be 0) nsw skip stopword in row exists
69
- bool skip_stop_ok(long nsf, long nsw, long n);
70
- //true if at most (can be 0) nsw skip word, which include at least n stopwords, in row exists
71
- bool skip_stop_ok2(long nsf, long nsw, long n);
72
- //true if nsw skip word, which include at least n stopwords, in row exists
73
- bool skipword_ok(long nsf, long nsw);
74
- //true if at most (can be 0) nsw skip word in row exists
75
- bool is_subword(long nsf);
76
- //true if matching string is begin of a tok or a word in wrdlist
77
- bool is_BeginWrdMatch(long nsf, bool general);
78
- //true if the beginning ch of a word matches
79
- //if general is true, allow match after non-alnum (eg, 1-alpha)
80
- bool is_WithinWrdMatch(long nsf, bool general);
81
- //true if within word match
82
- //if general is true, 1-Alpha: 'A' is not within word match
83
- bool is_FirstLetMatch(long nsf, bool general);
84
- //true if each ch of sf matches the 1st ch of a word
85
- //(true: Alpha anyword Beta (AB))
86
- //if general=true, true: 1-Alpha Beta, Alpha-Beta
87
- bool is_FirstLetMatch2(long nsf, bool general);
88
- //at least one 1-Alpha
89
- bool is_FirstLetSMatch(const char *sf, bool general);
90
- //true if first letter match & 's' match with last ch of lf
91
- bool is_ContLetMatch(long nsf);
92
- //true if two or more consecutive letters match
93
- //---
94
-
95
- char *pch; //sf applied to a strategy
96
- char *ps, *pl; //sf, potential lf
97
- char sf[100], text[10000]; //sf & potential lf used in a strategy
98
- char lf[10000]; //lf found by a strategy
99
- char tok[1000][1000]; //token of potential lf
100
- //lower after strategy, original after extract_lf(b,e,str)
101
- long ntk; //# tokens
102
- long mod[100][2]; //match locations of tok with a given sf
103
- //mod[sf_inx][0]=tok inx, mod[sf_inx][1]=match loc in tok[mod[sf_inx][0]]
104
-
105
- //for each n_ch-SF
106
- long npairs; //selected pairs for this strategy
107
- long tpairs; //total pairs
108
- long nsfs; //# selected unique sfs for this strategy
109
- long nmatchs; //# matches (successful strategy & given sf == real sf)
110
- long amatchs; //# accumulated matches up to this strategy
111
- long setCondition; //SF condition
112
- long greaterEqNsf; //if 1 select SF |SF|>=nsf
113
-
114
- WordData *wData;
115
- };
116
-
117
-
118
- /*
119
- alpha beta gamma (ABG)
120
- */
121
- class FirstLet : public AbbrStra {
122
- public:
123
- virtual bool set_condition(const char *str1, const char *str2, char *str);
124
- virtual long strategy(const char *sf, const char *str);
125
- };
126
-
127
-
128
- class FirstLetOneChSF : public AbbrStra {
129
- public:
130
- virtual bool set_condition(const char *str1, const char *str2, char *str);
131
- virtual long strategy(const char *sf, const char *str);
132
- };
133
-
134
-
135
- /*
136
- - sf ch matches with 1st ch or ch right after non-alphanum of lf
137
- but at least one match right after non-alphanum
138
- (eg, success: 1-alpha 2-beta (AB), alpha-beta(AB),
139
- fail: alpha beta(AB))
140
- */
141
- class FirstLetGen : public AbbrStra {
142
- public:
143
- virtual long strategy(const char *sf, const char *str);
144
- };
145
-
146
-
147
- /*
148
- - sf ch matches with 1st ch or ch right after non-alphanum of lf
149
- (eg, success: 1-alpha 2-beta (AB), alpha-beta(AB),
150
- alpha beta(AB))
151
- */
152
- class FirstLetGen2 : public AbbrStra {
153
- public:
154
- virtual long strategy(const char *sf, const char *str);
155
- };
156
-
157
-
158
- /*
159
- For sf consisting of capital letters & lower-case 's'
160
- - First letter & 's' in the last token of lf
161
- (success: Alpha Betas (ABs), 1-Alpha Betas (ABs),
162
- 1-Alpha-Betas (ABs), Alpha BetaS (ABs)
163
- fail: Alpha Beta xxs (ABs) )
164
- */
165
- class FirstLetGenS : public AbbrStra {
166
- public:
167
- virtual bool set_condition(const char *sf); //sf must be an original sf
168
- //true if sf is like ABCs
169
- virtual long strategy(const char *sf, const char *str);
170
- };
171
-
172
-
173
- /*
174
- - sf ch matches with 1st ch or ch right after non-alphanum of lf
175
- - allowing one skip stopword between tokens (no more than one in row)
176
- at least one skip stopword in total
177
- (eg, success: alpha and beta (AB), 1-alpha and beta (AB)
178
- fail: alpha beta (AB), alpha word beta (AB))
179
- */
180
- class FirstLetGenStp : public AbbrStra {
181
- public:
182
- virtual long strategy(const char *sf, const char *str);
183
- };
184
-
185
-
186
- /*
187
- - same as FirstLetGenStp except for 2 skip stopwords
188
- & at least one two consecutive skip stopwords
189
- */
190
- class FirstLetGenStp2 : public AbbrStra {
191
- public:
192
- virtual long strategy(const char *sf, const char *str);
193
- };
194
-
195
-
196
- /*
197
- - same as FirstLetGenStp except using skip any word instead of stopword
198
- */
199
- class FirstLetGenSkp : public AbbrStra {
200
- public:
201
- virtual long strategy(const char *sf, const char *str);
202
- };
203
-
204
-
205
- /*
206
- - a matching sub-string must be word
207
- (eg, success: AlphaBeta (AB), Beta is word
208
- x-AlphaBeta (AB) )
209
- - at least one within word match
210
- (eg,fail: Alpha Beta Word (ABW), Alpha x-Beta x-Word (ABW)
211
- success: AlphaBeta Word (ABW), x-AlphaBeta inWord (ABW))
212
- */
213
- class WithinWrdWrd : public AbbrStra {
214
- public:
215
- virtual long strategy(const char *sf, const char *str);
216
- };
217
-
218
-
219
- /*
220
- - WithinWrdWrd w/ Begin Word Match
221
- (success: AlphaBeta x-Word (ABW)
222
- fail: AlphaBeta inWord (ABW) )
223
- */
224
- class WithinWrdFWrd : public AbbrStra {
225
- public:
226
- virtual long strategy(const char *sf, const char *str);
227
- };
228
-
229
-
230
- /*
231
- - WithinWrdFWrd w/ allowing one skip word between tokens (no more than one in row)
232
- at least one skip word in total
233
- (success: AlphaBeta zzz x-Word zzz (ABW)
234
- fail: AlphaBeta x-Word (ABW), AlphaBeta zzz yyy x-Word (ABW))
235
- */
236
- class WithinWrdFWrdSkp : public AbbrStra {
237
- public:
238
- virtual long strategy(const char *sf, const char *str);
239
- };
240
-
241
-
242
- /*
243
- - at least one within word match
244
- ( success: Alpha InXyy (AX), x-Alpha InXyy (AX))
245
- fail: Alpha Xyy (AX), Alpha 1-Xyy (AX))
246
- */
247
- class WithinWrdLet : public AbbrStra {
248
- public:
249
- virtual long strategy(const char *sf, const char *str);
250
- };
251
-
252
-
253
- /*
254
- - WithinWrdLet w/ Begin Word Match
255
- (fail: Alpha InXyy (AX), x-Alpha InXyy (AX)
256
- success: AlphaXyy Word (AXW), x-AlphaXyy 1-Word (AXW))
257
- */
258
- class WithinWrdFLet : public AbbrStra {
259
- public:
260
- virtual long strategy(const char *sf, const char *str);
261
- };
262
-
263
-
264
- /*
265
- - WithinWrdFLet w/ allowing one skip word between tokens (no more than one in row)
266
- at least one skip word in total
267
- (success: AlphaXyy zzz Word zzz (AXW)
268
- fail: AlphaXyy Word (AXW), AlphaXyy zzz yyy Word (AXW))
269
- */
270
- class WithinWrdFLetSkp : public AbbrStra {
271
- public:
272
- virtual long strategy(const char *sf, const char *str);
273
- };
274
-
275
-
276
- /*
277
- - any two consecutive letter matching w/ begin word match
278
- eg) ABxxx (AB), 1-ABxxx (AB), ABxxx Cxxx (ABC), Axxx BCxxx (ABC)
279
- prolactin (PRL), succinylcholine (SCh)
280
- */
281
- class ContLet : public AbbrStra {
282
- public:
283
- virtual long strategy(const char *sf, const char *str);
284
- };
285
-
286
-
287
- /*
288
- - ContLet w/ allowing one skip word between tokens (no more than one in row)
289
- at least one skip word in total
290
- */
291
- class ContLetSkp : public AbbrStra {
292
- public:
293
- virtual long strategy(const char *sf, const char *str);
294
- };
295
-
296
-
297
- /*
298
- - match can occur anywhere
299
- - allow one skip word between tokens (no more than one in row)
300
- (success: Alpha yXyy (AX), Alpha yXyy word (AX)
301
- 1-Alpha yXyy word (AX))
302
- */
303
- class AnyLet : public AbbrStra {
304
- public:
305
- virtual long strategy(const char *sf, const char *str);
306
- };
307
-
308
-
309
- class StratUtil {
310
- public:
311
- AbbrStra *strat_factory(string name);
312
- vector<string> get_strats(string s);
313
- //get the strategy sequence for a given #-ch SF group
314
- void push_back_strat(string sgp, string strat);
315
- bool group_sf(const char *sf, string &grp);
316
- //check if sf is ok and assign a group
317
- bool group_sf(const char *sf, const char *lf, string &grp);
318
- //add the contion |lf|>|sf|
319
- void remove_nonAlnum(const char *str1, char *str2);
320
- //remove non-alnum in str1 and save it to str2
321
- long exist_upperal(const char *str); //return 1 if exists upper char, 0 ow
322
- long num_token(const char *str); //return # tokens
323
-
324
- vector<string> Al1, Al2, Al3, Al4, Al5;
325
- vector<string> Num2, Num3, Num4, Num5;
326
- vector<string> Spec2, Spec3, Spec4, Spec5;
327
- };
328
-
329
-
330
- #endif
331
-
332
-
 
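A minimal sketch of how the interface above is meant to be driven (an editorial illustration, not code from this repository): StratUtil::strat_factory maps a strategy name to a concrete AbbrStra subclass, and strategy(sf, text) reports success and leaves the recovered long form in the public lf buffer. The strategy name string and the exact call sequence are assumptions read off the declarations and comments.

// hypothetical driver; assumes the Ab3P library and its word data files build
#include <AbbrStra.h>
#include <iostream>

int main() {
    StratUtil util;
    AbbrStra *strat = util.strat_factory("FirstLet"); // name is assumed
    // "abg" is the short form, the quoted text a candidate long form;
    // per the comments above, strategy() works on lower-cased input.
    if (strat->strategy("abg", "alpha beta gamma"))
        std::cout << "long form: " << strat->lf << std::endl;
    delete strat;
    return 0;
}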
Library/AbbrvE.C CHANGED
@@ -1,629 +1,629 @@
#include "AbbrvE.h"
#include <sstream>
#include <cstring> // for std::strlen below

namespace iret {

Find_Seq::Find_Seq( void )
/* initializer lists work in C++0x
:
seq_i ( { "i", "ii", "iii", "iv", "v", "vi" } ),
seq_I ( { "I", "II", "III", "IV", "V", "VI" } ),
seq_a ( { "a", "b", "c", "d", "e", "f" } ),
seq_A ( { "A", "B", "C", "D", "E", "F" } )
*/
{
  seq_i.push_back("i");
  seq_i.push_back("ii");
  seq_i.push_back("iii");
  seq_i.push_back("iv");
  seq_i.push_back("v");
  seq_i.push_back("vi");

  seq_I.push_back("I");
  seq_I.push_back("II");
  seq_I.push_back("III");
  seq_I.push_back("IV");
  seq_I.push_back("V");
  seq_I.push_back("VI");

  seq_a.push_back("a");
  seq_a.push_back("b");
  seq_a.push_back("c");
  seq_a.push_back("d");
  seq_a.push_back("e");
  seq_a.push_back("f");

  seq_A.push_back("A");
  seq_A.push_back("B");
  seq_A.push_back("C");
  seq_A.push_back("D");
  seq_A.push_back("E");
  seq_A.push_back("F");
}

void
Find_Seq::flag_seq( int numa, char* abbs[] ) {

  my_numa = numa;
  my_abbs = abbs;

  my_rate.resize(numa);
  for ( int i = 0; i < numa; ++i )
    my_rate[i] = true;

  find_seq(seq_i);
  find_seq(seq_I);
  find_seq(seq_a);
  find_seq(seq_A);

  create_seq();
}


void
Find_Seq::find_seq( const vector<string> & seq ) {

  for ( int i_abbr = 0; i_abbr < my_numa-1; ++i_abbr ) {
    // need to see at least 2 in sequence

    if ( seq[0] == my_abbs[i_abbr] ) {
      int i_seq = 1;
      while ( i_seq < seq.size() and
              i_seq + i_abbr < my_numa and
              seq[i_seq] == my_abbs[i_abbr + i_seq] )
        ++i_seq;

      if ( i_seq > 1 )
        for ( int i = 0; i < i_seq; ++i )
          my_rate[i_abbr+i] = false;
    }

  }
}

void
Find_Seq::create_seq( void ) {

  for ( int i_abbr = 0; i_abbr < my_numa; ++i_abbr ) {

    size_t len = std::strlen( my_abbs[i_abbr] );

    if ( my_abbs[i_abbr][len-1] == '1' ) {
      // create sequence and test

      string prefix( my_abbs[i_abbr], len-1 );
      size_t seq_len = my_numa - i_abbr; // max possible length
      vector<string> seq;
      // sequence starts with 1
      for ( int i = 1; i <= seq_len; ++i ) {
        std::ostringstream stream(prefix,std::ios::app);
        stream << i;
        seq.push_back( stream.str() );
      }

      // cout << seq << '\n';
      find_seq(seq);
    }
  }
}

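#if 0
// Illustrative fragment (assumes only the interface above, not original
// code): consecutive short forms that spell a known enumeration, such as
// "(a) ... (b) ...", are rated false by flag_seq and dropped as sequence
// markers; a standalone short form keeps its true rating.
char *abbs[] = { (char*)"a", (char*)"b", (char*)"TNF" };
Find_Seq fs;
fs.flag_seq( 3, abbs );
// fs.rate(0) == false, fs.rate(1) == false, fs.rate(2) == true
#endif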
AbbrvE::AbbrvE(long ta,long wrd_spc){
  tta=ta;
  word_space=wrd_spc;
  abbl=new char*[tta];
  abbs=new char*[tta];
  nt=new int[tta];
  lst=new char*[word_space];
  numa=num=0;
  pMt=new MPtok;
  setup_Test();
}

AbbrvE::~AbbrvE(){
  if(numa)cleara();
  clear();
  delete [] abbl;
  delete [] abbs;
  delete [] nt;
  delete [] lst;
  delete pMt;
}

void AbbrvE::Extract(char*pch){
  long i,j,k,u,flag;
  int ix;

  if ( strlen(pch) <= 0 ) // no text to look at
    return;

  token(pch);

  i=j=k=0;
  flag=0;
  while(i<num){
    if(!strcmp("(",lst[i])){
      if(flag)k=j+1;
      if((i>k)&&(strcmp(")",lst[i-1]))){
        j=i;
        flag=1;
      }
    }
    if(!strcmp(")",lst[i])){
      if(!flag){j=k=i+1;}
      else {
        if(((j>k)&&(i<j+12))&&(i>j+1)){
          if(k<j-10)k=j-10;
          strcpy(cnam,lst[k]);
          for(u=k+1;u<j;u++){
            strcat(cnam," ");
            strcat(cnam,lst[u]);
          }
          ix=strlen(cnam);
          abbl[numa]=new char[ix+1];
          strcpy(abbl[numa],cnam);

          strcpy(cnam,lst[j+1]);
          for(u=j+2;u<i;u++){
            strcat(cnam," ");
            strcat(cnam,lst[u]);
          }
          nt[numa]=i-j-1;
          ix=strlen(cnam);
          abbs[numa]=new char[ix+1];
          strcpy(abbs[numa],cnam);
          if(Test(abbs[numa]))numa++;
          else{ //if the test were done earlier, no memory would need to be
                //allocated until it is known to be needed
            delete [] abbs[numa];
            delete [] abbl[numa];
          }
          flag=0;
        }
        else {
          flag=0;
          k=i+1;
        }
      }
    }
    i++;
  }
}


//modified Jan-9-2008
//also extract SFs in []; parse until ';' or ',' inside () or []
void AbbrvE::Extract2(const char*pch){
  long i,j,k,u,ii,jj,kk,flag;
  int ix;
  char openCh[2], closeCh[2];

  token2(pch); // alpha beta (AB) -> alpha beta ( AB )

  for(jj=0; jj<2; jj++) {//deal with both () & []
    i=j=k=0;
    flag=0;

    if(jj==0) { strcpy(openCh,"("); strcpy(closeCh,")"); }
    else if(jj==1) { strcpy(openCh,"["); strcpy(closeCh,"]"); }

    while(i<num){
      if(!strcmp(openCh,lst[i])){
        if(flag)k=j+1; //increment after seeing both '(' and ')'
        if((i>k)&&(strcmp(closeCh,lst[i-1]))){
          j=i; //index of '('
          flag=1;
        }
      }
      if(!strcmp(closeCh,lst[i])){
        if(!flag){j=k=i+1;} //next token
        else {
          if(((j>k)&&(i<j+12))&&(i>j+1)){
            if(k<j-10)k=j-10;
            strcpy(cnam,lst[k]);
            for(u=k+1;u<j;u++){
              strcat(cnam," ");
              strcat(cnam,lst[u]);
            }
            ix=strlen(cnam);
            abbl[numa]=new char[ix+1];
            strcpy(abbl[numa],cnam);

            strcpy(cnam,lst[j+1]);
            for(u=j+2;u<i;u++){
              strcat(cnam," ");
              strcat(cnam,lst[u]);
            }
            nt[numa]=i-j-1; //# abbr tokens
            ix=strlen(cnam);

            //---- parse until ';' or ','
            ii=0;
            while(ii<ix) {
              if( ((cnam[ii]==';')&&(cnam[ii+1]==' ')) ||
                  ((cnam[ii]==',')&&(cnam[ii+1]==' ')) ) {
                ix=ii+1;
                cnam[ii]='\0';
                break;
              }
              ii++;
            }
            //----

            abbs[numa]=new char[ix+1];
            strcpy(abbs[numa],cnam);
            if(Test(abbs[numa]))numa++;
            else{ //if the test were done earlier, no memory would need to be
                  //allocated until it is known to be needed
              delete [] abbs[numa];
              delete [] abbl[numa];
            }
            flag=0;
          }
          else {
            flag=0;
            k=i+1;
          }
        }
      }
      i++;
    }
  }
}

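#if 0
// Illustrative fragment (assumes the class as declared in AbbrvE.h): after
// Extract2 runs on a sentence, abbl[k] holds the text preceding the
// parenthesis, abbs[k] the parenthesized short-form candidate, and nt[k]
// the number of tokens inside the parentheses.
AbbrvE ab;
ab.Extract2("tumor necrosis factor alpha (TNF-alpha) was elevated");
for(long k=0;k<ab.numa;k++)
  printf("LF context: %s | SF candidate: %s | tokens: %d\n",
         ab.abbl[k], ab.abbs[k], ab.nt[k]);
ab.cleara();
#endif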
void AbbrvE::token(const char *pch){
  long i=1,j=0,k=0;
  long u=1,flag=0;
  char c,*str=cnam;
  int size=strlen(pch);
  if(size>cnam_size) {
    cerr<<"Scratch space "<<cnam_size<<", needed "<<size<<endl;
    exit(1);
  }
  clear(); // ready space for tokens
  cnam[0]=pch[0];
  while(c=pch[i]){
    switch(c){
      case '(': if(isblank(str[u-1])){
          str[u++]=pch[i++];
          if(!isblank(pch[i])){
            str[u++]=' ';
          }
          flag=1;
        }
        else str[u++]=pch[i++];
        break;
      case ')': if(flag){
          if(!isblank(str[u-1])){
            str[u++]=' ';
            str[u++]=pch[i++];
          }
          if(!isblank(pch[i]))str[u++]=' ';
          flag=0;
        }
        else str[u++]=pch[i++];
        break;
      default: str[u++]=pch[i++];
    }
  }
  while((u>0)&&(isblank(str[u-1])))u--;
  str[u]='\0';

  while(str[j]){
    while(isblank(str[j]))j++;
    i=j;
    while((str[j])&&(!isblank(str[j])))j++;
    lst[k]=new char[j-i+1];
    strncpy(lst[k],str+i,j-i);
    lst[k][j-i]='\0';
    if(str[j]){
      k++;
      j++;
    }
  }
  num=k+1;
}


//both () & [] Jan-9-2008
//(G(1)) -> ( G(1) ) Jan-28-2008
void AbbrvE::token2(const char *pch){
  long i=1,j=0,k=0;
  long u=1;
  vector<bool> openChFlag1,openChFlag2;
  long cflag;
  long ii, jj, kk, sz;
  char c,*str=cnam;
  clear(); // ready space for tokens
  cnam[0]=pch[0];
  while(c=pch[i]){
    switch(c){
      case '(':
        //--- (h)alpha -> (h)alpha, (h)-alpha -> ( h ) -alpha
        ii=kk=i;
        cflag=0;
        while(pch[ii] && !isblank(pch[ii])) { //pch[ii] can be '\0'
          if(pch[ii]=='(') cflag -= 1;
          else if(pch[ii]==')') { cflag += 1; kk=ii; }
          ii++;
        }

        if(!cflag && isalnum(pch[kk+1])) { //if alnum right after ')'
          while(i<ii) str[u++]=pch[i++];
          break;
        }
        //---

        if(isblank(str[u-1])){
          str[u++]=pch[i++];
          if(!isblank(pch[i])){
            str[u++]=' ';
          }
          openChFlag1.push_back(true);
        }
        else {
          str[u++]=pch[i++];
          openChFlag1.push_back(false);
        }

        break;

      case ')': sz = openChFlag1.size();
        if(sz>0 && openChFlag1[sz-1]){ //modified Jan-28-08
          if(!isblank(str[u-1])){
            str[u++]=' ';
            str[u++]=pch[i++]; //pch[i++] is ')'
          }
          //---added (Jan-11-08): (BIV; ), -> ( BIV; ) ,
          else if(!isblank(pch[i+1])){
            str[u++]=pch[i++]; //pch[i++] is ')'
          }
          //---

          if(!isblank(pch[i]))str[u++]=' '; //pch[i] must be after ')'
        }
        else str[u++]=pch[i++];

        if(sz>0) openChFlag1.pop_back();

        break;

      case '[':
        //--- [h]alpha -> [h]alpha
        ii=kk=i;
        cflag=0;
        while(pch[ii] && !isblank(pch[ii])) { //pch[ii] can be '\0'
          if(pch[ii]=='[') cflag -= 1;
          else if(pch[ii]==']') { cflag += 1; kk=ii; }
          ii++;
        }

        if(!cflag && isalnum(pch[kk+1])) { //if alnum right after ']'
          while(i<ii) str[u++]=pch[i++];
          break;
        }
        //---

        if(isblank(str[u-1])){
          str[u++]=pch[i++];
          if(!isblank(pch[i])){
            str[u++]=' ';
          }
          openChFlag2.push_back(true);
        }
        else {
          str[u++]=pch[i++];
          openChFlag2.push_back(false);
        }

        break;

      case ']': sz=openChFlag2.size();
        if(sz>0 && openChFlag2[sz-1]){ //modified Jan-28-08
          if(!isblank(str[u-1])){
            str[u++]=' ';
            str[u++]=pch[i++];
          }
          //---added (Jan-11-08): [BIV; ], -> [ BIV; ] ,
          else if(!isblank(pch[i+1])){
            str[u++]=pch[i++];
          }
          //---
          if(!isblank(pch[i]))str[u++]=' ';
        }
        else str[u++]=pch[i++];

        if(sz>0) openChFlag2.pop_back();

        break;
      default: str[u++]=pch[i++];
    }
  }
  while((u>0)&&(isblank(str[u-1])))u--;
  str[u]='\0';

  while(str[j]){
    while(isblank(str[j]))j++;
    i=j;
    while((str[j])&&(!isblank(str[j])))j++;
    lst[k]=new char[j-i+1];
    strncpy(lst[k],str+i,j-i);
    lst[k][j-i]='\0';
    if(str[j]){
      k++;
      j++;
    }
  }
  num=k+1;
}


void AbbrvE::clear(void){
  for ( int i=0; i<num; i++ ) {
    delete [] lst[i];
  }
  num=0;
}

void AbbrvE::cleara(void){
  long i;
  for(i=0;i<numa;i++){
    delete [] abbl[i];
    delete [] abbs[i];
  }
  numa=0;
}

#if 0

//no space before and after abbs[] (because of using token)
int AbbrvE::Test(const char *str){
  long i,j,k;
  char b,c;

  if(strchr(str,'='))return(0);
  if(!strcmp(str,"author's transl"))return(0);
  if(!strcmp(str,"proceedings"))return(0);
  //---added (Jan-11-08) & (Apr 08)
  if(!strcmp(str,"see"))return(0);
  if(!strcmp(str,"and"))return(0);
  if(!strcmp(str,"comment"))return(0);
  if(!strcmp(str,"letter"))return(0);
  //---
  if((str[0]=='e')&&(str[1]=='g')){
    if(!(c=str[2])||(c=='.')||(c==','))return(0);
  }
  if((str[0]=='s')&&(str[1]=='e')&&(str[2]=='e')&&(((b=str[3])==' ')||(b==',')))return(0);
  if('p'==tolower(str[0])){
    if(strchr(str+1,'<'))return(0);
  }
  i=j=k=0;
  while((c=str[i])&&(c!=' ')){
    i++;
    if(isdigit(c))j++;
    if(isalpha(c))k++;
    if((i==j)&&(i==3))return(0);
  }
  if((i==j)||(k==0))return(0);
  else return(1);
}

#endif

bool AbbrvE::prefix_match( const char *str ) {
  size_t size = strlen(str);
  for ( int i = 0; i < prefix.size(); ++i ) {
    string& pre = prefix[i];
    if ( size > pre.size() and
         0 == pre.compare( 0, pre.size(), str, pre.size() ) )
      return true;
  }
  return false;
}


//no space before and after abbs[] (because of using token)
bool AbbrvE::Test(const char *str){

  if ( match.find(str) != match.end() ) return false;
  if ( prefix_match(str) ) return false;

  size_t length, letters, digits;
  length = letters = digits = 0;

  char c;
  while((c=str[length])&&(c!=' ')){
    length++;
    if ( isdigit(c) ) digits++;
    if ( isalpha(c) ) letters++;

    if( length==digits and length>=3 ) return false;
  }
  if ( digits == length ) return false;
  if ( letters <= 0 ) return false;

  return true;
}

void AbbrvE::setup_Test( void ) {

  match.insert("author's transl");
  match.insert("proceedings");
  match.insert("see");
  match.insert("and");
  match.insert("comment");
  match.insert("letter");
  match.insert("eg");

  prefix.push_back("=");
  prefix.push_back("eg.");
  prefix.push_back("eg,");
  prefix.push_back("see ");
  prefix.push_back("see,");
  prefix.push_back("p<");
  prefix.push_back("P<");

  // rules added in 2010
  match.insert("e.g.");
  match.insert("ie");
  match.insert("i.e.");
  match.insert("mean");
  match.insert("age");
  match.insert("std");
  match.insert("range");
  match.insert("young");
  match.insert("old");
  match.insert("male");
  match.insert("female");

}

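#if 0
// Expected outcomes of Test() under the tables built in setup_Test()
// (illustrative, derived from the checks above):
AbbrvE ab;
ab.Test("TNF");    // true:  contains letters and matches no stop pattern
ab.Test("123");    // false: a run of three digits fails the digit check
ab.Test("e.g.");   // false: exact entry in the match set
ab.Test("p<0.05"); // false: caught by the "p<" prefix rule
#endif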
void AbbrvE::Proc(char *pxh){
  long i,j;
  char *pch,*ptr;
  pMt->segment(pxh);
  for(i=0;i<pMt->sent.size();i++){
    Extract2( (pMt->sent[i]).c_str() );
  }

  seq.flag_seq( numa, abbs );
  j=0;
  for(i=0;i<numa;i++){
    if( seq.rate(i) ){
      if(j<i){
        pch=abbl[i];
        if(ptr=strchr(pch,'|')){
          *ptr='/';
          ptr++;
          while(ptr=strchr(pch,'|')){
            *ptr='/';
            ptr++;
          }
        }
        abbl[j]=pch;
        pch=abbs[i];
        if(ptr=strchr(pch,'|')){
          *ptr='/';
          ptr++;
          while(ptr=strchr(pch,'|')){
            *ptr='/';
            ptr++;
          }
        }
        abbs[j]=pch;
        nt[j]=nt[i];
      }
      j++;
    }
    else {
      delete [] abbl[i];
      delete [] abbs[i];
    }
  }

  numa=j;
}

}
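Proc is the top-level entry point of this class: it sentence-segments the input with MPtok, runs Extract2 on each sentence, drops enumeration markers via Find_Seq, and rewrites '|' to '/' in the surviving pairs. A minimal end-to-end driver might look like the following (an illustration against the declared interface, not code from this repository):

#include <AbbrvE.h>
#include <cstdio>
using namespace iret;

int main() {
    AbbrvE ab;
    char text[] = "Tumor necrosis factor (TNF) was elevated.";
    ab.Proc(text); // segment, extract, filter
    for(long k=0;k<ab.numa;k++)
        printf("%s|%s\n", ab.abbl[k], ab.abbs[k]);
    ab.cleara();   // required after each Proc call
    return 0;
}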
 
Library/AbbrvE.h CHANGED
@@ -1,93 +1,93 @@
#ifndef ABBRVE_H
#define ABBRVE_H
#include <fstream>
#include <iostream>
#include <set> // for set<string> below
#include <runn.h>
#include <MPtok.h>
#include <vector>
using namespace std;
namespace iret {

typedef vector<string> strings;


class Find_Seq {
public:

  Find_Seq( void );

  // flag each SF according to whether it is part of a sequence
  void flag_seq( int numa, char* abbs[] );

  // true if a good SF, false if part of a sequence
  bool rate( int i ) const { return my_rate[i]; }

private:
  void find_seq( const vector<string> & seq );
  void create_seq( void );

  // const works with c++0x
  /* const */ strings seq_i;
  /* const */ strings seq_I;
  /* const */ strings seq_a;
  /* const */ strings seq_A;

  vector<bool> my_rate;
  int my_numa;
  char ** my_abbs; // really char *[], but that doesn't work

};


class AbbrvE {
public:
  AbbrvE(long ta=10000,long wrd_spc=10000); //Sets space for extracted
  //potential abbreviations to ta & word_space to wrd_spc
  ~AbbrvE(void);
  void Extract(char *pch); //Extracts possible long-short form
  //pairs, but does not attempt to find the relationship
  void Extract2(const char *pch); //extended version (Jan-9-2008)
  bool Test(const char *str); //Tests a single token and returns true
  //if the token could be the first token of a short form
  void Rate(void); //Sets ratings for the proposed pairs. Effort to
  //remove (a), (b), etc., sequence markers
  void token(const char *str); //Produces a list of tokens in order
  //of occurrence in the string.
  void token2(const char *str); //extended version (Jan-9-2008)
  void cleara(void); //Clear the abbl & abbs memory of strings
  void clear(void); //Clear the lst memory of words

  //Application functions
  void Proc(char *pch); //Accepts a natural language statement and
  //processes it to final results stored in tta, abbs, and abbl.
  //Call the cleara function after each use of this function.

  // Internal routines:
  // set up data for the Test method
  void setup_Test( void );
  bool prefix_match( const char *str ); // does str begin with a prefix?

  //Data
  long tta; //Total possible abbreviations extracted
  //default 10k
  long numa; //number of abbreviations in the current extract
  char **abbl; //Long form space, holds up to 10 tokens
  char **abbs; //Short form space, holds up to 10 tokens
  Find_Seq seq; // identifies sequences to ignore
  int *nt; //Number of tokens within parentheses
  long word_space; //Space in lst for tokens
  //default 10k
  long num; //Number of tokens
  char **lst; //Holds the tokens

  static const int cnam_size=100000;
  char cnam[cnam_size]; //Work space
  MPtok *pMt; //Pointer to the tokenizer class. Used to segment text
  //in the Proc function.

  // Test data
  set<string> match; // bad SFs to match exactly
  vector<string> prefix; // bad SFs to match by prefix
};
}
#endif
 
Library/Btree.C CHANGED
@@ -1,1304 +1,1304 @@
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <cassert>
#include "Btree.h"
#include "runn.h"

using namespace std;
namespace iret {

Node::Node(void){
  str=NULL;
  rel=NULL;
  pdn=NULL;
}

Node::Node(const char *ptr){
  int i=strlen(ptr);
  str = new char[i+1];
  strcpy(str,ptr);
  rel=NULL;
  pdn=NULL;
}

Node::Node(char const *ptr,void *dtr){
  int i=strlen(ptr);
  str = new char[i+1];
  strcpy(str,ptr);
  rel = dtr;
  pdn=NULL;
}

Node::~Node(){
  if(str)delete [] str;
}

void Node::debug(void){
  cout << "Node {" << endl;
  cout << " str: " << this->str << endl;
  if(rel==NULL)cout << " rel: NULL" << endl;
  else cout << " rel: " << (long)rel << endl;
  if(pdn==NULL)cout << " pdn: NULL" << endl;
  else cout << " pdn: " << (long)pdn << endl;
  cout << " }" << endl;
}

Page::Page(){
  pdn=NULL;
  ndnm='\0';
}

Page::Page(Page *const pz,Page *const pn,const int n){
  pdn=pn;
  int j=(int)(pz->ndnm)-n;
  ndnm=(char)(j>0 ? j : 0);
  for(int i=0;i<(int)ndnm;i++){pnd[i]=(pz->pnd)[n+i];}
}

Page::~Page(){
  for(int i=0;i<(int)ndnm;i++){
    delete pnd[i];
  }
}

void Page::clean(void){
  for(int i=0;i<(int)ndnm;i++){
    pnd[i]->str=NULL;
  }
}

void Page::insert(const int n,Node * const nd,const int j){
  assert(j<ord2);
  assert(n<=j);
  if(n==j){
    pnd[j]=nd;
  }
  else {
    for(int i=j;i>n;i--)pnd[i]=pnd[i-1];
    pnd[n]=nd;
  }
  ndnm++;
}

int Page::search(int &a,int &b,const char *str,int &p){
  int j;
  if((j=stc_my(a,b,str,pnd[0]->str))<0){
    p=0;
    return(0);
  }
  else if(j==0){
    p=0;
    return(1);
  }
  if((j=stc_my(a,b,str,pnd[(int)(ndnm-1)]->str))>0){
    p=(int)ndnm;
    return(0);
  }
  else if(j==0){
    p=(int)(ndnm-1);
    return(1);
  }
  int x=0,i;
  int y=(int)(ndnm-1);
  while(y-x>1){
    i=(y+x)/2;
    if((j=stc_my(a,b,str,pnd[i]->str))==0){p=i;return(1);}
    else if(j<0)y=i;
    else x=i;
  }
  p=y;
  return(0);
}

int Page::search(int &a,int &b,char *str,int &p,Partial_match *btr){
  int j;
  if((j=btr->stc_my_long(a,b,str,pnd[0]->str,0))<0){
    p=0;
    return(0);
  }
  else if(j==0){
    p=0;
    return(1);
  }
  if((j=btr->stc_my_long(a,b,str,pnd[(int)(ndnm-1)]->str,(int)(ndnm-1)))>0){
    p=(int)ndnm;
    return(0);
  }
  else if(j==0){
    p=(int)(ndnm-1);
    return(1);
  }
  int x=0,i;
  int y=(int)(ndnm-1);
  while(y-x>1){
    i=(y+x)/2;
    if((j=btr->stc_my_long(a,b,str,pnd[i]->str,i))==0){p=i;return(1);}
    else if(j<0)y=i;
    else x=i;
  }
  p=y;
  return(0);
}

void Page::debug(void){
  cout << "Page {" << endl;
  cout << " ndnm: " << (int)ndnm << endl;
  if(pdn==NULL)cout << " pdn: NULL" << endl;
  else cout << " pdn: " << (long)pdn << endl;
  for(int i=0;i<(int)ndnm;i++){
    cout << i << " ";
    (this->pnd[i])->debug();
  }
  cout << " }" << endl;
}

int stc_my(int &a,int &b,const char *str,const char *ptr)
{register int i=(a<b) ? a : b;
  register const char *p1=str+i;
  register const char *p2=ptr+i;
  register int j=0;
  while((*p1==*p2)&&(*p1!='\0')){
    j++;
    p1++;
    p2++;
  }
  if(*p1==*p2)return(0);
  else if(*p1<*p2){
    b=i+j;
    return(-1);
  }
  else {
    a=i+j;
    return(1);
  }
}

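#if 0
// Worked example of the bounded compare above (illustrative, not original
// code). During a search, a tracks the common-prefix length with the closest
// key known to sort below str, and b the same for the closest key above;
// every key bracketed between them must share min(a,b) leading characters
// with str, so stc_my can start comparing at that offset instead of at 0.
int a = 0, b = 0;
stc_my(a, b, "abcm", "abca"); // 'm' > 'a' at offset 3: returns  1, a = 3
stc_my(a, b, "abcm", "abcz"); // 'm' < 'z' at offset 3: returns -1, b = 3
stc_my(a, b, "abcm", "abcm"); // starts at min(a,b)=3:  returns  0 (match)
#endif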
Btree::Btree(){
  iclean=0;
  copy=false;
  depth=0;
  root=new Page;
  root->ndnm = 1;
  (root->pnd)[0]=new Node("");
}

int Btree::search(const char *str){
  depth=-1;
  Page *pu=root;
  register int a=0,b=0,i,j;
  while(pu!=NULL){
    depth++;
    pg[depth]=pu;
    j=(pu->search)(a,b,str,i);
    cnd[depth]=i;
    if(j==1)return(1);
    if(i==0)pu=pu->pdn;
    else pu=(pu->pnd)[i-1]->pdn;
  }
  return(0);
}

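#if 0
// Illustrative use of the search/insert pair (not original code): a failed
// search() leaves pg[]/cnd[] describing the descent path, which is exactly
// the state insert() needs to place a new node and split any full pages.
Btree bt;
if(!bt.search("kinase")) bt.insert(new Node("kinase"));
bt.node_first();          // position at the start of the tree
while(bt.node_next())     // in-order walk over all stored strings
  cout << bt.show_str() << endl;
#endif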
205
- int Btree::insert(Node *nd){
206
- int w,k;
207
- Page *pm,*pz;
208
- while((nd!=NULL)&&(depth)){
209
- pm=pg[depth];
210
- w=pm->ndnm;
211
- if(w<ord2){
212
- pm->insert(cnd[depth],nd,w);
213
- nd=NULL;
214
- }
215
- else {
216
- k=cnd[depth];
217
- if(k<order){
218
- pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
219
- pm->insert(k,nd,order);
220
- nd=pm->pnd[order];
221
- nd->pdn=pz;
222
- pm->ndnm=order;
223
- }
224
- else if(k>order){
225
- pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
226
- pz->insert(k-order-1,nd,order-1);
227
- nd=pm->pnd[order];
228
- nd->pdn=pz;
229
- pm->ndnm=order;
230
- }
231
- else {
232
- pz=new Page(pm,nd->pdn,order);
233
- nd->pdn=pz;
234
- pm->ndnm=order;
235
- }
236
- }
237
- depth--;
238
- }
239
- if(nd!=NULL){
240
- pm=pg[depth];
241
- w=pm->ndnm;
242
- if(w<ord2)pm->insert(cnd[depth],nd,w);
243
- else {
244
- root=new Page();
245
- root->pdn=pm;
246
- k=cnd[depth];
247
- if(k<order){
248
- pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
249
- pm->insert(k,nd,order);
250
- (root->pnd)[0]=pm->pnd[order];
251
- ((root->pnd)[0])->pdn=pz;
252
- root->ndnm=1;
253
- pm->ndnm=order;
254
- }
255
- else if(k>order){
256
- pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
257
- pz->insert(k-order-1,nd,order-1);
258
- (root->pnd)[0]=pm->pnd[order];
259
- ((root->pnd)[0])->pdn=pz;
260
- root->ndnm=1;
261
- pm->ndnm=order;
262
- }
263
- else {
264
- pz=new Page(pm,nd->pdn,order);
265
- (root->pnd)[0]=nd;
266
- nd->pdn=pz;
267
- root->ndnm=1;
268
- pm->ndnm=order;
269
- }
270
- }
271
- }
272
- return(1);
273
- }
274
-
275
- void Btree::node_first(void){
276
- depth=0;
277
- pg[depth]=root;
278
- cnd[depth]=0;
279
- Page *pm;
280
- while((pm=(pg[depth]->pdn))!=NULL){
281
- depth++;
282
- pg[depth]=pm;
283
- cnd[depth]=0;
284
- }
285
- }
286
-
287
- int Btree::node_next(){
288
- int i=cnd[depth];
289
- Page *pd=((pg[depth]->pnd)[i])->pdn;
290
- if(pd!=NULL){
291
- (cnd[depth])++;
292
- depth++;
293
- pg[depth]=pd;
294
- cnd[depth]=0;
295
- while((pd=(pg[depth]->pdn))!=NULL){
296
- depth++;
297
- pg[depth]=pd;
298
- cnd[depth]=0;
299
- }
300
- }
301
- else {
302
- cnd[depth]=++i;
303
- while((depth>=1)&&(i==(pg[depth]->ndnm))){depth--;i=cnd[depth];}
304
- if((depth==0)&&(i==(pg[depth]->ndnm)))depth--;
305
- if(depth<0)return(0);
306
- }
307
- return(1);
308
- }
309
-
310
- char *Btree::show_str(){
311
- return(((pg[depth]->pnd)[cnd[depth]])->str);
312
- }
313
-
314
- void *Btree::give_ptr(){
315
- return(((pg[depth]->pnd)[cnd[depth]])->rel);
316
- }
317
-
318
- void Btree::set_ptr(void *dtr){
319
- ((pg[depth]->pnd)[cnd[depth]])->rel=dtr;
320
- }
321
-
322
- Btree::~Btree(){
323
- int pflag=get_qflag();
324
- long k=0;
325
- if (copy) return; // only delete original
326
- if(!iclean){
327
- node_first();
328
- int i=depth,j;
329
- do{
330
- j=node_next();
331
- if(depth<i){
332
- while(i>depth){
333
- delete pg[i];
334
- i--;
335
- mark(pflag,++k,1000,"pages deleted");
336
- }
337
- }
338
- else i=depth;
339
- } while(j);
340
- }
341
- else {
342
- node_first();
343
- int i=depth,j;
344
- do{
345
- j=node_next();
346
- if(depth<i){
347
- while(i>depth){
348
- pg[i]->clean();
349
- delete pg[i];
350
- i--;
351
- mark(pflag,++k,1000,"pages deleted");
352
- }
353
- }
354
- else i=depth;
355
- } while(j);
356
- }
357
- }
358
-
359
- long Btree::list_write(ofstream &fout){
360
- int pflag=get_qflag();
361
- long ct=0;
362
- node_first();
363
- while(node_next()){
364
- fout << show_str() << endl;
365
- mark(pflag,++ct,1000,"strings written");
366
- }
367
- fout.close();
368
- return((int)fout.good());
369
- }
370
-
371
- Btree::Btree(ifstream &fin){
372
- copy=false;
373
- char cnam[256];
374
- int pflag=get_qflag();
375
- depth=0;
376
- pg[0]=root=new Page();
377
- cnd[0]=root->ndnm = 1;
378
- (root->pnd)[0]=new Node("");
379
- Node *pno;
380
- long ct=0;
381
- while(get_string(cnam,fin,'\n')){
382
- pno = new Node(cnam);
383
- add(pno);
384
- mark(pflag,++ct,10000,"strings read");
385
- }
386
- fin.close();
387
- }
388
-
389
- int Btree::add(Node *nd){
390
- int w,k,dp;
391
- Page *pm,*pz;
392
- dp=depth; //uses local dp in place of depth, unlike insert().
393
- while((nd!=NULL)&&(dp)){
394
- pm=pg[dp];
395
- w=pm->ndnm;
396
- if(w<ord2){
397
- pm->insert(cnd[dp],nd,w);
398
- nd=NULL;
399
- (cnd[dp])++; //variation from insert.
400
- }
401
- else {
402
- k=cnd[dp];
403
- if(k<order){
404
- pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
405
- pm->insert(k,nd,order);
406
- nd=pm->pnd[order];
407
- nd->pdn=pz;
408
- pm->ndnm=order;
409
- }
410
- else if(k>order){
411
- pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
412
- pz->insert(k-order-1,nd,order-1);
413
- nd=pm->pnd[order];
414
- nd->pdn=pz;
415
- pm->ndnm=order;
416
- }
417
- else {
418
- pz=new Page(pm,nd->pdn,order);
419
- nd->pdn=pz;
420
- pm->ndnm=order;
421
- }
422
- pg[dp]=pz; //2 lines of variation from insert.
423
- cnd[dp]=order;
424
- }
425
- dp--;
426
- }
427
- if(nd!=NULL){
428
- pm=pg[dp];
429
- w=pm->ndnm;
430
- if(w<ord2){
431
- pm->insert(cnd[dp],nd,w);
432
- (cnd[dp])++; //variation from insert.
433
- }
434
- else {
435
- root=new Page();
436
- root->pdn=pm;
437
- k=cnd[dp];
438
- if(k<order){
439
- pz=new Page(pm,((pm->pnd)[order-1])->pdn,order);
440
- pm->insert(k,nd,order);
441
- (root->pnd)[0]=pm->pnd[order];
442
- ((root->pnd)[0])->pdn=pz;
443
- root->ndnm=1;
444
- pm->ndnm=order;
445
- }
446
- else if(k>order){
447
- pz=new Page(pm,((pm->pnd)[order])->pdn,order+1);
448
- pz->insert(k-order-1,nd,order-1);
449
- (root->pnd)[0]=pm->pnd[order];
450
- ((root->pnd)[0])->pdn=pz;
451
- root->ndnm=1;
452
- pm->ndnm=order;
453
- }
454
- else {
455
- pz=new Page(pm,nd->pdn,order);
456
- (root->pnd)[0]=nd;
457
- nd->pdn=pz;
458
- root->ndnm=1;
459
- pm->ndnm=order;
460
- }
461
- next_empty(); //variation from insert.
462
- }
463
- }
464
- return(1);
465
- }
466
-
467
- void Btree::next_empty(){
468
- depth=0;
469
- pg[depth]=root;
470
- int i=cnd[depth]=root->ndnm;
471
- Page *pm;
472
- while((pm=((pg[depth]->pnd)[i-1])->pdn)!=NULL){
473
- depth++;
474
- pg[depth]=pm;
475
- i=cnd[depth]=pm->ndnm;
476
- }
477
- }
478
-
479
- Str_str::Str_str() : Btree() {
480
- }
481
-
482
- Str_str::~Str_str(){
483
- if(copy)return;
484
- this->node_first();
485
- while(this->node_next())delete [] (char*)this->give_ptr();
486
- }
487
-
488
- void Str_str::add_pair(const char *one,const char *two){
489
- Node *pnd;
490
- if(search(one)){
491
- cout << "Duplicate string in keys list = " << one << endl;
492
- exit(0);
493
- }
494
- else {
495
- int i=strlen(two);
496
- char *st=new char[i+1];
497
- strcpy(st,two);
498
- pnd=new Node(one,(void *)st);
499
- add(pnd);
500
- }
501
- }
502
-
503
- char *Str_str::match(const char *one){
504
- if(search(one)){
505
- return((char*)give_ptr());
506
- }
507
- else {
508
- cout << "String not a key = " << one << endl;
509
- exit(0);
510
- }
511
- }
512
-
513
- List::List() : Btree() {
514
- cnt_key=0;
515
- }
516
-
517
- List::~List(){
518
- }
519
-
520
- void List::add_key(const char *str){
521
- Node *pnd;
522
- if(!search(str)){
523
- pnd=new Node(str);
524
- add(pnd);
525
- }
526
- }
527
-
528
- void List::add_key_count(const char *str){
529
- Node *pnd;
530
- if(!search(str)){
531
- pnd=new Node(str);
532
- add(pnd);
533
- cnt_key++;
534
- }
535
- }
536
-
537
- void List::addp_key_count(char *str){
538
- Node *pnd;
539
- if(!search(str)){
540
- pnd=new Node;
541
- pnd->str=str;
542
- add(pnd);
543
- cnt_key++;
544
- }
545
- }
546
-
547
- Num_num::Num_num() : Btree() {
548
- }
549
-
550
- Num_num::~Num_num(){
551
- if(copy)return;
552
- this->node_first();
553
- while(this->node_next())delete (long*)this->give_ptr();
554
- }
555
-
556
- void Num_num::add_pair(long i,long j){
557
- Node *pnd;
558
- char cnam[256];
559
- long_str(cnam,i);
560
- if(!search(cnam)){
561
- long *st=new long;
562
- *st=j;
563
- pnd=new Node(cnam,(void *)st);
564
- add(pnd);
565
- }
566
- }
567
-
568
- long Num_num::match(long i){
569
- char cnam[256];
570
- long_str(cnam,i);
571
- if(search(cnam)){
572
- return(*((long*)give_ptr()));
573
- }
574
- else return(LNEG);
575
- }
576
-
577
- Count::Count() : List() {
578
- total=0;
579
- }
580
-
581
- Count::~Count(){
582
- if(copy)return;
583
- long *pk;
584
- this->node_first();
585
- while(this->node_next()){
586
- pk=(long*)(this->give_ptr());
587
- if(pk)delete pk;
588
- }
589
- }
590
-
591
- void Count::add_count(const char *pch,long n){
592
- long *ppt;
593
- Node *np;
594
- total+=n;
595
- if(this->search(pch)==0){
596
- ppt = new long;
597
- (*ppt) =n;
598
- np=new Node(pch,(void*)ppt);
599
- this->insert(np);
600
- }
601
- else {
602
- (*(long*) this->give_ptr())+=n;
603
- }
604
- }
605
-
606
- void Count::add_countz(const char *pch,long n){
607
- long *ppt;
608
- Node *np;
609
- if(this->search(pch)==0){
610
- ppt = new long;
611
- (*ppt) =n;
612
- np=new Node(pch,(void*)ppt);
613
- this->insert(np);
614
- cnt_key++;
615
- }
616
- else {
617
- (*(long*) this->give_ptr())+=n;
618
- }
619
- }
620
-
621
- void Count::add_count2(const char *pch,long n){
622
- long *ppt;
623
- Node *np;
624
- total+=n;
625
- if(this->search(pch)==0){
626
- ppt = new long;
627
- (*ppt) =n;
628
- np=new Node(pch,(void*)ppt);
629
- this->insert(np);
630
- cnt_key++;
631
- }
632
- else {
633
- (*(long*) this->give_ptr())+=n;
634
- }
635
- }
636
-
637
- void Count::addp_count2(char *pch,long n){
638
- long *ppt;
639
- Node *np;
640
- total+=n;
641
- if(this->search(pch)==0){
642
- ppt = new long;
643
- (*ppt) =n;
644
- np=new Node;
645
- np->str=pch;
646
- np->rel=ppt;
647
- this->insert(np);
648
- cnt_key++;
649
- }
650
- else {
651
- (*(long*) this->give_ptr())+=n;
652
- }
653
- }
654
-
655
- void Count::correct(const char *pch,long n){
656
- if(this->search(pch)){
657
- (*(long*) this->give_ptr())=n;
658
- }
659
- }
660
-
661
- long Count::count(const char *pch){
662
- if(this->search(pch)==0){
663
- return(0);
664
- }
665
- else {
666
- return(*((long*) this->give_ptr()));
667
- }
668
- }
669
-
670
- long Count::count(void){
671
- return(*((long*) this->give_ptr()));
672
- }
673
-
674
- void Count::max_count(const char *pch,long n){
675
- long *ppt,i;
676
- Node *np;
677
- total+=n;
678
- if(!search(pch)){
679
- ppt = new long;
680
- (*ppt) =n;
681
- np=new Node(pch,(void*)ppt);
682
- this->insert(np);
683
- }
684
- else {
685
- ppt=(long*)give_ptr();
686
- if(*ppt<n)*ppt=n;
687
- }
688
- }
689
-
690
- void Count::max_count2(const char *pch,long n){
691
- long *ppt,i;
692
- Node *np;
693
- total+=n;
694
- if(!search(pch)){
695
- ppt = new long;
696
- (*ppt) =n;
697
- np=new Node(pch,(void*)ppt);
698
- this->insert(np);
699
- cnt_key++;
700
- }
701
- else {
702
- ppt=(long*)give_ptr();
703
- if(*ppt<n)*ppt=n;
704
- }
705
- }
706
-
707
- void Count::maxp_count2(char *pch,long n){
708
- long *ppt,i;
709
- Node *np;
710
- total+=n;
711
- if(!search(pch)){
712
- ppt = new long;
713
- (*ppt) =n;
714
- np=new Node;
715
- np->str=pch;
716
- np->rel=ppt;
717
- this->insert(np);
718
- cnt_key++;
719
- }
720
- else {
721
- ppt=(long*)give_ptr();
722
- if(*ppt<n)*ppt=n;
723
- }
724
- }
725
-
726
- void Count::min_count(const char *pch,long n){
727
- long *ppt,i;
728
- Node *np;
729
- total+=n;
730
- if(!search(pch)){
731
- ppt = new long;
732
- (*ppt) =n;
733
- np=new Node(pch,(void*)ppt);
734
- this->insert(np);
735
- }
736
- else {
737
- ppt=(long*)give_ptr();
738
- if(*ppt>n)*ppt=n;
739
- }
740
- }
741
-
742
- void Count::min_count2(const char *pch,long n){
743
- long *ppt,i;
744
- Node *np;
745
- total+=n;
746
- if(!search(pch)){
747
- ppt = new long;
748
- (*ppt) =n;
749
- np=new Node(pch,(void*)ppt);
750
- this->insert(np);
751
- cnt_key++;
752
- }
753
- else {
754
- ppt=(long*)give_ptr();
755
- if(*ppt>n)*ppt=n;
756
- }
757
- }
758
-
759
- void Count::minp_count2(char *pch,long n){
760
- long *ppt,i;
761
- Node *np;
762
- total+=n;
763
- if(!search(pch)){
764
- ppt = new long;
765
- (*ppt) =n;
766
- np=new Node;
767
- np->str=pch;
768
- np->rel=ppt;
769
- this->insert(np);
770
- cnt_key++;
771
- }
772
- else {
773
- ppt=(long*)give_ptr();
774
- if(*ppt>n)*ppt=n;
775
- }
776
- }
777
-
778
- //FCount (float count tree)
779
-
780
- FCount::FCount() : List() {
781
- total=0;
782
- }
783
-
784
- FCount::~FCount(){
785
- if(copy)return;
786
- float *pk;
787
- this->node_first();
788
- while(this->node_next()){
789
- pk=(float*)(this->give_ptr());
790
- if(pk)delete pk;
791
- }
792
- }
793
-
794
- void FCount::Copy(FCount &Fc){
795
- char *pch;
796
- float *xx,*zz;
797
- Node *pN;
798
-
799
- pg[0]=root;
800
- cnd[0]=root->ndnm;
801
-
802
- Fc.node_first();
803
- while(Fc.node_next()){
804
- pch=Fc.show_str();
805
- xx=(float*)Fc.give_ptr();
806
- zz=new float;
807
- *zz=*xx;
808
- pN=new Node(pch,(void*)zz);
809
- add(pN);
810
- }
811
- }
812
-
813
- void FCount::add_count(const char *pch,float z){
814
- float *ppt;
815
- Node *np;
816
- total+=z;
817
- if(this->search(pch)==0){
818
- ppt = new float;
819
- (*ppt) =z;
820
- np=new Node(pch,(void*)ppt);
821
- this->insert(np);
822
- }
823
- else {
824
- (*(float*) this->give_ptr())+=z;
825
- }
826
- }
827
-
828
- void FCount::add_count2(const char *pch,float z){
829
- float *ppt;
830
- Node *np;
831
- total+=z;
832
- if(this->search(pch)==0){
833
- ppt = new float;
834
- (*ppt) =z;
835
- np=new Node(pch,(void*)ppt);
836
- this->insert(np);
837
- cnt_key++;
838
- }
839
- else {
840
- (*(float*) this->give_ptr())+=z;
841
- }
842
- }
843
-
844
- void FCount::addp_count2(char *pch,float z){
845
- float *ppt;
846
- Node *np;
847
- total+=z;
848
- if(this->search(pch)==0){
849
- ppt = new float;
850
- (*ppt) =z;
851
- np=new Node;
852
- np->str=pch;
853
- np->rel=ppt;
854
- this->insert(np);
855
- cnt_key++;
856
- }
857
- else {
858
- (*(float*) this->give_ptr())+=z;
859
- }
860
- }
861
-
862
- float FCount::count(const char *pch){
863
- if(this->search(pch)==0){
864
- return(0);
865
- }
866
- else {
867
- return(*((float*) this->give_ptr()));
868
- }
869
- }
870
-
871
- float FCount::count(void){
872
- return(*((float*) this->give_ptr()));
873
- }
874
-
875
- //DCount (double precision count tree)
876
-
877
- DCount::DCount() : List() {
878
- total=0;
879
- }
880
-
881
- DCount::~DCount(){
882
- if(copy)return;
883
- double *pk;
884
- this->node_first();
885
- while(this->node_next()){
886
- pk=(double*)(this->give_ptr());
887
- if(pk)delete pk;
888
- }
889
- }
890
-
891
- void DCount::Copy(DCount &Dc){
892
- char *pch;
893
- double *xx,*zz;
894
- Node *pN;
895
-
896
- pg[0]=root;
897
- cnd[0]=root->ndnm;
898
-
899
- Dc.node_first();
900
- while(Dc.node_next()){
901
- pch=Dc.show_str();
902
- xx=(double*)Dc.give_ptr();
903
- zz=new double;
904
- *zz=*xx;
905
- pN=new Node(pch,(void*)zz);
906
- add(pN);
907
- }
908
- }
909
-
910
- void DCount::add_count(const char *pch,double z){
911
- double *ppt;
912
- Node *np;
913
- total+=z;
914
- if(this->search(pch)==0){
915
- ppt = new double;
916
- (*ppt) =z;
917
- np=new Node(pch,(void*)ppt);
918
- this->insert(np);
919
- }
920
- else {
921
- (*(double*) this->give_ptr())+=z;
922
- }
923
- }
924
-
925
- void DCount::add_count2(const char *pch,double z){
926
- double *ppt;
927
- Node *np;
928
- total+=z;
929
- if(this->search(pch)==0){
930
- ppt = new double;
931
- (*ppt) =z;
932
- np=new Node(pch,(void*)ppt);
933
- this->insert(np);
934
- cnt_key++;
935
- }
936
- else {
937
- (*(double*) this->give_ptr())+=z;
938
- }
939
- }
940
-
941
- void DCount::addp_count2(char *pch,double z){
942
- double *ppt;
943
- Node *np;
944
- total+=z;
945
- if(this->search(pch)==0){
946
- ppt = new double;
947
- (*ppt) =z;
948
- np=new Node;
949
- np->str=pch;
950
- np->rel=ppt;
951
- this->insert(np);
952
- cnt_key++;
953
- }
954
- else {
955
- (*(double*) this->give_ptr())+=z;
956
- }
957
- }
958
-
959
- double DCount::count(const char *pch){
960
- if(this->search(pch)==0){
961
- return(0);
962
- }
963
- else {
964
- return(*((double*) this->give_ptr()));
965
- }
966
- }
967
-
968
- double DCount::count(void){
969
- return(*((double*) this->give_ptr()));
970
- }
971
-
972
- void DCount::max_count(const char *pch,double z){
973
- double *ppt;
974
- Node *np;
975
- total+=z;
976
- if(!search(pch)){
977
- ppt = new double;
978
- (*ppt) =z;
979
- np=new Node(pch,(void*)ppt);
980
- this->insert(np);
981
- }
982
- else {
983
- ppt=(double*)give_ptr();
984
- if(*ppt<z)*ppt=z;
985
- }
986
- }
987
-
988
- void DCount::max_count2(const char *pch,double z){
989
- double *ppt;
990
- Node *np;
991
- total+=z;
992
- if(!search(pch)){
993
- ppt = new double;
994
- (*ppt) =z;
995
- np=new Node(pch,(void*)ppt);
996
- this->insert(np);
997
- cnt_key++;
998
- }
999
- else {
1000
- ppt=(double*)give_ptr();
1001
- if(*ppt<z)*ppt=z;
1002
- }
1003
- }
1004
-
1005
- void DCount::maxp_count2(char *pch,double z){
1006
- double *ppt;
1007
- Node *np;
1008
- total+=z;
1009
- if(!search(pch)){
1010
- ppt = new double;
1011
- (*ppt) =z;
1012
- np=new Node;
1013
- np->str=pch;
1014
- np->rel=ppt;
1015
- this->insert(np);
1016
- cnt_key++;
1017
- }
1018
- else {
1019
- ppt=(double*)give_ptr();
1020
- if(*ppt<z)*ppt=z;
1021
- }
1022
- }
1023
-
1024
- void DCount::min_count(const char *pch,double z){
1025
- double *ppt;
1026
- Node *np;
1027
- total+=z;
1028
- if(!search(pch)){
1029
- ppt = new double;
1030
- (*ppt) =z;
1031
- np=new Node(pch,(void*)ppt);
1032
- this->insert(np);
1033
- }
1034
- else {
1035
- ppt=(double*)give_ptr();
1036
- if(*ppt>z)*ppt=z;
1037
- }
1038
- }
1039
-
1040
- void DCount::min_count2(const char *pch,double z){
1041
- double *ppt;
1042
- Node *np;
1043
- total+=z;
1044
- if(!search(pch)){
1045
- ppt = new double;
1046
- (*ppt) =z;
1047
- np=new Node(pch,(void*)ppt);
1048
- this->insert(np);
1049
- cnt_key++;
1050
- }
1051
- else {
1052
- ppt=(double*)give_ptr();
1053
- if(*ppt>z)*ppt=z;
1054
- }
1055
- }
1056
-
1057
- void DCount::minp_count2(char *pch,double z){
1058
- double *ppt;
1059
- Node *np;
1060
- total+=z;
1061
- if(!search(pch)){
1062
- ppt = new double;
1063
- (*ppt) =z;
1064
- np=new Node;
1065
- np->str=pch;
1066
- np->rel=ppt;
1067
- this->insert(np);
1068
- cnt_key++;
1069
- }
1070
- else {
1071
- ppt=(double*)give_ptr();
1072
- if(*ppt>z)*ppt=z;
1073
- }
1074
- }
1075
-
1076
- void DCount::debug(void){
1077
- node_first();
1078
- while(node_next()){
1079
- cout << count() << " " << show_str() << endl;
1080
- }
1081
- }
1082
-
1083
- //Partial Match
1084
-
1085
- Partial_match::Partial_match() : Count() {
1086
- }
1087
-
1088
- Partial_match::~Partial_match(){
1089
- }
1090
-
1091
- void Partial_match::long_match(char *str,List &Lst){
1092
- char *pch;
1093
- while(*str!='\0'){
1094
- if(this->search_long(str)){
1095
- pch=this->show_str();
1096
- Lst.add_key_count(pch);
1097
- }
1098
- if((pch=strchr(str,' '))!=NULL)str=pch+1;
1099
- else str=str+strlen(str);
1100
- }
1101
- }
1102
-
1103
- void Partial_match::local_match(char *str,List &Lst){
1104
- char *pch;
1105
- int i,j;
1106
- if(*str!='\0'){
1107
- if(this->search_long(str)){
1108
- pch=this->show_str();
1109
- Lst.add_key_count(pch);
1110
- i=strlen(pch)-1;
1111
- while(0<i){
1112
- while((0<i)&&(*(str+i)!=' '))i--;
1113
- if(0<i){
1114
- *(str+i)='\0';
1115
- j=this->search(str);
1116
- *(str+i)=' ';
1117
- if(j){
1118
- pch=this->show_str();
1119
- Lst.add_key_count(pch);
1120
- }
1121
- i--;
1122
- }
1123
- }
1124
- }
1125
- }
1126
- }
1127
-
1128
- void Partial_match::all_match(char *str,List &Lst){
1129
- char *pch;
1130
- int i,j;
1131
- while(*str!='\0'){
1132
- if(this->search_long(str)){
1133
- pch=this->show_str();
1134
- Lst.add_key_count(pch);
1135
- i=strlen(pch)-1;
1136
- while(0<i){
1137
- while((0<i)&&(*(str+i)!=' '))i--;
1138
- if(0<i){
1139
- *(str+i)='\0';
1140
- j=this->search(str);
1141
- *(str+i)=' ';
1142
- if(j){
1143
- pch=this->show_str();
1144
- Lst.add_key_count(pch);
1145
- }
1146
- i--;
1147
- }
1148
- }
1149
- }
1150
- if((pch=strchr(str,' '))!=NULL)str=pch+1;
1151
- else str=str+strlen(str);
1152
- }
1153
- }
1154
-
1155
- void Partial_match::long_match(char *str,Count &Cnt,long n){
1156
- char *pch;
1157
- while(*str!='\0'){
1158
- if(this->search_long(str)){
1159
- pch=this->show_str();
1160
- Cnt.add_count2(pch,n);
1161
- }
1162
- if((pch=strchr(str,' '))!=NULL)str=pch+1;
1163
- else str=str+strlen(str);
1164
- }
1165
- }
1166
-
1167
- void Partial_match::local_match(char *str,Count &Cnt,long n){
1168
- char *pch;
1169
- int i,j;
1170
- if(*str!='\0'){
1171
- if(this->search_long(str)){
1172
- pch=this->show_str();
1173
- Cnt.add_count2(pch,n);
1174
- i=strlen(pch)-1;
1175
- while(0<i){
1176
- while((0<i)&&(*(str+i)!=' '))i--;
1177
- if(0<i){
1178
- *(str+i)='\0';
1179
- j=this->search(str);
1180
- *(str+i)=' ';
1181
- if(j){
1182
- pch=this->show_str();
1183
- Cnt.add_count2(pch,n);
1184
- }
1185
- i--;
1186
- }
1187
- }
1188
- }
1189
- }
1190
- }
1191
-
1192
- void Partial_match::all_match(char *str,Count &Cnt,long n){
1193
- char *pch;
1194
- int i,j;
1195
- while(*str!='\0'){
1196
- if(this->search_long(str)){
1197
- pch=this->show_str();
1198
- Cnt.add_count2(pch,n);
1199
- i=strlen(pch)-1;
1200
- while(0<i){
1201
- while((0<i)&&(*(str+i)!=' '))i--;
1202
- if(0<i){
1203
- *(str+i)='\0';
1204
- j=this->search(str);
1205
- *(str+i)=' ';
1206
- if(j){
1207
- pch=this->show_str();
1208
- Cnt.add_count2(pch,n);
1209
- }
1210
- i--;
1211
- }
1212
- }
1213
- }
1214
- if((pch=strchr(str,' '))!=NULL)str=pch+1;
1215
- else str=str+strlen(str);
1216
- }
1217
- }
1218
-
1219
- int Partial_match::search_long(char *str){
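- //Illustration of the three matchers above (a sketch): with the keys
- //"heat shock", "heat shock protein" and "protein" in the tree and
- //str = "heat shock protein binding",
- //  long_match  adds the longest key starting at each token of str:
- //              "heat shock protein" and "protein";
- //  local_match adds the longest key at the start of str plus any
- //              shorter whole-token prefixes that are also keys:
- //              "heat shock protein" and "heat shock";
- //  all_match   applies that expansion at every token position.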
1220
- int a=0,b=0,i,j;
1221
- len=strlen(str);
1222
- if(this->step_one(a,b,str))return(1);
1223
- i=(a<b)?b:a;
1224
- while(cln_o<i){
1225
- while((cln_o<i)&&(*(str+i)!=' '))i--;
1226
- if(cln_o<i){
1227
- *(str+i)='\0';
1228
- j=this->search(str);
1229
- *(str+i)=' ';
1230
- if(j)return(1);
1231
- i--;
1232
- }
1233
- }
1234
- if(cln_o){
1235
- depth=depth_o;
1236
- cnd[depth]=index_o;
1237
- return(1);
1238
- }
1239
- else return(0);
1240
- }
1241
-
1242
- int Partial_match::step_one(int &a,int &b,char *str){
1243
- char c;
1244
- cln_o=0;
1245
- cln=0;
1246
- while((c=*(str+cln))&&c!=32)cln++;
1247
- *(str+cln)='\0';
1248
- depth=-1;
1249
- Page *pu=root;
1250
- int i,j;
1251
- while(pu!=NULL){
1252
- depth++;
1253
- pg[depth]=pu;
1254
- j=(pu->search)(a,b,str,i,this);
1255
- cnd[depth]=i;
1256
- if(j==1)return(1);
1257
- if(i==0)pu=pu->pdn;
1258
- else pu=(pu->pnd)[i-1]->pdn;
1259
- }
1260
-
1261
- if(cln<len)*(str+cln)=' ';
1262
- return(0);
1263
- }
1264
-
1265
- int Partial_match::stc_my_long(int &a,int &b,char *str,const char *ptr,int index)
1266
- {char c;
1267
- int i=(a<b) ? a : b;
1268
- const char *p1=str+i;
1269
- const char *p2=ptr+i;
1270
- int j=0;
1271
- while((*p1==*p2)&&(*p1!='\0')){
1272
- j++;
1273
- p1++;
1274
- p2++;
1275
- if((*p1=='\0'&&*p2!='\0')&&(cln<len)){
1276
- *(str+cln++)=' ';
1277
- while((c=*(str+cln))&&c!=32)cln++;
1278
- *(str+cln)='\0';
1279
- }
1280
- }
1281
- if(*p1==*p2){
1282
- if(cln<len){
1283
- depth_o=depth;
1284
- index_o=index;
1285
- cln_o=cln;
1286
- *(str+cln++)=' ';
1287
- while((c=*(str+cln))&&c!=32)cln++;
1288
- *(str+cln)='\0';
1289
- a=i+j;
1290
- return(1);
1291
- }
1292
- else return(0);
1293
- }
1294
- else if(*p1<*p2){
1295
- b=i+j;
1296
- return(-1);
1297
- }
1298
- else {
1299
- a=i+j;
1300
- return(1);
1301
- }
1302
- }
1303
-
1304
- }
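A minimal end-to-end sketch of the API implemented above (illustrative only: it assumes a translation unit compiled against Btree.h and linked with Btree.C and runn.C from this Library directory; the key strings are invented for the example):

#include <iostream>
#include "Btree.h"
using namespace std;
using namespace iret;

int main(){
  Btree bt;                        //starts with the empty-string sentinel node
  const char *keys[]={"beta","alpha","gamma"};
  for(int i=0;i<3;i++)             //insert may only follow a failed search
    if(!bt.search(keys[i]))bt.insert(new Node(keys[i]));
  bt.node_first();                 //position the path before the first real key
  while(bt.node_next())cout << bt.show_str() << "\n";   //alpha, beta, gamma

  Count ct;                        //per-key long tallies plus a grand total
  ct.add_count("apple",2);
  ct.add_count("apple",3);
  cout << ct.count("apple") << " " << ct.total << "\n"; //prints "5 5"
  return 0;
}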
 
Library/Btree.h CHANGED
@@ -1,547 +1,547 @@
1
- #ifndef BTREE_H
2
- #define BTREE_H
3
-
4
- #define LNEG -100000000
5
-
6
- #include <iostream>
7
- #include <fstream>
8
- using namespace std;
9
- namespace iret {
10
-
11
- const int order = 5; //Half the order of the Btree that we build.
12
- const int height_limit =12; //Limit on the height of the Btree.
13
- const int ord2 = order*2; //The order of the Btree.
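- //(With these settings a page holds at most ord2 = 10 nodes, and the pg[]
- //and cnd[] arrays in Btree below cap the search path at 12 pages.)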
14
-
15
- int stc_my(int &,int &,const char *,const char *); //Function used to compare
16
- //two strings. The first two arguments hold information about how much of the
17
- //strings can be skipped in the comparison.
18
-
19
- class Page; //forward declaration
20
- class Btree; //forward declaration
21
- class Partial_match; //forward declaration
22
-
23
- class Node {
24
- friend int stc_my(int &,int &,const char *,const char *);
25
- friend class Page;
26
- friend class Btree;
27
- friend class List;
28
- friend class Count;
29
- friend class FCount;
30
- friend class DCount;
31
- template<class Z> friend class BCount;
32
- friend class Partial_match;
33
- friend class Thes;
34
- public:
35
- Node(void); //Sets all pointers to NULL.
36
- Node(const char * ); //Argument is the string for this node.
37
- Node(const char * ,void *); //Arguments are first the string and then the
38
- //data pointer.
39
- ~Node();
40
- void debug(); //Prints out the node in simple format.
41
- private:
42
- char *str; //String pointer.
43
- void *rel; //Data pointer.
44
- Page *pdn; //Points down to the page below or to NULL.
45
- };
46
-
47
- class Page {
48
- friend int stc_my(int &,int &,const char *,const char *);
49
- friend class Btree;
50
- friend class Partial_match;
51
- friend class FCount;
52
- friend class DCount;
53
- public:
54
- Page(); //Constructs a new empty page. Only happens at the root.
55
- Page(Page * const pz,Page * const pn,const int n); //Constructs a page that
56
- //holds the right half of a full page. The full page is pointed at by
57
- //pz. The new page's downward pointer is set to pn.
58
- //n tells how much of the full page is to remain or where to begin removal.
59
- ~Page();
60
- void clean(void); //Used to delete without touching search keys in the nodes
61
- //which were created with addp functions and do not belong to the tree.
62
- void insert(const int n,Node * const nd,const int j); //inserts in partially empty
63
- //page. n is insertion point, j is number of nodes on page that are viable.
64
- int search(int &a,int &b,const char *,int &p); //searches for string on
65
- //the page. Returns 1 if found, 0 otherwise. If found p is the index, otherwise
66
- //if p is 0 then the page downward pointer is to next page to search, but if
67
- //p is positive then p-1 is number of node that has the downward pointer to
68
- //next page to search.
69
- int search(int &a,int &b,char *,int &p,Partial_match *btr); //Looks for longest
70
- //partial match.
71
- void debug(); //Prints out the page for debugging purposes.
72
-
73
- private:
74
- char ndnm; //Indicates the number of Nodes on the page.
75
- Page *pdn; //Pointer that points to the page below and also lexically below.
76
- //May be NULL.
77
- Node *pnd[ord2]; //Pointers to the nodes on the page. Some may be NULL.
78
- };
79
-
80
- class Btree {
81
- friend class Page;
82
- public:
83
- Btree(void);
84
- Btree(ifstream &); //Reads in a Btree from the lexical list written out by
85
- //list_write() to disc. Reads the stream to its end and closes it.
86
- Btree( const Btree & btree ) {copy = true; root = btree.root;} // Actually
87
- // creates another reference to the same tree. Take great care to
88
- // avoid simultaneously modifying both copies.
89
- ~Btree(void);
90
- int search(const char *); //Searches for a string and sets the path to that
91
- //string or its insertion point.
92
- int insert(Node *);//Only to be called after a search has failed to find the
93
- //string.
94
- void node_first();//Finds the first node in the tree and sets the path to it.
95
- int node_next(); //Given the path is already set to a node, this function
96
- //finds the next node in lexicographic order.
97
- char *show_str();//Shows the string after a successful search or node_next call.
98
- void *give_ptr();//Used to give the data pointer in the current node.
99
- void set_ptr(void *); //Used to set the data pointer after a call to search
100
- //has found the string.
101
- int add(Node *); //Only to be used to construct a tree from a lexical list
102
- //as written out by list_write();
103
- void next_empty(); //Only used to reset the pointer arrays when the root is
104
- //split. Used in add().
105
- long list_write(ofstream &); //Writes out a lexical list of the strings in
106
- //the tree.
107
- int iclean; //Default 0, but set to 1 to have the destructor run without
108
- //touching key strings (if addp used in making tree).
109
- protected:
110
- int depth; //Tells the depth in the tree that marks the current location.
111
- Page *root; //Points at the root page of the tree.
112
- Page *pg[height_limit]; //Descending list of pointers that mark the pages.
113
- int cnd[height_limit]; //Mark the positions of the nodes just above the
114
- //downward page pointer at each level. Thus 0 marks the page's downward
115
- //pointer, but a nonzero value must have 1 subtracted and then it gives
116
- //the node whose downward pointer is the correct downward pointer.
117
- bool copy; //flags copies of a tree with true.
118
- };
119
-
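- //Typical use (an illustrative sketch):
- //  Btree bt;
- //  if(!bt.search("key"))bt.insert(new Node("key",new long(7)));
- //  else *(long*)bt.give_ptr()+=7;  //search left the path on "key"
- //  bt.node_first();                //then walk the keys in order:
- //  while(bt.node_next())cout << bt.show_str() << endl;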
120
- class List : public Btree {
121
- public:
122
- List();
123
- List(const List & list) : Btree(list) {}
124
- ~List();
125
- void add_key(const char *str); //Adds the string *str to the tree if not already in list
126
- void add_key_count(const char *str); //Adds the string *str to the tree if
127
- //not already in list and counts it.
128
- void addp_key_count(char *str); //Adds the string *str to the tree if
129
- //not already in list and counts it. Uses the actual string pointer instead
130
- //of making a copy
131
- long cnt_key; //Used to count the number of keys.
132
- };
133
-
134
- class Count : public List {
135
- public:
136
- Count();
137
- Count(const Count & Ct) : List(Ct){}
138
- ~Count();
139
- void add_count(const char *str,long n); //Adds the string *str with its count
140
- //to the tree if not already in list. String is key and count is data.
141
- //If string is already a key the count is incremented by n.
142
- void add_countz(const char *str,long n); //Adds the string *str with its count
143
- //just as add_count, but also counts number of unique keys in count.
144
- //Does not add count to the total variable, unlike add_count2.
145
- void add_count2(const char *str,long n); //Adds the string *str with its count
146
- //just as add_count, but also counts number of unique keys in count.
147
- void addp_count2(char *str,long n); //Adds the string *str with its count
148
- //just as add_count, but also counts number of unique keys in count.
149
- //Does not make copy of string, but uses the pointer str as key pointer.
150
- void correct(const char *str,long n); //If str is in the tree the count is
151
- //changed to n. Otherwise nothing is done.
152
-
153
- //Functions for maximum calculation
154
- void max_count(const char *str,long n); //Adds the string *str with its count
155
- //to the tree if not already in list. String is key and count is data.
156
- //If string is already a key the count is max of n and prior value.
157
- void max_count2(const char *str,long n); //Adds the string *str with its count
158
- //just as max_count, but also counts number of unique keys in count.
159
- void maxp_count2(char *str,long n); //Adds the string *str with its count
160
- //just as max_count, but also counts number of unique keys in count.
161
- //Does not make copy of string, but uses the pointer str as key pointer.
162
-
163
  //Functions for minimum calculation
  void min_count(const char *str,long n); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is the min of n and the prior value.
  void min_count2(const char *str,long n); //Adds the string *str with its count
    //just as min_count, but also counts the number of unique keys in cnt_key.
  void minp_count2(char *str,long n); //Adds the string *str with its count
    //just as min_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.

  long count(const char *str); //Returns the count if str is a key (in the list),
    //otherwise returns 0.
  long count(void); //Returns the count of the current string. Assumes the
    //pointers have already been set by a search or node_next call.
  long total; //Holds the total of all counts added for all keys.
};

class FCount : public List {
public:
  FCount();
  FCount(const FCount &Ct) : List(Ct){}
  ~FCount();
  void Copy(FCount &Dc); //Makes a copy of the tree Dc in the current tree.
  void add_count(const char *str,float z); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is incremented by z.
  void add_count2(const char *str,float z); //Adds the string *str with its count
    //just as add_count, but also counts the number of unique keys in cnt_key.
  void addp_count2(char *str,float z); //Adds the string *str with its count
    //just as add_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.
  float count(const char *str); //Returns the count if str is a key (in the list),
    //otherwise returns 0.
  float count(void); //Returns the count of the current string. Assumes the
    //pointers have already been set by a search or node_next call.
  float total; //Holds the total of all counts added for all keys.
};

class DCount : public List {
public:
  DCount();
  DCount(const DCount &Ct) : List(Ct){}
  ~DCount();
  void Copy(DCount &Dc); //Makes a copy of the tree Dc in the current tree.
  void add_count(const char *str,double z); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is incremented by z.
  void add_count2(const char *str,double z); //Adds the string *str with its count
    //just as add_count, but also counts the number of unique keys in cnt_key.
  void addp_count2(char *str,double z); //Adds the string *str with its count
    //just as add_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.
  double count(const char *str); //Returns the count if str is a key (in the list),
    //otherwise returns 0.
  double count(void); //Returns the count of the current string. Assumes the
    //pointers have already been set by a search or node_next call.

  //Functions for maximum calculation
  void max_count(const char *str,double z); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is the max of z and the prior value.
  void max_count2(const char *str,double z); //Adds the string *str with its count
    //just as max_count, but also counts the number of unique keys in cnt_key.
  void maxp_count2(char *str,double z); //Adds the string *str with its count
    //just as max_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.

  //Functions for minimum calculation
  void min_count(const char *str,double z); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is the min of z and the prior value.
  void min_count2(const char *str,double z); //Adds the string *str with its count
    //just as min_count, but also counts the number of unique keys in cnt_key.
  void minp_count2(char *str,double z); //Adds the string *str with its count
    //just as min_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.

  void debug(void); //Prints to stdout a list "i str[i]"
  double total; //Holds the total of all counts added for all keys.
};

class Partial_match : public Count {
  friend class Page;
public:
  Partial_match();
  Partial_match(const Partial_match &Par_mat) : Count(Par_mat){}
  ~Partial_match();
  void long_match(char *,List &); //Finds the longest matches for all word
    //starts in the string and adds them to the list.
  void local_match(char *,List &); //Finds all matches that start at the
    //beginning of the string and adds them to the list.
  void all_match(char *,List &); //Finds all matches within the string and
    //adds them to the list.
  void long_match(char *,Count &,long n); //Finds the longest matches for all word
    //starts in the string and adds them to the list in Count.
  void local_match(char *,Count &,long n); //Finds all matches that start at the
    //beginning of the string and adds them to the list in Count.
  void all_match(char *,Count &,long n); //Finds all matches within the string and
    //adds them to the list in Count.
  int search_long(char *); //Searches for the longest partial match to an initial
    //segment of a string that ends at a word boundary and
    //sets the path to that string or its insertion point.

private:
  int stc_my_long(int &,int &,char *,const char *,int); //Function used to compare
    //two strings. The first two arguments hold information about how much of the
    //string can be ignored in the comparison. The last argument holds the index
    //or number of the string's node on the page.
  int step_one(int &,int &,char *); //Looks for a partial or complete match and
    //returns 1 if a complete match is found. A partial match is reflected in the parameters.

  //Special parameters used in partial matching.
  int depth_o; //Depth of the longest partial match thus far.
  int index_o; //Index of the longest partial match thus far.
  int cln_o; //String length of the longest partial match thus far.
  int len; //Length of the query string.
  int cln; //Current null position in the string.
};

class Str_str : public Btree {
public:
  Str_str();
  Str_str(const Str_str &Stst) : Btree(Stst){}
  ~Str_str();
  void add_pair(const char *one,const char *two); //Adds the string *one to the tree and stores
    //the string *two at that node.
  char *match(const char *one); //Returns a pointer to the string stored under string *one.
};

class Num_num : public Btree {
public:
  Num_num();
  Num_num(const Num_num &Nmnm) : Btree(Nmnm){}
  ~Num_num();
  void add_pair(long i, long j); //Adds the string for i to the tree and
    //stores the number j at that node.
  long match(long i); //Returns the number stored under the string for i.
};

template<class Z>
class BCount : public List {
public:
  BCount();
  BCount(const BCount<Z> &Ct) : List(Ct){}
  ~BCount();
  void add_count(const char *str,Z n); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is incremented by n.
  void add_count2(const char *str,Z n); //Adds the string *str with its count
    //just as add_count, but also counts the number of unique keys in cnt_key.
  void addp_count2(char *str,Z n); //Adds the string *str with its count
    //just as add_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.
  void correct(const char *str,Z n); //If str is in the tree the count is
    //changed to n. Otherwise nothing is done.

  //Functions for maximum calculation
  void max_count(const char *str,Z n); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is the max of n and the prior value.
  void max_count2(const char *str,Z n); //Adds the string *str with its count
    //just as max_count, but also counts the number of unique keys in cnt_key.
  void maxp_count2(char *str,Z n); //Adds the string *str with its count
    //just as max_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.

  //Functions for minimum calculation
  void min_count(const char *str,Z n); //Adds the string *str with its count
    //to the tree if not already in the list. String is key and count is data.
    //If the string is already a key the count is the min of n and the prior value.
  void min_count2(const char *str,Z n); //Adds the string *str with its count
    //just as min_count, but also counts the number of unique keys in cnt_key.
  void minp_count2(char *str,Z n); //Adds the string *str with its count
    //just as min_count, but also counts the number of unique keys in cnt_key.
    //Does not make a copy of the string, but uses the pointer str as the key pointer.

  Z count(const char *str); //Returns the count if str is a key (in the list),
    //otherwise returns 0.
  Z count(void); //Returns the count of the current string. Assumes the
    //pointers have already been set by a search or node_next call.
  Z total; //Holds the total of all counts added for all keys.
};

template<class Z>
BCount<Z>::BCount() : List() {
  total=0;
}

template<class Z>
BCount<Z>::~BCount(){
  if(copy)return;
  Z *pk;
  this->node_first();
  while(this->node_next()){
    pk=(Z *)(this->give_ptr());
    if(pk)delete pk;
  }
}

template<class Z>
void BCount<Z>::add_count(const char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(this->search(pch)==0){
    ppt=new Z;
    (*ppt)=n;
    np=new Node(pch,(void*)ppt);
    this->insert(np);
  }
  else {
    (*(Z *)this->give_ptr())+=n;
  }
}

template<class Z>
void BCount<Z>::add_count2(const char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(this->search(pch)==0){
    ppt=new Z;
    (*ppt)=n;
    np=new Node(pch,(void*)ppt);
    this->insert(np);
    cnt_key++;
  }
  else {
    (*(Z *)this->give_ptr())+=n;
  }
}

template<class Z>
void BCount<Z>::addp_count2(char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(this->search(pch)==0){
    ppt=new Z;
    (*ppt)=n;
    np=new Node;
    np->str=pch;
    np->rel=ppt;
    this->insert(np);
    cnt_key++;
  }
  else {
    (*(Z *)this->give_ptr())+=n;
  }
}

template<class Z>
void BCount<Z>::correct(const char *pch,Z n){
  if(this->search(pch)){
    (*(Z *)this->give_ptr())=n;
  }
}

template<class Z>
Z BCount<Z>::count(const char *pch){
  if(this->search(pch)==0){
    return(0);
  }
  else {
    return(*((Z *)this->give_ptr()));
  }
}

template<class Z>
Z BCount<Z>::count(void){
  return(*((Z *)this->give_ptr()));
}

template<class Z>
void BCount<Z>::max_count(const char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(!search(pch)){
    ppt=new Z;
    (*ppt)=n;
    np=new Node(pch,(void*)ppt);
    this->insert(np);
  }
  else {
    ppt=(Z *)give_ptr();
    if(*ppt<n)*ppt=n;
  }
}

template<class Z>
void BCount<Z>::max_count2(const char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(!search(pch)){
    ppt=new Z;
    (*ppt)=n;
    np=new Node(pch,(void*)ppt);
    this->insert(np);
    cnt_key++;
  }
  else {
    ppt=(Z *)give_ptr();
    if(*ppt<n)*ppt=n;
  }
}

template<class Z>
void BCount<Z>::maxp_count2(char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(!search(pch)){
    ppt=new Z;
    (*ppt)=n;
    np=new Node;
    np->str=pch;
    np->rel=ppt;
    this->insert(np);
    cnt_key++;
  }
  else {
    ppt=(Z *)give_ptr();
    if(*ppt<n)*ppt=n;
  }
}

template<class Z>
void BCount<Z>::min_count(const char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(!search(pch)){
    ppt=new Z;
    (*ppt)=n;
    np=new Node(pch,(void*)ppt);
    this->insert(np);
  }
  else {
    ppt=(Z *)give_ptr();
    if(*ppt>n)*ppt=n;
  }
}

template<class Z>
void BCount<Z>::min_count2(const char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(!search(pch)){
    ppt=new Z;
    (*ppt)=n;
    np=new Node(pch,(void*)ppt);
    this->insert(np);
    cnt_key++;
  }
  else {
    ppt=(Z *)give_ptr();
    if(*ppt>n)*ppt=n;
  }
}

template<class Z>
void BCount<Z>::minp_count2(char *pch,Z n){
  Z *ppt;
  Node *np;
  total+=n;
  if(!search(pch)){
    ppt=new Z;
    (*ppt)=n;
    np=new Node;
    np->str=pch;
    np->rel=ppt;
    this->insert(np);
    cnt_key++;
  }
  else {
    ppt=(Z *)give_ptr();
    if(*ppt>n)*ppt=n;
  }
}

}
#endif
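
The header above is used throughout this library for string-keyed lookup and counting. As a quick orientation (an illustrative sketch, not part of the upstream sources), a typical pattern is to search first and insert only on a miss, exactly as the BCount template functions above do internally; the key "kinase" is only an example value:

  #include "Btree.h"
  using namespace iret;

  int main(){
    BCount<long> bc;            //B-tree of C-string keys with long counts
    bc.add_count2("kinase",1);  //first call inserts the key and bumps cnt_key
    bc.add_count2("kinase",2);  //later calls just add to the stored count
    bc.node_first();            //walk the keys in lexicographic order
    while(bc.node_next()){
      cout << bc.show_str() << " " << bc.count() << endl; //prints "kinase 3"
    }
    return 0;                   //bc.total holds 3, the sum of all added counts
  }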
Library/FBase.C CHANGED
@@ -1,600 +1,600 @@
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <iomanip>
#include <cstring>
#include <cmath>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "runn.h"
#include "FBase.h"

using namespace std;
namespace iret {

//Constructs from a type string and a name string; the directory is later
//resolved through the "path" file convention (eflag=1).
FBase::FBase(const char *typ,const char *nam){
  int lxn=strlen(typ);
  type=new char[lxn+1];
  tpnm=-1;
  nmnm=-1;
  strcpy(type,typ);
  lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
  cflag=0;
  oflag=0;
  pflag=get_qflag();
  eflag=1;
}

//Same, but with a type number tpn that is appended to the type string when
//file names are built.
FBase::FBase(const char *typ,int tpn,const char *nam){
  int lxn=strlen(typ);
  type=new char[lxn+1];
  tpnm=tpn;
  nmnm=-1;
  strcpy(type,typ);
  lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
  cflag=0;
  oflag=0;
  pflag=get_qflag();
  eflag=1;
}

//Same, but with an explicit path argument pt: a leading ':' means the rest of
//pt is used directly as the path; otherwise pt names a path file to consult.
FBase::FBase(const char *typ,const char *nam,const char *pt){
  int lxn=strlen(typ);
  type=new char[lxn+1];
  tpnm=-1;
  nmnm=-1;
  strcpy(type,typ);
  lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
  cflag=0;
  oflag=0;
  pflag=get_qflag();
  if(*pt!=':')set_path_name(pt);
  else set_path_internal(pt+1);
}
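
//Aside (not part of the original source): a minimal sketch of the three
//construction modes above; the type/name values "Ab3P" and "prec" are only
//illustrative.
//
//  FBase a("Ab3P","prec");          //directory resolved via path files (eflag=1)
//  FBase b("Ab3P",2,"prec");        //type number 2 appended to "Ab3P"
//  FBase c("Ab3P","prec",":/tmp/"); //leading ':' gives the directory /tmp/ directly
//  FBase d("Ab3P","prec","myset");  //consults the file "path_myset" first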

FBase::~FBase(void){
  delete [] type;
  delete [] name;
}

void FBase::set_type_num(int tn){tpnm=tn;}

void FBase::set_name_num(int nn){nmnm=nn;}

void FBase::change_type(const char *typ){
  if(type!=NULL)delete [] type;
  int lxn=strlen(typ);
  type=new char[lxn+1];
  strcpy(type,typ);
}

void FBase::change_name(const char *nam){
  if(name!=NULL)delete [] name;
  int lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
}

void FBase::set_name(const char *nam){
  if(name!=NULL)delete [] name;
  int lxn=strlen(nam);
  name=new char[lxn+1];
  strcpy(name,nam);
}

void FBase::subname(const char *tph,const char *tpl,const char *nm){
  char cnam[max_str];
  long i=strlen(tpl);
  strcpy(cnam,tpl);
  cnam[i]='_';
  cnam[i+1]='\0';
  strcat(cnam,nm);
  change_type(tph);
  change_name(cnam);
}

void FBase::set_path_internal(const char *pt){
  long len;
  if(pt&&(len=strlen(pt))){
    eflag=0;
    path=new char[len+1];
    strcpy(path,pt);
  }
  else eflag=1;
}

void FBase::set_path_name(const char *pa){
  long len;
  if(pa&&(len=strlen(pa))){
    eflag=2;
    pnam=new char[len+1];
    strcpy(pnam,pa);
  }
  else eflag=1;
}

void FBase::map_down(FBase *pFb){
  pFb->change_type(type);
  pFb->change_name(name);
  pFb->set_type_num(tpnm);
  pFb->set_name_num(nmnm);
  pFb->pflag=pflag;
  if(eflag==2)pFb->set_path_name(pnam);
  else if(!eflag)pFb->set_path_internal(path);
}

void FBase::map_down_sub(FBase *pFb,const char *subtype){
  pFb->subname(type,name,subtype);
  pFb->set_type_num(tpnm);
  pFb->set_name_num(nmnm);
  pFb->pflag=pflag;
  if(eflag==2)pFb->set_path_name(pnam);
  else if(!eflag)pFb->set_path_internal(path);
}

//Builds the full file name "<dir><type>_<name>.<ch>" in nam. The directory
//part comes from the internal path (eflag==0) or, otherwise, from the first
//line of the first path file that opens among progressively more generic
//candidates.
void FBase::get_pathx(char *nam,const char *ch){
  char cnam[256];
  ifstream fin;

  if(eflag==2){
    strcpy(cnam,"path_");
    strcat(cnam,pnam);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path");
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        cout << "Path file for type " << type << " does not exist!" << endl;
        exit(0);
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else if(eflag){
    strcpy(cnam,"path_");
    strcat(cnam,type);
    strcat(cnam,"_");
    strcat(cnam,name);
    strcat(cnam,".");
    strcat(cnam,ch);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path_");
      strcat(cnam,type);
      strcat(cnam,"_");
      strcat(cnam,name);
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        fin.clear();
        strcpy(cnam,"path_");
        strcat(cnam,type);
        fin.open(cnam,ios::in);
        if(!fin.is_open()){
          fin.clear();
          strcpy(cnam,"path");
          fin.open(cnam,ios::in);
          if(!fin.is_open()){
            cout << "Path file for type " << type << " does not exist!" << endl;
            exit(0);
          }
        }
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else {
    strcpy(nam,path);
  }

  if(tpnm<0)strcat(nam,type);
  else cat_num(type,tpnm,nam);
  strcat(nam,"_");
  if(nmnm<0)strcat(nam,name);
  else cat_num(name,nmnm,nam);
  strcat(nam,".");
  strcat(nam,ch);
}

//Same as above, but the number n is appended to the name part of the file name.
void FBase::get_pathx(char *nam,long n,const char *ch){
  char cnam[256],bnam[256];
  ifstream fin;

  if(eflag==2){
    strcpy(cnam,"path_");
    strcat(cnam,pnam);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path");
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        cout << "Path file for type " << type << " does not exist!" << endl;
        exit(0);
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else if(eflag){
    strcpy(cnam,"path_");
    strcat(cnam,type);
    strcat(cnam,"_");
    strcat(cnam,name);
    strcat(cnam,".");
    strcat(cnam,ch);
    fin.open(cnam,ios::in);
    if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path_");
      strcat(cnam,type);
      strcat(cnam,"_");
      strcat(cnam,name);
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
        fin.clear();
        strcpy(cnam,"path_");
        strcat(cnam,type);
        fin.open(cnam,ios::in);
        if(!fin.is_open()){
          fin.clear();
          strcpy(cnam,"path");
          fin.open(cnam,ios::in);
          if(!fin.is_open()){
            cout << "Path file for type " << type << " does not exist!" << endl;
            exit(0);
          }
        }
      }
    }
    fin.getline(nam,256);
    fin.close();
  }
  else {
    strcpy(nam,path);
  }

  if(tpnm<0)strcat(nam,type);
  else cat_num(type,tpnm,nam);
  strcat(nam,"_");
  strcat(nam,add_num(name,n,bnam));
  strcat(nam,".");
  strcat(nam,ch);
}
+
277
+ char *FBase::add_num(const char *ptr,long n,char *buf){
278
+ char cnam[100];
279
+ long_str(cnam,n);
280
+ strcpy(buf,ptr);
281
+ strcat(buf,cnam);
282
+ return(buf);
283
+ }
284
+
285
+ char *FBase::cat_num(const char *ptr,long n,char *buf){
286
+ char cnam[100];
287
+ long_str(cnam,n);
288
+ strcat(buf,ptr);
289
+ strcat(buf,cnam);
290
+ return(buf);
291
+ }
292
+
293
+ int FBase::Gcom(int sflag){
294
+ if((cflag&sflag)&&!(oflag&sflag)){
295
+ oflag=oflag|sflag;
296
+ return(1);
297
+ }
298
+ else return(0);
299
+ }
300
+
301
+ int FBase::Rcom(int sflag){
302
+ if((cflag&sflag)&&(oflag&sflag)){
303
+ oflag=oflag&(~sflag);
304
+ return(1);
305
+ }
306
+ else return(0);
307
+ }
308
+
309
+ ifstream *FBase::get_Istr(const char *a,ios::openmode mode){
310
+ char cnam[max_str];
311
+ get_pathx(cnam,a);
312
+ ifstream *pfin=new ifstream(cnam,mode);
313
+ if(pfin->is_open())return(pfin);
314
+ else {
315
+ cout << "Error: " << cnam << " failed to open!" << endl;
316
+ exit(0);
317
+ }
318
+ }
319
+
320
+ ofstream *FBase::get_Ostr(const char *a,ios::openmode mode){
321
+ char cnam[max_str];
322
+ get_pathx(cnam,a);
323
+ ofstream *pfout=new ofstream(cnam,mode);
324
+ if(pfout->is_open())return(pfout);
325
+ else {
326
+ cout << "Error: " << cnam << " failed to open!" << endl;
327
+ exit(0);
328
+ }
329
+ }
330
+
331
+ fstream *FBase::get_Fstr(const char *a,ios::openmode mode){
332
+ char cnam[max_str];
333
+ get_pathx(cnam,a);
334
+ fstream *pfstr=new fstream(cnam,mode);
335
+ if(pfstr->is_open())return(pfstr);
336
+ else {
337
+ cout << "Error: " << cnam << " failed to open!" << endl;
338
+ exit(0);
339
+ }
340
+ }
341
+
342
+ ifstream *FBase::get_Istr(long n,const char *a,ios::openmode mode){
343
+ char cnam[max_str];
344
+ get_pathx(cnam,n,a);
345
+ ifstream *pfin=new ifstream(cnam,mode);
346
+ if(pfin->is_open())return(pfin);
347
+ else {
348
+ cout << "Error: " << cnam << " failed to open!" << endl;
349
+ exit(0);
350
+ }
351
+ }
352
+
353
+ ofstream *FBase::get_Ostr(long n,const char *a,ios::openmode mode){
354
+ char cnam[max_str];
355
+ get_pathx(cnam,n,a);
356
+ ofstream *pfout=new ofstream(cnam,mode);
357
+ if(pfout->is_open())return(pfout);
358
+ else {
359
+ cout << "Error: " << cnam << " failed to open!" << endl;
360
+ exit(0);
361
+ }
362
+ }
363
+
364
+ fstream *FBase::get_Fstr(long n,const char *a,ios::openmode mode){
365
+ char cnam[max_str];
366
+ get_pathx(cnam,n,a);
367
+ fstream *pfstr=new fstream(cnam,mode);
368
+ if(pfstr->is_open())return(pfstr);
369
+ else {
370
+ cout << "Error: " << cnam << " failed to open!" << endl;
371
+ exit(0);
372
+ }
373
+ }
374
+
375
+ void FBase::dst_Istr(ifstream *pfin){
376
+ if(!pfin)return;
377
+ if(!pfin->is_open()){
378
+ cout << "File not open!" << endl;
379
+ exit(0);
380
+ }
381
+ delete pfin;
382
+ }
383
+
384
+ void FBase::dst_Ostr(ofstream *pfout){
385
+ if(!pfout)return;
386
+ if(!pfout->is_open()){
387
+ cout << "File not open!" << endl;
388
+ exit(0);
389
+ }
390
+ delete pfout;
391
+ }
392
+
393
+ void FBase::dst_Fstr(fstream *pfstr){
394
+ if(!pfstr)return;
395
+ if(!pfstr->is_open()){
396
+ cout << "File not open!" << endl;
397
+ exit(0);
398
+ }
399
+ delete pfstr;
400
+ }
401
+
402
+ long FBase::get_Fsiz(const char *a){
403
+ if(!Exists(a))return(0);
404
+ int fld;
405
+ struct stat datf;
406
+ char cnam[max_str];
407
+ get_pathx(cnam,a);
408
+ fld=::open(cnam,O_RDONLY);
409
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
410
+ if(fstat(fld,&datf)){cout << cnam << " failed on size \
411
+ determination" << endl;exit(0);}
412
+ ::close(fld);
413
+ return(datf.st_size);
414
+ }
415
+
416
+ long FBase::get_Fsiz(long n,const char *a){
417
+ if(!Exists(n,a))return(0);
418
+ int fld;
419
+ struct stat datf;
420
+ char cnam[max_str];
421
+ get_pathx(cnam,n,a);
422
+ fld=::open(cnam,O_RDONLY);
423
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
424
+ if(fstat(fld,&datf)){cout << cnam << " failed on size \
425
+ determination" << endl;exit(0);}
426
+ ::close(fld);
427
+ return(datf.st_size);
428
+ }
429
+
430
+ char *FBase::get_Read(const char *a){
431
+ int fld;
432
+ struct stat datf;
433
+ char cnam[max_str];
434
+ get_pathx(cnam,a);
435
+ fld=::open(cnam,O_RDONLY);
436
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
437
+ if(fstat(fld,&datf)){cout << cnam << " failed on size \
438
+ determination" << endl;exit(0);}
439
+ ::close(fld);
440
+ char *ptr=new char[datf.st_size];
441
+ ifstream fin(cnam,ios::in);
442
+ if(!fin.is_open()){
443
+ cout << "Error: " << cnam << " failed to open!" << endl;
444
+ exit(0);
445
+ }
446
+ fin.read(ptr,datf.st_size);
447
+ return(ptr);
448
+ }
449
+
450
+ char *FBase::get_Read(long n,const char *a){
451
+ int fld;
452
+ struct stat datf;
453
+ char cnam[max_str];
454
+ get_pathx(cnam,n,a);
455
+ fld=::open(cnam,O_RDONLY);
456
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
457
+ if(fstat(fld,&datf)){cout << cnam << " failed on size \
458
+ determination" << endl;exit(0);}
459
+ ::close(fld);
460
+ char *ptr=new char[datf.st_size];
461
+ ifstream fin(cnam,ios::in);
462
+ if(!fin.is_open()){
463
+ cout << "Error: " << cnam << " failed to open!" << endl;
464
+ exit(0);
465
+ }
466
+ fin.read(ptr,datf.st_size);
467
+ return(ptr);
468
+ }
469
+
470
+ char *FBase::get_Mmap(const char *a){
471
+ int fld;
472
+ struct stat datf;
473
+ char cnam[max_str];
474
+ get_pathx(cnam,a);
475
+ fld=::open(cnam,O_RDONLY);
476
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
477
+ if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
478
+ char *ptr=(char*)mmap(0,datf.st_size,PROT_READ,MAP_PRIVATE|MAP_NORESERVE,fld,0);
479
+ if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
480
+ ::close(fld);
481
+ return(ptr);
482
+ }
483
+
484
+ char *FBase::get_Mmap(long n,const char *a){
485
+ int fld;
486
+ struct stat datf;
487
+ char cnam[max_str];
488
+ get_pathx(cnam,n,a);
489
+ fld=::open(cnam,O_RDONLY);
490
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
491
+ if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
492
+ char *ptr=(char*)mmap(0,datf.st_size,PROT_READ,MAP_PRIVATE|MAP_NORESERVE,fld,0);
493
+ if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
494
+ ::close(fld);
495
+ return(ptr);
496
+ }
497
+
498
+ char *FBase::get_Wmap(const char *a){
499
+ int fld;
500
+ struct stat datf;
501
+ char cnam[max_str];
502
+ get_pathx(cnam,a);
503
+ fld=::open(cnam,O_RDWR);
504
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
505
+ if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
506
+ char *ptr=(char*)mmap(0,datf.st_size,PROT_READ|PROT_WRITE,MAP_SHARED,fld,0);
507
+ if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
508
+ ::close(fld);
509
+ return(ptr);
510
+ }
511
+
512
+ char *FBase::get_Wmap(long n,const char *a){
513
+ int fld;
514
+ struct stat datf;
515
+ char cnam[max_str];
516
+ get_pathx(cnam,n,a);
517
+ fld=::open(cnam,O_RDWR);
518
+ if(fld<=0){cout << cnam << " failed to open" << endl;exit(0);}
519
+ if(fstat(fld,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
520
+ char *ptr=(char*)mmap(0,datf.st_size,PROT_READ|PROT_WRITE,MAP_SHARED,fld,0);
521
+ if(ptr==MAP_FAILED){cout << cnam << " failed to map" << endl;exit(0);}
522
+ ::close(fld);
523
+ return(ptr);
524
+ }
525
+
526
+ void FBase::dst_Mmap(const char *a,char *ptr){
527
+ struct stat datf;
528
+ char cnam[max_str];
529
+ if(ptr==NULL){cout << "NULL pointer" << endl;return;}
530
+ get_pathx(cnam,a);
531
+ if(stat(cnam,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
532
+ if(munmap(ptr,datf.st_size)){cout << cnam << " failed to unmap" << endl;exit(0);}
533
+ ptr=NULL;
534
+ }
535
+
536
+ void FBase::dst_Mmap(long n,const char *a,char *ptr){
537
+ struct stat datf;
538
+ char cnam[max_str];
539
+ if(ptr==NULL){cout << "NULL pointer" << endl;return;}
540
+ get_pathx(cnam,n,a);
541
+ if(stat(cnam,&datf)){cout << cnam << " failed on size determination" << endl;exit(0);}
542
+ if(munmap(ptr,datf.st_size)){cout << cnam << " failed to unmap" << endl;exit(0);}
543
+ ptr=NULL;
544
+ }
545
+
546
+ void FBase::bin_Writ(const char *a,long nm,char *ptr){
547
+ ofstream *pfout=get_Ostr(a,ios::out);
548
+ long k=100000,i=0;
549
+ while(i+k<nm){
550
+ pfout->write((char*)ptr,k);
551
+ i+=k;
552
+ ptr=ptr+k;
553
+ }
554
+ pfout->write((char*)ptr,nm-i);
555
+ pfout->close();
556
+ delete pfout;
557
+ }
558
+
559
+ void FBase::bin_Writ(long n,const char *a,long nm,char *ptr){
560
+ ofstream *pfout=get_Ostr(n,a,ios::out);
561
+ long k=100000,i=0;
562
+ while(i+k<nm){
563
+ pfout->write((char*)ptr,k);
564
+ i+=k;
565
+ ptr=ptr+k;
566
+ }
567
+ pfout->write((char*)ptr,nm-i);
568
+ pfout->close();
569
+ delete pfout;
570
+ }
571
+
572
+ int FBase::Exists(const char *a){
573
+ char cnam[max_str];
574
+ get_pathx(cnam,a);
575
+ ifstream fin(cnam,ios::in);
576
+ if(fin.is_open()){
577
+ fin.close();
578
+ return(1);
579
+ }
580
+ else return(0);
581
+ }
582
+
583
+ int FBase::Exists(long n,const char *a){
584
+ char cnam[max_str];
585
+ get_pathx(cnam,n,a);
586
+ ifstream fin(cnam,ios::in);
587
+ if(fin.is_open()){
588
+ fin.close();
589
+ return(1);
590
+ }
591
+ else return(0);
592
+ }
593
+
594
+ void FBase::mark(long ct, int ivl, const char *what){
595
+ if(pflag&&(ct%ivl==0)){
596
+ cout << what << " count=" << ct << endl;
597
+ }
598
+ }
599
+
600
+ }
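
For orientation, here is a minimal usage sketch of the path composition implemented by get_pathx above. The type/name pair "demo"/"test" and the path file are hypothetical; get_pathx looks for path_demo_test.str, then path_demo_test, then path_demo, then path in the working directory, and exits if none exists, so a one-line path file must be present.

    // build: compile and link against the Library sources (FBase.C, runn.C)
    // assumes a file "path_demo_test" (or just "path") whose first line is e.g. "/tmp/"
    #include <iostream>
    #include "FBase.h"
    using namespace std;

    int main(){
      iret::FBase fb("demo","test"); //type "demo", name "test"
      char full[512];                //receives path plus composed file name
      fb.get_pathx(full,"str");      //composes "<path>demo_test.str"
      cout << full << endl;          //prints e.g. /tmp/demo_test.str
      return 0;
    }
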
Library/FBase.h CHANGED
@@ -1,248 +1,248 @@
#ifndef FBASE_H
#define FBASE_H

#include <iostream>
#include <fstream>

using namespace std;
namespace iret {

typedef char *pChr;

class FBase {
public:
  FBase(const char *tp,const char *nm); //tp is the type name, nm is the name
  FBase(const char *tp,int tn,const char *nm); //tp is the type name; if
    //nonnegative, tn is appended to the end of tp; nm is the name
  FBase(const char *tp,const char *nm,const char *pt); //tp is the type name, nm
    //is the name; pt points at a string sss and the path is read from the file
    //path_sss in the current directory. But if sss begins with ':' this
    //character is skipped and the remaining string is the path itself.
  ~FBase();
  void set_type_num(int tn); //Sets tpnm; if nonnegative it is appended
    //to the end of the type name
  void set_name_num(int nn); //Sets nmnm; if nonnegative it is appended
    //to the end of the name
  void change_type(const char *nm); //Allows change of the type string for the class.
  void change_name(const char *nm); //Allows change of the name string for the class.
  void set_name(const char *nm); //Allows change of the name string for the class.
    //Included for compatibility
  void subname(const char *tph,const char *tpl,const char *nm); //Uses the
    //higher level type tph as the type and combines the lower level tpl_nm
    //with the name, to allow one to keep track of file types.
  void set_path_internal(const char *pt); //The path is external by default,
    //with eflag=1. But if this function is called with a nonempty string, then
    //eflag=0 and pt is stored in path and used for access to data.
  void set_path_name(const char *pa); //The path will be extracted from path_pa
    //and eflag=2. Naming conventions for files are unchanged.
  void map_down(FBase *pFb); //Maps naming parameters to the class instance pFb
  void map_down_sub(FBase *pFb,const char *subtype); //Maps naming parameters to
    //the class instance pFb; combines subtype with the name to make a new name
    //for pFb, and the type becomes its type

  //Path access functions
  void get_pathx(char *cn,const char *a);
    //Reads the path from a file "path_(*type)_(*name)" (falling back to
    //"path_(*type)" and then "path") and constructs the file name as
    //"(*type)_(*name).(*a)". Cats the path and file name and returns the
    //full string in cn.
  void get_pathx(char *cn,long n,const char *a);
  char *add_num(const char *ptr,long n,char *buf); //converts a long to ascii,
    //cats it to the end of the string, and returns a pointer to the new
    //string. Does not change the input string. The new string is held in the
    //buffer space, which is overwritten at each call.
  char *cat_num(const char *ptr,long n,char *buf); //converts a long to ascii,
    //cats it to the end of the ptr string, and then cats the result to the end
    //of whatever is in the buffer. Does not change the input string. The new
    //string is held in the buffer space.

  //Stream object pointers
  ifstream *get_Istr(const char *a,ios::openmode m=ios::in);
    //Opens an input file stream by path and name composition.
  ofstream *get_Ostr(const char *a,ios::openmode m=ios::out);
    //Opens an output file stream by path and name composition.
  fstream *get_Fstr(const char *a,ios::openmode m=ios::in|ios::out);
    //Opens a read/write file stream by path and name composition.
  ifstream *get_Istr(long n,const char *a,ios::openmode m=ios::in);
  ofstream *get_Ostr(long n,const char *a,ios::openmode m=ios::out);
  fstream *get_Fstr(long n,const char *a,ios::openmode m=ios::in|ios::out);
  void dst_Istr(ifstream *pfin);
  void dst_Ostr(ofstream *pfout);
  void dst_Fstr(fstream *pfstr);

  //Get file size in bytes
  long get_Fsiz(const char *a);
  long get_Fsiz(long n,const char *a);

  //File existence
  int Exists(const char *a); //returns 1 if the file exists
  int Exists(long n,const char *a); //returns 1 if the file exists

  //Read in array pointers
  char *get_Read(const char *a);
    //Reads a file into a char array and returns the pointer
  char *get_Read(long n,const char *a);

  //Memory map pointers
  char *get_Mmap(const char *a);
    //Memory maps the file by path and name composition.
  char *get_Mmap(long n,const char *a);
  char *get_Wmap(const char *a);
    //Memory maps the file by path and name composition.
    //The contents may be modified; changes are written out when dst_Mmap is called
  char *get_Wmap(long n,const char *a);
    //The contents may be modified; changes are written out when dst_Mmap is called
  void dst_Mmap(const char *a,char *ptr);
    //Removes the memory map for ptr based on path and name composition.
  void dst_Mmap(long n,const char *a,char *ptr);

  //Array of chars and binary write
  void bin_Writ(const char *a,long nm,char *ptr);
    //Writes out nm bytes in binary
  void bin_Writ(long n,const char *a,long nm,char *ptr);

  //Write and read 1, 2, or 3 numeric values to or from a file
  template <typename X>
  void get_Nnum(const char *a,X &m1);
  template <typename X,typename Y>
  void get_Nnum(const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void get_Nnum(const char *a,X &m1,Y &m2,Z &m3);
  template <typename X>
  void get_Nnum(long n,const char *a,X &m1);
  template <typename X,typename Y>
  void get_Nnum(long n,const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void get_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3);
  template <typename X>
  void put_Nnum(const char *a,X &m1);
  template <typename X,typename Y>
  void put_Nnum(const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void put_Nnum(const char *a,X &m1,Y &m2,Z &m3);
  template <typename X>
  void put_Nnum(long n,const char *a,X &m1);
  template <typename X,typename Y>
  void put_Nnum(long n,const char *a,X &m1,Y &m2);
  template <typename X,typename Y,typename Z>
  void put_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3);

  //Logical accounting functions
  int Gcom(int sflag); //sflag is a bit marker such as READ_W, etc.
    //Returns 1 if the sflag bit is not set in oflag but is set in cflag.
    //If this is the case it also sets sflag in oflag.
  int Rcom(int sflag);
    //Returns 1 if the sflag bit is set in both oflag and cflag.
    //If this is the case it also turns off sflag in oflag.
  void mark(long,int,const char*);
    //Prints the string in the 3rd argument and the count in the 1st argument
    //whenever the 1st argument is a multiple of the 2nd

  //Data
  int cflag; //Command: what should happen to resources.
  int oflag; //Bit string status of resources, 1 open, 0 closed.
  int open1; //flags to mark whether a resource is open or not
  int open2; //0 means closed, 1 means open
  int open3; //Used for those resources that are either completely
  int open4; //closed or completely open.
  int open5;
  char *type;
  int tpnm; //If a nonnegative integer, it is appended to the end of the type
    //in constructing the file name
  char *name;
  int nmnm; //If a nonnegative integer, it is appended to the end of the name
    //in constructing the file name
  int pflag; //Usual print flag, 1 for verbose output, 0 for none.
    //Set to 1 by default.
  int eflag; //Flag: 1 for an external path from a path file, 0 for an internal
    //path, 2 for a path read from path_pnam
  char *path; //Path stored here if eflag=0.
  char *pnam; //Path-file name extension stored here if eflag=2.
};

//Template functions

template <typename X>
void FBase::get_Nnum(const char *a,X &m1){
  ifstream *pfin=get_Istr(a,ios::in);
  *pfin >> m1;
  dst_Istr(pfin);
}

template <typename X,typename Y>
void FBase::get_Nnum(const char *a,X &m1,Y &m2){
  ifstream *pfin=get_Istr(a,ios::in);
  *pfin >> m1 >> m2;
  dst_Istr(pfin);
}

template <typename X,typename Y,typename Z>
void FBase::get_Nnum(const char *a,X &m1,Y &m2,Z &m3){
  ifstream *pfin=get_Istr(a,ios::in);
  *pfin >> m1 >> m2 >> m3;
  dst_Istr(pfin);
}

template <typename X>
void FBase::get_Nnum(long n,const char *a,X &m1){
  ifstream *pfin=get_Istr(n,a,ios::in);
  *pfin >> m1;
  dst_Istr(pfin);
}

template <typename X,typename Y>
void FBase::get_Nnum(long n,const char *a,X &m1,Y &m2){
  ifstream *pfin=get_Istr(n,a,ios::in);
  *pfin >> m1 >> m2;
  dst_Istr(pfin);
}

template <typename X,typename Y,typename Z>
void FBase::get_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3){
  ifstream *pfin=get_Istr(n,a,ios::in);
  *pfin >> m1 >> m2 >> m3;
  dst_Istr(pfin);
}

template <typename X>
void FBase::put_Nnum(const char *a,X &m1){
  ofstream *pfout=get_Ostr(a,ios::out);
  *pfout << m1 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y>
void FBase::put_Nnum(const char *a,X &m1,Y &m2){
  ofstream *pfout=get_Ostr(a,ios::out);
  *pfout << m1 << " " << m2 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y,typename Z>
void FBase::put_Nnum(const char *a,X &m1,Y &m2,Z &m3){
  ofstream *pfout=get_Ostr(a,ios::out);
  *pfout << m1 << " " << m2 << " " << m3 << endl;
  dst_Ostr(pfout);
}

template <typename X>
void FBase::put_Nnum(long n,const char *a,X &m1){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  *pfout << m1 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y>
void FBase::put_Nnum(long n,const char *a,X &m1,Y &m2){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  *pfout << m1 << " " << m2 << endl;
  dst_Ostr(pfout);
}

template <typename X,typename Y,typename Z>
void FBase::put_Nnum(long n,const char *a,X &m1,Y &m2,Z &m3){
  ofstream *pfout=get_Ostr(n,a,ios::out);
  *pfout << m1 << " " << m2 << " " << m3 << endl;
  dst_Ostr(pfout);
}

}
#endif
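
A short round-trip sketch for the put_Nnum/get_Nnum templates declared above. The type/name pair "cnts"/"demo" is hypothetical, and, as with all FBase I/O, a suitable path file (for example path_cnts_demo or path) must exist in the working directory, since the stream helpers resolve file names through get_pathx.

    #include <iostream>
    #include "FBase.h"
    using namespace std;

    int main(){
      iret::FBase fb("cnts","demo");  //files are named cnts_demo.<ext>
      long a=12,b=34;
      fb.put_Nnum("nm",a,b);          //writes "12 34" to cnts_demo.nm
      long x=0,y=0;
      fb.get_Nnum("nm",x,y);          //reads the two values back
      cout << x << " " << y << endl;  //prints: 12 34
      return 0;
    }
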
 
Library/Hash.C CHANGED
@@ -1,733 +1,733 @@
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <cmath>
#include <cstring>
#include <cassert>
#include "runn.h"
#include "Hash.h"

using namespace std;
namespace iret {

Hash::Hash(void) : FBase("hshset","null"){
}

Hash::Hash(const char *nam) : FBase("hshset",nam){
}

Hash::Hash(int n,const char *nam) : FBase("hshset",n,nam){
}

Hash::~Hash(){
}

void Hash::create_htable(List &Lst,int excess){
  char cnam[max_str],*cptr,*uptr;
  int u,len;
  long ct,i,j,k;
  ofstream *pfout;

  nwrds=Lst.cnt_key;
  ct=nwrds;
  tnum=1;
  u=0;
  while(ct=ct/2){tnum*=2;u++;}
  if(u>30){cout << "Error in size, " << u << endl;exit(0);}
  i=0;
  while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
  tnum--;
  harr=new long[tnum+2];
  for(ct=0;ct<tnum+2;ct++)harr[ct]=0;

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
  long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
  long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
  long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;

  Lst.node_first();
  while(Lst.node_next()){
    cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    (harr[ct&tnum])++;
  }

  //Set start points in harr.
  k=0;
  for(i=0;i<tnum+2;i++){
    j=harr[i];
    harr[i]=k;
    k+=j;
  }
  if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}

  //Write out harr.
  bin_Writ("ha",(tnum+2)*sizeof(long),(char*)harr);

  //Set addresses
  char **addt=new char*[nwrds];
  Lst.node_first();
  while(Lst.node_next()){
    uptr=cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    k=ct&tnum;
    addt[harr[k]]=uptr;
    (harr[k])++;
  }

  //Write out string file
  pfout=get_Ostr("str");
  k=0;
  for(i=0;i<nwrds;i++){
    *pfout << addt[i] << ends;
    len=strlen((char*)addt[i])+1;
    addt[i]=(char*)k;
    k+=len;
  }
  dst_Ostr(pfout);

  //Write out addr file
  bin_Writ("ad",nwrds*sizeof(long),(char*)addt);
  delete [] addt;

  //Write out counts
  pfout=get_Ostr("nm");
  *pfout << nwrds << " " << tnum << " " << k << endl;
  dst_Ostr(pfout);
  delete [] harr;
  delete [] farr;
}

//In memory model intended for small sets
void Hash::create_htableM(List &Lst,int excess){
  char cnam[max_str],*cptr,*uptr;
  int u,len;
  long ct,i,j,k,*barr;
  ofstream *pfout;

  nwrds=Lst.cnt_key;
  ct=nwrds;
  tnum=1;
  u=0;
  while(ct=ct/2){tnum*=2;u++;}
  if(u>30){cout << "Error in size, " << u << endl;exit(0);}
  i=0;
  while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
  tnum--;
  harr=new long[tnum+2];
  barr=new long[tnum+2];
  for(ct=0;ct<tnum+2;ct++)harr[ct]=0;

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;

  Lst.node_first();
  while(Lst.node_next()){
    cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(px0+u); break;
        case 1: ct+=*(px1+u); break;
        case 2: ct+=*(px2+u); break;
        case 3: ct+=*(px3+u); break;
        case 4: ct+=*(px4+u); break;
        case 5: ct+=*(px5+u); break;
        case 6: ct+=*(px6+u); break;
        case 7: ct+=*(px7+u); break;
        case 8: ct+=*(px8+u); break;
        case 9: ct+=*(px9+u); break;
        case 10: ct+=*(px10+u); break;
        case 11: ct+=*(px11+u); i-=12; break;
      }
      i++;
    }
    (harr[ct&tnum])++;
  }

  //Set start points in harr.
  k=0;
  for(i=0;i<tnum+2;i++){
    j=harr[i];
    barr[i]=harr[i]=k;
    k+=j;
  }
  if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}

  //Set addresses
  len=0;
  char **addt=new char*[nwrds];
  Lst.node_first();
  while(Lst.node_next()){
    uptr=cptr=Lst.show_str();
    len+=strlen(uptr)+1;
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(px0+u); break;
        case 1: ct+=*(px1+u); break;
        case 2: ct+=*(px2+u); break;
        case 3: ct+=*(px3+u); break;
        case 4: ct+=*(px4+u); break;
        case 5: ct+=*(px5+u); break;
        case 6: ct+=*(px6+u); break;
        case 7: ct+=*(px7+u); break;
        case 8: ct+=*(px8+u); break;
        case 9: ct+=*(px9+u); break;
        case 10: ct+=*(px10+u); break;
        case 11: ct+=*(px11+u); i-=12; break;
      }
      i++;
    }
    k=ct&tnum;
    addt[barr[k]]=uptr;
    (barr[k])++;
  }
  strmap=new char[len];

  //Set up string array
  k=0;
  for(i=0;i<nwrds;i++){
    len=strlen((char*)addt[i])+1;
    strcpy(strmap+k,addt[i]);
    addt[i]=(char*)k;
    k+=len;
  }
  addr=(long*)addt;
  delete [] barr;
}

void Hash::create_htable(int mz,List &Lst,int excess){
  char cnam[max_str],*cptr,*uptr;
  int u,len;
  long ct,i,j,k;
  ofstream *pfout;

  nwrds=Lst.cnt_key;
  ct=nwrds;
  tnum=1;
  u=0;
  while(ct=ct/2){tnum*=2;u++;}
  if(u>30){cout << "Error in size, " << u << endl;exit(0);}
  i=0;
  while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
  tnum--;
  harr=new long[tnum+2];
  for(ct=0;ct<tnum+2;ct++)harr[ct]=0;

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
  long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
  long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
  long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;

  Lst.node_first();
  while(Lst.node_next()){
    cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    (harr[ct&tnum])++;
  }

  //Set start points in harr.
  k=0;
  for(i=0;i<tnum+2;i++){
    j=harr[i];
    harr[i]=k;
    k+=j;
  }
  if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}

  //Write out harr.
  bin_Writ(mz,"ha",(tnum+2)*sizeof(long),(char*)harr);

  //Set addresses
  char **addt=new char*[nwrds];
  Lst.node_first();
  while(Lst.node_next()){
    uptr=cptr=Lst.show_str();
    ct=0;
    i=0;
    while(u=*(cptr++)){
      switch(i){
        case 0: ct+=*(pc0+u); break;
        case 1: ct+=*(pc1+u); break;
        case 2: ct+=*(pc2+u); break;
        case 3: ct+=*(pc3+u); break;
        case 4: ct+=*(pc4+u); break;
        case 5: ct+=*(pc5+u); break;
        case 6: ct+=*(pc6+u); break;
        case 7: ct+=*(pc7+u); break;
        case 8: ct+=*(pc8+u); break;
        case 9: ct+=*(pc9+u); break;
        case 10: ct+=*(pc10+u); break;
        case 11: ct+=*(pc11+u); i-=12; break;
      }
      i++;
    }
    k=ct&tnum;
    addt[harr[k]]=uptr;
    (harr[k])++;
  }

  //Write out string file
  pfout=get_Ostr(mz,"str");
  k=0;
  for(i=0;i<nwrds;i++){
    *pfout << addt[i] << ends;
    len=strlen((char*)addt[i])+1;
    addt[i]=(char*)k;
    k+=len;
  }
  dst_Ostr(pfout);

  //Write out addr file
  bin_Writ(mz,"ad",nwrds*sizeof(long),(char*)addt);
  delete [] addt;

  //Write out counts
  pfout=get_Ostr(mz,"nm");
  *pfout << nwrds << " " << tnum << " " << k << endl;
  dst_Ostr(pfout);
  delete [] harr;
  delete [] farr;
}

void Hash::gopen_htable_map(void){
  char cnam[max_str],*cptr;
  int fld;
  long ct,asize,i;

  ifstream *pfin=get_Istr("nm");
  *pfin >> nwrds >> tnum >> asize;
  dst_Istr(pfin);

  harr=(long*)get_Mmap("ha");
  addr=(long*)get_Mmap("ad");
  strmap=get_Mmap("str");

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;
}

void Hash::gopen_htable_map(int mz){
  char cnam[max_str],*cptr;
  int fld;
  long ct,asize,i;

  ifstream *pfin=get_Istr(mz,"nm");
  *pfin >> nwrds >> tnum >> asize;
  dst_Istr(pfin);

  harr=(long*)get_Mmap(mz,"ha");
  addr=(long*)get_Mmap(mz,"ad");
  strmap=get_Mmap(mz,"str");

  farr=new long[1536];
  ct=1;
  for(i=0;i<1536;i++){
    farr[i]=ct=(ct*331)&tnum;
  }

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;
}

void Hash::gopen_htable_copy(Hash *pH){
  char cnam[max_str],*cptr;
  int fld;
  long ct,asize,i;

  nwrds=pH->nwrds;
  tnum=pH->tnum;

  harr=pH->harr;
  addr=pH->addr;
  strmap=pH->strmap;

  farr=pH->farr;

  px0=farr,px1=farr+128,px2=farr+256;
  px3=farr+384,px4=farr+512,px5=farr+640;
  px6=farr+768,px7=farr+896,px8=farr+1024;
  px9=farr+1152,px10=farr+1280,px11=farr+1408;
}

long Hash::find(const char *str){
  register long ct=0,i=0,k;
  register int ic;
  register const char *utr=str;
  while(ic=*(utr++)){
    switch(i){
      case 0: ct+=*(px0+ic); break;
      case 1: ct+=*(px1+ic); break;
      case 2: ct+=*(px2+ic); break;
      case 3: ct+=*(px3+ic); break;
      case 4: ct+=*(px4+ic); break;
      case 5: ct+=*(px5+ic); break;
      case 6: ct+=*(px6+ic); break;
      case 7: ct+=*(px7+ic); break;
      case 8: ct+=*(px8+ic); break;
      case 9: ct+=*(px9+ic); break;
      case 10: ct+=*(px10+ic); break;
      case 11: ct+=*(px11+ic); i-=12; break;
    }
    i++;
  }
  k=ct&tnum;
  ct=harr[k+1];
  i=harr[k];
  //cout << k << " " << i << " " << addr[i] << " " << ct << " " << addr[ct] << endl;
  switch(ct-i){
    case 0: return(0);
      break;
    case 1: if(!strcmp(str,strmap+addr[i]))return(i+1);
      else return(0);
      break;
    case 2: ic=strcmp(str,strmap+addr[i]);
      if(ic>0){
        if(!strcmp(str,strmap+addr[i+1]))return(i+2);
        else return(0);
      }
      else if(ic<0)return(0);
      else return(i+1);
      break;
    default: ic=strcmp(str,strmap+addr[i]);
      if(ic<0)return(0);
      else if(!ic)return(i+1);
      ct--;
      ic=strcmp(str,strmap+addr[ct]);
      if(ic>0)return(0);
      else if(!ic)return(ct+1);
      while(ct-i>1){
        k=(ct+i)/2;
        ic=strcmp(str,strmap+addr[k]);
        if(ic>0)i=k;
        else if(ic<0)ct=k;
        else return(k+1);
      }
      return(0);
  }
}

void Hash::gclose_htable_map(void){
  dst_Mmap("ha",(char*)harr);
  dst_Mmap("ad",(char*)addr);
  dst_Mmap("str",strmap);
  delete [] farr;
}

void Hash::gclose_htable_map(int mz){
  dst_Mmap(mz,"ha",(char*)harr);
  dst_Mmap(mz,"ad",(char*)addr);
  dst_Mmap(mz,"str",strmap);
  delete [] farr;
}

//Chash code

Chash::Chash() : Hash(){
  change_type("cshset");
}

Chash::Chash(const char *str) : Hash(str){
  change_type("cshset");
}

Chash::Chash(int n,const char *str) : Hash(n,str){
  change_type("cshset");
}

Chash::~Chash(void){}

void Chash::create_ctable(Count &Ct,int excess){
  create_htable(Ct,excess);
  gopen_htable_map();
  long n,i=0;
  long *pct=new long[Ct.cnt_key];
  Ct.node_first();
  while(Ct.node_next()){
    if(n=find(Ct.show_str())){
      pct[n-1]=Ct.count();
    }
    else {
      cout << "Error in Count tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ("ct",Ct.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap("ct");
  gclose_htable_map();
}

void Chash::create_ctable(List &Lt,int excess){
  create_htable(Lt,excess);
  gopen_htable_map();
  long n,i=1;
  long *pct=new long[Lt.cnt_key];
  Lt.node_first();
  while(Lt.node_next()){
    if(n=find(Lt.show_str())){
      pct[n-1]=i;
    }
    else {
      cout << "Error in List tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ("ct",Lt.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap("ct");
  gclose_htable_map();
}

void Chash::create_ctable(int mz,Count &Ct,int excess){
  create_htable(mz,Ct,excess);
  gopen_htable_map(mz);
  long n,i=0;
  long *pct=new long[Ct.cnt_key];
  Ct.node_first();
  while(Ct.node_next()){
    if(n=find(Ct.show_str())){
      pct[n-1]=Ct.count();
    }
    else {
      cout << "Error in Count tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ(mz,"ct",Ct.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap(mz,"ct");
  gclose_htable_map(mz);
}

void Chash::create_ctable(int mz,List &Lt,int excess){
  create_htable(mz,Lt,excess);
  gopen_htable_map(mz);
  long n,i=1;
  long *pct=new long[Lt.cnt_key];
  Lt.node_first();
  while(Lt.node_next()){
    if(n=find(Lt.show_str())){
      pct[n-1]=i;
    }
    else {
      cout << "Error in List tree!" << endl;exit(0);
    }
    mark(++i,10000,"count terms");
  }
  bin_Writ(mz,"ct",Lt.cnt_key*sizeof(long),(char*)pct);
  delete [] pct;
  cnt=(long*)get_Mmap(mz,"ct");
  gclose_htable_map(mz);
}

void Chash::gopen_ctable_map(void){
  gopen_htable_map();
  cnt=(long*)get_Mmap("ct");
}

void Chash::gopen_ctable_map(int mz){
  gopen_htable_map(mz);
  cnt=(long*)get_Mmap(mz,"ct");
}

void Chash::gclose_ctable_map(void){
  gclose_htable_map();
  dst_Mmap("ct",(char*)cnt);
}

void Chash::gclose_ctable_map(int mz){
  gclose_htable_map(mz);
  dst_Mmap(mz,"ct",(char*)cnt);
}

long Chash::count(const char *str){
  long n=find(str);
  if(n)return(cnt[n-1]);
  else return(0);
}

}
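
A lookup sketch against a previously built table using only the Hash/Chash calls defined above. The set name "terms" and the probe string "p53" are hypothetical; the cshset_terms files (nm, ha, ad, str, ct) must already have been produced by create_ctable, and count returns 0 for strings not in the table.

    #include <iostream>
    #include "Hash.h"
    using namespace std;

    int main(){
      iret::Chash ch("terms");          //expects prebuilt cshset_terms.* files
      ch.gopen_ctable_map();            //memory-maps table, addresses, counts
      cout << ch.count("p53") << endl;  //stored count, or 0 if absent
      ch.gclose_ctable_map();
      return 0;
    }
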
 
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <cstdlib>
4
+ #include <sys/types.h>
5
+ #include <sys/stat.h>
6
+ #include <fcntl.h>
7
+ #include <sys/mman.h>
8
+ #include <cmath>
9
+ #include <cstring>
10
+ #include <cassert>
11
+ #include "runn.h"
12
+ #include "Hash.h"
13
+
14
+ using namespace std;
15
+ namespace iret {
16
+
17
+ Hash::Hash(void) : FBase("hshset","null"){
18
+ }
19
+
20
+ Hash::Hash(const char *nam) : FBase("hshset",nam){
21
+ }
22
+
23
+ Hash::Hash(int n,const char *nam) : FBase("hshset",n,nam){
24
+ }
25
+
26
+ Hash::~Hash(){
27
+ }
28
+
29
+ void Hash::create_htable(List &Lst,int excess){
30
+ char cnam[max_str],*cptr,*uptr;
31
+ int u,len;
32
+ long ct,i,j,k;
33
+ ofstream *pfout;
34
+
35
+ nwrds=Lst.cnt_key;
36
+ ct=nwrds;
37
+ tnum=1;
38
+ u=0;
39
+ while(ct=ct/2){tnum*=2;u++;}
40
+ if(u>30){cout << "Error in size, " << u << endl;exit(0);}
41
+ i=0;
42
+ while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
43
+ tnum--;
44
+ harr=new long[tnum+2];
45
+ for(ct=0;ct<tnum+2;ct++)harr[ct]=0;
46
+
47
+ farr=new long[1536];
48
+ ct=1;
49
+ for(i=0;i<1536;i++){
50
+ farr[i]=ct=(ct*331)&tnum;
51
+ }
52
+
53
+ long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
54
+ long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
55
+ long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
56
+ long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;
57
+
58
+ Lst.node_first();
59
+ while(Lst.node_next()){
60
+ cptr=Lst.show_str();
61
+ ct=0;
62
+ i=0;
63
+ while(u=*(cptr++)){
64
+ switch(i){
65
+ case 0: ct+=*(pc0+u);
66
+ break;
67
+ case 1: ct+=*(pc1+u);
68
+ break;
69
+ case 2: ct+=*(pc2+u);
70
+ break;
71
+ case 3: ct+=*(pc3+u);
72
+ break;
73
+ case 4: ct+=*(pc4+u);
74
+ break;
75
+ case 5: ct+=*(pc5+u);
76
+ break;
77
+ case 6: ct+=*(pc6+u);
78
+ break;
79
+ case 7: ct+=*(pc7+u);
80
+ break;
81
+ case 8: ct+=*(pc8+u);
82
+ break;
83
+ case 9: ct+=*(pc9+u);
84
+ break;
85
+ case 10: ct+=*(pc10+u);
86
+ break;
87
+ case 11: ct+=*(pc11+u);
88
+ i-=12;
89
+ break;
90
+ }
91
+ i++;
92
+ }
93
+ (harr[ct&tnum])++;
94
+ }
95
+
96
+ //Set start points in harr.
97
+ k=0;
98
+ for(i=0;i<tnum+2;i++){
99
+ j=harr[i];
100
+ harr[i]=k;
101
+ k+=j;
102
+ }
103
+ if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}
104
+
105
+ //Write out harr.
106
+ bin_Writ("ha",(tnum+2)*sizeof(long),(char*)harr);
107
+
108
+ //Set addresses
109
+ char **addt=new char*[nwrds];
110
+ Lst.node_first();
111
+ while(Lst.node_next()){
112
+ uptr=cptr=Lst.show_str();
113
+ ct=0;
114
+ i=0;
115
+ while(u=*(cptr++)){
116
+ switch(i){
117
+ case 0: ct+=*(pc0+u);
118
+ break;
119
+ case 1: ct+=*(pc1+u);
120
+ break;
121
+ case 2: ct+=*(pc2+u);
122
+ break;
123
+ case 3: ct+=*(pc3+u);
124
+ break;
125
+ case 4: ct+=*(pc4+u);
126
+ break;
127
+ case 5: ct+=*(pc5+u);
128
+ break;
129
+ case 6: ct+=*(pc6+u);
130
+ break;
131
+ case 7: ct+=*(pc7+u);
132
+ break;
133
+ case 8: ct+=*(pc8+u);
134
+ break;
135
+ case 9: ct+=*(pc9+u);
136
+ break;
137
+ case 10: ct+=*(pc10+u);
138
+ break;
139
+ case 11: ct+=*(pc11+u);
140
+ i-=12;
141
+ break;
142
+ }
143
+ i++;
144
+ }
145
+ k=ct&tnum;
146
+ addt[harr[k]]=uptr;
147
+ (harr[k])++;
148
+ }
149
+
150
+ //Write out string file
151
+ pfout=get_Ostr("str");
152
+ k=0;
153
+ for(i=0;i<nwrds;i++){
154
+ *pfout << addt[i] << ends;
155
+ len=strlen((char*)addt[i])+1;
156
+ addt[i]=(char*)k;
157
+ k+=len;
158
+ }
159
+ dst_Ostr(pfout);
160
+
161
+ //Write out addr file
162
+ bin_Writ("ad",nwrds*sizeof(long),(char*)addt);
163
+ delete [] addt;
164
+
165
+ //Write out counts
166
+ pfout=get_Ostr("nm");
167
+ *pfout << nwrds << " " << tnum << " " << k << endl;
168
+ dst_Ostr(pfout);
169
+ delete [] harr;
170
+ delete [] farr;
171
+ }
172
+
173
+ //In memory model intended for small sets
174
+ void Hash::create_htableM(List &Lst,int excess){
175
+ char cnam[max_str],*cptr,*uptr;
176
+ int u,len;
177
+ long ct,i,j,k,*barr;
178
+ ofstream *pfout;
179
+
180
+ nwrds=Lst.cnt_key;
181
+ ct=nwrds;
182
+ tnum=1;
183
+ u=0;
184
+ while(ct=ct/2){tnum*=2;u++;}
185
+ if(u>30){cout << "Error in size, " << u << endl;exit(0);}
186
+ i=0;
187
+ while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
188
+ tnum--;
189
+ harr=new long[tnum+2];
190
+ barr=new long[tnum+2];
191
+ for(ct=0;ct<tnum+2;ct++)harr[ct]=0;
192
+
193
+ farr=new long[1536];
194
+ ct=1;
195
+ for(i=0;i<1536;i++){
196
+ farr[i]=ct=(ct*331)&tnum;
197
+ }
198
+
199
+ px0=farr,px1=farr+128,px2=farr+256;
200
+ px3=farr+384,px4=farr+512,px5=farr+640;
201
+ px6=farr+768,px7=farr+896,px8=farr+1024;
202
+ px9=farr+1152,px10=farr+1280,px11=farr+1408;
203
+
204
+ Lst.node_first();
205
+ while(Lst.node_next()){
206
+ cptr=Lst.show_str();
207
+ ct=0;
208
+ i=0;
209
+ while(u=*(cptr++)){
210
+ switch(i){
211
+ case 0: ct+=*(px0+u);
212
+ break;
213
+ case 1: ct+=*(px1+u);
214
+ break;
215
+ case 2: ct+=*(px2+u);
216
+ break;
217
+ case 3: ct+=*(px3+u);
218
+ break;
219
+ case 4: ct+=*(px4+u);
220
+ break;
221
+ case 5: ct+=*(px5+u);
222
+ break;
223
+ case 6: ct+=*(px6+u);
224
+ break;
225
+ case 7: ct+=*(px7+u);
226
+ break;
227
+ case 8: ct+=*(px8+u);
228
+ break;
229
+ case 9: ct+=*(px9+u);
230
+ break;
231
+ case 10: ct+=*(px10+u);
232
+ break;
233
+ case 11: ct+=*(px11+u);
234
+ i-=12;
235
+ break;
236
+ }
237
+ i++;
238
+ }
239
+ (harr[ct&tnum])++;
240
+ }
241
+
242
+ //Set start points in harr.
243
+ k=0;
244
+ for(i=0;i<tnum+2;i++){
245
+ j=harr[i];
246
+ barr[i]=harr[i]=k;
247
+ k+=j;
248
+ }
249
+ if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}
250
+
251
+ //Set addresses
252
+ len=0;
253
+ char **addt=new char*[nwrds];
254
+ Lst.node_first();
255
+ while(Lst.node_next()){
256
+ uptr=cptr=Lst.show_str();
257
+ len+=strlen(uptr)+1;
258
+ ct=0;
259
+ i=0;
260
+ while(u=*(cptr++)){
261
+ switch(i){
262
+ case 0: ct+=*(px0+u);
263
+ break;
264
+ case 1: ct+=*(px1+u);
265
+ break;
266
+ case 2: ct+=*(px2+u);
267
+ break;
268
+ case 3: ct+=*(px3+u);
269
+ break;
270
+ case 4: ct+=*(px4+u);
271
+ break;
272
+ case 5: ct+=*(px5+u);
273
+ break;
274
+ case 6: ct+=*(px6+u);
275
+ break;
276
+ case 7: ct+=*(px7+u);
277
+ break;
278
+ case 8: ct+=*(px8+u);
279
+ break;
280
+ case 9: ct+=*(px9+u);
281
+ break;
282
+ case 10: ct+=*(px10+u);
283
+ break;
284
+ case 11: ct+=*(px11+u);
285
+ i-=12;
286
+ break;
287
+ }
288
+ i++;
289
+ }
290
+ k=ct&tnum;
291
+ addt[barr[k]]=uptr;
292
+ (barr[k])++;
293
+ }
294
+ strmap=new char[len];
295
+
296
+ //Set up string array
297
+ k=0;
298
+ for(i=0;i<nwrds;i++){
299
+ len=strlen((char*)addt[i])+1;
300
+ strcpy(strmap+k,addt[i]);
301
+ addt[i]=(char*)k;
302
+ k+=len;
303
+ }
304
+ addr=(long*)addt;
305
+ delete [] barr;
306
+ }
+
+ void Hash::create_htable(int mz,List &Lst,int excess){
+ char cnam[max_str],*cptr,*uptr;
+ int u,len;
+ long ct,i,j,k;
+ ofstream *pfout;
+
+ nwrds=Lst.cnt_key;
+ ct=nwrds;
+ tnum=1;
+ u=0;
+ while(ct=ct/2){tnum*=2;u++;}
+ if(u>30){cout << "Error in size, " << u << endl;exit(0);}
+ i=0;
+ while((u<32)&&(i<excess)){tnum*=2;u++;i++;}
+ tnum--;
+ harr=new long[tnum+2];
+ for(ct=0;ct<tnum+2;ct++)harr[ct]=0;
+
+ farr=new long[1536];
+ ct=1;
+ for(i=0;i<1536;i++){
+ farr[i]=ct=(ct*331)&tnum;
+ }
+
+ long *pc0=farr,*pc1=farr+128,*pc2=farr+256;
+ long *pc3=farr+384,*pc4=farr+512,*pc5=farr+640;
+ long *pc6=farr+768,*pc7=farr+896,*pc8=farr+1024;
+ long *pc9=farr+1152,*pc10=farr+1280,*pc11=farr+1408;
+
+ Lst.node_first();
+ while(Lst.node_next()){
+ cptr=Lst.show_str();
+ ct=0;
+ i=0;
+ while(u=*(cptr++)){
+ switch(i){
+ case 0: ct+=*(pc0+u);
+ break;
+ case 1: ct+=*(pc1+u);
+ break;
+ case 2: ct+=*(pc2+u);
+ break;
+ case 3: ct+=*(pc3+u);
+ break;
+ case 4: ct+=*(pc4+u);
+ break;
+ case 5: ct+=*(pc5+u);
+ break;
+ case 6: ct+=*(pc6+u);
+ break;
+ case 7: ct+=*(pc7+u);
+ break;
+ case 8: ct+=*(pc8+u);
+ break;
+ case 9: ct+=*(pc9+u);
+ break;
+ case 10: ct+=*(pc10+u);
+ break;
+ case 11: ct+=*(pc11+u);
+ i-=12;
+ break;
+ }
+ i++;
+ }
+ (harr[ct&tnum])++;
+ }
+
+ //Set start points in harr.
+ k=0;
+ for(i=0;i<tnum+2;i++){
+ j=harr[i];
+ harr[i]=k;
+ k+=j;
+ }
+ if(k!=nwrds){cout << "Error in summing!" << endl;exit(0);}
+
+ //Write out harr.
+ bin_Writ(mz,"ha",(tnum+2)*sizeof(long),(char*)harr);
+
+ //Set addresses
+ char **addt=new char*[nwrds];
+ Lst.node_first();
+ while(Lst.node_next()){
+ uptr=cptr=Lst.show_str();
+ ct=0;
+ i=0;
+ while(u=*(cptr++)){
+ switch(i){
+ case 0: ct+=*(pc0+u);
+ break;
+ case 1: ct+=*(pc1+u);
+ break;
+ case 2: ct+=*(pc2+u);
+ break;
+ case 3: ct+=*(pc3+u);
+ break;
+ case 4: ct+=*(pc4+u);
+ break;
+ case 5: ct+=*(pc5+u);
+ break;
+ case 6: ct+=*(pc6+u);
+ break;
+ case 7: ct+=*(pc7+u);
+ break;
+ case 8: ct+=*(pc8+u);
+ break;
+ case 9: ct+=*(pc9+u);
+ break;
+ case 10: ct+=*(pc10+u);
+ break;
+ case 11: ct+=*(pc11+u);
+ i-=12;
+ break;
+ }
+ i++;
+ }
+ k=ct&tnum;
+ addt[harr[k]]=uptr;
+ (harr[k])++;
+ }
+
+ //Write out string file
+ pfout=get_Ostr(mz,"str");
+ k=0;
+ for(i=0;i<nwrds;i++){
+ *pfout << addt[i] << ends;
+ len=strlen((char*)addt[i])+1;
+ addt[i]=(char*)k;
+ k+=len;
+ }
+ dst_Ostr(pfout);
+
+ //Write out addr file
+ bin_Writ(mz,"ad",nwrds*sizeof(long),(char*)addt);
+ delete [] addt;
+
+ //Write out counts
+ pfout=get_Ostr(mz,"nm");
+ *pfout << nwrds << " " << tnum << " " << k << endl;
+ dst_Ostr(pfout);
+ delete [] harr;
+ delete [] farr;
+ }
+
+ void Hash::gopen_htable_map(void){
+ char cnam[max_str],*cptr;
+ int fld;
+ long ct,asize,i;
+
+ ifstream *pfin=get_Istr("nm");
+ *pfin >> nwrds >> tnum >> asize;
+ dst_Istr(pfin);
+
+ harr=(long*)get_Mmap("ha");
+ addr=(long*)get_Mmap("ad");
+ strmap=get_Mmap("str");
+
+ farr=new long[1536];
+ ct=1;
+ for(i=0;i<1536;i++){
+ farr[i]=ct=(ct*331)&tnum;
+ }
+
+ px0=farr,px1=farr+128,px2=farr+256;
+ px3=farr+384,px4=farr+512,px5=farr+640;
+ px6=farr+768,px7=farr+896,px8=farr+1024;
+ px9=farr+1152,px10=farr+1280,px11=farr+1408;
+ }
+
+ void Hash::gopen_htable_map(int mz){
+ char cnam[max_str],*cptr;
+ int fld;
+ long ct,asize,i;
+
+ ifstream *pfin=get_Istr(mz,"nm");
+ *pfin >> nwrds >> tnum >> asize;
+ dst_Istr(pfin);
+
+ harr=(long*)get_Mmap(mz,"ha");
+ addr=(long*)get_Mmap(mz,"ad");
+ strmap=get_Mmap(mz,"str");
+
+ farr=new long[1536];
+ ct=1;
+ for(i=0;i<1536;i++){
+ farr[i]=ct=(ct*331)&tnum;
+ }
+
+ px0=farr,px1=farr+128,px2=farr+256;
+ px3=farr+384,px4=farr+512,px5=farr+640;
+ px6=farr+768,px7=farr+896,px8=farr+1024;
+ px9=farr+1152,px10=farr+1280,px11=farr+1408;
+ }
+
+ void Hash::gopen_htable_copy(Hash *pH){
+ char cnam[max_str],*cptr;
+ int fld;
+ long ct,asize,i;
+
+ nwrds=pH->nwrds;
+ tnum=pH->tnum;
+
+ harr=pH->harr;
+ addr=pH->addr;
+ strmap=pH->strmap;
+
+ farr=pH->farr;
+
+ px0=farr,px1=farr+128,px2=farr+256;
+ px3=farr+384,px4=farr+512,px5=farr+640;
+ px6=farr+768,px7=farr+896,px8=farr+1024;
+ px9=farr+1152,px10=farr+1280,px11=farr+1408;
+ }
+
+ long Hash::find(const char *str){
+ register long ct=0,i=0,k;
+ register int ic;
+ register const char *utr=str;
+ while(ic=*(utr++)){
+ switch(i){
+ case 0: ct+=*(px0+ic);
+ break;
+ case 1: ct+=*(px1+ic);
+ break;
+ case 2: ct+=*(px2+ic);
+ break;
+ case 3: ct+=*(px3+ic);
+ break;
+ case 4: ct+=*(px4+ic);
+ break;
+ case 5: ct+=*(px5+ic);
+ break;
+ case 6: ct+=*(px6+ic);
+ break;
+ case 7: ct+=*(px7+ic);
+ break;
+ case 8: ct+=*(px8+ic);
+ break;
+ case 9: ct+=*(px9+ic);
+ break;
+ case 10: ct+=*(px10+ic);
+ break;
+ case 11: ct+=*(px11+ic);
+ i-=12;
+ break;
+ }
+ i++;
+ }
+ k=ct&tnum;
+ ct=harr[k+1];
+ i=harr[k];
+ //cout << k << " " << i << " " << addr[i] << " " << ct << " " << addr[ct] << endl;
+ switch(ct-i){
+ case 0: return(0);
+ break;
+ case 1: if(!strcmp(str,strmap+addr[i]))return(i+1);
+ else return(0);
+ break;
+ case 2: ic=strcmp(str,strmap+addr[i]);
+ if(ic>0){
+ if(!strcmp(str,strmap+addr[i+1]))return(i+2);
+ else return(0);
+ }
+ else if(ic<0)return(0);
+ else return(i+1);
+ break;
+ default: ic=strcmp(str,strmap+addr[i]);
+ if(ic<0)return(0);
+ else if(!ic)return(i+1);
+ ct--;
+ ic=strcmp(str,strmap+addr[ct]);
+ if(ic>0)return(0);
+ else if(!ic)return(ct+1);
+ while(ct-i>1){
+ k=(ct+i)/2;
+ ic=strcmp(str,strmap+addr[k]);
+ if(ic>0)i=k;
+ else if(ic<0)ct=k;
+ else return(k+1);
+ }
+ return(0);
+ }
+ }
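
To make the sizing and probing above concrete with assumed numbers: for nwrds = 1000 keys, the while(ct=ct/2) loop doubles tnum nine times (tnum = 512, u = 9); excess = 2 doubles it twice more to 2048, and tnum-- turns that into the bit mask 2047. Each character of a key then adds one coefficient from farr (blocks px0 through px11, cycling every 12 characters), ct & tnum selects one of the 2048 buckets, and harr[k] to harr[k+1] delimits that bucket's collision group, which find() resolves by strcmp and, for groups of three or more, binary search -- valid because the keys were laid down in lexical order within each bucket.
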
+
+ void Hash::gclose_htable_map(void){
+ dst_Mmap("ha",(char*)harr);
+ dst_Mmap("ad",(char*)addr);
+ dst_Mmap("str",strmap);
+ delete [] farr;
+ }
+
+ void Hash::gclose_htable_map(int mz){
+ dst_Mmap(mz,"ha",(char*)harr);
+ dst_Mmap(mz,"ad",(char*)addr);
+ dst_Mmap(mz,"str",strmap);
+ delete [] farr;
+ }
+
+ //Chash code
+
+ Chash::Chash() : Hash(){
+ change_type("cshset");
+ }
+
+ Chash::Chash(const char *str) : Hash(str){
+ change_type("cshset");
+ }
+
+ Chash::Chash(int n,const char *str) : Hash(n,str){
+ change_type("cshset");
+ }
+
+ Chash::~Chash(void){}
+
+ void Chash::create_ctable(Count &Ct,int excess){
+ create_htable(Ct,excess);
+ gopen_htable_map();
+ long n,i=0;
+ long *pct=new long[Ct.cnt_key];
+ Ct.node_first();
+ while(Ct.node_next()){
+ if(n=find(Ct.show_str())){
+ pct[n-1]=Ct.count();
+ }
+ else {
+ cout << "Error in Count tree!" << endl;exit(0);
+ }
+ mark(++i,10000,"count terms");
+ }
+ bin_Writ("ct",Ct.cnt_key*sizeof(long),(char*)pct);
+ delete [] pct;
+ cnt=(long*)get_Mmap("ct");
+ gclose_htable_map();
+ }
+
+ void Chash::create_ctable(List &Lt,int excess){
+ create_htable(Lt,excess);
+ gopen_htable_map();
+ long n,i=1;
+ long *pct=new long[Lt.cnt_key];
+ Lt.node_first();
+ while(Lt.node_next()){
+ if(n=find(Lt.show_str())){
+ pct[n-1]=i;
+ }
+ else {
+ cout << "Error in List tree!" << endl;exit(0);
+ }
+ mark(++i,10000,"count terms");
+ }
+ bin_Writ("ct",Lt.cnt_key*sizeof(long),(char*)pct);
+ delete [] pct;
+ cnt=(long*)get_Mmap("ct");
+ gclose_htable_map();
+ }
+
+ void Chash::create_ctable(int mz,Count &Ct,int excess){
+ create_htable(mz,Ct,excess);
+ gopen_htable_map(mz);
+ long n,i=0;
+ long *pct=new long[Ct.cnt_key];
+ Ct.node_first();
+ while(Ct.node_next()){
+ if(n=find(Ct.show_str())){
+ pct[n-1]=Ct.count();
+ }
+ else {
+ cout << "Error in Count tree!" << endl;exit(0);
+ }
+ mark(++i,10000,"count terms");
+ }
+ bin_Writ(mz,"ct",Ct.cnt_key*sizeof(long),(char*)pct);
+ delete [] pct;
+ cnt=(long*)get_Mmap(mz,"ct");
+ gclose_htable_map(mz);
+ }
+
+ void Chash::create_ctable(int mz,List &Lt,int excess){
+ create_htable(mz,Lt,excess);
+ gopen_htable_map(mz);
+ long n,i=1;
+ long *pct=new long[Lt.cnt_key];
+ Lt.node_first();
+ while(Lt.node_next()){
+ if(n=find(Lt.show_str())){
+ pct[n-1]=i;
+ }
+ else {
+ cout << "Error in List tree!" << endl;exit(0);
+ }
+ mark(++i,10000,"count terms");
+ }
+ bin_Writ(mz,"ct",Lt.cnt_key*sizeof(long),(char*)pct);
+ delete [] pct;
+ cnt=(long*)get_Mmap(mz,"ct");
+ gclose_htable_map(mz);
+ }
+
+ void Chash::gopen_ctable_map(void){
+ gopen_htable_map();
+ cnt=(long*)get_Mmap("ct");
+ }
+
+ void Chash::gopen_ctable_map(int mz){
+ gopen_htable_map(mz);
+ cnt=(long*)get_Mmap(mz,"ct");
+ }
+
+ void Chash::gclose_ctable_map(void){
+ gclose_htable_map();
+ dst_Mmap("ct",(char*)cnt);
+ }
+
+ void Chash::gclose_ctable_map(int mz){
+ gclose_htable_map(mz);
+ dst_Mmap(mz,"ct",(char*)cnt);
+ }
+
+ long Chash::count(const char *str){
+ long n=find(str);
+ if(n)return(cnt[n-1]);
+ else return(0);
+ }
+
+ }
+
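A sketch of the intended Chash lifecycle under the same assumptions (Count is the counting tree from Btree.h; the table name "termfreq" and the query key are illustrative):

#include <Btree.h>
#include <Hash.h>
using namespace iret;

void build(Count &Ct)            // Ct populated by the caller
{
   Chash C("termfreq");
   C.create_ctable(Ct, 2);       // writes the "str", "ad", "nm", "ha", and "ct" files
}

long query(void)
{
   Chash C("termfreq");
   C.gopen_ctable_map();         // memory-maps all five files
   long f = C.count("p53");      // stored count, or 0 if absent
   C.gclose_ctable_map();
   return f;
}
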
Library/Hash.h CHANGED
@@ -1,92 +1,92 @@
+ #ifndef HASH_H
+ #define HASH_H
+
+ #include <iostream>
+ #include <fstream>
+ #include <Btree.h>
+ #include <FBase.h>
+
+ namespace iret {
+
+ class Hash : public FBase {
+ public:
+ Hash(void);
+ Hash(const char *nm);
+ Hash(int n,const char *nm); //n gets appended to type if >-1
+ ~Hash();
+
+ void create_htable(List &Lst,int excess); //"str" for file of strings,
+ //"ad" for address file, "nm" numbers,
+ //"ha" hash array. Excess is # powers of 2 above size.
+ void create_htableM(List &Lst,int excess); //creates in memory ready for use,
+ //with no need to call the gopen or gclose functions
+ void create_htable(int mz,List &Lst,int excess); //"str" for file of strings,
+ //Creates a numbered version of the above
+
+ void gopen_htable_map(void); //Creates memory maps
+ void gopen_htable_map(int mz); //Creates memory maps
+ void gclose_htable_map(void); //Destroys memory maps
+ //and deletes memory
+ void gclose_htable_map(int mz); //Destroys memory maps
+ //and deletes memory
+ void gopen_htable_copy(Hash *pH); //Copies memory maps
+
+ long find(const char *str); //Returns number+1 if present, else 0.
+ //Number is not lexical order but hash order, and then lexical
+ //within collision groups.
+
+ //Data
+ char *strmap; //Holds the string map.
+ long *addr; //Holds the offsets into strmap.
+ long nwrds; //Number of words.
+ long tnum; //Truncation number (bit mask), size of harr.
+ long *harr; //Holds hash array.
+ long *farr; //Holds the hash coefficients.
+ long *px0;
+ long *px1;
+ long *px2;
+ long *px3;
+ long *px4;
+ long *px5;
+ long *px6;
+ long *px7;
+ long *px8;
+ long *px9;
+ long *px10;
+ long *px11;
+ };
+
+ class Chash : public Hash {
+ public:
+ Chash(void);
+ Chash(const char *nm);
+ Chash(int n,const char *nm); //n gets appended to type if >-1
+ ~Chash(void);
+
+ void create_ctable(Count &Ct,int excess); //Adds "ct" for counts
+ //Calls create_htable and then produces the array of counts.
+ void create_ctable(int mz,Count &Ct,int excess); //Adds "ct" for counts
+ //Creates a numbered version of the above
+ void create_ctable(List &Lt,int excess); //Adds "ct" for term #,
+ //starting the count at 1 and in lexical order. count() will
+ //return 0 if the term is not in the list.
+ void create_ctable(int mz,List &Lt,int excess); //Adds "ct" for term #
+ //Creates a numbered version of the above
+
+ void gopen_ctable_map(void); //Calls gopen_htable_map and also
+ //maps the "ct" file.
+ void gopen_ctable_map(int mz); //Calls gopen_htable_map and also
+ //maps the "ct" file.
+ void gclose_ctable_map(void); //Calls gclose_htable_map and also
+ //unmaps the "ct" file.
+ void gclose_ctable_map(int mz); //Calls gclose_htable_map and also
+ //unmaps the "ct" file.
+
+ long count(const char *str); //Returns the count if present, else 0.
+
+ //Data
+ long *cnt;
+ };
+
+ }
+ #endif
Library/MPtok.C CHANGED
@@ -1,2036 +1,2036 @@
- #include <stdio.h>
- #include <ctype.h>
- #include <string.h>
- #include <stdlib.h>
-
- #include <string>
- #include <iostream>
- #include <fstream>
- #include <sstream>
-
- #include "MPtok.h"
-
- // These options are probably compile time constants
-
- static char option_tagsep = '_'; // The tagsep character
- static char option_replacesep = '-'; // Replace tagsep with this
-
- static void chomp(char *line)
- {
- int i;
-
- i = strlen(line) - 1;
- while (i >= 0 && (line[i] == '\n' || line[i] == '\r'))
- line[i--] = '\0';
- }
-
- // Data structure and algorithm for finding common pairs.
-
- // read a file of pairs into a data structure,
- // the file must be sorted first
-
- void MPtok::init_pair(const string& file_name)
- {
- filebuf fb;
- fb.open(file_name.c_str(), ios::in);
- istream is(&fb);
- string pair;
-
- while (1)
- {
- getline(is, pair);
- if (is.fail()) break;
- if (pair.size() > 0) common_pair.insert(pair);
- }
-
- fb.close();
- }
-
- // List of abbreviations in 3 categories
- // ABB = can occur mid sentence
- // EOS = can occur at end of sentence
- // NUM = only used before numbers
-
- void MPtok::init_abbr(const string& file_name)
- {
- filebuf fb;
- fb.open(file_name.c_str(), ios::in);
- istream is(&fb);
- string typ, abb;
- map<string,int> val;
- val["ABB"] = ABB_ABB; val["EOS"] = ABB_EOS; val["NUM"] = ABB_NUM;
-
- while (is.good())
- {
- is >> typ;
- if (val.count(typ))
- {
- is >> abb;
- if (abb.size() > 0) common_abbr[abb] = val[typ];
- }
- }
- fb.close();
- }
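
For reference, the abbreviation file read here holds whitespace-separated type/abbreviation records; the entries below are illustrative, not a copy of the shipped medpost data file:

ABB i.e.
ABB e.g.
NUM no.
EOS etc.
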
74
-
75
- static char nextchar(const char *t, int i)
76
- {
77
- while (isspace(t[i])) i++;
78
- return t[i];
79
- }
80
-
81
- // Look for a token at or prior to the text position
82
-
83
- static int lookbehind(const char *t, int i, const char *s, int *tokflag)
84
- {
85
- int k = (int) strlen(s) - 1;
86
-
87
- while (i > 0 && isspace(t[i])) i--;
88
-
89
- while (k >= 0 && i >= 0)
90
- {
91
- if (k > 0 && tokflag[i]) break;
92
-
93
- if (tolower(s[k]) != tolower(t[i]))
94
- return -1;
95
- k--;
96
- i--;
97
- }
98
-
99
- return (k < 0 && tokflag[i+1]) ? i + 1 : -1;
100
- }
101
-
102
- // Look for a token at or following the text position
103
-
104
- static int lookahead(const char *t, int i, const char *s, int *tokflag)
105
- {
106
- int k = 0;
107
-
108
- while (isspace(t[i])) i++;
109
-
110
- while (k < strlen(s) && i < strlen(t))
111
- {
112
- if (k > 0 && tokflag[i]) break;
113
-
114
- if (tolower(s[k]) != tolower(t[i]))
115
- return -1;
116
- k++;
117
- i++;
118
- }
119
-
120
- return (k == strlen(s) && tokflag[i]) ? i - (int) strlen(s) : -1;
121
- }
122
-
123
- // Set the initial tokens at spaces
124
-
125
- void MPtok::tok_0()
126
- {
127
- int i;
128
-
129
- tokflag[0] = 1;
130
- for (i = 1; i < text_len; i++)
131
- {
132
- tokflag[i] = isspace(text[i]) || (i > 0 && isspace(text[i - 1])) ? 1 : 0;
133
- }
134
- tokflag[i] = 1;
135
- }
136
-
137
- // Get quotes preceded by open parens
138
- //
139
- // A double quote, preceded by a space or open bracket is a separate token
140
- //
141
-
142
- void MPtok::tok_1()
143
- {
144
- for (int i = 1; i < text_len; i++)
145
- {
146
- if (text[i] == '"' && strchr("([{<", text[i-1]))
147
- {
148
- tokflag[i] = 1;
149
- if (i + 1 < text_len) tokflag[i+1] = 1;
150
- }
151
- }
152
- }
153
-
154
- // Look for ellipses
155
- //
156
- // Three dots in a row is a separate token
157
-
158
- void MPtok::tok_2()
159
- {
160
- for (int i = 1; i + 2 < text_len; i++)
161
- {
162
- if (strncmp(&text[i], "...", 3) == 0)
163
- {
164
- tokflag[i] = 1;
165
- if (i + 3 < text_len) tokflag[i+3] = 1;
166
- }
167
- }
168
- }
169
-
170
- // Non-sentence-ending punctuation
171
- //
172
- // Certain punctuation characters are separate tokens
173
-
174
- void MPtok::tok_3()
175
- {
176
- for (int i = 0; i < text_len; i++)
177
- {
178
- // If it is a comma and the next char is not a space and option_comma = 0
179
-
180
- if (option_comma == 0 && text[i] == ',' && isspace(text[i + 1]) == 0)
181
- {
182
- // do nothing
183
- } else if (strchr(",;:@#$%&", text[i]))
184
- {
185
- tokflag[i] = 1;
186
- tokflag[i + 1] = 1;
187
- }
188
- }
189
- }
190
-
191
- // Separate the slashes
192
- //
193
- // Slashes are a separate token
194
- // except for +/-, +/+, -/-, -/+, and and/or.
195
-
196
- void MPtok::tok_5_6_7()
197
- {
198
- for (int i = 0; i < text_len; i++)
199
- {
200
- if (text[i] == '/')
201
- {
202
- tokflag[i] = 1;
203
- if (i+1 < text_len) tokflag[i+1] = 1;
204
-
205
- // Put back +/-, etc, unless option_hyphen is 1
206
-
207
- if (i - 1 >= 0
208
- && i + 1 < text_len
209
- && ((option_new < 9
210
- && text[i - 1] == '+' || (text[i - 1] == '-' && option_hyphen == 0)
211
- && text[i + 1] == '+' || (text[i + 1] == '-' && option_hyphen == 0))
212
- || (option_new >= 9
213
- && (text[i - 1] == '+' || text[i - 1] == '-')
214
- && (text[i + 1] == '+' || text[i + 1] == '-'))))
215
- {
216
- tokflag[i - 1] = 1;
217
- tokflag[i] = tokflag[i+1] = 0;
218
- tokflag[i + 2] = 1;
219
- }
220
-
221
- // Put back and/or, etc
222
-
223
- if (option_new <= 7)
224
- {
225
- if (i > 5 && strncmp(text + i - 5, " and/or ", 8) == 0)
226
- {
227
- for (int j = 1; j < 5; j++)
228
- tokflag[i - 2 + j] = 0;
229
- }
230
- } else
231
- {
232
- if (i > 4 && strncmp(text + i - 4, " and/or ", 8) == 0)
233
- {
234
- for (int j = 1; j < 6; j++)
235
- tokflag[i - 3 + j] = 0;
236
- }
237
- }
238
- }
239
- }
240
- }
241
-
242
- // All brackets
243
- //
244
- // Any open or closed bracket is a separate token
245
- //
246
- // Exclamation and question mark
247
- //
248
- // Any question or exclamation mark is a separate token
249
-
250
- void MPtok::tok_8_9()
251
- {
252
- for (int i = 0; i < text_len; i++)
253
- {
254
- if (strchr("[](){}<>", text[i])
255
- || strchr("?!", text[i]))
256
- {
257
- tokflag[i] = 1;
258
- if (i + 1 < text_len) tokflag[i+1] = 1;
259
- }
260
- }
261
- }
262
-
263
- // Period at the end of a string may be followed by closed-bracket or quote
264
- //
265
- // A period that is preceded by a non-period
266
- // and optionally followed by a close paren
267
- // and any amount of space at the end of the string
268
- // is a separate token.
269
-
270
- void MPtok::tok_10()
271
- {
272
- for (int i = text_len - 1; i >= 0; i--)
273
- {
274
- if (isspace(text[i])) continue;
275
- if (strchr("])}>\"'", text[i])) continue;
276
- if (text[i] != '.') break;
277
- if (text[i] == '.' && (i - 1 < 0 || text[i-1] != '.'))
278
- {
279
- tokflag[i] = 1;
280
- if (i + 1 < text_len) tokflag[i+1] = 1;
281
- }
282
- }
283
- }
284
-
285
- // Period followed by a capitalized word
286
- //
287
- // A period preceded by a character that is not another period and not a space
288
- // and followed by a space then an upper case letter is a separate token
289
-
290
- void MPtok::tok_11()
291
- {
292
- for (int i = 0; i < text_len; i++)
293
- {
294
- if (text[i] == '.'
295
- && (i + 1 < text_len && isspace(text[i+1]))
296
- && (i - 1 < 0 || text[i - 1] != '.' || isspace(text[i-1]) == 0)
297
- && isupper(nextchar(text, i + 1)))
298
- tokflag[i] = 1;
299
- }
300
- }
301
-
302
- // A normal word followed by a period
303
- //
304
- // A period followed by a space
305
- // and preceded by 2 or more alphabetic characters or hyphens
306
- // is a separate token
307
-
308
- void MPtok::tok_12()
309
- {
310
- int wcnt = 0;
311
-
312
- for (int i = 0; i < text_len; i++)
313
- {
314
- if (text[i] == '.'
315
- && tokflag[i + 1]
316
- && wcnt >= 2)
317
- tokflag[i] = 1;
318
-
319
- if (isalpha(text[i]) || text[i] == '-')
320
- ++wcnt;
321
- else
322
- wcnt = 0;
323
- }
324
- }
325
-
326
- // A non-normal token (that has no lower case letters) followed by a period
327
- //
328
- // A period at the end of a token made of characters excluding lower case
329
- // is a separate token
330
-
331
- void MPtok::tok_13()
332
- {
333
- int stok = 0;
334
- int wcnt = 0;
335
-
336
- for (int i = 0; i < text_len; i++)
337
- {
338
- if (text[i] == '.'
339
- && tokflag[i + 1]
340
- && wcnt >= 2)
341
- tokflag[i] = 1;
342
-
343
- if (tokflag[i] == 1) stok = 1;
344
-
345
- if (islower(text[i]) || text[i] == '.')
346
- {
347
- stok = 0;
348
- wcnt = 0;
349
- }
350
-
351
- if (stok)
352
- wcnt++;
353
- }
354
- }
355
-
356
- // put some periods with single-letter abbreviations
357
- //
358
- // A single alphabetic token followed by a period followed
359
- // by a token that does not begin with an upper case letter
360
- // or number is taken to be an abbreviation and the period
361
- // does not start a new token.
362
- //
363
- // NOTE: This does not recognize initials in people's names,
364
- // that problem is not simply solved.
365
-
366
- void MPtok::tok_14()
367
- {
368
- for (int i = 0; i < text_len; i++)
369
- {
370
- if (text[i] == '.'
371
- && i - 1 >= 0 && isalpha(text[i - 1]) && tokflag[i - 1]
372
- && tokflag[i + 1]
373
- && isupper(nextchar(text, i + 1)) == 0
374
- && isdigit(nextchar(text, i + 1)) == 0
375
- && nextchar(text, i + 1) != '('
376
- )
377
- {
378
- tokflag[i] = 0;
379
- }
380
- }
381
- }
382
-
383
- void MPtok::tok_15()
384
- {
385
- int i, j, k, a;
386
- char buff[MAX_ABB + 1];
387
-
388
- for (i = 0; i < text_len; i++)
389
- {
390
- // only start at a current token
391
-
392
- if (! tokflag[i]) continue;
393
-
394
- // find alphabetic followed by period
395
-
396
- buff[0] = '\0';
397
- for (k = 0; i + k < text_len && k < MAX_ABB; k++)
398
- {
399
- buff[k] = text[i+k]; buff[k+1] = '\0';
400
- if (k > 0 && buff[k] == '.') break; // this is good
401
- if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
402
- }
403
-
404
- if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
405
-
406
- // at this point, buff[k] == '.' add 1 to make it the length
407
-
408
- k++;
409
-
410
- // if not found, try finding a concatenated abbrev
411
-
412
- if (! common_abbr.count(buff))
413
- {
414
- for (; i + k < text_len && k < MAX_ABB; k++)
415
- {
416
- buff[k] = text[i+k]; buff[k+1] = '\0';
417
- if (k > 0 && buff[k] == '.') break; // this is good
418
- if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
419
- }
420
-
421
- if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
422
-
423
- // at this point, buff[k] == '.' add 1 to make it the length
424
-
425
- k++;
426
- }
427
-
428
- // if not found, give up
429
-
430
- if (! common_abbr.count(buff)) continue;
431
-
432
- if (common_abbr[buff] == ABB_NUM)
433
- {
434
- for (j = i + k; j < text_len && isspace(text[j]); j++) ; // next must be a number
435
- if (! isdigit(text[j])) continue; // go to next abbreviation
436
- } else if (common_abbr[buff] == ABB_EOS)
437
- {
438
- for (j = i + k; j < text_len && isspace(text[j]); j++) ; // if next token is upper case letter
439
- if (isupper(text[j])) tokflag[i + (--k)] = 1; // tokenize the final period of this abbreviation
440
- }
441
-
442
- // clear all token flags
443
-
444
- for (j = 1; j < k; j++) tokflag[i + j] = 0;
445
- }
446
- }
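
Worked through on an assumed input: for "no. 7", tok_15 collects the candidate "no.", finds it tagged NUM, confirms that the next non-space character is a digit, and clears the interior token flags so the period stays attached to "no"; had the entry been tagged EOS and the next token started with a capital, the final period would instead be re-tokenized as a probable sentence end.
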
447
-
448
- // Check for common pairs that should not be considered sentence breaks
449
-
450
- void MPtok::tok_15_1()
451
- {
452
- int i, j, k, tnum, p;
453
- char buff[MAX_ABB + 1];
454
-
455
- for (i = 0; i < text_len; i++)
456
- {
457
- if (! tokflag[i]) continue;
458
-
459
- // must be alphanumeric token followed by period token followed by space followed by alphanumeric token
460
-
461
- tnum = 0;
462
- buff[0] = '\0';
463
- for (p = k = 0; i + k < text_len && k < MAX_ABB; k++)
464
- {
465
- buff[k] = text[i+k]; buff[k+1] = '\0';
466
-
467
- if (isspace(buff[k]))
468
- {
469
- if (tnum == 2) break; // this is good
470
- else if (tnum == 1) continue; // ok
471
- else { buff[0] = '\0'; break; } // this shouldn't happen
472
- }
473
-
474
- if (tokflag[i+k])
475
- {
476
- if (tnum > 2) break; // done
477
- else tnum++;
478
- }
479
-
480
- if (tnum == 1 && buff[k] == '.') p = k;
481
- if (tnum == 1 && buff[k] != '.') { buff[0] = '\0'; break; } // nope
482
- if (! isalnum(buff[k])) { buff[0] = '\0'; break; } // nope
483
- }
484
-
485
- if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
486
-
487
- // at this point buff is a potential pair, so untokenize the period, that's all
488
-
489
- if (common_pair.count(buff))
490
- tokflag[p] = 0;
491
- }
492
- }
493
-
494
- // Get cases where a space after a sentence has been omitted
495
- //
496
- // A period that occurs in a token consisting of alphabetic
497
- // letters with a vowel to the left and the right is a
498
- // separate token.
499
-
500
- void MPtok::tok_16()
501
- {
502
- int j;
503
- int has_vowel;
504
-
505
- for (int i = 0; i < text_len; i++)
506
- {
507
- if (text[i] == '.' && tokflag[i] == 0)
508
- {
509
- has_vowel = 0;
510
- for (j = i - 1; j >= 0; --j)
511
- {
512
- if (isalpha(text[j]) == 0)
513
- break;
514
- if (strchr("aeiouAEIOU", text[j]))
515
- has_vowel = 1;
516
- if (tokflag[j])
517
- break;
518
- }
519
- if ((j >= 0 && tokflag[j] == 0) || has_vowel == 0)
520
- continue;
521
-
522
- j = i + 1;
523
-
524
- has_vowel = 0;
525
- for (; j < text_len && tokflag[j] == 0; ++j)
526
- {
527
- if (isalpha(text[j]) == 0)
528
- break;
529
- if (strchr("aeiouAEIOU", text[j]))
530
- has_vowel = 1;
531
- }
532
-
533
- if ((j < text_len && tokflag[j] == 0) || has_vowel == 0)
534
- continue;
535
-
536
- tokflag[i] = 1;
537
- tokflag[i + 1] = 1;
538
- }
539
- }
540
- }
541
-
542
- // Correction to tok_16,
543
- // Don't count if the token before is a single letter
544
- // or the token following is a single letter other than 'a'.
545
- // Also, don't count if the token to the right is gov, com, edu, etc.
546
- // because those are web addresses!
547
-
548
- #define COMPLEX_WINDOW 40
549
-
550
- enum {COMPLEX_NOT = 0, COMPLEX_YES, COMPLEX_DONE};
551
-
552
- struct _complex {
553
- int flag;
554
- int offset;
555
- const char *str;
556
- int len;
557
- } complex[] = {
558
- COMPLEX_YES, 0, "complex", 7,
559
- COMPLEX_NOT, 0, "complexi", 8,
560
- COMPLEX_NOT, 0, "complexed", 9,
561
- COMPLEX_NOT, 0, "complexa", 8,
562
- COMPLEX_NOT, 0, "complex-", 8,
563
- COMPLEX_NOT, 0, "complexl", 8,
564
- COMPLEX_NOT, 0, "complexu", 8,
565
- COMPLEX_NOT, -1, "-complex", 7,
566
- COMPLEX_NOT, -2, "nocomplex", 9,
567
- COMPLEX_NOT, -3, "subcomplex", 10,
568
- COMPLEX_YES, 0, "hybrid", 6,
569
- COMPLEX_NOT, 0, "hybridi", 7,
570
- COMPLEX_NOT, 0, "hybrido", 7,
571
- COMPLEX_NOT, 0, "hybrida", 7,
572
- COMPLEX_NOT, 0, "hybrid-", 7,
573
- COMPLEX_NOT, -1, "-hybrid", 7,
574
- COMPLEX_YES, 0, "duplex", 6,
575
- COMPLEX_NOT, -1, "oduplex", 7,
576
- COMPLEX_DONE, 0, NULL, 0,
577
- };
578
-
579
- int MPtok::complex_check()
580
- {
581
- int last_period = -2*COMPLEX_WINDOW;
582
- int last_complex = -2*COMPLEX_WINDOW;
583
- int i, j;
584
- int complex_match;
585
-
586
- for (i = 0; i < text_len; i++)
587
- {
588
- if (text[i] == '.')
589
- {
590
- if (i - last_complex <= COMPLEX_WINDOW)
591
- return 1;
592
- last_period = i;
593
- }
594
-
595
- complex_match = 0;
596
- for (j = 0; complex[j].str; j++)
597
- {
598
- if (complex[j].flag == COMPLEX_NOT)
599
- {
600
- if (i + complex[j].offset >= 0
601
- && strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
602
- {
603
- // don't match here
604
- complex_match = 0;
605
- }
606
- } else if (complex[j].flag == COMPLEX_YES)
607
- {
608
- if (i + complex[j].offset >= 0
609
- && strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
610
- {
611
- // match here
612
- complex_match = 1;
613
- }
614
- }
615
- }
616
-
617
- if (complex_match)
618
- {
619
- if (i - last_period <= COMPLEX_WINDOW)
620
- return 1;
621
- last_complex = i;
622
- }
623
- }
624
- return 0;
625
- }
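
In effect, complex_check disables the split heuristic of tok_16_1 whenever "complex", "hybrid", or "duplex" occurs within 40 characters of a period (the COMPLEX_NOT rows veto longer words such as "complexity" or "hybridization"), since dot-joined names on the pattern of "DNA.protein complex" would otherwise be mistaken for a missed sentence break.
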
626
-
627
- void MPtok::tok_16_1()
628
- {
629
- int i, j;
630
- char v1, v2;
631
- int c1, c2;
632
-
633
- if (option_new == 3 && strstr(text, "complex"))
634
- return;
635
-
636
- if (option_new >= 4 && complex_check())
637
- return;
638
-
639
- for (i = 0; i < text_len; i++)
640
- {
641
- if (text[i] == '.' && tokflag[i] == 0)
642
- {
643
- char suffix[10];
644
- int s_i;
645
-
646
- v1 = '\0';
647
- c1 = 0;
648
- for (j = i - 1; j >= 0; --j)
649
- {
650
- if (isalpha(text[j]) == 0)
651
- break;
652
- if (strchr("aeiouAEIOU", text[j]))
653
- v1 = tolower(text[j]);
654
- c1++;
655
- if (tokflag[j])
656
- break;
657
- }
658
- if ((j >= 0 && tokflag[j] == 0)
659
- || v1 == '\0'
660
- || c1 == 1)
661
- continue;
662
-
663
- j = i + 1;
664
-
665
- v2 = '\0';
666
- c2 = 0;
667
- s_i = 0;
668
- for (; j < text_len && tokflag[j] == 0; ++j)
669
- {
670
- if (isalpha(text[j]) == 0)
671
- break;
672
- if (strchr("aeiouAEIOU", text[j]))
673
- v2 = tolower(text[j]);
674
- if (s_i < 3)
675
- suffix[s_i++] = tolower(text[j]); suffix[s_i] = '\0';
676
- c2++;
677
- }
678
-
679
- if ((j < text_len && tokflag[j] == 0)
680
- || v2 == '\0'
681
- || (c2 == 1 && v2 != 'a')
682
- || (c2 == 3 && tokflag[j] == 1 && s_i == 3
683
- && (strcmp(suffix, "gov") == 0
684
- || strcmp(suffix, "edu") == 0
685
- || strcmp(suffix, "org") == 0
686
- || strcmp(suffix, "com") == 0)))
687
- continue;
688
-
689
- tokflag[i] = 1;
690
- tokflag[i + 1] = 1;
691
- }
692
- }
693
- }
694
-
695
-
696
- // Numeric endings of sentences
697
- //
698
- // A period after a numeric token followed by a token that starts
699
- // with an alphabetic character, is a separate token.
700
- //
701
- // This should be covered already by tok_13
702
-
703
- void MPtok::tok_17()
704
- {
705
- int j;
706
-
707
- for (int i = 0; i < text_len; i++)
708
- {
709
- if (text[i] == '.'
710
- && tokflag[i] == 0
711
- && tokflag[i + 1])
712
- {
713
- for (j = i - 1; j >= 0 && isdigit(text[j]) && tokflag[j] == 0; --j)
714
- ;
715
- if (j >= 0 && j < i - 1 && tokflag[j] && isalpha(nextchar(text, i + 1)))
716
- tokflag[i] = 1;
717
- }
718
- }
719
- }
720
-
721
- // period at end of string is a token
722
-
723
- void MPtok::tok_20()
724
- {
725
- for (int i = text_len - 1; i >= 0; --i)
726
- {
727
- if (isspace(text[i]))
728
- continue;
729
-
730
- if (strchr(".!?", text[i]))
731
- tokflag[i] = 1;
732
-
733
- break;
734
- }
735
- }
736
-
737
- // a period that follows a non-common word, and that is
738
- // followed by a lower case common word is probably not a token
739
-
740
- void MPtok::tok_20_1()
741
- {
742
- int j;
743
-
744
- for (int i = 0; i < text_len; ++i)
745
- {
746
- if (text[i] == '.' && tokflag[i] == 1)
747
- {
748
- int tcnt, lcnt, ocnt;
749
- tcnt = lcnt = ocnt = 0;
750
-
751
- // make sure the previous word was *not* common
752
-
753
- for (j = i - 1; j >= 0; j--)
754
- {
755
- if (isspace(text[j])) continue;
756
- if (option_new >= 2)
757
- {
758
- if (islower(text[j]) == 0 && text[j] != '-') ocnt++;
759
- } else
760
- {
761
- if (! islower(text[j])) ocnt++;
762
- }
763
-
764
- if (tokflag[j] || j == 0)
765
- {
766
- if (ocnt == 0)
767
- {
768
- goto nexti;
769
- }
770
- break;
771
- }
772
- }
773
-
774
- tcnt = lcnt = ocnt = 0;
775
-
776
- // make sure the next word is common
777
-
778
- for (j = i + 1; j < text_len; j++)
779
- {
780
- if (isspace(text[j])) continue;
781
- if (tokflag[j]) tcnt++;
782
-
783
- if (tcnt == 2 || j == text_len - 1)
784
- {
785
- if (lcnt > 0 && ocnt == 0) tokflag[i] = 0;
786
- break;
787
- }
788
-
789
- if (islower(text[j])) lcnt++;
790
- else ocnt++;
791
- }
792
- }
793
- nexti: ;
794
- }
795
- }
796
-
797
- // tokenized period followed by non-space other than close paren
798
- // is not a token
799
-
800
- void MPtok::tok_20_2()
801
- {
802
- int j;
803
-
804
- for (int i = 0; i < text_len - 1; ++i)
805
- {
806
- if (text[i] == '.' && tokflag[i] == 1
807
- && strchr(" ()[]\"\'\n\t\r", text[i+1]) == 0)
808
- {
809
- tokflag[i] = 0;
810
- }
811
- }
812
- }
813
-
814
-
815
- // long dash
816
- //
817
- // A pair of hyphens is a complete token
818
-
819
- void MPtok::tok_21()
820
- {
821
- for (int i = 0; i + 1 < text_len; i++)
822
- {
823
- if (strncmp(&text[i], "--", 2) == 0)
824
- {
825
- tokflag[i] = 1;
826
- if (i + 2 < text_len)
827
- {
828
- i += 2;
829
- tokflag[i] = 1;
830
- }
831
- }
832
- }
833
- }
834
-
835
- // hyphens
836
- //
837
- // If specified as an option, a hyphen between letters is a complete token
838
-
839
- void MPtok::tok_21a()
840
- {
841
- if (option_hyphen == 0) return;
842
-
843
- for (int i = 0; i + 1 < text_len; i++)
844
- {
845
- if (text[i] == '-'
846
- && (i == 0 || text[i-1] != '-')
847
- && text[i+1] != '-')
848
- {
849
- tokflag[i] = 1;
850
- tokflag[i+1] = 1;
851
- }
852
- }
853
- }
854
-
855
-
856
- // quote
857
- //
858
- // Any double quote is a separate token
859
-
860
- void MPtok::tok_22()
861
- {
862
- for (int i = 0; i < text_len; i++)
863
- {
864
- if (text[i] == '"')
865
- {
866
- tokflag[i] = 1;
867
- if (i + 1 < text_len)
868
- {
869
- i += 1;
870
- tokflag[i] = 1;
871
- }
872
- }
873
- }
874
- }
875
-
876
- // possessive
877
- //
878
- // Any single quote at the end of a token that is not
879
- // preceded by a single quote is a separate token
880
-
881
- void MPtok::tok_23()
882
- {
883
- for (int i = 0; i < text_len; i++)
884
- {
885
- if (text[i] == '\''
886
- && (i - 1 >= 0 && text[i - 1] != '\'')
887
- && tokflag[i + 1])
888
- {
889
- tokflag[i] = 1;
890
- }
891
- }
892
- }
893
-
894
-
895
- // quote
896
- //
897
- // If a single quote starts a token, or is preceded by a
898
- // single quote, and followed by a character
899
- // that is not a single quote, then
900
- // the character to its right is the start of a new token
901
-
902
- void MPtok::tok_24()
903
- {
904
- for (int i = 0; i < text_len; i++)
905
- {
906
- if (text[i] == '\''
907
- && (tokflag[i] == 1 || (i - 1 >= 0 && text[i - 1] == '\''))
908
- && (i + 1 < text_len && text[i + 1] != '\''))
909
- {
910
- tokflag[i + 1] = 1;
911
- }
912
- }
913
- }
914
-
915
- // put back possessive
916
- //
917
- // A single quote that is a whole token followed by a lower case s
918
- // that is also a whole token (without space between them)
919
- // should be merged into a single token
920
-
921
- void MPtok::tok_25()
922
- {
923
- for (int i = 0; i < text_len; i++)
924
- {
925
- if (text[i] == '\''
926
- && tokflag[i] == 1
927
- && i + 1 < text_len && text[i + 1] == 's'
928
- && tokflag[i+1] == 1
929
- && (i + 2 >= text_len || isspace(text[i + 2]) || tokflag[i + 2] == 1))
930
- {
931
- tokflag[i + 1] = 0;
932
- }
933
- }
934
- }
935
-
936
- // quote
937
- //
938
- // A pair of single quotes is a separate token
939
-
940
- void MPtok::tok_26()
941
- {
942
- for (int i = 0; i < text_len; i++)
943
- {
944
- if (strncmp(&text[i], "''", 2) == 0
945
- || strncmp(&text[i], "``", 2) == 0)
946
- {
947
- tokflag[i] = 1;
948
- if (i + 2 < text_len) tokflag[i + 2] = 1;
949
- }
950
- }
951
- }
952
-
953
- // possessive
954
- //
955
- // A single quote followed by a letter s is a possessive
956
-
957
- void MPtok::tok_27()
958
- {
959
- for (int i = 0; i < text_len; i++)
960
- {
961
- if (text[i] == '\''
962
- && i + 1 < text_len
963
- && tolower(text[i + 1]) == 's'
964
- && (i + 2 >= text_len || tokflag[i + 2]))
965
- {
966
- tokflag[i] = 1;
967
- }
968
- }
969
- }
970
-
971
- // split "cannot" to "can not"
972
- //
973
- // A single token that is the word cannot (in any case)
974
- // is split into two words
975
-
976
- void MPtok::tok_28()
977
- {
978
- for (int i = 0; i < text_len; i++)
979
- {
980
- if ((strncmp(&text[i], "cannot", 6) == 0
981
- || strncmp(&text[i], "Cannot", 6) == 0)
982
- && tokflag[i + 6])
983
- {
984
- tokflag[i + 3] = 1;
985
- }
986
- }
987
- }
988
-
989
- // put list item elements back at sentence end
990
- //
991
- // A period that is preceded by an alphanumeric (no space)
992
- // and any amount of preceding space and an end-mark
993
- // stays with the alphanumeric.
994
-
995
- void MPtok::tok_29()
996
- {
997
- int j;
998
-
999
- for (int i = 0; i < text_len; i++)
1000
- {
1001
- if (text[i] == '.'
1002
- && tokflag[i] && tokflag[i + 1]
1003
- && i - 1 >= 0 && isalnum(text[i - 1])
1004
- && tokflag[i - 1]
1005
- && ((j = lookbehind(text, i-2, ".", tokflag)) >= 0
1006
- || (j = lookbehind(text, i-2, "?", tokflag)) >= 0
1007
- || (j = lookbehind(text, i-2, "!", tokflag)) >= 0)
1008
- && tokflag[j])
1009
- {
1010
- tokflag[i] = 0;
1011
- }
1012
- }
1013
- }
1014
-
1015
- // attach list elements to the beginnings of their sentences
1016
- // this means, attach the period to the list element
1017
- //
1018
- // a list element is a single letter or a one or two digits
1019
- // which is preceded by an end of sentence ".!?;"
1020
- // or colon (provided it doesn't belong to a proportion construct)
1021
-
1022
- void MPtok::tok_29a()
1023
- {
1024
- int i, j;
1025
-
1026
- for (i = 0; i < text_len; i++)
1027
- {
1028
- if (text[i] == '.' && tokflag[i])
1029
- {
1030
- // Look back, make sure the token before the period
1031
- // is either single alphanumeric, or at most a two digit number
1032
- // and the character before that is a punctuation ".?!:,"
1033
-
1034
- int tcnt, acnt, dcnt, pcnt, ocnt, scnt;
1035
- tcnt = acnt = dcnt = pcnt = ocnt = scnt = 0;
1036
- char p;
1037
-
1038
- for (j = i - 1; j >= 0; j--)
1039
- {
1040
- if (isspace(text[j])) { scnt++; continue; }
1041
- else if (tcnt == 0 && isalpha(text[j])) ++acnt;
1042
- else if (tcnt == 0 && isdigit(text[j])) ++dcnt;
1043
- else if (tcnt == 1 && strchr(".!?:;,", text[j])) { pcnt++; p = text[j]; }
1044
- else ocnt++;
1045
-
1046
- if (tokflag[j] || j == 0)
1047
- {
1048
- tcnt++;
1049
- if (tcnt == 1 && ocnt == 0 && scnt == 0
1050
- && ((acnt == 1 && dcnt == 0) || (acnt == 0 && dcnt > 0 && dcnt <= 2)))
1051
- {
1052
- // This is acceptable
1053
- } else if (tcnt == 2 && pcnt <= 1 && ocnt == 0 && scnt > 0)
1054
- {
1055
- if (p == ':')
1056
- {
1057
- while (--j >= 0 && isspace(text[j]))
1058
- ;
1059
- if (j >= 0 && isdigit(text[j]))
1060
- {
1061
- // It's probably a proportion
1062
- break;
1063
- }
1064
- }
1065
- // Jackpot
1066
- tokflag[i] = 0;
1067
- } else
1068
- {
1069
- // This is not
1070
- break;
1071
- }
1072
- scnt = 0;
1073
- }
1074
- }
1075
- }
1076
- }
1077
- }
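
For example, in "... was observed. 2. The next group ...", the period after "2" is untokenized so the list marker stays with the sentence it introduces; the colon branch backs off when a digit precedes the colon, as in a "1:2" proportion.
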
1078
-
1079
- // list elements at the beginning of a string
1080
- //
1081
- // An alphanumeric token followed by a period
1082
- // at the beginning of the line stays with the
1083
- // alphanumeric
1084
-
1085
- void MPtok::tok_30()
1086
- {
1087
- int i = 0;
1088
-
1089
- while (isspace(text[i])) i++;
1090
-
1091
- if (isalnum(text[i])
1092
- && tokflag[i]
1093
- && i + 1 < text_len
1094
- && text[i + 1] == '.'
1095
- && tokflag[i + 1])
1096
- {
1097
- tokflag[i + 1] = 0;
1098
- }
1099
- }
1100
-
1101
- // process American style numbers
1102
-
1103
- void MPtok::tok_31()
1104
- {
1105
- int j;
1106
-
1107
- for (int i = 0; i < text_len; i++)
1108
- {
1109
- if (text[i] == ','
1110
- && i + 3 < text_len
1111
- && tokflag[i] && tokflag[i + 1]
1112
- && isdigit(text[i + 1])
1113
- && isdigit(text[i + 2])
1114
- && isdigit(text[i + 3])
1115
- && i - 1 >= 0 && isdigit(text[i - 1])
1116
- )
1117
- {
1118
- tokflag[i] = 0;
1119
- tokflag[i + 1] = 0;
1120
- }
1121
- }
1122
- }
1123
-
1124
- // process British style numbers
1125
-
1126
- void MPtok::tok_32()
1127
- {
1128
- int j;
1129
-
1130
- for (int i = 0; i < text_len; i++)
1131
- {
1132
- if (text[i] == ' '
1133
- && i + 3 < text_len
1134
- && tokflag[i] && tokflag[i + 1]
1135
- && isdigit(text[i + 1])
1136
- && isdigit(text[i + 2])
1137
- && isdigit(text[i + 3])
1138
- && i - 1 >= 0 && isdigit(text[i - 1])
1139
- )
1140
- {
1141
- tokflag[i] = 0;
1142
- tokflag[i + 1] = 0;
1143
- }
1144
- }
1145
- }
1146
-
1147
- // tokenize unicode escapes
1148
- //
1149
- // Added
1150
-
1151
- void MPtok::tok_33()
1152
- {
1153
- int j;
1154
-
1155
- for (int i = 0; i < text_len; i++)
1156
- {
1157
- if (text[i] == '&')
1158
- {
1159
- if (text[i + 1] == '#')
1160
- {
1161
- for (j = i + 2; isdigit(text[j]); j++)
1162
- ;
1163
- } else
1164
- {
1165
- for (j = i + 1; isalpha(text[j]); j++)
1166
- ;
1167
- }
1168
-
1169
- if (text[j] == ';')
1170
- {
1171
- // Tokenize the escape, untokenize everything inside
1172
-
1173
- tokflag[i] = 1;
1174
- for (i++; i <= j; i++) tokflag[i] = 0;
1175
- tokflag[i] = 1;
1176
- }
1177
- }
1178
- }
1179
- }
1180
-
1181
- // Remove tags if they are present
1182
-
1183
- void MPtok::tok_un()
1184
- {
1185
- int untok = 0;
1186
- for (int i = 0; text[i]; ++i)
1187
- {
1188
- if (isspace(text[i])) untok = 0;
1189
- if (text[i] == option_tagsep) untok = 1;
1190
- if (untok) text[i] = ' ';
1191
- }
1192
- }
1193
-
1194
-
1195
- void MPtok::set_tokflag()
1196
- {
1197
- int i;
1198
-
1199
- tok_0();
1200
- tok_1();
1201
- tok_2();
1202
- tok_3();
1203
-
1204
- // step 4 replaces tag char, this is done at output
1205
-
1206
- tok_5_6_7();
1207
- tok_8_9();
1208
-
1209
- tok_10();
1210
- tok_11();
1211
- if (option_new >= 1)
1212
- {
1213
- tok_21();
1214
- tok_21a();
1215
- tok_22();
1216
- tok_23();
1217
- tok_24();
1218
- tok_25();
1219
- tok_26();
1220
- tok_27();
1221
- }
1222
- tok_12();
1223
- tok_13();
1224
- tok_14();
1225
- if (option_new <= 5)
1226
- tok_15();
1227
- if (option_new < 2)
1228
- tok_16();
1229
- tok_17();
1230
-
1231
- // steps 18 and 19 recognize periods within parens,
1232
- // and this is moved to the segmentation section
1233
-
1234
- tok_20();
1235
- if (option_new >= 1)
1236
- {
1237
- tok_20_1();
1238
- tok_20_2();
1239
- if (option_new >= 2)
1240
- tok_16_1();
1241
- if (option_new >= 6)
1242
- tok_15();
1243
- if (option_new >= 7)
1244
- tok_15_1();
1245
- }
1246
- if (option_new < 1)
1247
- {
1248
- tok_21();
1249
- tok_21a();
1250
- tok_22();
1251
- tok_23();
1252
- tok_24();
1253
- tok_25();
1254
- tok_26();
1255
- tok_27();
1256
- }
1257
- tok_28();
1258
- if (option_new >= 1)
1259
- tok_29a();
1260
- else
1261
- tok_29();
1262
- tok_30();
1263
- tok_31();
1264
- tok_32();
1265
-
1266
- tok_33();
1267
- }
1268
-
1269
- /* set_endflag
1270
- **
1271
- ** After tokflag has been set, find the possible sentence endings.
1272
- */
1273
-
1274
- void MPtok::set_endflag()
1275
- {
1276
- int i;
1277
-
1278
- // The following tests look for end-stops and label them.
1279
- // They include steps 18 and 19
1280
-
1281
- for (i = 0; i <= text_len; i++)
1282
- endflag[i] = 0;
1283
-
1284
- // Count the number of unmatched parens
1285
-
1286
- int up = 0; // unmatched round parens
1287
- int ub = 0; // unmatched brackets
1288
-
1289
- for (i = 0; i < text_len; i++)
1290
- {
1291
- if (text[i] == '(') ++up;
1292
- if (text[i] == ')') --up;
1293
- if (text[i] == '[') ++ub;
1294
- if (text[i] == ']') --ub;
1295
- if (up < 0) up = 0;
1296
- if (ub < 0) ub = 0;
1297
- }
1298
-
1299
- // Now find the end-of-sentence marks
1300
-
1301
- // tok_18: periods within parentheses, allow for nesting
1302
- // tok_19: periods within brackets, allow for nesting
1303
- // the perl version solves this by putting the period
1304
- // back with the previous token, but a better solution
1305
- // is to allow it to be tokenized but just don't
1306
- // allow it to be an end-of-sentence.
1307
- // Therefore, these are moved to the segmentation
1308
- // section
1309
-
1310
- int p = 0; // round parens
1311
- int b = 0; // brackets
1312
-
1313
- for (i = 0; i < text_len; i++)
1314
- {
1315
- if (text[i] == '(') ++p;
1316
- if (text[i] == ')') --p;
1317
- if (text[i] == '[') ++b;
1318
- if (text[i] == ']') --b;
1319
- if (p < 0) p = 0;
1320
- if (b < 0) b = 0;
1321
-
1322
- if (strchr(".!?", text[i])
1323
- && tokflag[i]
1324
- && tokflag[i + 1])
1325
- {
1326
- if (option_segment && p <= up && b <= ub)
1327
- endflag[i] = 1;
1328
-
1329
- // This is optional to join periods with
1330
- // probable abbreviations
1331
-
1332
- if (p > up || b > ub)
1333
- tokflag[i] = 0;
1334
- }
1335
- }
1336
-
1337
- // endtokens followed by a single or double quote, which matches
1338
- // a single or double quote in the previous sentence
1339
-
1340
- if (option_new >= 1)
1341
- {
1342
- int dquo, squo;
1343
- dquo = squo = 0;
1344
-
1345
- for (i = 0; i < text_len; i++)
1346
- {
1347
- if (text[i] == '"') dquo = ! dquo;
1348
- else if (text[i] == '\'') squo = ! squo;
1349
- else if (endflag[i])
1350
- {
1351
- if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
1352
- {
1353
- endflag[i] = 0;
1354
-
1355
- // But don't end at all if the next token is something
1356
- // other than an upper case letter.
1357
-
1358
- if (option_new >= 2)
1359
- {
1360
- int j;
1361
- int ok = 0;
1362
-
1363
- for (j = i + 2; j < text_len; j++)
1364
- {
1365
- if (isspace(text[j])) continue;
1366
- // if (isupper(text[j]))
1367
- if (isupper(text[j]) || text[j] == '(')
1368
- {
1369
- ok = 1;
1370
- break;
1371
- }
1372
- if (tokflag[j]) break;
1373
- }
1374
-
1375
- if (ok)
1376
- endflag[i+1] = 1;
1377
- } else
1378
- {
1379
- endflag[i+1] = 1;
1380
- }
1381
- }
1382
- dquo = squo = 0;
1383
- }
1384
- }
1385
- }
1386
- }
1387
-
1388
-
1389
- /* set_endflag_01
1390
- **
1391
- ** After tokflag has been set, find the possible sentence endings.
1392
- ** This has improved paren matching.
1393
- */
1394
-
1395
- #define MAX_MATCH 500 // Maximum length to get a paren match
1396
-
1397
- void MPtok::set_endflag_01()
1398
- {
1399
- int match[text_len];
1400
- int i, j;
1401
-
1402
- // The following tests look for end-stops and label them.
1403
- // They include steps 18 and 19
1404
-
1405
- for (i = 0; i <= text_len; i++)
1406
- endflag[i] = 0;
1407
-
1408
- for (i = 0; i < text_len; i++)
1409
- match[i] = 0;
1410
-
1411
- for (i = text_len - 1; i >= 0; i--)
1412
- {
1413
- if (text[i] == '(' || text[i] == '[')
1414
- {
1415
- for (j = i + 1; text[j] && j - i <= MAX_MATCH; j++)
1416
- {
1417
- // Skip parens that are already matched
1418
-
1419
- if (match[j] > j)
1420
- {
1421
- j = match[j];
1422
- continue;
1423
- }
1424
-
1425
- // Look for a matching close paren
1426
-
1427
- if (match[j] == 0
1428
- && ((text[i] == '(' && text[j] == ')')
1429
- || (text[i] == '[' && text[j] == ']')))
1430
- {
1431
- match[i] = j;
1432
- match[j] = i;
1433
- break;
1434
- }
1435
- }
1436
- }
1437
- }
1438
-
1439
- int next_match = 0;
1440
- for (i = 0; i < text_len; i++)
1441
- {
1442
- if (match[i] > next_match)
1443
- next_match = match[i];
1444
-
1445
- if (strchr(".!?", text[i])
1446
- && tokflag[i]
1447
- && tokflag[i + 1]
1448
- && (option_new <= 4 || option_doteos == 1 || (i > 0 && isspace(text[i-1]) == 0)))
1449
- {
1450
- if (i <= next_match)
1451
- tokflag[i] = 0;
1452
- else if (option_segment)
1453
- endflag[i] = 1;
1454
- }
1455
- }
1456
-
1457
- // endtokens followed by a single or double quote, which matches
1458
- // a single or double quote in the previous sentence
1459
-
1460
- int dquo, squo;
1461
- dquo = squo = 0;
1462
-
1463
- for (i = 0; i < text_len; i++)
1464
- {
1465
- if (option_new <= 7 && text[i] == '"') dquo = ! dquo;
1466
- else if (option_new >= 8 && text[i] == '"' && tokflag[i] && tokflag[i+1]) dquo = ! dquo;
1467
- else if (option_new <= 7 && text[i] == '\'') squo = ! squo;
1468
- else if (option_new >= 8 && text[i] == '\''
1469
- && tokflag[i] && (tokflag[i+1] || (text[i+1] == '\'' && tokflag[i+2]))) squo = ! squo;
1470
- else if (endflag[i])
1471
- {
1472
- if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
1473
- {
1474
- endflag[i] = 0;
1475
-
1476
- // But don't end at all if the next token is something
1477
- // other than an upper case letter.
1478
-
1479
- if (option_new >= 2)
1480
- {
1481
- int j;
1482
- int ok = 0;
1483
-
1484
- for (j = i + 2; j < text_len; j++)
1485
- {
1486
- if (isspace(text[j])) continue;
1487
- // if (isupper(text[j]))
1488
- if (isupper(text[j]) || text[j] == '(')
1489
- {
1490
- ok = 1;
1491
- break;
1492
- }
1493
- if (tokflag[j]) break;
1494
- }
1495
-
1496
- if (ok)
1497
- endflag[i+1] = 1;
1498
- } else
1499
- {
1500
- endflag[i+1] = 1;
1501
- }
1502
- }
1503
- dquo = squo = 0;
1504
- }
1505
- }
1506
- }
1507
-
1508
-
1509
- // Size buffer: return the size of the buffer required to hold all of the tokenized text.
1510
- // It can be simply estimated by a formula that depends only on the length of text and number of tokens.
1511
-
1512
- int MPtok::size_buff()
1513
- {
1514
- int size = 1; // Start with null terminator
1515
- int t = option_pretag.size(); // for each tag, the length of the UNTAG string
1516
-
1517
- if (t <= 0) t = 1; // Make sure there is at least one
1518
- t += 2; // Add one for underscore and one for space
1519
-
1520
- for (int i = 0; i < text_len; i++)
1521
- {
1522
- size++; // Count all characters
1523
- if (tokflag[i]) size += t; // Count token delimiters (may overcount)
1524
- if (endflag[i]) size++; // Add one for newline
1525
- }
1526
- return size;
1527
- }
1528
-
1529
-
1530
- /* append_token
1531
- **
1532
- ** Save a single token to a buffer.
1533
- */
1534
-
1535
- void MPtok::append_token(string& buff, int& sp, char *tok, int ef)
1536
- {
1537
- // Convert tag separator chars and back quotes (?)
1538
-
1539
- for (int i = 0; tok[i]; i++)
1540
- {
1541
- if (tok[i] == option_tagsep) tok[i] = option_replacesep;
1542
- if (tok[i] == '`') tok[i] = '\'';
1543
- }
1544
-
1545
- // Skip whitespace if tokens are being output
1546
- // Otherwise, skip whitespace at the start of a sentence
1547
-
1548
- if (option_token || ! sp) while (isspace(*tok)) ++tok;
1549
-
1550
- // Save the token
1551
-
1552
- if (strlen(tok) > 0)
1553
- {
1554
- // Add delimiter if needed
1555
-
1556
- if (option_token && sp) buff += ' ';
1557
-
1558
- // Append token to output
1559
-
1560
- if (option_new < 9)
1561
- {
1562
- while (*tok && (! option_token || ! isspace(*tok)))
1563
- buff += *(tok++);
1564
- } else
1565
- {
1566
- while (*tok)
1567
- buff += *(tok++);
1568
- }
1569
-
1570
- sp = 1;
1571
-
1572
- // Add tag holders
1573
-
1574
- if (option_token && option_pretag.size() > 0)
1575
- {
1576
- buff += option_tagsep;
1577
- buff += option_pretag;
1578
- }
1579
-
1580
- // If it was end of sentence, then add newline
1581
-
1582
- if (ef)
1583
- {
1584
- buff += '\n';
1585
- sp = 0;
1586
- }
1587
- }
1588
- }
1589
-
1590
- // Strip whitespace after sentences
1591
-
1592
- static void adjust_space(string& buff)
1593
- {
1594
- while (buff.size() > 0 && isspace(buff[0])) buff.erase(0, 1);
1595
-
1596
- // delete two spaces in a row, but keep newlines
1597
-
1598
- for (int i = 1; i < buff.size(); i++)
1599
- {
1600
- if (isspace(buff[i]) && isspace(buff[i-1]))
1601
- buff.erase((buff[i] == '\n')?(--i):(i--), 1);
1602
- }
1603
-
1604
- for (int i = buff.size() - 1; i >= 0 && isspace(buff[i]); i--)
1605
- buff.erase(i, 1);
1606
- }
1607
-
1608
- /* token_string
1609
- **
1610
- ** After the tokflag and endflag have been set, copy the tokens to the buffer.
1611
- */
1612
-
1613
- string MPtok::token_string()
1614
- {
1615
- string buff;
1616
-
1617
- int i;
1618
-
1619
- // Move token starts to non-whitespace chars
1620
-
1621
- int last_tok = 0;
1622
- for (i = 0; i < text_len; i++)
1623
- {
1624
- if (tokflag[i] == 1 && isspace(text[i]))
1625
- {
1626
- tokflag[i] = 0;
1627
- last_tok = 1;
1628
- } else if (isspace(text[i]) == 0 && last_tok)
1629
- {
1630
- tokflag[i] = 1;
1631
- last_tok = 0;
1632
- }
1633
- }
1634
-
1635
- // Extract the tokens and print them out now
1636
-
1637
- char *tok = new char[text_len + 1];
1638
- int pos = 0;
1639
- int sp = 0;
1640
- int ef = 0;
1641
-
1642
- tok[pos] = '\0';
1643
-
1644
- for (i = 0; i <= text_len; i++)
1645
- {
1646
- // The start of a new token
1647
-
1648
- if (tokflag[i])
1649
- {
1650
- // Print the current token
1651
-
1652
- append_token(buff, sp, tok, ef);
1653
-
1654
- // Start a new token
1655
-
1656
- pos = 0;
1657
- tok[pos] = '\0';
1658
-
1659
- ef = 0;
1660
- }
1661
-
1662
- // Append to the current token
1663
-
1664
- tok[pos++] = text[i];
1665
- tok[pos] = '\0';
1666
-
1667
- // If any of the characters in the token are endflagged,
1668
- // Then pass this information along for end-of-sentence
1669
-
1670
- if (endflag[i]) ef = 1;
1671
- }
1672
-
1673
- // Print the last token
1674
-
1675
- append_token(buff, sp, tok, ef);
1676
-
1677
- delete[] tok;
1678
-
1679
- // Adjust the end of sentence boundaries
1680
-
1681
- adjust_space(buff);
1682
-
1683
- return buff;
1684
- }
1685
-
1686
- void MPtok::map_escapes()
1687
- {
1688
- char *s;
1689
- int j, k, ch;
1690
- char buff[10];
1691
-
1692
- k = 0;
1693
- for (int i = 0; text[i]; i++)
1694
- {
1695
- if (text[i] == '&' && text[i + 1] == '#')
1696
- {
1697
- for (s = &buff[0], j = 2; j <= 4 && i + j < text_len && isdigit(text[i + j]); j++)
1698
- *s++ = text[i + j];
1699
- *s = '\0';
1700
- ch = atoi(buff);
1701
- if (strlen(buff) > 0 && text[i + j] == ';' && ch > 0 && ch <= 256)
1702
- {
1703
- text[k] = ch;
1704
- if (! text[k]) text[k] = ' ';
1705
- k++;
1706
- i = i + j;
1707
- continue;
1708
- }
1709
- }
1710
- text[k++] = text[i];
1711
- }
1712
- text[k] = '\0';
1713
- text_len = k;
1714
- }
1715
-
1716
- MPtok::MPtok(string idir, const string& cnam)
1717
- {
1718
- tok_initialized = 0;
1719
-
1720
- if (idir.size() == 0)
1721
- {
1722
- char *p = getenv("MEDPOST_HOME");
1723
- if (p && strlen(p))
1724
- {
1725
- idir = p;
1726
-
1727
- int found = idir.find("=");
1728
- if (found != string::npos)
1729
- idir = idir.substr(found + 1);
1730
- }
1731
- }
1732
-
1733
-
1734
- if (idir.size() == 0)
1735
- {
1736
- char buff[1000];
1737
- FILE *fp = fopen("path_medpost", "r");
1738
- if (fp)
1739
- {
1740
- if (fgets(buff, 1000, fp))
1741
- {
1742
- chomp(buff);
1743
- idir = &buff[0];
1744
- }
1745
- fclose(fp);
1746
- }
1747
- }
1748
-
1749
- if (idir.size() == 0)
1750
- idir = "/home/natxie/CPP64/lib/FIXED_DATA/";
1751
-
1752
- option_dir = idir;
1753
-
1754
- option_token = 1;
1755
- option_segment = 1;
1756
- option_hyphen = 0;
1757
- option_comma = 1;
1758
- option_pretok = 0;
1759
- option_new = MPTOK_VERSION;
1760
- option_doteos = 0;
1761
-
1762
- if (cnam.size() > 0)
1763
- {
1764
- option_cnam = "_";
1765
- option_cnam += cnam;
1766
- }
1767
-
1768
- init();
1769
- }
1770
-
1771
- void MPtok::init(void)
1772
- {
1773
- if (tok_initialized) return;
1774
-
1775
- string fname;
1776
-
1777
- fname = option_dir + "/medpost" + option_cnam + ".pairs";
1778
- init_pair(fname);
1779
-
1780
- fname = option_dir + "/medpost" + option_cnam + ".abbr";
1781
- init_abbr(fname);
1782
-
1783
- tok_initialized = 1;
1784
- }
1785
-
1786
- MPtok::~MPtok()
1787
- {
1788
- }
1789
-
1790
- // Global tokenizer
1791
-
1792
- string MPtok::tokenize(const string& txt, int mt)
1793
- {
1794
- if (option_pretok) return save_string(txt);
1795
-
1796
- option_token = mt;
1797
- text_len = txt.size();
1798
- if (text_len == 0) return string("");
1799
-
1800
- text = new char[text_len + 1];
1801
- strcpy(text, txt.c_str());
1802
-
1803
- map_escapes();
1804
-
1805
- if (text_len == 0) return NULL;
1806
-
1807
- tokflag = new int[text_len + 1];
1808
- endflag = new int[text_len + 1];
1809
-
1810
- set_tokflag();
1811
- if (option_new < 3)
1812
- set_endflag();
1813
- else
1814
- set_endflag_01();
1815
-
1816
- string buff = token_string();
1817
- save_string(buff);
1818
-
1819
- delete[] text; text = NULL;
1820
- delete[] tokflag; tokflag = NULL;
1821
- delete[] endflag; endflag = NULL;
1822
-
1823
- return buff;
1824
- }
1825
-
1826
- string MPtok::tokenize(const string& text)
1827
- {
1828
- return tokenize(text, 1);
1829
- }
1830
-
1831
- string MPtok::segment(const string& text)
1832
- {
1833
- sent.clear();
1834
-
1835
- // tokenize the text
1836
-
1837
- int save_option_segment = option_segment;
1838
- option_segment = 1;
1839
- string buff = tokenize(text, 0);
1840
- option_segment = save_option_segment;
1841
-
1842
- if (buff.size() == 0) return text;
1843
-
1844
- int found = 0;
1845
- int pos = 0;
1846
-
1847
- while (pos < buff.size())
1848
- {
1849
- found = buff.find('\n', pos);
1850
- if (found == string::npos)
1851
- {
1852
- sent.push_back(buff.substr(pos));
1853
- pos = buff.size();
1854
- } else
1855
- {
1856
- sent.push_back(buff.substr(pos, found - pos));
1857
- pos = found + 1;
1858
- }
1859
- }
1860
-
1861
- return buff;
1862
- }
1863
-
1864
- string MPtok::save_string(const string& s)
1865
- {
1866
- stringstream ss (stringstream::in | stringstream::out);
1867
- string w, t;
1868
- int found;
1869
- string ret;
1870
-
1871
- word.clear();
1872
- tag.clear();
1873
-
1874
- ss << s;
1875
- while (ss.good())
1876
- {
1877
- ss >> w;
1878
- if (w.size() == 0) break;
1879
-
1880
- found = w.find('_');
1881
-
1882
- if (found != string::npos)
1883
- {
1884
- t = w.substr(found + 1);
1885
- w.resize(found);
1886
- word.push_back(w);
1887
- tag.push_back(t);
1888
- } else
1889
- {
1890
- word.push_back(w);
1891
- tag.push_back(option_pretag);
1892
-
1893
- }
1894
- if (ret.size() > 0) ret += " ";
1895
- ret += w;
1896
- }
1897
-
1898
- // now look for continuation tags...
1899
-
1900
- for (int i = 0; i < word.size(); i++)
1901
- {
1902
- int j = tag[i].size() - 1;
1903
- if (j >= 0 && tag[i][j] == '+' && i < tag.size() - 1)
1904
- {
1905
- word[i] = word[i] + " " + word[i + 1];
1906
- tag[i] = tag[i + 1];
1907
- word.erase(word.begin() + i + 1, word.begin() + i + 2);
1908
- tag.erase(tag.begin() + i + 1, tag.begin() + i + 2);
1909
- i--;
1910
- }
1911
- }
1912
-
1913
- return ret;
1914
- }
1915
-
1916
-
1917
- static int count_words(const char *s)
1918
- {
1919
- int i;
1920
-
1921
- i = 1;
1922
- for (; *s; ++s)
1923
- {
1924
- if (*s == ' ') ++i;
1925
- }
1926
- return i;
1927
- }
1928
-
1929
- static void print_word(const char *s, int i)
1930
- {
1931
- for (; i > 0 && *s; ++s) { if (*s == ' ') --i; }
1932
- while (*s && *s != ' ') { printf("%c", *s); ++s; }
1933
- }
1934
-
1935
- void MPtok::print(int how)
1936
- {
1937
- int i, j, w;
1938
-
1939
- if (how != 0 && how != 2)
1940
- {
1941
- printf("print(%d) not defined\n", how);
1942
- return;
1943
- }
1944
-
1945
- for (i = 0; i < word.size(); ++i)
1946
- {
1947
- // Get the words from an idiom
1948
-
1949
- for (w = 0; w < count_words(word[i].c_str()); ++w)
1950
- {
1951
- if (how == 2 && i + w > 0) printf(" ");
1952
-
1953
- print_word(word[i].c_str(), w);
1954
-
1955
- if (how == 0)
1956
- {
1957
- printf(" tagged %s", tag[i].c_str());
1958
- if (w < count_words(word[i].c_str()) - 1) printf("+");
1959
- printf("\n");
1960
- } else if (how == 2)
1961
- {
1962
- printf("%s%s", "_", tag[i].c_str());
1963
- if (w < count_words(word[i].c_str()) - 1) printf("+");
1964
- }
1965
- }
1966
- }
1967
- if (how == 2)
1968
- printf("\n");
1969
- }
1970
-
1971
- void MPtok::merge_words(int s, int n)
1972
- {
1973
- string tmp = word[s];
1974
-
1975
- for (int i = s + 1; i < s + n; i++)
1976
- {
1977
- tmp += " ";
1978
- tmp += word[i];
1979
- }
1980
-
1981
- // printf("merging words : '%s' n = %d\n", tmp.c_str(), n);
1982
-
1983
- for (int k = s; k + n < word.size(); k++)
1984
- {
1985
- word[k+1] = word[k+n];
1986
- tag[k+1] = tag[k+n];
1987
- }
1988
-
1989
- // Fixup the remaining array
1990
-
1991
- word.resize(word.size() - n + 1);
1992
- tag.resize(word.size());
1993
-
1994
- word[s] = tmp;
1995
- }
1996
-
1997
- void MPtok::split_words()
1998
- {
1999
- for (int i = 0; i < word.size(); i++)
2000
- {
2001
- int found = word[i].find(' ');
2002
-
2003
- if (found != string::npos)
2004
- {
2005
- string tmp1(word[i], 0, found);
2006
- string tmp2(word[i], found + 1, string::npos);
2007
-
2008
- // Move all the words and tags down
2009
-
2010
- word.resize(word.size() + 1);
2011
- tag.resize(tag.size() + 1);
2012
-
2013
- for (int j = word.size() - 1; j > i; j--)
2014
- {
2015
- word[j] = word[j - 1];
2016
- tag[j] = tag[j - 1];
2017
- }
2018
-
2019
- word[i] = tmp1;
2020
- tag[i] = tag[i+1];
2021
- tag[i] += "+";
2022
-
2023
- word[i+1] = tmp2;
2024
- }
2025
- }
2026
- }
2027
-
2028
- // Callable functions to set internal options
2029
-
2030
- void MPtok::set_segment(int i) { option_segment = i; }
2031
- void MPtok::set_hyphen(int i) { option_hyphen = i; }
2032
- void MPtok::set_comma(int i) { option_comma = i; }
2033
- void MPtok::set_pretag(char *a) { option_pretag = a; }
2034
- void MPtok::set_pretok(int i) { option_pretok = i; }
2035
- void MPtok::set_new(int i) { option_new = i; }
2036
- void MPtok::set_doteos(int i) { option_doteos = i; }
 
1
+ #include <stdio.h>
2
+ #include <ctype.h>
3
+ #include <string.h>
4
+ #include <stdlib.h>
5
+
6
+ #include <string>
7
+ #include <iostream>
8
+ #include <fstream>
9
+ #include <sstream>
10
+
11
+ #include "MPtok.h"
12
+
13
+ // These options are probably compile time constants
14
+
15
+ static char option_tagsep = '_'; // The tagsep character
16
+ static char option_replacesep = '-'; // Replace tagsep with this
17
+
18
+ static void chomp(char *line)
19
+ {
20
+ int i;
21
+
22
+ i = strlen(line) - 1;
23
+ while (i >= 0 && (line[i] == '\n' || line[i] == '\r'))
24
+ line[i--] = '\0';
25
+ }
26
+
27
+ // Data structure and algorithm for finding common pairs.
28
+
29
+ // read a file of pairs into a data structure,
30
+ // (entries go into a std::set, so the file no longer needs to be pre-sorted)
31
+
32
+ void MPtok::init_pair(const string& file_name)
33
+ {
34
+ filebuf fb;
35
+ fb.open(file_name.c_str(), ios::in);
36
+ istream is(&fb);
37
+ string pair;
38
+
39
+ while (1)
40
+ {
41
+ getline(is, pair);
42
+ if (is.fail()) break;
43
+ if (pair.size() > 0) common_pair.insert(pair);
44
+ }
45
+
46
+ fb.close();
47
+ }
48
+
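+ // Illustrative sketch of the .pairs file layout (hypothetical entries, not
+ // copied from the shipped data): one "token. token" pattern per line that
+ // should never be split into two sentences, e.g.
+ //
+ // St. Louis
+ // E. coli
+ //
+ // Each line read into common_pair is later matched verbatim against the
+ // text window built in tok_15_1().
+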
49
+ // List of abbreviations in 3 categories
50
+ // ABB = can occur mid sentence
51
+ // EOS = can occur at end of sentence
52
+ // NUM = only used before numbers
53
+
54
+ void MPtok::init_abbr(const string& file_name)
55
+ {
56
+ filebuf fb;
57
+ fb.open(file_name.c_str(), ios::in);
58
+ istream is(&fb);
59
+ string typ, abb;
60
+ map<string,int> val;
61
+ val["ABB"] = ABB_ABB; val["EOS"] = ABB_EOS; val["NUM"] = ABB_NUM;
62
+
63
+ while (is.good())
64
+ {
65
+ is >> typ;
66
+ if (val.count(typ))
67
+ {
68
+ is >> abb;
69
+ if (abb.size() > 0) common_abbr[abb] = val[typ];
70
+ }
71
+ }
72
+ fb.close();
73
+ }
74
+
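+ // Illustrative sketch of the .abbr file layout (hypothetical entries, not
+ // copied from the shipped data): a category keyword followed by the
+ // abbreviation itself, whitespace separated, e.g.
+ //
+ // ABB Dr.
+ // EOS etc.
+ // NUM approx.
+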
75
+ static char nextchar(const char *t, int i)
76
+ {
77
+ while (isspace(t[i])) i++;
78
+ return t[i];
79
+ }
80
+
81
+ // Look for a token at or prior to the text position
82
+
83
+ static int lookbehind(const char *t, int i, const char *s, int *tokflag)
84
+ {
85
+ int k = (int) strlen(s) - 1;
86
+
87
+ while (i > 0 && isspace(t[i])) i--;
88
+
89
+ while (k >= 0 && i >= 0)
90
+ {
91
+ if (k > 0 && tokflag[i]) break;
92
+
93
+ if (tolower(s[k]) != tolower(t[i]))
94
+ return -1;
95
+ k--;
96
+ i--;
97
+ }
98
+
99
+ return (k < 0 && tokflag[i+1]) ? i + 1 : -1;
100
+ }
101
+
102
+ // Look for a token at or following the text position
103
+
104
+ static int lookahead(const char *t, int i, const char *s, int *tokflag)
105
+ {
106
+ int k = 0;
107
+
108
+ while (isspace(t[i])) i++;
109
+
110
+ while (k < strlen(s) && i < strlen(t))
111
+ {
112
+ if (k > 0 && tokflag[i]) break;
113
+
114
+ if (tolower(s[k]) != tolower(t[i]))
115
+ return -1;
116
+ k++;
117
+ i++;
118
+ }
119
+
120
+ return (k == strlen(s) && tokflag[i]) ? i - (int) strlen(s) : -1;
121
+ }
122
+
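+ // Usage sketch (grounded in tok_29 below): lookbehind(text, i - 2, ".", tokflag)
+ // asks whether the whole token ending at or before position i - 2 is a lone
+ // period; both helpers return the start offset of the match, or -1 if the
+ // characters or the token boundaries in tokflag[] do not line up.
+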
123
+ // Set the initial tokens at spaces
124
+
125
+ void MPtok::tok_0()
126
+ {
127
+ int i;
128
+
129
+ tokflag[0] = 1;
130
+ for (i = 1; i < text_len; i++)
131
+ {
132
+ tokflag[i] = isspace(text[i]) || (i > 0 && isspace(text[i - 1])) ? 1 : 0;
133
+ }
134
+ tokflag[i] = 1;
135
+ }
136
+
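+ // Worked example (illustrative): for text = "a bc", tok_0 yields
+ // tokflag = {1, 1, 1, 0, 1}; the space itself is flagged here, and token
+ // starts are only moved onto non-space characters later, in token_string().
+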
137
+ // Get quotes preceded by open parens
138
+ //
139
+ // A double quote, preceded by a space or open bracket is a separate token
140
+ //
141
+
142
+ void MPtok::tok_1()
143
+ {
144
+ for (int i = 1; i < text_len; i++)
145
+ {
146
+ if (text[i] == '"' && strchr("([{<", text[i-1]))
147
+ {
148
+ tokflag[i] = 1;
149
+ if (i + 1 < text_len) tokflag[i+1] = 1;
150
+ }
151
+ }
152
+ }
153
+
154
+ // Look for ellipses
155
+ //
156
+ // Three dots in a row is a separate token
157
+
158
+ void MPtok::tok_2()
159
+ {
160
+ for (int i = 1; i + 2 < text_len; i++)
161
+ {
162
+ if (strncmp(&text[i], "...", 3) == 0)
163
+ {
164
+ tokflag[i] = 1;
165
+ if (i + 3 < text_len) tokflag[i+3] = 1;
166
+ }
167
+ }
168
+ }
169
+
170
+ // Non-sentence-ending punctuation
171
+ //
172
+ // Certain punctuation characters are separate tokens
173
+
174
+ void MPtok::tok_3()
175
+ {
176
+ for (int i = 0; i < text_len; i++)
177
+ {
178
+ // If it is a comma and the next char is not a space and option_comma = 0
179
+
180
+ if (option_comma == 0 && text[i] == ',' && isspace(text[i + 1]) == 0)
181
+ {
182
+ // do nothing
183
+ } else if (strchr(",;:@#$%&", text[i]))
184
+ {
185
+ tokflag[i] = 1;
186
+ tokflag[i + 1] = 1;
187
+ }
188
+ }
189
+ }
190
+
191
+ // Separate the slashes
192
+ //
193
+ // Slashes are a separate token
194
+ // except for +/-, +/+, -/-, -/+, and and/or.
195
+
196
+ void MPtok::tok_5_6_7()
197
+ {
198
+ for (int i = 0; i < text_len; i++)
199
+ {
200
+ if (text[i] == '/')
201
+ {
202
+ tokflag[i] = 1;
203
+ if (i+1 < text_len) tokflag[i+1] = 1;
204
+
205
+ // Put back +/-, etc, unless option_hyphen is 1
206
+
207
+ if (i - 1 >= 0
208
+ && i + 1 < text_len
209
+ && ((option_new < 9
210
+ && (text[i - 1] == '+' || (text[i - 1] == '-' && option_hyphen == 0))
211
+ && (text[i + 1] == '+' || (text[i + 1] == '-' && option_hyphen == 0)))
212
+ || (option_new >= 9
213
+ && (text[i - 1] == '+' || text[i - 1] == '-')
214
+ && (text[i + 1] == '+' || text[i + 1] == '-'))))
215
+ {
216
+ tokflag[i - 1] = 1;
217
+ tokflag[i] = tokflag[i+1] = 0;
218
+ tokflag[i + 2] = 1;
219
+ }
220
+
221
+ // Put back and/or, etc
222
+
223
+ if (option_new <= 7)
224
+ {
225
+ if (i > 5 && strncmp(text + i - 5, " and/or ", 8) == 0)
226
+ {
227
+ for (int j = 1; j < 5; j++)
228
+ tokflag[i - 2 + j] = 0;
229
+ }
230
+ } else
231
+ {
232
+ if (i > 4 && strncmp(text + i - 4, " and/or ", 8) == 0)
233
+ {
234
+ for (int j = 1; j < 6; j++)
235
+ tokflag[i - 3 + j] = 0;
236
+ }
237
+ }
238
+ }
239
+ }
240
+ }
241
+
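+ // Example behaviour (illustrative): "p53/p21" is split into the three
+ // tokens "p53", "/" and "p21", while "+/-" and "and/or" are re-joined into
+ // single tokens by the exceptions above.
+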
242
+ // All brackets
243
+ //
244
+ // Any open or closed bracket is a separate token
245
+ //
246
+ // Exclamation and question mark
247
+ //
248
+ // Any question or exclamation mark is a separate token
249
+
250
+ void MPtok::tok_8_9()
251
+ {
252
+ for (int i = 0; i < text_len; i++)
253
+ {
254
+ if (strchr("[](){}<>", text[i])
255
+ || strchr("?!", text[i]))
256
+ {
257
+ tokflag[i] = 1;
258
+ if (i + 1 < text_len) tokflag[i+1] = 1;
259
+ }
260
+ }
261
+ }
262
+
263
+ // Period at the end of a string may be followed by closed-bracket or quote
264
+ //
265
+ // A period that is preceded by a non-period
266
+ // and optionally followed by closing brackets or quotes
267
+ // and any amount of space at the end of the string
268
+ // is a separate token.
269
+
270
+ void MPtok::tok_10()
271
+ {
272
+ for (int i = text_len - 1; i >= 0; i--)
273
+ {
274
+ if (isspace(text[i])) continue;
275
+ if (strchr("])}>\"'", text[i])) continue;
276
+ if (text[i] != '.') break;
277
+ if (text[i] == '.' && (i - 1 < 0 || text[i-1] != '.'))
278
+ {
279
+ tokflag[i] = 1;
280
+ if (i + 1 < text_len) tokflag[i+1] = 1;
281
+ }
282
+ }
283
+ }
284
+
285
+ // Period followed by a capitalized word
286
+ //
287
+ // A period preceded by a character that is not another period and not a space
288
+ // and followed by a space then an upper case letter is a separate token
289
+
290
+ void MPtok::tok_11()
291
+ {
292
+ for (int i = 0; i < text_len; i++)
293
+ {
294
+ if (text[i] == '.'
295
+ && (i + 1 < text_len && isspace(text[i+1]))
296
+ && (i - 1 < 0 || (text[i - 1] != '.' && isspace(text[i-1]) == 0))
297
+ && isupper(nextchar(text, i + 1)))
298
+ tokflag[i] = 1;
299
+ }
300
+ }
301
+
302
+ // A normal word followed by a period
303
+ //
304
+ // A period followed by a space
305
+ // and preceded by 2 or more alphabetic characters or hyphens
306
+ // is a separate token
307
+
308
+ void MPtok::tok_12()
309
+ {
310
+ int wcnt = 0;
311
+
312
+ for (int i = 0; i < text_len; i++)
313
+ {
314
+ if (text[i] == '.'
315
+ && tokflag[i + 1]
316
+ && wcnt >= 2)
317
+ tokflag[i] = 1;
318
+
319
+ if (isalpha(text[i]) || text[i] == '-')
320
+ ++wcnt;
321
+ else
322
+ wcnt = 0;
323
+ }
324
+ }
325
+
326
+ // A non-normal token (that has no lower case letters) followed by a period
327
+ //
328
+ // A period at the end of a token made of characters excluding lower case
329
+ // is a separate token
330
+
331
+ void MPtok::tok_13()
332
+ {
333
+ int stok = 0;
334
+ int wcnt = 0;
335
+
336
+ for (int i = 0; i < text_len; i++)
337
+ {
338
+ if (text[i] == '.'
339
+ && tokflag[i + 1]
340
+ && wcnt >= 2)
341
+ tokflag[i] = 1;
342
+
343
+ if (tokflag[i] == 1) stok = 1;
344
+
345
+ if (islower(text[i]) || text[i] == '.')
346
+ {
347
+ stok = 0;
348
+ wcnt = 0;
349
+ }
350
+
351
+ if (stok)
352
+ wcnt++;
353
+ }
354
+ }
355
+
356
+ // put some periods with single-letter abbreviations
357
+ //
358
+ // A single alphabetic token followed by a period followed
359
+ // by a token that does not begin with an upper case letter
360
+ // or number is taken to be an abbreviation and the period
361
+ // does not start a new token.
362
+ //
363
+ // NOTE: This does not recognize initials in people's names,
364
+ // that problem is not simply solved.
365
+
366
+ void MPtok::tok_14()
367
+ {
368
+ for (int i = 0; i < text_len; i++)
369
+ {
370
+ if (text[i] == '.'
371
+ && i - 1 >= 0 && isalpha(text[i - 1]) && tokflag[i - 1]
372
+ && tokflag[i + 1]
373
+ && isupper(nextchar(text, i + 1)) == 0
374
+ && isdigit(nextchar(text, i + 1)) == 0
375
+ && nextchar(text, i + 1) != '('
376
+ )
377
+ {
378
+ tokflag[i] = 0;
379
+ }
380
+ }
381
+ }
382
+
383
+ void MPtok::tok_15()
384
+ {
385
+ int i, j, k, a;
386
+ char buff[MAX_ABB + 1];
387
+
388
+ for (i = 0; i < text_len; i++)
389
+ {
390
+ // only start at a current token
391
+
392
+ if (! tokflag[i]) continue;
393
+
394
+ // find alphabetic followed by period
395
+
396
+ buff[0] = '\0';
397
+ for (k = 0; i + k < text_len && k < MAX_ABB; k++)
398
+ {
399
+ buff[k] = text[i+k]; buff[k+1] = '\0';
400
+ if (k > 0 && buff[k] == '.') break; // this is good
401
+ if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
402
+ }
403
+
404
+ if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
405
+
406
+ // at this point, buff[k] == '.' add 1 to make it the length
407
+
408
+ k++;
409
+
410
+ // if not found, try finding a concatenated abbrev
411
+
412
+ if (! common_abbr.count(buff))
413
+ {
414
+ for (; i + k < text_len && k < MAX_ABB; k++)
415
+ {
416
+ buff[k] = text[i+k]; buff[k+1] = '\0';
417
+ if (k > 0 && buff[k] == '.') break; // this is good
418
+ if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
419
+ }
420
+
421
+ if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
422
+
423
+ // at this point, buff[k] == '.' add 1 to make it the length
424
+
425
+ k++;
426
+ }
427
+
428
+ // if not found, give up
429
+
430
+ if (! common_abbr.count(buff)) continue;
431
+
432
+ if (common_abbr[buff] == ABB_NUM)
433
+ {
434
+ for (j = i + k; j < text_len && isspace(text[j]); j++) ; // next must be a number
435
+ if (! isdigit(text[j])) continue; // go to next abbreviation
436
+ } else if (common_abbr[buff] == ABB_EOS)
437
+ {
438
+ for (j = i + k; j < text_len && isspace(text[j]); j++) ; // if next token is upper case letter
439
+ if (isupper(text[j])) tokflag[i + (--k)] = 1; // tokenize the final period of this abbreviation
440
+ }
441
+
442
+ // clear all token flags
443
+
444
+ for (j = 1; j < k; j++) tokflag[i + j] = 0;
445
+ }
446
+ }
447
+
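+ // Example (illustrative): if "No." is registered as NUM, its period is kept
+ // inside the token only when a digit follows ("No. 7"); if "etc." is
+ // registered as EOS and the next word is capitalized, the final period is
+ // re-tokenized so it can still close the sentence.
+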
448
+ // Check for common pairs that should not be considered sentence breaks
449
+
450
+ void MPtok::tok_15_1()
451
+ {
452
+ int i, j, k, tnum, p;
453
+ char buff[MAX_ABB + 1];
454
+
455
+ for (i = 0; i < text_len; i++)
456
+ {
457
+ if (! tokflag[i]) continue;
458
+
459
+ // must be alphanumeric token followed by period token followed by space followed by alphanumeric token
460
+
461
+ tnum = 0;
462
+ buff[0] = '\0';
463
+ for (p = k = 0; i + k < text_len && k < MAX_ABB; k++)
464
+ {
465
+ buff[k] = text[i+k]; buff[k+1] = '\0';
466
+
467
+ if (isspace(buff[k]))
468
+ {
469
+ if (tnum == 2) break; // this is good
470
+ else if (tnum == 1) continue; // ok
471
+ else { buff[0] = '\0'; break; } // this shouldn't happen
472
+ }
473
+
474
+ if (tokflag[i+k])
475
+ {
476
+ if (tnum > 2) break; // done
477
+ else tnum++;
478
+ }
479
+
480
+ if (tnum == 1 && buff[k] == '.') p = k;
481
+ if (tnum == 1 && buff[k] != '.') { buff[0] = '\0'; break; } // nope
482
+ if (! isalnum(buff[k])) { buff[0] = '\0'; break; } // nope
483
+ }
484
+
485
+ if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
486
+
487
+ // at this point buff is a potential pair, so untokenize the period, that's all
488
+
489
+ if (common_pair.count(buff))
490
+ tokflag[i + p] = 0; // the period sits at offset p inside the window that starts at i
491
+ }
492
+ }
493
+
494
+ // Get cases where a space after a sentence has been omitted
495
+ //
496
+ // A period that occurs in a token consisting of alphabetic
497
+ // letters with a vowel to the left and the right is a
498
+ // separate token.
499
+
500
+ void MPtok::tok_16()
501
+ {
502
+ int j;
503
+ int has_vowel;
504
+
505
+ for (int i = 0; i < text_len; i++)
506
+ {
507
+ if (text[i] == '.' && tokflag[i] == 0)
508
+ {
509
+ has_vowel = 0;
510
+ for (j = i - 1; j >= 0; --j)
511
+ {
512
+ if (isalpha(text[j]) == 0)
513
+ break;
514
+ if (strchr("aeiouAEIOU", text[j]))
515
+ has_vowel = 1;
516
+ if (tokflag[j])
517
+ break;
518
+ }
519
+ if ((j >= 0 && tokflag[j] == 0) || has_vowel == 0)
520
+ continue;
521
+
522
+ j = i + 1;
523
+
524
+ has_vowel = 0;
525
+ for (; j < text_len && tokflag[j] == 0; ++j)
526
+ {
527
+ if (isalpha(text[j]) == 0)
528
+ break;
529
+ if (strchr("aeiouAEIOU", text[j]))
530
+ has_vowel = 1;
531
+ }
532
+
533
+ if ((j < text_len && tokflag[j] == 0) || has_vowel == 0)
534
+ continue;
535
+
536
+ tokflag[i] = 1;
537
+ tokflag[i + 1] = 1;
538
+ }
539
+ }
540
+ }
541
+
542
+ // Correction to tok_16,
543
+ // Don't count if the token before is a single letter
544
+ // or the token following is a single letter other than 'a'.
545
+ // Also, don't count if the token to the right is gov, com, edu, etc.
546
+ // because those are web addresses!
547
+
548
+ #define COMPLEX_WINDOW 40
549
+
550
+ enum {COMPLEX_NOT = 0, COMPLEX_YES, COMPLEX_DONE};
551
+
552
+ struct _complex {
553
+ int flag;
554
+ int offset;
555
+ const char *str;
556
+ int len;
557
+ } complex[] = {
558
+ COMPLEX_YES, 0, "complex", 7,
559
+ COMPLEX_NOT, 0, "complexi", 8,
560
+ COMPLEX_NOT, 0, "complexed", 9,
561
+ COMPLEX_NOT, 0, "complexa", 8,
562
+ COMPLEX_NOT, 0, "complex-", 8,
563
+ COMPLEX_NOT, 0, "complexl", 8,
564
+ COMPLEX_NOT, 0, "complexu", 8,
565
+ COMPLEX_NOT, -1, "-complex", 7,
566
+ COMPLEX_NOT, -2, "nocomplex", 9,
567
+ COMPLEX_NOT, -3, "subcomplex", 10,
568
+ COMPLEX_YES, 0, "hybrid", 6,
569
+ COMPLEX_NOT, 0, "hybridi", 7,
570
+ COMPLEX_NOT, 0, "hybrido", 7,
571
+ COMPLEX_NOT, 0, "hybrida", 7,
572
+ COMPLEX_NOT, 0, "hybrid-", 7,
573
+ COMPLEX_NOT, -1, "-hybrid", 7,
574
+ COMPLEX_YES, 0, "duplex", 6,
575
+ COMPLEX_NOT, -1, "oduplex", 7,
576
+ COMPLEX_DONE, 0, NULL, 0,
577
+ };
578
+
579
+ int MPtok::complex_check()
580
+ {
581
+ int last_period = -2*COMPLEX_WINDOW;
582
+ int last_complex = -2*COMPLEX_WINDOW;
583
+ int i, j;
584
+ int complex_match;
585
+
586
+ for (i = 0; i < text_len; i++)
587
+ {
588
+ if (text[i] == '.')
589
+ {
590
+ if (i - last_complex <= COMPLEX_WINDOW)
591
+ return 1;
592
+ last_period = i;
593
+ }
594
+
595
+ complex_match = 0;
596
+ for (j = 0; complex[j].str; j++)
597
+ {
598
+ if (complex[j].flag == COMPLEX_NOT)
599
+ {
600
+ if (i + complex[j].offset >= 0
601
+ && strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
602
+ {
603
+ // don't match here
604
+ complex_match = 0;
605
+ }
606
+ } else if (complex[j].flag == COMPLEX_YES)
607
+ {
608
+ if (i + complex[j].offset >= 0
609
+ && strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
610
+ {
611
+ // match here
612
+ complex_match = 1;
613
+ }
614
+ }
615
+ }
616
+
617
+ if (complex_match)
618
+ {
619
+ if (i - last_period <= COMPLEX_WINDOW)
620
+ return 1;
621
+ last_complex = i;
622
+ }
623
+ }
624
+ return 0;
625
+ }
626
+
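+ // Sketch of the heuristic (as implemented above): whenever a '.' and a
+ // COMPLEX_YES match such as "complex", "hybrid" or "duplex" (minus the
+ // COMPLEX_NOT exclusions) fall within COMPLEX_WINDOW = 40 characters of
+ // each other, complex_check() returns 1 and tok_16_1() backs off, since
+ // names like "DNA.RNA hybrid" would otherwise look like run-on sentences.
+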
627
+ void MPtok::tok_16_1()
628
+ {
629
+ int i, j;
630
+ char v1, v2;
631
+ int c1, c2;
632
+
633
+ if (option_new == 3 && strstr(text, "complex"))
634
+ return;
635
+
636
+ if (option_new >= 4 && complex_check())
637
+ return;
638
+
639
+ for (i = 0; i < text_len; i++)
640
+ {
641
+ if (text[i] == '.' && tokflag[i] == 0)
642
+ {
643
+ char suffix[10];
644
+ int s_i;
645
+
646
+ v1 = '\0';
647
+ c1 = 0;
648
+ for (j = i - 1; j >= 0; --j)
649
+ {
650
+ if (isalpha(text[j]) == 0)
651
+ break;
652
+ if (strchr("aeiouAEIOU", text[j]))
653
+ v1 = tolower(text[j]);
654
+ c1++;
655
+ if (tokflag[j])
656
+ break;
657
+ }
658
+ if ((j >= 0 && tokflag[j] == 0)
659
+ || v1 == '\0'
660
+ || c1 == 1)
661
+ continue;
662
+
663
+ j = i + 1;
664
+
665
+ v2 = '\0';
666
+ c2 = 0;
667
+ s_i = 0;
668
+ for (; j < text_len && tokflag[j] == 0; ++j)
669
+ {
670
+ if (isalpha(text[j]) == 0)
671
+ break;
672
+ if (strchr("aeiouAEIOU", text[j]))
673
+ v2 = tolower(text[j]);
674
+ if (s_i < 3)
675
+ suffix[s_i++] = tolower(text[j]);
+ suffix[s_i] = '\0'; // terminate on every pass, even once the 3-char cap is reached
676
+ c2++;
677
+ }
678
+
679
+ if ((j < text_len && tokflag[j] == 0)
680
+ || v2 == '\0'
681
+ || (c2 == 1 && v2 != 'a')
682
+ || (c2 == 3 && tokflag[j] == 1 && s_i == 3
683
+ && (strcmp(suffix, "gov") == 0
684
+ || strcmp(suffix, "edu") == 0
685
+ || strcmp(suffix, "org") == 0
686
+ || strcmp(suffix, "com") == 0)))
687
+ continue;
688
+
689
+ tokflag[i] = 1;
690
+ tokflag[i + 1] = 1;
691
+ }
692
+ }
693
+ }
694
+
695
+
696
+ // Numeric endings of sentences
697
+ //
698
+ // A period after a numeric token followed by a token that starts
699
+ // with an alphabetic character, is a separate token.
700
+ //
701
+ // This should be covered already by tok_13
702
+
703
+ void MPtok::tok_17()
704
+ {
705
+ int j;
706
+
707
+ for (int i = 0; i < text_len; i++)
708
+ {
709
+ if (text[i] == '.'
710
+ && tokflag[i] == 0
711
+ && tokflag[i + 1])
712
+ {
713
+ for (j = i - 1; j >= 0 && isdigit(text[j]) && tokflag[j] == 0; --j)
714
+ ;
715
+ if (j >= 0 && j < i - 1 && tokflag[j] && isalpha(nextchar(text, i + 1)))
716
+ tokflag[i] = 1;
717
+ }
718
+ }
719
+ }
720
+
721
+ // period at end of string is a token
722
+
723
+ void MPtok::tok_20()
724
+ {
725
+ for (int i = text_len - 1; i >= 0; --i)
726
+ {
727
+ if (isspace(text[i]))
728
+ continue;
729
+
730
+ if (strchr(".!?", text[i]))
731
+ tokflag[i] = 1;
732
+
733
+ break;
734
+ }
735
+ }
736
+
737
+ // a period that follows a non-common word, and that is
738
+ // followed by a lower case common word is probably not a token
739
+
740
+ void MPtok::tok_20_1()
741
+ {
742
+ int j;
743
+
744
+ for (int i = 0; i < text_len; ++i)
745
+ {
746
+ if (text[i] == '.' && tokflag[i] == 1)
747
+ {
748
+ int tcnt, lcnt, ocnt;
749
+ tcnt = lcnt = ocnt = 0;
750
+
751
+ // make sure the previous word was *not* common
752
+
753
+ for (j = i - 1; j >= 0; j--)
754
+ {
755
+ if (isspace(text[j])) continue;
756
+ if (option_new >= 2)
757
+ {
758
+ if (islower(text[j]) == 0 && text[j] != '-') ocnt++;
759
+ } else
760
+ {
761
+ if (! islower(text[j])) ocnt++;
762
+ }
763
+
764
+ if (tokflag[j] || j == 0)
765
+ {
766
+ if (ocnt == 0)
767
+ {
768
+ goto nexti;
769
+ }
770
+ break;
771
+ }
772
+ }
773
+
774
+ tcnt = lcnt = ocnt = 0;
775
+
776
+ // make sure the next word is common
777
+
778
+ for (j = i + 1; j < text_len; j++)
779
+ {
780
+ if (isspace(text[j])) continue;
781
+ if (tokflag[j]) tcnt++;
782
+
783
+ if (tcnt == 2 || j == text_len - 1)
784
+ {
785
+ if (lcnt > 0 && ocnt == 0) tokflag[i] = 0;
786
+ break;
787
+ }
788
+
789
+ if (islower(text[j])) lcnt++;
790
+ else ocnt++;
791
+ }
792
+ }
793
+ nexti: ;
794
+ }
795
+ }
796
+
797
+ // tokenized period followed by non-space other than close paren
798
+ // is not a token
799
+
800
+ void MPtok::tok_20_2()
801
+ {
802
+ int j;
803
+
804
+ for (int i = 0; i < text_len - 1; ++i)
805
+ {
806
+ if (text[i] == '.' && tokflag[i] == 1
807
+ && strchr(" ()[]\"\'\n\t\r", text[i+1]) == 0)
808
+ {
809
+ tokflag[i] = 0;
810
+ }
811
+ }
812
+ }
813
+
814
+
815
+ // long dash
816
+ //
817
+ // A pair of hyphens is a complete token
818
+
819
+ void MPtok::tok_21()
820
+ {
821
+ for (int i = 0; i + 1 < text_len; i++)
822
+ {
823
+ if (strncmp(&text[i], "--", 2) == 0)
824
+ {
825
+ tokflag[i] = 1;
826
+ if (i + 2 < text_len)
827
+ {
828
+ i += 2;
829
+ tokflag[i] = 1;
830
+ }
831
+ }
832
+ }
833
+ }
834
+
835
+ // hyphens
836
+ //
837
+ // If specified as an option, a hyphen between letters is a complete token
838
+
839
+ void MPtok::tok_21a()
840
+ {
841
+ if (option_hyphen == 0) return;
842
+
843
+ for (int i = 0; i + 1 < text_len; i++)
844
+ {
845
+ if (text[i] == '-'
846
+ && (i == 0 || text[i-1] != '-')
847
+ && text[i+1] != '-')
848
+ {
849
+ tokflag[i] = 1;
850
+ tokflag[i+1] = 1;
851
+ }
852
+ }
853
+ }
854
+
855
+
856
+ // quote
857
+ //
858
+ // Any double quote is a separate token
859
+
860
+ void MPtok::tok_22()
861
+ {
862
+ for (int i = 0; i < text_len; i++)
863
+ {
864
+ if (text[i] == '"')
865
+ {
866
+ tokflag[i] = 1;
867
+ if (i + 1 < text_len)
868
+ {
869
+ i += 1;
870
+ tokflag[i] = 1;
871
+ }
872
+ }
873
+ }
874
+ }
875
+
876
+ // possessive
877
+ //
878
+ // Any single quote at the end of a token that is not
879
+ // preceded by a single quote is a separate token
880
+
881
+ void MPtok::tok_23()
882
+ {
883
+ for (int i = 0; i < text_len; i++)
884
+ {
885
+ if (text[i] == '\''
886
+ && (i - 1 >= 0 && text[i - 1] != '\'')
887
+ && tokflag[i + 1])
888
+ {
889
+ tokflag[i] = 1;
890
+ }
891
+ }
892
+ }
893
+
894
+
895
+ // quote
896
+ //
897
+ // If a single quote starts a token, or is preceded by a
898
+ // single quote, and followed by a character
899
+ // that is not a single quote, then
900
+ // the character to its right is the start of a new token
901
+
902
+ void MPtok::tok_24()
903
+ {
904
+ for (int i = 0; i < text_len; i++)
905
+ {
906
+ if (text[i] == '\''
907
+ && (tokflag[i] == 1 || (i - 1 >= 0 && text[i - 1] == '\''))
908
+ && (i + 1 < text_len && text[i + 1] != '\''))
909
+ {
910
+ tokflag[i + 1] = 1;
911
+ }
912
+ }
913
+ }
914
+
915
+ // put back possessive
916
+ //
917
+ // A single quote that is a whole token followed by a lower case s
918
+ // that is also a whole token (without space between them)
919
+ // should be merged into a single token
920
+
921
+ void MPtok::tok_25()
922
+ {
923
+ for (int i = 0; i < text_len; i++)
924
+ {
925
+ if (text[i] == '\''
926
+ && tokflag[i] == 1
927
+ && i + 1 < text_len && text[i + 1] == 's'
928
+ && tokflag[i+1] == 1
929
+ && (i + 2 >= text_len || isspace(text[i + 2]) || tokflag[i + 2] == 1))
930
+ {
931
+ tokflag[i + 1] = 0;
932
+ }
933
+ }
934
+ }
935
+
936
+ // quote
937
+ //
938
+ // A pair of single quotes is a separate token
939
+
940
+ void MPtok::tok_26()
941
+ {
942
+ for (int i = 0; i < text_len; i++)
943
+ {
944
+ if (strncmp(&text[i], "''", 2) == 0
945
+ || strncmp(&text[i], "``", 2) == 0)
946
+ {
947
+ tokflag[i] = 1;
948
+ if (i + 2 < text_len) tokflag[i + 2] = 1;
949
+ }
950
+ }
951
+ }
952
+
953
+ // possessive
954
+ //
955
+ // A single quote followed by a letter s is a possessive
956
+
957
+ void MPtok::tok_27()
958
+ {
959
+ for (int i = 0; i < text_len; i++)
960
+ {
961
+ if (text[i] == '\''
962
+ && i + 1 < text_len
963
+ && tolower(text[i + 1]) == 's'
964
+ && (i + 2 >= text_len || tokflag[i + 2]))
965
+ {
966
+ tokflag[i] = 1;
967
+ }
968
+ }
969
+ }
970
+
971
+ // split "cannot" to "can not"
972
+ //
973
+ // A single token that is the word cannot (in any case)
974
+ // is split into two words
975
+
976
+ void MPtok::tok_28()
977
+ {
978
+ for (int i = 0; i < text_len; i++)
979
+ {
980
+ if ((strncmp(&text[i], "cannot", 6) == 0
981
+ || strncmp(&text[i], "Cannot", 6) == 0)
982
+ && tokflag[i + 6])
983
+ {
984
+ tokflag[i + 3] = 1;
985
+ }
986
+ }
987
+ }
988
+
989
+ // put list item elements back at sentence end
990
+ //
991
+ // A period that is preceded by an alphanumeric (no space)
992
+ // and any amount of preceding space and an end-mark
993
+ // stays with the alphanumeric.
994
+
995
+ void MPtok::tok_29()
996
+ {
997
+ int j;
998
+
999
+ for (int i = 0; i < text_len; i++)
1000
+ {
1001
+ if (text[i] == '.'
1002
+ && tokflag[i] && tokflag[i + 1]
1003
+ && i - 1 >= 0 && isalnum(text[i - 1])
1004
+ && tokflag[i - 1]
1005
+ && ((j = lookbehind(text, i-2, ".", tokflag)) >= 0
1006
+ || (j = lookbehind(text, i-2, "?", tokflag)) >= 0
1007
+ || (j = lookbehind(text, i-2, "!", tokflag)) >= 0)
1008
+ && tokflag[j])
1009
+ {
1010
+ tokflag[i] = 0;
1011
+ }
1012
+ }
1013
+ }
1014
+
1015
+ // attach list elements to the beginnings of their sentences
1016
+ // this means, attach the period to the list element
1017
+ //
1018
+ // a list element is a single letter or a one or two digits
1019
+ // which is preceded by an end of sentence ".!?;"
1020
+ // or colon (provided it doesn't belong to a proportion construct)
1021
+
1022
+ void MPtok::tok_29a()
1023
+ {
1024
+ int i, j;
1025
+
1026
+ for (i = 0; i < text_len; i++)
1027
+ {
1028
+ if (text[i] == '.' && tokflag[i])
1029
+ {
1030
+ // Look back, make sure the token before the period
1031
+ // is either single alphanumeric, or at most a two digit number
1032
+ // and the character before that is a punctuation ".?!:,"
1033
+
1034
+ int tcnt, acnt, dcnt, pcnt, ocnt, scnt;
1035
+ tcnt = acnt = dcnt = pcnt = ocnt = scnt = 0;
1036
+ char p;
1037
+
1038
+ for (j = i - 1; j >= 0; j--)
1039
+ {
1040
+ if (isspace(text[j])) { scnt++; continue; }
1041
+ else if (tcnt == 0 && isalpha(text[j])) ++acnt;
1042
+ else if (tcnt == 0 && isdigit(text[j])) ++dcnt;
1043
+ else if (tcnt == 1 && strchr(".!?:;,", text[j])) { pcnt++; p = text[j]; }
1044
+ else ocnt++;
1045
+
1046
+ if (tokflag[j] || j == 0)
1047
+ {
1048
+ tcnt++;
1049
+ if (tcnt == 1 && ocnt == 0 && scnt == 0
1050
+ && ((acnt == 1 && dcnt == 0) || (acnt == 0 && dcnt > 0 && dcnt <= 2)))
1051
+ {
1052
+ // This is acceptable
1053
+ } else if (tcnt == 2 && pcnt <= 1 && ocnt == 0 && scnt > 0)
1054
+ {
1055
+ if (p == ':')
1056
+ {
1057
+ while (--j >= 0 && isspace(text[j]))
1058
+ ;
1059
+ if (j >= 0 && isdigit(text[j]))
1060
+ {
1061
+ // It's probably a proportion
1062
+ break;
1063
+ }
1064
+ }
1065
+ // Jackpot
1066
+ tokflag[i] = 0;
1067
+ } else
1068
+ {
1069
+ // This is not
1070
+ break;
1071
+ }
1072
+ scnt = 0;
1073
+ }
1074
+ }
1075
+ }
1076
+ }
1077
+ }
1078
+
1079
+ // list elements at the beginning of a string
1080
+ //
1081
+ // An alphanumeric token followed by a period
1082
+ // at the beginning of the line stays with the
1083
+ // alphanumeric
1084
+
1085
+ void MPtok::tok_30()
1086
+ {
1087
+ int i = 0;
1088
+
1089
+ while (isspace(text[i])) i++;
1090
+
1091
+ if (isalnum(text[i])
1092
+ && tokflag[i]
1093
+ && i + 1 < text_len
1094
+ && text[i + 1] == '.'
1095
+ && tokflag[i + 1])
1096
+ {
1097
+ tokflag[i + 1] = 0;
1098
+ }
1099
+ }
1100
+
1101
+ // process American style numbers
1102
+
1103
+ void MPtok::tok_31()
1104
+ {
1105
+ int j;
1106
+
1107
+ for (int i = 0; i < text_len; i++)
1108
+ {
1109
+ if (text[i] == ','
1110
+ && i + 3 < text_len
1111
+ && tokflag[i] && tokflag[i + 1]
1112
+ && isdigit(text[i + 1])
1113
+ && isdigit(text[i + 2])
1114
+ && isdigit(text[i + 3])
1115
+ && i - 1 >= 0 && isdigit(text[i - 1])
1116
+ )
1117
+ {
1118
+ tokflag[i] = 0;
1119
+ tokflag[i + 1] = 0;
1120
+ }
1121
+ }
1122
+ }
1123
+
1124
+ // process British style numbers
1125
+
1126
+ void MPtok::tok_32()
1127
+ {
1128
+ int j;
1129
+
1130
+ for (int i = 0; i < text_len; i++)
1131
+ {
1132
+ if (text[i] == ' '
1133
+ && i + 3 < text_len
1134
+ && tokflag[i] && tokflag[i + 1]
1135
+ && isdigit(text[i + 1])
1136
+ && isdigit(text[i + 2])
1137
+ && isdigit(text[i + 3])
1138
+ && i - 1 >= 0 && isdigit(text[i - 1])
1139
+ )
1140
+ {
1141
+ tokflag[i] = 0;
1142
+ tokflag[i + 1] = 0;
1143
+ }
1144
+ }
1145
+ }
1146
+
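+ // Examples (illustrative): tok_31 re-joins "10,000" after tok_3 split the
+ // comma out; tok_32 keeps the British-style "10 000" together as one token.
+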
1147
+ // tokenize unicode escapes
1148
+ //
1149
+ // Added
1150
+
1151
+ void MPtok::tok_33()
1152
+ {
1153
+ int j;
1154
+
1155
+ for (int i = 0; i < text_len; i++)
1156
+ {
1157
+ if (text[i] == '&')
1158
+ {
1159
+ if (text[i + 1] == '#')
1160
+ {
1161
+ for (j = i + 2; isdigit(text[j]); j++)
1162
+ ;
1163
+ } else
1164
+ {
1165
+ for (j = i + 1; isalpha(text[j]); j++)
1166
+ ;
1167
+ }
1168
+
1169
+ if (text[j] == ';')
1170
+ {
1171
+ // Tokenize the escape, untokenize everything inside
1172
+
1173
+ tokflag[i] = 1;
1174
+ for (i++; i <= j; i++) tokflag[i] = 0;
1175
+ tokflag[i] = 1;
1176
+ }
1177
+ }
1178
+ }
1179
+ }
1180
+
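+ // Example (illustrative): "&#945;" and "&alpha;" each end up as a single
+ // token; the flags inside the escape are cleared so it can never be split.
+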
1181
+ // Remove tags if they are present
1182
+
1183
+ void MPtok::tok_un()
1184
+ {
1185
+ int untok = 0;
1186
+ for (int i = 0; text[i]; ++i)
1187
+ {
1188
+ if (isspace(text[i])) untok = 0;
1189
+ if (text[i] == option_tagsep) untok = 1;
1190
+ if (untok) text[i] = ' ';
1191
+ }
1192
+ }
1193
+
1194
+
1195
+ void MPtok::set_tokflag()
1196
+ {
1197
+ int i;
1198
+
1199
+ tok_0();
1200
+ tok_1();
1201
+ tok_2();
1202
+ tok_3();
1203
+
1204
+ // step 4 replaces tag char, this is done at output
1205
+
1206
+ tok_5_6_7();
1207
+ tok_8_9();
1208
+
1209
+ tok_10();
1210
+ tok_11();
1211
+ if (option_new >= 1)
1212
+ {
1213
+ tok_21();
1214
+ tok_21a();
1215
+ tok_22();
1216
+ tok_23();
1217
+ tok_24();
1218
+ tok_25();
1219
+ tok_26();
1220
+ tok_27();
1221
+ }
1222
+ tok_12();
1223
+ tok_13();
1224
+ tok_14();
1225
+ if (option_new <= 5)
1226
+ tok_15();
1227
+ if (option_new < 2)
1228
+ tok_16();
1229
+ tok_17();
1230
+
1231
+ // steps 18 and 19 recognize periods within parens,
1232
+ // and this is moved to the segmentation section
1233
+
1234
+ tok_20();
1235
+ if (option_new >= 1)
1236
+ {
1237
+ tok_20_1();
1238
+ tok_20_2();
1239
+ if (option_new >= 2)
1240
+ tok_16_1();
1241
+ if (option_new >= 6)
1242
+ tok_15();
1243
+ if (option_new >= 7)
1244
+ tok_15_1();
1245
+ }
1246
+ if (option_new < 1)
1247
+ {
1248
+ tok_21();
1249
+ tok_21a();
1250
+ tok_22();
1251
+ tok_23();
1252
+ tok_24();
1253
+ tok_25();
1254
+ tok_26();
1255
+ tok_27();
1256
+ }
1257
+ tok_28();
1258
+ if (option_new >= 1)
1259
+ tok_29a();
1260
+ else
1261
+ tok_29();
1262
+ tok_30();
1263
+ tok_31();
1264
+ tok_32();
1265
+
1266
+ tok_33();
1267
+ }
1268
+
1269
+ /* set_endflag
1270
+ **
1271
+ ** After tokflag has been set, find the possible sentence endings.
1272
+ */
1273
+
1274
+ void MPtok::set_endflag()
1275
+ {
1276
+ int i;
1277
+
1278
+ // The following tests look for end-stops and label them.
1279
+ // They include steps 18 and 19
1280
+
1281
+ for (i = 0; i <= text_len; i++)
1282
+ endflag[i] = 0;
1283
+
1284
+ // Count the number of unmatched parens
1285
+
1286
+ int up = 0; // unmatched round parens
1287
+ int ub = 0; // unmatched brackets
1288
+
1289
+ for (i = 0; i < text_len; i++)
1290
+ {
1291
+ if (text[i] == '(') ++up;
1292
+ if (text[i] == ')') --up;
1293
+ if (text[i] == '[') ++ub;
1294
+ if (text[i] == ']') --ub;
1295
+ if (up < 0) up = 0;
1296
+ if (ub < 0) ub = 0;
1297
+ }
1298
+
1299
+ // Now find the end-of-sentence marks
1300
+
1301
+ // tok_18: periods within parentheses, allow for nesting
1302
+ // tok_19: periods within brackets, allow for nesting
1303
+ // the perl version solves this by putting the period
1304
+ // back with the previous token, but a better solution
1305
+ // is to allow it to be tokenized but just don't
1306
+ // allow it to be an end-of-sentence.
1307
+ // Therefore, these are moved to the segmentation
1308
+ // section
1309
+
1310
+ int p = 0; // round parens
1311
+ int b = 0; // brackets
1312
+
1313
+ for (i = 0; i < text_len; i++)
1314
+ {
1315
+ if (text[i] == '(') ++p;
1316
+ if (text[i] == ')') --p;
1317
+ if (text[i] == '[') ++b;
1318
+ if (text[i] == ']') --b;
1319
+ if (p < 0) p = 0;
1320
+ if (b < 0) b = 0;
1321
+
1322
+ if (strchr(".!?", text[i])
1323
+ && tokflag[i]
1324
+ && tokflag[i + 1])
1325
+ {
1326
+ if (option_segment && p <= up && b <= ub)
1327
+ endflag[i] = 1;
1328
+
1329
+ // This is optional to join periods with
1330
+ // probable abbreviations
1331
+
1332
+ if (p > up || b > ub)
1333
+ tokflag[i] = 0;
1334
+ }
1335
+ }
1336
+
1337
+ // endtokens followed by a single or double quote, which matches
1338
+ // a single or double quote in the previous sentence
1339
+
1340
+ if (option_new >= 1)
1341
+ {
1342
+ int dquo, squo;
1343
+ dquo = squo = 0;
1344
+
1345
+ for (i = 0; i < text_len; i++)
1346
+ {
1347
+ if (text[i] == '"') dquo = ! dquo;
1348
+ else if (text[i] == '\'') squo = ! squo;
1349
+ else if (endflag[i])
1350
+ {
1351
+ if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
1352
+ {
1353
+ endflag[i] = 0;
1354
+
1355
+ // But don't end at all if the next token is something
1356
+ // other than an upper case letter.
1357
+
1358
+ if (option_new >= 2)
1359
+ {
1360
+ int j;
1361
+ int ok = 0;
1362
+
1363
+ for (j = i + 2; j < text_len; j++)
1364
+ {
1365
+ if (isspace(text[j])) continue;
1366
+ // if (isupper(text[j]))
1367
+ if (isupper(text[j]) || text[j] == '(')
1368
+ {
1369
+ ok = 1;
1370
+ break;
1371
+ }
1372
+ if (tokflag[j]) break;
1373
+ }
1374
+
1375
+ if (ok)
1376
+ endflag[i+1] = 1;
1377
+ } else
1378
+ {
1379
+ endflag[i+1] = 1;
1380
+ }
1381
+ }
1382
+ dquo = squo = 0;
1383
+ }
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+
1389
+ /* set_endflag_01
1390
+ **
1391
+ ** After tokflag has been set, find the possible sentence endings.
1392
+ ** This has improved paren matching.
1393
+ */
1394
+
1395
+ #define MAX_MATCH 500 // Maximum length to get a paren match
1396
+
1397
+ void MPtok::set_endflag_01()
1398
+ {
1399
+ vector<int> match(text_len, 0); // a std::vector rather than a variable-length array, which is not standard C++
1400
+ int i, j;
1401
+
1402
+ // The following tests look for end-stops and label them.
1403
+ // They include steps 18 and 19
1404
+
1405
+ for (i = 0; i <= text_len; i++)
1406
+ endflag[i] = 0;
1407
+
1408
+ for (i = 0; i < text_len; i++)
1409
+ match[i] = 0;
1410
+
1411
+ for (i = text_len - 1; i >= 0; i--)
1412
+ {
1413
+ if (text[i] == '(' || text[i] == '[')
1414
+ {
1415
+ for (j = i + 1; text[j] && j - i <= MAX_MATCH; j++)
1416
+ {
1417
+ // Skip parens that are already matched
1418
+
1419
+ if (match[j] > j)
1420
+ {
1421
+ j = match[j];
1422
+ continue;
1423
+ }
1424
+
1425
+ // Look for a matching close paren
1426
+
1427
+ if (match[j] == 0
1428
+ && ((text[i] == '(' && text[j] == ')')
1429
+ || (text[i] == '[' && text[j] == ']')))
1430
+ {
1431
+ match[i] = j;
1432
+ match[j] = i;
1433
+ break;
1434
+ }
1435
+ }
1436
+ }
1437
+ }
1438
+
1439
+ int next_match = 0;
1440
+ for (i = 0; i < text_len; i++)
1441
+ {
1442
+ if (match[i] > next_match)
1443
+ next_match = match[i];
1444
+
1445
+ if (strchr(".!?", text[i])
1446
+ && tokflag[i]
1447
+ && tokflag[i + 1]
1448
+ && (option_new <= 4 || option_doteos == 1 || (i > 0 && isspace(text[i-1]) == 0)))
1449
+ {
1450
+ if (i <= next_match)
1451
+ tokflag[i] = 0;
1452
+ else if (option_segment)
1453
+ endflag[i] = 1;
1454
+ }
1455
+ }
1456
+
1457
+ // endtokens followed by a single or double quote, which matches
1458
+ // a single or double quote in the previous sentence
1459
+
1460
+ int dquo, squo;
1461
+ dquo = squo = 0;
1462
+
1463
+ for (i = 0; i < text_len; i++)
1464
+ {
1465
+ if (option_new <= 7 && text[i] == '"') dquo = ! dquo;
1466
+ else if (option_new >= 8 && text[i] == '"' && tokflag[i] && tokflag[i+1]) dquo = ! dquo;
1467
+ else if (option_new <= 7 && text[i] == '\'') squo = ! squo;
1468
+ else if (option_new >= 8 && text[i] == '\''
1469
+ && tokflag[i] && (tokflag[i+1] || (text[i+1] == '\'' && tokflag[i+2]))) squo = ! squo;
1470
+ else if (endflag[i])
1471
+ {
1472
+ if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
1473
+ {
1474
+ endflag[i] = 0;
1475
+
1476
+ // But don't end at all if the next token is something
1477
+ // other than an upper case letter.
1478
+
1479
+ if (option_new >= 2)
1480
+ {
1481
+ int j;
1482
+ int ok = 0;
1483
+
1484
+ for (j = i + 2; j < text_len; j++)
1485
+ {
1486
+ if (isspace(text[j])) continue;
1487
+ // if (isupper(text[j]))
1488
+ if (isupper(text[j]) || text[j] == '(')
1489
+ {
1490
+ ok = 1;
1491
+ break;
1492
+ }
1493
+ if (tokflag[j]) break;
1494
+ }
1495
+
1496
+ if (ok)
1497
+ endflag[i+1] = 1;
1498
+ } else
1499
+ {
1500
+ endflag[i+1] = 1;
1501
+ }
1502
+ }
1503
+ dquo = squo = 0;
1504
+ }
1505
+ }
1506
+ }
1507
+
1508
+
1509
+ // Size buffer: return the size of the buffer required to hold all of the tokenized text.
1510
+ // It can be simply estimated by a formula that depends only on the length of text and number of tokens.
1511
+
1512
+ int MPtok::size_buff()
1513
+ {
1514
+ int size = 1; // Start with null terminator
1515
+ int t = option_pretag.size(); // for each tag, the length of the UNTAG string
1516
+
1517
+ if (t <= 0) t = 1; // Make sure there is at least one
1518
+ t += 2; // Add one for underscore and one for space
1519
+
1520
+ for (int i = 0; i < text_len; i++)
1521
+ {
1522
+ size++; // Count all characters
1523
+ if (tokflag[i]) size += t; // Count token delimiters (may overcount)
1524
+ if (endflag[i]) size++; // Add one for newline
1525
+ }
1526
+ return size;
1527
+ }
1528
+
1529
+
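+ // Worked example (illustrative, assuming option_pretag = "NN"): for the
+ // text "Hi." there are two token starts, t = 2 + 2 = 4, and the loop gives
+ // size = 1 + 3 + 2*4 + 1 = 13, comfortably above the 12 bytes needed for
+ // "Hi_NN ._NN\n" and its terminator.
+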
1530
+ /* append_token
1531
+ **
1532
+ ** Save a single token to a buffer.
1533
+ */
1534
+
1535
+ void MPtok::append_token(string& buff, int& sp, char *tok, int ef)
1536
+ {
1537
+ // Convert tag separator chars and back quotes (?)
1538
+
1539
+ for (int i = 0; tok[i]; i++)
1540
+ {
1541
+ if (tok[i] == option_tagsep) tok[i] = option_replacesep;
1542
+ if (tok[i] == '`') tok[i] = '\'';
1543
+ }
1544
+
1545
+ // Skip whitespace if tokens are being output
1546
+ // Otherwise, skip whitespace at the start of a sentence
1547
+
1548
+ if (option_token || ! sp) while (isspace(*tok)) ++tok;
1549
+
1550
+ // Save the token
1551
+
1552
+ if (strlen(tok) > 0)
1553
+ {
1554
+ // Add delimiter if needed
1555
+
1556
+ if (option_token && sp) buff += ' ';
1557
+
1558
+ // Append token to output
1559
+
1560
+ if (option_new < 9)
1561
+ {
1562
+ while (*tok && (! option_token || ! isspace(*tok)))
1563
+ buff += *(tok++);
1564
+ } else
1565
+ {
1566
+ while (*tok)
1567
+ buff += *(tok++);
1568
+ }
1569
+
1570
+ sp = 1;
1571
+
1572
+ // Add tag holders
1573
+
1574
+ if (option_token && option_pretag.size() > 0)
1575
+ {
1576
+ buff += option_tagsep;
1577
+ buff += option_pretag;
1578
+ }
1579
+
1580
+ // If it was end of sentence, then add newline
1581
+
1582
+ if (ef)
1583
+ {
1584
+ buff += '\n';
1585
+ sp = 0;
1586
+ }
1587
+ }
1588
+ }
1589
+
1590
+ // Strip whitespace after sentences
1591
+
1592
+ static void adjust_space(string& buff)
1593
+ {
1594
+ while (buff.size() > 0 && isspace(buff[0])) buff.erase(0, 1);
1595
+
1596
+ // delete two spaces in a row, but keep newlines
1597
+
1598
+ for (int i = 1; i < buff.size(); i++)
1599
+ {
1600
+ if (isspace(buff[i]) && isspace(buff[i-1]))
1601
+ buff.erase((buff[i] == '\n')?(--i):(i--), 1);
1602
+ }
1603
+
1604
+ for (int i = buff.size() - 1; i >= 0 && isspace(buff[i]); i--)
1605
+ buff.erase(i, 1);
1606
+ }
1607
+
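+ // Example (illustrative): applied to " a b \n c ", adjust_space leaves
+ // "a b\nc"; runs of whitespace collapse to one character, preferring the
+ // newline when one is present.
+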
1608
+ /* token_string
1609
+ **
1610
+ ** After the tokflag and endflag have been set, copy the tokens to the buffer.
1611
+ */
1612
+
1613
+ string MPtok::token_string()
1614
+ {
1615
+ string buff;
1616
+
1617
+ int i;
1618
+
1619
+ // Move token starts to non-whitespace chars
1620
+
1621
+ int last_tok = 0;
1622
+ for (i = 0; i < text_len; i++)
1623
+ {
1624
+ if (tokflag[i] == 1 && isspace(text[i]))
1625
+ {
1626
+ tokflag[i] = 0;
1627
+ last_tok = 1;
1628
+ } else if (isspace(text[i]) == 0 && last_tok)
1629
+ {
1630
+ tokflag[i] = 1;
1631
+ last_tok = 0;
1632
+ }
1633
+ }
1634
+
1635
+ // Extract the tokens and print them out now
1636
+
1637
+ char *tok = new char[text_len + 1];
1638
+ int pos = 0;
1639
+ int sp = 0;
1640
+ int ef = 0;
1641
+
1642
+ tok[pos] = '\0';
1643
+
1644
+ for (i = 0; i <= text_len; i++)
1645
+ {
1646
+ // The start of a new token
1647
+
1648
+ if (tokflag[i])
1649
+ {
1650
+ // Print the current token
1651
+
1652
+ append_token(buff, sp, tok, ef);
1653
+
1654
+ // Start a new token
1655
+
1656
+ pos = 0;
1657
+ tok[pos] = '\0';
1658
+
1659
+ ef = 0;
1660
+ }
1661
+
1662
+ // Append to the current token
1663
+
1664
+ tok[pos++] = text[i];
1665
+ tok[pos] = '\0';
1666
+
1667
+ // If any of the characters in the token are endflagged,
1668
+ // Then pass this information along for end-of-sentence
1669
+
1670
+ if (endflag[i]) ef = 1;
1671
+ }
1672
+
1673
+ // Print the last token
1674
+
1675
+ append_token(buff, sp, tok, ef);
1676
+
1677
+ delete[] tok;
1678
+
1679
+ // Adjust the end of sentence boundaries
1680
+
1681
+ adjust_space(buff);
1682
+
1683
+ return buff;
1684
+ }
1685
+
1686
+ void MPtok::map_escapes()
1687
+ {
1688
+ char *s;
1689
+ int j, k, ch;
1690
+ char buff[10];
1691
+
1692
+ k = 0;
1693
+ for (int i = 0; text[i]; i++)
1694
+ {
1695
+ if (text[i] == '&' && text[i + 1] == '#')
1696
+ {
1697
+ for (s = &buff[0], j = 2; j <= 4 && i + j < text_len && isdigit(text[i + j]); j++)
1698
+ *s++ = text[i + j];
1699
+ *s = '\0';
1700
+ ch = atoi(buff);
1701
+ if (strlen(buff) > 0 && text[i + j] == ';' && ch > 0 && ch <= 256)
1702
+ {
1703
+ text[k] = ch;
1704
+ if (! text[k]) text[k] = ' ';
1705
+ k++;
1706
+ i = i + j;
1707
+ continue;
1708
+ }
1709
+ }
1710
+ text[k++] = text[i];
1711
+ }
1712
+ text[k] = '\0';
1713
+ text_len = k;
1714
+ }
1715
+
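+ // Example (illustrative): map_escapes() rewrites "&#65;" in place to "A"
+ // and shrinks text_len to match; escapes of more than three digits, or
+ // without the closing ';', are copied through unchanged.
+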
1716
+ MPtok::MPtok(string idir, const string& cnam)
1717
+ {
1718
+ tok_initialized = 0;
1719
+
1720
+ if (idir.size() == 0)
1721
+ {
1722
+ char *p = getenv("MEDPOST_HOME");
1723
+ if (p && strlen(p))
1724
+ {
1725
+ idir = p;
1726
+
1727
+ string::size_type found = idir.find("=");
1728
+ if (found != string::npos)
1729
+ idir = idir.substr(found + 1);
1730
+ }
1731
+ }
1732
+
1733
+
1734
+ if (idir.size() == 0)
1735
+ {
1736
+ char buff[1000];
1737
+ FILE *fp = fopen("path_medpost", "r");
1738
+ if (fp)
1739
+ {
1740
+ if (fgets(buff, 1000, fp))
1741
+ {
1742
+ chomp(buff);
1743
+ idir = &buff[0];
1744
+ }
1745
+ fclose(fp);
1746
+ }
1747
+ }
1748
+
1749
+ if (idir.size() == 0)
1750
+ idir = "/home/natxie/CPP64/lib/FIXED_DATA/";
1751
+
1752
+ option_dir = idir;
1753
+
1754
+ option_token = 1;
1755
+ option_segment = 1;
1756
+ option_hyphen = 0;
1757
+ option_comma = 1;
1758
+ option_pretok = 0;
1759
+ option_new = MPTOK_VERSION;
1760
+ option_doteos = 0;
1761
+
1762
+ if (cnam.size() > 0)
1763
+ {
1764
+ option_cnam = "_";
1765
+ option_cnam += cnam;
1766
+ }
1767
+
1768
+ init();
1769
+ }
1770
+
1771
+ void MPtok::init(void)
1772
+ {
1773
+ if (tok_initialized) return;
1774
+
1775
+ string fname;
1776
+
1777
+ fname = option_dir + "/medpost" + option_cnam + ".pairs";
1778
+ init_pair(fname);
1779
+
1780
+ fname = option_dir + "/medpost" + option_cnam + ".abbr";
1781
+ init_abbr(fname);
1782
+
1783
+ tok_initialized = 1;
1784
+ }
1785
+
1786
+ MPtok::~MPtok()
1787
+ {
1788
+ }
1789
+
1790
+ // Global tokenizer
1791
+
1792
+ string MPtok::tokenize(const string& txt, int mt)
1793
+ {
1794
+ if (option_pretok) return save_string(txt);
1795
+
1796
+ option_token = mt;
1797
+ text_len = txt.size();
1798
+ if (text_len == 0) return string("");
1799
+
1800
+ text = new char[text_len + 1];
1801
+ strcpy(text, txt.c_str());
1802
+
1803
+ map_escapes();
1804
+
1805
+ if (text_len == 0) { delete[] text; text = NULL; return string(""); } // an empty result; returning NULL would construct a string from a null pointer
1806
+
1807
+ tokflag = new int[text_len + 1];
1808
+ endflag = new int[text_len + 1];
1809
+
1810
+ set_tokflag();
1811
+ if (option_new < 3)
1812
+ set_endflag();
1813
+ else
1814
+ set_endflag_01();
1815
+
1816
+ string buff = token_string();
1817
+ save_string(buff);
1818
+
1819
+ delete[] text; text = NULL;
1820
+ delete[] tokflag; tokflag = NULL;
1821
+ delete[] endflag; endflag = NULL;
1822
+
1823
+ return buff;
1824
+ }
1825
+
1826
+ string MPtok::tokenize(const string& text)
1827
+ {
1828
+ return tokenize(text, 1);
1829
+ }
1830
+
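+ // Usage sketch (illustrative; assumes the medpost data files are found via
+ // $MEDPOST_HOME, ./path_medpost or the built-in default directory):
+ //
+ // MPtok mp;
+ // string toks = mp.tokenize("Dr. Smith et al. arrived.");
+ // // toks holds space-delimited tokens; mp.word and mp.tag are filled in
+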
1831
+ string MPtok::segment(const string& text)
1832
+ {
1833
+ sent.clear();
1834
+
1835
+ // tokenize the text
1836
+
1837
+ int save_option_segment = option_segment;
1838
+ option_segment = 1;
1839
+ string buff = tokenize(text, 0);
1840
+ option_segment = save_option_segment;
1841
+
1842
+ if (buff.size() == 0) return text;
1843
+
1844
+ string::size_type found = 0;
1845
+ string::size_type pos = 0;
1846
+
1847
+ while (pos < buff.size())
1848
+ {
1849
+ found = buff.find('\n', pos);
1850
+ if (found == string::npos)
1851
+ {
1852
+ sent.push_back(buff.substr(pos));
1853
+ pos = buff.size();
1854
+ } else
1855
+ {
1856
+ sent.push_back(buff.substr(pos, found - pos));
1857
+ pos = found + 1;
1858
+ }
1859
+ }
1860
+
1861
+ return buff;
1862
+ }
1863
+
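+ // Usage sketch (illustrative): after mp.segment(buf), mp.sent holds one
+ // string per detected sentence, and the returned buffer is the same text
+ // with '\n' between sentences; tokens are not space-separated because
+ // tokenize() is called with mt = 0 here.
+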
1864
+ string MPtok::save_string(const string& s)
1865
+ {
1866
+ stringstream ss (stringstream::in | stringstream::out);
1867
+ string w, t;
1868
+ string::size_type found;
1869
+ string ret;
1870
+
1871
+ word.clear();
1872
+ tag.clear();
1873
+
1874
+ ss << s;
1875
+ while (ss.good())
1876
+ {
1877
+ ss >> w;
1878
+ if (w.size() == 0) break;
1879
+
1880
+ found = w.find('_');
1881
+
1882
+ if (found != string::npos)
1883
+ {
1884
+ t = w.substr(found + 1);
1885
+ w.resize(found);
1886
+ word.push_back(w);
1887
+ tag.push_back(t);
1888
+ } else
1889
+ {
1890
+ word.push_back(w);
1891
+ tag.push_back(option_pretag);
1892
+
1893
+ }
1894
+ if (ret.size() > 0) ret += " ";
1895
+ ret += w;
1896
+ }
1897
+
1898
+ // now look for continuation tags...
1899
+
1900
+ for (int i = 0; i < word.size(); i++)
1901
+ {
1902
+ int j = tag[i].size() - 1;
1903
+ if (j >= 0 && tag[i][j] == '+' && i < tag.size() - 1)
1904
+ {
1905
+ word[i] = word[i] + " " + word[i + 1];
1906
+ tag[i] = tag[i + 1];
1907
+ word.erase(word.begin() + i + 1, word.begin() + i + 2);
1908
+ tag.erase(tag.begin() + i + 1, tag.begin() + i + 2);
1909
+ i--;
1910
+ }
1911
+ }
1912
+
1913
+ return ret;
1914
+ }
1915
+
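+ // Example (illustrative): save_string("in_II+ front_II of_II") merges the
+ // "+"-tagged token with its successor, leaving word = {"in front", "of"}
+ // and tag = {"II", "II"}, and returns the untagged text "in front of".
+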
1916
+
1917
+ static int count_words(const char *s)
1918
+ {
1919
+ int i;
1920
+
1921
+ i = 1;
1922
+ for (; *s; ++s)
1923
+ {
1924
+ if (*s == ' ') ++i;
1925
+ }
1926
+ return i;
1927
+ }
1928
+
1929
+ static void print_word(const char *s, int i)
1930
+ {
1931
+ for (; i > 0 && *s; ++s) { if (*s == ' ') --i; }
1932
+ while (*s && *s != ' ') { printf("%c", *s); ++s; }
1933
+ }
1934
+
1935
+ void MPtok::print(int how)
1936
+ {
1937
+ int i, w;
1938
+
1939
+ if (how != 0 && how != 2)
1940
+ {
1941
+ printf("print(%d) not defined\n", how);
1942
+ return;
1943
+ }
1944
+
1945
+ for (i = 0; i < word.size(); ++i)
1946
+ {
1947
+ // Get the words from an idiom
1948
+
1949
+ for (w = 0; w < count_words(word[i].c_str()); ++w)
1950
+ {
1951
+ if (how == 2 && i + w > 0) printf(" ");
1952
+
1953
+ print_word(word[i].c_str(), w);
1954
+
1955
+ if (how == 0)
1956
+ {
1957
+ printf(" tagged %s", tag[i].c_str());
1958
+ if (w < count_words(word[i].c_str()) - 1) printf("+");
1959
+ printf("\n");
1960
+ } else if (how == 2)
1961
+ {
1962
+ printf("%s%s", "_", tag[i].c_str());
1963
+ if (w < count_words(word[i].c_str()) - 1) printf("+");
1964
+ }
1965
+ }
1966
+ }
1967
+ if (how == 2)
1968
+ printf("\n");
1969
+ }
1970
+
1971
+ void MPtok::merge_words(int s, int n)
1972
+ {
1973
+ string tmp = word[s];
1974
+
1975
+ for (int i = s + 1; i < s + n; i++)
1976
+ {
1977
+ tmp += " ";
1978
+ tmp += word[i];
1979
+ }
1980
+
1981
+ // printf("merging words : '%s' n = %d\n", tmp.c_str(), n);
1982
+
1983
+ for (int k = s; k + n < word.size(); k++)
1984
+ {
1985
+ word[k+1] = word[k+n];
1986
+ tag[k+1] = tag[k+n];
1987
+ }
1988
+
1989
+ // Fixup the remaining array
1990
+
1991
+ word.resize(word.size() - n + 1);
1992
+ tag.resize(word.size());
1993
+
1994
+ word[s] = tmp;
1995
+ }
1996
+
1997
+ void MPtok::split_words()
1998
+ {
1999
+ for (int i = 0; i < word.size(); i++)
2000
+ {
2001
+ string::size_type found = word[i].find(' ');
2002
+
2003
+ if (found != string::npos)
2004
+ {
2005
+ string tmp1(word[i], 0, found);
2006
+ string tmp2(word[i], found + 1, string::npos);
2007
+
2008
+ // Move all the words and tags down
2009
+
2010
+ word.resize(word.size() + 1);
2011
+ tag.resize(tag.size() + 1);
2012
+
2013
+ for (int j = word.size() - 1; j > i; j--)
2014
+ {
2015
+ word[j] = word[j - 1];
2016
+ tag[j] = tag[j - 1];
2017
+ }
2018
+
2019
+ word[i] = tmp1;
2020
+ tag[i] = tag[i+1];
2021
+ tag[i] += "+";
2022
+
2023
+ word[i+1] = tmp2;
2024
+ }
2025
+ }
2026
+ }
2027
+
2028
+ // Callable functions to set internal options
2029
+
2030
+ void MPtok::set_segment(int i) { option_segment = i; }
2031
+ void MPtok::set_hyphen(int i) { option_hyphen = i; }
2032
+ void MPtok::set_comma(int i) { option_comma = i; }
2033
+ void MPtok::set_pretag(char *a) { option_pretag = a; }
2034
+ void MPtok::set_pretok(int i) { option_pretok = i; }
2035
+ void MPtok::set_new(int i) { option_new = i; }
2036
+ void MPtok::set_doteos(int i) { option_doteos = i; }
Library/MPtok.h CHANGED
@@ -1,141 +1,141 @@
1
- #ifndef _MPTOK_H
2
- #define _MPTOK_H
3
-
4
- #include <stdio.h>
5
-
6
- #include <string>
7
- #include <vector>
8
- #include <map>
9
- #include <set>
10
-
11
- using namespace std;
12
-
13
- #define MPTOK_VERSION 11 // The latest version
14
-
15
- // Maximum number of words in a sentence
16
-
17
- #define MAX_WORDS 10000
18
-
19
- enum { ABB_ABB, ABB_EOS, ABB_NUM };
20
- #define MAX_ABB 100
21
-
22
- /*! \brief A class to perform tokenization.
23
- *
24
- * The MPtok class can be used to perform tokenization and segmentation
25
- * of strings into tokens or sentences. It is inherited and used by MPtag,
26
- * so if the user is only interested in tagging, this class does not
27
- * need to be referenced.
28
- */
29
-
30
- class MPtok
31
- {
32
- public:
33
- /// \brief A MPtok object, giving the install directory \p idir where data files can be found
34
- MPtok(string idir = "", const string& cnam = "");
35
- ~MPtok();
36
-
37
- void init(); // Initialize (call only once)
38
- void init(const string& idir) { option_dir = idir; init(); } // Initialize using specified install directory
39
-
40
- string option_pretag; // The tag to use on tokens
41
- int option_segment; // Segment into sentences
42
- int option_hyphen; // Hyphens are separate tokens
43
- int option_comma; // Commas are always tokenized
44
- int option_pretok; // The text is pre-tokenized
45
- int option_new; // Use new algorithms, used in development only
46
- int option_doteos; // If " . " occurs, it's an end EOS (new >= 5)
47
-
48
- void set_segment(int i); ///< \brief Sentences are broken up during tokenization (default 1)
49
- void set_token(int i); ///< \brief Break tokens apart with white space (default 1)
50
- void set_hyphen(int i); ///< \brief Hyphens are separate tokens (default 0)
51
- void set_comma(int i); ///< \brief Commas are separate tokens (default 1)
52
- void set_pretag(char *a); ///< \brief Use this tag on all tokens (default empty string)
53
- void set_pretok(int i); ///< \brief Assume string is already tokenized using spaces (default 0)
54
- void set_new(int i); ///< \brief Use a previous algorithm (defaults to most recent)
55
- void set_doteos(int i); ///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)
56
-
57
- void merge_words(int s, int e); // merge words between s and e (idiom)
58
- void split_words(void); // split all merged words
59
-
60
- string tokenize(const string&); ///< \brief Tokenize, save (in \p word), and return space delimited tokens
61
- string segment(const string&); ///< \brief Segment, save (in \p sent), and return newline delimited sentences
62
-
63
- string save_string(const string&); // save a buffer
64
- string tokenize_nosave(const string&); // tokenize without saving
65
- string tokenize(const string&,int); // do tokenization with or without inserting spaces between them
66
-
67
- void print(int); ///< \brief Print tokens/tags with given verbosity
68
-
69
- vector<string> word; ///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
70
- vector<string> tag; ///< \brief Vector of tags of most recently tagged (or tokenized) text
71
- vector<string> sent; ///< \brief Vector of sentences of most recently sentence-segmented text
72
-
73
- char *text; // Input text arg
74
- int text_len; // It's length
75
- int *tokflag; // token flags
76
- int *endflag; // end-sentence flags
77
-
78
- string option_cnam; // A suffix, for opening variant support files
79
- string option_dir; // Directory to find things
80
-
81
- protected:
82
-
83
- void set_tokflag();
84
- void set_endflag();
85
- void set_endflag_01();
86
- int size_buff();
87
-
88
- void init_pair(const string& file_name); // read a file of common pairs
89
- void init_abbr(const string& file_name); // read a file of abbreviations
90
-
91
- void tok_0();
92
- void tok_1();
93
- void tok_2();
94
- void tok_3();
95
- void tok_5_6_7();
96
- void tok_8_9();
97
- void tok_10();
98
- void tok_11();
99
- void tok_12();
100
- void tok_13();
101
- void tok_14();
102
- void tok_15();
103
- void tok_15_1();
104
- void tok_16();
105
- void tok_16_1();
106
- void tok_17();
107
- void tok_20();
108
- void tok_20_1();
109
- void tok_20_2();
110
- void tok_21();
111
- void tok_21a();
112
- void tok_22();
113
- void tok_23();
114
- void tok_24();
115
- void tok_25();
116
- void tok_26();
117
- void tok_27();
118
- void tok_28();
119
- void tok_29();
120
- void tok_29a();
121
- void tok_30();
122
- void tok_31();
123
- void tok_32();
124
- void tok_33();
125
- int complex_check();
126
- void map_escapes();
127
- void tok_un();
128
-
129
- void append_token(string&, int&, char*, int);
130
- string token_string();
131
-
132
- set<string> common_pair;
133
- map<string,int> common_abbr;
134
-
135
- private:
136
- int option_token; // Output tokenized text (only use internally)
137
- int tok_initialized; // is it inited?
138
- };
139
-
140
- #endif
141
-
 
1
+ #ifndef _MPTOK_H
2
+ #define _MPTOK_H
3
+
4
+ #include <stdio.h>
5
+
6
+ #include <string>
7
+ #include <vector>
8
+ #include <map>
9
+ #include <set>
10
+
11
+ using namespace std;
12
+
13
+ #define MPTOK_VERSION 11 // The latest version
14
+
15
+ // Maximum number of words in a sentence
16
+
17
+ #define MAX_WORDS 10000
18
+
19
+ enum { ABB_ABB, ABB_EOS, ABB_NUM };
20
+ #define MAX_ABB 100
21
+
22
+ /*! \brief A class to perform tokenization.
23
+ *
24
+ * The MPtag class can be used to perform tokenization and segmentation
25
+ * of strings into tokens or sentences. It is inherited and used by MPtag
26
+ * so if the user is only interested in tagging, this class does not
27
+ * need to be referenced.
28
+ */
29
+
30
+ class MPtok
31
+ {
32
+ public:
33
+ /// \brief A MPtok object, giving the install directory \p idir where data files can be found
34
+ MPtok(string idir = "", const string& cnam = "");
35
+ ~MPtok();
36
+
37
+ void init(); // Initialize (call only once)
38
+ void init(const string& idir) { option_dir = idir; init(); } // Initialize using specified install directory
39
+
40
+ string option_pretag; // The tag to use on tokens
41
+ int option_segment; // Segment into sentences
42
+ int option_hyphen; // Hyphens are separate tokens
43
+ int option_comma; // Commas are always tokenized
44
+ int option_pretok; // The text is pre-tokenized
45
+ int option_new; // Use new algorithms, used in development only
46
+ int option_doteos; // If " . " occurs, it's an end EOS (new >= 5)
47
+
48
+ void set_segment(int i); ///< \brief Sentences are broken up during tokenization (default 1)
49
+ void set_token(int i); ///< \brief Break tokens apart with white space (default 1)
50
+ void set_hyphen(int i); ///< \brief Hyphens are separate tokens (default 0)
51
+ void set_comma(int i); ///< \brief Commas are separate tokens (default 1)
52
+ void set_pretag(char *a); ///< \brief Use this tag on all tokens (default empty string)
53
+ void set_pretok(int i); ///< \brief Assume string is already tokenized using spaces (default 0)
54
+ void set_new(int i); ///< \brief Use a previous algorithm (defaults to most recent)
55
+ void set_doteos(int i); ///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)
56
+
57
+ void merge_words(int s, int e); // merge words between s and e (idiom)
58
+ void split_words(void); // split all merged words
59
+
60
+ string tokenize(const string&); ///< \brief Tokenize, save (in \p word), and return space delimited tokens
61
+ string segment(const string&); ///< \brief Segment, save (in \p sent), and return newline delimited sentences
62
+
63
+ string save_string(const string&); // save a buffer
64
+ string tokenize_nosave(const string&); // tokenize without saving
65
+ string tokenize(const string&,int); // do tokenization with or without inserting spaces between them
66
+
67
+ void print(int); ///< \brief Print tokens/tags with given verbosity
68
+
69
+ vector<string> word; ///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
70
+ vector<string> tag; ///< \brief Vector of tags of most recently tagged (or tokenized) text
71
+ vector<string> sent; ///< \brief Vector of sentences of most recently sentence-segmented text
72
+
73
+ char *text; // Input text arg
74
+ int text_len; // It's length
75
+ int *tokflag; // token flags
76
+ int *endflag; // end-sentence flags
77
+
78
+ string option_cnam; // A suffix, for opening variant support files
79
+ string option_dir; // Directory to find things
80
+
81
+ protected:
82
+
83
+ void set_tokflag();
84
+ void set_endflag();
85
+ void set_endflag_01();
86
+ int size_buff();
87
+
88
+ void init_pair(const string& file_name); // read a file of common pairs
89
+ void init_abbr(const string& file_name); // read a file of abbreviations
90
+
91
+ void tok_0();
92
+ void tok_1();
93
+ void tok_2();
94
+ void tok_3();
95
+ void tok_5_6_7();
96
+ void tok_8_9();
97
+ void tok_10();
98
+ void tok_11();
99
+ void tok_12();
100
+ void tok_13();
101
+ void tok_14();
102
+ void tok_15();
103
+ void tok_15_1();
104
+ void tok_16();
105
+ void tok_16_1();
106
+ void tok_17();
107
+ void tok_20();
108
+ void tok_20_1();
109
+ void tok_20_2();
110
+ void tok_21();
111
+ void tok_21a();
112
+ void tok_22();
113
+ void tok_23();
114
+ void tok_24();
115
+ void tok_25();
116
+ void tok_26();
117
+ void tok_27();
118
+ void tok_28();
119
+ void tok_29();
120
+ void tok_29a();
121
+ void tok_30();
122
+ void tok_31();
123
+ void tok_32();
124
+ void tok_33();
125
+ int complex_check();
126
+ void map_escapes();
127
+ void tok_un();
128
+
129
+ void append_token(string&, int&, char*, int);
130
+ string token_string();
131
+
132
+ set<string> common_pair;
133
+ map<string,int> common_abbr;
134
+
135
+ private:
136
+ int option_token; // Output tokenized text (only use internally)
137
+ int tok_initialized; // is it inited?
138
+ };
139
+
140
+ #endif
141
+
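
A hedged usage sketch of the public interface above, showing sentence segmentation together with a couple of the option setters. The install directory and input text are illustrative assumptions, not fixed by the sources:

#include <cstdio>
#include <string>
#include "MPtok.h"

// Sketch only: segment text into sentences. The data directory below is an
// assumption; point it at wherever the WordData files were installed.
int main() {
    MPtok tok;
    tok.init("Library/WordData");  // sets option_dir, then runs init()
    tok.set_doteos(1);             // treat every period as end of sentence
    tok.set_comma(1);              // commas become separate tokens
    string sents = tok.segment("E. coli grows quickly. It divides often.");
    printf("%s\n", sents.c_str()); // newline-delimited sentences (also saved in tok.sent)
    return 0;
}
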
Library/Makefile CHANGED
@@ -1,13 +1,13 @@
SRC_DIR=./
TRASHFILES = *.o *~ *.bak core
LIB_INC=-I./
#.KEEP_STATE:
libops.a: runn.o Btree.o FBase.o Hash.o MPtok.o \
	AbbrStra.o AbbrvE.o Ab3P.o
	ar rus $@ $?
OS=-g
%.o: $(SRC_DIR)/%.C
	g++ -c $(OS) $< -o $@ $(LIB_INC)

clean:
	rm -f $(TRASHFILES)
Library/WordData/Ab3P_prec.dat CHANGED
@@ -1,145 +1,145 @@
Al 1 FirstLetOneChSF 0.967224
Al 2 FirstLet 0.99818
Al 2 FirstLetGen 0.994292
Al 2 WithinWrdFWrd 0.989054
Al 2 FirstLetGenStp 0.970019
Al 2 ContLet 0.96935
Al 2 WithinWrdFLet 0.941981
Al 2 FirstLetGenSkp 0.949988
Al 2 WithinWrdFWrdSkp 0.947364
Al 2 ContLetSkp 0.877216
Al 2 WithinWrdWrd 0.74768
Al 2 WithinWrdFLetSkp 0.640805
Num 2 ContLet 0.975372
Num 2 ContLetSkp 0.96617
Num 2 WithinWrdFWrdSkp 0.988426
Num 2 FirstLetGen2 0.909995
Num 2 FirstLetGenStp 0.856401
Num 2 FirstLetGenSkp 0.858132
Num 2 WithinWrdFWrd 0.726155
Num 2 WithinWrdFLetSkp 0.607829
Num 2 WithinWrdFLet 0.493922
Spec 2 FirstLetGen2 0.854368
Spec 2 FirstLetGenStp 0.664622
Spec 2 FirstLetGenSkp 0.657475
Al 3 FirstLet 0.999808
Al 3 FirstLetGen 0.999408
Al 3 FirstLetGenS 0.998732
Al 3 WithinWrdFWrd 0.997824
Al 3 FirstLetGenStp 0.997839
Al 3 FirstLetGenStp2 0.997264
Al 3 FirstLetGenSkp 0.988583
Al 3 ContLet 0.987697
Al 3 WithinWrdFWrdSkp 0.981107
Al 3 WithinWrdFLet 0.981322
Al 3 ContLetSkp 0.968185
Al 3 WithinWrdWrd 0.9437
Al 3 WithinWrdFLetSkp 0.904799
Al 3 WithinWrdLet 0.663735
Al 3 AnyLet 0.303503
Num 3 FirstLetGen2 0.998497
Num 3 WithinWrdFWrd 0.99964
Num 3 FirstLetGenStp 0.998807
Num 3 FirstLetGenStp2 0.991256
Num 3 FirstLetGenSkp 0.991202
Num 3 ContLet 0.996938
Num 3 WithinWrdFWrdSkp 0.998821
Num 3 WithinWrdFLet 0.985676
Num 3 ContLetSkp 0.995076
Num 3 WithinWrdWrd 0.999245
Num 3 WithinWrdFLetSkp 0.971123
Num 3 WithinWrdLet 0.819989
Num 3 AnyLet 0.797932
Spec 3 FirstLetGen2 0.978311
Spec 3 FirstLetGenStp 0.977779
Spec 3 FirstLetGenStp2 0.929197
Spec 3 WithinWrdFWrd 0.930654
Spec 3 ContLet 0.923911
Spec 3 FirstLetGenSkp 0.904086
Spec 3 WithinWrdFWrdSkp 0.893989
Spec 3 ContLetSkp 0.851583
Spec 3 WithinWrdFLet 0.712331
Spec 3 WithinWrdFLetSkp 0.64667
Spec 3 WithinWrdWrd 0.428
Al 4 FirstLet 0.999964
Al 4 FirstLetGen 0.99993
Al 4 FirstLetGenS 0.999811
Al 4 WithinWrdFWrd 0.999616
Al 4 FirstLetGenStp 0.999868
Al 4 FirstLetGenStp2 0.999948
Al 4 FirstLetGenSkp 0.998534
Al 4 ContLet 0.992792
Al 4 WithinWrdFWrdSkp 0.997097
Al 4 WithinWrdFLet 0.992955
Al 4 ContLetSkp 0.985568
Al 4 WithinWrdWrd 0.995823
Al 4 WithinWrdFLetSkp 0.976873
Al 4 WithinWrdLet 0.917863
Al 4 AnyLet 0.696532
Num 4 FirstLetGen2 0.99992
Num 4 WithinWrdFWrd 0.999835
Num 4 FirstLetGenStp 0.999903
Num 4 FirstLetGenStp2 0.999936
Num 4 FirstLetGenSkp 0.999577
Num 4 ContLet 0.999555
Num 4 WithinWrdFWrdSkp 0.999885
Num 4 WithinWrdFLet 0.9975
Num 4 ContLetSkp 0.998578
Num 4 WithinWrdWrd 0.997703
Num 4 WithinWrdFLetSkp 0.996501
Num 4 WithinWrdLet 0.986326
Num 4 AnyLet 0.953126
Spec 4 FirstLetGen2 0.99278
Spec 4 FirstLetGenStp 0.98597
Spec 4 FirstLetGenStp2 0.982127
Spec 4 WithinWrdFWrd 0.997649
Spec 4 ContLet 0.980869
Spec 4 FirstLetGenSkp 0.944843
Spec 4 WithinWrdFWrdSkp 0.985685
Spec 4 ContLetSkp 0.973983
Spec 4 WithinWrdFLet 0.992773
Spec 4 WithinWrdFLetSkp 0.863247
Spec 4 WithinWrdWrd 0.931745
Spec 4 WithinWrdLet 0.418068
Spec 4 AnyLet 0.223562
Al 5 FirstLet 0.999979
Al 5 FirstLetGen 0.999979
Al 5 FirstLetGenS 0.999913
Al 5 WithinWrdFWrd 0.999928
Al 5 FirstLetGenStp 0.999989
Al 5 FirstLetGenStp2 0.999887
Al 5 FirstLetGenSkp 0.999852
Al 5 ContLet 0.997596
Al 5 WithinWrdFWrdSkp 0.999602
Al 5 WithinWrdFLet 0.997473
Al 5 ContLetSkp 0.989703
Al 5 WithinWrdWrd 0.999812
Al 5 WithinWrdFLetSkp 0.986066
Al 5 WithinWrdLet 0.889324
Al 5 AnyLet 0.73859
Num 5 FirstLetGen2 0.999987
Num 5 WithinWrdFWrd 0.999922
Num 5 FirstLetGenStp 0.99998
Num 5 FirstLetGenStp2 1
Num 5 FirstLetGenSkp 0.999901
Num 5 ContLet 0.999613
Num 5 WithinWrdFWrdSkp 0.999937
Num 5 WithinWrdFLet 0.999386
Num 5 ContLetSkp 0.999312
Num 5 WithinWrdWrd 1
Num 5 WithinWrdFLetSkp 0.998939
Num 5 WithinWrdLet 0.996068
Num 5 AnyLet 0.986193
Spec 5 FirstLetGen2 0.999701
Spec 5 FirstLetGenStp 0.9999
Spec 5 FirstLetGenStp2 0.999757
Spec 5 WithinWrdFWrd 0.999517
Spec 5 ContLet 0.994648
Spec 5 FirstLetGenSkp 0.997065
Spec 5 WithinWrdFWrdSkp 0.998513
Spec 5 ContLetSkp 0.992445
Spec 5 WithinWrdFLet 0.996623
Spec 5 WithinWrdFLetSkp 0.978026
Spec 5 WithinWrdWrd 0.996879
Spec 5 WithinWrdLet 0.862993
Spec 5 AnyLet 0.745608
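
Each row above is one whitespace-separated record: the short-form character class (Al, Num, Spec), the short-form length, the abbreviation-matching strategy name, and that strategy's estimated precision. A hedged reader sketch; the struct and path are illustrative, not the actual Ab3P API:

#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Illustrative record type for one Ab3P_prec.dat row (sketch only).
struct PrecRow {
    string cls;   // short-form class: Al, Num, or Spec
    int    len;   // short-form length in characters
    string strat; // abbreviation-matching strategy name
    double prec;  // estimated precision of that strategy
};

int main() {
    ifstream fin("Library/WordData/Ab3P_prec.dat"); // path is an assumption
    vector<PrecRow> rows;
    PrecRow r;
    while (fin >> r.cls >> r.len >> r.strat >> r.prec) rows.push_back(r);
    cout << rows.size() << " strategy precisions loaded\n";
    return 0;
}
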
Library/WordData/Lf1chSf CHANGED
The diff for this file is too large to render. See raw diff
 
Library/WordData/stop CHANGED
@@ -1,313 +1,313 @@
a
about
above
across
after
afterwards
again
against
al
all
almost
alone
along
already
also
although
always
am
among
amongst
an
analyze
and
another
any
anyhow
anyone
anything
anywhere
applicable
apply
are
around
as
assume
at
be
became
because
become
becomes
becoming
been
before
beforehand
being
below
beside
besides
between
beyond
both
but
by
came
cannot
cc
cm
come
compare
could
de
dealing
department
depend
did
discover
dl
do
does
during
each
ec
ed
effected
eg
either
else
elsewhere
enough
et
etc
ever
every
everyone
everything
everywhere
except
find
for
found
from
further
get
give
go
gov
had
has
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hr
ie
if
ii
iii
in
inc
incl
indeed
into
investigate
is
it
its
itself
j
jour
journal
just
kg
last
latter
latterly
lb
ld
letter
like
ltd
made
make
many
may
me
meanwhile
mg
might
ml
mm
mo
more
moreover
most
mostly
mr
much
must
my
myself
namely
neither
never
nevertheless
next
no
nobody
noone
nor
not
nothing
now
nowhere
of
off
often
on
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
oz
per
perhaps
pm
precede
presently
previously
pt
rather
regarding
relate
said
same
seem
seemed
seeming
seems
seriously
several
she
should
show
showed
shown
since
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
studied
sub
such
take
tell
th
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
this
thorough
those
though
through
throughout
thru
thus
to
together
too
toward
towards
try
type
ug
under
unless
until
up
upon
us
used
using
various
very
via
was
we
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whom
whose
why
will
with
within
without
wk
would
wt
yet
you
your
yours
yourself
yourselves
yr
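
The stop list is one lower-case word per line. A hedged sketch of loading it for membership tests; the path and helper name are illustrative:

#include <fstream>
#include <set>
#include <string>
using namespace std;

// Sketch only: read the stop list into a set, one word per line.
set<string> load_stopwords(const string& path = "Library/WordData/stop") {
    set<string> stop;
    ifstream fin(path.c_str());
    string w;
    while (fin >> w) stop.insert(w);
    return stop;
}

int main() {
    set<string> stop = load_stopwords();
    return stop.count("the") ? 0 : 1; // "the" is in the list
}
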
Library/runn.C CHANGED
@@ -1,216 +1,216 @@
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <sstream>
#include <iomanip>
#include <cstring>
#include <cmath>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "runn.h"
using namespace std;
namespace iret {

int mark(int pflag, long ct, int ivl, const char *what){
   if(pflag&&((ct%ivl)==0)){
      cout << what << " count=" << ct << endl;
      return(1);
   }
   else return(0);
}

int get_qflag(){
   int pflag=1;
   ifstream fin("quiet.flag",ios::in);
   if(fin.is_open()){
      fin >> pflag;
      fin.close();
      fin.clear();
   }
   return(pflag);
}

int get_pathw(char *nam,const char *pfl,const char *pex,const char *ch){
   char cnam[256];

   strcpy(cnam,"path_");
   strcat(cnam,pfl);
   strcat(cnam,"_");
   strcat(cnam,pex);
   strcat(cnam,".");
   strcat(cnam,ch);
   ifstream fin(cnam,ios::in);
   if(!fin.is_open()){
      fin.clear();
      strcpy(cnam,"path_");
      strcat(cnam,pfl);
      strcat(cnam,"_");
      strcat(cnam,pex);
      fin.open(cnam,ios::in);
      if(!fin.is_open()){
         fin.clear();
         strcpy(cnam,"path_");
         strcat(cnam,pfl);
         fin.open(cnam,ios::in);
         if(!fin.is_open()){
            fin.clear();
            strcpy(cnam,"path");
            fin.open(cnam,ios::in);
            if(!fin.is_open()){
               cout << "Path file for type " << pfl
                    << " does not exist!" << endl;
               exit(1);
            }
         }
      }
   }

   fin.getline(nam,256);
   fin.close();
   strcat(nam,pfl);
   strcat(nam,"_");
   strcat(nam,pex);
   strcat(nam,".");
   strcat(nam,ch);
   return(1);
}

char *add_num(const char *ptr,long n,char *buf){
   char cnam[100];
   long_str(cnam,n);
   strcpy(buf,ptr);
   strcat(buf,cnam);
   return(buf);
}

long gseed(int x, char **v, const char *c){
   long seed;

   seed=clnga(x,v,c,"seed for random number generator");
   srandom((unsigned int)seed);
   return seed;
}

long zrand(long p){
   return(((long)random())%p);
}

void shuffle(long n,long *idx){
   long i,j,k;
   for(i=0;i<n;i++){
      k=zrand(n);
      j=*(idx+i);
      *(idx+i)=*(idx+k);
      *(idx+k)=j;
   }
}

void dshuffle(long n,long *idx){
   long i,j,k;
   for(i=n-1;i>0;i--){
      k=zrand(i+1);
      j=idx[i];
      idx[i]=idx[k];
      idx[k]=j;
   }
}

long clnga(int x, char **v, const char *c, const char *name){
   int i,flag=1;
   long num;

   for(i=1;i<x-1;i++)
      if(strcmp(c,*(v+i))==0){
         flag=0;
         istringstream oss(*(v+i+1));
         oss >> num;
         if(oss.fail()){
            cout << "Enter " << name << ":" << endl;
            cin >> num;
         }
      }
   if(flag==1){
      cout << "Enter " << name << ":" << endl;
      cin >> num;
      cin.get();
   }
   return(num);
}

long rnd(double p)
{
   return((long)floor(p+.5));
}

double cdbla(int x, char **v, const char *c, const char *name){
   int i,flag=1;
   double num;

   for(i=1;i<x-1;i++)
      if(strcmp(c,*(v+i))==0){
         flag=0;
         istringstream oss(*(v+i+1));
         oss >> num;
         if(oss.fail()){
            cout << "Enter " << name << ":" << endl;
            cin >> num;
         }
      }
   if(flag==1){
      cout << "Enter " << name << ":" << endl;
      cin >> num;
      cin.get();
   }
   return(num);
}

char *cstra(int x, char **v, const char *c, const char *name){
   int i;
   char cnam[max_str];

   for(i=1;i<x-1;i++){
      if(strcmp(c,*(v+i))==0){
         return(*(v+i+1));
      }
   }

   restart:
   cout << "Enter " << name << ":" << endl;
   cin.getline(cnam,max_str);
   if(i=cin.gcount()){
      char *pch=new char[i+1];
      strcpy(pch,cnam);
      return(pch);
   }
   else {
      cin.clear();
      goto restart;
   }
}

//Function to convert a long to a null terminated string.
void long_str(char *cnam,long n){
   ostringstream oss;
   oss << n;
   const string & str = oss.str();
   str.copy(cnam,20);
   cnam[str.length()]='\0';
}

//Function to convert a string with null termination
//to a long.
void str_long(char *cnam,long &n){
   istringstream(cnam) >> n;
}

//Function to convert first two char of string to an
//integer. Should be an ASCII null terminated string
int trac(const char *str){
   if(!(*str))return(0);
   else {
      return((int)(*(str+1))+128*((int)(*str)));
   }
}

}
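
The argument helpers above share one convention: scan argv for a flag and read the value that follows it, falling back to a console prompt when the flag is absent. A hedged sketch; the flag names are illustrative, and the program must be linked against libops.a:

#include <iostream>
#include "runn.h"
using namespace std;
using namespace iret;

// Sketch only: each helper reads the argument after its flag, or prompts.
int main(int argc, char **argv) {
    long seed  = gseed(argc, argv, "-seed");          // also calls srandom()
    long n     = clnga(argc, argv, "-n", "number of items");
    double x   = cdbla(argc, argv, "-x", "threshold");
    char *name = cstra(argc, argv, "-name", "dataset name");
    cout << "seed=" << seed << " n=" << n
         << " x=" << x << " name=" << name << endl;
    return 0;
}
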
Library/runn.h CHANGED
@@ -1,392 +1,392 @@
#ifndef RUNN_H
#define RUNN_H

#include <fstream>
#include <iostream>
#include <cctype>
#include <cstring>
#include <cstdlib>
using namespace std;
namespace iret {

const int word_cnt = 5000; //Maximum number of words in a document.
const int word_len = 1500; //Maximum word length.
const long max_str = 1500; //Maximum string length.

int get_pathw(char *cn,const char *dfl,const char *dex,const char *a);
//Reads the path from a file "path_(*dfl)" and constructs the
//file name as "(*dfl)_(*dex).(*a)". Cats path and file
//name and returns the full info in cn.
char *add_num(const char *ptr,long n,char *buf); //converts long to ascii
//and cats to end of string and returns pointer to new string
//that results. Does not change input string. The new string is
//held in buffer space and this is overwritten at each call.

int get_qflag();
//This function gets the value of the print flag pflag that is
//used to control output.
int mark(int,long,int,const char*);
//This function is used to print out information that indicates
//how a function is progressing. It is dependent on the value of
//pflag.
long gseed(int,char**,const char*);
//This function is called to allow the input of a seed value for
//the random number generator. It must be called in main or the
//arguments of main must be passed down to it if it is to allow
//command line entry. Otherwise the first argument may be set to
//zero and it may be used to enter the seed at run time from the
//console.
long clnga(int,char**,const char*,const char*);
//Allows a long to be entered from the console at run time if the
//first argument is set to zero. If the first two arguments are
//the arguments of main, then it allows command line entry with
//the flag that is the third argument and with a statement about
//the input that is the fourth argument.
double cdbla(int,char**,const char*,const char*);
char *cstra(int,char**,const char*,const char*);
long zrand(long);
//Produces a random long integer that is in the range [0,argument).
//Machinery of the random number generator.
void shuffle(long n,long *idx);  //Randomly shuffles an array of longs.
void dshuffle(long n,long *idx); //Randomly shuffles an array of longs.
//Improved version suggested by Don Comeau
long rnd(double);
//Rounds off a double and returns the integer that results.

//Reads in a string including white space and ends the string
//just before the character a.
inline int get_string(char *cnam,ifstream &ifile,char a){
   char *pch = cnam;
   long j=1;

   start:
   if((*(pch++)=ifile.get())!=EOF){
      if(*(pch-1)==a){pch--;goto start;}
      while(((*(pch++)=ifile.get())!=a)&&(j<max_str))j++;
      if(j<max_str){
         *(--pch)='\0';
         return(j);
      }
      else return(0);
   }
   return(0);
}

inline int get_strinf(char *cnam,fstream &ifile,char a){
   char *pch = cnam;
   long j=1;
   if((*(pch++)=ifile.get())!=EOF){
      while(((*(pch++)=ifile.get())!=a)&&(j<max_str))j++;
      if(j<max_str){
         *(--pch)='\0';
         return(j);
      }
      else return(0);
   }
   return(0);
}

//Function to lower case a string.
inline void lower_case(char *cnam){
   int i=0;
   char ch;

   while((ch=cnam[i])!='\0'){
      cnam[i++]=tolower(ch);
   }
}

//Note that ordering functions beginning with sS or hS
//produce an order that is increasing with increasing
//index, while sR or hR produces the reverse order.

template <class X>
void sSort(const long ix, X *idx){
   long k, j, ir, i;
   X rra;

   if(ix<=1)return;

   k=(ix>>1);
   ir=ix-1;
   for(;;) {
      if(k>0) {
         rra=idx[--k];
      }
      else {
         rra=idx[ir];
         idx[ir] = idx[0];
         if(--ir ==0) {
            idx[0]=rra;
            return;
         }
      }
      i=k;
      j=((k+1)<<1)-1;
      while(j<=ir) {
         if(j<ir && (idx[j]<idx[j+1])) ++j;
         if(rra<idx[j]) {
            idx[i]=idx[j];
            j +=(i=j)+1;
         }
         else j=ir+1;
      }
      idx[i]=rra;
   }
}

template <class X>
void sRort(const long ix, X *idx){
   long k, j, ir, i;
   X rra;

   if(ix<=1)return;

   k=(ix>>1);
   ir=ix-1;
   for(;;) {
      if(k>0) {
         rra=idx[--k];
      }
      else {
         rra=idx[ir];
         idx[ir] = idx[0];
         if(--ir ==0) {
            idx[0]=rra;
            return;
         }
      }
      i=k;
      j=((k+1)<<1)-1;
      while(j<=ir) {
         if(j<ir && (idx[j]>idx[j+1])) ++j;
         if(rra>idx[j]) {
            idx[i]=idx[j];
            j +=(i=j)+1;
         }
         else j=ir+1;
      }
      idx[i]=rra;
   }
}

template <class X, class Y>
void hSort(const long n, X *ra, Y *rb) {
   long k, j, ir, i;
   X rra;
   Y rrb;

   if(n<=1)return;

   k=(n>>1);
   ir=n-1;
   for(;;) {
      if(k>0) {
         rra=ra[--k];
         rrb=rb[k];
      }
      else {
         rra=ra[ir];
         rrb=rb[ir];
         ra[ir] = ra[0];
         rb[ir] = rb[0];
         if(--ir ==0) {
            ra[0]=rra;
            rb[0]=rrb;
            return;
         }
      }
      i=k;
      j=((k+1)<<1)-1;
      while(j<=ir) {
         if(j<ir && ra[j] < ra[j+1]) ++j;
         if(rra<ra[j]) {
            ra[i]=ra[j];
            rb[i]=rb[j];
            j +=(i=j)+1;
         }
         else j=ir+1;
      }
      ra[i]=rra;
      rb[i]=rrb;
   }
}

template <class X, class Y, class Z>
void hSort(const long n, X *ra, Y *rb, Z *rc) {
   long k, j, ir, i;
   X rra;
   Y rrb;
   Z rrc;

   if(n<=1)return;

   k=(n>>1);
   ir=n-1;
   for(;;) {
      if(k>0) {
         rra=ra[--k];
         rrb=rb[k];
         rrc=rc[k];
      }
      else {
         rra=ra[ir];
         rrb=rb[ir];
         rrc=rc[ir];
         ra[ir] = ra[0];
         rb[ir] = rb[0];
         rc[ir] = rc[0];
         if(--ir ==0) {
            ra[0]=rra;
            rb[0]=rrb;
            rc[0]=rrc;
            return;
         }
      }
      i=k;
      j=((k+1)<<1)-1;
      while(j<=ir) {
         if(j<ir && ra[j] < ra[j+1]) ++j;
         if(rra<ra[j]) {
            ra[i]=ra[j];
            rb[i]=rb[j];
            rc[i]=rc[j];
            j +=(i=j)+1;
         }
         else j=ir+1;
      }
      ra[i]=rra;
      rb[i]=rrb;
      rc[i]=rrc;
   }
}

template <class X, class Y>
void hRort(const long n, X *ra, Y *rb) {
   long k, j, ir, i;
   X rra;
   Y rrb;

   if(n<=1)return;

   k=(n>>1);
   ir=n-1;
   for(;;) {
      if(k>0) {
         rra=ra[--k];
         rrb=rb[k];
      }
      else {
         rra=ra[ir];
         rrb=rb[ir];
         ra[ir] = ra[0];
         rb[ir] = rb[0];
         if(--ir ==0) {
            ra[0]=rra;
            rb[0]=rrb;
            return;
         }
      }
      i=k;
      j=((k+1)<<1)-1;
      while(j<=ir) {
         if(j<ir && ra[j] > ra[j+1]) ++j;
         if(rra>ra[j]) {
            ra[i]=ra[j];
            rb[i]=rb[j];
            j +=(i=j)+1;
         }
         else j=ir+1;
      }
      ra[i]=rra;
      rb[i]=rrb;
   }
}

template <class X, class Y, class Z>
void hRort(const long n, X *ra, Y *rb, Z *rc) {
   long k, j, ir, i;
   X rra;
   Y rrb;
   Z rrc;

   if(n<=1)return;

   k=(n>>1);
   ir=n-1;
   for(;;) {
      if(k>0) {
         rra=ra[--k];
         rrb=rb[k];
         rrc=rc[k];
      }
      else {
         rra=ra[ir];
         rrb=rb[ir];
         rrc=rc[ir];
         ra[ir] = ra[0];
         rb[ir] = rb[0];
         rc[ir] = rc[0];
         if(--ir ==0) {
            ra[0]=rra;
            rb[0]=rrb;
            rc[0]=rrc;
            return;
         }
      }
      i=k;
      j=((k+1)<<1)-1;
      while(j<=ir) {
         if(j<ir && ra[j] > ra[j+1]) ++j;
         if(rra>ra[j]) {
            ra[i]=ra[j];
            rb[i]=rb[j];
            rc[i]=rc[j];
            j +=(i=j)+1;
         }
         else j=ir+1;
      }
      ra[i]=rra;
      rb[i]=rrb;
      rc[i]=rrc;
   }
}


//Function to convert a long to a null terminated string.
void long_str(char *cnam,long n);

//Function to convert a string with null termination
//to a long.
void str_long(char *cnam,long &n);

//Function to convert first two char of string to an
//integer. Should be an ASCII null terminated string
int trac(const char *str);

template<typename Y,typename Z>
void xshuffle(Y n,Z *idx){ //Randomly shuffles an array of longs.
   Y i,k;
   Z u;
   for(i=n-1;i>0;i--){
      k=(Y)zrand((long)i+1);
      u=idx[i];
      idx[i]=idx[k];
      idx[k]=u;
   }
}

template<class Z>
void dxhuffle(long n,Z *idx){ //Randomly shuffles an array type Z*.
   long i,k;
   Z xx;
   for(i=n-1;i>0;i--){
      k=zrand(i+1);
      xx=idx[i];
      idx[i]=idx[k];
      idx[k]=xx;
   }
}

}
#endif
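
The hSort/hRort templates above are heap sorts over parallel arrays: the first array supplies the keys, and each companion array receives the same permutation. A small self-contained sketch with made-up values:

#include <iostream>
#include "runn.h"
using namespace std;
using namespace iret;

// Sketch only: sort scores ascending and carry the matching ids along.
int main() {
    long   ids[5]    = {101, 102, 103, 104, 105};
    double scores[5] = {0.42, 0.07, 0.19, 0.93, 0.28};
    hSort(5L, scores, ids); // keys ascending; ids follow the same moves
    for (int i = 0; i < 5; i++)
        cout << ids[i] << " " << scores[i] << endl;
    return 0;
}
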
gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab1.txt CHANGED
The diff for this file is too large to render. See raw diff
 
gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/version_vocab/vocab_ori.txt CHANGED
The diff for this file is too large to render. See raw diff
 
requirements-py310.txt CHANGED
@@ -1,7 +1,7 @@
tensorflow==2.8
transformers==4.37.2
stanza==1.4.0
spacy==3.2.4
bioc==2.0.post4
spacy==3.2.4
protobuf==3.20.1
requirements.txt CHANGED
@@ -1,76 +1,76 @@
absl-py
astunparse
attrs
bioc
blis
cachetools
catalogue
certifi
charset-normalizer
click
cymem
emoji
filelock
gast
google-auth
google-auth-oauthlib
google-pasta
grpcio
h5py
huggingface-hub
idna
importlib-metadata
intervaltree
Jinja2
joblib
jsonlines
Keras-Preprocessing
langcodes
lxml
Markdown
MarkupSafe
murmurhash
numpy
oauthlib
opt-einsum
packaging
pathy
preshed
protobuf
pyasn1
pyasn1-modules
pydantic
pyparsing
PyYAML
regex
requests
requests-oauthlib
rsa
sacremoses
scipy
six
smart-open
sortedcontainers
spacy
spacy-legacy
spacy-loggers
srsly
stanza
tensorboard
tensorboard-data-server
tensorboard-plugin-wit
tensorflow
tensorflow-estimator
termcolor
thinc
tokenizers
torch
tqdm
transformers
typer
typing_extensions
urllib3
wasabi
Werkzeug
wrapt
zipp
run_batches.py CHANGED
@@ -1,12 +1,19 @@
  import argparse
  import logging
+ import os
  import shutil
  import subprocess
- import time
- from datetime import timedelta
  from pathlib import Path
  from tempfile import TemporaryDirectory

+ from tqdm.contrib.concurrent import process_map
+
+
+ def batch(iterable, n=1):
+     l = len(iterable)
+     for ndx in range(0, l, n):
+         yield iterable[ndx : min(ndx + n, l)]
+

  def main():
      logging.basicConfig(level=logging.INFO)
@@ -14,7 +21,8 @@ def main():
      parser.add_argument("--mode", type=str, default="gnorm2", help="mode to run in (gnorm2, gnormplus)")
      parser.add_argument("input_dir", type=str, help="directory containing files to process")
      parser.add_argument("output_dir", type=str, help="directory to write processed files to")
-     parser.add_argument("--batch_size", type=int, default=64)
+     parser.add_argument("--batch_size", type=int, default=8)
+     parser.add_argument("--max_workers", type=int, default=os.cpu_count() - 4)
      args = parser.parse_args()

      input_dir = Path(args.input_dir)
@@ -32,51 +40,57 @@ def main():

      logging.info(f"Processing {len(input_files)} files")

-
-     while input_files:
-         start = time.time()
-         logging.info(f"{len(input_files)} remaining files")
-         input_files_batch = list(input_files)[: args.batch_size]
-
-         with TemporaryDirectory() as temp_dir_SR, TemporaryDirectory() as temp_dir_GNR, TemporaryDirectory() as temp_dir_SA, TemporaryDirectory() as input_temp_dir, TemporaryDirectory() as output_temp_dir:
-             input_temp_dir = Path(input_temp_dir)
-             output_temp_dir = Path(output_temp_dir)
-             for file in input_files_batch:
-                 shutil.copy(input_dir / file, input_temp_dir)
-
-             if args.mode == "gnorm2":
-                 command_SR = f"java -Xmx60G -Xms30G -jar GNormPlus.jar {str(input_temp_dir)} {str(temp_dir_SR)} setup.SR.txt"
-                 command_GNR_SA = f"python GeneNER_SpeAss_run.py -i {str(temp_dir_SR)} -r {str(temp_dir_GNR)} -a {str(temp_dir_SA)} -n gnorm_trained_models/geneNER/GeneNER-Bioformer.h5 -s gnorm_trained_models/SpeAss/SpeAss-Bioformer.h5"
-                 command_GN = f"java -Xmx60G -Xms30G -jar GNormPlus.jar {str(temp_dir_SA)} {str(output_temp_dir)} setup.GN.txt"
-                 commands = [command_SR, command_GNR_SA, command_GN]
-             elif args.mode == "gnormplus":
-                 commands = [f"java -Xmx60G -Xms30G -jar GNormPlus.jar {str(input_temp_dir)} {str(output_temp_dir)} setup.txt"]
-             else:
-                 raise ValueError(f"Invalid mode: {args.mode}")
-
-             for command in commands:
-                 try:
-                     logging.info(command)
-                     subprocess.run([command], check=True, shell=True)
-                 except subprocess.CalledProcessError as e:
-                     logging.exception(f"Error running command: {command}")
-                     raise e
-
-             output_paths = output_temp_dir.rglob("*")
-             output_files = set(file.name for file in output_paths)
-             for output_path, output_file in zip(output_paths, output_files):
-                 shutil.copy(output_path, output_dir)
-                 input_files.remove(output_file)
-             end = time.time()
-             logging.info(f"Processed {len(output_files)} files in {timedelta(seconds=end - start)}")
-
-             if not len(output_files):
-                 raise Exception("No files were output")
-
-             if output_files:
-                 logging.info(
-                     f"Estimated time remaining: {timedelta(seconds=(end - start) * len(input_files) / output_files)}"
-                 )
+     input_files = sorted(input_files, key=lambda file: (input_dir / file).stat().st_size)
+
+     input_files_batches = list(batch(list(input_files), args.batch_size))
+     process_map(
+         run_batch,
+         input_files_batches,
+         [input_dir] * len(input_files_batches),
+         [output_dir] * len(input_files_batches),
+         [args.mode] * len(input_files_batches),
+         max_workers=args.max_workers,
+         chunksize=1,
+     )
+
+
+ def run_batch(input_files_batch, input_dir, output_dir, mode):
+     with TemporaryDirectory() as temp_dir_SR, TemporaryDirectory() as temp_dir_GNR, TemporaryDirectory() as temp_dir_SA, TemporaryDirectory() as input_temp_dir, TemporaryDirectory() as output_temp_dir:
+         input_temp_dir = Path(input_temp_dir)
+         output_temp_dir = Path(output_temp_dir)
+         for file in input_files_batch:
+             logging.info(f"cp {input_dir / file} {input_temp_dir}")
+             shutil.copy(input_dir / file, input_temp_dir)
+
+         if mode == "gnorm2":
+             command_SR = (
+                 f"java -Xmx32G -Xms16G -jar GNormPlus.jar {str(input_temp_dir)} {str(temp_dir_SR)} setup.SR.txt"
+             )
+             command_GNR_SA = f"python GeneNER_SpeAss_run.py -i {str(temp_dir_SR)} -r {str(temp_dir_GNR)} -a {str(temp_dir_SA)} -n gnorm_trained_models/geneNER/GeneNER-Bioformer.h5 -s gnorm_trained_models/SpeAss/SpeAss-Bioformer.h5"
+             command_GN = (
+                 f"java -Xmx32G -Xms16G -jar GNormPlus.jar {str(temp_dir_SA)} {str(output_temp_dir)} setup.GN.txt"
+             )
+             commands = [command_SR, command_GNR_SA, command_GN]
+         elif mode == "gnormplus":
+             commands = [
+                 f"java -Xmx32G -Xms16G -jar GNormPlus.jar {str(input_temp_dir)} {str(output_temp_dir)} setup.txt"
+             ]
+         else:
+             raise ValueError(f"Invalid mode: {mode}")
+
+         for command in commands:
+             try:
+                 logging.info(command)
+                 subprocess.run([command], check=True, shell=True)
+             except subprocess.CalledProcessError as e:
+                 logging.exception(f"Error running command: {command}")
+                 raise e
+
+         output_paths = list(output_temp_dir.rglob("*"))
+         for output_path in output_paths:
+             logging.info(f"cp {output_path} {output_dir}")
+             shutil.copy(output_path, output_dir)
+             output_file = output_path.name


  if __name__ == "__main__":
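
The rewrite replaces the old sequential while-loop with size-sorted batches fanned out to worker processes via tqdm's process_map. Inputs are first sorted by file size so each batch groups similarly sized documents, then each batch becomes one run_batch(files, input_dir, output_dir, mode) call, executed in parallel with up to --max_workers processes, e.g. python run_batches.py input/ output/ --mode gnorm2 --batch_size 8. A small sketch of the chunking behaviour (file names here are hypothetical):

    def batch(iterable, n=1):
        # same chunking generator as in run_batches.py above
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx : min(ndx + n, l)]

    files = [f"doc{i}.txt" for i in range(10)]  # hypothetical inputs
    print(list(batch(files, 4)))
    # [['doc0.txt', 'doc1.txt', 'doc2.txt', 'doc3.txt'],
    #  ['doc4.txt', 'doc5.txt', 'doc6.txt', 'doc7.txt'],
    #  ['doc8.txt', 'doc9.txt']]
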
src_Java/GNormPluslib/BioCDoc.java CHANGED
@@ -1,1344 +1,1344 @@
1
- /**
2
- * Project: GNormPlus
3
- * Function: Data storage in BioC format
4
- */
5
-
6
- package GNormPluslib;
7
-
8
- import bioc.BioCAnnotation;
9
- import bioc.BioCCollection;
10
- import bioc.BioCDocument;
11
- import bioc.BioCLocation;
12
- import bioc.BioCPassage;
13
-
14
- import bioc.io.BioCDocumentWriter;
15
- import bioc.io.BioCFactory;
16
- import bioc.io.woodstox.ConnectorWoodstox;
17
- import java.io.BufferedReader;
18
- import java.io.BufferedWriter;
19
- import java.io.FileInputStream;
20
- import java.io.FileNotFoundException;
21
- import java.io.FileOutputStream;
22
- import java.io.FileReader;
23
- import java.io.FileWriter;
24
- import java.io.IOException;
25
- import java.io.InputStreamReader;
26
- import java.io.OutputStreamWriter;
27
- import java.io.UnsupportedEncodingException;
28
- import java.time.LocalDate;
29
- import java.time.ZoneId;
30
-
31
- import javax.xml.stream.XMLStreamException;
32
-
33
- import java.util.Map;
34
- import java.util.regex.Matcher;
35
- import java.util.regex.Pattern;
36
- import java.util.ArrayList;
37
- import java.util.HashMap;
38
- import java.util.List;
39
-
40
- public class BioCDoc
41
- {
42
- /*
43
- * Contexts in BioC file
44
- */
45
- public ArrayList<String> PMIDs=new ArrayList<String>(); // Type: PMIDs
46
- public ArrayList<ArrayList<String>> PassageNames = new ArrayList(); // PassageName
47
- public ArrayList<ArrayList<Integer>> PassageOffsets = new ArrayList(); // PassageOffset
48
- public ArrayList<ArrayList<String>> PassageContexts = new ArrayList(); // PassageContext
49
- public ArrayList<ArrayList<ArrayList<String>>> Annotations = new ArrayList(); // Annotation - GNormPlus
50
-
51
- public String BioCFormatCheck(String InputFile) throws IOException
52
- {
53
-
54
- ConnectorWoodstox connector = new ConnectorWoodstox();
55
- BioCCollection collection = new BioCCollection();
56
- try
57
- {
58
- collection = connector.startRead(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
59
- }
60
- catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e)
61
- {
62
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
63
- String line="";
64
- String status="";
65
- String Pmid = "";
66
- boolean tiabs=false;
67
- Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$");
68
- while ((line = br.readLine()) != null)
69
- {
70
- Matcher mat = patt.matcher(line);
71
- if(mat.find()) //Title|Abstract
72
- {
73
- if(Pmid.equals(""))
74
- {
75
- Pmid = mat.group(1);
76
- }
77
- else if(!Pmid.equals(mat.group(1)))
78
- {
79
- return "[Error]: "+InputFile+" - A blank is needed between "+Pmid+" and "+mat.group(1)+".";
80
- }
81
- status = "tiabs";
82
- tiabs = true;
83
- }
84
- else if (line.contains("\t")) //Annotation
85
- {
86
- }
87
- else if(line.length()==0) //Processing
88
- {
89
- if(status.equals(""))
90
- {
91
- if(Pmid.equals(""))
92
- {
93
- return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format. PMID is empty.";
94
- }
95
- else
96
- {
97
- return "[Error]: "+InputFile+" - A redundant blank is after "+Pmid+".";
98
- }
99
- }
100
- Pmid="";
101
- status="";
102
- }
103
- }
104
- br.close();
105
- if(tiabs == false)
106
- {
107
- return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format.";
108
- }
109
- if(status.equals(""))
110
- {
111
- return "PubTator";
112
- }
113
- else
114
- {
115
- return "[Error]: "+InputFile+" - The last column missed a blank.";
116
- }
117
- }
118
- return "BioC";
119
- }
120
- public void PubTator2BioC(String input,String output) throws IOException, XMLStreamException // Input
121
- {
122
- /*
123
- * PubTator2BioC
124
- */
125
- String parser = BioCFactory.WOODSTOX;
126
- BioCFactory factory = BioCFactory.newFactory(parser);
127
- BioCDocumentWriter BioCOutputFormat = factory.createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
128
- BioCCollection biocCollection = new BioCCollection();
129
-
130
- //time
131
- ZoneId zonedId = ZoneId.of( "America/Montreal" );
132
- LocalDate today = LocalDate.now( zonedId );
133
- biocCollection.setDate(today.toString());
134
-
135
- biocCollection.setKey("BioC.key");//key
136
- biocCollection.setSource("GNormPlus");//source
137
-
138
- BioCOutputFormat.writeCollectionInfo(biocCollection);
139
- BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
140
- ArrayList<String> ParagraphType=new ArrayList<String>(); // Type: Title|Abstract
141
- ArrayList<String> ParagraphContent = new ArrayList<String>(); // Text
142
- ArrayList<String> annotations = new ArrayList<String>(); // Annotation
143
- String line;
144
- String Pmid="";
145
- while ((line = inputfile.readLine()) != null)
146
- {
147
- if(line.contains("|") && !line.contains("\t")) //Title|Abstract
148
- {
149
- String str[]=line.split("\\|",-1);
150
- Pmid=str[0];
151
- if(str[1].equals("t"))
152
- {
153
- str[1]="title";
154
- }
155
- if(str[1].equals("a"))
156
- {
157
- str[1]="abstract";
158
- }
159
- ParagraphType.add(str[1]);
160
- if(str.length==3)
161
- {
162
- String txt = str[2];
163
- txt = txt.replaceAll("ω","w");
164
- txt = txt.replaceAll("μ","u");
165
- txt = txt.replaceAll("κ","k");
166
- txt = txt.replaceAll("α","a");
167
- txt = txt.replaceAll("γ","g");
168
- txt = txt.replaceAll("ɣ","g");
169
- txt = txt.replaceAll("β","b");
170
- txt = txt.replaceAll("×","x");
171
- txt = txt.replaceAll("‑","-");
172
- txt = txt.replaceAll("¹","1");
173
- txt = txt.replaceAll("²","2");
174
- txt = txt.replaceAll("°","o");
175
- txt = txt.replaceAll("ö","o");
176
- txt = txt.replaceAll("é","e");
177
- txt = txt.replaceAll("à","a");
178
- txt = txt.replaceAll("Á","A");
179
- txt = txt.replaceAll("ε","e");
180
- txt = txt.replaceAll("θ","O");
181
- txt = txt.replaceAll("•",".");
182
- txt = txt.replaceAll("µ","u");
183
- txt = txt.replaceAll("λ","r");
184
- txt = txt.replaceAll("⁺","+");
185
- txt = txt.replaceAll("ν","v");
186
- txt = txt.replaceAll("ï","i");
187
- txt = txt.replaceAll("ã","a");
188
- txt = txt.replaceAll("≡","=");
189
- txt = txt.replaceAll("ó","o");
190
- txt = txt.replaceAll("³","3");
191
- txt = txt.replaceAll("〖","[");
192
- txt = txt.replaceAll("〗","]");
193
- txt = txt.replaceAll("Å","A");
194
- txt = txt.replaceAll("ρ","p");
195
- txt = txt.replaceAll("ü","u");
196
- txt = txt.replaceAll("ɛ","e");
197
- txt = txt.replaceAll("č","c");
198
- txt = txt.replaceAll("š","s");
199
- txt = txt.replaceAll("ß","b");
200
- txt = txt.replaceAll("═","=");
201
- txt = txt.replaceAll("£","L");
202
- txt = txt.replaceAll("Ł","L");
203
- txt = txt.replaceAll("ƒ","f");
204
- txt = txt.replaceAll("ä","a");
205
- txt = txt.replaceAll("–","-");
206
- txt = txt.replaceAll("⁻","-");
207
- txt = txt.replaceAll("〈","<");
208
- txt = txt.replaceAll("〉",">");
209
- txt = txt.replaceAll("χ","X");
210
- txt = txt.replaceAll("Đ","D");
211
- txt = txt.replaceAll("‰","%");
212
- txt = txt.replaceAll("·",".");
213
- txt = txt.replaceAll("→",">");
214
- txt = txt.replaceAll("←","<");
215
- txt = txt.replaceAll("ζ","z");
216
- txt = txt.replaceAll("π","p");
217
- txt = txt.replaceAll("τ","t");
218
- txt = txt.replaceAll("ξ","X");
219
- txt = txt.replaceAll("η","h");
220
- txt = txt.replaceAll("ø","0");
221
- txt = txt.replaceAll("Δ","D");
222
- txt = txt.replaceAll("∆","D");
223
- txt = txt.replaceAll("∑","S");
224
- txt = txt.replaceAll("Ω","O");
225
- txt = txt.replaceAll("δ","d");
226
- txt = txt.replaceAll("σ","s");
227
- txt = txt.replaceAll("Φ","F");
228
- txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
229
- ParagraphContent.add(txt);
230
- }
231
- else
232
- {
233
- ParagraphContent.add("- No text -");
234
- }
235
- }
236
- else if (line.contains("\t")) //Annotation
237
- {
238
- String anno[]=line.split("\t");
239
- if(anno.length==6)
240
- {
241
- annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]+"\t"+anno[5]);
242
- }
243
- else if(anno.length==5)
244
- {
245
- annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]);
246
- }
247
- }
248
- else if(line.length()==0) //Processing
249
- {
250
- BioCDocument biocDocument = new BioCDocument();
251
- biocDocument.setID(Pmid);
252
- int startoffset=0;
253
- for(int i=0;i<ParagraphType.size();i++)
254
- {
255
- BioCPassage biocPassage = new BioCPassage();
256
- Map<String, String> Infons = new HashMap<String, String>();
257
- Infons.put("type", ParagraphType.get(i));
258
- biocPassage.setInfons(Infons);
259
- biocPassage.setText(ParagraphContent.get(i));
260
- biocPassage.setOffset(startoffset);
261
- startoffset=startoffset+ParagraphContent.get(i).length()+1;
262
- for(int j=0;j<annotations.size();j++)
263
- {
264
- String anno[]=annotations.get(j).split("\t");
265
- if(Integer.parseInt(anno[0])<startoffset && Integer.parseInt(anno[0])>=startoffset-ParagraphContent.get(i).length()-1)
266
- {
267
- BioCAnnotation biocAnnotation = new BioCAnnotation();
268
- Map<String, String> AnnoInfons = new HashMap<String, String>();
269
- if(anno.length==5)
270
- {
271
- AnnoInfons.put("Identifier", anno[4]);
272
- }
273
- AnnoInfons.put("type", anno[3]);
274
- biocAnnotation.setInfons(AnnoInfons);
275
- BioCLocation location = new BioCLocation();
276
- location.setOffset(Integer.parseInt(anno[0]));
277
- location.setLength(Integer.parseInt(anno[1])-Integer.parseInt(anno[0]));
278
- biocAnnotation.setLocation(location);
279
- biocAnnotation.setText(anno[2]);
280
- biocPassage.addAnnotation(biocAnnotation);
281
- }
282
- }
283
- biocDocument.addPassage(biocPassage);
284
- }
285
- biocCollection.addDocument(biocDocument);
286
- ParagraphType.clear();
287
- ParagraphContent.clear();
288
- annotations.clear();
289
- BioCOutputFormat.writeDocument(biocDocument);
290
- }
291
- }
292
- BioCOutputFormat.close();
293
- inputfile.close();
294
- }
295
- public void BioC2PubTator(String input,String output) throws IOException, XMLStreamException //Output
296
- {
297
- /*
298
- * BioC2PubTator
299
- */
300
- HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
301
- boolean duplicate = false;
302
- BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
303
- ConnectorWoodstox connector = new ConnectorWoodstox();
304
- BioCCollection collection = new BioCCollection();
305
- collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
306
- while (connector.hasNext())
307
- {
308
- BioCDocument document = connector.next();
309
- String PMID = document.getID();
310
- if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
311
- else{pmidlist.put(PMID,"");}
312
- String Anno="";
313
- for (BioCPassage passage : document.getPassages())
314
- {
315
- if(passage.getInfon("type").equals("title"))
316
- {
317
- PubTatorOutputFormat.write(PMID+"|t|"+passage.getText()+"\n");
318
- }
319
- else if(passage.getInfon("type").equals("abstract"))
320
- {
321
- PubTatorOutputFormat.write(PMID+"|a|"+passage.getText()+"\n");
322
- }
323
- else
324
- {
325
- PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
326
- }
327
-
328
- for (BioCAnnotation annotation : passage.getAnnotations())
329
- {
330
- String Annotype = annotation.getInfon("type");
331
- String Annoid="";
332
- String Proteinid="";
333
- if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
334
- {
335
- if(annotation.getInfons().containsKey("NCBI Gene"))
336
- {
337
- Annoid = annotation.getInfon("NCBI Gene");
338
- String Annoidlist[]=Annoid.split(";");
339
- Annoid="";
340
- for(int x=0;x<Annoidlist.length;x++)
341
- {
342
- //Normalization2Protein
343
- String proteinid="";
344
- String homoid="";
345
-
346
- if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
347
- {
348
- proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
349
- }
350
- if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
351
- {
352
- homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
353
- }
354
-
355
- if((!proteinid.equals("")) || (!homoid.equals("")))
356
- {
357
- if(Annoid.equals(""))
358
- {
359
- Annoid=Annoidlist[x]+"(";
360
- if(!proteinid.equals(""))
361
- {
362
- Annoid=Annoid+"UniProt:"+proteinid;
363
- }
364
- if(!homoid.equals(""))
365
- {
366
- if(!proteinid.equals(""))
367
- {
368
- Annoid=Annoid+";";
369
- }
370
- Annoid=Annoid+"Homoid:"+homoid;
371
- }
372
- Annoid=Annoid+")";
373
- }
374
- else
375
- {
376
- Annoid=Annoid+";"+Annoidlist[x]+"(";
377
- if(!proteinid.equals(""))
378
- {
379
- Annoid=Annoid+"UniProt:"+proteinid;
380
- }
381
- if(!homoid.equals(""))
382
- {
383
- if(!proteinid.equals(""))
384
- {
385
- Annoid=Annoid+";";
386
- }
387
- Annoid=Annoid+"Homoid:"+homoid;
388
- }
389
- Annoid=Annoid+")";
390
- }
391
- }
392
- else
393
- {
394
- if(Annoid.equals(""))
395
- {
396
- Annoid=Annoidlist[x];
397
- }
398
- else
399
- {
400
- Annoid=Annoid+";"+Annoidlist[x];
401
- }
402
- }
403
- }
404
- }
405
- //else if(annotation.getInfons().containsKey("NCBI Homologene"))
406
- //{
407
- // Annoid = annotation.getInfon("NCBI Homologene");
408
- //}
409
- //else if(!annotation.getInfons().containsKey("FocusSpecies"))
410
- //{
411
- // Annoid = annotation.getInfon("FocusSpecies");
412
- //}
413
- else
414
- {
415
- Annoid = annotation.getInfon("Identifier");
416
- }
417
- }
418
- else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
419
- {
420
- if(annotation.getInfons().containsKey("NCBI Taxonomy"))
421
- {
422
- Annoid = annotation.getInfon("NCBI Taxonomy");
423
- }
424
- else
425
- {
426
- Annoid = annotation.getInfon("Identifier");
427
- }
428
- }
429
- else if(Annotype.equals("CellLine"))
430
- {
431
- if(annotation.getInfons().containsKey("NCBI Taxonomy"))
432
- {
433
- Annoid = annotation.getInfon("NCBI Taxonomy");
434
- }
435
- else
436
- {
437
- Annoid = annotation.getInfon("Identifier");
438
- }
439
- }
440
- else
441
- {
442
- Annoid = annotation.getInfon("Identifier");
443
- }
444
- int start = annotation.getLocations().get(0).getOffset();
445
- int last = start + annotation.getLocations().get(0).getLength();
446
- String AnnoMention=annotation.getText();
447
- if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
448
- {
449
- Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
450
- }
451
- else
452
- {
453
- Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
454
- }
455
- }
456
- }
457
- PubTatorOutputFormat.write(Anno+"\n");
458
- }
459
- PubTatorOutputFormat.close();
460
- if(duplicate == true){System.exit(0);}
461
- }
462
- public void BioC2PubTator(String original_input,String input,String output) throws IOException, XMLStreamException //Output
463
- {
464
- /* original tiabs*/
465
- BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(original_input), "UTF-8"));
466
- HashMap<String,String> ParagraphContent = new HashMap<String,String>(); // [PMID,0] -> title
467
- HashMap<String,String> annotations = new HashMap<String,String>(); // PMID ->Annotation
468
- String line;
469
- String Pmid="";
470
- int count_paragraph=0;
471
- while ((line = inputfile.readLine()) != null)
472
- {
473
- if(line.contains("|") && !line.contains("\t")) //Title|Abstract
474
- {
475
- String str[]=line.split("\\|",-1);
476
- Pmid=str[0];
477
- ParagraphContent.put(Pmid+"\t"+str[1],str[2]);
478
- count_paragraph++;
479
- }
480
- else if (line.contains("\t")) //Annotation
481
- {
482
- annotations.put(Pmid, annotations.get(Pmid)+line);
483
- }
484
- else if(line.length()==0) //Processing
485
- {
486
- count_paragraph=0;
487
- }
488
- }
489
- inputfile.close();
490
-
491
- /*
492
- * BioC2PubTator
493
- */
494
- HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
495
- boolean duplicate = false;
496
- BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
497
- ConnectorWoodstox connector = new ConnectorWoodstox();
498
- BioCCollection collection = new BioCCollection();
499
- collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
500
- while (connector.hasNext())
501
- {
502
- BioCDocument document = connector.next();
503
- String PMID = document.getID();
504
- if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
505
- else{pmidlist.put(PMID,"");}
506
- String Anno="";
507
- for (BioCPassage passage : document.getPassages())
508
- {
509
- if(passage.getInfon("type").equals("title") || passage.getInfon("type").equals("t"))
510
- {
511
- PubTatorOutputFormat.write(PMID+"|t|"+ParagraphContent.get(PMID+"\tt")+"\n");
512
- }
513
- else if(passage.getInfon("type").equals("abstract") || passage.getInfon("type").equals("a"))
514
- {
515
- PubTatorOutputFormat.write(PMID+"|a|"+ParagraphContent.get(PMID+"\ta")+"\n");
516
- }
517
- else
518
- {
519
- PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
520
- }
521
-
522
- for (BioCAnnotation annotation : passage.getAnnotations())
523
- {
524
- String Annotype = annotation.getInfon("type");
525
- String Annoid="";
526
- String Proteinid="";
527
- if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
528
- {
529
- if(annotation.getInfons().containsKey("NCBI Gene"))
530
- {
531
- Annoid = annotation.getInfon("NCBI Gene");
532
- String Annoidlist[]=Annoid.split(";");
533
- Annoid="";
534
- for(int x=0;x<Annoidlist.length;x++)
535
- {
536
- //Normalization2Protein
537
- String proteinid="";
538
- String homoid="";
539
-
540
- if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
541
- {
542
- proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
543
- }
544
- if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
545
- {
546
- homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
547
- }
548
-
549
- if((!proteinid.equals("")) || (!homoid.equals("")))
550
- {
551
- if(Annoid.equals(""))
552
- {
553
- Annoid=Annoidlist[x]+"(";
554
- if(!proteinid.equals(""))
555
- {
556
- Annoid=Annoid+"UniProt:"+proteinid;
557
- }
558
- if(!homoid.equals(""))
559
- {
560
- if(!proteinid.equals(""))
561
- {
562
- Annoid=Annoid+";";
563
- }
564
- Annoid=Annoid+"Homoid:"+homoid;
565
- }
566
- Annoid=Annoid+")";
567
- }
568
- else
569
- {
570
- Annoid=Annoid+";"+Annoidlist[x]+"(";
571
- if(!proteinid.equals(""))
572
- {
573
- Annoid=Annoid+"UniProt:"+proteinid;
574
- }
575
- if(!homoid.equals(""))
576
- {
577
- if(!proteinid.equals(""))
578
- {
579
- Annoid=Annoid+";";
580
- }
581
- Annoid=Annoid+"Homoid:"+homoid;
582
- }
583
- Annoid=Annoid+")";
584
- }
585
- }
586
- else
587
- {
588
- if(Annoid.equals(""))
589
- {
590
- Annoid=Annoidlist[x];
591
- }
592
- else
593
- {
594
- Annoid=Annoid+";"+Annoidlist[x];
595
- }
596
- }
597
- }
598
- }
599
- //else if(annotation.getInfons().containsKey("NCBI Homologene"))
600
- //{
601
- // Annoid = annotation.getInfon("NCBI Homologene");
602
- //}
603
- //else if(annotation.getInfons().containsKey("FocusSpecies"))
604
- //{
605
- // Annoid = annotation.getInfon("FocusSpecies");
606
- //}
607
- else
608
- {
609
- Annoid = annotation.getInfon("Identifier");
610
- }
611
- }
612
- else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
613
- {
614
- if(annotation.getInfons().containsKey("NCBI Taxonomy"))
615
- {
616
- Annoid = annotation.getInfon("NCBI Taxonomy");
617
- }
618
- else
619
- {
620
- Annoid = annotation.getInfon("Identifier");
621
- }
622
- }
623
- else if(Annotype.equals("CellLine"))
624
- {
625
- if(annotation.getInfons().containsKey("NCBI Taxonomy"))
626
- {
627
- Annoid = annotation.getInfon("NCBI Taxonomy");
628
- }
629
- else
630
- {
631
- Annoid = annotation.getInfon("Identifier");
632
- }
633
- }
634
- else
635
- {
636
- if(annotation.getInfons().containsKey("Identifier"))
637
- {
638
- Annoid = annotation.getInfon("Identifier");
639
- }
640
- else
641
- {
642
- Annoid = "";
643
- }
644
- }
645
- int start = annotation.getLocations().get(0).getOffset();
646
- int last = start + annotation.getLocations().get(0).getLength();
647
- String AnnoMention=annotation.getText();
648
- if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
649
- {
650
- Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
651
- }
652
- else
653
- {
654
- Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
655
- }
656
- }
657
- }
658
- PubTatorOutputFormat.write(Anno+"\n");
659
- }
660
- PubTatorOutputFormat.close();
661
- if(duplicate == true){System.exit(0);}
662
- }
663
- public void BioCReader(String input) throws IOException, XMLStreamException
664
- {
665
- ConnectorWoodstox connector = new ConnectorWoodstox();
666
- BioCCollection collection = new BioCCollection();
667
- collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
668
-
669
- /*
670
- * Per document
671
- */
672
- while (connector.hasNext())
673
- {
674
- BioCDocument document = connector.next();
675
- PMIDs.add(document.getID());
676
-
677
- ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
678
- ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
679
- ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
680
- ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
681
-
682
- /*
683
- * Per Passage
684
- */
685
- for (BioCPassage passage : document.getPassages())
686
- {
687
- PassageName.add(passage.getInfon("type")); //Paragraph
688
- String txt = passage.getText();
689
- if(txt.matches("[\t ]+"))
690
- {
691
- txt = txt.replaceAll(".","@");
692
- }
693
- else
694
- {
695
- //if(passage.getInfon("type").toLowerCase().equals("table"))
696
- //{
697
- // txt=txt.replaceAll(" ", "|");
698
- //}
699
- txt = txt.replaceAll("ω","w");
700
- txt = txt.replaceAll("μ","u");
701
- txt = txt.replaceAll("κ","k");
702
- txt = txt.replaceAll("α","a");
703
- txt = txt.replaceAll("γ","g");
704
- txt = txt.replaceAll("ɣ","g");
705
- txt = txt.replaceAll("β","b");
706
- txt = txt.replaceAll("×","x");
707
- txt = txt.replaceAll("‑","-");
708
- txt = txt.replaceAll("¹","1");
709
- txt = txt.replaceAll("²","2");
710
- txt = txt.replaceAll("°","o");
711
- txt = txt.replaceAll("ö","o");
712
- txt = txt.replaceAll("é","e");
713
- txt = txt.replaceAll("à","a");
714
- txt = txt.replaceAll("Á","A");
715
- txt = txt.replaceAll("ε","e");
716
- txt = txt.replaceAll("θ","O");
717
- txt = txt.replaceAll("•",".");
718
- txt = txt.replaceAll("µ","u");
719
- txt = txt.replaceAll("λ","r");
720
- txt = txt.replaceAll("⁺","+");
721
- txt = txt.replaceAll("ν","v");
722
- txt = txt.replaceAll("ï","i");
723
- txt = txt.replaceAll("ã","a");
724
- txt = txt.replaceAll("≡","=");
725
- txt = txt.replaceAll("ó","o");
726
- txt = txt.replaceAll("³","3");
727
- txt = txt.replaceAll("〖","[");
728
- txt = txt.replaceAll("〗","]");
729
- txt = txt.replaceAll("Å","A");
730
- txt = txt.replaceAll("ρ","p");
731
- txt = txt.replaceAll("ü","u");
732
- txt = txt.replaceAll("ɛ","e");
733
- txt = txt.replaceAll("č","c");
734
- txt = txt.replaceAll("š","s");
735
- txt = txt.replaceAll("ß","b");
736
- txt = txt.replaceAll("═","=");
737
- txt = txt.replaceAll("£","L");
738
- txt = txt.replaceAll("Ł","L");
739
- txt = txt.replaceAll("ƒ","f");
740
- txt = txt.replaceAll("ä","a");
741
- txt = txt.replaceAll("–","-");
742
- txt = txt.replaceAll("⁻","-");
743
- txt = txt.replaceAll("〈","<");
744
- txt = txt.replaceAll("〉",">");
745
- txt = txt.replaceAll("χ","X");
746
- txt = txt.replaceAll("Đ","D");
747
- txt = txt.replaceAll("‰","%");
748
- txt = txt.replaceAll("·",".");
749
- txt = txt.replaceAll("→",">");
750
- txt = txt.replaceAll("←","<");
751
- txt = txt.replaceAll("ζ","z");
752
- txt = txt.replaceAll("π","p");
753
- txt = txt.replaceAll("τ","t");
754
- txt = txt.replaceAll("ξ","X");
755
- txt = txt.replaceAll("η","h");
756
- txt = txt.replaceAll("ø","0");
757
- txt = txt.replaceAll("Δ","D");
758
- txt = txt.replaceAll("∆","D");
759
- txt = txt.replaceAll("∑","S");
760
- txt = txt.replaceAll("Ω","O");
761
- txt = txt.replaceAll("δ","d");
762
- txt = txt.replaceAll("σ","s");
763
- txt = txt.replaceAll("Φ","F");
764
- //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
765
- }
766
- if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
767
- {
768
- PassageContext.add("-notext-"); //Context
769
- }
770
- else
771
- {
772
- PassageContext.add(txt); //Context
773
- }
774
- PassageOffset.add(passage.getOffset()); //Offset
775
- ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
776
- AnnotationInPMID.add(AnnotationInPassage);
777
- }
778
- PassageNames.add(PassageName);
779
- PassageContexts.add(PassageContext);
780
- PassageOffsets.add(PassageOffset);
781
- Annotations.add(AnnotationInPMID);
782
- }
783
- }
784
- public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException
785
- {
786
- ConnectorWoodstox connector = new ConnectorWoodstox();
787
- BioCCollection collection = new BioCCollection();
788
- collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
789
-
790
- /*
791
- * Per document
792
- */
793
- while (connector.hasNext())
794
- {
795
- BioCDocument document = connector.next();
796
- PMIDs.add(document.getID());
797
-
798
- ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
799
- ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
800
- ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
801
- ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
802
-
803
- /*
804
- * Per Passage
805
- */
806
- for (BioCPassage passage : document.getPassages())
807
- {
808
- PassageName.add(passage.getInfon("type")); //Paragraph
809
-
810
- String txt = passage.getText();
811
- if(txt.matches("[\t ]+"))
812
- {
813
- txt = txt.replaceAll(".","@");
814
- }
815
- else
816
- {
817
- //if(passage.getInfon("type").toLowerCase().equals("table"))
818
- //{
819
- // txt=txt.replaceAll(" ", "|");
820
- //}
821
- txt = txt.replaceAll("ω","w");
822
- txt = txt.replaceAll("μ","u");
823
- txt = txt.replaceAll("κ","k");
824
- txt = txt.replaceAll("α","a");
825
- txt = txt.replaceAll("γ","g");
826
- txt = txt.replaceAll("ɣ","g");
827
- txt = txt.replaceAll("β","b");
828
- txt = txt.replaceAll("×","x");
829
- txt = txt.replaceAll("‑","-");
830
- txt = txt.replaceAll("¹","1");
831
- txt = txt.replaceAll("²","2");
832
- txt = txt.replaceAll("°","o");
833
- txt = txt.replaceAll("ö","o");
834
- txt = txt.replaceAll("é","e");
835
- txt = txt.replaceAll("à","a");
836
- txt = txt.replaceAll("Á","A");
837
- txt = txt.replaceAll("ε","e");
838
- txt = txt.replaceAll("θ","O");
839
- txt = txt.replaceAll("•",".");
840
- txt = txt.replaceAll("µ","u");
841
- txt = txt.replaceAll("λ","r");
842
- txt = txt.replaceAll("⁺","+");
843
- txt = txt.replaceAll("ν","v");
844
- txt = txt.replaceAll("ï","i");
845
- txt = txt.replaceAll("ã","a");
846
- txt = txt.replaceAll("≡","=");
847
- txt = txt.replaceAll("ó","o");
848
- txt = txt.replaceAll("³","3");
849
- txt = txt.replaceAll("〖","[");
850
- txt = txt.replaceAll("〗","]");
851
- txt = txt.replaceAll("Å","A");
852
- txt = txt.replaceAll("ρ","p");
853
- txt = txt.replaceAll("ü","u");
854
- txt = txt.replaceAll("ɛ","e");
855
- txt = txt.replaceAll("č","c");
856
- txt = txt.replaceAll("š","s");
857
- txt = txt.replaceAll("ß","b");
858
- txt = txt.replaceAll("═","=");
859
- txt = txt.replaceAll("£","L");
860
- txt = txt.replaceAll("Ł","L");
861
- txt = txt.replaceAll("ƒ","f");
862
- txt = txt.replaceAll("ä","a");
863
- txt = txt.replaceAll("–","-");
864
- txt = txt.replaceAll("⁻","-");
865
- txt = txt.replaceAll("〈","<");
866
- txt = txt.replaceAll("〉",">");
867
- txt = txt.replaceAll("χ","X");
868
- txt = txt.replaceAll("Đ","D");
869
- txt = txt.replaceAll("‰","%");
870
- txt = txt.replaceAll("·",".");
871
- txt = txt.replaceAll("→",">");
872
- txt = txt.replaceAll("←","<");
873
- txt = txt.replaceAll("ζ","z");
874
- txt = txt.replaceAll("π","p");
875
- txt = txt.replaceAll("τ","t");
876
- txt = txt.replaceAll("ξ","X");
877
- txt = txt.replaceAll("η","h");
878
- txt = txt.replaceAll("ø","0");
879
- txt = txt.replaceAll("Δ","D");
880
- txt = txt.replaceAll("∆","D");
881
- txt = txt.replaceAll("∑","S");
882
- txt = txt.replaceAll("Ω","O");
883
- txt = txt.replaceAll("δ","d");
884
- txt = txt.replaceAll("σ","s");
885
- txt = txt.replaceAll("Φ","F");
886
- //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
887
- }
888
- if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
889
- {
890
- PassageContext.add("-notext-"); //Context
891
- }
892
- else
893
- {
894
- PassageContext.add(txt); //Context
895
- }
896
- PassageOffset.add(passage.getOffset()); //Offset
897
- ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
898
-
899
- /*
900
- * Per Annotation :
901
- * start
902
- * last
903
- * mention
904
- * type
905
- * id
906
- */
907
- for (BioCAnnotation Anno : passage.getAnnotations())
908
- {
909
- int start = Anno.getLocations().get(0).getOffset()-passage.getOffset(); // start
910
- int last = start + Anno.getLocations().get(0).getLength(); // last
911
- String AnnoMention=Anno.getText(); // mention
912
- String Annotype = Anno.getInfon("type"); // type
913
- String Annoid = Anno.getInfon("Identifier"); // identifier | MESH
914
- if(Annoid == null)
915
- {
916
- Annoid = Anno.getInfon("Identifier"); // identifier | MESH
917
- }
918
- if(Annoid == null || Annoid.equals("null"))
919
- {
920
- AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype); //paragraph
921
- }
922
- else
923
- {
924
- AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid); //paragraph
925
- }
926
- }
927
- AnnotationInPMID.add(AnnotationInPassage);
928
- }
929
- PassageNames.add(PassageName);
930
- PassageContexts.add(PassageContext);
931
- PassageOffsets.add(PassageOffset);
932
- Annotations.add(AnnotationInPMID);
933
- }
934
- }
935
- public void BioCOutput(String input,String output, ArrayList<ArrayList<ArrayList<String>>> Annotations,boolean Final,boolean RemovePreviousAnno) throws IOException, XMLStreamException
936
- {
937
- boolean ShowUnNormalizedMention = false;
938
- if(GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True"))
939
- {
940
- ShowUnNormalizedMention = true;
941
- }
942
-
943
- BioCDocumentWriter BioCOutputFormat = BioCFactory.newFactory(BioCFactory.WOODSTOX).createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
944
- BioCCollection biocCollection_input = new BioCCollection();
945
- BioCCollection biocCollection_output = new BioCCollection();
946
-
947
- //input: BioC
948
- ConnectorWoodstox connector = new ConnectorWoodstox();
949
- biocCollection_input = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
950
- BioCOutputFormat.writeCollectionInfo(biocCollection_input);
951
- int i=0; //count for pmid
952
- while (connector.hasNext())
953
- {
954
- BioCDocument document_output = new BioCDocument();
955
- BioCDocument document_input = connector.next();
956
- String PMID=document_input.getID();
957
- document_output.setID(PMID);
958
- int annotation_count=0;
959
- int j=0; //count for paragraph
960
- for (BioCPassage passage_input : document_input.getPassages())
961
- {
962
- BioCPassage passage_output = passage_input;
963
-
964
- if(RemovePreviousAnno == true) //clean the previous annotation, if the NER result is provided
965
- {
966
- passage_output.clearAnnotations();
967
- }
968
- else
969
- {
970
- for (BioCAnnotation annotation : passage_output.getAnnotations())
971
- {
972
- annotation.setID(""+annotation_count);
973
- annotation_count++;
974
- }
975
- }
976
-
977
- int passage_Offset = passage_input.getOffset();
978
- String passage_Text = passage_input.getText();
979
- ArrayList<String> AnnotationInPassage = new ArrayList<String>();
980
- //ArrayList<String> AnnotationInPassage = Annotations.get(i).get(j);
981
- if(Annotations.size()>i && Annotations.get(i).size()>j)
982
- {
983
- for(int a=0;a<Annotations.get(i).get(j).size();a++)
984
- {
985
- String Anno[]=Annotations.get(i).get(j).get(a).split("\\t");
986
- int start = Integer.parseInt(Anno[0]);
987
- int last = Integer.parseInt(Anno[1]);
988
- boolean found = false;
989
- if(passage_Text.length()>last)
990
- {
991
- String mention = Anno[2];
992
- if(Final == true && passage_Text.length()>=last)
993
- {
994
- mention = passage_Text.substring(start, last);
995
- }
996
- if(mention.matches(".*\t.*"))
997
- {
998
- Anno[3]=Anno[4];
999
- if(Anno.length>=6)
1000
- {
1001
- Anno[4]=Anno[5];
1002
- }
1003
- }
1004
- String type = Anno[3];
1005
- String id = ""; // optional
1006
- if(Anno.length>=5){id = Anno[4];}
1007
- if(Final == true)
1008
- {
1009
- for(int b=0;b<AnnotationInPassage.size();b++)
1010
- {
1011
- String Annob[]=AnnotationInPassage.get(b).split("\\t");
1012
- int startb = Integer.parseInt(Annob[0]);
1013
- int lastb = Integer.parseInt(Annob[1]);
1014
- String mentionb = Annob[2];
1015
- if(Final == true && passage_Text.length()>=lastb)
1016
- {
1017
- mentionb = passage_Text.substring(startb, lastb);
1018
- }
1019
- if(mentionb.matches(".*\t.*"))
1020
- {
1021
- Annob[3]=Annob[4];
1022
- if(Annob.length>=6)
1023
- {
1024
- Annob[4]=Annob[5];
1025
- }
1026
- }
1027
- String typeb = Annob[3];
1028
- String idb = ""; // optional
1029
- if(Annob.length>=5){idb = Annob[4];}
1030
-
1031
- if(start == startb && last == lastb && type.equals(typeb))
1032
- {
1033
- found = true;
1034
- if(id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals("")))
1035
- {
1036
- }
1037
- else if(idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals("")))
1038
- {
1039
- AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id);
1040
- }
1041
- else
1042
- {
1043
- if(id.equals(""))
1044
- {
1045
- }
1046
- else
1047
- {
1048
- AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+idb+";"+id);
1049
- }
1050
-
1051
- }
1052
- break;
1053
- }
1054
- }
1055
- }
1056
- }
1057
- if(found == false)
1058
- {
1059
- AnnotationInPassage.add(Annotations.get(i).get(j).get(a));
1060
- }
1061
- }
1062
- }
1063
- for(int a=0;a<AnnotationInPassage.size();a++)
1064
- {
1065
- String Anno[]=AnnotationInPassage.get(a).split("\\t");
1066
- HashMap <String,String> id_hash = new HashMap <String,String>();
1067
- if(Anno.length>=5)
1068
- {
1069
- int start = Integer.parseInt(Anno[0]);
1070
- int last = Integer.parseInt(Anno[1]);
1071
- String mention = Anno[2];
1072
- if(Final == true && passage_Text.length()>=last)
1073
- {
1074
- mention = passage_Text.substring(start, last);
1075
- }
1076
- if(mention.matches(".*\t.*"))
1077
- {
1078
- Anno[3]=Anno[4];
1079
- if(Anno.length>=6)
1080
- {
1081
- Anno[4]=Anno[5];
1082
- }
1083
- }
1084
- String ids = Anno[4];
1085
- String idlist[]=ids.split(",");
1086
- for(int b=0;b<idlist.length;b++)
1087
- {
1088
- id_hash.put(idlist[b], "");
1089
- }
1090
- ids = "";
1091
- for(String id :id_hash.keySet())
1092
- {
1093
- if(ids.equals(""))
1094
- {
1095
- ids = id;
1096
- }
1097
- else
1098
- {
1099
- ids = ids + ";" + id;
1100
- }
1101
- }
1102
- AnnotationInPassage.set(a, Anno[0]+"\t"+Anno[1]+"\t"+Anno[2]+"\t"+Anno[3]+"\t"+ids);
1103
- }
1104
- }
1105
-
1106
- for(int a=0;a<AnnotationInPassage.size();a++)
1107
- {
1108
- String Anno[]=AnnotationInPassage.get(a).split("\\t");
1109
- int start = Integer.parseInt(Anno[0]);
1110
- int last = Integer.parseInt(Anno[1]);
1111
- if(passage_Text.length()>last)
1112
- {
1113
- String mention = Anno[2];
1114
- if(Final == true && passage_Text.length()>=last)
1115
- {
1116
- mention = passage_Text.substring(start, last);
1117
- }
1118
- if(mention.matches(".*\t.*"))
1119
- {
1120
- Anno[3]=Anno[4];
1121
- if(Anno.length>=6)
1122
- {
1123
- Anno[4]=Anno[5];
1124
- }
1125
- }
1126
- String type = Anno[3];
1127
- if(type.equals("GeneID")){type="Gene";}
1128
- BioCAnnotation biocAnnotation = new BioCAnnotation();
1129
- Map<String, String> AnnoInfons = new HashMap<String, String>();
1130
- AnnoInfons.put("type", type);
1131
- if(Anno.length>=5)
1132
- {
1133
- String identifier = Anno[4];
1134
- if(Final == true && ShowUnNormalizedMention==false)
1135
- {
1136
- if(type.matches("(FamilyName|Domain|Gene)"))
1137
- {
1138
- Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
1139
- Matcher mtmp0 = ptmp0.matcher(identifier);
1140
- Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
1141
- Matcher mtmp1 = ptmp1.matcher(identifier);
1142
- Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$");
1143
- Matcher mtmp2 = ptmp2.matcher(identifier);
1144
- Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$");
1145
- Matcher mtmp3 = ptmp3.matcher(identifier);
1146
- if(mtmp0.find())
1147
- {
1148
- String Method_SA = mtmp0.group(1);
1149
- String TaxonomyID = mtmp0.group(2);
1150
- String NCBIGeneID = mtmp0.group(3);
1151
- if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
1152
- {
1153
- AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
1154
- }
1155
- if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
1156
- {
1157
- AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
1158
- }
1159
- AnnoInfons.put("NCBI Gene", NCBIGeneID);
1160
- }
1161
- else if(mtmp1.find())
1162
- {
1163
- String Method_SA = mtmp1.group(1);
1164
- String TaxonomyID = mtmp1.group(2);
1165
- String NCBIGeneID = mtmp1.group(3);
1166
- String HomoID = mtmp1.group(4);
1167
- if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
1168
- {
1169
- AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
1170
- }
1171
- if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
1172
- {
1173
- AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
1174
- }
1175
- AnnoInfons.put("NCBI Gene", NCBIGeneID);
1176
- }
1177
- else if(mtmp2.find())
1178
- {
1179
- String Method_SA = mtmp2.group(1);
1180
- String TaxonomyID = mtmp2.group(2);
1181
- AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:"+TaxonomyID);
1182
- }
1183
- else if(mtmp3.find())
1184
- {
1185
- String Method_SA = mtmp3.group(1);
1186
- String HomoID = mtmp3.group(2);
1187
- AnnoInfons.put("NCBI Homologene", HomoID);
1188
- }
1189
- else
1190
- {
1191
- String identifiers[] = identifier.split(";");
1192
- if(identifiers.length>1)
1193
- {
1194
- ArrayList<String> identifierSTR = new ArrayList<String>();
1195
- ArrayList<String> ProteinidSTR = new ArrayList<String>();
1196
- ArrayList<String> HomoidSTR = new ArrayList<String>();
1197
- for(int idi=0;idi<identifiers.length;idi++)
1198
- {
1199
- Pattern ptmp4 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
1200
- Matcher mtmp4 = ptmp4.matcher(identifiers[idi]);
1201
- Pattern ptmp5 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
1202
- Matcher mtmp5 = ptmp5.matcher(identifiers[idi]);
1203
- if(mtmp4.find())
1204
- {
1205
- String Method_SA = mtmp4.group(1);
1206
- String TaxonomyID = mtmp4.group(2);
1207
- String NCBIGeneID = mtmp4.group(3);
1208
- String HomoID = mtmp4.group(4);
1209
- if(!identifierSTR.contains(NCBIGeneID))
1210
- {
1211
- identifierSTR.add(NCBIGeneID);
1212
- }
1213
- if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
1214
- {
1215
- if(!ProteinidSTR.contains(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)))
1216
- {
1217
- ProteinidSTR.add(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
1218
- }
1219
- }
1220
- if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
1221
- {
1222
- if(!HomoidSTR.contains(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)))
1223
- {
1224
- HomoidSTR.add(GNormPlus.HomologeneID_hash.get(NCBIGeneID));
1225
- }
1226
- }
1227
-
1228
- }
1229
- else if(mtmp5.find())
1230
- {
1231
- String Method_SA = mtmp5.group(1);
1232
- String TaxonomyID = mtmp5.group(2);
1233
- String NCBIGeneID = mtmp5.group(3);
1234
- if(!identifierSTR.contains(NCBIGeneID))
1235
- {
1236
- identifierSTR.add(NCBIGeneID);
1237
- }
1238
- }
1239
- }
1240
- String idSTR="";
1241
- for(int x=0;x<identifierSTR.size();x++)
1242
- {
1243
- if(idSTR.equals(""))
1244
- {
1245
- idSTR = identifierSTR.get(x);
1246
- }
1247
- else
1248
- {
1249
- idSTR = idSTR+";"+identifierSTR.get(x);
1250
- }
1251
- }
1252
- AnnoInfons.put("NCBI Gene", idSTR);
1253
-
1254
- String pidSTR="";
1255
- for(int x=0;x<ProteinidSTR.size();x++)
1256
- {
1257
- if(pidSTR.equals(""))
1258
- {
1259
- pidSTR = ProteinidSTR.get(x);
1260
- }
1261
- else
1262
- {
1263
- pidSTR = pidSTR+";"+ProteinidSTR.get(x);
1264
- }
1265
- }
1266
- if(!pidSTR.equals(""))
1267
- {
1268
- AnnoInfons.put("UniProt", pidSTR);
1269
- }
1270
-
1271
- String hidSTR="";
1272
- for(int x=0;x<HomoidSTR.size();x++)
1273
- {
1274
- if(hidSTR.equals(""))
1275
- {
1276
- hidSTR = HomoidSTR.get(x);
1277
- }
1278
- else
1279
- {
1280
- hidSTR = hidSTR+";"+HomoidSTR.get(x);
1281
- }
1282
- }
1283
- if(!hidSTR.equals(""))
1284
- {
1285
- AnnoInfons.put("NCBI Homologene", hidSTR);
1286
- }
1287
- }
1288
- //else
1289
- //{
1290
- // AnnoInfons.put("Identifier", identifier);
1291
- //}
1292
- }
1293
- }
1294
- else if (type.matches("(Species|Genus|Strain)"))
1295
- {
1296
- AnnoInfons.put("type", type);
1297
- AnnoInfons.put("NCBI Taxonomy", identifier);
1298
- }
1299
- else if (type.matches("Cell"))
1300
- {
1301
- AnnoInfons.put("type", "CellLine");
1302
- AnnoInfons.put("NCBI Taxonomy", identifier);
1303
- }
1304
- else
1305
- {
1306
- AnnoInfons.put("Identifier", identifier);
1307
- }
1308
- }
1309
- else
1310
- {
1311
- AnnoInfons.put("Identifier", identifier);
1312
- }
1313
- }
1314
- biocAnnotation.setInfons(AnnoInfons);
1315
- BioCLocation location = new BioCLocation();
1316
- location.setOffset(start+passage_Offset);
1317
- location.setLength(last-start);
1318
- biocAnnotation.setLocation(location);
1319
- biocAnnotation.setText(mention);
1320
- biocAnnotation.setID(""+annotation_count);
1321
- annotation_count++;
1322
- if(Final == true)
1323
- {
1324
- if(AnnoInfons.containsKey("Identifier") || AnnoInfons.containsKey("NCBI Homologene") || AnnoInfons.containsKey("NCBI Gene") || AnnoInfons.containsKey("NCBI Taxonomy"))
1325
- {
1326
- passage_output.addAnnotation(biocAnnotation);
1327
- }
1328
- }
1329
- else
1330
- {
1331
- passage_output.addAnnotation(biocAnnotation);
1332
- }
1333
- }
1334
- }
1335
- document_output.addPassage(passage_output);
1336
- j++;
1337
- }
1338
- biocCollection_output.addDocument(document_output);
1339
- BioCOutputFormat.writeDocument(document_output);
1340
- i++;
1341
- }
1342
- BioCOutputFormat.close();
1343
- }
1344
  }
 
1
+ /**
2
+ * Project: GNormPlus
3
+ * Function: Data storage in BioC format
4
+ */
5
+
6
+ package GNormPluslib;
7
+
8
+ import bioc.BioCAnnotation;
9
+ import bioc.BioCCollection;
10
+ import bioc.BioCDocument;
11
+ import bioc.BioCLocation;
12
+ import bioc.BioCPassage;
13
+
14
+ import bioc.io.BioCDocumentWriter;
15
+ import bioc.io.BioCFactory;
16
+ import bioc.io.woodstox.ConnectorWoodstox;
17
+ import java.io.BufferedReader;
18
+ import java.io.BufferedWriter;
19
+ import java.io.FileInputStream;
20
+ import java.io.FileNotFoundException;
21
+ import java.io.FileOutputStream;
22
+ import java.io.FileReader;
23
+ import java.io.FileWriter;
24
+ import java.io.IOException;
25
+ import java.io.InputStreamReader;
26
+ import java.io.OutputStreamWriter;
27
+ import java.io.UnsupportedEncodingException;
28
+ import java.time.LocalDate;
29
+ import java.time.ZoneId;
30
+
31
+ import javax.xml.stream.XMLStreamException;
32
+
33
+ import java.util.Map;
34
+ import java.util.regex.Matcher;
35
+ import java.util.regex.Pattern;
36
+ import java.util.ArrayList;
37
+ import java.util.HashMap;
38
+ import java.util.List;
39
+
40
+ public class BioCDoc
41
+ {
42
+ /*
43
+ * Contexts in BioC file
44
+ */
45
+ public ArrayList<String> PMIDs=new ArrayList<String>(); // Type: PMIDs
46
+ public ArrayList<ArrayList<String>> PassageNames = new ArrayList(); // PassageName
47
+ public ArrayList<ArrayList<Integer>> PassageOffsets = new ArrayList(); // PassageOffset
48
+ public ArrayList<ArrayList<String>> PassageContexts = new ArrayList(); // PassageContext
49
+ public ArrayList<ArrayList<ArrayList<String>>> Annotations = new ArrayList(); // Annotation - GNormPlus
50
+
51
+ public String BioCFormatCheck(String InputFile) throws IOException
52
+ {
53
+
54
+ ConnectorWoodstox connector = new ConnectorWoodstox();
55
+ BioCCollection collection = new BioCCollection();
56
+ try
57
+ {
58
+ collection = connector.startRead(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
59
+ }
60
+ catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e)
61
+ {
62
+ BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8"));
63
+ String line="";
64
+ String status="";
65
+ String Pmid = "";
66
+ boolean tiabs=false;
67
+ Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$");
68
+ while ((line = br.readLine()) != null)
69
+ {
70
+ Matcher mat = patt.matcher(line);
71
+ if(mat.find()) //Title|Abstract
72
+ {
73
+ if(Pmid.equals(""))
74
+ {
75
+ Pmid = mat.group(1);
76
+ }
77
+ else if(!Pmid.equals(mat.group(1)))
78
+ {
79
+ return "[Error]: "+InputFile+" - A blank is needed between "+Pmid+" and "+mat.group(1)+".";
80
+ }
81
+ status = "tiabs";
82
+ tiabs = true;
83
+ }
84
+ else if (line.contains("\t")) //Annotation
85
+ {
86
+ }
87
+ else if(line.length()==0) //Processing
88
+ {
89
+ if(status.equals(""))
90
+ {
91
+ if(Pmid.equals(""))
92
+ {
93
+ return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format. PMID is empty.";
94
+ }
95
+ else
96
+ {
97
+ return "[Error]: "+InputFile+" - A redundant blank is after "+Pmid+".";
98
+ }
99
+ }
100
+ Pmid="";
101
+ status="";
102
+ }
103
+ }
104
+ br.close();
105
+ if(tiabs == false)
106
+ {
107
+ return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format.";
108
+ }
109
+ if(status.equals(""))
110
+ {
111
+ return "PubTator";
112
+ }
113
+ else
114
+ {
115
+ return "[Error]: "+InputFile+" - The last column missed a blank.";
116
+ }
117
+ }
118
+ return "BioC";
119
+ }
120
+ public void PubTator2BioC(String input,String output) throws IOException, XMLStreamException // Input
121
+ {
122
+ /*
123
+ * PubTator2BioC
124
+ */
125
+ String parser = BioCFactory.WOODSTOX;
126
+ BioCFactory factory = BioCFactory.newFactory(parser);
127
+ BioCDocumentWriter BioCOutputFormat = factory.createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
128
+ BioCCollection biocCollection = new BioCCollection();
129
+
130
+ //time
131
+ ZoneId zonedId = ZoneId.of( "America/Montreal" );
132
+ LocalDate today = LocalDate.now( zonedId );
133
+ biocCollection.setDate(today.toString());
134
+
135
+ biocCollection.setKey("BioC.key");//key
136
+ biocCollection.setSource("GNormPlus");//source
137
+
138
+ BioCOutputFormat.writeCollectionInfo(biocCollection);
139
+ BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
140
+ ArrayList<String> ParagraphType=new ArrayList<String>(); // Type: Title|Abstract
141
+ ArrayList<String> ParagraphContent = new ArrayList<String>(); // Text
142
+ ArrayList<String> annotations = new ArrayList<String>(); // Annotation
143
+ String line;
144
+ String Pmid="";
145
+ while ((line = inputfile.readLine()) != null)
146
+ {
147
+ if(line.contains("|") && !line.contains("\t")) //Title|Abstract
148
+ {
149
+ String str[]=line.split("\\|",-1);
150
+ Pmid=str[0];
151
+ if(str[1].equals("t"))
152
+ {
153
+ str[1]="title";
154
+ }
155
+ if(str[1].equals("a"))
156
+ {
157
+ str[1]="abstract";
158
+ }
159
+ ParagraphType.add(str[1]);
160
+ if(str.length==3)
161
+ {
162
+ String txt = str[2];
163
+ txt = txt.replaceAll("ω","w");
164
+ txt = txt.replaceAll("μ","u");
165
+ txt = txt.replaceAll("κ","k");
166
+ txt = txt.replaceAll("α","a");
167
+ txt = txt.replaceAll("γ","g");
168
+ txt = txt.replaceAll("ɣ","g");
169
+ txt = txt.replaceAll("β","b");
170
+ txt = txt.replaceAll("×","x");
171
+ txt = txt.replaceAll("‑","-");
172
+ txt = txt.replaceAll("¹","1");
173
+ txt = txt.replaceAll("²","2");
174
+ txt = txt.replaceAll("°","o");
175
+ txt = txt.replaceAll("ö","o");
176
+ txt = txt.replaceAll("é","e");
177
+ txt = txt.replaceAll("à","a");
178
+ txt = txt.replaceAll("Á","A");
179
+ txt = txt.replaceAll("ε","e");
180
+ txt = txt.replaceAll("θ","O");
181
+ txt = txt.replaceAll("•",".");
182
+ txt = txt.replaceAll("µ","u");
183
+ txt = txt.replaceAll("λ","r");
184
+ txt = txt.replaceAll("⁺","+");
185
+ txt = txt.replaceAll("ν","v");
186
+ txt = txt.replaceAll("ï","i");
187
+ txt = txt.replaceAll("ã","a");
188
+ txt = txt.replaceAll("≡","=");
189
+ txt = txt.replaceAll("ó","o");
190
+ txt = txt.replaceAll("³","3");
191
+ txt = txt.replaceAll("〖","[");
192
+ txt = txt.replaceAll("〗","]");
193
+ txt = txt.replaceAll("Å","A");
194
+ txt = txt.replaceAll("ρ","p");
195
+ txt = txt.replaceAll("ü","u");
196
+ txt = txt.replaceAll("ɛ","e");
197
+ txt = txt.replaceAll("č","c");
198
+ txt = txt.replaceAll("š","s");
199
+ txt = txt.replaceAll("ß","b");
200
+ txt = txt.replaceAll("═","=");
201
+ txt = txt.replaceAll("£","L");
202
+ txt = txt.replaceAll("Ł","L");
203
+ txt = txt.replaceAll("ƒ","f");
204
+ txt = txt.replaceAll("ä","a");
205
+ txt = txt.replaceAll("–","-");
206
+ txt = txt.replaceAll("⁻","-");
207
+ txt = txt.replaceAll("〈","<");
208
+ txt = txt.replaceAll("〉",">");
209
+ txt = txt.replaceAll("χ","X");
210
+ txt = txt.replaceAll("Đ","D");
211
+ txt = txt.replaceAll("‰","%");
212
+ txt = txt.replaceAll("·",".");
213
+ txt = txt.replaceAll("→",">");
214
+ txt = txt.replaceAll("←","<");
215
+ txt = txt.replaceAll("ζ","z");
216
+ txt = txt.replaceAll("π","p");
217
+ txt = txt.replaceAll("τ","t");
218
+ txt = txt.replaceAll("ξ","X");
219
+ txt = txt.replaceAll("η","h");
220
+ txt = txt.replaceAll("ø","0");
221
+ txt = txt.replaceAll("Δ","D");
222
+ txt = txt.replaceAll("∆","D");
223
+ txt = txt.replaceAll("∑","S");
224
+ txt = txt.replaceAll("Ω","O");
225
+ txt = txt.replaceAll("δ","d");
226
+ txt = txt.replaceAll("σ","s");
227
+ txt = txt.replaceAll("Φ","F");
228
+ txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
229
+ ParagraphContent.add(txt);
230
+ }
231
+ else
232
+ {
233
+ ParagraphContent.add("- No text -");
234
+ }
235
+ }
236
+ else if (line.contains("\t")) //Annotation
237
+ {
238
+ String anno[]=line.split("\t");
239
+ if(anno.length==6)
240
+ {
241
+ annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]+"\t"+anno[5]);
242
+ }
243
+ else if(anno.length==5)
244
+ {
245
+ annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]);
246
+ }
247
+ }
248
+ else if(line.length()==0) //Processing
249
+ {
250
+ BioCDocument biocDocument = new BioCDocument();
251
+ biocDocument.setID(Pmid);
252
+ int startoffset=0;
253
+ for(int i=0;i<ParagraphType.size();i++)
254
+ {
255
+ BioCPassage biocPassage = new BioCPassage();
256
+ Map<String, String> Infons = new HashMap<String, String>();
257
+ Infons.put("type", ParagraphType.get(i));
258
+ biocPassage.setInfons(Infons);
259
+ biocPassage.setText(ParagraphContent.get(i));
260
+ biocPassage.setOffset(startoffset);
261
+ startoffset=startoffset+ParagraphContent.get(i).length()+1;
262
+ for(int j=0;j<annotations.size();j++)
263
+ {
264
+ String anno[]=annotations.get(j).split("\t");
265
+ if(Integer.parseInt(anno[0])<startoffset && Integer.parseInt(anno[0])>=startoffset-ParagraphContent.get(i).length()-1)
266
+ {
267
+ BioCAnnotation biocAnnotation = new BioCAnnotation();
268
+ Map<String, String> AnnoInfons = new HashMap<String, String>();
269
+ if(anno.length==5)
270
+ {
271
+ AnnoInfons.put("Identifier", anno[4]);
272
+ }
273
+ AnnoInfons.put("type", anno[3]);
274
+ biocAnnotation.setInfons(AnnoInfons);
275
+ BioCLocation location = new BioCLocation();
276
+ location.setOffset(Integer.parseInt(anno[0]));
277
+ location.setLength(Integer.parseInt(anno[1])-Integer.parseInt(anno[0]));
278
+ biocAnnotation.setLocation(location);
279
+ biocAnnotation.setText(anno[2]);
280
+ biocPassage.addAnnotation(biocAnnotation);
281
+ }
282
+ }
283
+ biocDocument.addPassage(biocPassage);
284
+ }
285
+ biocCollection.addDocument(biocDocument);
286
+ ParagraphType.clear();
287
+ ParagraphContent.clear();
288
+ annotations.clear();
289
+ BioCOutputFormat.writeDocument(biocDocument);
290
+ }
291
+ }
292
+ BioCOutputFormat.close();
293
+ inputfile.close();
294
+ }
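For reference, the PubTator record shape this parser consumes, reconstructed from the branches above (PMID, offsets and annotation values are made up; the annotation line is tab-separated):

    10022765|t|A hypothetical title mentioning BRCA1.
    10022765|a|A hypothetical abstract sentence.
    10022765	32	37	BRCA1	Gene	672

An empty line terminates the document and triggers the BioC write-out.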
295
+ public void BioC2PubTator(String input,String output) throws IOException, XMLStreamException //Output
296
+ {
297
+ /*
298
+ * BioC2PubTator
299
+ */
300
+ HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
301
+ boolean duplicate = false;
302
+ BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
303
+ ConnectorWoodstox connector = new ConnectorWoodstox();
304
+ BioCCollection collection = new BioCCollection();
305
+ collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
306
+ while (connector.hasNext())
307
+ {
308
+ BioCDocument document = connector.next();
309
+ String PMID = document.getID();
310
+ if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
311
+ else{pmidlist.put(PMID,"");}
312
+ String Anno="";
313
+ for (BioCPassage passage : document.getPassages())
314
+ {
315
+ if(passage.getInfon("type").equals("title"))
316
+ {
317
+ PubTatorOutputFormat.write(PMID+"|t|"+passage.getText()+"\n");
318
+ }
319
+ else if(passage.getInfon("type").equals("abstract"))
320
+ {
321
+ PubTatorOutputFormat.write(PMID+"|a|"+passage.getText()+"\n");
322
+ }
323
+ else
324
+ {
325
+ PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
326
+ }
327
+
328
+ for (BioCAnnotation annotation : passage.getAnnotations())
329
+ {
330
+ String Annotype = annotation.getInfon("type");
331
+ String Annoid="";
332
+ String Proteinid="";
333
+ if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
334
+ {
335
+ if(annotation.getInfons().containsKey("NCBI Gene"))
336
+ {
337
+ Annoid = annotation.getInfon("NCBI Gene");
338
+ String Annoidlist[]=Annoid.split(";");
339
+ Annoid="";
340
+ for(int x=0;x<Annoidlist.length;x++)
341
+ {
342
+ //Normalization2Protein
343
+ String proteinid="";
344
+ String homoid="";
345
+
346
+ if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
347
+ {
348
+ proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
349
+ }
350
+ if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
351
+ {
352
+ homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
353
+ }
354
+
355
+ if((!proteinid.equals("")) || (!homoid.equals("")))
356
+ {
357
+ if(Annoid.equals(""))
358
+ {
359
+ Annoid=Annoidlist[x]+"(";
360
+ if(!proteinid.equals(""))
361
+ {
362
+ Annoid=Annoid+"UniProt:"+proteinid;
363
+ }
364
+ if(!homoid.equals(""))
365
+ {
366
+ if(!proteinid.equals(""))
367
+ {
368
+ Annoid=Annoid+";";
369
+ }
370
+ Annoid=Annoid+"Homoid:"+homoid;
371
+ }
372
+ Annoid=Annoid+")";
373
+ }
374
+ else
375
+ {
376
+ Annoid=Annoid+";"+Annoidlist[x]+"(";
377
+ if(!proteinid.equals(""))
378
+ {
379
+ Annoid=Annoid+"UniProt:"+proteinid;
380
+ }
381
+ if(!homoid.equals(""))
382
+ {
383
+ if(!proteinid.equals(""))
384
+ {
385
+ Annoid=Annoid+";";
386
+ }
387
+ Annoid=Annoid+"Homoid:"+homoid;
388
+ }
389
+ Annoid=Annoid+")";
390
+ }
391
+ }
392
+ else
393
+ {
394
+ if(Annoid.equals(""))
395
+ {
396
+ Annoid=Annoidlist[x];
397
+ }
398
+ else
399
+ {
400
+ Annoid=Annoid+";"+Annoidlist[x];
401
+ }
402
+ }
403
+ }
404
+ }
405
+ //else if(annotation.getInfons().containsKey("NCBI Homologene"))
406
+ //{
407
+ // Annoid = annotation.getInfon("NCBI Homologene");
408
+ //}
409
+ //else if(!annotation.getInfons().containsKey("FocusSpecies"))
410
+ //{
411
+ // Annoid = annotation.getInfon("FocusSpecies");
412
+ //}
413
+ else
414
+ {
415
+ Annoid = annotation.getInfon("Identifier");
416
+ }
417
+ }
418
+ else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
419
+ {
420
+ if(annotation.getInfons().containsKey("NCBI Taxonomy"))
421
+ {
422
+ Annoid = annotation.getInfon("NCBI Taxonomy");
423
+ }
424
+ else
425
+ {
426
+ Annoid = annotation.getInfon("Identifier");
427
+ }
428
+ }
429
+ else if(Annotype.equals("CellLine"))
430
+ {
431
+ if(annotation.getInfons().containsKey("NCBI Taxonomy"))
432
+ {
433
+ Annoid = annotation.getInfon("NCBI Taxonomy");
434
+ }
435
+ else
436
+ {
437
+ Annoid = annotation.getInfon("Identifier");
438
+ }
439
+ }
440
+ else
441
+ {
442
+ Annoid = annotation.getInfon("Identifier");
443
+ }
444
+ int start = annotation.getLocations().get(0).getOffset();
445
+ int last = start + annotation.getLocations().get(0).getLength();
446
+ String AnnoMention=annotation.getText();
447
+ if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
448
+ {
449
+ Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
450
+ }
451
+ else
452
+ {
453
+ Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
454
+ }
455
+ }
456
+ }
457
+ PubTatorOutputFormat.write(Anno+"\n");
458
+ }
459
+ PubTatorOutputFormat.close();
460
+ if(duplicate == true){System.exit(1);}
461
+ }
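When a gene identifier maps to a UniProt protein or a HomoloGene group, the loop above decorates it in place; the last PubTator column then looks like this (IDs illustrative):

    10022765	32	37	BRCA1	Gene	672(UniProt:P38398;Homoid:5276)

Identifiers with no mapping are emitted bare (e.g. just 672), and multiple gene IDs are joined with ';'.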
462
+ public void BioC2PubTator(String original_input,String input,String output) throws IOException, XMLStreamException //Output
463
+ {
464
+ /* original tiabs*/
465
+ BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(original_input), "UTF-8"));
466
+ HashMap<String,String> ParagraphContent = new HashMap<String,String>(); // [PMID,0] -> title
467
+ HashMap<String,String> annotations = new HashMap<String,String>(); // PMID ->Annotation
468
+ String line;
469
+ String Pmid="";
470
+ int count_paragraph=0;
471
+ while ((line = inputfile.readLine()) != null)
472
+ {
473
+ if(line.contains("|") && !line.contains("\t")) //Title|Abstract
474
+ {
475
+ String str[]=line.split("\\|",-1);
476
+ Pmid=str[0];
477
+ ParagraphContent.put(Pmid+"\t"+str[1],str[2]);
478
+ count_paragraph++;
479
+ }
480
+ else if (line.contains("\t")) //Annotation
481
+ {
482
+ annotations.put(Pmid, annotations.get(Pmid)+line);
483
+ }
484
+ else if(line.length()==0) //Processing
485
+ {
486
+ count_paragraph=0;
487
+ }
488
+ }
489
+ inputfile.close();
490
+
491
+ /*
492
+ * BioC2PubTator
493
+ */
494
+ HashMap<String, String> pmidlist = new HashMap<String, String>(); // check if appear duplicate pmids
495
+ boolean duplicate = false;
496
+ BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
497
+ ConnectorWoodstox connector = new ConnectorWoodstox();
498
+ BioCCollection collection = new BioCCollection();
499
+ collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
500
+ while (connector.hasNext())
501
+ {
502
+ BioCDocument document = connector.next();
503
+ String PMID = document.getID();
504
+ if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;}
505
+ else{pmidlist.put(PMID,"");}
506
+ String Anno="";
507
+ for (BioCPassage passage : document.getPassages())
508
+ {
509
+ if(passage.getInfon("type").equals("title") || passage.getInfon("type").equals("t"))
510
+ {
511
+ PubTatorOutputFormat.write(PMID+"|t|"+ParagraphContent.get(PMID+"\tt")+"\n");
512
+ }
513
+ else if(passage.getInfon("type").equals("abstract") || passage.getInfon("type").equals("a"))
514
+ {
515
+ PubTatorOutputFormat.write(PMID+"|a|"+ParagraphContent.get(PMID+"\ta")+"\n");
516
+ }
517
+ else
518
+ {
519
+ PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n");
520
+ }
521
+
522
+ for (BioCAnnotation annotation : passage.getAnnotations())
523
+ {
524
+ String Annotype = annotation.getInfon("type");
525
+ String Annoid="";
526
+ String Proteinid="";
527
+ if(Annotype.matches("(Gene|FamilyName|DomainMotif)"))
528
+ {
529
+ if(annotation.getInfons().containsKey("NCBI Gene"))
530
+ {
531
+ Annoid = annotation.getInfon("NCBI Gene");
532
+ String Annoidlist[]=Annoid.split(";");
533
+ Annoid="";
534
+ for(int x=0;x<Annoidlist.length;x++)
535
+ {
536
+ //Normalization2Protein
537
+ String proteinid="";
538
+ String homoid="";
539
+
540
+ if(GNormPlus.Normalization2Protein_hash.containsKey(Annoidlist[x]))
541
+ {
542
+ proteinid=GNormPlus.Normalization2Protein_hash.get(Annoidlist[x]);
543
+ }
544
+ if(GNormPlus.HomologeneID_hash.containsKey(Annoidlist[x]))
545
+ {
546
+ homoid=GNormPlus.HomologeneID_hash.get(Annoidlist[x]);
547
+ }
548
+
549
+ if((!proteinid.equals("")) || (!homoid.equals("")))
550
+ {
551
+ if(Annoid.equals(""))
552
+ {
553
+ Annoid=Annoidlist[x]+"(";
554
+ if(!proteinid.equals(""))
555
+ {
556
+ Annoid=Annoid+"UniProt:"+proteinid;
557
+ }
558
+ if(!homoid.equals(""))
559
+ {
560
+ if(!proteinid.equals(""))
561
+ {
562
+ Annoid=Annoid+";";
563
+ }
564
+ Annoid=Annoid+"Homoid:"+homoid;
565
+ }
566
+ Annoid=Annoid+")";
567
+ }
568
+ else
569
+ {
570
+ Annoid=Annoid+";"+Annoidlist[x]+"(";
571
+ if(!proteinid.equals(""))
572
+ {
573
+ Annoid=Annoid+"UniProt:"+proteinid;
574
+ }
575
+ if(!homoid.equals(""))
576
+ {
577
+ if(!proteinid.equals(""))
578
+ {
579
+ Annoid=Annoid+";";
580
+ }
581
+ Annoid=Annoid+"Homoid:"+homoid;
582
+ }
583
+ Annoid=Annoid+")";
584
+ }
585
+ }
586
+ else
587
+ {
588
+ if(Annoid.equals(""))
589
+ {
590
+ Annoid=Annoidlist[x];
591
+ }
592
+ else
593
+ {
594
+ Annoid=Annoid+";"+Annoidlist[x];
595
+ }
596
+ }
597
+ }
598
+ }
599
+ //else if(annotation.getInfons().containsKey("NCBI Homologene"))
600
+ //{
601
+ // Annoid = annotation.getInfon("NCBI Homologene");
602
+ //}
603
+ //else if(annotation.getInfons().containsKey("FocusSpecies"))
604
+ //{
605
+ // Annoid = annotation.getInfon("FocusSpecies");
606
+ //}
607
+ else
608
+ {
609
+ Annoid = annotation.getInfon("Identifier");
610
+ }
611
+ }
612
+ else if(Annotype.equals("Species") || Annotype.equals("Genus") || Annotype.equals("Strain"))
613
+ {
614
+ if(annotation.getInfons().containsKey("NCBI Taxonomy"))
615
+ {
616
+ Annoid = annotation.getInfon("NCBI Taxonomy");
617
+ }
618
+ else
619
+ {
620
+ Annoid = annotation.getInfon("Identifier");
621
+ }
622
+ }
623
+ else if(Annotype.equals("CellLine"))
624
+ {
625
+ if(annotation.getInfons().containsKey("NCBI Taxonomy"))
626
+ {
627
+ Annoid = annotation.getInfon("NCBI Taxonomy");
628
+ }
629
+ else
630
+ {
631
+ Annoid = annotation.getInfon("Identifier");
632
+ }
633
+ }
634
+ else
635
+ {
636
+ if(annotation.getInfons().containsKey("Identifier"))
637
+ {
638
+ Annoid = annotation.getInfon("Identifier");
639
+ }
640
+ else
641
+ {
642
+ Annoid = "";
643
+ }
644
+ }
645
+ int start = annotation.getLocations().get(0).getOffset();
646
+ int last = start + annotation.getLocations().get(0).getLength();
647
+ String AnnoMention=annotation.getText();
648
+ if(Annoid != null && !Annoid.equals(null) && !Annoid.equals(""))
649
+ {
650
+ Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid+"\n";
651
+ }
652
+ else
653
+ {
654
+ Anno=Anno+PMID+"\t"+start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\n";
655
+ }
656
+ }
657
+ }
658
+ PubTatorOutputFormat.write(Anno+"\n");
659
+ }
660
+ PubTatorOutputFormat.close();
661
+ if(duplicate == true){System.exit(0);}
662
+ }
663
+ public void BioCReader(String input) throws IOException, XMLStreamException
664
+ {
665
+ ConnectorWoodstox connector = new ConnectorWoodstox();
666
+ BioCCollection collection = new BioCCollection();
667
+ collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
668
+
669
+ /*
670
+ * Per document
671
+ */
672
+ while (connector.hasNext())
673
+ {
674
+ BioCDocument document = connector.next();
675
+ PMIDs.add(document.getID());
676
+
677
+ ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
678
+ ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
679
+ ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
680
+ ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
681
+
682
+ /*
683
+ * Per Passage
684
+ */
685
+ for (BioCPassage passage : document.getPassages())
686
+ {
687
+ PassageName.add(passage.getInfon("type")); //Paragraph
688
+ String txt = passage.getText();
689
+ if(txt.matches("[\t ]+"))
690
+ {
691
+ txt = txt.replaceAll(".","@");
692
+ }
693
+ else
694
+ {
695
+ //if(passage.getInfon("type").toLowerCase().equals("table"))
696
+ //{
697
+ // txt=txt.replaceAll(" ", "|");
698
+ //}
699
+ txt = txt.replaceAll("ω","w");
700
+ txt = txt.replaceAll("μ","u");
701
+ txt = txt.replaceAll("κ","k");
702
+ txt = txt.replaceAll("α","a");
703
+ txt = txt.replaceAll("γ","g");
704
+ txt = txt.replaceAll("ɣ","g");
705
+ txt = txt.replaceAll("β","b");
706
+ txt = txt.replaceAll("×","x");
707
+ txt = txt.replaceAll("‑","-");
708
+ txt = txt.replaceAll("¹","1");
709
+ txt = txt.replaceAll("²","2");
710
+ txt = txt.replaceAll("°","o");
711
+ txt = txt.replaceAll("ö","o");
712
+ txt = txt.replaceAll("é","e");
713
+ txt = txt.replaceAll("à","a");
714
+ txt = txt.replaceAll("Á","A");
715
+ txt = txt.replaceAll("ε","e");
716
+ txt = txt.replaceAll("θ","O");
717
+ txt = txt.replaceAll("•",".");
718
+ txt = txt.replaceAll("µ","u");
719
+ txt = txt.replaceAll("λ","r");
720
+ txt = txt.replaceAll("⁺","+");
721
+ txt = txt.replaceAll("ν","v");
722
+ txt = txt.replaceAll("ï","i");
723
+ txt = txt.replaceAll("ã","a");
724
+ txt = txt.replaceAll("≡","=");
725
+ txt = txt.replaceAll("ó","o");
726
+ txt = txt.replaceAll("³","3");
727
+ txt = txt.replaceAll("〖","[");
728
+ txt = txt.replaceAll("〗","]");
729
+ txt = txt.replaceAll("Å","A");
730
+ txt = txt.replaceAll("ρ","p");
731
+ txt = txt.replaceAll("ü","u");
732
+ txt = txt.replaceAll("ɛ","e");
733
+ txt = txt.replaceAll("č","c");
734
+ txt = txt.replaceAll("š","s");
735
+ txt = txt.replaceAll("ß","b");
736
+ txt = txt.replaceAll("═","=");
737
+ txt = txt.replaceAll("£","L");
738
+ txt = txt.replaceAll("Ł","L");
739
+ txt = txt.replaceAll("ƒ","f");
740
+ txt = txt.replaceAll("ä","a");
741
+ txt = txt.replaceAll("–","-");
742
+ txt = txt.replaceAll("⁻","-");
743
+ txt = txt.replaceAll("〈","<");
744
+ txt = txt.replaceAll("〉",">");
745
+ txt = txt.replaceAll("χ","X");
746
+ txt = txt.replaceAll("Đ","D");
747
+ txt = txt.replaceAll("‰","%");
748
+ txt = txt.replaceAll("·",".");
749
+ txt = txt.replaceAll("→",">");
750
+ txt = txt.replaceAll("←","<");
751
+ txt = txt.replaceAll("ζ","z");
752
+ txt = txt.replaceAll("π","p");
753
+ txt = txt.replaceAll("τ","t");
754
+ txt = txt.replaceAll("ξ","X");
755
+ txt = txt.replaceAll("η","h");
756
+ txt = txt.replaceAll("ø","0");
757
+ txt = txt.replaceAll("Δ","D");
758
+ txt = txt.replaceAll("∆","D");
759
+ txt = txt.replaceAll("∑","S");
760
+ txt = txt.replaceAll("Ω","O");
761
+ txt = txt.replaceAll("δ","d");
762
+ txt = txt.replaceAll("σ","s");
763
+ txt = txt.replaceAll("Φ","F");
764
+ //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
765
+ }
766
+ if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
767
+ {
768
+ PassageContext.add("-notext-"); //Context
769
+ }
770
+ else
771
+ {
772
+ PassageContext.add(txt); //Context
773
+ }
774
+ PassageOffset.add(passage.getOffset()); //Offset
775
+ ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
776
+ AnnotationInPMID.add(AnnotationInPassage);
777
+ }
778
+ PassageNames.add(PassageName);
779
+ PassageContexts.add(PassageContext);
780
+ PassageOffsets.add(PassageOffset);
781
+ Annotations.add(AnnotationInPMID);
782
+ }
783
+ }
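BioCReader fills four parallel structures, indexed first by document and then by passage; downstream code (e.g. GN.GeneNormalization later in this same diff) addresses them as follows:

    // i = document index, j = passage index
    String pmid   = GNormPlus.BioCDocobj.PMIDs.get(i);
    String text   = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
    int    offset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j);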
784
+ public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException
785
+ {
786
+ ConnectorWoodstox connector = new ConnectorWoodstox();
787
+ BioCCollection collection = new BioCCollection();
788
+ collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
789
+
790
+ /*
791
+ * Per document
792
+ */
793
+ while (connector.hasNext())
794
+ {
795
+ BioCDocument document = connector.next();
796
+ PMIDs.add(document.getID());
797
+
798
+ ArrayList<String> PassageName= new ArrayList<String>(); // array of Passage name
799
+ ArrayList<Integer> PassageOffset= new ArrayList<Integer>(); // array of Passage offset
800
+ ArrayList<String> PassageContext= new ArrayList<String>(); // array of Passage context
801
+ ArrayList<ArrayList<String>> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName
802
+
803
+ /*
804
+ * Per Passage
805
+ */
806
+ for (BioCPassage passage : document.getPassages())
807
+ {
808
+ PassageName.add(passage.getInfon("type")); //Paragraph
809
+
810
+ String txt = passage.getText();
811
+ if(txt.matches("[\t ]+"))
812
+ {
813
+ txt = txt.replaceAll(".","@");
814
+ }
815
+ else
816
+ {
817
+ //if(passage.getInfon("type").toLowerCase().equals("table"))
818
+ //{
819
+ // txt=txt.replaceAll(" ", "|");
820
+ //}
821
+ txt = txt.replaceAll("ω","w");
822
+ txt = txt.replaceAll("μ","u");
823
+ txt = txt.replaceAll("κ","k");
824
+ txt = txt.replaceAll("α","a");
825
+ txt = txt.replaceAll("γ","g");
826
+ txt = txt.replaceAll("ɣ","g");
827
+ txt = txt.replaceAll("β","b");
828
+ txt = txt.replaceAll("×","x");
829
+ txt = txt.replaceAll("‑","-");
830
+ txt = txt.replaceAll("¹","1");
831
+ txt = txt.replaceAll("²","2");
832
+ txt = txt.replaceAll("°","o");
833
+ txt = txt.replaceAll("ö","o");
834
+ txt = txt.replaceAll("é","e");
835
+ txt = txt.replaceAll("à","a");
836
+ txt = txt.replaceAll("Á","A");
837
+ txt = txt.replaceAll("ε","e");
838
+ txt = txt.replaceAll("θ","O");
839
+ txt = txt.replaceAll("•",".");
840
+ txt = txt.replaceAll("µ","u");
841
+ txt = txt.replaceAll("λ","r");
842
+ txt = txt.replaceAll("⁺","+");
843
+ txt = txt.replaceAll("ν","v");
844
+ txt = txt.replaceAll("ï","i");
845
+ txt = txt.replaceAll("ã","a");
846
+ txt = txt.replaceAll("≡","=");
847
+ txt = txt.replaceAll("ó","o");
848
+ txt = txt.replaceAll("³","3");
849
+ txt = txt.replaceAll("〖","[");
850
+ txt = txt.replaceAll("〗","]");
851
+ txt = txt.replaceAll("Å","A");
852
+ txt = txt.replaceAll("ρ","p");
853
+ txt = txt.replaceAll("ü","u");
854
+ txt = txt.replaceAll("ɛ","e");
855
+ txt = txt.replaceAll("č","c");
856
+ txt = txt.replaceAll("š","s");
857
+ txt = txt.replaceAll("ß","b");
858
+ txt = txt.replaceAll("═","=");
859
+ txt = txt.replaceAll("£","L");
860
+ txt = txt.replaceAll("Ł","L");
861
+ txt = txt.replaceAll("ƒ","f");
862
+ txt = txt.replaceAll("ä","a");
863
+ txt = txt.replaceAll("–","-");
864
+ txt = txt.replaceAll("⁻","-");
865
+ txt = txt.replaceAll("〈","<");
866
+ txt = txt.replaceAll("〉",">");
867
+ txt = txt.replaceAll("χ","X");
868
+ txt = txt.replaceAll("Đ","D");
869
+ txt = txt.replaceAll("‰","%");
870
+ txt = txt.replaceAll("·",".");
871
+ txt = txt.replaceAll("→",">");
872
+ txt = txt.replaceAll("←","<");
873
+ txt = txt.replaceAll("ζ","z");
874
+ txt = txt.replaceAll("π","p");
875
+ txt = txt.replaceAll("τ","t");
876
+ txt = txt.replaceAll("ξ","X");
877
+ txt = txt.replaceAll("η","h");
878
+ txt = txt.replaceAll("ø","0");
879
+ txt = txt.replaceAll("Δ","D");
880
+ txt = txt.replaceAll("∆","D");
881
+ txt = txt.replaceAll("∑","S");
882
+ txt = txt.replaceAll("Ω","O");
883
+ txt = txt.replaceAll("δ","d");
884
+ txt = txt.replaceAll("σ","s");
885
+ txt = txt.replaceAll("Φ","F");
886
+ //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," ");
887
+ }
888
+ if(passage.getText().equals("") || passage.getText().matches("[ ]+"))
889
+ {
890
+ PassageContext.add("-notext-"); //Context
891
+ }
892
+ else
893
+ {
894
+ PassageContext.add(txt); //Context
895
+ }
896
+ PassageOffset.add(passage.getOffset()); //Offset
897
+ ArrayList<String> AnnotationInPassage= new ArrayList<String>(); // array of Annotations in the PassageName
898
+
899
+ /*
900
+ * Per Annotation :
901
+ * start
902
+ * last
903
+ * mention
904
+ * type
905
+ * id
906
+ */
907
+ for (BioCAnnotation Anno : passage.getAnnotations())
908
+ {
909
+ int start = Anno.getLocations().get(0).getOffset()-passage.getOffset(); // start
910
+ int last = start + Anno.getLocations().get(0).getLength(); // last
911
+ String AnnoMention=Anno.getText(); // mention
912
+ String Annotype = Anno.getInfon("type"); // type
913
+ String Annoid = Anno.getInfon("Identifier"); // identifier | MESH
914
+ if(Annoid == null)
915
+ {
916
+ Annoid = Anno.getInfon("Identifier"); // identifier | MESH
917
+ }
918
+ if(Annoid == null || Annoid.equals("null"))
919
+ {
920
+ AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype); //paragraph
921
+ }
922
+ else
923
+ {
924
+ AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid); //paragraph
925
+ }
926
+ }
927
+ AnnotationInPMID.add(AnnotationInPassage);
928
+ }
929
+ PassageNames.add(PassageName);
930
+ PassageContexts.add(PassageContext);
931
+ PassageOffsets.add(PassageOffset);
932
+ Annotations.add(AnnotationInPMID);
933
+ }
934
+ }
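Each element appended to AnnotationInPassage above is a tab-joined record with passage-relative offsets; the two shapes are (values illustrative):

    // "start\tlast\tmention\ttype\tid"   e.g. "10\t15\tBRCA1\tGene\t672"
    // "start\tlast\tmention\ttype"       when the identifier is absent or "null"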
935
+ public void BioCOutput(String input,String output, ArrayList<ArrayList<ArrayList<String>>> Annotations,boolean Final,boolean RemovePreviousAnno) throws IOException, XMLStreamException
936
+ {
937
+ boolean ShowUnNormalizedMention = false;
938
+ if(GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True"))
939
+ {
940
+ ShowUnNormalizedMention = true;
941
+ }
942
+
943
+ BioCDocumentWriter BioCOutputFormat = BioCFactory.newFactory(BioCFactory.WOODSTOX).createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
944
+ BioCCollection biocCollection_input = new BioCCollection();
945
+ BioCCollection biocCollection_output = new BioCCollection();
946
+
947
+ //input: BioC
948
+ ConnectorWoodstox connector = new ConnectorWoodstox();
949
+ biocCollection_input = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8"));
950
+ BioCOutputFormat.writeCollectionInfo(biocCollection_input);
951
+ int i=0; //count for pmid
952
+ while (connector.hasNext())
953
+ {
954
+ BioCDocument document_output = new BioCDocument();
955
+ BioCDocument document_input = connector.next();
956
+ String PMID=document_input.getID();
957
+ document_output.setID(PMID);
958
+ int annotation_count=0;
959
+ int j=0; //count for paragraph
960
+ for (BioCPassage passage_input : document_input.getPassages())
961
+ {
962
+ BioCPassage passage_output = passage_input;
963
+
964
+ if(RemovePreviousAnno == true) //clean the previous annotation, if the NER result is provided
965
+ {
966
+ passage_output.clearAnnotations();
967
+ }
968
+ else
969
+ {
970
+ for (BioCAnnotation annotation : passage_output.getAnnotations())
971
+ {
972
+ annotation.setID(""+annotation_count);
973
+ annotation_count++;
974
+ }
975
+ }
976
+
977
+ int passage_Offset = passage_input.getOffset();
978
+ String passage_Text = passage_input.getText();
979
+ ArrayList<String> AnnotationInPassage = new ArrayList<String>();
980
+ //ArrayList<String> AnnotationInPassage = Annotations.get(i).get(j);
981
+ if(Annotations.size()>i && Annotations.get(i).size()>j)
982
+ {
983
+ for(int a=0;a<Annotations.get(i).get(j).size();a++)
984
+ {
985
+ String Anno[]=Annotations.get(i).get(j).get(a).split("\\t");
986
+ int start = Integer.parseInt(Anno[0]);
987
+ int last = Integer.parseInt(Anno[1]);
988
+ boolean found = false;
989
+ if(passage_Text.length()>last)
990
+ {
991
+ String mention = Anno[2];
992
+ if(Final == true && passage_Text.length()>=last)
993
+ {
994
+ mention = passage_Text.substring(start, last);
995
+ }
996
+ if(mention.matches(".*\t.*"))
997
+ {
998
+ Anno[3]=Anno[4];
999
+ if(Anno.length>=6)
1000
+ {
1001
+ Anno[4]=Anno[5];
1002
+ }
1003
+ }
1004
+ String type = Anno[3];
1005
+ String id = ""; // optional
1006
+ if(Anno.length>=5){id = Anno[4];}
1007
+ if(Final == true)
1008
+ {
1009
+ for(int b=0;b<AnnotationInPassage.size();b++)
1010
+ {
1011
+ String Annob[]=AnnotationInPassage.get(b).split("\\t");
1012
+ int startb = Integer.parseInt(Annob[0]);
1013
+ int lastb = Integer.parseInt(Annob[1]);
1014
+ String mentionb = Annob[2];
1015
+ if(Final == true && passage_Text.length()>=lastb)
1016
+ {
1017
+ mentionb = passage_Text.substring(startb, lastb);
1018
+ }
1019
+ if(mentionb.matches(".*\t.*"))
1020
+ {
1021
+ Annob[3]=Annob[4];
1022
+ if(Annob.length>=6)
1023
+ {
1024
+ Annob[4]=Annob[5];
1025
+ }
1026
+ }
1027
+ String typeb = Annob[3];
1028
+ String idb = ""; // optional
1029
+ if(Annob.length>=5){idb = Annob[4];}
1030
+
1031
+ if(start == startb && last == lastb && type.equals(typeb))
1032
+ {
1033
+ found = true;
1034
+ if(id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals("")))
1035
+ {
1036
+ }
1037
+ else if(idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals("")))
1038
+ {
1039
+ AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id);
1040
+ }
1041
+ else
1042
+ {
1043
+ if(id.equals(""))
1044
+ {
1045
+ }
1046
+ else
1047
+ {
1048
+ AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+idb+";"+id);
1049
+ }
1050
+
1051
+ }
1052
+ break;
1053
+ }
1054
+ }
1055
+ }
1056
+ }
1057
+ if(found == false)
1058
+ {
1059
+ AnnotationInPassage.add(Annotations.get(i).get(j).get(a));
1060
+ }
1061
+ }
1062
+ }
1063
+ for(int a=0;a<AnnotationInPassage.size();a++)
1064
+ {
1065
+ String Anno[]=AnnotationInPassage.get(a).split("\\t");
1066
+ HashMap <String,String> id_hash = new HashMap <String,String>();
1067
+ if(Anno.length>=5)
1068
+ {
1069
+ int start = Integer.parseInt(Anno[0]);
1070
+ int last = Integer.parseInt(Anno[1]);
1071
+ String mention = Anno[2];
1072
+ if(Final == true && passage_Text.length()>=last)
1073
+ {
1074
+ mention = passage_Text.substring(start, last);
1075
+ }
1076
+ if(mention.matches(".*\t.*"))
1077
+ {
1078
+ Anno[3]=Anno[4];
1079
+ if(Anno.length>=6)
1080
+ {
1081
+ Anno[4]=Anno[5];
1082
+ }
1083
+ }
1084
+ String ids = Anno[4];
1085
+ String idlist[]=ids.split(",");
1086
+ for(int b=0;b<idlist.length;b++)
1087
+ {
1088
+ id_hash.put(idlist[b], "");
1089
+ }
1090
+ ids = "";
1091
+ for(String id :id_hash.keySet())
1092
+ {
1093
+ if(ids.equals(""))
1094
+ {
1095
+ ids = id;
1096
+ }
1097
+ else
1098
+ {
1099
+ ids = ids + ";" + id;
1100
+ }
1101
+ }
1102
+ AnnotationInPassage.set(a, Anno[0]+"\t"+Anno[1]+"\t"+Anno[2]+"\t"+Anno[3]+"\t"+ids);
1103
+ }
1104
+ }
1105
+
1106
+ for(int a=0;a<AnnotationInPassage.size();a++)
1107
+ {
1108
+ String Anno[]=AnnotationInPassage.get(a).split("\\t");
1109
+ int start = Integer.parseInt(Anno[0]);
1110
+ int last = Integer.parseInt(Anno[1]);
1111
+ if(passage_Text.length()>last)
1112
+ {
1113
+ String mention = Anno[2];
1114
+ if(Final == true && passage_Text.length()>=last)
1115
+ {
1116
+ mention = passage_Text.substring(start, last);
1117
+ }
1118
+ if(mention.matches(".*\t.*"))
1119
+ {
1120
+ Anno[3]=Anno[4];
1121
+ if(Anno.length>=6)
1122
+ {
1123
+ Anno[4]=Anno[5];
1124
+ }
1125
+ }
1126
+ String type = Anno[3];
1127
+ if(type.equals("GeneID")){type="Gene";}
1128
+ BioCAnnotation biocAnnotation = new BioCAnnotation();
1129
+ Map<String, String> AnnoInfons = new HashMap<String, String>();
1130
+ AnnoInfons.put("type", type);
1131
+ if(Anno.length>=5)
1132
+ {
1133
+ String identifier = Anno[4];
1134
+ if(Final == true && ShowUnNormalizedMention==false)
1135
+ {
1136
+ if(type.matches("(FamilyName|Domain|Gene)"))
1137
+ {
1138
+ Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
1139
+ Matcher mtmp0 = ptmp0.matcher(identifier);
1140
+ Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
1141
+ Matcher mtmp1 = ptmp1.matcher(identifier);
1142
+ Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$");
1143
+ Matcher mtmp2 = ptmp2.matcher(identifier);
1144
+ Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$");
1145
+ Matcher mtmp3 = ptmp3.matcher(identifier);
1146
+ if(mtmp0.find())
1147
+ {
1148
+ String Method_SA = mtmp0.group(1);
1149
+ String TaxonomyID = mtmp0.group(2);
1150
+ String NCBIGeneID = mtmp0.group(3);
1151
+ if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
1152
+ {
1153
+ AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
1154
+ }
1155
+ if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
1156
+ {
1157
+ AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
1158
+ }
1159
+ AnnoInfons.put("NCBI Gene", NCBIGeneID);
1160
+ }
1161
+ else if(mtmp1.find())
1162
+ {
1163
+ String Method_SA = mtmp1.group(1);
1164
+ String TaxonomyID = mtmp1.group(2);
1165
+ String NCBIGeneID = mtmp1.group(3);
1166
+ String HomoID = mtmp1.group(4);
1167
+ if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
1168
+ {
1169
+ AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
1170
+ }
1171
+ if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
1172
+ {
1173
+ AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID));
1174
+ }
1175
+ AnnoInfons.put("NCBI Gene", NCBIGeneID);
1176
+ }
1177
+ else if(mtmp2.find())
1178
+ {
1179
+ String Method_SA = mtmp2.group(1);
1180
+ String TaxonomyID = mtmp2.group(2);
1181
+ AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:"+TaxonomyID);
1182
+ }
1183
+ else if(mtmp3.find())
1184
+ {
1185
+ String Method_SA = mtmp3.group(1);
1186
+ String HomoID = mtmp3.group(1);
1187
+ AnnoInfons.put("NCBI Homologene", HomoID);
1188
+ }
1189
+ else
1190
+ {
1191
+ String identifiers[] = identifier.split(";");
1192
+ if(identifiers.length>1)
1193
+ {
1194
+ ArrayList<String> identifierSTR = new ArrayList<String>();
1195
+ ArrayList<String> ProteinidSTR = new ArrayList<String>();
1196
+ ArrayList<String> HomoidSTR = new ArrayList<String>();
1197
+ for(int idi=0;idi<identifiers.length;idi++)
1198
+ {
1199
+ Pattern ptmp4 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
1200
+ Matcher mtmp4 = ptmp4.matcher(identifiers[idi]);
1201
+ Pattern ptmp5 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$");
1202
+ Matcher mtmp5 = ptmp5.matcher(identifiers[idi]);
1203
+ if(mtmp4.find())
1204
+ {
1205
+ String Method_SA = mtmp4.group(1);
1206
+ String TaxonomyID = mtmp4.group(2);
1207
+ String NCBIGeneID = mtmp4.group(3);
1208
+ String HomoID = mtmp4.group(4);
1209
+ if(!identifierSTR.contains(NCBIGeneID))
1210
+ {
1211
+ identifierSTR.add(NCBIGeneID);
1212
+ }
1213
+ if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID))
1214
+ {
1215
+ if(!ProteinidSTR.contains(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID)))
1216
+ {
1217
+ ProteinidSTR.add(GNormPlus.Normalization2Protein_hash.get(NCBIGeneID));
1218
+ }
1219
+ }
1220
+ if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID))
1221
+ {
1222
+ if(!HomoidSTR.contains(GNormPlus.HomologeneID_hash.get(NCBIGeneID)))
1223
+ {
1224
+ HomoidSTR.add(GNormPlus.HomologeneID_hash.get(NCBIGeneID));
1225
+ }
1226
+ }
1227
+
1228
+ }
1229
+ else if(mtmp5.find())
1230
+ {
1231
+ String Method_SA = mtmp5.group(1);
1232
+ String TaxonomyID = mtmp5.group(2);
1233
+ String NCBIGeneID = mtmp5.group(3);
1234
+ if(!identifierSTR.contains(NCBIGeneID))
1235
+ {
1236
+ identifierSTR.add(NCBIGeneID);
1237
+ }
1238
+ }
1239
+ }
1240
+ String idSTR="";
1241
+ for(int x=0;x<identifierSTR.size();x++)
1242
+ {
1243
+ if(idSTR.equals(""))
1244
+ {
1245
+ idSTR = identifierSTR.get(x);
1246
+ }
1247
+ else
1248
+ {
1249
+ idSTR = idSTR+";"+identifierSTR.get(x);
1250
+ }
1251
+ }
1252
+ AnnoInfons.put("NCBI Gene", idSTR);
1253
+
1254
+ String pidSTR="";
1255
+ for(int x=0;x<ProteinidSTR.size();x++)
1256
+ {
1257
+ if(pidSTR.equals(""))
1258
+ {
1259
+ pidSTR = ProteinidSTR.get(x);
1260
+ }
1261
+ else
1262
+ {
1263
+ pidSTR = pidSTR+";"+ProteinidSTR.get(x);
1264
+ }
1265
+ }
1266
+ if(!pidSTR.equals(""))
1267
+ {
1268
+ AnnoInfons.put("UniProt", pidSTR);
1269
+ }
1270
+
1271
+ String hidSTR="";
1272
+ for(int x=0;x<HomoidSTR.size();x++)
1273
+ {
1274
+ if(hidSTR.equals(""))
1275
+ {
1276
+ hidSTR = HomoidSTR.get(x);
1277
+ }
1278
+ else
1279
+ {
1280
+ hidSTR = hidSTR+";"+HomoidSTR.get(x);
1281
+ }
1282
+ }
1283
+ if(!hidSTR.equals(""))
1284
+ {
1285
+ AnnoInfons.put("NCBI Homologene", hidSTR);
1286
+ }
1287
+ }
1288
+ //else
1289
+ //{
1290
+ // AnnoInfons.put("Identifier", identifier);
1291
+ //}
1292
+ }
1293
+ }
1294
+ else if (type.matches("(Species|Genus|Strain)"))
1295
+ {
1296
+ AnnoInfons.put("type", type);
1297
+ AnnoInfons.put("NCBI Taxonomy", identifier);
1298
+ }
1299
+ else if (type.matches("Cell"))
1300
+ {
1301
+ AnnoInfons.put("type", "CellLine");
1302
+ AnnoInfons.put("NCBI Taxonomy", identifier);
1303
+ }
1304
+ else
1305
+ {
1306
+ AnnoInfons.put("Identifier", identifier);
1307
+ }
1308
+ }
1309
+ else
1310
+ {
1311
+ AnnoInfons.put("Identifier", identifier);
1312
+ }
1313
+ }
1314
+ biocAnnotation.setInfons(AnnoInfons);
1315
+ BioCLocation location = new BioCLocation();
1316
+ location.setOffset(start+passage_Offset);
1317
+ location.setLength(last-start);
1318
+ biocAnnotation.setLocation(location);
1319
+ biocAnnotation.setText(mention);
1320
+ biocAnnotation.setID(""+annotation_count);
1321
+ annotation_count++;
1322
+ if(Final == true)
1323
+ {
1324
+ if(AnnoInfons.containsKey("Identifier") || AnnoInfons.containsKey("NCBI Homologene") || AnnoInfons.containsKey("NCBI Gene") || AnnoInfons.containsKey("NCBI Taxonomy"))
1325
+ {
1326
+ passage_output.addAnnotation(biocAnnotation);
1327
+ }
1328
+ }
1329
+ else
1330
+ {
1331
+ passage_output.addAnnotation(biocAnnotation);
1332
+ }
1333
+ }
1334
+ }
1335
+ document_output.addPassage(passage_output);
1336
+ j++;
1337
+ }
1338
+ biocCollection_output.addDocument(document_output);
1339
+ BioCOutputFormat.writeDocument(document_output);
1340
+ i++;
1341
+ }
1342
+ BioCOutputFormat.close();
1343
+ }
1344
  }
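The final BioCOutput pass above recognizes four internal identifier shapes (the ptmp0-ptmp3 patterns) before mapping them to BioC infons; illustrative strings for each, with made-up IDs:

    // ptmp0  "Focus:9606|7157"     -> infon "NCBI Gene" = 7157 (plus UniProt/Homologene if mapped)
    // ptmp1  "Tax:10090|22059-68"  -> infon "NCBI Gene" = 22059
    // ptmp2  "Focus:9606"          -> infon "FocusSpecies" = NCBITaxonomyID:9606
    // ptmp3  "Homo:460"            -> infon "NCBI Homologene" = 460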
src_Java/GNormPluslib/GN.java CHANGED
@@ -1,1084 +1,1084 @@
1
- /**
2
- * Project: GNormPlus
3
- * Function: Gene Normalization
4
- */
5
-
6
- package GNormPluslib;
7
-
8
- import bioc.BioCAnnotation;
9
- import bioc.BioCCollection;
10
- import bioc.BioCDocument;
11
- import bioc.BioCLocation;
12
- import bioc.BioCPassage;
13
-
14
- import bioc.io.BioCDocumentWriter;
15
- import bioc.io.BioCFactory;
16
- import bioc.io.woodstox.ConnectorWoodstox;
17
- import java.io.BufferedReader;
18
- import java.io.BufferedWriter;
19
- import java.io.FileInputStream;
20
- import java.io.FileOutputStream;
21
- import java.io.FileReader;
22
- import java.io.FileWriter;
23
- import java.io.IOException;
24
- import java.io.InputStreamReader;
25
- import java.io.OutputStreamWriter;
26
- import java.text.BreakIterator;
27
- import java.time.LocalDate;
28
- import java.time.ZoneId;
29
- import java.text.DecimalFormat;
30
- import java.math.RoundingMode;
31
-
32
- import javax.xml.stream.XMLStreamException;
33
-
34
- import java.util.Map;
35
- import java.util.regex.Matcher;
36
- import java.util.regex.Pattern;
37
- import java.util.ArrayList;
38
- import java.util.HashMap;
39
- import java.util.List;
40
- import java.util.Locale;
41
-
42
- public class GN
43
- {
44
- public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>();
45
- private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF)
46
- {
47
- /*
48
- * define gene/homo id
49
- */
50
-
51
- //LF
52
- LF = LF.toLowerCase();
53
- LF = LF.replaceAll("([0-9])([a-z])", "$1 $2");
54
- LF = LF.replaceAll("([a-z])([0-9])", "$1 $2");
55
- LF = LF.replaceAll("([\\W\\-\\_])", " ");
56
- LF = LF.replaceAll("[ ]+", " ");
57
- String LF_tkn[]=LF.split(" ");
58
- int LF_PartialMatch = 0;
59
-
60
- Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)");
61
- Matcher mtmp = ptmp.matcher(geneid);
62
- Pattern ptmp2 = Pattern.compile("([0-9]+)");
63
- Matcher mtmp2 = ptmp2.matcher(geneid);
64
- if(mtmp.find())
65
- {
66
- geneid = "Homo:"+mtmp.group(1);
67
- }
68
- else
69
- {
70
- geneid = "Gene:"+geneid;
71
- }
72
-
73
- if(GNormPlus.GeneScoring_hash.containsKey(geneid))
74
- {
75
- HashMap<String,Double> TF = new HashMap<String,Double>(); // token i in gene j
76
- HashMap<String,Double> TermFrequency = new HashMap<String,Double>();
77
-
78
- /*
79
- * Tokens in Query (Gene id lexicon)
80
- */
81
- String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
82
- String tkns_Gene[] = l[0].split(",");
83
- for(int i=0;i<tkns_Gene.length;i++)
84
- {
85
- String Tkn_Freq[] = tkns_Gene[i].split("-");
86
- TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
87
- }
88
- Double Cj = Double.parseDouble(l[1]);
89
- Double AllTknNum = Double.parseDouble(l[2]);
90
- //Double Cj_max = Double.parseDouble(l[3]);
91
- //Double MaxTknNum = Double.parseDouble(l[4]);
92
- Double Norm = Double.parseDouble(l[5]);
93
- if(Norm == 0.0){Norm=1.0;}
94
-
95
- /*
96
- * Tokens in Document (recognized mentions)
97
- */
98
- for(String Mention : Mention_hash.keySet())
99
- {
100
- Mention = Mention.toLowerCase();
101
- Mention = Mention.replaceAll("([0-9])([a-z])", "$1 $2");
102
- Mention = Mention.replaceAll("([a-z])([0-9])", "$1 $2");
103
- Mention = Mention.replaceAll("([\\W\\-\\_])", " ");
104
- Mention = Mention.replaceAll("[ ]+", " ");
105
- String tkns_Mention[]=Mention.split(" ");
106
- for(int i=0;i<tkns_Mention.length;i++)
107
- {
108
- if(TermFrequency.containsKey(tkns_Mention[i]))
109
- {
110
- TF.put(tkns_Mention[i], TermFrequency.get(tkns_Mention[i]));
111
- }
112
- }
113
- }
114
-
115
- Double score=0.0;
116
- for(String Tkn : TF.keySet())
117
- {
118
- //LF
119
- for(int t=0;t<LF_tkn.length;t++)
120
- {
121
- if(LF_tkn[t].equals(Tkn))
122
- {
123
- LF_PartialMatch++;
124
- }
125
- }
126
-
127
- double TFij = TF.get(Tkn)/AllTknNum;
128
- double IDFi=GNormPlus.GeneScoringDF_hash.get(Tkn);
129
- score=score+TFij*IDFi*(1/(1-TFij));
130
- }
131
- //score = Cj * (1/Norm) *score;
132
- if(LF_ParticalMatch>0){score = score + LF_ParticalMatch;/*System.out.println(geneid+"\t"+LF+"\t"+score);*/}
133
- return score;
134
- }
135
- else
136
- {
137
- //System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash");
138
- return 0.0;
139
- }
140
- }
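Stripped of bookkeeping, the ranking score computed above for a candidate gene g, given the document's mention tokens, is

    score(g) = sum_i [ tf(i,g) * idf(i) * 1/(1 - tf(i,g)) ] + (# lexicon tokens shared with the long form)

where tf(i,g) is token i's frequency in g's lexicon entry divided by the entry's total token count and idf(i) comes from GeneScoringDF_hash; the Cj and Norm factors read from the lexicon are loaded but commented out of the final product.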
141
-
142
- public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException
143
- {
144
- for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
145
- {
146
- for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
147
- {
148
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
149
- {
150
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
151
- String start=anno[0];
152
- String last=anno[1];
153
- String mentions=anno[2];
154
- String type=anno[3];
155
- String id="";
156
- if(anno.length>=5)
157
- {
158
- id=anno[4];
159
- }
160
-
161
- if(type.equals("Gene"))
162
- {
163
- String mentionArr[] = mentions.split("\\|");
164
- boolean update=false;
165
- for(int m=0;m<mentionArr.length;m++)
166
- {
167
- Pattern ptmp = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
168
- Matcher mtmp = ptmp.matcher(mentionArr[m]);
169
- Pattern ptmp2 = Pattern.compile("^(.+)nu$");
170
- Matcher mtmp2 = ptmp2.matcher(mentionArr[m]);
171
- Pattern ptmp3 = Pattern.compile("^(.*)alpha(.*)$");
172
- Matcher mtmp3 = ptmp3.matcher(mentionArr[m]);
173
- Pattern ptmp4 = Pattern.compile("^(.*)beta(.*)$");
174
- Matcher mtmp4 = ptmp4.matcher(mentionArr[m]);
175
- Pattern ptmp5 = Pattern.compile("^(.+[0-9])a$");
176
- Matcher mtmp5 = ptmp5.matcher(mentionArr[m]);
177
- Pattern ptmp6 = Pattern.compile("^(.+[0-9])b$");
178
- Matcher mtmp6 = ptmp6.matcher(mentionArr[m]);
179
- Pattern ptmp7 = Pattern.compile("^(.+)II([a-z])$");
180
- Matcher mtmp7 = ptmp7.matcher(mentionArr[m]);
181
- Pattern ptmp8 = Pattern.compile("^(.+)III([a-z])$");
182
- Matcher mtmp8 = ptmp8.matcher(mentionArr[m]);
183
- if(mtmp.find())
184
- {
185
- mentions=mentions+"|"+mtmp.group(1);
186
- update=true;
187
- }
188
- if(mtmp2.find())
189
- {
190
- mentions=mentions+"|"+mtmp2.group(1);
191
- update=true;
192
- }
193
- if(mtmp3.find())
194
- {
195
- mentions=mentions+"|"+mtmp3.group(1)+"a"+mtmp3.group(2);
196
- update=true;
197
- }
198
- if(mtmp4.find())
199
- {
200
- mentions=mentions+"|"+mtmp4.group(1)+"b"+mtmp4.group(2);
201
- update=true;
202
- }
203
- if(mtmp5.find())
204
- {
205
- mentions=mentions+"|"+mtmp5.group(1)+"alpha";
206
- update=true;
207
- }
208
- if(mtmp6.find())
209
- {
210
- mentions=mentions+"|"+mtmp6.group(1)+"beta";
211
- update=true;
212
- }
213
- if(mtmp7.find())
214
- {
215
- mentions=mentions+"|"+mtmp7.group(1)+"2"+mtmp7.group(2);
216
- update=true;
217
- }
218
- if(mtmp8.find())
219
- {
220
- mentions=mentions+"|"+mtmp8.group(1)+"3"+mtmp8.group(2);
221
- update=true;
222
- }
223
- }
224
- if(update == true)
225
- {
226
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id );
227
- }
228
- }
229
- }
230
- }
231
- }
232
- //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
233
- }
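The regular expressions above append spelling variants to the pipe-separated mention list; a few concrete expansions (mentions are illustrative):

    // "TGF-alpha" -> "TGF-alpha|TGF-a"   (ptmp3: alpha -> a)
    // "HIF1a"     -> "HIF1a|HIF1alpha"   (ptmp5: trailing a -> alpha)
    // "RAB27p"    -> "RAB27p|RAB27"      (ptmp:  trailing p dropped)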
234
-
235
- public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
236
- {
237
- for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
238
- {
239
- String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
240
- for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
241
- {
242
- String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
243
-
244
- /** Chromosome recognition */
245
- ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation");
246
- for (int k = 0 ; k < locations.size() ; k++)
247
- {
248
- String anno[]=locations.get(k).split("\t");
249
- //int start= Integer.parseInt(anno[0]);
250
- //int last= Integer.parseInt(anno[1]);
251
- //String mention = anno[2];
252
- String ids = anno[3];
253
- //GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tChromosomeLocation\t"+ids); //paragraph
254
- String IDs[] = ids.split("[\\|,]");
255
- for(int idcount=0;idcount<IDs.length;idcount++)
256
- {
257
- //IDs[idcount] = IDs[idcount].replaceAll("\\-[0-9]+", "");
258
- GNormPlus.Pmid2ChromosomeGene_hash.put(Pmid+"\t"+IDs[idcount],"");
259
- }
260
- }
261
- }
262
- }
263
- //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
264
- }
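The hash populated above is consulted later during gene-ID refinement (rule 3 in GeneNormalization); its keys pair a PMID with a gene ID recognized via chromosome location (PMID and ID illustrative):

    // key = "<PMID>\t<GeneID>", value = ""
    GNormPlus.Pmid2ChromosomeGene_hash.containsKey("10022765\t672")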
265
-
266
- public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException
267
- {
268
- final DecimalFormat df = new DecimalFormat("0.####");
269
- df.setRoundingMode(RoundingMode.HALF_UP);
270
-
271
- //Tokenization
272
- for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
273
- {
274
- String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
275
-
276
- /** Species */
277
- HashMap<String,String> Species_hash = new HashMap<String,String>();
278
- for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
279
- {
280
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
281
- {
282
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
283
- String mentions=anno[2];
284
- String type=anno[3];
285
- if(type.matches("(Species|Genus|Strain|CellLine|Cell)"))
286
- {
287
- Species_hash.put(mentions,"");
288
- }
289
- }
290
- }
291
-
292
-
293
- /*
294
- * Collect Gene mentions :
295
- *
296
- * GeneMention-taxid -> "ID" : geneid
297
- * -> "type" : "Gene"
298
- * -> start1-last1 : ""
299
- * -> start2-last2 : ""
300
- * -> start3-last3 : ""
301
- */
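An illustrative entry, once the refinement steps below have attached an ID (mention, taxon and offsets are made up):

    // GeneMention_hash.get("BRCA1\t9606") ->
    //   { "type" : "Gene", "ID" : "672", "120\t125" : "", "310\t315" : "" }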
302
-
303
- String tiabs="";
304
- for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
305
- {
306
- tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase();
307
- }
308
- HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>();
309
- HashMap<String,String> Mention_hash = new HashMap<String,String>();
310
- for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
311
- {
312
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
313
- {
314
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
315
- String start=anno[0];
316
- String last=anno[1];
317
- String mentions=anno[2];
318
- String type=anno[3];
319
- String taxids="Tax:9606";
320
-
321
- if(anno.length>=5)
322
- {
323
- taxids=anno[4];
324
- }
325
- String mentions_tmp=mentions.toLowerCase();
326
- mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
327
- mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
328
- taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
329
- if(taxids.equals(""))
330
- {
331
- taxids="9606";
332
- }
333
- /** Filtering */
334
- boolean found_filter = false;
335
- if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering
336
- {
337
- found_filter=true;
338
- }
339
-
340
- if(found_filter==false) //abbreviation
341
- {
342
- for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet())
343
- {
344
- if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") ||
345
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*")
346
- )
347
- {
348
- String lf=GNormPlus.Filtering_WithLongForm_hash.get(f);
349
- if(tiabs.matches(".*"+lf+".*"))
350
- {
351
- found_filter=true;
352
- break;
353
- }
354
- }
355
- }
356
- }
357
-
358
- if(found_filter==false)
359
- {
360
- if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
361
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's
362
- )
363
- {
364
- found_filter=true;
365
-
366
- }
367
- }
368
-
369
- if(found_filter == false)
370
- {
371
- if(type.matches("Gene"))
372
- {
373
- if(GeneMention_hash.containsKey(mentions+"\t"+taxids))
374
- {
375
- GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,"");
376
- }
377
- else
378
- {
379
- HashMap<String,String> offset_hash = new HashMap<String,String>();
380
- offset_hash.put(start+"\t"+last,"");
381
- GeneMention_hash.put(mentions+"\t"+taxids, offset_hash);
382
- GeneMention_hash.get(mentions+"\t"+taxids).put("type", type);
383
- Mention_hash.put(mentions,"Gene");
384
- }
385
- }
386
- else if(type.matches("(FamilyName|DomainMotif)"))
387
- {
388
- String GMs[]=mentions.split("\\|");
389
- for(int g=0;g<GMs.length;g++)
390
- {
391
- String mention = GMs[g];
392
- Mention_hash.put(mention,"FamilyDomain");
393
- }
394
- }
395
- }
396
-
397
- }
398
- }
399
-
400
- /*
401
- * Gene id refinement:
402
- * 1. Official name
403
- * 2. only one gene
404
- */
405
- HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>();
406
- HashMap<String,String> MultiGene2ID = new HashMap<String,String>();
407
- for(String GeneMentionTax : GeneMention_hash.keySet())
408
- {
409
- String GT[]=GeneMentionTax.split("\\t");
410
- String mentions=GT[0];
411
- String taxids=GT[1];
412
- String GMs[]=mentions.split("\\|");
413
-
414
- HashMap<String,String> taxids_hash = new HashMap<String,String>();
415
- String taxids_arr[]=taxids.split(",");
416
- for(int t=0;t<taxids_arr.length;t++)
417
- {
418
- taxids_hash.put(taxids_arr[t], "");
419
- }
420
-
421
- for(int ms=0;ms<GMs.length;ms++)
422
- {
423
- String mention = GMs[ms];
424
- String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
425
- String IDs[]=IDstr.split("\\|");
426
-
427
- /*
428
- * printing the ambiguous gene mentions and candidates
429
- */
430
- //String IDs_s[]=IDstr.split(",");
431
- //if(IDs_s.length>1)
432
- //{
433
- // System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
434
- //}
435
-
436
- for(int c=0;c<IDs.length;c++)
437
- {
438
- String tax2ID[]=IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
439
- if(taxids_hash.containsKey(tax2ID[0]))
440
- {
441
- String geneid=tax2ID[1];
442
- String TargetTax=tax2ID[0];
443
- GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
444
- GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
445
- break;
446
- }
447
- }
448
-
449
- //geneid refinement
450
- if(GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
451
- {
452
- Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
453
- Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
454
-
455
- if(mtmp.find()) // 1. Official Name
456
- {
457
- GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1));
458
- GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1));
459
- }
460
- else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
461
- {
462
- GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID"));
463
- }
464
- else
465
- {
466
- String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
467
- boolean FoundByChroLoca=false;
468
- for(int idcount=0;idcount<ID.length;idcount++)
469
- {
470
- if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) // 3. Chromosome location
471
- {
472
- GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]);
473
- FoundByChroLoca=true;
474
- break;
475
- }
476
- }
477
- if(FoundByChroLoca == false)
478
- {
479
- MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
480
- }
481
- }
482
- }
483
- if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3")))
484
- {
485
- break;
486
- }
487
- }
488
- }
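Inferred from how IDstr is consumed above, PT_Gene.MentionMatch appears to return '|'-separated tax:geneid alternatives, with commas separating ambiguous candidates, '*' marking an official-name hit (rule 1), and "-1"/"-2"/"-3" as no-match sentinels:

    // "9606:672|10090:12189"   one alternative per species
    // "9606:672,675"           ambiguous; resolved by rules 3-5 below
    // "9606:*672"              official name, promoted to GuaranteedGene2ID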
489
-
490
- /*
491
- * Gene id refinement:
492
- * 3. multiple genes but can be inferred by 1. and 2.
493
- */
494
- for(String GeneMentionTax_M : MultiGene2ID.keySet())
495
- {
496
- for(String GeneMentionTax_G : GuaranteedGene2ID.keySet())
497
- {
498
- String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
499
- for(int m=0;m<MG.length;m++)
500
- {
501
- if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G)))
502
- {
503
- GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]);
504
- }
505
- }
506
- }
507
- }
508
-
509
- /*
510
- * Gene id refinement:
511
- * 4. FullName -> Abbreviation
512
- */
513
- for(String GeneMentionTax : GeneMention_hash.keySet())
514
- {
515
- String MT[] = GeneMentionTax.split("\\t");
516
- if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0]))
517
- {
518
- String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
519
- if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
520
- {
521
- GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
522
- }
523
- }
524
- }
525
-
526
- /*
527
- * Gene id refinement:
528
- * 5. Ranking by scoring function (inference network)
529
- */
530
- for(String GeneMentionTax : GeneMention_hash.keySet())
531
- {
532
- if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+"))
533
- {
534
- String geneids=GeneMention_hash.get(GeneMentionTax).get("ID");
535
- String geneid[] = geneids.split(",");
536
-
537
- String OutputStyle="Top1";
538
- if(OutputStyle.equals("Top1"))
539
- {
540
- //only return the best one
541
- double max_score=0.0;
542
- String target_geneid="";
543
- for(int g=0;g<geneid.length;g++)
544
- {
545
- String MT[] = GeneMentionTax.split("\\t");
546
- String LF="";
547
- if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
548
- {
549
- LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
550
- }
551
- double score = ScoringFunction(geneid[g],Mention_hash,LF);
552
- if(score>max_score)
553
- {
554
- max_score=score;
555
- target_geneid=geneid[g];
556
- }
557
- else if(score == 0.0)
558
- {
559
- //System.out.println(GeneMentionTax);
560
- }
561
- }
562
- GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
563
- }
564
- else // "All"
565
- {
566
- //return all geneids
567
- String geneSTR="";
568
- for(int g=0;g<geneid.length;g++)
569
- {
570
- String MT[] = GeneMentionTax.split("\\t");
571
- String LF="";
572
- if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
573
- {
574
- LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
575
- }
576
- double score = ScoringFunction(geneid[g],Mention_hash,LF);
577
- String rounded = df.format(score);
578
- score=Double.parseDouble(rounded);
579
-
580
- if(geneSTR.equals(""))
581
- {
582
- geneSTR=geneid[g]+"-"+score;
583
- }
584
- else
585
- {
586
- geneSTR=geneSTR+","+geneid[g]+"-"+score;
587
- }
588
- }
589
- GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
590
- }
591
- }
592
- }
593
-
594
- /*
595
- * Gene id refinement: - removed (Reason: cause too much False Positive)
596
- * 6. Abbreviation -> FullName
597
- *
598
- */
599
- for(String GeneMentionTax : GeneMention_hash.keySet())
600
- {
601
- String MT[] = GeneMentionTax.split("\\t");
602
- if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
603
- {
604
- String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
605
- if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
606
- {
607
- GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
608
- }
609
- }
610
- }
611
-
612
- /*
613
- * Gene id refinement:
614
- * 7. The inference network tokens of Abbreviation.ID should contain at least LF tokens
615
- * 8. The short mention should be filtered if not long form support
616
- */
617
- ArrayList<String> removeGMT = new ArrayList<String>();
618
- for(String GeneMentionTax : GeneMention_hash.keySet())
619
- {
620
- String GT[]=GeneMentionTax.split("\\t");
621
- String mentions=GT[0];
622
- String tax=GT[1];
623
- if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
624
- {
625
- String type = GeneMention_hash.get(GeneMentionTax).get("type");
626
- String id = GeneMention_hash.get(GeneMentionTax).get("ID");
627
- String geneid="";
628
- Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
629
- Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
630
- Matcher mtmp1 = ptmp1.matcher(id);
631
- Matcher mtmp2 = ptmp2.matcher(id);
632
- //System.out.println(id);
633
- if(mtmp1.find())
634
- {
635
- geneid = "Homo:"+mtmp1.group(2);
636
- }
637
- else if(mtmp2.find())
638
- {
639
- geneid = "Gene:"+mtmp2.group(1);
640
- }
641
-
642
- boolean LongFormTknMatch= false;
643
- boolean LongFormExist= true;
644
- if(GNormPlus.GeneScoring_hash.containsKey(geneid))
645
- {
646
- if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase()))
647
- {
648
- /*
649
- * token in lexicon : tkn_lexicon
650
- * token in mention : tkn_mention
651
- */
652
- String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
653
- String tkns_Gene[] = l[0].split(",");
654
- ArrayList<String> tkn_lexicon = new ArrayList<String>();
655
- for(int ti=0;ti<tkns_Gene.length;ti++)
656
- {
657
- String Tkn_Freq[] = tkns_Gene[ti].split("-");
658
- tkn_lexicon.add(Tkn_Freq[0]);
659
- }
660
-
661
- String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase());
662
- LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
663
- LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
664
- String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
665
- for(int tl=0;tl<tkn_lexicon.size();tl++)
666
- {
667
- for(int tm=0;tm<tkn_mention.length;tm++)
668
- {
669
- if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+")))
670
- {
671
- LongFormTknMatch = true;
672
- }
673
- }
674
- }
675
- }
676
- else{LongFormExist = false;}
677
- }
678
- else{LongFormTknMatch = true;} // exception
679
-
680
- if(LongFormTknMatch == false && LongFormExist == true) // 7.
681
- {
682
- removeGMT.add(GeneMentionTax); //remove short form
683
- removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); //remove long form
684
- }
685
- else if(mentions.length()<=2 && LongFormExist == false) // 8.
686
- {
687
- removeGMT.add(GeneMentionTax);
688
- }
689
- }
690
- }
691
-
692
- for(int gmti=0;gmti<removeGMT.size();gmti++) // remove
693
- {
694
- GeneMention_hash.remove(removeGMT.get(gmti));
695
- }
696
-
697
- // Append gene ids
698
- for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) // Paragraphs : j
699
- {
700
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
701
- {
702
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
703
- String start=anno[0];
704
- String last=anno[1];
705
- String mentions=anno[2];
706
- String type=anno[3];
707
- String taxid_org="Tax:9606";
708
- if(anno.length>=5)
709
- {
710
- taxid_org=anno[4];
711
- }
712
- String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
713
- String GMs[]=mentions.split("\\|");
714
-
715
- if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax"))
716
- {
717
- String taxtype=taxid_org.replaceAll(":([0-9,]+)","");
718
- String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax");
719
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid);
720
- }
721
-
722
- if(type.equals("Gene"))
723
- {
724
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|");
725
-
726
-
727
- if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID"))
728
- {
729
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," );
730
- }
731
- else // cannot find appropriate species
732
- {
733
- //System.out.println(mention+"\t"+taxid);
734
- }
735
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$"
736
- }
737
- }
738
- }
739
-
740
- //Extend to all gene mentions
741
- HashMap<String,String> GeneMentions = new HashMap<String,String>(); // Extending Gene mentions
742
- HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); // Extending Gene mentions
743
- for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
744
- {
745
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
746
- {
747
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
748
- int start = Integer.parseInt(anno[0]);
749
- int last = Integer.parseInt(anno[1]);
750
- String mentions=anno[2];
751
- String type=anno[3];
752
- String id="Tax:9606";
753
- if(anno.length>=5)
754
- {
755
- id=anno[4];
756
- }
757
- if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)"))
758
- {
759
- GeneMentions.put(mentions.toLowerCase(), id);
760
- for (int s=start ;s<=last;s++)
761
- {
762
- GeneMentionLocation.put(j+"\t"+s,"");
763
- }
764
- }
765
- else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)"))
766
- {
767
- GeneMentions.put(mentions.toLowerCase(), id);
768
- for (int s=start ;s<=last;s++)
769
- {
770
- GeneMentionLocation.put(j+"\t"+s,"");
771
- }
772
- }
773
- }
774
- }
775
- for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
776
- {
777
- if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
778
- {
779
- String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
780
- String PassageContexts_tmp = PassageContexts.toLowerCase();
781
- for(String gm : GeneMentions.keySet())
782
- {
783
- String id = GeneMentions.get(gm);
784
- if(gm.length()>=3)
785
- {
786
- gm = gm.replaceAll("[ ]*[\\|]*$", "");
787
- gm = gm.replaceAll("^[\\|]*[ ]*", "");
788
- gm = gm.replaceAll("[\\|][\\|]+", "\\|");
789
- if(!gm.matches("[\\W\\-\\_]*"))
790
- {
791
- gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
792
- Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
793
- Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
794
- while(mtmp.find())
795
- {
796
- String pre = mtmp.group(1);
797
- String gmtmp = mtmp.group(2);
798
- String post = mtmp.group(3);
799
-
800
- int start = pre.length()-1;
801
- int last = start+gmtmp.length();
802
- if(PassageContexts.length()>=last+1)
803
- {
804
- String mention = PassageContexts.substring(start+1,last+1);
805
- if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last))
806
- {
807
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id);
808
- }
809
- }
810
- gmtmp = gmtmp.replaceAll(".", "\\@");
811
- PassageContexts_tmp=pre+""+gmtmp+""+post;
812
- mtmp = ptmp.matcher(PassageContexts_tmp);
813
- }
814
- }
815
- }
816
- }
817
- }
818
- }
819
-
820
- //Apply to FamilyNames
821
- HashMap<String,String> geneids = new HashMap<String,String>(); // Extending Gene mentions
822
- for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
823
- {
824
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
825
- {
826
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
827
- String type=anno[3];
828
- if(type.equals("Gene"))
829
- {
830
- String id="Tax:9606";
831
- if(anno.length>=5)
832
- {
833
- id=anno[4];
834
- }
835
- Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
836
- Matcher mtmp0 = ptmp0.matcher(id);
837
- Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
838
- Matcher mtmp1 = ptmp1.matcher(id);
839
- if(mtmp0.find())
840
- {
841
- geneids.put(mtmp0.group(3), "");
842
- }
843
- if(mtmp1.find())
844
- {
845
- geneids.put(mtmp1.group(3), "");
846
- }
847
- }
848
- }
849
- }
850
- for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
851
- {
852
- for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
853
- {
854
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
855
- String mention=anno[2];
856
- String type=anno[3];
857
- if(type.matches("(FamilyName|DomainMotif)"))
858
- {
859
- String id="Tax:9606";
860
- if(anno.length>=5)
861
- {
862
- id=anno[4];
863
- }
864
- String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
865
- String IDstr[]=IDstrs.split("\\|");
866
- String ids="";
867
- for(int id_i=0;id_i<IDstr.length;id_i++)
868
- {
869
- if(geneids.containsKey(IDstr[id_i]))
870
- {
871
- if(ids.equals(""))
872
- {
873
- ids=IDstr[id_i];
874
- }
875
- else
876
- {
877
- ids=ids+";"+IDstr[id_i];
878
- }
879
- }
880
- }
881
- if(!ids.equals(""))
882
- {
883
- if(type.equals("FamilyName")){type="Gene";}
884
- String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606";
885
- if(anno.length>=5)
886
- {
887
- Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4];
888
- }
889
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids);
890
- }
891
- else
892
- {
893
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
894
- }
895
- }
896
- }
897
- }
898
- //Species "*" and "(anti)" removed.
899
- for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
900
- {
901
- for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
902
- {
903
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
904
- String type=anno[3];
905
- if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell"))
906
- {
907
- String id=anno[4];
908
- id=id.replaceAll("\\*", "");
909
- id=id.replaceAll("\\(anti\\)", "");
910
- String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id;
911
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k);
912
- }
913
- }
914
- }
915
-
916
- for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
917
- {
918
-
919
- for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
920
- {
921
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
922
- int start = Integer.parseInt(anno[0]);
923
- int last = Integer.parseInt(anno[1]);
924
- String mention = anno[2];
925
- String type = anno[3];
926
- String id = anno[4];
927
- if(type.equals("Gene") && Species_hash.containsKey(mention))
928
- {
929
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
930
- }
931
- else if(type.equals("Gene") && id.equals(""))
932
- {
933
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
934
- }
935
- else
936
- {
937
- for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k
938
- {
939
- if(k1 != k)
940
- {
941
- String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t");
942
- int start1 = Integer.parseInt(anno1[0]);
943
- int last1 = Integer.parseInt(anno1[1]);
944
- if((start1<start && last1>=last) || (start1<=start && last1>last))
945
- {
946
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
947
- break;
948
- }
949
- }
950
- }
951
- }
952
- }
953
- }
954
- }
955
- if(GeneIDMatch == true)
956
- {
957
- //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
958
- }
959
- else
960
- {
961
- GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
962
- }
963
- }
964
- /*
965
- * Search Potential GeneID in the Prefix Tree
966
- */
967
- public ArrayList<String> SearchGeneIDLocation(String Doc)
968
- {
969
- ArrayList<String> location = new ArrayList<String>();
970
-
971
- String Doc_tmp=" "+Doc+" ";
972
- Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
973
- Matcher mtmp = ptmp.matcher(Doc_tmp);
974
- while(mtmp.find())
975
- {
976
- String str1=mtmp.group(1);
977
- String str2=mtmp.group(2);
978
- String str3=mtmp.group(3);
979
- for(int m=str1.length();m<=(str1.length()+str2.length());m++)
980
- {
981
- int start = str1.length()-1;
982
- int last = start+str2.length();
983
- String mention = Doc.substring(start, last);
984
- if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*"))
985
- {
986
- if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) )
987
- {
988
- Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
989
- Matcher mtmp1 = ptmp1.matcher(mention);
990
- Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");
991
- Matcher mtmp2 = ptmp2.matcher(mention);
992
- if(mtmp1.find())
993
- {
994
- String S1 = mtmp1.group(1);
995
- if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6)
996
- {
997
- int Num1 = Integer.parseInt(mtmp1.group(2));
998
- int Num2 = Integer.parseInt(mtmp1.group(3));
999
- String prefix = "";
1000
- Pattern ptmp3 = Pattern.compile("^([0]+)");
1001
- Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2));
1002
- if(mtmp3.find())
1003
- {
1004
- prefix = mtmp3.group(1);
1005
- }
1006
- if(Num2-Num1>0 && (Num2-Num1<=20))
1007
- {
1008
- for(int n=Num1;n<=Num2;n++)
1009
- {
1010
- String StrNum=S1+prefix+n;
1011
- if(StrNum.length()>=5)
1012
- {
1013
- location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
1014
- }
1015
- }
1016
- }
1017
- }
1018
- }
1019
- else if(mtmp2.find())
1020
- {
1021
- if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6)
1022
- {
1023
- String S1 = mtmp2.group(1);
1024
- int Num1 = Integer.parseInt(mtmp2.group(2));
1025
- String S2 = mtmp2.group(3);
1026
- int Num2 = Integer.parseInt(mtmp2.group(4));
1027
- if(S1.equals(S2))
1028
- {
1029
- String prefix = "";
1030
- Pattern ptmp3 = Pattern.compile("^([0]+)");
1031
- Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2));
1032
- if(mtmp3.find())
1033
- {
1034
- prefix = mtmp3.group(1);
1035
- }
1036
- if(Num2-Num1>0 && (Num2-Num1<=20))
1037
- {
1038
- for(int n=Num1;n<=Num2;n++)
1039
- {
1040
- String StrNum=S1+prefix+n;
1041
- if(StrNum.length()>=5)
1042
- {
1043
- location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
1044
- }
1045
- }
1046
- }
1047
- }
1048
- }
1049
- }
1050
- }
1051
- location.add(start+"\t"+last+"\t"+mention+"\tGeneID");
1052
- }
1053
- }
1054
- String men="";
1055
- for(int m=0;m<str2.length();m++){men=men+"@";}
1056
- Doc_tmp=str1+men+str3;
1057
- mtmp = ptmp.matcher(Doc_tmp);
1058
- }
1059
- return location;
1060
- }
1061
- public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
1062
- {
1063
- for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
1064
- {
1065
- for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
1066
- {
1067
- String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
1068
- /** GeneID recognition by pattern match */
1069
- ArrayList<String> locations = SearchGeneIDLocation(PassageContext);
1070
- for (int k = 0 ; k < locations.size() ; k++)
1071
- {
1072
- String anno[]=locations.get(k).split("\t");
1073
- String mention = anno[2].toLowerCase();
1074
- mention = mention.replaceAll("[\\W\\-\\_]+", "");
1075
- if(GNormPlus.GeneIDs_hash.containsKey(mention))
1076
- {
1077
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); //paragraph
1078
- }
1079
- }
1080
- }
1081
- }
1082
- GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
1083
- }
1084
  }
 
+ /**
+  * Project: GNormPlus
+  * Function: Gene Normalization
+  */
+
+ package GNormPluslib;
+
+ import bioc.BioCAnnotation;
+ import bioc.BioCCollection;
+ import bioc.BioCDocument;
+ import bioc.BioCLocation;
+ import bioc.BioCPassage;
+
+ import bioc.io.BioCDocumentWriter;
+ import bioc.io.BioCFactory;
+ import bioc.io.woodstox.ConnectorWoodstox;
+ import java.io.BufferedReader;
+ import java.io.BufferedWriter;
+ import java.io.FileInputStream;
+ import java.io.FileOutputStream;
+ import java.io.FileReader;
+ import java.io.FileWriter;
+ import java.io.IOException;
+ import java.io.InputStreamReader;
+ import java.io.OutputStreamWriter;
+ import java.text.BreakIterator;
+ import java.time.LocalDate;
+ import java.time.ZoneId;
+ import java.text.DecimalFormat;
+ import java.math.RoundingMode;
+
+ import javax.xml.stream.XMLStreamException;
+
+ import java.util.Map;
+ import java.util.regex.Matcher;
+ import java.util.regex.Pattern;
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Locale;
+
+ public class GN
+ {
+ 	public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>();
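+ 	// Candidate-ranking score ("inference network") for one gene id against all
+ 	// mentions recognized in the document: for every token shared between the
+ 	// gene's lexicon entry and a mention, it adds TF * IDF * 1/(1-TF), where TF
+ 	// is the token's frequency within the lexicon entry and IDF comes from
+ 	// GeneScoringDF_hash; one extra point is added per matching long-form (LF) token.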
+ 	private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF)
+ 	{
+ 		/*
+ 		 * define gene/homo id
+ 		 */
+
+ 		//LF
+ 		LF = LF.toLowerCase();
+ 		LF = LF.replaceAll("([0-9])([a-z])", "$1 $2");
+ 		LF = LF.replaceAll("([a-z])([0-9])", "$1 $2");
+ 		LF = LF.replaceAll("([\\W\\-\\_])", " ");
+ 		LF = LF.replaceAll("[ ]+", " ");
+ 		String LF_tkn[]=LF.split(" ");
+ 		int LF_ParticalMatch = 0;
+
+ 		Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)");
+ 		Matcher mtmp = ptmp.matcher(geneid);
+ 		Pattern ptmp2 = Pattern.compile("([0-9]+)");
+ 		Matcher mtmp2 = ptmp2.matcher(geneid);
+ 		if(mtmp.find())
+ 		{
+ 			geneid = "Homo:"+mtmp.group(1);
+ 		}
+ 		else
+ 		{
+ 			geneid = "Gene:"+geneid;
+ 		}
+
+ 		if(GNormPlus.GeneScoring_hash.containsKey(geneid))
+ 		{
+ 			HashMap<String,Double> TF = new HashMap<String,Double>(); // token i in gene j
+ 			HashMap<String,Double> TermFrequency = new HashMap<String,Double>();
+
+ 			/*
+ 			 * Tokens in Query (Gene id lexicon)
+ 			 */
+ 			String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
+ 			String tkns_Gene[] = l[0].split(",");
+ 			for(int i=0;i<tkns_Gene.length;i++)
+ 			{
+ 				String Tkn_Freq[] = tkns_Gene[i].split("-");
+ 				TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
+ 			}
+ 			Double Cj = Double.parseDouble(l[1]);
+ 			Double AllTknNum = Double.parseDouble(l[2]);
+ 			//Double Cj_max = Double.parseDouble(l[3]);
+ 			//Double MaxTknNum = Double.parseDouble(l[4]);
+ 			Double Norm = Double.parseDouble(l[5]);
+ 			if(Norm == 0.0){Norm=1.0;}
+
+ 			/*
+ 			 * Tokens in Document (recognized mentions)
+ 			 */
+ 			for(String Mention : Mention_hash.keySet())
+ 			{
+ 				Mention = Mention.toLowerCase();
+ 				Mention = Mention.replaceAll("([0-9])([a-z])", "$1 $2");
+ 				Mention = Mention.replaceAll("([a-z])([0-9])", "$1 $2");
+ 				Mention = Mention.replaceAll("([\\W\\-\\_])", " ");
+ 				Mention = Mention.replaceAll("[ ]+", " ");
+ 				String tkns_Mention[]=Mention.split(" ");
+ 				for(int i=0;i<tkns_Mention.length;i++)
+ 				{
+ 					if(TermFrequency.containsKey(tkns_Mention[i]))
+ 					{
+ 						TF.put(tkns_Mention[i], TermFrequency.get(tkns_Mention[i]));
+ 					}
+ 				}
+ 			}
+
+ 			Double score=0.0;
+ 			for(String Tkn : TF.keySet())
+ 			{
+ 				//LF
+ 				for(int t=0;t<LF_tkn.length;t++)
+ 				{
+ 					if(LF_tkn[t].equals(Tkn))
+ 					{
+ 						LF_ParticalMatch++;
+ 					}
+ 				}
+
+ 				double TFij = TF.get(Tkn)/AllTknNum;
+ 				double IDFi=GNormPlus.GeneScoringDF_hash.get(Tkn);
+ 				score=score+TFij*IDFi*(1/(1-TFij));
+ 			}
+ 			//score = Cj * (1/Norm) *score;
+ 			if(LF_ParticalMatch>0){score = score + LF_ParticalMatch;/*System.out.println(geneid+"\t"+LF+"\t"+score);*/}
+ 			return score;
+ 		}
+ 		else
+ 		{
+ 			//System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash");
+ 			return 0.0;
+ 		}
+ 	}
+
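+ 	// Expands each gene mention with common orthographic variants before lookup:
+ 	// a trailing "p" or "nu" is dropped, "alpha"/"beta" are interchanged with
+ 	// "a"/"b", and roman "II"/"III" with arabic "2"/"3"; each variant is appended
+ 	// to the mention string after a "|".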
+ 	public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException
+ 	{
+ 		for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
+ 		{
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
+ 			{
+ 				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String start=anno[0];
+ 					String last=anno[1];
+ 					String mentions=anno[2];
+ 					String type=anno[3];
+ 					String id="";
+ 					if(anno.length>=5)
+ 					{
+ 						id=anno[4];
+ 					}
+
+ 					if(type.equals("Gene"))
+ 					{
+ 						String mentionArr[] = mentions.split("\\|");
+ 						boolean update=false;
+ 						for(int m=0;m<mentionArr.length;m++)
+ 						{
+ 							Pattern ptmp = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
+ 							Matcher mtmp = ptmp.matcher(mentionArr[m]);
+ 							Pattern ptmp2 = Pattern.compile("^(.+)nu$");
+ 							Matcher mtmp2 = ptmp2.matcher(mentionArr[m]);
+ 							Pattern ptmp3 = Pattern.compile("^(.*)alpha(.*)$");
+ 							Matcher mtmp3 = ptmp3.matcher(mentionArr[m]);
+ 							Pattern ptmp4 = Pattern.compile("^(.*)beta(.*)$");
+ 							Matcher mtmp4 = ptmp4.matcher(mentionArr[m]);
+ 							Pattern ptmp5 = Pattern.compile("^(.+[0-9])a$");
+ 							Matcher mtmp5 = ptmp5.matcher(mentionArr[m]);
+ 							Pattern ptmp6 = Pattern.compile("^(.+[0-9])b$");
+ 							Matcher mtmp6 = ptmp6.matcher(mentionArr[m]);
+ 							Pattern ptmp7 = Pattern.compile("^(.+)II([a-z])$");
+ 							Matcher mtmp7 = ptmp7.matcher(mentionArr[m]);
+ 							Pattern ptmp8 = Pattern.compile("^(.+)III([a-z])$");
+ 							Matcher mtmp8 = ptmp8.matcher(mentionArr[m]);
+ 							if(mtmp.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp.group(1);
+ 								update=true;
+ 							}
+ 							if(mtmp2.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp2.group(1);
+ 								update=true;
+ 							}
+ 							if(mtmp3.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp3.group(1)+"a"+mtmp3.group(2);
+ 								update=true;
+ 							}
+ 							if(mtmp4.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp4.group(1)+"b"+mtmp4.group(2);
+ 								update=true;
+ 							}
+ 							if(mtmp5.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp5.group(1)+"alpha";
+ 								update=true;
+ 							}
+ 							if(mtmp6.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp6.group(1)+"beta";
+ 								update=true;
+ 							}
+ 							if(mtmp7.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp7.group(1)+"2"+mtmp7.group(2);
+ 								update=true;
+ 							}
+ 							if(mtmp8.find())
+ 							{
+ 								mentions=mentions+"|"+mtmp8.group(1)+"3"+mtmp8.group(2);
+ 								update=true;
+ 							}
+ 						}
+ 						if(update == true)
+ 						{
+ 							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id );
+ 						}
+ 					}
+ 				}
+ 			}
+ 		}
+ 		//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
+ 	}
+
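+ 	// Records, per PMID, the gene ids attached to chromosome-location mentions
+ 	// found by the PT_GeneChromosome prefix tree; GeneNormalization consults this
+ 	// table as refinement step 3 (chromosome location).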
+ 	public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
+ 	{
+ 		for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
+ 		{
+ 			String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
+ 			{
+ 				String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
+
+ 				/** Chromosome recognition */
+ 				ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation");
+ 				for (int k = 0 ; k < locations.size() ; k++)
+ 				{
+ 					String anno[]=locations.get(k).split("\t");
+ 					//int start= Integer.parseInt(anno[0]);
+ 					//int last= Integer.parseInt(anno[1]);
+ 					//String mention = anno[2];
+ 					String ids = anno[3];
+ 					//GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tChromosomeLocation\t"+ids); //paragraph
+ 					String IDs[] = ids.split("[\\|,]");
+ 					for(int idcount=0;idcount<IDs.length;idcount++)
+ 					{
+ 						//IDs[idcount] = IDs[idcount].replaceAll("\\-[0-9]+", "");
+ 						GNormPlus.Pmid2ChromosomeGene_hash.put(Pmid+"\t"+IDs[idcount],"");
+ 					}
+ 				}
+ 			}
+ 		}
+ 		//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
+ 	}
+
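+ 	// Assigns gene ids to the recognized gene mentions of each document: mentions
+ 	// are filtered, matched against the PT_Gene prefix tree, and ambiguous
+ 	// candidates are refined in stages (official symbol, single candidate,
+ 	// chromosome location, full name <-> abbreviation, ranking by ScoringFunction,
+ 	// long-form support) before the ids are written back into BioCDocobj.Annotations.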
+ 	public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException
+ 	{
+ 		final DecimalFormat df = new DecimalFormat("0.####");
+ 		df.setRoundingMode(RoundingMode.HALF_UP);
+
+ 		//Tokenization
+ 		for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
+ 		{
+ 			String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
+
+ 			/** Species */
+ 			HashMap<String,String> Species_hash = new HashMap<String,String>();
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
+ 			{
+ 				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String mentions=anno[2];
+ 					String type=anno[3];
+ 					if(type.matches("(Species|Genus|Strain|CellLine|Cell)"))
+ 					{
+ 						Species_hash.put(mentions,"");
+ 					}
+ 				}
+ 			}
+
+
+ 			/*
+ 			 * Collect Gene mentions :
+ 			 *
+ 			 * GeneMention-taxid -> "ID" : geneid
+ 			 *                   -> "type" : "Gene"
+ 			 *                   -> start1-last1 : ""
+ 			 *                   -> start2-last2 : ""
+ 			 *                   -> start3-last3 : ""
+ 			 */
+
+ 			String tiabs="";
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
+ 			{
+ 				tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase();
+ 			}
+ 			HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>();
+ 			HashMap<String,String> Mention_hash = new HashMap<String,String>();
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
+ 			{
+ 				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String start=anno[0];
+ 					String last=anno[1];
+ 					String mentions=anno[2];
+ 					String type=anno[3];
+ 					String taxids="Tax:9606";
+
+ 					if(anno.length>=5)
+ 					{
+ 						taxids=anno[4];
+ 					}
+ 					String mentions_tmp=mentions.toLowerCase();
+ 					mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
+ 					mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
+ 					taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
+ 					if(taxids.equals(""))
+ 					{
+ 						taxids="9606";
+ 					}
+ 					/** Filtering */
+ 					boolean found_filter = false;
+ 					if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering
+ 					{
+ 						found_filter=true;
+ 					}
+
+ 					if(found_filter==false) //abbreviation
+ 					{
+ 						for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet())
+ 						{
+ 							if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") ||
+ 							    GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*")
+ 							  )
+ 							{
+ 								String lf=GNormPlus.Filtering_WithLongForm_hash.get(f);
+ 								if(tiabs.matches(".*"+lf+".*"))
+ 								{
+ 									found_filter=true;
+ 									break;
+ 								}
+ 							}
+ 						}
+ 					}
+
+ 					if(found_filter==false)
+ 					{
+ 						if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
+ 						    GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's
+ 						  )
+ 						{
+ 							found_filter=true;
+
+ 						}
+ 					}
+
+ 					if(found_filter == false)
+ 					{
+ 						if(type.matches("Gene"))
+ 						{
+ 							if(GeneMention_hash.containsKey(mentions+"\t"+taxids))
+ 							{
+ 								GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,"");
+ 							}
+ 							else
+ 							{
+ 								HashMap<String,String> offset_hash = new HashMap<String,String>();
+ 								offset_hash.put(start+"\t"+last,"");
+ 								GeneMention_hash.put(mentions+"\t"+taxids, offset_hash);
+ 								GeneMention_hash.get(mentions+"\t"+taxids).put("type", type);
+ 								Mention_hash.put(mentions,"Gene");
+ 							}
+ 						}
+ 						else if(type.matches("(FamilyName|DomainMotif)"))
+ 						{
+ 							String GMs[]=mentions.split("\\|");
+ 							for(int g=0;g<GMs.length;g++)
+ 							{
+ 								String mention = GMs[g];
+ 								Mention_hash.put(mention,"FamilyDomain");
+ 							}
+ 						}
+ 					}
+
+ 				}
+ 			}
+
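+ 			// MentionMatch returns candidates as "taxid:geneid" alternatives separated
+ 			// by "|"; a leading "*" on an id marks an official-symbol match, and an
+ 			// "id1-id2" pair is scored under the key "Homo:id2" (see ScoringFunction).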
+ 			/*
+ 			 * Gene id refinement:
+ 			 * 1. Official name
+ 			 * 2. only one gene
+ 			 */
+ 			HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>();
+ 			HashMap<String,String> MultiGene2ID = new HashMap<String,String>();
+ 			for(String GeneMentionTax : GeneMention_hash.keySet())
+ 			{
+ 				String GT[]=GeneMentionTax.split("\\t");
+ 				String mentions=GT[0];
+ 				String taxids=GT[1];
+ 				String GMs[]=mentions.split("\\|");
+
+ 				HashMap<String,String> taxids_hash = new HashMap<String,String>();
+ 				String taxids_arr[]=taxids.split(",");
+ 				for(int t=0;t<taxids_arr.length;t++)
+ 				{
+ 					taxids_hash.put(taxids_arr[t], "");
+ 				}
+
+ 				for(int ms=0;ms<GMs.length;ms++)
+ 				{
+ 					String mention = GMs[ms];
+ 					String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
+ 					String IDs[]=IDstr.split("\\|");
+
+ 					/*
+ 					 * printing the ambiguous gene mentions and candidates
+ 					 */
+ 					//String IDs_s[]=IDstr.split(",");
+ 					//if(IDs_s.length>1)
+ 					//{
+ 					//	System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
+ 					//}
+
+ 					for(int c=0;c<IDs.length;c++)
+ 					{
+ 						String tax2ID[]=IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
+ 						if(taxids_hash.containsKey(tax2ID[0]))
+ 						{
+ 							String geneid=tax2ID[1];
+ 							String TargetTax=tax2ID[0];
+ 							GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
+ 							GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
+ 							break;
+ 						}
+ 					}
+
+ 					//geneid refinement
+ 					if(GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
+ 					{
+ 						Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
+ 						Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
+
+ 						if(mtmp.find()) // 1. Official Name
+ 						{
+ 							GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1));
+ 							GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1));
+ 						}
+ 						else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
+ 						{
+ 							GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID"));
+ 						}
+ 						else
+ 						{
+ 							String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
+ 							boolean FoundByChroLoca=false;
+ 							for(int idcount=0;idcount<ID.length;idcount++)
+ 							{
+ 								if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) // 3. Chromosome location
+ 								{
+ 									GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]);
+ 									FoundByChroLoca=true;
+ 									break;
+ 								}
+ 							}
+ 							if(FoundByChroLoca == false)
+ 							{
+ 								MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
+ 							}
+ 						}
+ 					}
+ 					if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3")))
+ 					{
+ 						break;
+ 					}
+ 				}
+ 			}
+
+ 			/*
+ 			 * Gene id refinement:
+ 			 * 3. multiple genes but can be inferred by 1. and 2.
+ 			 */
+ 			for(String GeneMentionTax_M : MultiGene2ID.keySet())
+ 			{
+ 				for(String GeneMentionTax_G : GuaranteedGene2ID.keySet())
+ 				{
+ 					String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
+ 					for(int m=0;m<MG.length;m++)
+ 					{
+ 						if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G)))
+ 						{
+ 							GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]);
+ 						}
+ 					}
+ 				}
+ 			}
+
+ 			/*
+ 			 * Gene id refinement:
+ 			 * 4. FullName -> Abbreviation
+ 			 */
+ 			for(String GeneMentionTax : GeneMention_hash.keySet())
+ 			{
+ 				String MT[] = GeneMentionTax.split("\\t");
+ 				if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0]))
+ 				{
+ 					String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
+ 					if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
+ 					{
+ 						GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
+ 					}
+ 				}
+ 			}
+
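+ 			// Mentions still carrying a comma-separated candidate list are resolved by
+ 			// ScoringFunction below; with OutputStyle "Top1" only the best-scoring id
+ 			// is kept, while "All" would keep every candidate tagged with its score.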
+ 			/*
+ 			 * Gene id refinement:
+ 			 * 5. Ranking by scoring function (inference network)
+ 			 */
+ 			for(String GeneMentionTax : GeneMention_hash.keySet())
+ 			{
+ 				if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+"))
+ 				{
+ 					String geneids=GeneMention_hash.get(GeneMentionTax).get("ID");
+ 					String geneid[] = geneids.split(",");
+
+ 					String OutputStyle="Top1";
+ 					if(OutputStyle.equals("Top1"))
+ 					{
+ 						//only return the best one
+ 						double max_score=0.0;
+ 						String target_geneid="";
+ 						for(int g=0;g<geneid.length;g++)
+ 						{
+ 							String MT[] = GeneMentionTax.split("\\t");
+ 							String LF="";
+ 							if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
+ 							{
+ 								LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
+ 							}
+ 							double score = ScoringFunction(geneid[g],Mention_hash,LF);
+ 							if(score>max_score)
+ 							{
+ 								max_score=score;
+ 								target_geneid=geneid[g];
+ 							}
+ 							else if(score == 0.0)
+ 							{
+ 								//System.out.println(GeneMentionTax);
+ 							}
+ 						}
+ 						GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
+ 					}
+ 					else // "All"
+ 					{
+ 						//return all geneids
+ 						String geneSTR="";
+ 						for(int g=0;g<geneid.length;g++)
+ 						{
+ 							String MT[] = GeneMentionTax.split("\\t");
+ 							String LF="";
+ 							if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
+ 							{
+ 								LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
+ 							}
+ 							double score = ScoringFunction(geneid[g],Mention_hash,LF);
+ 							String hoge = df.format(score);
+ 							score=Double.parseDouble(hoge);
+
+ 							if(geneSTR.equals(""))
+ 							{
+ 								geneSTR=geneid[g]+"-"+score;
+ 							}
+ 							else
+ 							{
+ 								geneSTR=geneSTR+","+geneid[g]+"-"+score;
+ 							}
+ 						}
+ 						GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
+ 					}
+ 				}
+ 			}
+
+ 			/*
+ 			 * Gene id refinement: - removed (reason: caused too many false positives)
+ 			 * 6. Abbreviation -> FullName
+ 			 */
+ 			for(String GeneMentionTax : GeneMention_hash.keySet())
+ 			{
+ 				String MT[] = GeneMentionTax.split("\\t");
+ 				if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
+ 				{
+ 					String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
+ 					if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
+ 					{
+ 						GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
+ 					}
+ 				}
+ 			}
+
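+ 			// An abbreviation's id survives only if the gene's lexicon shares a
+ 			// non-numeric token with the abbreviation's long form (7); mentions of at
+ 			// most two characters with no long form at all are dropped (8).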
+ 			/*
+ 			 * Gene id refinement:
+ 			 * 7. The inference-network tokens of an abbreviation's ID should share at least one long-form (LF) token
+ 			 * 8. A short mention is filtered out if no long form supports it
+ 			 */
+ 			ArrayList<String> removeGMT = new ArrayList<String>();
+ 			for(String GeneMentionTax : GeneMention_hash.keySet())
+ 			{
+ 				String GT[]=GeneMentionTax.split("\\t");
+ 				String mentions=GT[0];
+ 				String tax=GT[1];
+ 				if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
+ 				{
+ 					String type = GeneMention_hash.get(GeneMentionTax).get("type");
+ 					String id = GeneMention_hash.get(GeneMentionTax).get("ID");
+ 					String geneid="";
+ 					Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
+ 					Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
+ 					Matcher mtmp1 = ptmp1.matcher(id);
+ 					Matcher mtmp2 = ptmp2.matcher(id);
+ 					//System.out.println(id);
+ 					if(mtmp1.find())
+ 					{
+ 						geneid = "Homo:"+mtmp1.group(2);
+ 					}
+ 					else if(mtmp2.find())
+ 					{
+ 						geneid = "Gene:"+mtmp2.group(1);
+ 					}
+
+ 					boolean LongFormTknMatch= false;
+ 					boolean LongFormExist= true;
+ 					if(GNormPlus.GeneScoring_hash.containsKey(geneid))
+ 					{
+ 						if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase()))
+ 						{
+ 							/*
+ 							 * token in lexicon : tkn_lexicon
+ 							 * token in mention : tkn_mention
+ 							 */
+ 							String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
+ 							String tkns_Gene[] = l[0].split(",");
+ 							ArrayList<String> tkn_lexicon = new ArrayList<String>();
+ 							for(int ti=0;ti<tkns_Gene.length;ti++)
+ 							{
+ 								String Tkn_Freq[] = tkns_Gene[ti].split("-");
+ 								tkn_lexicon.add(Tkn_Freq[0]);
+ 							}
+
+ 							String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase());
+ 							LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
+ 							LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
+ 							String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
+ 							for(int tl=0;tl<tkn_lexicon.size();tl++)
+ 							{
+ 								for(int tm=0;tm<tkn_mention.length;tm++)
+ 								{
+ 									if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+")))
+ 									{
+ 										LongFormTknMatch = true;
+ 									}
+ 								}
+ 							}
+ 						}
+ 						else{LongFormExist = false;}
+ 					}
+ 					else{LongFormTknMatch = true;} // exception
+
+ 					if(LongFormTknMatch == false && LongFormExist == true) // 7.
+ 					{
+ 						removeGMT.add(GeneMentionTax); //remove short form
+ 						removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); //remove long form
+ 					}
+ 					else if(mentions.length()<=2 && LongFormExist == false) // 8.
+ 					{
+ 						removeGMT.add(GeneMentionTax);
+ 					}
+ 				}
+ 			}
+
+ 			for(int gmti=0;gmti<removeGMT.size();gmti++) // remove
+ 			{
+ 				GeneMention_hash.remove(removeGMT.get(gmti));
+ 			}
+
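+ 			// Write the resolved ids back into the annotation lines: the species
+ 			// prefix is rewritten to the chosen target taxid, and for Gene
+ 			// annotations the id list is appended after a "|" separator.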
+ 			// Append gene ids
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) // Paragraphs : j
+ 			{
+ 				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String start=anno[0];
+ 					String last=anno[1];
+ 					String mentions=anno[2];
+ 					String type=anno[3];
+ 					String taxid_org="Tax:9606";
+ 					if(anno.length>=5)
+ 					{
+ 						taxid_org=anno[4];
+ 					}
+ 					String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
+ 					String GMs[]=mentions.split("\\|");
+
+ 					if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax"))
+ 					{
+ 						String taxtype=taxid_org.replaceAll(":([0-9,]+)","");
+ 						String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax");
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid);
+ 					}
+
+ 					if(type.equals("Gene"))
+ 					{
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|");
+
+
+ 						if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID"))
+ 						{
+ 							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," );
+ 						}
+ 						else // cannot find appropriate species
+ 						{
+ 							//System.out.println(mention+"\t"+taxid);
+ 						}
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$"
+ 					}
+ 				}
+ 			}
+
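+ 			// Re-scan the raw passage text for further occurrences of each normalized
+ 			// mention (3+ characters) and add them as new Gene annotations, skipping
+ 			// character positions already covered by an existing annotation.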
+ 			//Extend to all gene mentions
+ 			HashMap<String,String> GeneMentions = new HashMap<String,String>(); // Extending Gene mentions
+ 			HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); // Extending Gene mentions
+ 			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
+ 			{
+ 				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					int start = Integer.parseInt(anno[0]);
+ 					int last = Integer.parseInt(anno[1]);
+ 					String mentions=anno[2];
+ 					String type=anno[3];
+ 					String id="Tax:9606";
+ 					if(anno.length>=5)
+ 					{
+ 						id=anno[4];
+ 					}
+ 					if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)"))
+ 					{
+ 						GeneMentions.put(mentions.toLowerCase(), id);
+ 						for (int s=start ;s<=last;s++)
+ 						{
+ 							GeneMentionLocation.put(j+"\t"+s,"");
+ 						}
+ 					}
+ 					else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)"))
+ 					{
+ 						GeneMentions.put(mentions.toLowerCase(), id);
+ 						for (int s=start ;s<=last;s++)
+ 						{
+ 							GeneMentionLocation.put(j+"\t"+s,"");
+ 						}
+ 					}
+ 				}
+ 			}
+ 			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
+ 			{
+ 				if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
+ 				{
+ 					String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
+ 					String PassageContexts_tmp = PassageContexts.toLowerCase();
+ 					for(String gm : GeneMentions.keySet())
+ 					{
+ 						String id = GeneMentions.get(gm);
+ 						if(gm.length()>=3)
+ 						{
+ 							gm = gm.replaceAll("[ ]*[\\|]*$", "");
+ 							gm = gm.replaceAll("^[\\|]*[ ]*", "");
+ 							gm = gm.replaceAll("[\\|][\\|]+", "\\|");
+ 							if(!gm.matches("[\\W\\-\\_]*"))
+ 							{
+ 								gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
+ 								Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
+ 								Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
+ 								while(mtmp.find())
+ 								{
+ 									String pre = mtmp.group(1);
+ 									String gmtmp = mtmp.group(2);
+ 									String post = mtmp.group(3);
+
+ 									int start = pre.length()-1;
+ 									int last = start+gmtmp.length();
+ 									if(PassageContexts.length()>=last+1)
+ 									{
+ 										String mention = PassageContexts.substring(start+1,last+1);
+ 										if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last))
+ 										{
+ 											GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id);
+ 										}
+ 									}
+ 									gmtmp = gmtmp.replaceAll(".", "\\@");
+ 									PassageContexts_tmp=pre+""+gmtmp+""+post;
+ 									mtmp = ptmp.matcher(PassageContexts_tmp);
+ 								}
+ 							}
+ 						}
+ 					}
+ 				}
+ 			}
+
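+ 			// Family/domain mentions keep only candidate ids that already occur among
+ 			// this document's normalized genes; matched FamilyName annotations are
+ 			// retyped as Gene, unmatched ones are removed.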
+ 			//Apply to FamilyNames
+ 			HashMap<String,String> geneids = new HashMap<String,String>(); // Extending Gene mentions
+ 			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
+ 			{
+ 				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String type=anno[3];
+ 					if(type.equals("Gene"))
+ 					{
+ 						String id="Tax:9606";
+ 						if(anno.length>=5)
+ 						{
+ 							id=anno[4];
+ 						}
+ 						Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
+ 						Matcher mtmp0 = ptmp0.matcher(id);
+ 						Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
+ 						Matcher mtmp1 = ptmp1.matcher(id);
+ 						if(mtmp0.find())
+ 						{
+ 							geneids.put(mtmp0.group(3), "");
+ 						}
+ 						if(mtmp1.find())
+ 						{
+ 							geneids.put(mtmp1.group(3), "");
+ 						}
+ 					}
+ 				}
+ 			}
+ 			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
+ 			{
+ 				for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String mention=anno[2];
+ 					String type=anno[3];
+ 					if(type.matches("(FamilyName|DomainMotif)"))
+ 					{
+ 						String id="Tax:9606";
+ 						if(anno.length>=5)
+ 						{
+ 							id=anno[4];
+ 						}
+ 						String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
+ 						String IDstr[]=IDstrs.split("\\|");
+ 						String ids="";
+ 						for(int id_i=0;id_i<IDstr.length;id_i++)
+ 						{
+ 							if(geneids.containsKey(IDstr[id_i]))
+ 							{
+ 								if(ids.equals(""))
+ 								{
+ 									ids=IDstr[id_i];
+ 								}
+ 								else
+ 								{
+ 									ids=ids+";"+IDstr[id_i];
+ 								}
+ 							}
+ 						}
+ 						if(!ids.equals(""))
+ 						{
+ 							if(type.equals("FamilyName")){type="Gene";}
+ 							String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606";
+ 							if(anno.length>=5)
+ 							{
+ 								Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4];
+ 							}
+ 							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids);
+ 						}
+ 						else
+ 						{
+ 							GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
+ 						}
+ 					}
+ 				}
+ 			}
+ 			// Strip "*" and "(anti)" markers from species-type ids (Species/Genus/Strain/CellLine/Cell).
+ 			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
+ 			{
+ 				for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					String type=anno[3];
+ 					if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell"))
+ 					{
+ 						String id=anno[4];
+ 						id=id.replaceAll("\\*", "");
+ 						id=id.replaceAll("\\(anti\\)", "");
+ 						String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id;
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k);
+ 					}
+ 				}
+ 			}
+
+ 			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
+ 			{
+
+ 				for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
+ 				{
+ 					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
+ 					int start = Integer.parseInt(anno[0]);
+ 					int last = Integer.parseInt(anno[1]);
+ 					String mention = anno[2];
+ 					String type = anno[3];
+ 					String id = anno[4];
+ 					if(type.equals("Gene") && Species_hash.containsKey(mention))
+ 					{
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
+ 					}
+ 					else if(type.equals("Gene") && id.equals(""))
+ 					{
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
+ 					}
+ 					else
+ 					{
+ 						for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k
+ 						{
+ 							if(k1 != k)
+ 							{
+ 								String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t");
+ 								int start1 = Integer.parseInt(anno1[0]);
+ 								int last1 = Integer.parseInt(anno1[1]);
+ 								if((start1<start && last1>=last) || (start1<=start && last1>last))
+ 								{
+ 									GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
+ 									break;
+ 								}
+ 							}
+ 						}
+ 					}
+ 				}
+ 			}
+ 		}
+ 		if(GeneIDMatch == true)
+ 		{
+ 			//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
+ 		}
+ 		else
+ 		{
+ 			GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
+ 		}
+ 	}
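+ 	// Enumerates alphanumeric tokens that look like raw database gene ids,
+ 	// expanding numeric ranges such as "abc2-9" (span of at most 20) into one
+ 	// candidate per value while keeping leading zeros; only expanded candidates
+ 	// of five or more characters are emitted, plus the surface mention itself.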
+ 	/*
+ 	 * Search Potential GeneID in the Prefix Tree
+ 	 */
+ 	public ArrayList<String> SearchGeneIDLocation(String Doc)
+ 	{
+ 		ArrayList<String> location = new ArrayList<String>();
+
+ 		String Doc_tmp=" "+Doc+" ";
+ 		Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
+ 		Matcher mtmp = ptmp.matcher(Doc_tmp);
+ 		while(mtmp.find())
+ 		{
+ 			String str1=mtmp.group(1);
+ 			String str2=mtmp.group(2);
+ 			String str3=mtmp.group(3);
+ 			for(int m=str1.length();m<=(str1.length()+str2.length());m++)
+ 			{
+ 				int start = str1.length()-1;
+ 				int last = start+str2.length();
+ 				String mention = Doc.substring(start, last);
+ 				if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*"))
+ 				{
+ 					if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) )
+ 					{
+ 						Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
+ 						Matcher mtmp1 = ptmp1.matcher(mention);
+ 						Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");
+ 						Matcher mtmp2 = ptmp2.matcher(mention);
+ 						if(mtmp1.find())
+ 						{
+ 							String S1 = mtmp1.group(1);
+ 							if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6)
+ 							{
+ 								int Num1 = Integer.parseInt(mtmp1.group(2));
+ 								int Num2 = Integer.parseInt(mtmp1.group(3));
+ 								String prefix = "";
+ 								Pattern ptmp3 = Pattern.compile("^([0]+)");
+ 								Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2));
+ 								if(mtmp3.find())
+ 								{
+ 									prefix = mtmp3.group(1);
+ 								}
+ 								if(Num2-Num1>0 && (Num2-Num1<=20))
+ 								{
+ 									for(int n=Num1;n<=Num2;n++)
+ 									{
+ 										String StrNum=S1+prefix+n;
+ 										if(StrNum.length()>=5)
+ 										{
+ 											location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
+ 										}
+ 									}
+ 								}
+ 							}
+ 						}
+ 						else if(mtmp2.find())
+ 						{
+ 							if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6)
+ 							{
+ 								String S1 = mtmp2.group(1);
+ 								int Num1 = Integer.parseInt(mtmp2.group(2));
+ 								String S2 = mtmp2.group(3);
+ 								int Num2 = Integer.parseInt(mtmp2.group(4));
+ 								if(S1.equals(S2))
+ 								{
+ 									String prefix = "";
+ 									Pattern ptmp3 = Pattern.compile("^([0]+)");
+ 									Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2));
+ 									if(mtmp3.find())
+ 									{
+ 										prefix = mtmp3.group(1);
+ 									}
+ 									if(Num2-Num1>0 && (Num2-Num1<=20))
+ 									{
+ 										for(int n=Num1;n<=Num2;n++)
+ 										{
+ 											String StrNum=S1+prefix+n;
+ 											if(StrNum.length()>=5)
+ 											{
+ 												location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID");
+ 											}
+ 										}
+ 									}
+ 								}
+ 							}
+ 						}
+ 					}
+ 					location.add(start+"\t"+last+"\t"+mention+"\tGeneID");
+ 				}
+ 			}
+ 			String men="";
+ 			for(int m=0;m<str2.length();m++){men=men+"@";}
+ 			Doc_tmp=str1+men+str3;
+ 			mtmp = ptmp.matcher(Doc_tmp);
+ 		}
+ 		return location;
+ 	}
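+ 	// Looks up each candidate token from SearchGeneIDLocation in GeneIDs_hash
+ 	// (after lower-casing and stripping non-alphanumerics) and, on a hit, adds a
+ 	// "GeneID:<id>" annotation to the passage.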
+ 	public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
+ 	{
+ 		for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
+ 		{
+ 			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
+ 			{
+ 				String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
+ 				/** GeneID recognition by pattern match */
+ 				ArrayList<String> locations = SearchGeneIDLocation(PassageContext);
+ 				for (int k = 0 ; k < locations.size() ; k++)
+ 				{
+ 					String anno[]=locations.get(k).split("\t");
+ 					String mention = anno[2].toLowerCase();
+ 					mention = mention.replaceAll("[\\W\\-\\_]+", "");
+ 					if(GNormPlus.GeneIDs_hash.containsKey(mention))
+ 					{
+ 						GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); //paragraph
+ 					}
+ 				}
+ 			}
+ 		}
+ 		GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
+ 	}
  }
src_Java/GNormPluslib/GNR.java CHANGED
The diff for this file is too large to render. See raw diff
 
src_Java/GNormPluslib/GNormPlus.java CHANGED
@@ -1,696 +1,696 @@
+ package GNormPluslib;
+
+ import java.io.BufferedReader;
+ import java.io.BufferedWriter;
+ import java.io.File;
+ import java.io.FileOutputStream;
+ import java.io.FileReader;
+ import java.io.IOException;
+ import java.io.OutputStreamWriter;
+ import java.sql.SQLException;
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.regex.Matcher;
+ import java.util.regex.Pattern;
+
+ import javax.xml.stream.XMLStreamException;
+
+ import GNormPluslib.PrefixTree;
+ import GNormPluslib.GNR;
+ import GNormPluslib.SR;
+
+ public class GNormPlus
+ {
+ public static BioCDoc BioCDocobj = new BioCDoc();
+ public static PrefixTree PT_Species = new PrefixTree();
+ public static PrefixTree PT_Cell = new PrefixTree();
+ public static PrefixTree PT_CTDGene = new PrefixTree();
+ public static PrefixTree PT_Gene = new PrefixTree();
+ public static PrefixTree PT_GeneChromosome = new PrefixTree();
+ public static PrefixTree PT_FamilyName = new PrefixTree();
+ public static HashMap<String, String> ent_hash = new HashMap<String, String>();
+ public static HashMap<String, String> GenusID_hash = new HashMap<String, String>();
+ public static HashMap<String, String> PrefixID_hash = new HashMap<String, String>();
+ public static HashMap<String, Double> TaxFreq_hash = new HashMap<String, Double>();
+ public static HashMap<String, String> GeneScoring_hash = new HashMap<String, String>();
+ public static HashMap<String, Double> GeneScoringDF_hash = new HashMap<String, Double>();
+ public static HashMap<String, String> GeneIDs_hash = new HashMap<String, String>();
+ public static HashMap<String, String> Normalization2Protein_hash = new HashMap<String, String>();
+ public static HashMap<String, String> HomologeneID_hash = new HashMap<String, String>();
+ public static HashMap<String,String> SuffixTranslationMap_hash = new HashMap<String,String>();
+ public static HashMap<String,String> SuffixTranslationMap2_hash = new HashMap<String,String>();
+ public static HashMap<String, String> Pmid2Abb_hash = new HashMap<String, String>();
+ public static HashMap<String, String> PmidAbb2LF_lc_hash = new HashMap<String, String>();
+ public static HashMap<String, String> PmidLF2Abb_lc_hash = new HashMap<String, String>();
+ public static HashMap<String, String> PmidAbb2LF_hash = new HashMap<String, String>();
+ public static HashMap<String, String> PmidLF2Abb_hash = new HashMap<String, String>();
+ public static HashMap<String, String> Pmid2ChromosomeGene_hash = new HashMap<String, String>();
+ public static HashMap<String, String> SimConceptMention2Type_hash = new HashMap<String, String>();
+ public static HashMap<String, String> Filtering_hash = new HashMap<String, String>();
+ public static HashMap<String, String> Filtering_WithLongForm_hash = new HashMap<String, String>();
+ public static HashMap<String, String> SP_Virus2Human_hash = new HashMap<String, String>();
+ public static HashMap<String, String> GeneWithoutSPPrefix_hash = new HashMap<String, String>();
+ public static ArrayList <String> taxid4gene = new ArrayList <String>();
+ public static HashMap<String, String> setup_hash = new HashMap<String, String>();
+ public static HashMap<String, String> suffixprefix_orig2modified = new HashMap<String, String>();
+ public static HashMap<String, String> Abb2Longformtok_hash = new HashMap<String, String>();
+ public static HashMap<String, String> StrainID_ancestor2tax_hash = new HashMap<String, String>();
+ public static HashMap<String, String> StrainID_taxid2names_hash = new HashMap<String, String>();
+
+ public static String SetupFile = "setup.txt";
+ public static void main(String [] args) throws IOException, InterruptedException, XMLStreamException, SQLException
+ {
+ String InputFolder="input";
+ String OutputFolder="output";
+ String tmpFolder="tmp";
+ String FocusSpecies = "";
+ if(args.length<2)
+ {
+ System.out.println("\n$ java -Xmx30G -Xms10G -jar GNormPlus.jar [InputFolder] [OutputFolder] [SetupFile]");
+ System.out.println("[InputFolder] Default : input");
+ System.out.println("[OutputFolder] Default : output");
+ System.out.println("[SetupFile] Default : setup.txt\n\n");
+ }
+ else
+ {
+ /*
+ * Parameters
+ */
+ InputFolder=args[0];
+ OutputFolder=args[1];
+ if(args.length>=3)
+ {
+ SetupFile = args[2];
+ }
+ if(args.length>=4)
+ {
+ FocusSpecies=args[3];
+ }
+ }
+
+ BufferedReader br = new BufferedReader(new FileReader(SetupFile));
+ String line="";
+ Pattern ptmp = Pattern.compile("^ ([A-Za-z0-9]+) = ([^ \\t\\n\\r]+)$");
+ while ((line = br.readLine()) != null)
+ {
+ Matcher mtmp = ptmp.matcher(line);
+ if(mtmp.find())
+ {
+ setup_hash.put(mtmp.group(1), mtmp.group(2));
+ }
+ }
+ br.close();
+ if(!setup_hash.containsKey("GeneIDMatch"))
+ {
+ setup_hash.put("GeneIDMatch","True");
+ }
+ if(!setup_hash.containsKey("HomologeneID"))
+ {
+ setup_hash.put("HomologeneID","False");
+ }
+ if(!FocusSpecies.equals(""))
+ {
+ setup_hash.put("FocusSpecies",FocusSpecies);
+ }
+ if(!setup_hash.containsKey("ShowUnNormalizedMention"))
+ {
+ setup_hash.put("ShowUnNormalizedMention","False");
+ }
+ if(setup_hash.containsKey("tmpFolder"))
+ {
+ tmpFolder=setup_hash.get("tmpFolder");
+ }
+
+ /*
+ * Time stamp - start : All
+ */
+ double startTime,endTime,totTime;
+ startTime = System.currentTimeMillis();//start time
+
+ int NumFiles=0;
+ File folder = new File(InputFolder);
+ File[] listOfFiles = folder.listFiles();
+ for (int i = 0; i < listOfFiles.length; i++)
+ {
+ if (listOfFiles[i].isFile())
+ {
+ String InputFile = listOfFiles[i].getName();
+ File f = new File(OutputFolder+"/"+InputFile);
+ if(f.exists() && !f.isDirectory())
+ {
+ }
+ else
+ {
+ NumFiles++;
+ }
+ }
+ }
+
+ System.out.println("Total "+NumFiles+" file(s) wait(s) for process.");
+
+ if(NumFiles>0)
+ {
+ /*
+ * Start & Load Dictionary
+ */
+ String TrainTest = "Test";
+ if(setup_hash.containsKey("TrainTest"))
+ {
+ TrainTest = setup_hash.get("TrainTest");
+ }
+
+
+ /** Load Dictionary */
+ if(setup_hash.containsKey("GeneRecognition") && setup_hash.get("GeneRecognition").toLowerCase().equals("true"))
+ {
+ System.out.print("Loading Gene NER Dictionary : Processing ... \r");
+ /** CTDGene */
+ if(setup_hash.containsKey("IgnoreNER") && setup_hash.get("IgnoreNER").toLowerCase().equals("true")){} // not NER (entities are pre-annotated)
+ else if(setup_hash.containsKey("SpeciesAssignmentOnly") && setup_hash.get("SpeciesAssignmentOnly").toLowerCase().equals("true")) {} // species assignment
+ else
+ {
+ PT_CTDGene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_CTDGene.txt");
+ }
+ /** ent */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/ent.rev.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t"); //&#x00391; Alpha
+ ent_hash.put(l[0], l[1]);
+ }
+ br.close();
+
+ /** FamilyName */
+ if((!setup_hash.containsKey("IgnoreNER")) || setup_hash.get("IgnoreNER").toLowerCase() != "true")
+ {
+ PT_FamilyName.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_FamilyName.txt");
+ }
+
+ /** GeneChromosome */
+ //PT_GeneChromosome.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_GeneChromosome.txt");
+ System.out.println("Loading Gene NER Dictionary : Processing ... done.");
+ }
+
+ if(setup_hash.containsKey("SpeciesRecognition") && setup_hash.get("SpeciesRecognition").toLowerCase().equals("true"))
+ {
+ System.out.print("Loading Species NER Dictionary : Processing ... \r");
+ /** Species */
+ PT_Species.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Species.txt");
+
+ /** Cell */
+ PT_Cell.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Cell.txt");
+
+ /** Genus */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SPGenus.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ GenusID_hash.put(l[0], l[1]); // tax id -> Genus
+ }
+ br.close();
+
+ /** taxid4gene */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/tax4gene.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ taxid4gene.add(line); // tax id -> Genus
+ }
+ br.close();
+ System.out.println("Loading Species NER Dictionary : Processing ... done.");
+
+ }
+
+ if(setup_hash.containsKey("SpeciesAssignment") && setup_hash.get("SpeciesAssignment").toLowerCase().equals("true"))
+ {
+ System.out.print("Loading Species Assignment Dictionary : Processing ... \r");
+ /** GeneWithoutSPPrefix */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/GeneWithoutSPPrefix.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ GeneWithoutSPPrefix_hash.put(line, "");
+ }
+ br.close();
+
+ /** Prefix */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SPPrefix.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ PrefixID_hash.put(l[0], l[1]); //tax id -> prefix
+ }
+ br.close();
+ PrefixID_hash.put("9606", "h");
+ PrefixID_hash.put("10090", "m");
+ PrefixID_hash.put("10116", "r");
+ PrefixID_hash.put("4932", "y");
+ PrefixID_hash.put("7227", "d");
+ PrefixID_hash.put("7955", "z|dr|Dr|Zf|zf");
+ PrefixID_hash.put("3702", "at|At");
+
+ /** Frequency */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/taxonomy_freq.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ TaxFreq_hash.put(l[0], Double.parseDouble(l[1])/200000000); //tax id -> prefix
+ }
+ br.close();
+
+ /** SP_Virus2Human_hash */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SP_Virus2HumanList.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ SP_Virus2Human_hash.put(line,"9606");
+ }
+ br.close();
+
+ /** SPStrain */
+ /*
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SPStrain.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ String ancestor_id = l[0];
+ String tax_id = l[1];
+ String tax_names = l[2];
+ StrainID_ancestor2tax_hash.put(ancestor_id, tax_id); // ancestor -> tax_id
+ StrainID_taxid2names_hash.put(tax_id, tax_names); // tax id -> strain
+ }
+ br.close();
+ */
+ System.out.println("Loading Species Assignment Dictionary : Processing ... done.");
+
+ }
+
+ if(setup_hash.containsKey("GeneNormalization") && setup_hash.get("GeneNormalization").toLowerCase().equals("true"))
+ {
+ System.out.print("Loading Gene normalization Dictionary : Processing ... \r");
+ /** gene_prefix & gene_suffix */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/PrefixSuffix.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ String org=l[0];
+ String mod=l[1];
+ suffixprefix_orig2modified.put(org,mod);
+ }
+ br.close();
+
+ /** gene_prefix & gene_suffix */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/NonGeneAbbr.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ String shortform=l[0];
+ String longform_toks=l[1];
+ Abb2Longformtok_hash.put(shortform,longform_toks);
+ }
+ br.close();
+
+ /** SimConcept.MentionType */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/SimConcept.MentionType.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ SimConceptMention2Type_hash.put(l[0], l[1]);
+ }
+ br.close();
+
+ /** Filtering */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Filtering.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ Filtering_hash.put(line, "");
+ }
+ br.close();
+
+ /** Filtering_WithLongForm.txt */
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Filtering_WithLongForm.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ Filtering_WithLongForm_hash.put(l[0], l[1]);
+ }
+ br.close();
+
+ /** Gene Dictionary */
+ if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
+ {
+ PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Gene."+setup_hash.get("FocusSpecies")+".txt");
+ }
+ else if((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
+ {
+ PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Gene."+FocusSpecies+".txt");
+ }
+ else
+ {
+ PT_Gene.TreeFile2Tree(setup_hash.get("DictionaryFolder")+"/PT_Gene.txt");
+ }
+
+ /** GeneScoring */
+ String FileName=setup_hash.get("DictionaryFolder")+"/GeneScoring.txt";
+
+ if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
+ {
+ FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring."+setup_hash.get("FocusSpecies")+".txt";
+ }
+ else if((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
+ {
+ FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring."+FocusSpecies+".txt";
+ }
+ br = new BufferedReader(new FileReader(FileName));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ GeneScoring_hash.put(l[0], l[1]+"\t"+l[2]+"\t"+l[3]+"\t"+l[4]+"\t"+l[5]+"\t"+l[6]);
+ }
+ br.close();
+
+ /** GeneScoring.DF */
+ FileName=setup_hash.get("DictionaryFolder")+"/GeneScoring.DF.txt";
+ if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All"))
+ {
+ FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring.DF."+setup_hash.get("FocusSpecies")+".txt";
+ }
+ else if((!FocusSpecies.equals("")) && (!FocusSpecies.equals("All")))
+ {
+ FileName = setup_hash.get("DictionaryFolder")+"/GeneScoring.DF."+FocusSpecies+".txt";
+ }
+ br = new BufferedReader(new FileReader(FileName));
+ double Sum = Double.parseDouble(br.readLine());
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ // token -> idf
+ GeneScoringDF_hash.put(l[0], Math.log10(Sum/Double.parseDouble(l[1])));
+ }
+ br.close();
+
+ /** Suffix Translation */
+ SuffixTranslationMap_hash.put("alpha","a");
+ SuffixTranslationMap_hash.put("a","alpha");
+ SuffixTranslationMap_hash.put("beta","b");
+ SuffixTranslationMap_hash.put("b","beta");
+ SuffixTranslationMap_hash.put("delta","d");
+ SuffixTranslationMap_hash.put("d","delta");
+ SuffixTranslationMap_hash.put("z","zeta");
+ SuffixTranslationMap_hash.put("zeta","z");
+ SuffixTranslationMap_hash.put("gamma","g");
+ SuffixTranslationMap_hash.put("g","gamma");
+ SuffixTranslationMap_hash.put("r","gamma");
+ SuffixTranslationMap_hash.put("y","gamma");
+
+ SuffixTranslationMap2_hash.put("2","ii");
+ SuffixTranslationMap2_hash.put("ii","2");
+ SuffixTranslationMap2_hash.put("II","2");
+ SuffixTranslationMap2_hash.put("1","i");
+ SuffixTranslationMap2_hash.put("i","1");
+ SuffixTranslationMap2_hash.put("I","1");
+
+ /** GeneID */
+ if(setup_hash.containsKey("GeneIDMatch") && setup_hash.get("GeneIDMatch").toLowerCase().equals("true"))
+ {
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/GeneIDs.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ GeneIDs_hash.put(l[0],l[1]);
+ }
+ br.close();
+ }
+
+ /** Normalization2Protein */
+ if(setup_hash.containsKey("Normalization2Protein") && setup_hash.get("Normalization2Protein").toLowerCase().equals("true"))
+ {
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Gene2Protein.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ Normalization2Protein_hash.put(l[0],l[1]);
+ }
+ br.close();
+ }
+
+ /** HomologeneID */
+ if(setup_hash.containsKey("HomologeneID") && setup_hash.get("HomologeneID").toLowerCase().equals("true"))
+ {
+ br = new BufferedReader(new FileReader(setup_hash.get("DictionaryFolder")+"/Gene2Homoid.txt"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ String l[]=line.split("\t");
+ HomologeneID_hash.put(l[0],l[1]);
+ }
+ br.close();
+ }
+ System.out.println("Loading Gene normalization Dictionary : Processing ... done.");
+ }
+
+ endTime = System.currentTimeMillis();
+ totTime = endTime - startTime;
+ System.out.println("Loading Dictionary : Processing Time:"+totTime/1000+"sec");
+
+ folder = new File(InputFolder);
+ listOfFiles = folder.listFiles();
+ for (int i = 0; i < listOfFiles.length; i++)
+ {
+ if (listOfFiles[i].isFile())
+ {
+ String InputFile = listOfFiles[i].getName();
+ File f = new File(OutputFolder+"/"+InputFile);
+ if(f.exists() && !f.isDirectory())
+ {
+ System.out.println(InputFolder+"/"+InputFile+" - Done. (The output file exists in output folder)");
+ }
+ else
+ {
+ String path=tmpFolder;
+ File file = new File(path);
+ File[] files = file.listFiles();
+ for (File ftmp:files)
+ {
+ if (ftmp.isFile() && ftmp.exists())
+ {
+ if(ftmp.toString().matches(tmpFolder+"/"+InputFile+".*"))
+ {
+ ftmp.delete();
+ }
+ }
+ }
+
+ BioCDocobj = new BioCDoc();
+
+ /*
+ * Format Check
+ */
+ String Format = "";
+ String checkR = BioCDocobj.BioCFormatCheck(InputFolder+"/"+InputFile);
+ if(checkR.equals("BioC"))
+ {
+ Format = "BioC";
+ }
+ else if(checkR.equals("PubTator"))
+ {
+ Format = "PubTator";
+ }
+ else
+ {
+ System.out.println(checkR);
+ System.exit(0);
+ }
+
+ System.out.print(InputFolder+"/"+InputFile+" - ("+Format+" format) : Processing ... \r");
+
+ /** PubTator2BioC*/
+ if(Format.equals("PubTator"))
+ {
+ BioCDocobj.PubTator2BioC(InputFolder+"/"+InputFile,tmpFolder+"/"+InputFile);
+ }
+ else
+ {
+ br = new BufferedReader(new FileReader(InputFolder+"/"+InputFile));
+ BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpFolder+"/"+InputFile), "UTF-8"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ fr.write(line);
+ }
+ br.close();
+ fr.close();
+ }
+
+ /** load file */
+ GNR GNRobj = new GNR();
+ GNRobj.LoadInputFile(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".Abb",TrainTest);
+ SR SRobj = new SR();
+ SimConcept SCobj = new SimConcept();
+ GN GNobj = new GN();
+ String FinalStep="";
+
+ /** SpeciesRecognition */
+ if(setup_hash.containsKey("SpeciesRecognition") && setup_hash.get("SpeciesRecognition").toLowerCase().equals("true") ) // pre-annotated name entities
+ {
+ SRobj.SpeciesRecognition(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SR.xml",setup_hash.get("DictionaryFolder")+"/SPStrain.txt",setup_hash.get("FilterAntibody"));
+ FinalStep="SpeciesRecognition";
+ }
+
+ /** GeneRecognition */
+ if( setup_hash.containsKey("GeneRecognition") && setup_hash.get("GeneRecognition").toLowerCase().equals("true") )
+ {
+ GNRobj.FeatureExtraction(tmpFolder+"/"+InputFile+".data",tmpFolder+"/"+InputFile+".loca",TrainTest);
+ GNRobj.CRF_test(setup_hash.get("GNRModel"),tmpFolder+"/"+InputFile+".data",tmpFolder+"/"+InputFile+".output","top3"); //top3
+ GNRobj.ReadCRFresult(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".loca",tmpFolder+"/"+InputFile+".output",tmpFolder+"/"+InputFile+".GNR.xml",0.005,0.05); //0.005,0.05
+ f = new File(tmpFolder+"/"+InputFile+".SR.xml");
+ if(f.exists())
+ {
+ GNRobj.PostProcessing(tmpFolder+"/"+InputFile+".SR.xml",tmpFolder+"/"+InputFile+".GNR.xml");
+ }
+ else
+ {
+ GNRobj.PostProcessing(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GNR.xml");
+ }
+ FinalStep="GeneRecognition";
+ }
+
+ /** SpeciesAssignment */
+ if(setup_hash.containsKey("SpeciesAssignment") && setup_hash.get("SpeciesAssignment").toLowerCase().equals("true") ) // pre-annotated name entities
+ {
+ if(setup_hash.containsKey("FocusSpecies") && !setup_hash.get("FocusSpecies").equals("All")) // FocusSpecies
+ {
+ f = new File(tmpFolder+"/"+InputFile+".GNR.xml");
+ if(f.exists())
+ {
+ SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile+".GNR.xml",tmpFolder+"/"+InputFile+".SA.xml",setup_hash.get("FocusSpecies"));
+ }
+ else
+ {
+ SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SA.xml",setup_hash.get("FocusSpecies"));
+ }
+ }
+ else// All Species
+ {
+ f = new File(tmpFolder+"/"+InputFile+".GNR.xml");
+ if(f.exists())
+ {
+ SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile+".GNR.xml",tmpFolder+"/"+InputFile+".SA.xml");
+ }
+ else
+ {
+ SRobj.SpeciesAssignment(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SA.xml");
+ }
+ }
+ FinalStep="SpeciesAssignment";
+ }
+
+ /** GeneNormalization */
+ if((setup_hash.containsKey("GeneNormalization")) && setup_hash.get("GeneNormalization").toLowerCase().equals("true") )
+ {
+ /** SimConcept */
+ {
+ SCobj.FeatureExtraction_Test(tmpFolder+"/"+InputFile+".SC.data");
+ SCobj.CRF_test(setup_hash.get("SCModel"),tmpFolder+"/"+InputFile+".SC.data",tmpFolder+"/"+InputFile+".SC.output");
+ SCobj.ReadCRFresult(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".SC.output",tmpFolder+"/"+InputFile+".SC.xml");
+ }
+
+ /** GeneNormalization */
+ {
+ GNobj.PreProcessing4GN(InputFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".PreProcessing4GN.xml");
+ GNobj.ChromosomeRecognition(InputFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml");
+ if(setup_hash.containsKey("GeneIDMatch") && setup_hash.get("GeneIDMatch").equals("True"))
+ {
+
+ GNobj.GeneNormalization(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml",true);
+ GNobj.GeneIDRecognition(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml");
+ }
+ else
+ {
+ GNobj.GeneNormalization(tmpFolder+"/"+InputFile,tmpFolder+"/"+InputFile+".GN.xml",false);
+ }
+ }
+ FinalStep="GeneNormalization";
+ }
+
+ /** BioC2PubTator*/
+ String final_output="";
+ if(FinalStep.equals("GeneNormalization"))
+ {
+ final_output=tmpFolder+"/"+InputFile+".GN.xml";
+ }
+ else if(FinalStep.equals("SpeciesAssignment"))
+ {
+ final_output=tmpFolder+"/"+InputFile+".SA.xml";
+ }
+ else if(FinalStep.equals("SpeciesRecognition"))
+ {
+ final_output=tmpFolder+"/"+InputFile+".SR.xml";
+ }
+ else if(FinalStep.equals("GeneRecognition"))
+ {
+ final_output=tmpFolder+"/"+InputFile+".GNR.xml";
+ }
+
+ if(Format.equals("PubTator"))
+ {
+ BioCDocobj.BioC2PubTator(final_output,OutputFolder+"/"+InputFile);
+ }
+ else
+ {
+ br = new BufferedReader(new FileReader(final_output));
+ BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(OutputFolder+"/"+InputFile), "UTF-8"));
+ line="";
+ while ((line = br.readLine()) != null)
+ {
+ fr.write(line);
+ }
+ br.close();
+ fr.close();
+ }
+
+ /*
+ * remove tmp files
+ */
+ if((!setup_hash.containsKey("DeleteTmp")) || setup_hash.get("DeleteTmp").toLowerCase().equals("true"))
+ {
+ path="tmp";
+ file = new File(path);
+ files = file.listFiles();
+ for (File ftmp:files)
+ {
+ if (ftmp.isFile() && ftmp.exists())
+ {
+ if(ftmp.toString().matches(tmpFolder+"/"+InputFile+".*"))
+ {
+ ftmp.delete();
+ }
+ }
+ }
+ }
+
+ /*
+ * Time stamp - last
+ */
+ endTime = System.currentTimeMillis();
+ totTime = endTime - startTime;
+ System.out.println(InputFolder+"/"+InputFile+" - ("+Format+" format) : Processing Time:"+totTime/1000+"sec");
+ }
+ }
+ }
+ }
+ }
+ }
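
main() above reads its configuration from setup.txt (or the third command-line argument) with the pattern "^ ([A-Za-z0-9]+) = ([^ \t\n\r]+)$": each recognized line is one leading space, an alphanumeric key, " = ", and a whitespace-free value; anything else is silently ignored. A short sketch of that acceptance rule, using keys that appear in the code above (the folder values are illustrative only):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class SetupLineDemo
    {
        public static void main(String[] args)
        {
            // Same pattern as GNormPlus.main() applies to each line of setup.txt.
            Pattern p = Pattern.compile("^ ([A-Za-z0-9]+) = ([^ \\t\\n\\r]+)$");
            String[] lines = {
                " DictionaryFolder = Dictionary", // accepted: key/value pair
                " GeneNormalization = True",      // accepted: key/value pair
                "GeneNormalization = True",       // ignored: missing the leading space
                " tmpFolder = my_tmp"             // accepted: value may be any whitespace-free token
            };
            for (String line : lines)
            {
                Matcher m = p.matcher(line);
                System.out.println(m.find() ? m.group(1) + " -> " + m.group(2) : "ignored: " + line);
            }
        }
    }

Keys left unset fall back to the defaults hard-coded above (GeneIDMatch=True, HomologeneID=False, ShowUnNormalizedMention=False, tmpFolder=tmp).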
src_Java/GNormPluslib/PrefixTree.java CHANGED
@@ -1,893 +1,893 @@
1
- /**
2
- * Project: GNormPlus
3
- * Function: Dictionary lookup by Prefix Tree
4
- */
5
-
6
- package GNormPluslib;
7
-
8
- import java.io.*;
9
- import java.util.*;
10
- import java.util.regex.Matcher;
11
- import java.util.regex.Pattern;
12
-
13
- public class PrefixTree
14
- {
15
- private Tree Tr=new Tree();
16
-
17
- /*
18
- * Read Dictionary and insert Mention into the Prefix Tree
19
- */
20
- public static HashMap<String, String> StopWord_hash = new HashMap<String, String>();
21
-
22
- public void Hash2Tree(HashMap<String, String> ID2Names)
23
- {
24
- for(String ID : ID2Names.keySet())
25
- {
26
- String NameColumn[]=ID2Names.get(ID).split("\\|");
27
- for(int i=0;i<NameColumn.length;i++)
28
- {
29
- Tr.insertMention(NameColumn[i],ID);
30
- }
31
- }
32
- }
33
- public void Dictionary2Tree_Combine(String Filename,String StopWords,String MentionType)
34
- {
35
- try
36
- {
37
- //System.out.println("Dictionary2Tree_Combine : " + Filename);
38
-
39
- /** Stop Word */
40
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
41
- String line="";
42
- while ((line = br.readLine()) != null)
43
- {
44
- StopWord_hash.put(line, "StopWord");
45
- }
46
- br.close();
47
-
48
- BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
49
- line="";
50
- //int count=0;
51
- while ((line = inputfile.readLine()) != null)
52
- {
53
- //count++;
54
- //if(count%10000==0){ System.out.println(count); }
55
- String Column[]=line.split("\t");
56
- if(Column.length>1)
57
- {
58
- Column[0]=Column[0].replace("species:ncbi:","");
59
- Column[1]=Column[1].replaceAll(" strain=", " ");
60
- Column[1]=Column[1].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
61
- Column[1]=Column[1].replaceAll("[\\(\\)]", " ");
62
- String SpNameColumn[]=Column[1].split("\\|");
63
- for(int i=0;i<SpNameColumn.length;i++)
64
- {
65
- String tmp = SpNameColumn[i];
66
- tmp=tmp.replaceAll("[\\W\\-\\_]", "");
67
-
68
- /*
69
- * Criteria for Species
70
- */
71
- if( MentionType.equals("Species") &&
72
- (!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
73
- (!SpNameColumn[i].matches("a[\\W\\-\\_].*")) &&
74
- tmp.length()>=3
75
- )
76
- {
77
- boolean stopword_boolean=false;
78
- for(String stopword_RegEx : StopWord_hash.keySet())
79
- {
80
- Pattern ptmp = Pattern.compile("^"+stopword_RegEx+"$");
81
- Matcher mtmp = ptmp.matcher(SpNameColumn[i].toLowerCase());
82
- if(mtmp.find())
83
- {
84
- stopword_boolean=true;
85
- }
86
- }
87
- if(stopword_boolean == false)
88
- {
89
- Tr.insertMention(SpNameColumn[i],Column[0]);
90
- }
91
- }
92
- /*
93
- * Criteria for Gene
94
- */
95
- else if (MentionType.equals("Gene") &&
96
- (!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
97
- tmp.length()>=3
98
- )
99
- {
100
- if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
101
- {
102
- Tr.insertMention(SpNameColumn[i],Column[0]);
103
- }
104
- }
105
- /*
106
- * Criteria for Cell
107
- */
108
- else if (MentionType.equals("Cell") &&
109
- (!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
110
- tmp.length()>=3
111
- )
112
- {
113
- if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
114
- {
115
- Tr.insertMention(SpNameColumn[i],Column[0]);
116
- }
117
- }
118
- /*
119
- * others
120
- */
121
- else if ((!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
122
- tmp.length()>=3
123
- )
124
- {
125
- if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
126
- {
127
- Tr.insertMention(SpNameColumn[i],Column[0]);
128
- }
129
- }
130
- }
131
- }
132
- }
133
- inputfile.close();
134
- }
135
- catch(IOException e1){ System.out.println("[Dictionary2Tree_Combine]: Input file is not exist.");}
136
- }
137
- public void Dictionary2Tree_UniqueGene(String Filename,String StopWords,String Preifx)
138
- {
139
- try
140
- {
141
- //System.out.println("Dictionary2Tree_UniqueGene : " + Filename);
142
-
143
- /** Stop Word */
144
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
145
- String line="";
146
- while ((line = br.readLine()) != null)
147
- {
148
- StopWord_hash.put(line, "StopWord");
149
- }
150
- br.close();
151
-
152
- BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
153
- line="";
154
- //int count=0;
155
- while ((line = inputfile.readLine()) != null)
156
- {
157
- //count++;
158
- //if(count%10000==0){ System.out.println(count); }
159
- String Column[]=line.split("\t");
160
- if(Column.length>1)
161
- {
162
- if(!StopWord_hash.containsKey(Column[0].toLowerCase()))
163
- {
164
- if(Preifx.equals(""))
165
- {
166
- Tr.insertMention(Column[0],Column[1]);
167
- }
168
- else if(Preifx.equals("Num") && Column[0].matches("[0-9].*"))
169
- {
170
- Tr.insertMention(Column[0],Column[1]);
171
- }
172
- else if(Preifx.equals("AZNum") && Column[0].matches("[a-z][0-9].*"))
173
- {
174
- Tr.insertMention(Column[0],Column[1]);
175
- }
176
- else if(Preifx.equals("lo") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
177
- {
178
- if( ! Column[0].matches("loc[0-9]+"))
179
- {
180
- Tr.insertMention(Column[0],Column[1]);
181
- }
182
- }
183
- else if(Preifx.equals("un") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
184
- {
185
- if(Column[0].length()>=6 && Column[0].substring(0,6).equals("unchar"))
186
- {
187
- // remove uncharacterized
188
- }
189
- else
190
- {
191
- Tr.insertMention(Column[0],Column[1]);
192
- }
193
- }
194
- else if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
195
- {
196
- Tr.insertMention(Column[0],Column[1]);
197
- }
198
- }
199
- }
200
- }
201
- inputfile.close();
202
- }
203
- catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");}
204
- }
205
-	public void Dictionary2Tree_UniqueSpecies(String Filename,String StopWords,String Preifx)
-	{
-		try
-		{
-			//System.out.println("Dictionary2Tree_UniqueSpecies : " + Filename);
-
-			/** Stop Word */
-			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
-			String line="";
-			while ((line = br.readLine()) != null)
-			{
-				StopWord_hash.put(line, "StopWord");
-			}
-			br.close();
-
-			BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
-			line="";
-			while ((line = inputfile.readLine()) != null)
-			{
-				//count++;
-				//if(count%10000==0){ System.out.println(count); }
-				String Column[]=line.split("\t");
-				if(Column.length>1)
-				{
-					if(!StopWord_hash.containsKey(Column[0].toLowerCase()))
-					{
-						if(Preifx.equals("")) //all
-						{
-							if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
-							{
-								String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
-								String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
-								if(mention_tmp.length()>=10)
-								{
-									Tr.insertMention(mention_rev,Column[1]);
-								}
-							}
-							else
-							{
-								Tr.insertMention(Column[0],Column[1]); // mention, id
-							}
-						}
-						else if(Column[0].matches("[0-9][0-9].*"))
-						{
-							if(Preifx.equals("Num"))
-							{
-								if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
-								{
-									String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
-									String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
-									if(mention_tmp.length()>=10)
-									{
-										Tr.insertMention(mention_rev,Column[1]);
-									}
-								}
-								else
-								{
-									Tr.insertMention(Column[0],Column[1]); // mention, id
-								}
-							}
-						}
-						/*
-						else if(Column[0].matches("[a-z][0-9].*"))
-						{
-							if(Preifx.equals("AZNum"))
-							{
-								if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
-								{
-									String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
-									String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
-									if(mention_tmp.length()>=10)
-									{
-										Tr.insertMention(mention_rev,Column[1]);
-									}
-								}
-								else
-								{
-									Tr.insertMention(Column[0],Column[1]); // mention, id
-								}
-							}
-						}
-						*/
-						else if(Column[0].matches("[a-z][a-z].*"))
-						{
-							if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
-							{
-								if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
-								{
-									String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
-									String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
-									if(mention_tmp.length()>=10)
-									{
-										Tr.insertMention(mention_rev,Column[1]);
-									}
-								}
-								else
-								{
-									Tr.insertMention(Column[0],Column[1]); // mention, id
-								}
-							}
-						}
-						else if(Preifx.equals("Others"))
-						{
-							if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
-							{
-								String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
-								String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
-								if(mention_tmp.length()>=10)
-								{
-									Tr.insertMention(mention_rev,Column[1]);
-								}
-							}
-							else
-							{
-								Tr.insertMention(Column[0],Column[1]); // mention, id
-							}
-						}
-					}
-				}
-			}
-			inputfile.close();
-		}
-		catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueSpecies]: Input file does not exist.");}
-	}
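Both species loaders normalize away strain qualifiers (str., subsp., pv., ...) before insertion, and only keep the shortened synonym when at least ten alphanumeric characters survive, so aggressive trimming cannot produce a near-empty name. A standalone trace of that normalization (the input string is invented for illustration):

    // Hedged sketch of the strain-qualifier normalization used by Dictionary2Tree_UniqueSpecies.
    public class StrainQualifierDemo
    {
        public static void main(String[] args)
        {
            String qualifier = "[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]";
            String name = "Escherichia coli str. K-12 substr. MG1655";
            String rev = name.replaceAll(qualifier, " ");     // "Escherichia coli K-12 MG1655"
            String core = rev.replaceAll("[\\W\\-\\_]", "");  // alphanumeric residue used for the length guard
            if(core.length() >= 10)
            {
                System.out.println(rev);                      // kept: enough material survives
            }
        }
    }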
-	public void TreeFile2Tree(String Filename)
-	{
-		try
-		{
-			//System.out.println("TreeFile2Tree : " + Filename);
-
-			BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
-			String line="";
-			int count=0;
-			while ((line = inputfile.readLine()) != null)
-			{
-				String Anno[]=line.split("\t");
-				if(Anno.length<2){System.out.println(count+"\t"+line);} //report malformed rows
-				String LocationInTree = Anno[0];
-				String token = Anno[1];
-				String identifier="";
-				if(Anno.length==3)
-				{
-					identifier = Anno[2];
-				}
-				String LocationsInTree[]=LocationInTree.split("-");
-				TreeNode tmp = Tr.root;
-				for(int i=0;i<LocationsInTree.length-1;i++)
-				{
-					tmp=tmp.links.get(Integer.parseInt(LocationsInTree[i])-1);
-				}
-				tmp.InsertToken(token,identifier);
-				//if(count%10000==0){System.out.println(count);}
-				count++;
-			}
-			inputfile.close();
-		}
-		catch(IOException e1){ System.out.println("[TreeFile2Tree]: Input file: "+ Filename +" does not exist.");}
-	}
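TreeFile2Tree expects the pre-order dump produced by SaveTree further below: one node per row, as a dash-separated path of 1-based child indexes, the node's token, and an optional concept ID, with every parent row appearing before its children. A made-up example of the on-disk format:

    1	escherichia
    1-1	coli	562
    2	xenopus	8355

The loader walks every index but the last to reach the parent node, then appends the token (and the ID, when a third column is present) as a new child, which is why rows must be in pre-order.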
-
-	/*
-	 * Search target mention in the Prefix Tree
-	 */
-	public String MentionMatch(String Mentions)
-	{
-		ArrayList<String> location = new ArrayList<String>();
-		String Menlist[]=Mentions.split("\\|");
-		for(int m=0;m<Menlist.length;m++)
-		{
-			String Mention=Menlist[m];
-			String Mention_lc=Mention.toLowerCase();
-			Mention_lc = Mention_lc.replaceAll("[\\W\\-\\_]+", "");
-			Mention_lc = Mention_lc.replaceAll("([0-9])([a-z])", "$1 $2");
-			Mention_lc = Mention_lc.replaceAll("([a-z])([0-9])", "$1 $2");
-			String Tkns[]=Mention_lc.split(" ");
-
-			int PrefixTranslation=0;
-			int i=0;
-			boolean find=false;
-			TreeNode tmp = Tr.root;
-
-			while( i<Tkns.length && tmp.CheckChild(Tkns[i],PrefixTranslation)>=0) //Find Tokens in the links
-			{
-				if(i == Tkns.length-1){PrefixTranslation = 1;}
-				tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
-				find=true;
-				i++;
-			}
-			if(find == true)
-			{
-				if(i==Tkns.length)
-				{
-					if(!tmp.Concept.equals(""))
-					{
-						return tmp.Concept;
-					}
-					else
-					{
-						return "-1"; //gene id is not found
-					}
-				}
-				else
-				{
-					return "-2"; //the gene mention matched a substring in PrefixTree
-				}
-			}
-			else
-			{
-				return "-3"; //mention is not found
-			}
-		}
-		return "-3"; //mention is not found
-	}
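Callers branch on the sentinel strings: a concept ID on a complete match, "-1" when the token path exists but carries no ID, "-2" when the mention only matches a prefix of a longer dictionary name, and "-3" when matching fails outright. MentionMatch_species below behaves identically but keeps token separators as spaces, so multi-word names survive tokenization. A hedged usage sketch (the inserted mention and ID are invented; the surrounding GNormPlus classes are assumed to be on the classpath):

    public class MatchDemo
    {
        public static void main(String[] args)
        {
            PrefixTree tree = new PrefixTree();
            tree.insertMention("IL2RA", "3559");     // stored as the token path il -> 2 -> ra
            String id = tree.MentionMatch("IL2RA");  // several candidates can be passed as "name1|name2"
            if(id.equals("-1"))      System.out.println("path found, but no concept ID");
            else if(id.equals("-2")) System.out.println("matched only a prefix of a longer name");
            else if(id.equals("-3")) System.out.println("no match");
            else                     System.out.println("concept ID: " + id); // prints 3559
        }
    }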
-
-	/*
-	 * Search target mention in the Prefix Tree
-	 */
-	public String MentionMatch_species(String Mentions)
-	{
-		ArrayList<String> location = new ArrayList<String>();
-		String Menlist[]=Mentions.split("\\|");
-		for(int m=0;m<Menlist.length;m++)
-		{
-			String Mention=Menlist[m];
-			String Mention_lc=Mention.toLowerCase();
-			Mention_lc = Mention_lc.replaceAll("[\\W\\-\\_]+", " ");
-			Mention_lc = Mention_lc.replaceAll("([0-9])([a-z])", "$1 $2");
-			Mention_lc = Mention_lc.replaceAll("([a-z])([0-9])", "$1 $2");
-			Mention_lc = Mention_lc.replaceAll("^[ ]+", "");
-			Mention_lc = Mention_lc.replaceAll("[ ]+$", "");
-			String Tkns[]=Mention_lc.split(" ");
-
-			int PrefixTranslation=0;
-			int i=0;
-			boolean find=false;
-			TreeNode tmp = Tr.root;
-
-			while( i<Tkns.length && tmp.CheckChild(Tkns[i],PrefixTranslation)>=0) //Find Tokens in the links
-			{
-				if(i == Tkns.length-1){PrefixTranslation = 1;}
-				tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
-				find=true;
-				i++;
-			}
-			if(find == true)
-			{
-				if(i==Tkns.length)
-				{
-					if(!tmp.Concept.equals(""))
-					{
-						return tmp.Concept;
-					}
-					else
-					{
-						return "-1"; //gene id is not found
-					}
-				}
-				else
-				{
-					return "-2"; //the gene mention matched a substring in PrefixTree
-				}
-			}
-			else
-			{
-				return "-3"; //mention is not found
-			}
-		}
-		return "-3"; //mention is not found
-	}
-
-	/*
-	 * Search target mention in the Prefix Tree
-	 * ConceptType: Species|Genus|Cell|CTDGene
-	 */
-	public ArrayList<String> SearchMentionLocation(String Doc,String ConceptType)
-	{
-		ArrayList<String> location = new ArrayList<String>();
-		Doc=Doc+" XXXX XXXX";
-		String Doc_org=Doc;
-		Doc=Doc.toLowerCase();
-		String Doc_lc=Doc;
-		Doc = Doc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
-		Doc = Doc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
-		Doc = Doc.replaceAll("[\\W^;:,]+", " ");
-
-		/* = keep special characters =
-		String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
-		String DocTkns[]=Doc.split(regex);
-		*/
-
-		String DocTkns[]=Doc.split(" ");
-		int Offset=0;
-		int Start=0;
-		int Last=0;
-		int FirstTime=0;
-
-		while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
-		{
-			Doc_lc=Doc_lc.substring(1);
-			Offset++;
-		}
-
-		for(int i=0;i<DocTkns.length;i++)
-		{
-			//System.out.println(i+"\t"+Start+"\t"+Last+"\t"+Offset+"\t"+Doc_lc);
-
-			int pre_i=i;
-			int pre_Start=Start;
-			int pre_Last=Last;
-			String pre_Doc_lc=Doc_lc;
-			int pre_Offset=Offset;
-
-			TreeNode tmp = Tr.root;
-			boolean find=false;
-			int PrefixTranslation=2;
-			if(ConceptType.equals("Species"))
-			{
-				PrefixTranslation=3;
-			}
-			int ConceptFound=i; //Keep found concept
-			String ConceptFound_STR=""; //Keep found concept
-			int FirstTime_while = -1;
-
-			while( tmp.CheckChild(DocTkns[i],PrefixTranslation)>=0 ) //Find Tokens in the links
-			{
-				FirstTime_while++;
-				tmp=tmp.links.get(tmp.CheckChild(DocTkns[i],PrefixTranslation)); //move point to the link
-				if(Start==0 && FirstTime>0){Start = Offset;} //Start <- Offset
-				if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
-				{
-					if(DocTkns[i].length()>0)
-					{
-						Doc_lc=Doc_lc.substring(DocTkns[i].length());
-						Offset=Offset+DocTkns[i].length();
-					}
-				}
-				Last = Offset;
-				while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
-				{
-					Doc_lc=Doc_lc.substring(1);
-					Offset++;
-				}
-				i++;
-
-				if(ConceptType.equals("Species"))
-				{
-					if(i<DocTkns.length-3 && DocTkns[i].matches("(str|strain|substr|substrain|subspecies|subsp|var|variant|pathovars|pv|biovar|bv)"))
-					{
-						Doc_lc=Doc_lc.substring(DocTkns[i].length());
-						Offset=Offset+DocTkns[i].length();
-						Last = Offset;
-						while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
-						{
-							Doc_lc=Doc_lc.substring(1);
-							Offset++;
-						}
-						i++;
-					}
-				}
-
-				if(!tmp.Concept.equals("") && (Last-Start>0)) //Keep found concept
-				{
-					if(Last<Doc_org.length())
-					{
-						ConceptFound=i;
-						ConceptFound_STR=Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept;
-						//System.out.println(ConceptFound_STR);
-					}
-				}
-
-				find=true;
-				if(i>=DocTkns.length){break;}
-				else if(i==DocTkns.length-1){PrefixTranslation=2;}
-
-				//System.out.println(i+"\t"+Start+"\t"+Last+"\t("+FirstTime_while+")\t"+Offset+"\t"+Doc_lc);
-
-				if(FirstTime_while==0) // first matched token
-				{
-					pre_i=i;
-					pre_Start=Start;
-					pre_Last=Last;
-					pre_Doc_lc=Doc_lc;
-					pre_Offset=Offset;
-				}
-			}
-
-			if(find == true)
-			{
-				//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
-				if(!tmp.Concept.equals("")) //the last matched token has concept id
-				{
-					if(Last<Doc_org.length() && Last>Start)
-					{
-						location.add(Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
-					}
-				}
-				else
-				{
-					if(!ConceptFound_STR.equals("")) //Keep found concept
-					{
-						location.add(ConceptFound_STR);
-						i = ConceptFound + 1;
-					}
-
-					if(FirstTime_while>=1)
-					{
-						i=pre_i;
-						Start=pre_Start;
-						Last=pre_Last;
-						Doc_lc=pre_Doc_lc;
-						Offset=pre_Offset;
-					}
-				}
-				Start=0;
-				Last=0;
-				if(i>0){i--;}
-				ConceptFound=i; //Keep found concept
-				ConceptFound_STR=""; //Keep found concept
-			}
-			else //if(find == false)
-			{
-				//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
-
-				if(FirstTime_while>=1 && tmp.Concept.equals(""))
-				{
-					i=pre_i;
-					Start=pre_Start;
-					Last=pre_Last;
-					Doc_lc=pre_Doc_lc;
-					Offset=pre_Offset;
-				}
-
-				if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
-				{
-					if(DocTkns[i].length()>0)
-					{
-						Doc_lc=Doc_lc.substring(DocTkns[i].length());
-						Offset=Offset+DocTkns[i].length();
-					}
-				}
-			}
-
-			while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
-			{
-				Doc_lc=Doc_lc.substring(1);
-				Offset++;
-			}
-			FirstTime++;
-
-			//System.out.println();
-		}
-		return location;
-	}
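Each hit comes back as a tab-separated record of character offsets into the original (case-preserved) text, the surface mention, and the concept ID — exactly the format the SR code in the next file splits apart. A hedged caller sketch (document text and taxonomy ID invented):

    public class LocateDemo
    {
        public static void main(String[] args)
        {
            PrefixTree tree = new PrefixTree();
            tree.insertMention("Escherichia coli", "562");
            for(String hit : tree.SearchMentionLocation("Grown in Escherichia coli cultures.", "Species"))
            {
                String[] f = hit.split("\t"); // start, last, mention, concept ID
                System.out.println(f[2] + " [" + f[0] + ".." + f[1] + "] -> taxid " + f[3]);
            }
        }
    }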
-
-	/*
-	 * Print out the Prefix Tree
-	 */
-	public String PrintTree()
-	{
-		return Tr.PrintTree_preorder(Tr.root,"");
-	}
-
-	public void SaveTree(String outputfile) throws IOException
-	{
-		BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8"));
-		Tr.SaveTree_preorder(Tr.root,"",fr);
-		fr.close();
-	}
-
-	public void insertMention(String Mention, String Identifier)
-	{
-		Tr.insertMention(Mention,Identifier);
-	}
-}
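SaveTree writes the pre-order dump that TreeFile2Tree reads back, so a tree built once from flat dictionaries can be reloaded without re-parsing them. A hedged round-trip sketch (the file name is invented):

    import java.io.IOException;

    public class RoundTripDemo
    {
        public static void main(String[] args) throws IOException
        {
            PrefixTree tree = new PrefixTree();
            tree.insertMention("Saccharomyces cerevisiae", "4932");
            tree.SaveTree("species.tree");            // rows: path<TAB>token[<TAB>conceptID]

            PrefixTree reloaded = new PrefixTree();
            reloaded.TreeFile2Tree("species.tree");   // rebuilds the identical trie row by row
            System.out.println(reloaded.MentionMatch_species("Saccharomyces cerevisiae")); // 4932
        }
    }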
-
-class Tree
-{
-	/*
-	 * Prefix Tree - root node
-	 */
-	public TreeNode root;
-
-	public Tree()
-	{
-		root = new TreeNode("-ROOT-");
-	}
-
-	/*
-	 * Insert mention into the tree
-	 */
-	public void insertMention(String Mention, String Identifier)
-	{
-		Mention=Mention.toLowerCase();
-
-		Mention = Mention.replaceAll("([0-9])([A-Za-z])", "$1 $2");
-		Mention = Mention.replaceAll("([A-Za-z])([0-9])", "$1 $2");
-		Mention = Mention.replaceAll("[\\W\\-\\_]+", " ");
-		/* = keep special characters =
-		String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
-		String Tokens[]=Mention.split(regex);
-		*/
-		String Tokens[]=Mention.split(" ");
-		TreeNode tmp = root;
-		for(int i=0;i<Tokens.length;i++)
-		{
-			if(tmp.CheckChild(Tokens[i],0)>=0)
-			{
-				tmp=tmp.links.get( tmp.CheckChild(Tokens[i],0) ); //descend into the existing child node
-				if(i == Tokens.length-1)
-				{
-					tmp.Concept=Identifier;
-				}
-			}
-			else //node does not exist yet
-			{
-				if(i == Tokens.length-1)
-				{
-					tmp.InsertToken(Tokens[i],Identifier);
-				}
-				else
-				{
-					tmp.InsertToken(Tokens[i]);
-				}
-				tmp=tmp.links.get(tmp.NumOflinks-1); //descend into the newly created node
-			}
-		}
-	}
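Because insertMention lowercases, splits every letter/digit boundary, and squashes punctuation runs to single spaces, surface variants such as "IL-2RA", "IL2RA", and "il 2 ra" all collapse onto the same token path. A standalone trace of the three normalization steps:

    public class TokenizeDemo
    {
        public static void main(String[] args)
        {
            String m = "IL-2RA".toLowerCase();               // "il-2ra"
            m = m.replaceAll("([0-9])([A-Za-z])", "$1 $2");  // "il-2 ra"
            m = m.replaceAll("([A-Za-z])([0-9])", "$1 $2");  // unchanged: '-' already separates the letter and digit
            m = m.replaceAll("[\\W\\-\\_]+", " ");           // "il 2 ra"
            System.out.println(String.join(" -> ", m.split(" "))); // il -> 2 -> ra
        }
    }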
-
-	/*
-	 * Print the tree in pre-order
-	 */
-	public String PrintTree_preorder(TreeNode node, String LocationInTree)
-	{
-		String opt="";
-		if(!node.token.equals("-ROOT-")) //ignore root
-		{
-			if(node.Concept.equals(""))
-			{
-				opt=opt+LocationInTree+"\t"+node.token+"\n";
-			}
-			else
-			{
-				opt=opt+LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n";
-			}
-		}
-		if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";}
-		for(int i=0;i<node.NumOflinks;i++)
-		{
-			opt=opt+PrintTree_preorder(node.links.get(i),LocationInTree+(i+1));
-		}
-		return opt;
-	}
-
-	/*
-	 * Save the tree in pre-order
-	 */
-	public void SaveTree_preorder(TreeNode node, String LocationInTree, BufferedWriter fr) throws IOException
-	{
-		if(!node.token.equals("-ROOT-")) //ignore root
-		{
-			if(node.Concept.equals(""))
-			{
-				fr.write(LocationInTree+"\t"+node.token+"\n");
-			}
-			else
-			{
-				fr.write(LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n");
-			}
-		}
-		if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";}
-		for(int i=0;i<node.NumOflinks;i++)
-		{
-			SaveTree_preorder(node.links.get(i),LocationInTree+(i+1),fr);
-		}
-	}
-}
-
-class TreeNode
-{
-	String token; //token of the node
-	int NumOflinks; //number of links
-	public String Concept;
-	HashMap<String,Integer> Hashs;
-	ArrayList<TreeNode> links;
-
-	public TreeNode(String Tok,String ID)
-	{
-		token = Tok;
-		NumOflinks = 0;
-		Concept = ID;
-		links = new ArrayList<TreeNode>(); /*link*/
-		Hashs = new HashMap<String,Integer>(); /*hash*/
-	}
-	public TreeNode(String Tok)
-	{
-		token = Tok;
-		NumOflinks = 0;
-		Concept = "";
-		links = new ArrayList<TreeNode>(); /*link*/
-		Hashs = new HashMap<String,Integer>(); /*hash*/
-	}
-	public TreeNode()
-	{
-		token = "";
-		NumOflinks = 0;
-		Concept = "";
-		links = new ArrayList<TreeNode>(); /*link*/
-		Hashs = new HashMap<String,Integer>(); /*hash*/
-	}
-
-	public String toString()
-	{
-		return (token+"\t"+Concept);
-	}
-
-	/*
-	 * Insert a new node under the target node
-	 */
-	public void InsertToken(String Tok)
-	{
-		TreeNode NewNode = new TreeNode(Tok);
-
-		/*link*/
-		links.add(NewNode);
-
-		/*hash*/
-		Hashs.put(Tok, NumOflinks);
-
-		NumOflinks++;
-	}
-	public void InsertToken(String Tok,String ID)
-	{
-		TreeNode NewNode = new TreeNode(Tok,ID);
-		/*link*/
-		links.add(NewNode);
-
-		/*hash*/
-		Hashs.put(Tok, NumOflinks);
-
-		NumOflinks++;
-	}
-
-	/*
-	 * Check the tokens of the children
-	 */
-	public int CheckChild(String Tok, Integer PrefixTranslation)
-	{
-		if(Hashs.containsKey(Tok))
-		{
-			return(Hashs.get(Tok));
-		}
-
-		if(PrefixTranslation == 1 && Tok.matches("(alpha|beta|gamma|[abg]|[12])")) // SuffixTranslationMap
-		{
-			if(Hashs.containsKey(GNormPlus.SuffixTranslationMap_hash.get(Tok)))
-			{
-				return(Hashs.get(GNormPlus.SuffixTranslationMap_hash.get(Tok)));
-			}
-		}
-		else if(PrefixTranslation == 2 && Tok.matches("[1-5]")) // for CTDGene feature
-		{
-			for(int i=0;i<links.size();i++)
-			{
-				if(links.get(i).token.matches("[1-5]"))
-				{
-					return(i);
-				}
-			}
-
-			for(int i=1;i<=5;i++)
-			{
-				// keys in Hashs are Strings, so convert the digit before lookup
-				if(Hashs.containsKey(String.valueOf(i))){return(Hashs.get(String.valueOf(i)));}
-			}
-		}
-
-		return(-1);
-	}
-}
-
+ [new file, lines 1-893: identical to the old PrefixTree.java shown above]
 
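SR.java below is the main consumer of these trees: for every document it scans each passage with the global species and cell-line trees, then builds small per-document trees (genus names, strains, abbreviations and space-free variants) via Hash2Tree and rescans with those. A condensed, hedged sketch of that flow (document text and IDs invented):

    import java.util.HashMap;

    public class SpeciesFlowDemo
    {
        public static void main(String[] args)
        {
            HashMap<String, String> genus = new HashMap<String, String>();
            genus.put("562", "escherichia");  // ID -> '|'-separated names, as Hash2Tree expects
            PrefixTree pt = new PrefixTree();
            pt.Hash2Tree(genus);

            for(String hit : pt.SearchMentionLocation("The escherichia isolate grew well.", "Genus"))
            {
                System.out.println(hit);      // start \t last \t escherichia \t 562
            }
        }
    }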
src_Java/GNormPluslib/SR.java CHANGED
@@ -1,1044 +1,1044 @@
-/**
- * Project: GNormPlus
- * Function: Species recognition and Species assignment
- */
-
-package GNormPluslib;
-
-import bioc.BioCAnnotation;
-import bioc.BioCCollection;
-import bioc.BioCDocument;
-import bioc.BioCLocation;
-import bioc.BioCPassage;
-
-import bioc.io.BioCDocumentWriter;
-import bioc.io.BioCFactory;
-import bioc.io.woodstox.ConnectorWoodstox;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.text.BreakIterator;
-import java.time.LocalDate;
-import java.time.ZoneId;
-
-import javax.xml.stream.XMLStreamException;
-
-import org.tartarus.snowball.SnowballStemmer;
-import org.tartarus.snowball.ext.englishStemmer;
-
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Collections;
-
-public class SR
-{
-	@SuppressWarnings("null")
-	public void SpeciesRecognition(String Filename,String FilenameBioC,String StrainFilename,String FilterAntibody) throws IOException, XMLStreamException
-	{
-		/** Recognizing Species Names: SP */
-		for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
-		{
-			String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
-			PrefixTree PT_Genus = new PrefixTree();
-			HashMap<String, String> SPID_hash = new HashMap<String, String>();
-			ArrayList<String> TargetedLocation = new ArrayList<String>();
-			HashMap<String, String> GenusNames = new HashMap<String, String>();
-			HashMap<String, String> Mention2ID_lc = new HashMap<String, String>();
-			ArrayList<String> IDset = new ArrayList<String>();
-			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
-			{
-				String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
-
-				/** Species recognition */
-				ArrayList<String> locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species"); /** PT_Species */
-				for (int k = 0 ; k < locations.size() ; k++)
-				{
-					String anno[]=locations.get(k).split("\t");
-					int start= Integer.parseInt(anno[0]);
-					int last= Integer.parseInt(anno[1]);
-
-					// For anti-serum filtering
-					String ForwardSTR="";
-					String BackwardSTR="";
-					if(start>21)
-					{
-						ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
-					}
-					else
-					{
-						ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
-					}
-					if(PassageContext.length()>last+21)
-					{
-						BackwardSTR = PassageContext.substring(start,last+21);
-					}
-					else
-					{
-						BackwardSTR = PassageContext.substring(start,PassageContext.length());
-					}
-
-					String mention = anno[2];
-					String id = anno[3];
-					String mention_tmp=mention.toLowerCase();
-					mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
-					String antibody="";
-					if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";} //filtering : antibody
-					else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
-					else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
-
-					if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") )
-					{
-						last=last+1;
-						mention=mention+")";
-					}
-
-					if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){} // filtered: Bee1p
-					else if((mention.matches(".*[;:,].*")) && mention.length()<=10){} // filtered : x, XXX
-					else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){} // to 7
-					else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){} // s). Major
-					else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){} // s). Major
-					else if(!id.equals("NA"))
-					{
-						if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
-						{
-							if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3)) // invalid species: "a group/a GAL4/a strain"
-							{
-								if(FilterAntibody.equals("False") || (!antibody.equals("(anti)")))
-								{
-									String patt="^(.+?) [sS]train";
-									Pattern ptmp = Pattern.compile(patt);
-									Matcher mtmp = ptmp.matcher(mention);
-									if(mtmp.find())
-									{
-										mention=mtmp.group(1);
-										last=last-7;
-									}
-									GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); //+antibody
-									String mentions_tmp=mention.toLowerCase();
-									mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
-									mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
-									GNormPlus.Filtering_hash.put(mentions_tmp,"");
-									Mention2ID_lc.put(mention.toLowerCase(), id); //+antibody
-
-									String mention_genus = "";
-									patt="^([A-Za-z]+) ";
-									ptmp = Pattern.compile(patt);
-									mtmp = ptmp.matcher(mention);
-									if(mtmp.find())
-									{
-										mention_genus=mtmp.group(1); // get genus
-									}
-
-									IDset.add(id);
-									for(int s=start;s<last;s++)
-									{
-										TargetedLocation.add(j+"\t"+s);
-									}
-									String ids[]=id.split(";");
-									for(int x=0;x<ids.length;x++)
-									{
-										patt="^\\**([0-9]+)";
-										ptmp = Pattern.compile(patt);
-										mtmp = ptmp.matcher(ids[x]);
-										if(mtmp.find())
-										{
-											SPID_hash.put(mtmp.group(1), mention_genus);
-										}
-									}
-								}
-							}
-						}
-					}
-				}
-
-				/** Cell Line recognition */
-				locations = GNormPlus.PT_Cell.SearchMentionLocation(PassageContext,"Cell"); /** PT_Cell */
-				for (int k = 0 ; k < locations.size() ; k++)
-				{
-					String anno[]=locations.get(k).split("\t");
-					int start= Integer.parseInt(anno[0]);
-					int last= Integer.parseInt(anno[1]);
-					String mention = anno[2];
-					String id = anno[3];
-					if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
-					{
-						if(!TargetedLocation.contains(j+"\t"+start)) //already exists
-						{
-							int last40=0;
-							if(PassageContext.length()>=last+40)
-							{
-								last40=last+40;
-							}
-							else
-							{
-								last40=PassageContext.length();
-							}
-
-							// For anti-serum filtering
-							String ForwardSTR="";
-							String BackwardSTR="";
-							if(start>21)
-							{
-								ForwardSTR = PassageContext.substring(start-21,last);
-							}
-							else
-							{
-								ForwardSTR = PassageContext.substring(0,last);
-							}
-							if(PassageContext.length()>last+21)
-							{
-								BackwardSTR = PassageContext.substring(start,last+21);
-							}
-							else
-							{
-								BackwardSTR = PassageContext.substring(start,PassageContext.length());
-							}
-							String mention_tmp=mention.toLowerCase();
-							mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
-							if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){}
-							else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){} // filtered: Bee1p
-							else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){} // filtered: IL-22RA1
-							else
-							{
-								String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]";
-								Pattern ptmp = Pattern.compile(patt);
-								Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase());
-								if(mtmp.find())
-								{
-									if(GNormPlus.taxid4gene.contains(id)) // for gene
-									{
-										id="*"+id;
-									}
-									GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id);
-									String mentions_tmp=mention.toLowerCase();
-									mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
-									mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
-									GNormPlus.Filtering_hash.put(mentions_tmp,"");
-									IDset.add(id);
-									for(int s=start;s<last;s++)
-									{
-										TargetedLocation.add(j+"\t"+s);
-									}
-								}
-							}
-						}
-					}
-				}
-
-				/** Genus names*/
-				for(String ID: SPID_hash.keySet())
-				{
-					if(GNormPlus.GenusID_hash.containsKey(ID))
-					{
-						GenusNames.put(ID,GNormPlus.GenusID_hash.get(ID));
-					}
-					if(SPID_hash.get(ID).length()>=7)
-					{
-						GenusNames.put(ID,SPID_hash.get(ID));
-					}
-				}
-			}
-
-			GenusNames.put("3702", "arabidopsis");
-			GenusNames.put("4932", "saccharomyces");
-			GenusNames.put("562", "escherichia");
-			GenusNames.put("7227", "drosophila");
-			GenusNames.put("8355", "xenopus");
-
-			PT_Genus.Hash2Tree(GenusNames);
-
-			/** Genus recognition */
-			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
-			{
-				if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
-				   GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
-				   GNormPlus.BioCDocobj.Annotations.size()>i &&
-				   GNormPlus.BioCDocobj.Annotations.get(i).size()>j
-				  )
-				{
-					String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
-					ArrayList<String> locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus"); /** PT_Genus*/
-					for (int k = 0 ; k < locations_Genus.size() ; k++)
-					{
-						String anno[]=locations_Genus.get(k).split("\t");
-						String start= anno[0];
-						String last= anno[1];
-						String mention = anno[2];
-						String id = anno[3];
-						if(!TargetedLocation.contains(j+"\t"+start)) //already exists
-						{
-							String patt="^\\**([0-9]+)$";
-							Pattern ptmp = Pattern.compile(patt);
-							Matcher mtmp = ptmp.matcher(id);
-							if(mtmp.find())
-							{
-								id = mtmp.group(1);
-							}
-
-							if(GNormPlus.taxid4gene.contains(id)) // for gene
-							{
-								id="*"+id;
-							}
-							GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id);
-							String mentions_tmp=mention.toLowerCase();
-							mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
-							mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
-							GNormPlus.Filtering_hash.put(mentions_tmp,"");
-							IDset.add(id);
-							for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
-							{
-								TargetedLocation.add(j+"\t"+s);
-							}
-						}
-					}
-				}
-			}
-
-			/** Strain Tree */
-			PrefixTree PT_Strain = new PrefixTree();
-			HashMap<String, String> StrainID_hash = new HashMap<String, String>();
-			BufferedReader br = new BufferedReader(new FileReader(StrainFilename));
-			String line="";
-			while ((line = br.readLine()) != null)
-			{
-				String l[]=line.split("\t");
-				String ancestor = l[0];
-				String tax_id = l[1];
-				String tax_names = l[2];
-				if(SPID_hash.containsKey(ancestor))
-				{
-					StrainID_hash.put(tax_id, tax_names); // tax id -> strain
-				}
-				else if(SPID_hash.containsKey(tax_id))
-				{
-					StrainID_hash.put(tax_id, tax_names); // tax id -> strain
-				}
-			}
-			br.close();
-			HashMap<String, String> StrainNames = new HashMap<String, String>();
-			for(String ID: StrainID_hash.keySet())
-			{
-				StrainNames.put(ID,StrainID_hash.get(ID));
-			}
-
-			PT_Strain.Hash2Tree(StrainNames);
-
-			/** Strain recognition */
-			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
-			{
-				if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
-				   GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
-				   GNormPlus.BioCDocobj.Annotations.size()>i &&
-				   GNormPlus.BioCDocobj.Annotations.get(i).size()>j
-				  )
-				{
-					String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
-					ArrayList<String> locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain"); /** PT_Strain*/
-					for (int k = 0 ; k < locations_Strain.size() ; k++)
-					{
-						String anno[]=locations_Strain.get(k).split("\t");
-						String start= anno[0];
-						String last= anno[1];
-						String mention = anno[2];
-						String id = anno[3];
-						if(!TargetedLocation.contains(j+"\t"+start)) //already exists
-						{
-							if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}"))
-							{
-								if(GNormPlus.taxid4gene.contains(id)) // for gene
-								{
-									id="*"+id;
-								}
-								GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id);
-								String mentions_tmp=mention.toLowerCase();
-								mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
-								mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
-								GNormPlus.Filtering_hash.put(mentions_tmp,"");
-								IDset.add(id);
-								for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
-								{
-									TargetedLocation.add(j+"\t"+s);
-								}
-							}
-						}
-					}
-				}
-			}
-
-			HashMap<String, String> OtherNames = new HashMap<String, String>();
-			for(String men : Mention2ID_lc.keySet())
-			{
-				String men_id= Mention2ID_lc.get(men);
-				if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(Pmid+"\t"+men))
-				{
-					String Abb = GNormPlus.PmidLF2Abb_lc_hash.get(Pmid+"\t"+men);
-					// Abbreviation
-					if(OtherNames.containsKey(men_id))
-					{
-						OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb);
-					}
-					else
-					{
-						OtherNames.put(men_id,Abb);
-					}
-				}
-				String men_nospace=men.replaceAll(" ", "");
-				// no space
-				if(OtherNames.containsKey(men_id))
-				{
-					OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace);
-				}
-				else
-				{
-					OtherNames.put(men_id,men_nospace);
-				}
-			}
-			PrefixTree PT_Others = new PrefixTree();
-			PT_Others.Hash2Tree(OtherNames);
-
-			/**
-			 * Others:
-			 * 1) Abbreviation
-			 * 2) no space
-			 */
-			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
-			{
-				if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
-				   GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
-				   GNormPlus.BioCDocobj.Annotations.size()>i &&
-				   GNormPlus.BioCDocobj.Annotations.get(i).size()>j
-				  )
-				{
-					String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
-					ArrayList<String> locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species"); /** PT_Abb*/
-					for (int k = 0 ; k < locations_Abb.size() ; k++)
-					{
-						String anno[]=locations_Abb.get(k).split("\t");
-						String start= anno[0];
-						String last= anno[1];
-						String mention = anno[2];
-						String id = anno[3];
-						if(!TargetedLocation.contains(j+"\t"+start)) //already exists
-						{
-							if(GNormPlus.taxid4gene.contains(id)) // for gene
-							{
-								id="*"+id;
-							}
-							GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
-							String mentions_tmp=mention.toLowerCase();
-							mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
-							mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
-							GNormPlus.Filtering_hash.put(mentions_tmp,"");
-							Mention2ID_lc.put(mention.toLowerCase(), id);
-							IDset.add(id);
-							for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
-							{
-								TargetedLocation.add(j+"\t"+s);
-							}
-						}
-					}
-				}
-			}
-
-			for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
-			{
-				if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
-				{
-					ArrayList <Integer> remove_anno = new ArrayList <Integer>();
-					for (int a = 0; a < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); a++) /** Annotations : a */
-					{
-						String SpAnno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a).split("\t");
-						String start= SpAnno[0];
-						String last= SpAnno[1];
-						String mention = SpAnno[2];
-						String type = SpAnno[3];
-
-						if(type.matches("Gene|FamilyName"))
-						{
-							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\t"+type);
-						}
-						else if(type.matches("Species|Genus|Strain|Cell") && SpAnno.length==5)
-						{
-							//System.out.println(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a));
-							/** Abbreviation solution */
-							if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase())))
478
- {
479
- String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase());
480
- if(Mention2ID_lc.containsKey(LF_lc))
481
- {
482
- String LF_ID=Mention2ID_lc.get(LF_lc);
483
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID);
484
- String mentions_tmp=mention.toLowerCase();
485
- mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
486
- mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
487
- GNormPlus.Filtering_hash.put(mentions_tmp,"");
488
- }
489
- }
490
- else if (SpAnno.length>4)
491
- {
492
- String id = SpAnno[4];
493
- String id_split[]=id.split(";");
494
- if(id_split.length>=2)
495
- {
496
- /** Smallest set of tax ids */
497
- boolean found=false;
498
- for(int x=0;x<IDset.size();x++)
499
- {
500
- String id_tmp= IDset.get(x);
501
- for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
502
- {
503
- if(id_split[y].equals(id_tmp))
504
- {
505
- found=true;
506
- }
507
- }
508
- if(found == true)
509
- {
510
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id_tmp);
511
- String mentions_tmp=mention.toLowerCase();
512
- mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
513
- mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
514
- GNormPlus.Filtering_hash.put(mentions_tmp,"");
515
- x=1000000;
516
- }
517
- }
518
-
519
- /** smallest tax id number */
520
- if(found == false)
521
- {
522
- int min=10000000;
523
- String min_id="";
524
- for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
525
- {
526
- String id_tmp = id_split[y];
527
- String patt="^\\**([0-9]+)";
528
- Pattern ptmp = Pattern.compile(patt);
529
- Matcher mtmp = ptmp.matcher(id_tmp);
530
- if(mtmp.find())
531
- {
532
- id_tmp = mtmp.group(1);
533
- }
534
-
535
- if(y==0)
536
- {
537
- min_id=id_split[y];
538
- min=Integer.parseInt(id_tmp);
539
- }
540
- else if(Integer.parseInt(id_tmp)<min)
541
- {
542
- min=Integer.parseInt(id_tmp);
543
- min_id=id_tmp;
544
- }
545
- }
546
- if(GNormPlus.taxid4gene.contains(min_id)) // for gene
547
- {
548
- min_id="*"+min_id;
549
- }
550
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\tSpecies\t"+min_id);
551
- String mentions_tmp=mention.toLowerCase();
552
- mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
553
- mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
554
- GNormPlus.Filtering_hash.put(mentions_tmp,"");
555
- }
556
- }
557
- }
558
- }
559
- else //disease, and other concepts
560
- {
561
- remove_anno.add(a);
562
- }
563
- }
564
-
565
- Collections.sort(remove_anno);
566
- for (int counter = remove_anno.size()-1; counter >= 0 ; counter--)
567
- {
568
- int ai=remove_anno.get(counter);
569
- //System.out.println("\n"+ai+"\t"+GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(ai));
570
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(ai);
571
- }
572
- }
573
- }
574
- }
575
- GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); //save in BioC file
576
- }
577
- public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOException, XMLStreamException
578
- {
579
- GNormPlus.BioCDocobj.Annotations = new ArrayList();
580
- GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename);
581
-
582
- BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
583
- for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
584
- {
585
- HashMap<String, String> PrefixIDTarget_hash = new HashMap<String, String>();
586
- PrefixIDTarget_hash.put("9606", "h");
587
- PrefixIDTarget_hash.put("10090", "m");
588
- PrefixIDTarget_hash.put("10116", "r");
589
- PrefixIDTarget_hash.put("4932", "y");
590
- PrefixIDTarget_hash.put("7227", "d");
591
- PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr");
592
- PrefixIDTarget_hash.put("3702", "at|At");
593
-
594
- HashMap<String, Double> SP2Num_hash = new HashMap<String, Double>();
595
- for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
596
- {
597
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
598
- {
599
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
600
- if(anno.length==5) //Species
601
- {
602
- String patt="^\\**([0-9]+)$";
603
- Pattern ptmp = Pattern.compile(patt);
604
- Matcher mtmp = ptmp.matcher(anno[4]);
605
- if(mtmp.find())
606
- {
607
- String id = mtmp.group(1);
608
-
609
- if(!PrefixIDTarget_hash.containsKey(id))
610
- {
611
- PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id)); // taxid -> prefix
612
- }
613
- if(j == 0)//title
614
- {
615
- if(SP2Num_hash.containsKey(id))
616
- {
617
- SP2Num_hash.put(id, SP2Num_hash.get(id)+2);
618
- }
619
- else
620
- {
621
- if(GNormPlus.TaxFreq_hash.containsKey(id))
622
- {
623
- SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2);
624
- }
625
- else
626
- {
627
- SP2Num_hash.put(id, 2.0);
628
- }
629
- }
630
- // Virus -> Human (not to double weight human to virus)
631
- /*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
632
- {
633
- if(SP2Num_hash.containsKey("9606"))
634
- {
635
- SP2Num_hash.put("9606", SP2Num_hash.get("9606")+2);
636
- }
637
- else
638
- {
639
- SP2Num_hash.put("9606", 2 + GNormPlus.TaxFreq_hash.get("9606")+1);
640
- }
641
- }*/
642
- }
643
- else
644
- {
645
- if(SP2Num_hash.containsKey(id))
646
- {
647
- SP2Num_hash.put(id, SP2Num_hash.get(id)+1);
648
- }
649
- else
650
- {
651
- if(GNormPlus.TaxFreq_hash.containsKey(id))
652
- {
653
- SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id));
654
- }
655
- else
656
- {
657
- SP2Num_hash.put(id, 1.0);
658
- }
659
- }
660
- // Virus -> Human
661
- /*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
662
- {
663
- if(SP2Num_hash.containsKey("9606"))
664
- {
665
- SP2Num_hash.put("9606", SP2Num_hash.get("9606")+1);
666
- }
667
- else
668
- {
669
- SP2Num_hash.put("9606", GNormPlus.TaxFreq_hash.get("9606")+1);
670
- }
671
- }*/
672
- }
673
- }
674
- }
675
- }
676
- }
677
- String MajorSP="9606";
678
- double MaxSP=0;
679
- for(String tid : SP2Num_hash.keySet())
680
- {
681
- if(SP2Num_hash.get(tid)>MaxSP)
682
- {
683
- MajorSP=tid;
684
- MaxSP=SP2Num_hash.get(tid);
685
- }
686
- }
687
-
688
- for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
689
- {
690
- String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
691
- //int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset
692
- iterator.setText(PassageContext);
693
- ArrayList<Integer> Sentence_offsets = new ArrayList<Integer>();
694
- int Sent_start = iterator.first();
695
- for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next())
696
- {
697
- Sentence_offsets.add(Sent_start);
698
- }
699
-
700
- HashMap<Integer,String> Annotations_Gene_hash = new HashMap<Integer,String>();
701
- ArrayList<String> Annotations_Species = new ArrayList<String>();
702
- if(GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
703
- {
704
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
705
- {
706
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
707
- if(anno.length==5) //Species
708
- {
709
- Annotations_Species.add(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k));
710
- }
711
- else //Gene : if(anno.length==3)
712
- {
713
- //String mention = PassageContext.substring(Integer.parseInt(anno[0]), Integer.parseInt(anno[1]));
714
- Annotations_Gene_hash.put(k,GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)); // k -> Gene Annotation
715
- }
716
- }
717
-
718
- //Gene --> Species Inference (PMID:28777492)
719
- HashMap<String,HashMap<Integer,String>> mention2Location2Species_hash = new HashMap<String,HashMap<Integer,String>>();
720
- HashMap<Integer,String> Location2Species_hash = new HashMap<Integer,String>();
721
- for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
722
- {
723
- boolean SPfound = false;
724
- String anno[] = Annotations_Gene_hash.get(k).split("\t");
725
- int G_Start= Integer.parseInt(anno[0]);
726
- int G_Last= Integer.parseInt(anno[1]);
727
- String G_mentions = anno[2];
728
- /**
729
- * 2. Co-occurring word
730
- * boundary :
731
- * Sentence Start: Sentence_offsets.get(Target_Sentence)
732
- * Sentence Last: Sentence_offsets.get(Target_Sentence+1)
733
- */
734
- //Find the target sentence
735
- int Target_Sentence=0;
736
- if(SPfound == false) // 1. left : Closed to start of the gene mention
737
- {
738
- for(int s=0;s<Sentence_offsets.size();s++)
739
-
740
- {
741
- int Sentence_last=1000000;
742
- if(s<Sentence_offsets.size()-1)
743
- {
744
- Sentence_last=Sentence_offsets.get(s+1);
745
- }
746
- if(G_Start<Sentence_last)
747
- {
748
- Target_Sentence=s;
749
- break;
750
- }
751
- }
752
- }
753
- int Sentence_Start = Sentence_offsets.get(Target_Sentence);
754
- int Sentence_Last = 1000000;
755
- if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
756
- if(SPfound == false) // 1. left : Closed to start of the gene mention
757
- {
758
- int closet_Sp_Start=0;
759
- for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
760
- {
761
- String AnnoSp[]=Annotations_Species.get(sp).split("\t");
762
- int Sp_Start = Integer.parseInt(AnnoSp[0]);
763
- String patt="^\\**([0-9]+)$";
764
- Pattern ptmp = Pattern.compile(patt);
765
- Matcher mtmp = ptmp.matcher(AnnoSp[4]);
766
- if(mtmp.find())
767
- {
768
- String taxid = mtmp.group(1);
769
- Location2Species_hash.put(Sp_Start,taxid);
770
- if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
771
- {
772
- closet_Sp_Start=Sp_Start;
773
- Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
774
-
775
- if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
776
- {
777
- mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
778
- }
779
- else
780
- {
781
- mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
782
- }
783
-
784
- SPfound=true;
785
- }
786
- }
787
- }
788
- }
789
- if(SPfound == false) // 2. right : Closed to last of the gene mention
790
- {
791
- int closet_Sp_Last=1000000;
792
- for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
793
- {
794
- String AnnoSp[]=Annotations_Species.get(sp).split("\t");
795
- int Sp_Last = Integer.parseInt(AnnoSp[1]);
796
- String patt="^\\**([0-9]+)$";
797
- Pattern ptmp = Pattern.compile(patt);
798
- Matcher mtmp = ptmp.matcher(AnnoSp[4]);
799
- if(mtmp.find())
800
- {
801
- String taxid = mtmp.group(1);
802
- if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
803
- {
804
- closet_Sp_Last=Sp_Last;
805
- Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
806
-
807
- if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
808
- {
809
- mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
810
- }
811
- else
812
- {
813
- mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
814
- }
815
-
816
- SPfound=true;
817
- }
818
- }
819
- }
820
- }
821
- }
822
-
823
- for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
824
- {
825
- String anno[] = Annotations_Gene_hash.get(k).split("\t");
826
- int G_Start= Integer.parseInt(anno[0]);
827
- int G_Last= Integer.parseInt(anno[1]);
828
- String G_mentions = anno[2];
829
- String G_type = anno[3];
830
- String G_mention_list[]=G_mentions.split("\\|");
831
- String G_mention=G_mention_list[0]; // only use the first term to detect species ; should be updated after SimConcept
832
-
833
- /** 1. prefix */
834
- boolean SPfound = false;
835
- for(String taxid: PrefixIDTarget_hash.keySet())
836
- {
837
- if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase()))
838
- {
839
- //special case, and no need for prefix - SA
840
- }
841
- else
842
- {
843
- Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$");
844
- Matcher mtmp = ptmp.matcher(G_mention);
845
- if(mtmp.find())
846
- {
847
- String MentionWoPrefix=mtmp.group(2);
848
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid);
849
- SPfound=true;
850
- break;
851
- }
852
- }
853
- }
854
-
855
- /**
856
- * 2. Co-occurring word
857
- * boundary :
858
- * Sentence Start: Sentence_offsets.get(Target_Sentence)
859
- * Sentence Last: Sentence_offsets.get(Target_Sentence+1)
860
- */
861
- //Find the target sentence
862
- int Target_Sentence=0;
863
- if(SPfound == false) // 1. left : Closed to start of the gene mention
864
- {
865
- for(int s=0;s<Sentence_offsets.size();s++)
866
-
867
- {
868
- int Sentence_last=1000000;
869
- if(s<Sentence_offsets.size()-1)
870
- {
871
- Sentence_last=Sentence_offsets.get(s+1);
872
- }
873
- if(G_Start<Sentence_last)
874
- {
875
- Target_Sentence=s;
876
- break;
877
- }
878
- }
879
- }
880
- int Sentence_Start = Sentence_offsets.get(Target_Sentence);
881
- int Sentence_Last = 1000000;
882
- if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
883
- if(SPfound == false) // 1. left : Closed to start of the gene mention
884
- {
885
- int closet_Sp_Start=0;
886
- for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
887
- {
888
- String AnnoSp[]=Annotations_Species.get(sp).split("\t");
889
- int Sp_Start = Integer.parseInt(AnnoSp[0]);
890
- String patt="^\\**([0-9]+)$";
891
- Pattern ptmp = Pattern.compile(patt);
892
- Matcher mtmp = ptmp.matcher(AnnoSp[4]);
893
- if(mtmp.find())
894
- {
895
- String taxid = mtmp.group(1);
896
- if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
897
- {
898
- closet_Sp_Start=Sp_Start;
899
- if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
900
- {
901
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606");
902
- }
903
- else
904
- {
905
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid);
906
- }
907
- SPfound=true;
908
- }
909
- }
910
- }
911
- }
912
- if(SPfound == false) // 2. right : Closed to last of the gene mention
913
- {
914
- int closet_Sp_Last=1000000;
915
- for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
916
- {
917
- String AnnoSp[]=Annotations_Species.get(sp).split("\t");
918
- int Sp_Last = Integer.parseInt(AnnoSp[1]);
919
- String patt="^\\**([0-9]+)$";
920
- Pattern ptmp = Pattern.compile(patt);
921
- Matcher mtmp = ptmp.matcher(AnnoSp[4]);
922
- if(mtmp.find())
923
- {
924
- String taxid = mtmp.group(1);
925
- if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
926
- {
927
- closet_Sp_Last=Sp_Last;
928
- if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
929
- {
930
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606");
931
- }
932
- else
933
- {
934
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid);
935
- }
936
- SPfound=true;
937
- }
938
- }
939
- }
940
- }
941
-
942
- /** 3. Focus species */
943
- if(SPfound == false) // 2. right : Closed to last of the gene mention
944
- {
945
- // 1. only the mentions appeared earlier are inferred
946
- //
947
- if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
948
- {
949
- int closed_loca=0;
950
- for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet())
951
- {
952
- if(loca_start<G_Start)
953
- {
954
- if(loca_start>closed_loca)
955
- {
956
- closed_loca=loca_start;
957
- }
958
- }
959
- }
960
- if(closed_loca>0)
961
- {
962
- if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca)))
963
- {
964
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606");
965
- }
966
- else
967
- {
968
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca));
969
- }
970
- }
971
- else
972
- {
973
- if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
974
- {
975
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
976
- }
977
- else
978
- {
979
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
980
- }
981
- }
982
- }
983
- else
984
- {
985
- if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
986
- {
987
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
988
- }
989
- else
990
- {
991
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
992
- }
993
- }
994
- }
995
- }
996
- }
997
- }
998
- }
999
- GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
1000
- }
1001
- public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException
1002
- {
1003
- for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
1004
- {
1005
- for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
1006
- {
1007
- for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
1008
- {
1009
- String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
1010
- if(anno.length==5) //Species
1011
- {
1012
- String id=anno[4].replaceAll("\\*", "");
1013
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id);
1014
- }
1015
- else //Gene : if(anno.length==3)
1016
- {
1017
- /** 1. prefix */
1018
- boolean SPfound = false;
1019
- if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase()))
1020
- {
1021
- //special case, and no need for prefix - SA
1022
- }
1023
- else
1024
- {
1025
- Pattern ptmp = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$");
1026
- Matcher mtmp = ptmp.matcher(anno[2]);
1027
- if(mtmp.find())
1028
- {
1029
- String MentionWoPrefix=mtmp.group(2);
1030
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies);
1031
- SPfound=true;
1032
- }
1033
- }
1034
- if(SPfound == false)
1035
- {
1036
- GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)+"\tFocus:"+FocusSpecies);
1037
- }
1038
- }
1039
- }
1040
- }
1041
- }
1042
- GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
1043
- }
1044
  }
 
1
+ /**
2
+ * Project: GNormPlus
3
+ * Function: Species recognition and Species assignment
4
+ */
5
+
6
+ package GNormPluslib;
7
+
8
+ import bioc.BioCAnnotation;
9
+ import bioc.BioCCollection;
10
+ import bioc.BioCDocument;
11
+ import bioc.BioCLocation;
12
+ import bioc.BioCPassage;
13
+
14
+ import bioc.io.BioCDocumentWriter;
15
+ import bioc.io.BioCFactory;
16
+ import bioc.io.woodstox.ConnectorWoodstox;
17
+ import java.io.BufferedReader;
18
+ import java.io.BufferedWriter;
19
+ import java.io.FileInputStream;
20
+ import java.io.FileOutputStream;
21
+ import java.io.FileReader;
22
+ import java.io.FileWriter;
23
+ import java.io.IOException;
24
+ import java.io.InputStreamReader;
25
+ import java.io.OutputStreamWriter;
26
+ import java.text.BreakIterator;
27
+ import java.time.LocalDate;
28
+ import java.time.ZoneId;
29
+
30
+ import javax.xml.stream.XMLStreamException;
31
+
32
+ import org.tartarus.snowball.SnowballStemmer;
33
+ import org.tartarus.snowball.ext.englishStemmer;
34
+
35
+ import java.util.Map;
36
+ import java.util.regex.Matcher;
37
+ import java.util.regex.Pattern;
38
+ import java.util.ArrayList;
39
+ import java.util.HashMap;
40
+ import java.util.List;
41
+ import java.util.Locale;
42
+ import java.util.Collections;
43
+
44
+ public class SR
45
+ {
46
+ @SuppressWarnings("null")
47
+ public void SpeciesRecognition(String Filename,String FilenameBioC,String StrainFilename,String FilterAntibody) throws IOException, XMLStreamException
48
+ {
49
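+		/*
+		 * Pipeline overview (a summary of the steps below): recognize species
+		 * mentions first, then cell lines, genus names, strains, and finally
+		 * abbreviations and space-free variants of already-recognized species;
+		 * ambiguous taxonomy IDs are resolved at the end before BioC output.
+		 */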
+ /** Recognizing Species Names: SP */
50
+ for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */
51
+ {
52
+ String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
53
+ PrefixTree PT_Genus = new PrefixTree();
54
+ HashMap<String, String> SPID_hash = new HashMap<String, String>();
55
+ ArrayList<String> TargetedLocation = new ArrayList<String>();
56
+ HashMap<String, String> GenusNames = new HashMap<String, String>();
57
+ HashMap<String, String> Mention2ID_lc = new HashMap<String, String>();
58
+ ArrayList<String> IDset = new ArrayList<String>();
59
+ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
60
+ {
61
+ String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
62
+
63
+ /** Species recognition */
64
+ ArrayList<String> locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species"); /** PT_Species */
65
+ for (int k = 0 ; k < locations.size() ; k++)
66
+ {
67
+ String anno[]=locations.get(k).split("\t");
68
+ int start= Integer.parseInt(anno[0]);
69
+ int last= Integer.parseInt(anno[1]);
70
+
71
+ // For anti-serum filtering
72
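+					// Examine up to 21 characters of context on each side of the mention;
+					// the "Z" padding below keeps substring() in range for short passages.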
+ String ForwardSTR="";
73
+ String BackwardSTR="";
74
+ if(start>21)
75
+ {
76
+ ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
77
+ }
78
+ else
79
+ {
80
+ ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
81
+ }
82
+ if(PassageContext.length()>last+21)
83
+ {
84
+ BackwardSTR = PassageContext.substring(start,last+21);
85
+ }
86
+ else
87
+ {
88
+ BackwardSTR = PassageContext.substring(start,PassageContext.length());
89
+ }
90
+
91
+ String mention = anno[2];
92
+ String id = anno[3];
93
+ String mention_tmp=mention.toLowerCase();
94
+ mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
95
+ String antibody="";
96
+ if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";}//filtering : antibody
97
+ else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
98
+ else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
99
+
100
+ if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") )
101
+ {
102
+ last=last+1;
103
+ mention=mention+")";
104
+ }
105
+
106
+ if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){} // filtered: Bee1p
107
+ else if((mention.matches(".*[;:,].*")) && mention.length()<=10){} // filtered : x, XXX
108
+ else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){} // to 7
109
+ else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){} // s). Major
110
+ else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){} // s). Major
111
+ else if(!id.equals("NA"))
112
+ {
113
+ if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
114
+ {
115
+ if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3)) // invalid species: "a group/a GAL4/a strain"
116
+ {
117
+ if(FilterAntibody.equals("False") || (!antibody.equals("(anti)")))
118
+ {
119
+ String patt="^(.+?) [sS]train";
120
+ Pattern ptmp = Pattern.compile(patt);
121
+ Matcher mtmp = ptmp.matcher(mention);
122
+ if(mtmp.find())
123
+ {
124
+ mention=mtmp.group(1);
125
+ last=last-7;
126
+ }
127
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); //+antibody
128
+ String mentions_tmp=mention.toLowerCase();
129
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
130
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
131
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
132
+ Mention2ID_lc.put(mention.toLowerCase(), id); //+antibody
133
+
134
+ String mention_genus = "";
135
+ patt="^([A-Za-z]+) ";
136
+ ptmp = Pattern.compile(patt);
137
+ mtmp = ptmp.matcher(mention);
138
+ if(mtmp.find())
139
+ {
140
+ mention_genus=mtmp.group(1); // get genus
141
+ }
142
+
143
+ IDset.add(id);
144
+ for(int s=start;s<last;s++)
145
+ {
146
+ TargetedLocation.add(j+"\t"+s);
147
+ }
148
+ String ids[]=id.split(";");
149
+ for(int x=0;x<ids.length;x++)
150
+ {
151
+ patt="^\\**([0-9]+)";
152
+ ptmp = Pattern.compile(patt);
153
+ mtmp = ptmp.matcher(ids[x]);
154
+ if(mtmp.find())
155
+ {
156
+ SPID_hash.put(mtmp.group(1), mention_genus);
157
+ }
158
+ }
159
+ }
160
+ }
161
+ }
162
+ }
163
+ }
164
+
165
+ /** Cell Line recognition */
166
+ locations = GNormPlus.PT_Cell.SearchMentionLocation(PassageContext,"Cell"); /** PT_Cell */
167
+ for (int k = 0 ; k < locations.size() ; k++)
168
+ {
169
+ String anno[]=locations.get(k).split("\t");
170
+ int start= Integer.parseInt(anno[0]);
171
+ int last= Integer.parseInt(anno[1]);
172
+ String mention = anno[2];
173
+ String id = anno[3];
174
+ if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
175
+ {
176
+ if(!TargetedLocation.contains(j+"\t"+start)) //already exists
177
+ {
178
+ int last40=0;
179
+ if(PassageContext.length()>=last+40)
180
+ {
181
+ last40=last+40;
182
+ }
183
+ else
184
+ {
185
+ last40=PassageContext.length();
186
+ }
187
+
188
+ // For anti-serum filtering
189
+ String ForwardSTR="";
190
+ String BackwardSTR="";
191
+ if(start>21)
192
+ {
193
+ ForwardSTR = PassageContext.substring(start-21,last);
194
+ }
195
+ else
196
+ {
197
+ ForwardSTR = PassageContext.substring(0,last);
198
+ }
199
+ if(PassageContext.length()>last+21)
200
+ {
201
+ BackwardSTR = PassageContext.substring(start,last+21);
202
+ }
203
+ else
204
+ {
205
+ BackwardSTR = PassageContext.substring(start,PassageContext.length());
206
+ }
207
+ String mention_tmp=mention.toLowerCase();
208
+ mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
209
+ if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){}
210
+ else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){} // filtered: Bee1p
211
+ else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){} // filtered: IL-22RA1
212
+ else
213
+ {
214
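+						// Accept a cell-line name only when "cell"/"cell line(s)" occurs
+						// within the 40 characters following the mention.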
+ String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]";
215
+ Pattern ptmp = Pattern.compile(patt);
216
+ Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase());
217
+ if(mtmp.find())
218
+ {
219
+ if(GNormPlus.taxid4gene.contains(id)) // for gene
220
+ {
221
+ id="*"+id;
222
+ }
223
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id);
224
+ String mentions_tmp=mention.toLowerCase();
225
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
226
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
227
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
228
+ IDset.add(id);
229
+ for(int s=start;s<last;s++)
230
+ {
231
+ TargetedLocation.add(j+"\t"+s);
232
+ }
233
+ }
234
+ }
235
+ }
236
+ }
237
+ }
238
+
239
+ /** Genus names*/
240
+ for(String ID: SPID_hash.keySet())
241
+ {
242
+ if(GNormPlus.GenusID_hash.containsKey(ID))
243
+ {
244
+ GenusNames.put(ID,GNormPlus.GenusID_hash.get(ID));
245
+ }
246
+ if(SPID_hash.get(ID).length()>=7)
247
+ {
248
+ GenusNames.put(ID,SPID_hash.get(ID));
249
+ }
250
+ }
251
+ }
252
+
253
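+		// Always search for these common model-organism genus names, even when the
+		// corresponding species was not detected above.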
+ GenusNames.put("3702", "arabidopsis");
254
+ GenusNames.put("4932", "saccharomyces");
255
+ GenusNames.put("562", "escherichia");
256
+ GenusNames.put("7227", "drosophila");
257
+ GenusNames.put("8355", "xenopus");
258
+
259
+ PT_Genus.Hash2Tree(GenusNames);
260
+
261
+ /** Genus recognition */
262
+ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
263
+ {
264
+ if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
265
+ GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
266
+ GNormPlus.BioCDocobj.Annotations.size()>i &&
267
+ GNormPlus.BioCDocobj.Annotations.get(i).size()>j
268
+ )
269
+ {
270
+ String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
271
+ ArrayList<String> locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus"); /** PT_Genus*/
272
+ for (int k = 0 ; k < locations_Genus.size() ; k++)
273
+ {
274
+ String anno[]=locations_Genus.get(k).split("\t");
275
+ String start= anno[0];
276
+ String last= anno[1];
277
+ String mention = anno[2];
278
+ String id = anno[3];
279
+ if(!TargetedLocation.contains(j+"\t"+start)) //already exists
280
+ {
281
+ String patt="^\\**([0-9]+)$";
282
+ Pattern ptmp = Pattern.compile(patt);
283
+ Matcher mtmp = ptmp.matcher(id);
284
+ if(mtmp.find())
285
+ {
286
+ id = mtmp.group(1);
287
+ }
288
+
289
+ if(GNormPlus.taxid4gene.contains(id)) // for gene
290
+ {
291
+ id="*"+id;
292
+ }
293
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id);
294
+ String mentions_tmp=mention.toLowerCase();
295
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
296
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
297
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
298
+ IDset.add(id);
299
+ for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
300
+ {
301
+ TargetedLocation.add(j+"\t"+s);
302
+ }
303
+ }
304
+ }
305
+ }
306
+ }
307
+
308
+ /** Strain Tree */
309
+ PrefixTree PT_Strain = new PrefixTree();
310
+ HashMap<String, String> StrainID_hash = new HashMap<String, String>();
311
+ BufferedReader br = new BufferedReader(new FileReader(StrainFilename));
312
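+			// StrainFilename is tab-separated: ancestor tax id, strain tax id, strain names.
+			// Only strains whose ancestor (or own tax id) matches a recognized species are kept.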
+ String line="";
313
+ while ((line = br.readLine()) != null)
314
+ {
315
+ String l[]=line.split("\t");
316
+ String ancestor = l[0];
317
+ String tax_id = l[1];
318
+ String tax_names = l[2];
319
+ if(SPID_hash.containsKey(ancestor))
320
+ {
321
+ StrainID_hash.put(tax_id, tax_names); // tax id -> strain
322
+ }
323
+ else if(SPID_hash.containsKey(tax_id))
324
+ {
325
+ StrainID_hash.put(tax_id, tax_names); // tax id -> strain
326
+ }
327
+ }
328
+ br.close();
329
+ HashMap<String, String> StrainNames = new HashMap<String, String>();
330
+ for(String ID: StrainID_hash.keySet())
331
+ {
332
+ StrainNames.put(ID,StrainID_hash.get(ID));
333
+ }
334
+
335
+ PT_Strain.Hash2Tree(StrainNames);
336
+
337
+ /** Strain recognition */
338
+ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
339
+ {
340
+ if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
341
+ GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
342
+ GNormPlus.BioCDocobj.Annotations.size()>i &&
343
+ GNormPlus.BioCDocobj.Annotations.get(i).size()>j
344
+ )
345
+ {
346
+ String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
347
+ ArrayList<String> locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain"); /** PT_Strain*/
348
+ for (int k = 0 ; k < locations_Strain.size() ; k++)
349
+ {
350
+ String anno[]=locations_Strain.get(k).split("\t");
351
+ String start= anno[0];
352
+ String last= anno[1];
353
+ String mention = anno[2];
354
+ String id = anno[3];
355
+ if(!TargetedLocation.contains(j+"\t"+start)) //already exists
356
+ {
357
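+						// Skip likely false positives: strain names containing punctuation,
+						// or a short lowercase word plus a number (e.g. "type 2").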
+ if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}"))
358
+ {
359
+ if(GNormPlus.taxid4gene.contains(id)) // for gene
360
+ {
361
+ id="*"+id;
362
+ }
363
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id);
364
+ String mentions_tmp=mention.toLowerCase();
365
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
366
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
367
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
368
+ IDset.add(id);
369
+ for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
370
+ {
371
+ TargetedLocation.add(j+"\t"+s);
372
+ }
373
+ }
374
+ }
375
+ }
376
+ }
377
+ }
378
+
379
+ HashMap<String, String> OtherNames = new HashMap<String, String>();
380
+ for(String men : Mention2ID_lc.keySet())
381
+ {
382
+ String men_id= Mention2ID_lc.get(men);
383
+ if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(Pmid+"\t"+men))
384
+ {
385
+ String Abb = GNormPlus.PmidLF2Abb_lc_hash.get(Pmid+"\t"+men);
386
+ // Abbreviation
387
+ if(OtherNames.containsKey(men_id))
388
+ {
389
+ OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb);
390
+ }
391
+ else
392
+ {
393
+ OtherNames.put(men_id,Abb);
394
+ }
395
+ }
396
+ String men_nospace=men.replaceAll(" ", "");
397
+ // no space
398
+ if(OtherNames.containsKey(men_id))
399
+ {
400
+ OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace);
401
+ }
402
+ else
403
+ {
404
+ OtherNames.put(men_id,men_nospace);
405
+ }
406
+ }
407
+ PrefixTree PT_Others = new PrefixTree();
408
+ PT_Others.Hash2Tree(OtherNames);
409
+
410
+ /**
411
+ *
412
+ * Others:
413
+ * 1) Abbreviation
414
+ * 2) no space
415
+ *
416
+ * */
417
+ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
418
+ {
419
+ if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
420
+ GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
421
+ GNormPlus.BioCDocobj.Annotations.size()>i &&
422
+ GNormPlus.BioCDocobj.Annotations.get(i).size()>j
423
+ )
424
+ {
425
+ String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
426
+ ArrayList<String> locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species"); /** PT_Abb*/
427
+ for (int k = 0 ; k < locations_Abb.size() ; k++)
428
+ {
429
+ String anno[]=locations_Abb.get(k).split("\t");
430
+ String start= anno[0];
431
+ String last= anno[1];
432
+ String mention = anno[2];
433
+ String id = anno[3];
434
+ if(!TargetedLocation.contains(j+"\t"+start)) //already exists
435
+ {
436
+ if(GNormPlus.taxid4gene.contains(id)) // for gene
437
+ {
438
+ id="*"+id;
439
+ }
440
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
441
+ String mentions_tmp=mention.toLowerCase();
442
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
443
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
444
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
445
+ Mention2ID_lc.put(mention.toLowerCase(), id);
446
+ IDset.add(id);
447
+ for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
448
+ {
449
+ TargetedLocation.add(j+"\t"+s);
450
+ }
451
+ }
452
+ }
453
+ }
454
+ }
455
+
456
+ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */
457
+ {
458
+ if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
459
+ {
460
+ ArrayList <Integer> remove_anno = new ArrayList <Integer>();
461
+ for (int a = 0; a < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); a++) /** Annotations : a */
462
+ {
463
+ String SpAnno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a).split("\t");
464
+ String start= SpAnno[0];
465
+ String last= SpAnno[1];
466
+ String mention = SpAnno[2];
467
+ String type = SpAnno[3];
468
+
469
+ if(type.matches("Gene|FamilyName"))
470
+ {
471
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\t"+type);
472
+ }
473
+ else if(type.matches("Species|Genus|Strain|Cell") && SpAnno.length==5)
474
+ {
475
+ //System.out.println(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a));
476
+ /** Abbreviation solution */
477
+							if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase()))) // look up the abbreviation's long form and require it to have a known ID
478
+ {
479
+ String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase());
480
+ if(Mention2ID_lc.containsKey(LF_lc))
481
+ {
482
+ String LF_ID=Mention2ID_lc.get(LF_lc);
483
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID);
484
+ String mentions_tmp=mention.toLowerCase();
485
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
486
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
487
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
488
+ }
489
+ }
490
+ else if (SpAnno.length>4)
491
+ {
492
+ String id = SpAnno[4];
493
+ String id_split[]=id.split(";");
494
+ if(id_split.length>=2)
495
+ {
496
+ /** Smallest set of tax ids */
497
+ boolean found=false;
498
+ for(int x=0;x<IDset.size();x++)
499
+ {
500
+ String id_tmp= IDset.get(x);
501
+ for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
502
+ {
503
+ if(id_split[y].equals(id_tmp))
504
+ {
505
+ found=true;
506
+ }
507
+ }
508
+ if(found == true)
509
+ {
510
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id_tmp);
511
+ String mentions_tmp=mention.toLowerCase();
512
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
513
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
514
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
515
+ x=1000000;
516
+ }
517
+ }
518
+
519
+ /** smallest tax id number */
520
+ if(found == false)
521
+ {
522
+ int min=10000000;
523
+ String min_id="";
524
+ for(int y=0;y<id_split.length;y++) // if any other id is a component of the target id
525
+ {
526
+ String id_tmp = id_split[y];
527
+ String patt="^\\**([0-9]+)";
528
+ Pattern ptmp = Pattern.compile(patt);
529
+ Matcher mtmp = ptmp.matcher(id_tmp);
530
+ if(mtmp.find())
531
+ {
532
+ id_tmp = mtmp.group(1);
533
+ }
534
+
535
+ if(y==0)
536
+ {
537
+ min_id=id_split[y];
538
+ min=Integer.parseInt(id_tmp);
539
+ }
540
+ else if(Integer.parseInt(id_tmp)<min)
541
+ {
542
+ min=Integer.parseInt(id_tmp);
543
+ min_id=id_tmp;
544
+ }
545
+ }
546
+ if(GNormPlus.taxid4gene.contains(min_id)) // for gene
547
+ {
548
+ min_id="*"+min_id;
549
+ }
550
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\tSpecies\t"+min_id);
551
+ String mentions_tmp=mention.toLowerCase();
552
+ mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
553
+ mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
554
+ GNormPlus.Filtering_hash.put(mentions_tmp,"");
555
+ }
556
+ }
557
+ }
558
+ }
559
+ else //disease, and other concepts
560
+ {
561
+ remove_anno.add(a);
562
+ }
563
+ }
564
+
565
+ Collections.sort(remove_anno);
566
+ for (int counter = remove_anno.size()-1; counter >= 0 ; counter--)
567
+ {
568
+ int ai=remove_anno.get(counter);
569
+ //System.out.println("\n"+ai+"\t"+GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(ai));
570
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(ai);
571
+ }
572
+ }
573
+ }
574
+ }
575
+ GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); //save in BioC file
576
+ }
577
+ public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOException, XMLStreamException
578
+ {
579
+ GNormPlus.BioCDocobj.Annotations = new ArrayList();
580
+ GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename);
581
+
582
+ BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
583
+ for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
584
+ {
585
+ HashMap<String, String> PrefixIDTarget_hash = new HashMap<String, String>();
586
+ PrefixIDTarget_hash.put("9606", "h");
587
+ PrefixIDTarget_hash.put("10090", "m");
588
+ PrefixIDTarget_hash.put("10116", "r");
589
+ PrefixIDTarget_hash.put("4932", "y");
590
+ PrefixIDTarget_hash.put("7227", "d");
591
+ PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr");
592
+ PrefixIDTarget_hash.put("3702", "at|At");
593
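+			// Species-prefix heuristic: a gene mention such as "mTor" matches the mouse
+			// prefix "m" followed by a capital letter, so it suggests taxid 10090.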
+
594
+ HashMap<String, Double> SP2Num_hash = new HashMap<String, Double>();
595
+ for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
596
+ {
597
+ for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
598
+ {
599
+ String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
600
+ if(anno.length==5) //Species
601
+ {
602
+ String patt="^\\**([0-9]+)$";
603
+ Pattern ptmp = Pattern.compile(patt);
604
+ Matcher mtmp = ptmp.matcher(anno[4]);
605
+ if(mtmp.find())
606
+ {
607
+ String id = mtmp.group(1);
608
+
609
+ if(!PrefixIDTarget_hash.containsKey(id))
610
+ {
611
+ PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id)); // taxid -> prefix
612
+ }
613
+						if(j == 0) // title: species mentioned in the title get double weight
614
+ {
615
+ if(SP2Num_hash.containsKey(id))
616
+ {
617
+ SP2Num_hash.put(id, SP2Num_hash.get(id)+2);
618
+ }
619
+ else
620
+ {
621
+ if(GNormPlus.TaxFreq_hash.containsKey(id))
622
+ {
623
+ SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2);
624
+ }
625
+ else
626
+ {
627
+ SP2Num_hash.put(id, 2.0);
628
+ }
629
+ }
630
+ // Virus -> Human (not to double weight human to virus)
631
+ /*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
632
+ {
633
+ if(SP2Num_hash.containsKey("9606"))
634
+ {
635
+ SP2Num_hash.put("9606", SP2Num_hash.get("9606")+2);
636
+ }
637
+ else
638
+ {
639
+ SP2Num_hash.put("9606", 2 + GNormPlus.TaxFreq_hash.get("9606")+1);
640
+ }
641
+ }*/
642
+ }
643
+ else
644
+ {
645
+ if(SP2Num_hash.containsKey(id))
646
+ {
647
+ SP2Num_hash.put(id, SP2Num_hash.get(id)+1);
648
+ }
649
+ else
650
+ {
651
+ if(GNormPlus.TaxFreq_hash.containsKey(id))
652
+ {
653
+ SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id));
654
+ }
655
+ else
656
+ {
657
+ SP2Num_hash.put(id, 1.0);
658
+ }
659
+ }
660
+ // Virus -> Human
661
+ /*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
662
+ {
663
+ if(SP2Num_hash.containsKey("9606"))
664
+ {
665
+ SP2Num_hash.put("9606", SP2Num_hash.get("9606")+1);
666
+ }
667
+ else
668
+ {
669
+ SP2Num_hash.put("9606", GNormPlus.TaxFreq_hash.get("9606")+1);
670
+ }
671
+ }*/
672
+ }
673
+ }
674
+ }
675
+ }
676
+ }
677
+ String MajorSP="9606";
678
+ double MaxSP=0;
679
+ for(String tid : SP2Num_hash.keySet())
680
+ {
681
+ if(SP2Num_hash.get(tid)>MaxSP)
682
+ {
683
+ MajorSP=tid;
684
+ MaxSP=SP2Num_hash.get(tid);
685
+ }
686
+ }
687
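+			// MajorSP defaults to human (9606) and is replaced only by a species with a
+			// higher weighted count; it serves as the focus-species fallback below.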
+
688
+ for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
689
+ {
690
+ String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context
691
+ //int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset
692
+ iterator.setText(PassageContext);
693
+ ArrayList<Integer> Sentence_offsets = new ArrayList<Integer>();
694
+ int Sent_start = iterator.first();
695
+ for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next())
696
+ {
697
+ Sentence_offsets.add(Sent_start);
698
+ }
699
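+				// Sentence_offsets now holds the start offset of every sentence in the
+				// passage; it bounds the left/right species searches to a single sentence.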
+
700
+ HashMap<Integer,String> Annotations_Gene_hash = new HashMap<Integer,String>();
701
+ ArrayList<String> Annotations_Species = new ArrayList<String>();
702
+ if(GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
703
+ {
704
+ for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
705
+ {
706
+ String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
707
+ if(anno.length==5) //Species
708
+ {
709
+ Annotations_Species.add(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k));
710
+ }
711
+ else //Gene : if(anno.length==3)
712
+ {
713
+ //String mention = PassageContext.substring(Integer.parseInt(anno[0]), Integer.parseInt(anno[1]));
714
+ Annotations_Gene_hash.put(k,GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)); // k -> Gene Annotation
715
+ }
716
+ }
717
+
718
+ //Gene --> Species Inference (PMID:28777492)
719
+ HashMap<String,HashMap<Integer,String>> mention2Location2Species_hash = new HashMap<String,HashMap<Integer,String>>();
720
+ HashMap<Integer,String> Location2Species_hash = new HashMap<Integer,String>();
721
+ for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
722
+ {
723
+ boolean SPfound = false;
724
+ String anno[] = Annotations_Gene_hash.get(k).split("\t");
725
+ int G_Start= Integer.parseInt(anno[0]);
726
+ int G_Last= Integer.parseInt(anno[1]);
727
+ String G_mentions = anno[2];
728
+ /**
729
+ * 2. Co-occurring word
730
+ * boundary :
731
+ * Sentence Start: Sentence_offsets.get(Target_Sentence)
732
+ * Sentence Last: Sentence_offsets.get(Target_Sentence+1)
733
+ */
734
+ //Find the target sentence
735
+ int Target_Sentence=0;
736
+								if(SPfound == false) // locate the sentence containing the gene mention
737
+ {
738
+ for(int s=0;s<Sentence_offsets.size();s++)
739
+ {
741
+ int Sentence_last=1000000;
742
+ if(s<Sentence_offsets.size()-1)
743
+ {
744
+ Sentence_last=Sentence_offsets.get(s+1);
745
+ }
746
+ if(G_Start<Sentence_last)
747
+ {
748
+ Target_Sentence=s;
749
+ break;
750
+ }
751
+ }
752
+ }
753
+ int Sentence_Start = Sentence_offsets.get(Target_Sentence);
754
+ int Sentence_Last = 1000000;
755
+ if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
756
+								if(SPfound == false) // 1. left: closest species preceding the gene mention
757
+ {
758
+ int closet_Sp_Start=0;
759
+ for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
760
+ {
761
+ String AnnoSp[]=Annotations_Species.get(sp).split("\t");
762
+ int Sp_Start = Integer.parseInt(AnnoSp[0]);
763
+ String patt="^\\**([0-9]+)$";
764
+ Pattern ptmp = Pattern.compile(patt);
765
+ Matcher mtmp = ptmp.matcher(AnnoSp[4]);
766
+ if(mtmp.find())
767
+ {
768
+ String taxid = mtmp.group(1);
769
+ Location2Species_hash.put(Sp_Start,taxid);
770
+ if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
771
+ {
772
+ closet_Sp_Start=Sp_Start;
773
+ Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
774
+
775
+ if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
776
+ {
777
+ mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
778
+ }
779
+ else
780
+ {
781
+ mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
782
+ }
783
+
784
+ SPfound=true;
785
+ }
786
+ }
787
+ }
788
+ }
789
+								if(SPfound == false) // 2. right: closest species following the gene mention
790
+ {
791
+ int closet_Sp_Last=1000000;
792
+ for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
793
+ {
794
+ String AnnoSp[]=Annotations_Species.get(sp).split("\t");
795
+ int Sp_Last = Integer.parseInt(AnnoSp[1]);
796
+ String patt="^\\**([0-9]+)$";
797
+ Pattern ptmp = Pattern.compile(patt);
798
+ Matcher mtmp = ptmp.matcher(AnnoSp[4]);
799
+ if(mtmp.find())
800
+ {
801
+ String taxid = mtmp.group(1);
802
+ if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
803
+ {
804
+ closet_Sp_Last=Sp_Last;
805
+ Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
806
+
807
+ if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
808
+ {
809
+ mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
810
+ }
811
+ else
812
+ {
813
+ mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
814
+ }
815
+
816
+ SPfound=true;
817
+ }
818
+ }
819
+ }
820
+ }
821
+ }
822
+
823
+ for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j)
824
+ {
825
+ String anno[] = Annotations_Gene_hash.get(k).split("\t");
826
+ int G_Start= Integer.parseInt(anno[0]);
827
+ int G_Last= Integer.parseInt(anno[1]);
828
+ String G_mentions = anno[2];
829
+ String G_type = anno[3];
830
+ String G_mention_list[]=G_mentions.split("\\|");
831
+								String G_mention=G_mention_list[0]; // use only the first term to detect the species; should be updated after SimConcept
832
+
833
+ /** 1. prefix */
834
+ boolean SPfound = false;
835
+ for(String taxid: PrefixIDTarget_hash.keySet())
836
+ {
837
+ if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase()))
838
+ {
839
+ //special case, and no need for prefix - SA
840
+ }
841
+ else
842
+ {
843
+ Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$");
844
+ Matcher mtmp = ptmp.matcher(G_mention);
845
+ if(mtmp.find())
846
+ {
847
+ String MentionWoPrefix=mtmp.group(2);
848
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid);
849
+ SPfound=true;
850
+ break;
851
+ }
852
+ }
853
+ }
854
+
855
+ /**
856
+ * 2. Co-occurring word
857
+ * boundary :
858
+ * Sentence Start: Sentence_offsets.get(Target_Sentence)
859
+ * Sentence Last: Sentence_offsets.get(Target_Sentence+1)
860
+ */
861
+ //Find the target sentence
862
+ int Target_Sentence=0;
863
+								if(SPfound == false) // locate the sentence containing the gene mention
864
+ {
865
+ for(int s=0;s<Sentence_offsets.size();s++)
866
+ {
868
+ int Sentence_last=1000000;
869
+ if(s<Sentence_offsets.size()-1)
870
+ {
871
+ Sentence_last=Sentence_offsets.get(s+1);
872
+ }
873
+ if(G_Start<Sentence_last)
874
+ {
875
+ Target_Sentence=s;
876
+ break;
877
+ }
878
+ }
879
+ }
880
+ int Sentence_Start = Sentence_offsets.get(Target_Sentence);
881
+ int Sentence_Last = 1000000;
882
+ if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
883
+								if(SPfound == false) // 1. left: closest species preceding the gene mention
884
+ {
885
+ int closet_Sp_Start=0;
886
+ for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
887
+ {
888
+ String AnnoSp[]=Annotations_Species.get(sp).split("\t");
889
+ int Sp_Start = Integer.parseInt(AnnoSp[0]);
890
+ String patt="^\\**([0-9]+)$";
891
+ Pattern ptmp = Pattern.compile(patt);
892
+ Matcher mtmp = ptmp.matcher(AnnoSp[4]);
893
+ if(mtmp.find())
894
+ {
895
+ String taxid = mtmp.group(1);
896
+ if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
897
+ {
898
+ closet_Sp_Start=Sp_Start;
899
+ if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
900
+ {
901
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606");
902
+ }
903
+ else
904
+ {
905
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid);
906
+ }
907
+ SPfound=true;
908
+ }
909
+ }
910
+ }
911
+ }
912
+								if(SPfound == false) // 2. right: closest species following the gene mention
913
+ {
914
+ int closet_Sp_Last=1000000;
915
+ for(int sp=0;sp<Annotations_Species.size();sp++) // Find the closet species
916
+ {
917
+ String AnnoSp[]=Annotations_Species.get(sp).split("\t");
918
+ int Sp_Last = Integer.parseInt(AnnoSp[1]);
919
+ String patt="^\\**([0-9]+)$";
920
+ Pattern ptmp = Pattern.compile(patt);
921
+ Matcher mtmp = ptmp.matcher(AnnoSp[4]);
922
+ if(mtmp.find())
923
+ {
924
+ String taxid = mtmp.group(1);
925
+ if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
926
+ {
927
+ closet_Sp_Last=Sp_Last;
928
+ if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
929
+ {
930
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606");
931
+ }
932
+ else
933
+ {
934
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid);
935
+ }
936
+ SPfound=true;
937
+ }
938
+ }
939
+ }
940
+ }
941
+
942
+ /** 3. Focus species */
943
+ if(SPfound == false) // 2. right : Closed to last of the gene mention
944
+ {
945
+ // 1. only the mentions appeared earlier are inferred
946
+ //
947
+ if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
948
+ {
949
+ int closed_loca=0;
950
+ for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet())
951
+ {
952
+ if(loca_start<G_Start)
953
+ {
954
+ if(loca_start>closed_loca)
955
+ {
956
+ closed_loca=loca_start;
957
+ }
958
+ }
959
+ }
960
+ if(closed_loca>0)
961
+ {
962
+ if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca)))
963
+ {
964
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606");
965
+ }
966
+ else
967
+ {
968
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca));
969
+ }
970
+ }
971
+ else
972
+ {
973
+ if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
974
+ {
975
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
976
+ }
977
+ else
978
+ {
979
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
980
+ }
981
+ }
982
+ }
983
+ else
984
+ {
985
+ if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
986
+ {
987
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
988
+ }
989
+ else
990
+ {
991
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
992
+ }
993
+ }
994
+ }
995
+ }
996
+ }
997
+ }
998
+ }
999
+ GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
1000
+ }
1001
+ public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException
1002
+ {
1003
+ for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
1004
+ {
1005
+ for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
1006
+ {
1007
+ for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
1008
+ {
1009
+ String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
1010
+ if(anno.length==5) //Species
1011
+ {
1012
+ String id=anno[4].replaceAll("\\*", "");
1013
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id);
1014
+ }
1015
+ else //Gene : if(anno.length==3)
1016
+ {
1017
+ /** 1. prefix */
1018
+ boolean SPfound = false;
1019
+ if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase()))
1020
+ {
1021
+ //special case, and no need for prefix - SA
1022
+ }
1023
+ else
1024
+ {
1025
+ Pattern ptmp = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$");
1026
+ Matcher mtmp = ptmp.matcher(anno[2]);
1027
+ if(mtmp.find())
1028
+ {
1029
+ String MentionWoPrefix=mtmp.group(2);
1030
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies);
1031
+ SPfound=true;
1032
+ }
1033
+ }
1034
+ if(SPfound == false)
1035
+ {
1036
+ GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)+"\tFocus:"+FocusSpecies);
1037
+ }
1038
+ }
1039
+ }
1040
+ }
1041
+ }
1042
+ GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
1043
+ }
1044
  }
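The block above resolves each gene's species by a fixed priority: the closest species mention to the left of the gene within the same sentence, then the closest to the right, then a document-level fallback (an earlier recorded location for the same mention text, else the document's major species). A minimal Python sketch of that priority scheme, under stated assumptions (species mentions given as (start, end, taxid) tuples; the earlier-mention fallback is folded into the major-species default; this is not the GNormPlus API):

    def assign_species(gene_start, gene_end, sent_start, sent_end,
                       species_mentions, major_sp):
        # species_mentions: list of (start, end, taxid) tuples for one document
        best = None
        # 1. Left: closest species mention starting before the gene, same sentence
        for start, end, taxid in species_mentions:
            if sent_start <= start <= gene_start and (best is None or start > best[0]):
                best = (start, taxid)
        if best:
            return 'Left', best[1]
        # 2. Right: closest species mention ending after the gene, same sentence
        best = None
        for start, end, taxid in species_mentions:
            if gene_end <= end <= sent_end and (best is None or end < best[0]):
                best = (end, taxid)
        if best:
            return 'Right', best[1]
        # 3. Focus: fall back to the document-level major species
        return 'Focus', major_sp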
src_Java/GNormPluslib/SimConcept.java CHANGED
The diff for this file is too large to render. See raw diff
 
src_python/GeneNER/BIO_format.py CHANGED
@@ -1,257 +1,257 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Sep 7 08:58:22 2022
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ # -*- coding: utf-8 -*-
9
+ """
10
+ Created on Fri Jun 24 11:27:57 2022
11
+
12
+ @author: luol2
13
+ """
14
+
15
+
16
+ import stanza
17
+ import sys
18
+ import os
19
+ import io
20
+ import json
21
+ import re
22
+ #sort entity by position in text
23
+ def pubtator_entitysort(infile):
24
+
25
+ fin=open(infile,'r',encoding='utf-8')
26
+ # fout=open(path+'LitCoin/sort/Train_sort.PubTator','w',encoding='utf-8')
27
+ fout=io.StringIO()
28
+ all_in=fin.read().strip().split('\n\n')
29
+ fin.close()
30
+ error_dict={} #use to debug error
31
+ for doc in all_in:
32
+ entity_dict={}
33
+ lines=doc.split('\n')
34
+ fout.write(lines[0]+'\n'+lines[1]+'\n')
35
+ for i in range(2,len(lines)):
36
+ segs=lines[i].split('\t')
37
+ if len(segs)>=5:
38
+ if lines[i] not in entity_dict.keys():
39
+ entity_dict[lines[i]]=int(segs[1])
40
+ else:
41
+ print('duplicate entity:',lines[i])
42
+ if segs[0] not in error_dict.keys():
43
+ error_dict[segs[0]]=[lines[i]]
44
+ else:
45
+ if lines[i] not in error_dict[segs[0]]:
46
+ error_dict[segs[0]].append(lines[i])
47
+
48
+ entity_sort=sorted(entity_dict.items(), key=lambda kv:(kv[1]), reverse=False)
49
+ for ele in entity_sort:
50
+ fout.write(ele[0]+'\n')
51
+ fout.write('\n')
52
+ return fout
53
+
54
+ def filter_overlap(infile): #nonest
55
+
56
+ fin=io.StringIO(infile.getvalue())
57
+ fout=io.StringIO()
58
+
59
+ documents=fin.read().strip().split('\n\n')
60
+ fin.close()
61
+ total_entity=0
62
+ over_entity=0
63
+ nest_entity=0
64
+ for doc in documents:
65
+ lines=doc.split('\n')
66
+ entity_list=[]
67
+ if len(lines)>2:
68
+ first_entity=lines[2].split('\t')
69
+ nest_list=[first_entity]
70
+ max_eid=int(first_entity[2])
71
+ total_entity+=len(lines)-2
72
+ for i in range(3,len(lines)):
73
+ segs=lines[i].split('\t')
74
+ if int(segs[1])> max_eid:
75
+ if len(nest_list)==1:
76
+ entity_list.append(nest_list[0])
77
+ nest_list=[]
78
+ nest_list.append(segs)
79
+ if int(segs[2])>max_eid:
80
+ max_eid=int(segs[2])
81
+ else:
82
+ # print(nest_list)
83
+ nest_entity+=len(nest_list)-1
84
+ tem=find_max_entity(nest_list)#find max entity
85
+ # if len(tem)>1:
86
+ # print('max nest >1:',tem)
87
+ entity_list.extend(tem)
88
+ nest_list=[]
89
+ nest_list.append(segs)
90
+ if int(segs[2])>max_eid:
91
+ max_eid=int(segs[2])
92
+
93
+ else:
94
+ nest_list.append(segs)
95
+ if int(segs[2])>max_eid:
96
+ max_eid=int(segs[2])
97
+ if nest_list!=[]:
98
+ if len(nest_list)==1:
99
+ entity_list.append(nest_list[0])
100
+
101
+ else:
102
+ tem=find_max_entity(nest_list)#find max entity
103
+ # if len(tem)>1:
104
+ # print('max nest >1:',tem)
105
+ entity_list.extend(tem)
106
+ fout.write(lines[0]+'\n'+lines[1]+'\n')
107
+ for ele in entity_list:
108
+ fout.write('\t'.join(ele)+'\n')
109
+ fout.write('\n')
110
+ # print(total_entity,over_entity, nest_entity)
111
+ return fout
112
+ def find_max_entity(nest_list): #longest entity
113
+ max_len=0
114
+ final_tem=[]
115
+ max_index=0
116
+ for i in range(0, len(nest_list)):
117
+ cur_len=int(nest_list[i][2])-int(nest_list[i][1])
118
+ if cur_len>max_len:
119
+ max_len=cur_len
120
+ max_index=i
121
+
122
+ final_tem.append(nest_list[max_index])
123
+ return final_tem
124
+
125
+ # change ori pubtator format to labeled text , entity begin with " ssss", end with 'eeee '
126
+ def pubtator_to_labeltext(infile):
127
+
128
+ fin=io.StringIO(infile.getvalue())
129
+ all_context=fin.read().strip().split('\n\n')
130
+ fin.close()
131
+ fout=io.StringIO()
132
+ label_dic={}
133
+
134
+ for doc in all_context:
135
+ lines=doc.split('\n')
136
+ ori_text=lines[0].split('|t|')[1]+' '+lines[1].split('|a|')[1]
137
+ pmid=lines[0].split('|t|')[0]
138
+ s_index=0
139
+ e_index=0
140
+ new_text=''
141
+ for i in range(2,len(lines)):
142
+ segs=lines[i].split('\t')
143
+ label_dic[segs[4].lower()]=segs[4]
144
+ if len(segs)==6:
145
+ e_index=int(segs[1])
146
+ new_text+=ori_text[s_index:e_index]+' ssss'+segs[4].lower()+' '+ori_text[int(segs[1]):int(segs[2])]+' eeee'+segs[4].lower()+' '
147
+ s_index=int(segs[2])
148
+ # if ori_text[int(segs[1]):int(segs[2])]!=segs[3]:
149
+ # print('error(ori,label):',ori_text[int(segs[1]):int(segs[2])],segs[3])
150
+
151
+ new_text+=ori_text[s_index:]
152
+ fout.write(pmid+'\t'+' '.join(new_text.strip().split())+'\n')
153
+ return fout, label_dic
154
+
155
+
156
+ def pre_token(sentence):
157
+ sentence=re.sub("([\=\/\(\)\<\>\+\-\_])"," \\1 ",sentence)
158
+ sentence=re.sub("[ ]+"," ",sentence);
159
+ return sentence
160
+
161
+ # labeltext to conll format (BIO), a token (including features) per line. sentences are split by '\n', or docs are split by '\n'
162
+ def labeltext_to_conll_fasttoken(infile,label_dic):
163
+
164
+ fin=io.StringIO(infile.getvalue())
165
+ all_context=fin.read().strip().split('\n')
166
+ fin.close()
167
+ fout=io.StringIO()
168
+
169
+ # nlp = stanza.Pipeline(lang='en', processors='tokenize',package='craft') #package='craft'
170
+ nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'},package='None') #package='craft'
171
+
172
+ doc_i=0
173
+ for doc in all_context:
174
+ doc_text=doc.split('\t')[1]
175
+ doc_text=pre_token(doc_text)
176
+ doc_stanza = nlp(doc_text)
177
+ doc_i+=1
178
+ #print(doc_i)
179
+ inentity_flag=0
180
+ last_label='O'
181
+ for sent in doc_stanza.sentences:
182
+ temp_sent=[]
183
+ word_num=0
184
+ for word in sent.words:
185
+ word_num+=1
186
+ # print(word.text)
187
+ if word.text.strip()=='':
188
+ continue
189
+ temp_sent.append(word.text)
190
+ if word.text.startswith('ssss')==True:
191
+ last_label=word.text
192
+ inentity_flag=1
193
+ elif word.text.startswith('eeee')==True:
194
+ last_label=word.text
195
+ inentity_flag=0
196
+ else:
197
+ if last_label=='O':
198
+ now_label='O'
199
+ elif last_label.startswith('ssss')==True:
200
+ now_label='B-'+label_dic[last_label[4:]]
201
+
202
+ elif last_label.startswith('B-')==True:
203
+ now_label='I-'+last_label[2:]
204
+ elif last_label.startswith('I-')==True:
205
+ now_label='I-'+last_label[2:]
206
+ elif last_label.startswith('eeee')==True:
207
+ now_label='O'
208
+
209
+ fout.write(word.text+'\t'+now_label+'\n')
210
+ last_label=now_label
211
+ if inentity_flag==1: # if an entity is split across sentences, the sentences will be concatenated
212
+ # print('sentence error!!!')
213
+ # print(word.text,word_num)
214
+ # print(temp_sent)
215
+ pass
216
+ else:
217
+ fout.write('\n')
218
+ return fout
219
+
220
+ def pubtator_to_conll(infile):
221
+
222
+ #1.entity sort
223
+ input_sort=pubtator_entitysort(infile)
224
+ #print(input_sort.getvalue())
225
+
226
+ #2. no overlap, if overlap get longest entity
227
+ input_nonest=filter_overlap(input_sort)
228
+ # print('......sort.....\n',input_sort.getvalue())
229
+
230
+ #3. pubtator to label text
231
+ input_labtext,label_dic=pubtator_to_labeltext(input_nonest)
232
+ # print('......label.....\n',input_labtext.getvalue())
233
+ #print(label_dic)
234
+
235
+ #4. label text to conll
236
+ output = labeltext_to_conll_fasttoken(input_labtext,label_dic)
237
+ # print('......output.....\n',output.getvalue())
238
+ # fout=open(outfile,'w',encoding='utf-8')
239
+ # fout.write(input_nonest.getvalue())
240
+ # fout.close()
241
+ return output
242
+
243
+ if __name__=='__main__':
244
+
245
+
246
+ infile='../../TrainingSet/No100/NER.Train.txt'
247
+ output=pubtator_to_conll(infile)
248
+ fout=open('../../TrainingSet/No100/NER.Train.conll','w',encoding='utf-8')
249
+ fout.write(output.getvalue())
250
+ fout.close()
251
+ output.close()
252
+
253
+
254
+
255
+
256
+
257
 
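For reference, pubtator_to_labeltext marks each surviving (longest, non-overlapping) entity by wrapping it in ssss<type>/eeee<type> tokens, which the tokenizer then treats as ordinary words and labeltext_to_conll_fasttoken converts to B-/I- tags. A small standalone illustration of that markup step (made-up sentence and offsets, mirroring the string construction above):

    text = "BRCA1 is a tumor suppressor."
    start, end, etype = 0, 5, "Gene"
    labeled = (text[:start] + " ssss" + etype.lower() + " "
               + text[start:end] + " eeee" + etype.lower() + " " + text[end:])
    print(labeled)
    # " ssssgene BRCA1 eeeegene  is a tumor suppressor."
    # The extra spaces are later collapsed by ' '.join(new_text.strip().split()).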
src_python/GeneNER/Evaluation_ner.py CHANGED
@@ -1,243 +1,243 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Mar 1 15:33:54 2021
4
+
5
+ @author: luol2
6
+ """
7
+ # from BIO format to entity
8
+ def BIO_tag(tokens):
9
+ gold_entity={}
10
+ pre_entity={}
11
+ gold_start,gold_end=0,0
12
+ pre_start,pre_end=0,0
13
+ for i in range(0,len(tokens)):
14
+ segs=tokens[i].split('\t')
15
+
16
+ # generate gold entity
17
+ if segs[1].startswith('B-')>0:
18
+ gold_start=i
19
+ gold_type=segs[1][2:]
20
+ if i+1>=len(tokens): # the last word
21
+ gold_end=i
22
+ if gold_type in gold_entity.keys():
23
+ gold_entity[gold_type].append([gold_start,gold_end])
24
+ else:
25
+ gold_entity[gold_type]=[[gold_start,gold_end]]
26
+ else: # non last word
27
+ next_seg=tokens[i+1].split('\t')
28
+ if next_seg[1].startswith('B-')>0 or next_seg[1]=='O':
29
+ gold_end=i
30
+ if gold_type in gold_entity.keys():
31
+ gold_entity[gold_type].append([gold_start,gold_end])
32
+ else:
33
+ gold_entity[gold_type]=[[gold_start,gold_end]]
34
+ elif next_seg[1].startswith('I-')>0:
35
+ pass
36
+ elif segs[1].startswith('I-')>0:
37
+ if i+1>=len(tokens): # the last word
38
+ gold_end=i
39
+ if gold_type in gold_entity.keys():
40
+ gold_entity[gold_type].append([gold_start,gold_end])
41
+ else:
42
+ gold_entity[gold_type]=[[gold_start,gold_end]]
43
+ else: # non last word
44
+ next_seg=tokens[i+1].split('\t')
45
+ if next_seg[1].startswith('B-')>0 or next_seg[1]=='O':
46
+ gold_end=i
47
+ if gold_type in gold_entity.keys():
48
+ gold_entity[gold_type].append([gold_start,gold_end])
49
+ else:
50
+ gold_entity[gold_type]=[[gold_start,gold_end]]
51
+ elif next_seg[1].startswith('I-')>0:
52
+ pass
53
+ elif segs[1]=='O':
54
+ pass
55
+
56
+ # generate prediction entity
57
+ if segs[2].startswith('B-')>0:
58
+ pre_start=i
59
+ pre_type=segs[2][2:]
60
+ if i+1>=len(tokens): # the last word
61
+ pre_end=i
62
+ if pre_type in pre_entity.keys():
63
+ pre_entity[pre_type].append([pre_start,pre_end])
64
+ else:
65
+ pre_entity[pre_type]=[[pre_start,pre_end]]
66
+ else: # non last word
67
+ next_seg=tokens[i+1].split('\t')
68
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
69
+ pre_end=i
70
+ if pre_type in pre_entity.keys():
71
+ pre_entity[pre_type].append([pre_start,pre_end])
72
+ else:
73
+ pre_entity[pre_type]=[[pre_start,pre_end]]
74
+ elif next_seg[2].startswith('I-')>0:
75
+ pass
76
+ elif segs[2].startswith('I-')>0:
77
+ if i==0 and i+1<len(tokens): # the first word and not only a word
78
+ pre_start=i
79
+ pre_type=segs[2][2:]
80
+ next_seg=tokens[i+1].split('\t')
81
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
82
+ pre_end=i
83
+ if pre_type in pre_entity.keys():
84
+ pre_entity[pre_type].append([pre_start,pre_end])
85
+ else:
86
+ pre_entity[pre_type]=[[pre_start,pre_end]]
87
+ elif next_seg[2].startswith('I-')>0:
88
+ pass
89
+ elif i==0 and i+1==len(tokens):# only one word:
90
+ pre_start=i
91
+ pre_type=segs[2][2:]
92
+ pre_end=i
93
+ if pre_type in pre_entity.keys():
94
+ pre_entity[pre_type].append([pre_start,pre_end])
95
+ else:
96
+ pre_entity[pre_type]=[[pre_start,pre_end]]
97
+ elif i+1>=len(tokens): # the last word
98
+ last_seg=tokens[i-1].split('\t')
99
+ if last_seg[2]=='O':
100
+ pre_start=i
101
+ pre_type=segs[2][2:]
102
+ pre_end=i
103
+ if pre_type in pre_entity.keys():
104
+ pre_entity[pre_type].append([pre_start,pre_end])
105
+ else:
106
+ pre_entity[pre_type]=[[pre_start,pre_end]]
107
+ elif i+1< len(tokens): # non last word
108
+ next_seg=tokens[i+1].split('\t')
109
+ last_seg=tokens[i-1].split('\t')
110
+ if last_seg[2]=='O':
111
+ pre_start=i
112
+ pre_type=segs[2][2:]
113
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
114
+ pre_end=i
115
+ if pre_type in pre_entity.keys():
116
+ pre_entity[pre_type].append([pre_start,pre_end])
117
+ else:
118
+ pre_entity[pre_type]=[[pre_start,pre_end]]
119
+ elif next_seg[2].startswith('I-')>0:
120
+ pass
121
+ elif segs[2]=='O':
122
+ pass
123
+ # print(tokens)
124
+ # print(gold_entity)
125
+ # print(pre_entity)
126
+ return gold_entity,pre_entity
127
+
128
+ # input: token \t Gold \t Prediction\n, sentence is split "\n"
129
+ def NER_Evaluation():
130
+ path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/Kfold/BiLSTM-CRF/'
131
+ fin=open(path+'dev_pre.conll_all','r',encoding='utf-8')
132
+ all_sentence=fin.read().strip().split('\n\n')
133
+ fin.close()
134
+ Metrics={} #{'entity_type':[TP,gold_num,pre_num]}
135
+
136
+ for sentence in all_sentence:
137
+ tokens=sentence.split('\n')
138
+ gold_entity,pre_entity=BIO_tag(tokens)
139
+ # print(tokens)
140
+ for entity_type in gold_entity.keys():
141
+ if entity_type not in Metrics.keys():
142
+ Metrics[entity_type]=[0,len(gold_entity[entity_type]),0]
143
+ else:
144
+ Metrics[entity_type][1]+=len(gold_entity[entity_type])
145
+ for entity_type in pre_entity.keys():
146
+ if entity_type not in Metrics.keys():
147
+ Metrics[entity_type]=[0,0,len(pre_entity[entity_type])]
148
+ else:
149
+ Metrics[entity_type][2]+=len(pre_entity[entity_type])
150
+ for mention in pre_entity[entity_type]:
151
+ if entity_type in gold_entity.keys():
152
+ if mention in gold_entity[entity_type]:
153
+ Metrics[entity_type][0]+=1
154
+ print(Metrics)
155
+ TP,Gold_num,Pre_num=0,0,0
156
+ for ele in Metrics.keys():
157
+ if Metrics[ele][2]==0:
158
+ p=0
159
+ else:
160
+ p=Metrics[ele][0]/Metrics[ele][2]
161
+ if Metrics[ele][1]==0:
162
+ r=0
163
+ else:
164
+ r=Metrics[ele][0]/Metrics[ele][1]
165
+ if p+r==0:
166
+ f1=0
167
+ else:
168
+ f1=2*p*r/(p+r)
169
+ TP+=Metrics[ele][0]
170
+ Gold_num+=Metrics[ele][1]
171
+ Pre_num+=Metrics[ele][2]
172
+ print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))
173
+ # break
174
+ if Pre_num==0:
175
+ P=0
176
+ else:
177
+ P=TP/Pre_num
178
+ R=TP/Gold_num
179
+ F1=2*P*R/(P+R)
180
+ print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
181
+
182
+ def NER_Evaluation_fn(file):
183
+
184
+ fin=open(file,'r',encoding='utf-8')
185
+ all_sentence=fin.read().strip().split('\n\n')
186
+ fin.close()
187
+ Metrics={} #{'entity_type':[TP,gold_num,pre_num]}
188
+ breai=0
189
+ for sentence in all_sentence:
190
+ breai+=1
191
+ if breai>5000:
192
+ break
193
+ tokens=sentence.split('\n')
194
+ gold_entity,pre_entity=BIO_tag(tokens)
195
+ # print(tokens)
196
+ for entity_type in gold_entity.keys():
197
+ if entity_type not in Metrics.keys():
198
+ Metrics[entity_type]=[0,len(gold_entity[entity_type]),0]
199
+ else:
200
+ Metrics[entity_type][1]+=len(gold_entity[entity_type])
201
+ for entity_type in pre_entity.keys():
202
+ if entity_type not in Metrics.keys():
203
+ Metrics[entity_type]=[0,0,len(pre_entity[entity_type])]
204
+ else:
205
+ Metrics[entity_type][2]+=len(pre_entity[entity_type])
206
+ for mention in pre_entity[entity_type]:
207
+ if entity_type in gold_entity.keys():
208
+ if mention in gold_entity[entity_type]:
209
+ Metrics[entity_type][0]+=1
210
+ print(Metrics)
211
+ TP,Gold_num,Pre_num=0,0,0
212
+ for ele in Metrics.keys():
213
+ if Metrics[ele][2]==0:
214
+ p=0
215
+ else:
216
+ p=Metrics[ele][0]/Metrics[ele][2]
217
+ if Metrics[ele][1]==0:
218
+ r=0
219
+ else:
220
+ r=Metrics[ele][0]/Metrics[ele][1]
221
+ if p+r==0:
222
+ f1=0
223
+ else:
224
+ f1=2*p*r/(p+r)
225
+ TP+=Metrics[ele][0]
226
+ Gold_num+=Metrics[ele][1]
227
+ Pre_num+=Metrics[ele][2]
228
+ print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))
229
+ # break
230
+ if Pre_num==0:
231
+ P=0
232
+ else:
233
+ P=TP/Pre_num
234
+ R=TP/Gold_num
235
+ if P+R==0:
236
+ F1=0
237
+ else:
238
+ F1=2*P*R/(P+R)
239
+ print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
240
+ return F1
241
+
242
+ if __name__=='__main__':
243
+ NER_Evaluation()
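Each entry of Metrics reduces to the usual precision/recall/F1 exactly as computed above; a quick worked check with hypothetical counts (80 matched spans, 100 gold, 90 predicted):

    tp, gold_num, pre_num = 80, 100, 90      # hypothetical counts
    p = tp / pre_num if pre_num else 0.0     # precision = TP / predicted
    r = tp / gold_num if gold_num else 0.0   # recall = TP / gold
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    print('P=%.5f, R=%.5f, F1=%.5f' % (p, r, f1))
    # P=0.88889, R=0.80000, F1=0.84211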
src_python/GeneNER/model_ner.py CHANGED
@@ -1,102 +1,102 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Feb 10 09:08:09 2021
+
+ @author: luol2
+ """
+ import tensorflow as tf
+ from src_python.GeneNER.represent_ner import Hugface_RepresentationLayer
+ from tensorflow.keras.layers import *
+ from tensorflow.keras.models import Model
+ from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad,Nadam
+ from transformers import TFBertModel, BertConfig,TFElectraModel,TFAutoModel
+ import numpy as np
+ import sys
+
+
+ class LRSchedule_LINEAR(tf.keras.optimizers.schedules.LearningRateSchedule):
+ def __init__(
+ self,
+ init_lr=5e-5,
+ init_warmup_lr=0.0,
+ final_lr=5e-7,
+ warmup_steps=0,
+ decay_steps=0,
+ ):
+ super().__init__()
+ self.init_lr = init_lr
+ self.init_warmup_lr=init_warmup_lr
+ self.final_lr = final_lr
+ self.warmup_steps = warmup_steps
+ self.decay_steps = decay_steps
+
+ def __call__(self, step):
+ """ linear warm up - linear decay """
+ if self.warmup_steps>0:
+ warmup_lr = (self.init_lr - self.init_warmup_lr)/self.warmup_steps * step+self.init_warmup_lr
+ else:
+ warmup_lr=1000.0
+ #print('\n.......warmup_lr:',warmup_lr)
+ decay_lr = tf.math.maximum(
+ self.final_lr,
+ self.init_lr - (step - self.warmup_steps)/self.decay_steps*(self.init_lr - self.final_lr)
+ )
+ #print('\n.....decay_lr:',decay_lr)
+ return tf.math.minimum(warmup_lr,decay_lr)
+
+
+ class HUGFACE_NER(): #huggingface transformers
+ def __init__(self, model_files):
+ self.model_type='HUGFACE'
+ self.maxlen = 256 #sent 256 doc-512,pretrain-sent 128
+ self.checkpoint_path = model_files['checkpoint_path']
+ self.label_file=model_files['labelfile']
+ self.lowercase=model_files['lowercase']
+ self.rep = Hugface_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)
+
+
+ def build_encoder(self):
+ print('...vocab len:',self.rep.vocab_len)
+ plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
+ # plm_model.resize_token_embeddings(self.rep.vocab_len)
+ x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
+ x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
+ x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
+ x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
+ #dense = TimeDistributed(Dense(512, activation='relu'), name='dense1')(x)
+ self.encoder = Model (inputs=[x1_in,x2_in,x3_in], outputs=x,name='hugface_encoder')
+ self.encoder.summary()
+
+ def build_softmax_decoder(self):
+
+ x1_in = Input(shape=(self.maxlen,),dtype=tf.int32)
+ x2_in = Input(shape=(self.maxlen,),dtype=tf.int32)
+ x3_in = Input(shape=(self.maxlen,),dtype=tf.int32)
+ features = self.encoder([x1_in,x2_in,x3_in])
+ #features = Dropout(0.4)(features)
+ features = TimeDistributed(Dense(128, activation='relu'), name='dense2')(features)
+ features= Dropout(0.1)(features)
+ output = TimeDistributed(Dense(self.rep.label_table_size, activation='softmax'), name='softmax')(features)
+ self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=output, name="hugface_softmax")
+
+ lr_schedule=LRSchedule_LINEAR(
+ init_lr=1e-5,
+ init_warmup_lr=1e-7,
+ final_lr=5e-6,
+ warmup_steps=0,
+ decay_steps=1000)
+
+ opt = Adam(learning_rate = lr_schedule)
+ #opt = Adam(lr=5e-6)
+ self.model.compile(
+ optimizer=opt,
+ loss='sparse_categorical_crossentropy',
+ metrics=['accuracy'],
+ )
+ self.model.summary()
+
+
+ def load_model(self,model_file):
+ self.model.load_weights(model_file)
+ self.model.summary()
+ print('load HUGFACE model done!')
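With the settings passed in build_softmax_decoder (init_lr=1e-5, final_lr=5e-6, warmup_steps=0, decay_steps=1000), LRSchedule_LINEAR degenerates to a pure linear decay clamped at final_lr: the 1000.0 assigned when warmup_steps is 0 is a sentinel that keeps min() from ever selecting the warmup branch. A plain-Python re-check of the same arithmetic (not TensorFlow code):

    def lr_at(step, init_lr=1e-5, init_warmup_lr=1e-7, final_lr=5e-6,
              warmup_steps=0, decay_steps=1000):
        if warmup_steps > 0:
            warmup_lr = (init_lr - init_warmup_lr) / warmup_steps * step + init_warmup_lr
        else:
            warmup_lr = 1000.0  # sentinel: disables the warmup branch
        decay_lr = max(final_lr,
                       init_lr - (step - warmup_steps) / decay_steps * (init_lr - final_lr))
        return min(warmup_lr, decay_lr)

    print(lr_at(0))     # 1e-05
    print(lr_at(500))   # 7.5e-06, halfway through the decay
    print(lr_at(2000))  # 5e-06, clamped at final_lr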
src_python/GeneNER/ner_tag.py CHANGED
@@ -1,85 +1,85 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Jun 8 11:01:23 2022
+
+ @author: luol2
+ """
+
+
+
+ import io
+ import re
+ from src_python.GeneNER.processing_data_ner import ml_intext_fn,out_BIO_BERT_softmax_fn
+ from src_python.GeneNER.restore_index_ner import NN_restore_index_fn
+ import tensorflow as tf
+ gpu = tf.config.list_physical_devices('GPU')
+ print("Num GPUs Available: ", len(gpu))
+ if len(gpu) > 0:
+ tf.config.experimental.set_memory_growth(gpu[0], True)
+
+ def pre_token(sentence):
+ sentence=re.sub("([\W\-\_])"," \\1 ",sentence)
+ sentence=re.sub("[ ]+"," ",sentence);
+ return sentence
+
+ def ssplit_token_pos_lemma(in_text,text_level,nlp_token, max_len=400):
+ #print('max_len:',max_len)
+ fout=io.StringIO()
+
+ in_text=in_text.strip()
+ in_text=pre_token(in_text)
+ doc_stanza = nlp_token(in_text)
+ strlen=0
+ for sent in doc_stanza.sentences:
+ for word in sent.words:
+ strlen+=1
+ if word.text.strip()=='':
+ pass
+ #print('!!!!blank token text!!!')
+ else:
+ fout.write(word.text+'\tO\n')
+ if strlen>=max_len:
+ #print('long sentence:',strlen)
+ fout.write('\n')
+ strlen=0
+ if text_level=='SENT':
+ fout.write('\n')
+ strlen=0
+ if text_level=='DOC':
+ fout.write('\n')
+
+ return fout.getvalue()
+
+ def ml_tagging(ml_input,nn_model):
+
+ test_list = ml_intext_fn(ml_input)
+ test_x,test_y, test_bert_text_label=nn_model.rep.load_data_hugface(test_list,word_max_len=nn_model.maxlen,label_type='softmax')
+ test_pre = nn_model.model.predict(test_x,batch_size=64)
+ test_decode_temp=out_BIO_BERT_softmax_fn(test_pre,test_bert_text_label,nn_model.rep.index_2_label)
+
+ return test_decode_temp
+ # only machine learning-based method
+ def ML_Tag(text,ml_model,nlp_token,text_level='SENT'):
+
+ # startTime=time.time()
+ ssplit_token=ssplit_token_pos_lemma(text, text_level, nlp_token, max_len=ml_model.maxlen)
+ #print(ssplit_token)
+ # print('ssplit token:',time.time()-startTime)
+
+ # startTime=time.time()
+ ml_tsv=ml_tagging(ssplit_token,ml_model)
+ #print(ml_tsv)
+ # print('ml ner:',time.time()-startTime)
+
+ final_result= NN_restore_index_fn(text,ml_tsv)
+
+ # print('final ner:',time.time()-startTime)
+
+ return final_result
+
+
+
+
+
+
+
@@ -1,210 +1,210 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Tue Mar 10 16:34:12 2020
4
-
5
- @author: luol2
6
- """
7
- import numpy as np
8
- import io
9
- import sys
10
- #read ner text (word\tlabel), generate the list[[[w1,label],[w2,label]]]
11
- def ml_intext(file):
12
- fin=open(file,'r',encoding='utf-8')
13
- alltexts=fin.read().strip().split('\n\n')
14
- fin.close()
15
- data_list=[]
16
-
17
- for sents in alltexts:
18
- lines=sents.split('\n')
19
- temp_sentece=[]
20
- for i in range(0,len(lines)):
21
- seg=lines[i].split('\t')
22
- temp_sentece.append(seg[:])
23
-
24
- data_list.append(temp_sentece)
25
- #print(data_list)
26
- #print(label_list)
27
- return data_list
28
-
29
- def ml_intext_fn(ml_input):
30
- fin=io.StringIO(ml_input)
31
- alltexts=fin.read().strip().split('\n\n')
32
- fin.close()
33
- data_list=[]
34
-
35
- for sents in alltexts:
36
- lines=sents.split('\n')
37
- temp_sentece=[]
38
- for i in range(0,len(lines)):
39
- seg=lines[i].split('\t')
40
- temp_sentece.append(seg[:])
41
-
42
- data_list.append(temp_sentece)
43
- #print(data_list)
44
- #print(label_list)
45
- return data_list
46
-
47
- # model predict result to conll evalute format [token answer predict]
48
- def out_BIO_crf(file,raw_pre,raw_input,label_set):
49
- fout=open(file,'w',encoding='utf-8')
50
- for i in range(len(raw_input)):
51
-
52
- for j in range(len(raw_input[i])):
53
- if j<len(raw_pre[i]):
54
- label_id = raw_pre[i][j]
55
- label_tag = label_set[str(label_id)]
56
- else:
57
- label_tag='O'
58
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
59
- fout.write('\n')
60
- fout.close()
61
- def out_BIO_crf_fn(raw_pre,raw_input,label_set):
62
- fout=io.StringIO()
63
- for i in range(len(raw_input)):
64
-
65
- for j in range(len(raw_input[i])):
66
- if j<len(raw_pre[i]):
67
- label_id = raw_pre[i][j]
68
- label_tag = label_set[str(label_id)]
69
- else:
70
- label_tag='O'
71
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
72
- fout.write('\n')
73
- return fout.getvalue()
74
- def out_BIO_softmax(file,raw_pre,raw_input,label_set):
75
- fout=open(file,'w',encoding='utf-8')
76
- #print(raw_pre[0:2])
77
- for i in range(len(raw_input)):
78
-
79
- for j in range(len(raw_input[i])):
80
- if j<len(raw_pre[i]):
81
- label_id = np.argmax(raw_pre[i][j])
82
- #print(label_id)
83
- label_tag = label_set[str(label_id)]
84
- else:
85
- label_tag='O'
86
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
87
- fout.write('\n')
88
- fout.close()
89
- def out_BIO_softmax_fn(raw_pre,raw_input,label_set):
90
- fout=io.StringIO()
91
- #print(raw_pre[0:2])
92
- for i in range(len(raw_input)):
93
-
94
- for j in range(len(raw_input[i])):
95
- if j<len(raw_pre[i]):
96
- label_id = np.argmax(raw_pre[i][j])
97
- #print(label_id)
98
- label_tag = label_set[str(label_id)]
99
- else:
100
- label_tag='O'
101
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
102
- fout.write('\n')
103
- return fout.getvalue()
104
-
105
- def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
106
- fout=open(file,'w',encoding='utf-8')
107
- for i in range(len(raw_input)):
108
- for j in range(len(raw_input[i])):
109
- if raw_input[i][j][-1]<len(raw_pre[i]):
110
- # label_id = raw_pre[i][j]
111
- label_id = np.argmax(raw_pre[i][raw_input[i][j][-1]])
112
- label_tag = label_set[str(label_id)]
113
- else:
114
- label_tag='O'
115
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
116
- fout.write('\n')
117
- fout.close()
118
- def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
119
- fout=io.StringIO()
120
- for i in range(len(raw_input)):
121
- for j in range(len(raw_input[i])):
122
- if raw_input[i][j][-1]<len(raw_pre[i]):
123
- #label_id = raw_pre[i][j]
124
- label_id = np.argmax(raw_pre[i][raw_input[i][j][-1]])
125
- label_tag = label_set[str(label_id)]
126
- else:
127
- label_tag='O'
128
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
129
- fout.write('\n')
130
- return fout.getvalue()
131
- def out_BIO_BERT_crf(file,raw_pre,raw_input,label_set):
132
- fout=open(file,'w',encoding='utf-8')
133
- for i in range(len(raw_input)):
134
-
135
- for j in range(len(raw_input[i])):
136
- if raw_input[i][j][-1]<len(raw_pre[i]):
137
- label_id = raw_pre[i][raw_input[i][j][-1]]
138
- label_tag = label_set[str(label_id)]
139
- else:
140
- label_tag='O'
141
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
142
- fout.write('\n')
143
- fout.close()
144
- def out_BIO_BERT_crf_fn(raw_pre,raw_input,label_set):
145
- fout=io.StringIO()
146
- for i in range(len(raw_input)):
147
-
148
- for j in range(len(raw_input[i])):
149
- if raw_input[i][j][-1]<len(raw_pre[i]):
150
- label_id = raw_pre[i][raw_input[i][j][-1]]
151
- label_tag = label_set[str(label_id)]
152
- else:
153
- label_tag='O'
154
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
155
- fout.write('\n')
156
- return fout.getvalue()
157
-
158
- def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
159
- fout=io.StringIO()
160
- for i in range(len(raw_input)):
161
-
162
- for j in range(len(raw_input[i])):
163
- if j<len(raw_pre[i]):
164
- #label_id = raw_pre[i][j]
165
- label_id = np.argmax(raw_pre[i][j])
166
- label_score = round(raw_pre[i][j][label_id],4)
167
- label_tag = label_set[str(label_id)]
168
- else:
169
- label_tag='O'
170
- label_score = 0.0
171
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
172
- fout.write('\n')
173
- return fout.getvalue()
174
- #generate char vocab
175
- def char_vocab(infile,outfile_char):
176
- fin=open(infile,'r',encoding='utf-8')
177
- #fout=open(outfile,'w',encoding='utf-8')
178
- fout_char=open(outfile_char,'w',encoding='utf-8')
179
- char_vocab=['oov_char']
180
- max_len=0
181
- for line in fin:
182
- if line.strip()!='':
183
- seg=line.split('\t')
184
- word_len=len(seg[0])
185
- #if word_len<1000:
186
- # fout.write(line)
187
- if word_len>max_len:
188
- max_len=word_len
189
- print(seg[0])
190
- for i in range(word_len):
191
- if seg[0][i] not in char_vocab:
192
- char_vocab.append(seg[0][i])
193
- #else:
194
- # fout.write(line)
195
- fin.close()
196
- #fout.close()
197
- for ele in char_vocab:
198
- fout_char.write(ele+'\n')
199
- fout_char.close()
200
- print('max_len:',max_len)
201
-
202
-
203
- if __name__=='__main__':
204
- # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
205
- # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
206
- # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
207
- # #processing_text(file)
208
- # char_vocab(infile,outfile_char)
209
- a=[1,2,3]
210
- print(a[:-1])
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Mar 10 16:34:12 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import numpy as np
8
+ import io
9
+ import sys
10
+ #read ner text (word\tlabel), generate the list[[[w1,label],[w2,label]]]
11
+ def ml_intext(file):
12
+ fin=open(file,'r',encoding='utf-8')
13
+ alltexts=fin.read().strip().split('\n\n')
14
+ fin.close()
15
+ data_list=[]
16
+
17
+ for sents in alltexts:
18
+ lines=sents.split('\n')
19
+ temp_sentece=[]
20
+ for i in range(0,len(lines)):
21
+ seg=lines[i].split('\t')
22
+ temp_sentece.append(seg[:])
23
+
24
+ data_list.append(temp_sentece)
25
+ #print(data_list)
26
+ #print(label_list)
27
+ return data_list
28
+
29
+ def ml_intext_fn(ml_input):
30
+ fin=io.StringIO(ml_input)
31
+ alltexts=fin.read().strip().split('\n\n')
32
+ fin.close()
33
+ data_list=[]
34
+
35
+ for sents in alltexts:
36
+ lines=sents.split('\n')
37
+ temp_sentece=[]
38
+ for i in range(0,len(lines)):
39
+ seg=lines[i].split('\t')
40
+ temp_sentece.append(seg[:])
41
+
42
+ data_list.append(temp_sentece)
43
+ #print(data_list)
44
+ #print(label_list)
45
+ return data_list
46
+
47
+ # model prediction results to CoNLL evaluation format [token answer predict]
48
+ def out_BIO_crf(file,raw_pre,raw_input,label_set):
49
+ fout=open(file,'w',encoding='utf-8')
50
+ for i in range(len(raw_input)):
51
+
52
+ for j in range(len(raw_input[i])):
53
+ if j<len(raw_pre[i]):
54
+ label_id = raw_pre[i][j]
55
+ label_tag = label_set[str(label_id)]
56
+ else:
57
+ label_tag='O'
58
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
59
+ fout.write('\n')
60
+ fout.close()
61
+ def out_BIO_crf_fn(raw_pre,raw_input,label_set):
62
+ fout=io.StringIO()
63
+ for i in range(len(raw_input)):
64
+
65
+ for j in range(len(raw_input[i])):
66
+ if j<len(raw_pre[i]):
67
+ label_id = raw_pre[i][j]
68
+ label_tag = label_set[str(label_id)]
69
+ else:
70
+ label_tag='O'
71
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
72
+ fout.write('\n')
73
+ return fout.getvalue()
74
+ def out_BIO_softmax(file,raw_pre,raw_input,label_set):
75
+ fout=open(file,'w',encoding='utf-8')
76
+ #print(raw_pre[0:2])
77
+ for i in range(len(raw_input)):
78
+
79
+ for j in range(len(raw_input[i])):
80
+ if j<len(raw_pre[i]):
81
+ label_id = np.argmax(raw_pre[i][j])
82
+ #print(label_id)
83
+ label_tag = label_set[str(label_id)]
84
+ else:
85
+ label_tag='O'
86
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
87
+ fout.write('\n')
88
+ fout.close()
89
+ def out_BIO_softmax_fn(raw_pre,raw_input,label_set):
90
+ fout=io.StringIO()
91
+ #print(raw_pre[0:2])
92
+ for i in range(len(raw_input)):
93
+
94
+ for j in range(len(raw_input[i])):
95
+ if j<len(raw_pre[i]):
96
+ label_id = np.argmax(raw_pre[i][j])
97
+ #print(label_id)
98
+ label_tag = label_set[str(label_id)]
99
+ else:
100
+ label_tag='O'
101
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
102
+ fout.write('\n')
103
+ return fout.getvalue()
104
+
105
+ def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
106
+ fout=open(file,'w',encoding='utf-8')
107
+ for i in range(len(raw_input)):
108
+ for j in range(len(raw_input[i])):
109
+ if raw_input[i][j][-1]<len(raw_pre[i]):
110
+ # label_id = raw_pre[i][j]
111
+ label_id = np.argmax(raw_pre[i][raw_input[i][j][-1]])
112
+ label_tag = label_set[str(label_id)]
113
+ else:
114
+ label_tag='O'
115
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
116
+ fout.write('\n')
117
+ fout.close()
118
+ def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
119
+ fout=io.StringIO()
120
+ for i in range(len(raw_input)):
121
+ for j in range(len(raw_input[i])):
122
+ if raw_input[i][j][-1]<len(raw_pre[i]):
123
+ #label_id = raw_pre[i][j]
124
+ label_id = np.argmax(raw_pre[i][raw_input[i][j][-1]])
125
+ label_tag = label_set[str(label_id)]
126
+ else:
127
+ label_tag='O'
128
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
129
+ fout.write('\n')
130
+ return fout.getvalue()
131
+ def out_BIO_BERT_crf(file,raw_pre,raw_input,label_set):
132
+ fout=open(file,'w',encoding='utf-8')
133
+ for i in range(len(raw_input)):
134
+
135
+ for j in range(len(raw_input[i])):
136
+ if raw_input[i][j][-1]<len(raw_pre[i]):
137
+ label_id = raw_pre[i][raw_input[i][j][-1]]
138
+ label_tag = label_set[str(label_id)]
139
+ else:
140
+ label_tag='O'
141
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
142
+ fout.write('\n')
143
+ fout.close()
144
+ def out_BIO_BERT_crf_fn(raw_pre,raw_input,label_set):
145
+ fout=io.StringIO()
146
+ for i in range(len(raw_input)):
147
+
148
+ for j in range(len(raw_input[i])):
149
+ if raw_input[i][j][-1]<len(raw_pre[i]):
150
+ label_id = raw_pre[i][raw_input[i][j][-1]]
151
+ label_tag = label_set[str(label_id)]
152
+ else:
153
+ label_tag='O'
154
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][1]+'\t'+label_tag+'\n')
155
+ fout.write('\n')
156
+ return fout.getvalue()
157
+
158
+ def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
159
+ fout=io.StringIO()
160
+ for i in range(len(raw_input)):
161
+
162
+ for j in range(len(raw_input[i])):
163
+ if j<len(raw_pre[i]):
164
+ #label_id = raw_pre[i][j]
165
+ label_id = np.argmax(raw_pre[i][j])
166
+ label_score = round(raw_pre[i][j][label_id],4)
167
+ label_tag = label_set[str(label_id)]
168
+ else:
169
+ label_tag='O'
170
+ label_score = 0.0
171
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
172
+ fout.write('\n')
173
+ return fout.getvalue()
174
+ #generate char vocab
175
+ def char_vocab(infile,outfile_char):
176
+ fin=open(infile,'r',encoding='utf-8')
177
+ #fout=open(outfile,'w',encoding='utf-8')
178
+ fout_char=open(outfile_char,'w',encoding='utf-8')
179
+ char_vocab=['oov_char']
180
+ max_len=0
181
+ for line in fin:
182
+ if line.strip()!='':
183
+ seg=line.split('\t')
184
+ word_len=len(seg[0])
185
+ #if word_len<1000:
186
+ # fout.write(line)
187
+ if word_len>max_len:
188
+ max_len=word_len
189
+ print(seg[0])
190
+ for i in range(word_len):
191
+ if seg[0][i] not in char_vocab:
192
+ char_vocab.append(seg[0][i])
193
+ #else:
194
+ # fout.write(line)
195
+ fin.close()
196
+ #fout.close()
197
+ for ele in char_vocab:
198
+ fout_char.write(ele+'\n')
199
+ fout_char.close()
200
+ print('max_len:',max_len)
201
+
202
+
203
+ if __name__=='__main__':
204
+ # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
205
+ # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
206
+ # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
207
+ # #processing_text(file)
208
+ # char_vocab(infile,outfile_char)
209
+ a=[1,2,3]
210
+ print(a[:-1])
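
Taken together, the helpers above round-trip between the two-column training format and the three-column (token, gold, prediction) CoNLL format consumed by the evaluator. A minimal usage sketch, assuming the functions above are in scope; the tokens, label ids, and id-to-tag map below are made up for illustration:

    ml_input = 'IL-2\tB-Gene\nexpression\tO\n\np53\tB-Gene\nlevels\tO'
    data_list = ml_intext_fn(ml_input)     # [[['IL-2','B-Gene'],['expression','O']], [['p53','B-Gene'],['levels','O']]]

    raw_pre = [[1, 0], [1, 0]]             # hypothetical CRF output: one label id per token
    label_set = {'0': 'O', '1': 'B-Gene'}  # hypothetical id -> tag map
    print(out_BIO_crf_fn(raw_pre, data_list, label_set))  # token<TAB>gold<TAB>pred, blank line between sentences
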
src_python/GeneNER/represent_ner.py CHANGED
@@ -1,183 +1,183 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Mon Aug 30 19:54:17 2021
4
-
5
- @author: luol2
6
- """
7
-
8
-
9
-
10
- import os, sys
11
- import numpy as np
12
- from tensorflow.keras.preprocessing.sequence import pad_sequences
13
- from transformers import AutoTokenizer
14
-
15
-
16
- class Hugface_RepresentationLayer(object):
17
-
18
-
19
- def __init__(self, tokenizer_name_or_path, label_file,lowercase=True):
20
-
21
-
22
- #load vocab
23
-
24
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True,do_lower_case=lowercase)
25
- self.label_2_index={}
26
- self.index_2_label={}
27
- self.label_table_size=0
28
- self.load_label_vocab(label_file,self.label_2_index,self.index_2_label)
29
- self.label_table_size=len(self.label_2_index)
30
- self.vocab_len=len(self.tokenizer)
31
-
32
- def load_label_vocab(self,fea_file,fea_index,index_2_label):
33
-
34
- fin=open(fea_file,'r',encoding='utf-8')
35
- all_text=fin.read().strip().split('\n')
36
- fin.close()
37
- for i in range(0,len(all_text)):
38
- fea_index[all_text[i]]=i
39
- index_2_label[str(i)]=all_text[i]
40
-
41
-
42
-
43
- def generate_label_list(self,ori_tokens,labels,word_index): # every subtoken gets the same label as the first subtoken of its word
44
- label_list=['O']*len(word_index)
45
-
46
- label_list_index=[]
47
- old_new_token_map=[]
48
- ori_i=0
49
- for i in range(0,len(word_index)):
50
- if word_index[i]==None:
51
- label_list_index.append(self.label_2_index[label_list[i]])
52
- else:
53
- label_list[i]=labels[word_index[i]]
54
- label_list_index.append(self.label_2_index[label_list[i]])
55
- if word_index[i]==ori_i:
56
- old_new_token_map.append(i)
57
- ori_i+=1
58
-
59
-
60
- bert_text_label=[]
61
- for i in range(0,len(ori_tokens)):
62
- bert_text_label.append([ori_tokens[i],labels[i],old_new_token_map[i]])
63
-
64
- return label_list_index,bert_text_label
65
-
66
- def generate_label_list_B(self,ori_tokens,labels,word_index): # only the first subtoken keeps B-; the following subtokens become I-
67
- label_list=['O']*len(word_index)
68
-
69
- label_list_index=[]
70
- old_new_token_map=[]
71
- ori_i=0
72
- first_index=-1
73
- i=0
74
- while i <len(word_index):
75
- if word_index[i]==None:
76
- label_list_index.append(self.label_2_index[label_list[i]])
77
- i+=1
78
- else:
79
- first_index=word_index[i]
80
- if first_index==ori_i:
81
- old_new_token_map.append(i)
82
- ori_i+=1
83
- label_list[i]=labels[word_index[i]]
84
- label_list_index.append(self.label_2_index[label_list[i]])
85
- i+=1
86
- while word_index[i]==first_index and word_index[i]!=None:
87
- #print(first_index)
88
- if labels[first_index].startswith("B-"):
89
- label_list[i]='I-'+labels[first_index][2:]
90
- label_list_index.append(self.label_2_index[label_list[i]])
91
- else:
92
- label_list[i]=labels[word_index[i]]
93
- label_list_index.append(self.label_2_index[label_list[i]])
94
- i+=1
95
-
96
-
97
-
98
-
99
- bert_text_label=[]
100
- #print(len(old_new_token_map))
101
- for i in range(0,len(ori_tokens)):
102
- if i<len(old_new_token_map):
103
- bert_text_label.append([ori_tokens[i],labels[i],old_new_token_map[i]])
104
- else: # after token > max len
105
- break
106
- return label_list_index,bert_text_label
107
-
108
- def load_data_hugface(self,instances, word_max_len=100, label_type='softmax'):
109
-
110
- x_index=[]
111
- x_seg=[]
112
- x_mask=[]
113
- y_list=[]
114
- bert_text_labels=[]
115
- max_len=0
116
- over_num=0
117
- maxT=word_max_len
118
- ave_len=0
119
-
120
- #print('instances:', instances)
121
- #print('labels:',labels)
122
-
123
-
124
- for sentence in instances:
125
- sentence_text_list=[]
126
- label_list=[]
127
- for j in range(0,len(sentence)):
128
- sentence_text_list.append(sentence[j][0])
129
- label_list.append(sentence[j][-1])
130
-
131
- token_result=self.tokenizer(
132
- sentence_text_list,
133
- max_length=word_max_len,
134
- truncation=True,is_split_into_words=True)
135
-
136
- bert_tokens=self.tokenizer.convert_ids_to_tokens(token_result['input_ids'])
137
- word_index=token_result.word_ids(batch_index=0)
138
- ave_len+=len(bert_tokens)
139
- if len(sentence_text_list)>max_len:
140
- max_len=len(sentence_text_list)
141
- if len(bert_tokens)==maxT:
142
- over_num+=1
143
-
144
- x_index.append(token_result['input_ids'])
145
- x_seg.append(token_result['token_type_ids'])
146
- x_mask.append(token_result['attention_mask'])
147
-
148
- #print('\nsentence_text_list:',len(sentence_text_list),sentence_text_list)
149
- #print('\nlabel:',len(label_list),label_list)
150
- #print('\nword_index:',len(word_index),word_index)
151
- #print('\nbert_tokens:',len(bert_tokens),bert_tokens)
152
- label_list,bert_text_label=self.generate_label_list_B(sentence_text_list,label_list,word_index) # label list after BERT tokenization, plus ori token/label/new index
153
- #print('\nlabel list:',len(label_list),label_list)
154
- #print('\nbert_text_label:',len(bert_text_label),bert_text_label)
155
- #sys.exit()
156
- y_list.append(label_list)
157
- #print(y_list)
158
- bert_text_labels.append(bert_text_label)
159
-
160
-
161
- x1_np = pad_sequences(x_index, word_max_len, value=0, padding='post',truncating='post') # right padding
162
- x2_np = pad_sequences(x_seg, word_max_len, value=0, padding='post',truncating='post')
163
- x3_np = pad_sequences(x_mask, word_max_len, value=0, padding='post',truncating='post')
164
- y_np = pad_sequences(y_list, word_max_len, value=0, padding='post',truncating='post')
165
- #print('x1_np:',x1_np)
166
- #print('\nx2_np:',x2_np)
167
- #print('\ny_np:',y_np)
168
- #print('\nbert_text:',bert_text_labels)
169
- # print('bert max len:',max_len,',Over',maxT,':',over_num,'ave len:',ave_len/len(instances),'total:',len(instances))
170
-
171
- if label_type=='softmax':
172
- y_np = np.expand_dims(y_np, 2)
173
- elif label_type=='crf':
174
- pass
175
-
176
- return [x1_np, x2_np,x3_np], y_np,bert_text_labels
177
-
178
-
179
- if __name__ == '__main__':
180
- pass
181
-
182
-
183
-
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Aug 30 19:54:17 2021
4
+
5
+ @author: luol2
6
+ """
7
+
8
+
9
+
10
+ import os, sys
11
+ import numpy as np
12
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
13
+ from transformers import AutoTokenizer
14
+
15
+
16
+ class Hugface_RepresentationLayer(object):
17
+
18
+
19
+ def __init__(self, tokenizer_name_or_path, label_file,lowercase=True):
20
+
21
+
22
+ #load vocab
23
+
24
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True,do_lower_case=lowercase)
25
+ self.label_2_index={}
26
+ self.index_2_label={}
27
+ self.label_table_size=0
28
+ self.load_label_vocab(label_file,self.label_2_index,self.index_2_label)
29
+ self.label_table_size=len(self.label_2_index)
30
+ self.vocab_len=len(self.tokenizer)
31
+
32
+ def load_label_vocab(self,fea_file,fea_index,index_2_label):
33
+
34
+ fin=open(fea_file,'r',encoding='utf-8')
35
+ all_text=fin.read().strip().split('\n')
36
+ fin.close()
37
+ for i in range(0,len(all_text)):
38
+ fea_index[all_text[i]]=i
39
+ index_2_label[str(i)]=all_text[i]
40
+
41
+
42
+
43
+ def generate_label_list(self,ori_tokens,labels,word_index): # every subtoken gets the same label as the first subtoken of its word
44
+ label_list=['O']*len(word_index)
45
+
46
+ label_list_index=[]
47
+ old_new_token_map=[]
48
+ ori_i=0
49
+ for i in range(0,len(word_index)):
50
+ if word_index[i]==None:
51
+ label_list_index.append(self.label_2_index[label_list[i]])
52
+ else:
53
+ label_list[i]=labels[word_index[i]]
54
+ label_list_index.append(self.label_2_index[label_list[i]])
55
+ if word_index[i]==ori_i:
56
+ old_new_token_map.append(i)
57
+ ori_i+=1
58
+
59
+
60
+ bert_text_label=[]
61
+ for i in range(0,len(ori_tokens)):
62
+ bert_text_label.append([ori_tokens[i],labels[i],old_new_token_map[i]])
63
+
64
+ return label_list_index,bert_text_label
65
+
66
+ def generate_label_list_B(self,ori_tokens,labels,word_index): # only the first subtoken keeps B-; the following subtokens become I-
67
+ label_list=['O']*len(word_index)
68
+
69
+ label_list_index=[]
70
+ old_new_token_map=[]
71
+ ori_i=0
72
+ first_index=-1
73
+ i=0
74
+ while i <len(word_index):
75
+ if word_index[i]==None:
76
+ label_list_index.append(self.label_2_index[label_list[i]])
77
+ i+=1
78
+ else:
79
+ first_index=word_index[i]
80
+ if first_index==ori_i:
81
+ old_new_token_map.append(i)
82
+ ori_i+=1
83
+ label_list[i]=labels[word_index[i]]
84
+ label_list_index.append(self.label_2_index[label_list[i]])
85
+ i+=1
86
+ while word_index[i]==first_index and word_index[i]!=None:
87
+ #print(first_index)
88
+ if labels[first_index].startswith("B-"):
89
+ label_list[i]='I-'+labels[first_index][2:]
90
+ label_list_index.append(self.label_2_index[label_list[i]])
91
+ else:
92
+ label_list[i]=labels[word_index[i]]
93
+ label_list_index.append(self.label_2_index[label_list[i]])
94
+ i+=1
95
+
96
+
97
+
98
+
99
+ bert_text_label=[]
100
+ #print(len(old_new_token_map))
101
+ for i in range(0,len(ori_tokens)):
102
+ if i<len(old_new_token_map):
103
+ bert_text_label.append([ori_tokens[i],labels[i],old_new_token_map[i]])
104
+ else: # after token > max len
105
+ break
106
+ return label_list_index,bert_text_label
107
+
108
+ def load_data_hugface(self,instances, word_max_len=100, label_type='softmax'):
109
+
110
+ x_index=[]
111
+ x_seg=[]
112
+ x_mask=[]
113
+ y_list=[]
114
+ bert_text_labels=[]
115
+ max_len=0
116
+ over_num=0
117
+ maxT=word_max_len
118
+ ave_len=0
119
+
120
+ #print('instances:', instances)
121
+ #print('labels:',labels)
122
+
123
+
124
+ for sentence in instances:
125
+ sentence_text_list=[]
126
+ label_list=[]
127
+ for j in range(0,len(sentence)):
128
+ sentence_text_list.append(sentence[j][0])
129
+ label_list.append(sentence[j][-1])
130
+
131
+ token_result=self.tokenizer(
132
+ sentence_text_list,
133
+ max_length=word_max_len,
134
+ truncation=True,is_split_into_words=True)
135
+
136
+ bert_tokens=self.tokenizer.convert_ids_to_tokens(token_result['input_ids'])
137
+ word_index=token_result.word_ids(batch_index=0)
138
+ ave_len+=len(bert_tokens)
139
+ if len(sentence_text_list)>max_len:
140
+ max_len=len(sentence_text_list)
141
+ if len(bert_tokens)==maxT:
142
+ over_num+=1
143
+
144
+ x_index.append(token_result['input_ids'])
145
+ x_seg.append(token_result['token_type_ids'])
146
+ x_mask.append(token_result['attention_mask'])
147
+
148
+ #print('\nsentence_text_list:',len(sentence_text_list),sentence_text_list)
149
+ #print('\nlabel:',len(label_list),label_list)
150
+ #print('\nword_index:',len(word_index),word_index)
151
+ #print('\nbert_tokens:',len(bert_tokens),bert_tokens)
152
+ label_list,bert_text_label=self.generate_label_list_B(sentence_text_list,label_list,word_index) # label list after BERT tokenization, plus ori token/label/new index
153
+ #print('\nlabel list:',len(label_list),label_list)
154
+ #print('\nbert_text_label:',len(bert_text_label),bert_text_label)
155
+ #sys.exit()
156
+ y_list.append(label_list)
157
+ #print(y_list)
158
+ bert_text_labels.append(bert_text_label)
159
+
160
+
161
+ x1_np = pad_sequences(x_index, word_max_len, value=0, padding='post',truncating='post') # right padding
162
+ x2_np = pad_sequences(x_seg, word_max_len, value=0, padding='post',truncating='post')
163
+ x3_np = pad_sequences(x_mask, word_max_len, value=0, padding='post',truncating='post')
164
+ y_np = pad_sequences(y_list, word_max_len, value=0, padding='post',truncating='post')
165
+ #print('x1_np:',x1_np)
166
+ #print('\nx2_np:',x2_np)
167
+ #print('\ny_np:',y_np)
168
+ #print('\nbert_text:',bert_text_labels)
169
+ # print('bert max len:',max_len,',Over',maxT,':',over_num,'ave len:',ave_len/len(instances),'total:',len(instances))
170
+
171
+ if label_type=='softmax':
172
+ y_np = np.expand_dims(y_np, 2)
173
+ elif label_type=='crf':
174
+ pass
175
+
176
+ return [x1_np, x2_np,x3_np], y_np,bert_text_labels
177
+
178
+
179
+ if __name__ == '__main__':
180
+ pass
181
+
182
+
183
+
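
In use, the representation layer above is constructed once from a tokenizer name and a label vocabulary, after which load_data_hugface turns ml_intext-style sentence lists into padded model inputs. A minimal sketch; the checkpoint id and the 'label.vocab' file (one tag per line: O, B-Gene, I-Gene) are assumptions for illustration, not paths shipped with this commit:

    rep = Hugface_RepresentationLayer(
        'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',  # assumed tokenizer checkpoint
        'label.vocab',                                           # hypothetical label vocabulary file
        lowercase=True)
    instances = [[['IL-2', 'B-Gene'], ['expression', 'O']]]      # one sentence of [word, label] pairs
    x, y, bert_text_labels = rep.load_data_hugface(instances, word_max_len=100, label_type='crf')
    # x = [input_ids, token_type_ids, attention_mask], each padded/truncated to word_max_len;
    # bert_text_labels records, per original token, its label and first-subtoken index.
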
src_python/GeneNER/restore_index_ner.py CHANGED
@@ -1,447 +1,447 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Fri Mar 5 10:40:08 2021
4
-
5
- @author: luol2
6
- """
7
-
8
- # -*- coding: utf-8 -*-
9
- """
10
- Created on Sun Jun 14 17:19:02 2020
11
-
12
- @author: luol2
13
- """
14
-
15
- import io
16
- import sys
17
-
18
- # from BIO format to entities; each result item is [sentence, entity list], with each entity as (start, end, type)
19
- def NN_BIO_tag_entity(pre_BIO):
20
- sentences=pre_BIO.strip().split('\n\n')
21
-
22
- pre_result=[]
23
- #print(sentences)
24
- for sent in sentences:
25
- tokens=sent.split('\n')
26
- pre_entity=[]
27
- pre_start,pre_end=0,0
28
- sent_text=''
29
- for i in range(0,len(tokens)):
30
- segs=tokens[i].split('\t')
31
- sent_text+=segs[0]+' '
32
- if len(segs)<3:
33
- continue
34
- #print(tokens)
35
- # generate prediction entity
36
- if segs[2].startswith('B-')>0:
37
- pre_start=i
38
- pre_type=segs[2][2:]
39
- if i+1>=len(tokens): # the last word
40
- pre_end=i
41
- pre_entity.append([pre_start,pre_end,pre_type])
42
- else: # non last word
43
- next_seg=tokens[i+1].split('\t')
44
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
45
- pre_end=i
46
- pre_entity.append([pre_start,pre_end,pre_type])
47
- elif next_seg[2].startswith('I-')>0:
48
- pass
49
- elif segs[2].startswith('I-')>0:
50
- if i==0 and i+1<len(tokens): # the first word and not only a word
51
- pre_start=i
52
- pre_type=segs[2][2:]
53
- next_seg=tokens[i+1].split('\t')
54
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
55
- pre_end=i
56
- pre_entity.append([pre_start,pre_end,pre_type])
57
- elif next_seg[2].startswith('I-')>0:
58
- pass
59
- elif i==0 and i+1==len(tokens):# only one word:
60
- pre_start=i
61
- pre_type=segs[2][2:]
62
- pre_end=i
63
- pre_entity.append([pre_start,pre_end,pre_type])
64
- elif i+1>=len(tokens): # the last word
65
- last_seg=tokens[i-1].split('\t')
66
- if last_seg[2]=='O':
67
- pre_start=i
68
- pre_type=segs[2][2:]
69
- pre_end=i
70
- pre_entity.append([pre_start,pre_end,pre_type])
71
- elif i+1< len(tokens): # non last word
72
- next_seg=tokens[i+1].split('\t')
73
- last_seg=tokens[i-1].split('\t')
74
- if last_seg[2]=='O':
75
- pre_start=i
76
- pre_type=segs[2][2:]
77
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
78
- pre_end=i
79
- pre_entity.append([pre_start,pre_end,pre_type])
80
- elif next_seg[2].startswith('I-')>0:
81
- pass
82
- elif segs[2]=='O':
83
- pass
84
- pre_result.append([sent_text.rstrip(),pre_entity])
85
-
86
-
87
- # print(pre_entity)
88
- return pre_result
89
-
90
- def NN_restore_index_fn(ori_text,file_pre):
91
-
92
- input_result=NN_BIO_tag_entity(file_pre)
93
- #print(input_result)
94
-
95
-
96
- new_sentence=''
97
- restore_result=[]
98
-
99
- sentence_ori=ori_text.lower()
100
-
101
- for sent_ele in input_result:
102
-
103
- #print(pre_lines)
104
- # print(sentence_ori)
105
- if len(sent_ele[1])>0:
106
- #print(pre_lines)
107
- sentence_pre=sent_ele[0].lower()
108
- sentence_pre=sentence_pre.split()
109
-
110
- pre_result=sent_ele[1]
111
-
112
-
113
- restore_sid=0
114
- restore_eid=0
115
- each_word_id=[]
116
-
117
- for i in range(0,len(sentence_pre)):
118
-
119
- temp_id=sentence_ori.find(sentence_pre[i])
120
- if temp_id<0:
121
- #print('ori:',sentence_ori)
122
- print('restore index error:',sentence_pre[i])
123
- new_sentence+=sentence_ori[0:temp_id]
124
-
125
- restore_sid=len(new_sentence)
126
- restore_eid=len(new_sentence)+len(sentence_pre[i])
127
- each_word_id.append([str(restore_sid),str(restore_eid)])
128
- new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
129
- sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
130
- # print('each_word:',each_word_id)
131
- for pre_ele in pre_result:
132
- temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
133
- if temp_pre_result not in restore_result:
134
- restore_result.append(temp_pre_result)
135
- else:
136
- sentence_pre=sent_ele[0].lower()
137
- sentence_pre=sentence_pre.split()
138
-
139
- for i in range(0,len(sentence_pre)):
140
-
141
- temp_id=sentence_ori.find(sentence_pre[i])
142
- if temp_id<0:
143
- print('restore index error:',sentence_pre[i])
144
- new_sentence+=sentence_ori[0:temp_id]
145
- new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
146
- sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
147
- #print('restore:',restore_result)
148
- return restore_result
149
-
150
- def BERT_BIO_tag_entity(pre_BIO):
151
- sentences=pre_BIO.strip().split('\n\n')
152
-
153
- pre_result=[]
154
- for sent in sentences:
155
- tokens=sent.split('\n')
156
- pre_entity=[]
157
- pre_start,pre_end=0,0
158
- sent_text=''
159
- for i in range(1,len(tokens)-1):
160
- segs=tokens[i].split('\t')
161
- sent_text+=segs[0]+' '
162
- # generate prediction entity
163
- if segs[2].startswith('B-')>0:
164
- pre_start=i
165
- pre_type=segs[2][2:]
166
- if i+1>=len(tokens): # the last word
167
- pre_end=i
168
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
169
- else: # non last word
170
- next_seg=tokens[i+1].split('\t')
171
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
172
- pre_end=i
173
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
174
- elif next_seg[2].startswith('I-')>0:
175
- pass
176
- elif segs[2].startswith('I-')>0:
177
- if i==0 and i+1<len(tokens): # the first word and not only a word
178
- pre_start=i
179
- pre_type=segs[2][2:]
180
- next_seg=tokens[i+1].split('\t')
181
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
182
- pre_end=i
183
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
184
- elif next_seg[2].startswith('I-')>0:
185
- pass
186
- elif i==0 and i+1==len(tokens):# only one word:
187
- pre_start=i
188
- pre_type=segs[2][2:]
189
- pre_end=i
190
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
191
- elif i+1>=len(tokens): # the last word
192
- last_seg=tokens[i-1].split('\t')
193
- if last_seg[2]=='O':
194
- pre_start=i
195
- pre_type=segs[2][2:]
196
- pre_end=i
197
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
198
- elif i+1< len(tokens): # non last word
199
- next_seg=tokens[i+1].split('\t')
200
- last_seg=tokens[i-1].split('\t')
201
- if last_seg[2]=='O':
202
- pre_start=i
203
- pre_type=segs[2][2:]
204
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
205
- pre_end=i
206
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
207
- elif next_seg[2].startswith('I-')>0:
208
- pass
209
- elif segs[2]=='O':
210
- pass
211
- pre_result.append([sent_text.rstrip(),pre_entity])
212
-
213
-
214
- #print(pre_result)
215
- return pre_result
216
-
217
- def BERT_BIO_tag_entity_revised(pre_BIO):
218
- print('revised version')
219
- sentences=pre_BIO.strip().split('\n\n')
220
-
221
- pre_result=[]
222
- for sent in sentences:
223
- tokens=sent.split('\n')
224
- pre_entity=[]
225
- pre_start,pre_end=0,0
226
- sent_text=''
227
- for i in range(1,len(tokens)-1):
228
- segs=tokens[i].split('\t')
229
- sent_text+=segs[0]+' '
230
- # generate prediction entity
231
- if segs[2].startswith('B-')>0:
232
- pre_start=i
233
- pre_type=segs[2][2:]
234
- if i+1>=len(tokens)-1: # the last word
235
- pre_end=i
236
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
237
- else: # non last word
238
- next_seg=tokens[i+1].split('\t')
239
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
240
- pre_end=i
241
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
242
- elif next_seg[2].startswith('I-')>0:
243
- pass
244
- elif segs[2].startswith('I-')>0:
245
- if i==1 and i+1<len(tokens)-1: # the first word and not only a word
246
- pre_start=i
247
- pre_type=segs[2][2:]
248
- next_seg=tokens[i+1].split('\t')
249
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
250
- pre_end=i
251
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
252
- elif next_seg[2].startswith('I-')>0:
253
- pass
254
- elif i==1 and i+1==len(tokens)-1:# only one word:
255
- pre_start=i
256
- pre_type=segs[2][2:]
257
- pre_end=i
258
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
259
- elif i+1>=len(tokens)-1: # the last word
260
- last_seg=tokens[i-1].split('\t')
261
- if last_seg[2]=='O':
262
- pre_start=i
263
- pre_type=segs[2][2:]
264
- pre_end=i
265
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
266
- elif i+1< len(tokens)-1: # non last word
267
- next_seg=tokens[i+1].split('\t')
268
- last_seg=tokens[i-1].split('\t')
269
- if last_seg[2]=='O':
270
- pre_start=i
271
- pre_type=segs[2][2:]
272
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
273
- pre_end=i
274
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
275
- elif next_seg[2].startswith('I-')>0:
276
- pass
277
- elif segs[2]=='O':
278
- pass
279
- pre_result.append([sent_text.rstrip(),pre_entity])
280
-
281
-
282
- #print(pre_result)
283
- return pre_result
284
-
285
- # only the prediction on the first subtoken of each original word is used
286
- def BERT_BIO_tag_entity_word(pre_BIO):
287
- sentences=pre_BIO.strip().split('\n\n')
288
-
289
- pre_result=[]
290
- for sent in sentences:
291
- tokens=sent.split('\n')
292
- pre_entity=[]
293
- pre_start,pre_end=0,0
294
- sent_text=''
295
- i=1
296
- while i< len(tokens)-1:
297
- # for i in range(1,len(tokens)-1):
298
- segs=tokens[i].split('\t')
299
- sent_text+=segs[0]+' '
300
- # generate prediction entity
301
- if segs[2].startswith('B-')>0:
302
- pre_start=i
303
- pre_type=segs[2][2:]
304
- if i+1>=len(tokens)-1: # the last word
305
- pre_end=i
306
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
307
- else: # non last word
308
- #pass a word
309
- sub_segs=tokens[i+1].split('\t')
310
- while(sub_segs[0].find('##')==0):
311
- i+=1
312
- sent_text+=sub_segs[0]+' '
313
- sub_segs=tokens[i+1].split('\t')
314
-
315
-
316
- next_seg=tokens[i+1].split('\t')
317
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
318
- pre_end=i
319
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
320
- elif next_seg[2].startswith('I-')>0:
321
- pass
322
- elif segs[2].startswith('I-')>0:
323
- if i==1 and i+1<len(tokens)-1: # the first word and not only a word
324
- pre_start=i
325
- pre_type=segs[2][2:]
326
- #pass a word
327
- sub_segs=tokens[i+1].split('\t')
328
- while(sub_segs[0].find('##')==0):
329
- i+=1
330
- sent_text+=sub_segs[0]+' '
331
- sub_segs=tokens[i+1].split('\t')
332
-
333
- next_seg=tokens[i+1].split('\t')
334
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
335
- pre_end=i
336
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
337
- elif next_seg[2].startswith('I-')>0:
338
- pass
339
- elif i==1 and i+1==len(tokens)-1:# only one word:
340
- pre_start=i
341
- pre_type=segs[2][2:]
342
- pre_end=i
343
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
344
- elif i+1>=len(tokens)-1: # the last word
345
- last_seg=tokens[i-1].split('\t')
346
- if last_seg[2]=='O':
347
- pre_start=i
348
- pre_type=segs[2][2:]
349
- pre_end=i
350
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
351
- elif i+1< len(tokens)-1: # non last word
352
-
353
- last_seg=tokens[i-1].split('\t')
354
- if last_seg[2]=='O':
355
- pre_start=i
356
- pre_type=segs[2][2:]
357
- #pass a word
358
- sub_segs=tokens[i+1].split('\t')
359
- while(sub_segs[0].find('##')==0):
360
- i+=1
361
- sent_text+=sub_segs[0]+' '
362
- sub_segs=tokens[i+1].split('\t')
363
- next_seg=tokens[i+1].split('\t')
364
- if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
365
- pre_end=i
366
- pre_entity.append([pre_start-1,pre_end-1,pre_type])
367
- elif next_seg[2].startswith('I-')>0:
368
- pass
369
- elif segs[2]=='O':
370
- pass
371
- i+=1
372
- pre_result.append([sent_text.rstrip(),pre_entity])
373
-
374
-
375
- #print(pre_result)
376
- return pre_result
377
-
378
-
379
- def BERT_restore_index_fn(ori_text,file_pre):
380
-
381
- # input_result=BERT_BIO_tag_entity_revised(file_pre)
382
- input_result=BERT_BIO_tag_entity_word(file_pre)
383
- #print(input_result)
384
-
385
-
386
- new_sentence=''
387
- restore_result=[]
388
-
389
- sentence_ori=ori_text.lower()
390
-
391
- for sent_ele in input_result:
392
-
393
- #print(pre_lines)
394
- # print(sentence_ori)
395
- if len(sent_ele[1])>0:
396
- #print(pre_lines)
397
- sentence_pre=sent_ele[0].lower()
398
- sentence_pre=sentence_pre.split()
399
-
400
- pre_result=sent_ele[1]
401
-
402
-
403
- restore_sid=0
404
- restore_eid=0
405
- each_word_id=[]
406
-
407
-
408
- for i in range(0,len(sentence_pre)):
409
- if sentence_pre[i][0:2]=="##":
410
- sentence_pre[i]=sentence_pre[i][2:]
411
- temp_id=sentence_ori.find(sentence_pre[i])
412
- if temp_id<0:
413
- #print('ori:',sentence_ori)
414
- print('restore index error:',sentence_pre[i])
415
- new_sentence+=sentence_ori[0:temp_id]
416
-
417
- restore_sid=len(new_sentence)
418
- restore_eid=len(new_sentence)+len(sentence_pre[i])
419
- each_word_id.append([str(restore_sid),str(restore_eid)])
420
- new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
421
- sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
422
- # print('each_word:',each_word_id)
423
- for pre_ele in pre_result:
424
- temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
425
- if temp_pre_result not in restore_result:
426
- restore_result.append(temp_pre_result)
427
- else:
428
- sentence_pre=sent_ele[0].lower()
429
- sentence_pre=sentence_pre.split()
430
-
431
- for i in range(0,len(sentence_pre)):
432
- if sentence_pre[i][0:2]=="##":
433
- sentence_pre[i]=sentence_pre[i][2:]
434
- temp_id=sentence_ori.find(sentence_pre[i])
435
- if temp_id<0:
436
- print('restore index error:',sentence_pre[i])
437
- new_sentence+=sentence_ori[0:temp_id]
438
- new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
439
- sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
440
- #print('restore:',restore_result)
441
- return restore_result
442
- if __name__=='__main__':
443
- path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/'
444
- fin=open(path+'devout_test.txt','r',encoding='utf-8')
445
- file_pre=fin.read()
446
- ori_text="D90A-SOD1 mediated amyotrophic lateral sclerosis: a single founder for all cases with evidence for a Cis-acting disease modifier in the recessive haplotype. More than 100 different heterozygous mutations in copper/zinc superoxide dismutase (SOD1) have been found in patients with amyotrophic lateral sclerosis (ALS), a fatal neurodegenerative disease. Uniquely, D90A-SOD1 has been identified in recessive, dominant and apparently sporadic pedigrees. The phenotype of homozygotes is stereotyped with an extended survival, whereas that of affected heterozygotes varies. The frequency of D90A-SOD1 is 50 times higher in Scandinavia (2.5%) than elsewhere, though ALS prevalence is not raised there. Our earlier study indicated separate founders for recessive and dominant/sporadic ALS and we proposed a disease-modifying factor linked to the recessive mutation. Here we have doubled our sample set and employed novel markers to characterise the mutation's origin and localise any modifying factor. Linkage disequilibrium analysis indicates that D90A homozygotes and heterozygotes share a rare haplotype and are all descended from a single ancient founder (alpha 0.974) c.895 generations ago. Homozygotes arose subsequently only c.63 generations ago (alpha 0.878). Recombination has reduced the region shared by recessive kindreds to 97-265 kb around SOD1, excluding all neighbouring genes. We propose that a cis-acting regulatory polymorphism has arisen close to D90A-SOD1 in the recessive founder, which decreases ALS susceptibility in heterozygotes and slows disease progression."
447
- NN_restore_index_fn(ori_text,file_pre)
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Mar 5 10:40:08 2021
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ # -*- coding: utf-8 -*-
9
+ """
10
+ Created on Sun Jun 14 17:19:02 2020
11
+
12
+ @author: luol2
13
+ """
14
+
15
+ import io
16
+ import sys
17
+
18
+ # from BIO format to entities; each result item is [sentence, entity list], with each entity as (start, end, type)
19
+ def NN_BIO_tag_entity(pre_BIO):
20
+ sentences=pre_BIO.strip().split('\n\n')
21
+
22
+ pre_result=[]
23
+ #print(sentences)
24
+ for sent in sentences:
25
+ tokens=sent.split('\n')
26
+ pre_entity=[]
27
+ pre_start,pre_end=0,0
28
+ sent_text=''
29
+ for i in range(0,len(tokens)):
30
+ segs=tokens[i].split('\t')
31
+ sent_text+=segs[0]+' '
32
+ if len(segs)<3:
33
+ continue
34
+ #print(tokens)
35
+ # generate prediction entity
36
+ if segs[2].startswith('B-')>0:
37
+ pre_start=i
38
+ pre_type=segs[2][2:]
39
+ if i+1>=len(tokens): # the last word
40
+ pre_end=i
41
+ pre_entity.append([pre_start,pre_end,pre_type])
42
+ else: # non last word
43
+ next_seg=tokens[i+1].split('\t')
44
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
45
+ pre_end=i
46
+ pre_entity.append([pre_start,pre_end,pre_type])
47
+ elif next_seg[2].startswith('I-')>0:
48
+ pass
49
+ elif segs[2].startswith('I-')>0:
50
+ if i==0 and i+1<len(tokens): # the first word and not only a word
51
+ pre_start=i
52
+ pre_type=segs[2][2:]
53
+ next_seg=tokens[i+1].split('\t')
54
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
55
+ pre_end=i
56
+ pre_entity.append([pre_start,pre_end,pre_type])
57
+ elif next_seg[2].startswith('I-')>0:
58
+ pass
59
+ elif i==0 and i+1==len(tokens):# only one word:
60
+ pre_start=i
61
+ pre_type=segs[2][2:]
62
+ pre_end=i
63
+ pre_entity.append([pre_start,pre_end,pre_type])
64
+ elif i+1>=len(tokens): # the last word
65
+ last_seg=tokens[i-1].split('\t')
66
+ if last_seg[2]=='O':
67
+ pre_start=i
68
+ pre_type=segs[2][2:]
69
+ pre_end=i
70
+ pre_entity.append([pre_start,pre_end,pre_type])
71
+ elif i+1< len(tokens): # non last word
72
+ next_seg=tokens[i+1].split('\t')
73
+ last_seg=tokens[i-1].split('\t')
74
+ if last_seg[2]=='O':
75
+ pre_start=i
76
+ pre_type=segs[2][2:]
77
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
78
+ pre_end=i
79
+ pre_entity.append([pre_start,pre_end,pre_type])
80
+ elif next_seg[2].startswith('I-')>0:
81
+ pass
82
+ elif segs[2]=='O':
83
+ pass
84
+ pre_result.append([sent_text.rstrip(),pre_entity])
85
+
86
+
87
+ # print(pre_entity)
88
+ return pre_result
89
+
90
+ def NN_restore_index_fn(ori_text,file_pre):
91
+
92
+ input_result=NN_BIO_tag_entity(file_pre)
93
+ #print(input_result)
94
+
95
+
96
+ new_sentence=''
97
+ restore_result=[]
98
+
99
+ sentence_ori=ori_text.lower()
100
+
101
+ for sent_ele in input_result:
102
+
103
+ #print(pre_lines)
104
+ # print(sentence_ori)
105
+ if len(sent_ele[1])>0:
106
+ #print(pre_lines)
107
+ sentence_pre=sent_ele[0].lower()
108
+ sentence_pre=sentence_pre.split()
109
+
110
+ pre_result=sent_ele[1]
111
+
112
+
113
+ restore_sid=0
114
+ restore_eid=0
115
+ each_word_id=[]
116
+
117
+ for i in range(0,len(sentence_pre)):
118
+
119
+ temp_id=sentence_ori.find(sentence_pre[i])
120
+ if temp_id<0:
121
+ #print('ori:',sentence_ori)
122
+ print('restore index error:',sentence_pre[i])
123
+ new_sentence+=sentence_ori[0:temp_id]
124
+
125
+ restore_sid=len(new_sentence)
126
+ restore_eid=len(new_sentence)+len(sentence_pre[i])
127
+ each_word_id.append([str(restore_sid),str(restore_eid)])
128
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
129
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
130
+ # print('each_word:',each_word_id)
131
+ for pre_ele in pre_result:
132
+ temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
133
+ if temp_pre_result not in restore_result:
134
+ restore_result.append(temp_pre_result)
135
+ else:
136
+ sentence_pre=sent_ele[0].lower()
137
+ sentence_pre=sentence_pre.split()
138
+
139
+ for i in range(0,len(sentence_pre)):
140
+
141
+ temp_id=sentence_ori.find(sentence_pre[i])
142
+ if temp_id<0:
143
+ print('restore index error:',sentence_pre[i])
144
+ new_sentence+=sentence_ori[0:temp_id]
145
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
146
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
147
+ #print('restore:',restore_result)
148
+ return restore_result
149
+
150
+ def BERT_BIO_tag_entity(pre_BIO):
151
+ sentences=pre_BIO.strip().split('\n\n')
152
+
153
+ pre_result=[]
154
+ for sent in sentences:
155
+ tokens=sent.split('\n')
156
+ pre_entity=[]
157
+ pre_start,pre_end=0,0
158
+ sent_text=''
159
+ for i in range(1,len(tokens)-1):
160
+ segs=tokens[i].split('\t')
161
+ sent_text+=segs[0]+' '
162
+ # generate prediction entity
163
+ if segs[2].startswith('B-')>0:
164
+ pre_start=i
165
+ pre_type=segs[2][2:]
166
+ if i+1>=len(tokens): # the last word
167
+ pre_end=i
168
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
169
+ else: # non last word
170
+ next_seg=tokens[i+1].split('\t')
171
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
172
+ pre_end=i
173
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
174
+ elif next_seg[2].startswith('I-')>0:
175
+ pass
176
+ elif segs[2].startswith('I-')>0:
177
+ if i==0 and i+1<len(tokens): # the first word and not only a word
178
+ pre_start=i
179
+ pre_type=segs[2][2:]
180
+ next_seg=tokens[i+1].split('\t')
181
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
182
+ pre_end=i
183
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
184
+ elif next_seg[2].startswith('I-')>0:
185
+ pass
186
+ elif i==0 and i+1==len(tokens):# only one word:
187
+ pre_start=i
188
+ pre_type=segs[2][2:]
189
+ pre_end=i
190
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
191
+ elif i+1>=len(tokens): # the last word
192
+ last_seg=tokens[i-1].split('\t')
193
+ if last_seg[2]=='O':
194
+ pre_start=i
195
+ pre_type=segs[2][2:]
196
+ pre_end=i
197
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
198
+ elif i+1< len(tokens): # non last word
199
+ next_seg=tokens[i+1].split('\t')
200
+ last_seg=tokens[i-1].split('\t')
201
+ if last_seg[2]=='O':
202
+ pre_start=i
203
+ pre_type=segs[2][2:]
204
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
205
+ pre_end=i
206
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
207
+ elif next_seg[2].startswith('I-')>0:
208
+ pass
209
+ elif segs[2]=='O':
210
+ pass
211
+ pre_result.append([sent_text.rstrip(),pre_entity])
212
+
213
+
214
+ #print(pre_result)
215
+ return pre_result
216
+
217
+ def BERT_BIO_tag_entity_revised(pre_BIO):
218
+ print('revised version')
219
+ sentences=pre_BIO.strip().split('\n\n')
220
+
221
+ pre_result=[]
222
+ for sent in sentences:
223
+ tokens=sent.split('\n')
224
+ pre_entity=[]
225
+ pre_start,pre_end=0,0
226
+ sent_text=''
227
+ for i in range(1,len(tokens)-1):
228
+ segs=tokens[i].split('\t')
229
+ sent_text+=segs[0]+' '
230
+ # generate prediction entity
231
+ if segs[2].startswith('B-')>0:
232
+ pre_start=i
233
+ pre_type=segs[2][2:]
234
+ if i+1>=len(tokens)-1: # the last word
235
+ pre_end=i
236
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
237
+ else: # non last word
238
+ next_seg=tokens[i+1].split('\t')
239
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
240
+ pre_end=i
241
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
242
+ elif next_seg[2].startswith('I-')>0:
243
+ pass
244
+ elif segs[2].startswith('I-')>0:
245
+ if i==1 and i+1<len(tokens)-1: # the first word and not only a word
246
+ pre_start=i
247
+ pre_type=segs[2][2:]
248
+ next_seg=tokens[i+1].split('\t')
249
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
250
+ pre_end=i
251
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
252
+ elif next_seg[2].startswith('I-')>0:
253
+ pass
254
+ elif i==1 and i+1==len(tokens)-1:# only one word:
255
+ pre_start=i
256
+ pre_type=segs[2][2:]
257
+ pre_end=i
258
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
259
+ elif i+1>=len(tokens)-1: # the last word
260
+ last_seg=tokens[i-1].split('\t')
261
+ if last_seg[2]=='O':
262
+ pre_start=i
263
+ pre_type=segs[2][2:]
264
+ pre_end=i
265
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
266
+ elif i+1< len(tokens)-1: # non last word
267
+ next_seg=tokens[i+1].split('\t')
268
+ last_seg=tokens[i-1].split('\t')
269
+ if last_seg[2]=='O':
270
+ pre_start=i
271
+ pre_type=segs[2][2:]
272
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
273
+ pre_end=i
274
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
275
+ elif next_seg[2].startswith('I-')>0:
276
+ pass
277
+ elif segs[2]=='O':
278
+ pass
279
+ pre_result.append([sent_text.rstrip(),pre_entity])
280
+
281
+
282
+ #print(pre_result)
283
+ return pre_result
284
+
285
+ # only the prediction on the first subtoken of each original word is used
286
+ def BERT_BIO_tag_entity_word(pre_BIO):
287
+ sentences=pre_BIO.strip().split('\n\n')
288
+
289
+ pre_result=[]
290
+ for sent in sentences:
291
+ tokens=sent.split('\n')
292
+ pre_entity=[]
293
+ pre_start,pre_end=0,0
294
+ sent_text=''
295
+ i=1
296
+ while i< len(tokens)-1:
297
+ # for i in range(1,len(tokens)-1):
298
+ segs=tokens[i].split('\t')
299
+ sent_text+=segs[0]+' '
300
+ # generate prediction entity
301
+ if segs[2].startswith('B-')>0:
302
+ pre_start=i
303
+ pre_type=segs[2][2:]
304
+ if i+1>=len(tokens)-1: # the last word
305
+ pre_end=i
306
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
307
+ else: # non last word
308
+ #pass a word
309
+ sub_segs=tokens[i+1].split('\t')
310
+ while(sub_segs[0].find('##')==0):
311
+ i+=1
312
+ sent_text+=sub_segs[0]+' '
313
+ sub_segs=tokens[i+1].split('\t')
314
+
315
+
316
+ next_seg=tokens[i+1].split('\t')
317
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
318
+ pre_end=i
319
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
320
+ elif next_seg[2].startswith('I-')>0:
321
+ pass
322
+ elif segs[2].startswith('I-')>0:
323
+ if i==1 and i+1<len(tokens)-1: # the first word and not only a word
324
+ pre_start=i
325
+ pre_type=segs[2][2:]
326
+ #pass a word
327
+ sub_segs=tokens[i+1].split('\t')
328
+ while(sub_segs[0].find('##')==0):
329
+ i+=1
330
+ sent_text+=sub_segs[0]+' '
331
+ sub_segs=tokens[i+1].split('\t')
332
+
333
+ next_seg=tokens[i+1].split('\t')
334
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
335
+ pre_end=i
336
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
337
+ elif next_seg[2].startswith('I-')>0:
338
+ pass
339
+ elif i==1 and i+1==len(tokens)-1:# only one word:
340
+ pre_start=i
341
+ pre_type=segs[2][2:]
342
+ pre_end=i
343
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
344
+ elif i+1>=len(tokens)-1: # the last word
345
+ last_seg=tokens[i-1].split('\t')
346
+ if last_seg[2]=='O':
347
+ pre_start=i
348
+ pre_type=segs[2][2:]
349
+ pre_end=i
350
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
351
+ elif i+1< len(tokens)-1: # non last word
352
+
353
+ last_seg=tokens[i-1].split('\t')
354
+ if last_seg[2]=='O':
355
+ pre_start=i
356
+ pre_type=segs[2][2:]
357
+ #pass a word
358
+ sub_segs=tokens[i+1].split('\t')
359
+ while(sub_segs[0].find('##')==0):
360
+ i+=1
361
+ sent_text+=sub_segs[0]+' '
362
+ sub_segs=tokens[i+1].split('\t')
363
+ next_seg=tokens[i+1].split('\t')
364
+ if next_seg[2].startswith('B-')>0 or next_seg[2]=='O':
365
+ pre_end=i
366
+ pre_entity.append([pre_start-1,pre_end-1,pre_type])
367
+ elif next_seg[2].startswith('I-')>0:
368
+ pass
369
+ elif segs[2]=='O':
370
+ pass
371
+ i+=1
372
+ pre_result.append([sent_text.rstrip(),pre_entity])
373
+
374
+
375
+ #print(pre_result)
376
+ return pre_result
377
+
378
+
379
+ def BERT_restore_index_fn(ori_text,file_pre):
380
+
381
+ # input_result=BERT_BIO_tag_entity_revised(file_pre)
382
+ input_result=BERT_BIO_tag_entity_word(file_pre)
383
+ #print(input_result)
384
+
385
+
386
+ new_sentence=''
387
+ restore_result=[]
388
+
389
+ sentence_ori=ori_text.lower()
390
+
391
+ for sent_ele in input_result:
392
+
393
+ #print(pre_lines)
394
+ # print(sentence_ori)
395
+ if len(sent_ele[1])>0:
396
+ #print(pre_lines)
397
+ sentence_pre=sent_ele[0].lower()
398
+ sentence_pre=sentence_pre.split()
399
+
400
+ pre_result=sent_ele[1]
401
+
402
+
403
+ restore_sid=0
404
+ restore_eid=0
405
+ each_word_id=[]
406
+
407
+
408
+ for i in range(0,len(sentence_pre)):
409
+ if sentence_pre[i][0:2]=="##":
410
+ sentence_pre[i]=sentence_pre[i][2:]
411
+ temp_id=sentence_ori.find(sentence_pre[i])
412
+ if temp_id<0:
413
+ #print('ori:',sentence_ori)
414
+ print('restore index error:',sentence_pre[i])
415
+ new_sentence+=sentence_ori[0:temp_id]
416
+
417
+ restore_sid=len(new_sentence)
418
+ restore_eid=len(new_sentence)+len(sentence_pre[i])
419
+ each_word_id.append([str(restore_sid),str(restore_eid)])
420
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
421
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
422
+ # print('each_word:',each_word_id)
423
+ for pre_ele in pre_result:
424
+ temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[2]]
425
+ if temp_pre_result not in restore_result:
426
+ restore_result.append(temp_pre_result)
427
+ else:
428
+ sentence_pre=sent_ele[0].lower()
429
+ sentence_pre=sentence_pre.split()
430
+
431
+ for i in range(0,len(sentence_pre)):
432
+ if sentence_pre[i][0:2]=="##":
433
+ sentence_pre[i]=sentence_pre[i][2:]
434
+ temp_id=sentence_ori.find(sentence_pre[i])
435
+ if temp_id<0:
436
+ print('restore index error:',sentence_pre[i])
437
+ new_sentence+=sentence_ori[0:temp_id]
438
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
439
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
440
+ #print('restore:',restore_result)
441
+ return restore_result
442
+ if __name__=='__main__':
443
+ path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/'
444
+ fin=open(path+'devout_test.txt','r',encoding='utf-8')
445
+ file_pre=fin.read()
446
+ ori_text="D90A-SOD1 mediated amyotrophic lateral sclerosis: a single founder for all cases with evidence for a Cis-acting disease modifier in the recessive haplotype. More than 100 different heterozygous mutations in copper/zinc superoxide dismutase (SOD1) have been found in patients with amyotrophic lateral sclerosis (ALS), a fatal neurodegenerative disease. Uniquely, D90A-SOD1 has been identified in recessive, dominant and apparently sporadic pedigrees. The phenotype of homozygotes is stereotyped with an extended survival, whereas that of affected heterozygotes varies. The frequency of D90A-SOD1 is 50 times higher in Scandinavia (2.5%) than elsewhere, though ALS prevalence is not raised there. Our earlier study indicated separate founders for recessive and dominant/sporadic ALS and we proposed a disease-modifying factor linked to the recessive mutation. Here we have doubled our sample set and employed novel markers to characterise the mutation's origin and localise any modifying factor. Linkage disequilibrium analysis indicates that D90A homozygotes and heterozygotes share a rare haplotype and are all descended from a single ancient founder (alpha 0.974) c.895 generations ago. Homozygotes arose subsequently only c.63 generations ago (alpha 0.878). Recombination has reduced the region shared by recessive kindreds to 97-265 kb around SOD1, excluding all neighbouring genes. We propose that a cis-acting regulatory polymorphism has arisen close to D90A-SOD1 in the recessive founder, which decreases ALS susceptibility in heterozygotes and slows disease progression."
447
+ NN_restore_index_fn(ori_text,file_pre)
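
Because the BIO files carry lower-cased, whitespace-joined tokens, the restore functions above walk the original string with find() to recover character offsets. A minimal sketch, assuming BERT_restore_index_fn is in scope; the three-column rows (token, gold, predicted) and the [CLS]/[SEP] guard rows mirror what the tagger writes:

    ori_text = 'IL2 expression'
    file_pre = ('[CLS]\tO\tO\n'
                'il2\tB-Gene\tB-Gene\n'
                'expression\tO\tO\n'
                '[SEP]\tO\tO')
    print(BERT_restore_index_fn(ori_text, file_pre))
    # [['0', '3', 'Gene']] -- character start/end offsets into ori_text plus the entity type
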
src_python/SpeAss/Evaluation_sa.py CHANGED
@@ -1,396 +1,396 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Mon Mar 1 15:33:54 2021
4
-
5
- @author: luol2
6
- """
7
- # compute metrics from an IO-format prediction file (token, gold, predicted)
8
- #ignore arg1
9
- def Rel_Evaluation(prefile):
10
- fin=open(prefile,'r',encoding='utf-8')
11
- all_in=fin.read().strip().split('\n\n')
12
- fin.close()
13
- TP=0 #gold=pre=pos
14
- FP=0 #gold=neg, pre=pos
15
- FN=0 #gold=pos, pre=Neg
16
- for sentence in all_in:
17
- tokens=sentence.split('\n')
18
- entity_id=0
19
- token_id=0
20
- temp_gold='O'
21
- temp_pre='O'
22
- while (token_id<len(tokens)):
23
- seg=tokens[token_id].split('\t')
24
- if seg[0]=='<GENE>':
25
- if seg[1]=='O':
26
- temp_gold=seg[1]
27
- else:
28
- temp_gold=seg[1][2:]
29
- if seg[2]=='O':
30
- temp_pre=seg[2]
31
- else:
32
- temp_pre=seg[2][2:]
33
- token_id+=1
34
- seg=tokens[token_id].split('\t')
35
- while seg[0]!='</GENE>':
36
- token_id+=1
37
- seg=tokens[token_id].split('\t')
38
- if seg[1]!='O' and temp_gold=='O':
39
- temp_gold=seg[1][2:]
40
- if seg[2]!='O' and temp_pre=='O':
41
- temp_pre=seg[2][2:]
42
- if temp_pre!='O' and temp_gold!='O' and temp_pre==temp_gold:
43
- TP+=1
44
- elif temp_pre!='O' and temp_gold!='O' and temp_pre!=temp_gold:
45
- FP+=1
46
- FN+=1
47
- elif temp_pre!='O' and temp_gold=='O' :
48
- FP+=1
49
- elif temp_pre=='O' and temp_gold!='O' :
50
- FN+=1
51
- temp_pre='O'
52
- temp_gold='O'
53
-
54
- else:
55
- pass
56
- token_id+=1
57
- # print('TP,FP,FN:',TP,FP,FN)
58
- if TP+FP==0:
59
- P=0
60
- else:
61
- P=TP/(TP+FP)
62
- if TP+FN==0:
63
- R=0
64
- else:
65
- R=TP/(TP+FN)
66
- if P+R==0:
67
- F1=0
68
- else:
69
- F1=2*P*R/(P+R)
70
- print('TP,FP,FN:',TP,FP,FN)
71
- print('P,R,F1:',P,R,F1)
72
-
73
-
74
- def Rel_Evaluation_fn(prefile):
75
- fin=open(prefile,'r',encoding='utf-8')
76
- all_in=fin.read().strip().split('\n\n')
77
- fin.close()
78
- TP=0 #gold=pre=pos
79
- FP=0 #gold=neg, pre=pos
80
- FN=0 #gold=pos, pre=Neg
81
- for sentence in all_in:
82
- tokens=sentence.split('\n')
83
- entity_id=0
84
- token_id=0
85
- temp_gold='O'
86
- temp_pre='O'
87
- while (token_id<len(tokens)):
88
- seg=tokens[token_id].split('\t')
89
- if seg[0]=='<GENE>':
90
- if seg[1]=='O':
91
- temp_gold=seg[1]
92
- else:
93
- temp_gold=seg[1][2:]
94
- if seg[2]=='O':
95
- temp_pre=seg[2]
96
- else:
97
- temp_pre=seg[2][2:]
98
- token_id+=1
99
- seg=tokens[token_id].split('\t')
100
- while seg[0]!='</GENE>':
101
- token_id+=1
102
- seg=tokens[token_id].split('\t')
103
- if seg[1]!='O' and temp_gold=='O':
104
- temp_gold=seg[1][2:]
105
- if seg[2]!='O' and temp_pre=='O':
106
- temp_pre=seg[2][2:]
107
- if temp_pre!='O' and temp_gold!='O' and temp_pre==temp_gold:
108
- TP+=1
109
- elif temp_pre!='O' and temp_gold!='O' and temp_pre!=temp_gold:
110
- FP+=1
111
- elif temp_pre!='O' and temp_gold=='O' :
112
- FP+=1
113
- elif temp_pre=='O' and temp_gold!='O' :
114
- FN+=1
115
- temp_pre='O'
116
- temp_gold='O'
117
-
118
- else:
119
- pass
120
- token_id+=1
121
- print('TP,FP,FN:',TP,FP,FN)
122
- if TP+FP==0:
123
- P=0
124
- else:
125
- P=TP/(TP+FP)
126
- if TP+FN==0:
127
- R=0
128
- else:
129
- R=TP/(TP+FN)
130
- if P+R==0:
131
- F1=0
132
- else:
133
- F1=2*P*R/(P+R)
134
- # print('TP,FP,FN:',TP,FP,FN)
135
- print('P,R,F1:',P,R,F1)
136
- return F1
137
-
138
- def Rel_Evaluation_Hugface_fn(prefile,ARG2_label='gene1s'):
139
- fin=open(prefile,'r',encoding='utf-8')
140
- all_in=fin.read().strip().split('\n\n')
141
- fin.close()
142
- TP=0 #gold=pre=pos
143
- FP=0 #gold=neg, pre=pos
144
- FN=0 #gold=pos, pre=Neg
145
- result_dict={}#{'rel type':[TP,FP,FN],...,}
146
- for sentence in all_in:
147
- tokens=sentence.split('\n')
148
- for token in tokens:
149
- seg=token.split('\t')
150
- if seg[0]==ARG2_label:
151
- if seg[1].find('ARG2')>=0:
152
- if seg[2]==seg[1]:
153
- if seg[1] not in result_dict.keys():
154
- result_dict[seg[1]]=[1,0,0]
155
- else:
156
- result_dict[seg[1]][0]+=1
157
- TP+=1
158
- elif seg[2].find('ARG2')>=0:
159
- if seg[1] not in result_dict.keys():
160
- result_dict[seg[1]]=[0,0,1]
161
- else:
162
- result_dict[seg[1]][2]+=1
163
- if seg[2] not in result_dict.keys():
164
- result_dict[seg[2]]=[0,1,0]
165
- else:
166
- result_dict[seg[2]][1]+=1
167
- FP+=1
168
- FN+=1
169
- else:
170
- if seg[1] not in result_dict.keys():
171
- result_dict[seg[1]]=[0,0,1]
172
- else:
173
- result_dict[seg[1]][2]+=1
174
- FN+=1
175
-
176
- else:
177
- if seg[2].find('ARG2')>=0:
178
- if seg[2] not in result_dict.keys():
179
- result_dict[seg[2]]=[0,1,0]
180
- else:
181
- result_dict[seg[2]][1]+=1
182
- FP+=1
183
- # print('TP,FP,FN:',TP,FP,FN)
184
- rel_metrics={}
185
- for rel_type in result_dict.keys():
186
- if result_dict[rel_type][0]+result_dict[rel_type][1]==0:
187
- p=0
188
- else:
189
- p=result_dict[rel_type][0]/(result_dict[rel_type][0]+result_dict[rel_type][1])
190
- if result_dict[rel_type][0]+result_dict[rel_type][2]==0:
191
- r=0
192
- else:
193
- r=result_dict[rel_type][0]/(result_dict[rel_type][0]+result_dict[rel_type][2])
194
- if p+r==0:
195
- f1=0
196
- else:
197
- f1=2*p*r/(p+r)
198
- rel_metrics[rel_type]=[round(p,4),round(r,4),round(f1,4)]
199
- if TP+FP==0:
200
- P=0
201
- else:
202
- P=TP/(TP+FP)
203
- if TP+FN==0:
204
- R=0
205
- else:
206
- R=TP/(TP+FN)
207
- if P+R==0:
208
- F1=0
209
- else:
210
- F1=2*P*R/(P+R)
211
- P=round(P,4)
212
- R=round(R,4)
213
- F1=round(F1,4)
214
- print('mertics:\n',rel_metrics)
215
- print('\nTP,FP,FN:',TP,FP,FN)
216
- print('Overall P,R,F1:',P,R,F1)
217
- return [P,R,F1],rel_metrics
218
-
219
- def Rel_Evaluation_AIO_fn(prefile):
220
- fin=open(prefile,'r',encoding='utf-8')
221
- all_in=fin.read().strip().split('\n\n')
222
- fin.close()
223
- TP=0 #gold=pre=pos
224
- FP=0 #gold=neg, pre=pos
225
- FN=0 #gold=pos, pre=Neg
226
- for sentence in all_in:
227
- tokens=sentence.split('\n')
228
- for token in tokens:
229
- seg=token.split('\t')
230
- if seg[0]=='<GENE>':
231
- if seg[1].find('ARG2-')>=0:
232
- if seg[2]==seg[1]:
233
- TP+=1
234
- elif seg[2].find('ARG2-')>=0:
235
- FP+=1
236
- FN+=1
237
- else:
238
- FN+=1
239
-
240
- else:
241
- if seg[2].find('ARG2-')>=0:
242
- FP+=1
243
- # print('TP,FP,FN:',TP,FP,FN)
244
- if TP+FP==0:
245
- P=0
246
- else:
247
- P=TP/(TP+FP)
248
- if TP+FN==0:
249
- R=0
250
- else:
251
- R=TP/(TP+FN)
252
- if P+R==0:
253
- F1=0
254
- else:
255
- F1=2*P*R/(P+R)
256
- P=round(P,4)
257
- R=round(R,4)
258
- F1=round(F1,4)
259
- print('TP,FP,FN:',TP,FP,FN)
260
- print('P,R,F1:',P,R,F1)
261
- return [P,R,F1]
262
-
263
- def Rel_Evaluation_AIO_GC_fn(prefile):
264
- fin=open(prefile,'r',encoding='utf-8')
265
- all_in=fin.read().strip().split('\n\n')
266
- fin.close()
267
- TP=0 #gold=pre=pos
268
- FP=0 #gold=neg, pre=pos
269
- FN=0 #gold=pos, pre=Neg
270
- for sentence in all_in:
271
- tokens=sentence.split('\n')
272
- for token in tokens:
273
- seg=token.split('\t')
274
- if seg[0]=='<CHEMICAL>':
275
- if seg[1].find('ARG2-')>=0:
276
- if seg[2]==seg[1]:
277
- TP+=1
278
- elif seg[2].find('ARG2-')>=0:
279
- FP+=1
280
- FN+=1
281
- else:
282
- FN+=1
283
-
284
- else:
285
- if seg[2].find('ARG2-')>=0:
286
- FP+=1
287
- # print('TP,FP,FN:',TP,FP,FN)
288
- if TP+FP==0:
289
- P=0
290
- else:
291
- P=TP/(TP+FP)
292
- if TP+FN==0:
293
- R=0
294
- else:
295
- R=TP/(TP+FN)
296
- if P+R==0:
297
- F1=0
298
- else:
299
- F1=2*P*R/(P+R)
300
- P=round(P,4)
301
- R=round(R,4)
302
- F1=round(F1,4)
303
- print('TP,FP,FN:',TP,FP,FN)
304
- print('P,R,F1:',P,R,F1)
305
- return [P,R,F1]
306
-
307
- def office_evaluation(goldfile,prefile):
308
- fin_gold=open(goldfile,'r',encoding='utf-8')
309
- all_gold=fin_gold.read().strip().split('\n')
310
- fin_gold.close()
311
- fin_pre=open(prefile,'r',encoding='utf-8')
312
- all_pre=fin_pre.read().strip().split('\n')
313
- fin_pre.close()
314
-
315
- gold_result={}#{'relation type':set(line)}
316
- pre_result={}
317
- all_result={} #{'relation type':[tp,fp,fn]}
318
- for line in all_gold:
319
- seg=line.split('\t')
320
- if seg[1] not in all_result.keys():
321
- all_result[seg[1]]=[0,0,0]
322
- if seg[1] not in gold_result.keys():
323
- gold_result[seg[1]]=set()
324
- gold_result[seg[1]].add(line)
325
- else:
326
- gold_result[seg[1]].add(line)
327
-
328
- for line in all_pre:
329
- seg=line.split('\t')
330
- if seg[1] not in pre_result.keys():
331
- pre_result[seg[1]]=set()
332
- pre_result[seg[1]].add(line)
333
- else:
334
- pre_result[seg[1]].add(line)
335
-
336
- for rel_type in gold_result.keys():
337
- for gold_ele in gold_result[rel_type]:
338
- if rel_type not in pre_result.keys():
339
- all_result[rel_type][2]+=1
340
- else:
341
- if gold_ele in pre_result[rel_type]:
342
- all_result[rel_type][0]+=1
343
- else:
344
- all_result[rel_type][2]+=1
345
- if rel_type in pre_result.keys():
346
- for pre_ele in pre_result[rel_type]:
347
- if pre_ele not in gold_result[rel_type]:
348
- all_result[rel_type][1]+=1
349
- ave_f=0
350
- TP,FP,FN=0,0,0
351
- print(all_result)
352
- for rel_type in all_result.keys():
353
- TP+=all_result[rel_type][0]
354
- FP+=all_result[rel_type][1]
355
- FN+=all_result[rel_type][2]
356
- tem_p,tem_r,tem_f=0,0,0
357
- if all_result[rel_type][0]+all_result[rel_type][1]==0:
358
- tem_p=0
359
- else:
360
- tem_p=all_result[rel_type][0]/(all_result[rel_type][0]+all_result[rel_type][1])
361
- if all_result[rel_type][0]+all_result[rel_type][2]==0:
362
- tem_r=0
363
- else:
364
- tem_r=all_result[rel_type][0]/(all_result[rel_type][0]+all_result[rel_type][2])
365
- if tem_p+tem_r==0:
366
- tem_f=0
367
- else:
368
- tem_f=2*tem_p*tem_r/(tem_p+tem_r)
369
- ave_f+=tem_f
370
- print('%s:p=%.4f,r=%.4f,f=%.4f' % (rel_type,tem_p,tem_r,tem_f))
371
-
372
- if TP+FP==0:
373
- P=0
374
- else:
375
- P=TP/(TP+FP)
376
- if TP+FN==0:
377
- R=0
378
- else:
379
- R=TP/(TP+FN)
380
- if P+R==0:
381
- F1=0
382
- else:
383
- F1=2*P*R/(P+R)
384
385
-
386
- print('Overall:')
387
- print('ave_f1:',ave_f/len(all_result))
388
- print('TP=%d, FP=%d, FN=%d'%(TP,FP,FN))
389
- print('P=%.4f, R=%.4f, F1=%.4f'%(P,R,F1))
390
-
391
-
392
- if __name__=='__main__':
393
- path='//panfs/pan1/bionlplab/luol2/BC7DrugProt/results/'
394
- office_evaluation(path+'dev/dev_gold_relations.tsv',path+'drugprot_dev_LSTM-CRF-ES_pre.tsv')
395
- print('............')
396
- Rel_Evaluation('//panfs/pan1/bionlplab/luol2/BC7DrugProt/check/dev_pre_temp.conll')
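
Every evaluator in this file reduces to the same micro-averaged arithmetic over the TP/FP/FN counts; a compact restatement with the same divide-by-zero guards used above:

    def prf1(tp, fp, fn):
        # micro-averaged precision, recall and F1
        p = tp / (tp + fp) if tp + fp else 0
        r = tp / (tp + fn) if tp + fn else 0
        f1 = 2 * p * r / (p + r) if p + r else 0
        return round(p, 4), round(r, 4), round(f1, 4)

    print(prf1(8, 2, 4))  # (0.8, 0.6667, 0.7273)
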
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Mar 1 15:33:54 2021
4
+
5
+ @author: luol2
6
+ """
7
+ # compute metrics from an IO-format prediction file (token, gold, predicted)
8
+ #ignore arg1
9
+ def Rel_Evaluation(prefile):
10
+ fin=open(prefile,'r',encoding='utf-8')
11
+ all_in=fin.read().strip().split('\n\n')
12
+ fin.close()
13
+ TP=0 #gold=pre=pos
14
+ FP=0 #gold=neg, pre=pos
15
+ FN=0 #gold=pos, pre=Neg
16
+ for sentence in all_in:
17
+ tokens=sentence.split('\n')
18
+ entity_id=0
19
+ token_id=0
20
+ temp_gold='O'
21
+ temp_pre='O'
22
+ while (token_id<len(tokens)):
23
+ seg=tokens[token_id].split('\t')
24
+ if seg[0]=='<GENE>':
25
+ if seg[1]=='O':
26
+ temp_gold=seg[1]
27
+ else:
28
+ temp_gold=seg[1][2:]
29
+ if seg[2]=='O':
30
+ temp_pre=seg[2]
31
+ else:
32
+ temp_pre=seg[2][2:]
33
+ token_id+=1
34
+ seg=tokens[token_id].split('\t')
35
+ while seg[0]!='</GENE>':
36
+ token_id+=1
37
+ seg=tokens[token_id].split('\t')
38
+ if seg[1]!='O' and temp_gold=='O':
39
+ temp_gold=seg[1][2:]
40
+ if seg[2]!='O' and temp_pre=='O':
41
+ temp_pre=seg[2][2:]
42
+ if temp_pre!='O' and temp_gold!='O' and temp_pre==temp_gold:
43
+ TP+=1
44
+ elif temp_pre!='O' and temp_gold!='O' and temp_pre!=temp_gold:
45
+ FP+=1
46
+ FN+=1
47
+ elif temp_pre!='O' and temp_gold=='O' :
48
+ FP+=1
49
+ elif temp_pre=='O' and temp_gold!='O' :
50
+ FN+=1
51
+ temp_pre='O'
52
+ temp_gold='O'
53
+
54
+ else:
55
+ pass
56
+ token_id+=1
57
+ # print('TP,FP,FN:',TP,FP,FN)
58
+ if TP+FP==0:
59
+ P=0
60
+ else:
61
+ P=TP/(TP+FP)
62
+ if TP+FN==0:
63
+ R=0
64
+ else:
65
+ R=TP/(TP+FN)
66
+ if P+R==0:
67
+ F1=0
68
+ else:
69
+ F1=2*P*R/(P+R)
70
+ print('TP,FP,FN:',TP,FP,FN)
71
+ print('P,R,F1:',P,R,F1)
72
+
73
+
74
+ def Rel_Evaluation_fn(prefile):
75
+ fin=open(prefile,'r',encoding='utf-8')
76
+ all_in=fin.read().strip().split('\n\n')
77
+ fin.close()
78
+ TP=0 #gold=pre=pos
79
+ FP=0 #gold=neg, pre=pos
80
+ FN=0 #gold=pos, pre=Neg
81
+ for sentence in all_in:
82
+ tokens=sentence.split('\n')
83
+ entity_id=0
84
+ token_id=0
85
+ temp_gold='O'
86
+ temp_pre='O'
87
+ while (token_id<len(tokens)):
88
+ seg=tokens[token_id].split('\t')
89
+ if seg[0]=='<GENE>':
90
+ if seg[1]=='O':
91
+ temp_gold=seg[1]
92
+ else:
93
+ temp_gold=seg[1][2:]
94
+ if seg[2]=='O':
95
+ temp_pre=seg[2]
96
+ else:
97
+ temp_pre=seg[2][2:]
98
+ token_id+=1
99
+ seg=tokens[token_id].split('\t')
100
+ while seg[0]!='</GENE>':
101
+ token_id+=1
102
+ seg=tokens[token_id].split('\t')
103
+ if seg[1]!='O' and temp_gold=='O':
104
+ temp_gold=seg[1][2:]
105
+ if seg[2]!='O' and temp_pre=='O':
106
+ temp_pre=seg[2][2:]
107
+ if temp_pre!='O' and temp_gold!='O' and temp_pre==temp_gold:
108
+ TP+=1
109
+ elif temp_pre!='O' and temp_gold!='O' and temp_pre!=temp_gold:
110
+ FP+=1
111
+ elif temp_pre!='O' and temp_gold=='O' :
112
+ FP+=1
113
+ elif temp_pre=='O' and temp_gold!='O' :
114
+ FN+=1
115
+ temp_pre='O'
116
+ temp_gold='O'
117
+
118
+ else:
119
+ pass
120
+ token_id+=1
121
+ print('TP,FP,FN:',TP,FP,FN)
122
+ if TP+FP==0:
123
+ P=0
124
+ else:
125
+ P=TP/(TP+FP)
126
+ if TP+FN==0:
127
+ R=0
128
+ else:
129
+ R=TP/(TP+FN)
130
+ if P+R==0:
131
+ F1=0
132
+ else:
133
+ F1=2*P*R/(P+R)
134
+ # print('TP,FP,FN:',TP,FP,FN)
135
+ print('P,R,F1:',P,R,F1)
136
+ return F1
137
+
138
+ def Rel_Evaluation_Hugface_fn(prefile,ARG2_label='gene1s'):
139
+ fin=open(prefile,'r',encoding='utf-8')
140
+ all_in=fin.read().strip().split('\n\n')
141
+ fin.close()
142
+ TP=0 #gold=pre=pos
143
+ FP=0 #gold=neg, pre=pos
144
+ FN=0 #gold=pos, pre=Neg
145
+ result_dict={}#{'rel type':[TP,FP,FN],...,}
146
+ for sentence in all_in:
147
+ tokens=sentence.split('\n')
148
+ for token in tokens:
149
+ seg=token.split('\t')
150
+ if seg[0]==ARG2_label:
151
+ if seg[1].find('ARG2')>=0:
152
+ if seg[2]==seg[1]:
153
+ if seg[1] not in result_dict.keys():
154
+ result_dict[seg[1]]=[1,0,0]
155
+ else:
156
+ result_dict[seg[1]][0]+=1
157
+ TP+=1
158
+ elif seg[2].find('ARG2')>=0:
159
+ if seg[1] not in result_dict.keys():
160
+ result_dict[seg[1]]=[0,0,1]
161
+ else:
162
+ result_dict[seg[1]][2]+=1
163
+ if seg[2] not in result_dict.keys():
164
+ result_dict[seg[2]]=[0,1,0]
165
+ else:
166
+ result_dict[seg[2]][1]+=1
167
+ FP+=1
168
+ FN+=1
169
+ else:
170
+ if seg[1] not in result_dict.keys():
171
+ result_dict[seg[1]]=[0,0,1]
172
+ else:
173
+ result_dict[seg[1]][2]+=1
174
+ FN+=1
175
+
176
+ else:
177
+ if seg[2].find('ARG2')>=0:
178
+ if seg[2] not in result_dict.keys():
179
+ result_dict[seg[2]]=[0,1,0]
180
+ else:
181
+ result_dict[seg[2]][1]+=1
182
+ FP+=1
183
+ # print('TP,FP,FN:',TP,FP,FN)
184
+ rel_metrics={}
185
+ for rel_type in result_dict.keys():
186
+ if result_dict[rel_type][0]+result_dict[rel_type][1]==0:
187
+ p=0
188
+ else:
189
+ p=result_dict[rel_type][0]/(result_dict[rel_type][0]+result_dict[rel_type][1])
190
+ if result_dict[rel_type][0]+result_dict[rel_type][2]==0:
191
+ r=0
192
+ else:
193
+ r=result_dict[rel_type][0]/(result_dict[rel_type][0]+result_dict[rel_type][2])
194
+ if p+r==0:
195
+ f1=0
196
+ else:
197
+ f1=2*p*r/(p+r)
198
+ rel_metrics[rel_type]=[round(p,4),round(r,4),round(f1,4)]
199
+ if TP+FP==0:
200
+ P=0
201
+ else:
202
+ P=TP/(TP+FP)
203
+ if TP+FN==0:
204
+ R=0
205
+ else:
206
+ R=TP/(TP+FN)
207
+ if P+R==0:
208
+ F1=0
209
+ else:
210
+ F1=2*P*R/(P+R)
211
+ P=round(P,4)
212
+ R=round(R,4)
213
+ F1=round(F1,4)
214
+ print('mertics:\n',rel_metrics)
215
+ print('\nTP,FP,FN:',TP,FP,FN)
216
+ print('Overall P,R,F1:',P,R,F1)
217
+ return [P,R,F1],rel_metrics
218
+
219
+ def Rel_Evaluation_AIO_fn(prefile):
220
+ fin=open(prefile,'r',encoding='utf-8')
221
+ all_in=fin.read().strip().split('\n\n')
222
+ fin.close()
223
+ TP=0 #gold=pre=pos
224
+ FP=0 #gold=neg, pre=pos
225
+ FN=0 #gold=pos, pre=Neg
226
+ for sentence in all_in:
227
+ tokens=sentence.split('\n')
228
+ for token in tokens:
229
+ seg=token.split('\t')
230
+ if seg[0]=='<GENE>':
231
+ if seg[1].find('ARG2-')>=0:
232
+ if seg[2]==seg[1]:
233
+ TP+=1
234
+ elif seg[2].find('ARG2-')>=0:
235
+ FP+=1
236
+ FN+=1
237
+ else:
238
+ FN+=1
239
+
240
+ else:
241
+ if seg[2].find('ARG2-')>=0:
242
+ FP+=1
243
+ # print('TP,FP,FN:',TP,FP,FN)
244
+ if TP+FP==0:
245
+ P=0
246
+ else:
247
+ P=TP/(TP+FP)
248
+ if TP+FN==0:
249
+ R=0
250
+ else:
251
+ R=TP/(TP+FN)
252
+ if P+R==0:
253
+ F1=0
254
+ else:
255
+ F1=2*P*R/(P+R)
256
+ P=round(P,4)
257
+ R=round(R,4)
258
+ F1=round(F1,4)
259
+ print('TP,FP,FN:',TP,FP,FN)
260
+ print('P,R,F1:',P,R,F1)
261
+ return [P,R,F1]
262
+
263
+ def Rel_Evaluation_AIO_GC_fn(prefile):
264
+ fin=open(prefile,'r',encoding='utf-8')
265
+ all_in=fin.read().strip().split('\n\n')
266
+ fin.close()
267
+ TP=0 #gold=pre=pos
268
+ FP=0 #gold=neg, pre=pos
269
+ FN=0 #gold=pos, pre=Neg
270
+ for sentence in all_in:
271
+ tokens=sentence.split('\n')
272
+ for token in tokens:
273
+ seg=token.split('\t')
274
+ if seg[0]=='<CHEMICAL>':
275
+ if seg[1].find('ARG2-')>=0:
276
+ if seg[2]==seg[1]:
277
+ TP+=1
278
+ elif seg[2].find('ARG2-')>=0:
279
+ FP+=1
280
+ FN+=1
281
+ else:
282
+ FN+=1
283
+
284
+ else:
285
+ if seg[2].find('ARG2-')>=0:
286
+ FP+=1
287
+ # print('TP,FP,FN:',TP,FP,FN)
288
+ if TP+FP==0:
289
+ P=0
290
+ else:
291
+ P=TP/(TP+FP)
292
+ if TP+FN==0:
293
+ R=0
294
+ else:
295
+ R=TP/(TP+FN)
296
+ if P+R==0:
297
+ F1=0
298
+ else:
299
+ F1=2*P*R/(P+R)
300
+ P=round(P,4)
301
+ R=round(R,4)
302
+ F1=round(F1,4)
303
+ print('TP,FP,FN:',TP,FP,FN)
304
+ print('P,R,F1:',P,R,F1)
305
+ return [P,R,F1]
306
+
307
+ def office_evaluation(goldfile,prefile):
308
+ fin_gold=open(goldfile,'r',encoding='utf-8')
309
+ all_gold=fin_gold.read().strip().split('\n')
310
+ fin_gold.close()
311
+ fin_pre=open(prefile,'r',encoding='utf-8')
312
+ all_pre=fin_pre.read().strip().split('\n')
313
+ fin_pre.close()
314
+
315
+ gold_result={}#{'relation type':set(line)}
316
+ pre_result={}
317
+ all_result={} #{'relation type':[tp,fp,fn]}
318
+ for line in all_gold:
319
+ seg=line.split('\t')
320
+ if seg[1] not in all_result.keys():
321
+ all_result[seg[1]]=[0,0,0]
322
+ if seg[1] not in gold_result.keys():
323
+ gold_result[seg[1]]=set()
324
+ gold_result[seg[1]].add(line)
325
+ else:
326
+ gold_result[seg[1]].add(line)
327
+
328
+ for line in all_pre:
329
+ seg=line.split('\t')
330
+ if seg[1] not in pre_result.keys():
331
+ pre_result[seg[1]]=set()
332
+ pre_result[seg[1]].add(line)
333
+ else:
334
+ pre_result[seg[1]].add(line)
335
+
336
+ for rel_type in gold_result.keys():
337
+ for gold_ele in gold_result[rel_type]:
338
+ if rel_type not in pre_result.keys():
339
+ all_result[rel_type][2]+=1
340
+ else:
341
+ if gold_ele in pre_result[rel_type]:
342
+ all_result[rel_type][0]+=1
343
+ else:
344
+ all_result[rel_type][2]+=1
345
+ if rel_type in pre_result.keys():
346
+ for pre_ele in pre_result[rel_type]:
347
+ if pre_ele not in gold_result[rel_type]:
348
+ all_result[rel_type][1]+=1
349
+ ave_f=0
350
+ TP,FP,FN=0,0,0
351
+ print(all_result)
352
+ for rel_type in all_result.keys():
353
+ TP+=all_result[rel_type][0]
354
+ FP+=all_result[rel_type][1]
355
+ FN+=all_result[rel_type][2]
356
+ tem_p,tem_r,tem_f=0,0,0
357
+ if all_result[rel_type][0]+all_result[rel_type][1]==0:
358
+ tem_p=0
359
+ else:
360
+ tem_p=all_result[rel_type][0]/(all_result[rel_type][0]+all_result[rel_type][1])
361
+ if all_result[rel_type][0]+all_result[rel_type][2]==0:
362
+ tem_r=0
363
+ else:
364
+ tem_r=all_result[rel_type][0]/(all_result[rel_type][0]+all_result[rel_type][2])
365
+ if tem_p+tem_r==0:
366
+ tem_f=0
367
+ else:
368
+ tem_f=2*tem_p*tem_r/(tem_p+tem_r)
369
+ ave_f+=tem_f
370
+ print('%s:p=%.4f,r=%.4f,f=%.4f' % (rel_type,tem_p,tem_r,tem_f))
371
+
372
+ if TP+FP==0:
373
+ P=0
374
+ else:
375
+ P=TP/(TP+FP)
376
+ if TP+FN==0:
377
+ R=0
378
+ else:
379
+ R=TP/(TP+FN)
380
+ if P+R==0:
381
+ F1=0
382
+ else:
383
+ F1=2*P*R/(P+R)
384
+ ave_f+=tem_f
385
+
386
+ print('Overall:')
387
+ print('ave_f1:',ave_f/len(all_result))
388
+ print('TP=%d, FP=%d, FN=%d'%(TP,FP,FN))
389
+ print('P=%.4f, R=%.4f, F1=%.4f'%(P,R,F1))
390
+
391
+
392
+ if __name__=='__main__':
393
+ path='//panfs/pan1/bionlplab/luol2/BC7DrugProt/results/'
394
+ office_evaluation(path+'dev/dev_gold_relations.tsv',path+'drugprot_dev_LSTM-CRF-ES_pre.tsv')
395
+ print('............')
396
+ Rel_Evaluation_check('//panfs/pan1/bionlplab/luol2/BC7DrugProt/check/dev_pre_temp.conll')
src_python/SpeAss/SA_Pubtator_Conll.py CHANGED
@@ -1,494 +1,494 @@
# -*- coding: utf-8 -*-

import sys
import io
import stanza
# nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',package='craft') #package='craft'
nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'},package='None') #package='craft'
REL_ENT={'arg1':'Species',
         'arg2':'Gene'}

ENTITY_TAG={'arg1':['arg1s','arg1e'],
            'arg2':['arg2s','arg2e'],
            'gene':['gene1s','gene1e'],
            'species':['species1s','species1e']
            }

# sentence-split/tokenize the text and revise the entity offsets accordingly
def ssplit_token(infile):
    fin=open(infile,'r',encoding='utf-8')
    fout=io.StringIO()
    all_in=fin.read().strip().split('\n\n')
    fin.close()
    for doc_text in all_in:
        lines=doc_text.split('\n')
        ori_text=lines[0].split('|t|')[1]+' '+lines[1].split('|a|')[1]
        pmid=lines[0].split('|t|')[0]
        # print(pmid)
        entity_all=[]  #[[seg0,seg1,...,],[]]
        for i in range(2,len(lines)):
            seg=lines[i].split('\t')
            entity_all.append(seg)

        #ssplit token
        doc_stanza = nlp(ori_text)
        token_text=''
        for sent in doc_stanza.sentences:
            for word in sent.words:
                if word.text==' ':
                    pass
                    # print('token is blank!')
                else:
                    token_text+=word.text+' '
            #token_text=token_text+'   ' #separate sentences with extra blanks

        #map original char index -> tokenized char index
        index_map=[-1]*len(ori_text)
        j=0
        space_list=[' ',chr(160),chr(8201),chr(8194),chr(8197),chr(8202)]  # several kinds of space characters occur: the common chr(32), plus 160 (NBSP), 8201, 8194, 8197, 8202
        for i in range(0,len(ori_text)):
            if ori_text[i] in space_list:
                pass
            elif ori_text[i]==token_text[j]:
                #if i>0 and i<285:
                #    print('=i,j:',i,j,ori_text[i-1:i+1],token_text[j-1:j+1])
                index_map[i]=j
                j+=1
            else:
                #if i==283:
                #    print('!i,j:',i,j,ori_text[i-1:i+1],token_text[j-1:j+1])
                j+=1
                temp_log=j
                try:
                    while(ori_text[i]!=token_text[j]):
                        j+=1
                except:
                    print('doc',doc_text)
                    print('token_text:',token_text)
                    print('error:',ori_text[i-10:i+10],'i:',ori_text[i],'j:',token_text[temp_log],',',token_text[temp_log-10:temp_log+10])
                    print(ord(ori_text[i]),ord(' '))
                    sys.exit()
                index_map[i]=j
                j+=1
        # print(index_map)
        # token_text=token_text.replace('    ','<EOS>')
        # print(token_text)
        fout.write(token_text+'\n')
        for ele in entity_all:
            if index_map[int(ele[1])]==-1:
                new_ents=index_map[int(ele[1])+1]
            else:
                new_ents=index_map[int(ele[1])]
            if index_map[int(ele[2])-1]==-1:
                new_ente=index_map[int(ele[2])-1-1]+1
            else:
                new_ente=index_map[int(ele[2])-1]+1
            new_ent=token_text[new_ents:new_ente]
            if ele[4]=='Species' or ele[4]=='Gene':
                fout.write(ele[0]+'\t'+str(new_ents)+'\t'+str(new_ente)+'\t'+new_ent+'\t'+ele[4]+'\t'+ele[5]+'\n')
            else:
                # print(ele[4])
                fout.write(ele[0]+'\t'+str(new_ents)+'\t'+str(new_ente)+'\t'+new_ent+'\t'+'Gene'+'\t'+ele[5]+'\n')
        fout.write('\n')
    return fout.getvalue()
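# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the index_map built
# above aligns each non-space character of the original text with its
# position in the re-tokenized text, so entity offsets can be carried over.
# A toy version of the same idea, handling only ASCII spaces for brevity:
def toy_index_map(ori, tok):
    mapping = [-1] * len(ori)
    j = 0
    for i, ch in enumerate(ori):
        if ch == ' ':
            continue            # spaces are never mapped
        while tok[j] != ch:     # skip spaces inserted by tokenization
            j += 1
        mapping[i] = j
        j += 1
    return mapping

# toy_index_map('IL-2 gene', 'IL - 2 gene') maps 'I'->0, 'L'->1, '-'->3, '2'->5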


def corpus_noNest(token_input):

    fin=io.StringIO(token_input)
    fout=io.StringIO()

    documents=fin.read().strip().split('\n\n')
    fin.close()
    total_entity=0
    over_entity=0
    nest_entity=0
    for doc in documents:
        lines=doc.split('\n')
        context=lines[0]
        entity_list=[]
        if len(lines)>1:
            doc_result={}
            for i in range(1,len(lines)):
                segs=lines[i].split('\t')
                doc_result[lines[i]]=[int(segs[1]),int(segs[2])]
            doc_result=sorted(doc_result.items(), key=lambda kv:(kv[1]), reverse=False)
            doc_result_sort=[]
            for ele in doc_result:
                doc_result_sort.append(ele[0])

            first_entity=doc_result_sort[0].split('\t')
            nest_list=[first_entity]
            max_eid=int(first_entity[2])
            total_entity+=len(lines)-2
            for i in range(1,len(doc_result_sort)):
                segs=doc_result_sort[i].split('\t')
                if int(segs[1])> max_eid:
                    if len(nest_list)==1:
                        entity_list.append(nest_list[0])
                        nest_list=[]
                        nest_list.append(segs)
                        if int(segs[2])>max_eid:
                            max_eid=int(segs[2])
                    else:
                        # print(nest_list)
                        nest_entity+=len(nest_list)-1
                        tem=find_max_entity(nest_list,context)  #find the longest entity
                        # if len(tem)>1:
                        #     print('max nest >1:',tem)
                        entity_list.extend(tem)
                        nest_list=[]
                        nest_list.append(segs)
                        if int(segs[2])>max_eid:
                            max_eid=int(segs[2])

                else:
                    nest_list.append(segs)
                    over_entity+=1
                    if int(segs[2])>max_eid:
                        max_eid=int(segs[2])
            if nest_list!=[]:
                if len(nest_list)==1:
                    entity_list.append(nest_list[0])

                else:
                    tem=find_max_entity(nest_list,context)  #find the longest entity
                    # if len(tem)>1:
                    #     print('max nest >1:',tem)
                    entity_list.extend(tem)
        fout.write(context+'\n')
        for ele in entity_list:
            if ele[4]=='Gene':
                temp_gene={}
                gene_ids=ele[5].split(',')
                for gene_id in gene_ids:
                    temp_id=gene_id[gene_id.find('Species:'):-1]
                    spe_id=temp_id[len('Species:'):]
                    temp_gene[temp_id]=int(spe_id)
                temp_gene_sort=sorted(temp_gene.items(), key=lambda kv:(kv[1]), reverse=False)
                final_gene_id=''
                for temp_ele in temp_gene_sort:
                    final_gene_id+=temp_ele[0]+','
                fout.write('\t'.join(ele[:-1])+'\t'+final_gene_id[:-1]+'\n')
            else:
                fout.write('\t'.join(ele)+'\n')
        fout.write('\n')
    # print(total_entity,over_entity, nest_entity)
    return fout.getvalue()

def find_max_entity(nest_list,text):
    max_len=0
    final_tem=[]
    max_index=0
    for i in range(0, len(nest_list)):
        if nest_list[i][4] =='Species':
            final_tem.append(nest_list[i])
        else:
            cur_len=int(nest_list[i][2])-int(nest_list[i][1])
            if cur_len>max_len:
                max_len=cur_len
                max_index=i
    final_tem.append(nest_list[max_index])
    return final_tem


def generate_seq_input(nonest_input,outfile):

    fin=io.StringIO(nonest_input)
    fout=open(outfile,'w',encoding='utf-8')
    all_in=fin.read().strip().split('\n\n')
    fin.close()

    final_input=[]

    for doc in all_in:
        lines=doc.split('\n')
        token_text=lines[0]
        pmid=lines[1].split('\t')[0]
        # print(pmid)
        #read entities and relations
        entity_arg1={}  #only entity offsets
        entity_arg2={}  #only entity offsets
        entity_all=[]  #all entity info

        for i in range(1,len(lines)):
            seg=lines[i].split('\t')
            if seg[4]==REL_ENT['arg1']:
                if seg[-1] in entity_arg1.keys():
                    entity_arg1[seg[-1]].append([seg[1],seg[2]])
                else:
                    entity_arg1[seg[-1]]=[[seg[1],seg[2]]]
            elif seg[4]==REL_ENT['arg2']:
                temp_spes=seg[-1].split(',')
                for ele in temp_spes:
                    gene_spe_id=ele
                    if gene_spe_id in entity_arg2.keys():
                        entity_arg2[gene_spe_id].append([seg[1],seg[2]])
                    else:
                        entity_arg2[gene_spe_id]=[[seg[1],seg[2]]]

            entity_all.append(seg)
        # print('\narg1:',entity_arg1)
        # print('\narg2:',entity_arg2)
        # print('\nall entity:',entity_all)
        # produce one instance per arg1
        for cur_ele in entity_arg1.keys():

            #1. ner-label the text
            #check whether cur_ele is in a relation
            # print(relation_all.keys())
            if cur_ele in entity_arg2.keys():  #positive instance
                rel_ent2=entity_arg2[cur_ele]
                ner_text=''
                text_sid=0
                #print('nonest:',entity_nonest)
                for ele_nonest in entity_all:
                    ent_id=[ele_nonest[1],ele_nonest[2]]
                    ent_sid=int(ele_nonest[1])
                    ent_eid=int(ele_nonest[2])
                    # print('sid,eid:',ent_sid,ent_eid)
                    ent_text=ele_nonest[3]
                    ent_type=ele_nonest[4]
                    if ent_sid>=text_sid:
                        if ent_id in entity_arg1[cur_ele]:
                            ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg1'][0]+' '+ent_text+' '+ENTITY_TAG['arg1'][1]+' '
                        else:
                            if ent_id in rel_ent2:  #arg2 entity
                                if ent_type!=REL_ENT['arg2']:
                                    pass
                                    # print('arg2 is error! not ',REL_ENT['arg2'], ele_nonest)
                                ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg2'][0]+' '+ent_text+' '+ENTITY_TAG['arg2'][1]+' '
                            else:
                                ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG[ent_type.lower()][0]+' '+ent_text+' '+ENTITY_TAG[ent_type.lower()][1]+' '
                        text_sid=ent_eid
                    else:
                        pass
                        # print('ner entity error!!!',ele_nonest,text_sid)
                ner_text+=token_text[text_sid:]
                sen_tokens=ner_text.split()
                # print('\nner_text:',ner_text)

                #3 produce positive input

                temp_input=[]
                token_id=0
                while token_id <len(sen_tokens):
                    if sen_tokens[token_id].find(ENTITY_TAG['arg1'][0])>=0:
                        temp_input.append(ENTITY_TAG['arg1'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['arg1'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['arg1'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['arg2'][0])>=0:
                        temp_input.append(ENTITY_TAG[REL_ENT['arg2'].lower()][0]+'\tARG2')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['arg2'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tARG2')
                            token_id+=1
                        temp_input.append(ENTITY_TAG[REL_ENT['arg2'].lower()][1]+'\tARG2')
                    elif sen_tokens[token_id].find(ENTITY_TAG['gene'][0])>=0:
                        temp_input.append(ENTITY_TAG['gene'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['gene'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['gene'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['species'][0])>=0:
                        temp_input.append(ENTITY_TAG['species'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['species'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['species'][1]+'\tO')
                    else:
                        if sen_tokens[token_id]=='':
                            # print('token is none!error!')
                            pass
                        else:
                            temp_input.append(sen_tokens[token_id]+'\tO')
                    token_id+=1

                final_input.append('\n'.join(temp_input))

            else:  #negative instance
                ner_text=''
                text_sid=0
                #print('nonest:',entity_nonest)
                for ele_nonest in entity_all:
                    ent_id=[ele_nonest[1],ele_nonest[2]]
                    ent_sid=int(ele_nonest[1])
                    ent_eid=int(ele_nonest[2])
                    # print('sid,eid:',ent_sid,ent_eid)
                    ent_text=ele_nonest[3]
                    ent_type=ele_nonest[4]
                    if ent_sid>=text_sid:
                        if ent_id in entity_arg1[cur_ele]:
                            ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG['arg1'][0]+' '+ent_text+' '+ENTITY_TAG['arg1'][1]+' '
                        else:
                            ner_text+=token_text[text_sid:ent_sid]+' '+ENTITY_TAG[ent_type.lower()][0]+' '+ent_text+' '+ENTITY_TAG[ent_type.lower()][1]+' '
                        text_sid=ent_eid
                    else:
                        pass
                        # print('ner entity error!!!')
                ner_text+=token_text[text_sid:]
                sen_tokens=ner_text.split()
                # print('\nner_text:',ner_text)
                # print('ner_Text')
                #3 produce negative input

                temp_input=[]
                token_id=0
                while token_id <len(sen_tokens):
                    if sen_tokens[token_id].find(ENTITY_TAG['arg1'][0])>=0:
                        temp_input.append(ENTITY_TAG['arg1'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['arg1'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['arg1'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['gene'][0])>=0:
                        temp_input.append(ENTITY_TAG['gene'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['gene'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['gene'][1]+'\tO')
                    elif sen_tokens[token_id].find(ENTITY_TAG['species'][0])>=0:
                        temp_input.append(ENTITY_TAG['species'][0]+'\tO')
                        token_id+=1
                        while(sen_tokens[token_id]!=ENTITY_TAG['species'][1]):
                            temp_input.append(sen_tokens[token_id]+'\tO')
                            token_id+=1
                        temp_input.append(ENTITY_TAG['species'][1]+'\tO')
                    else:
                        if sen_tokens[token_id]=='':
                            print('token is none!error!')
                        else:
                            temp_input.append(sen_tokens[token_id]+'\tO')
                    token_id+=1

                final_input.append('\n'.join(temp_input))
    # print(entity_nonest)
    # sys.exit()
    fout.write('\n\n'.join(final_input))
    fout.write('\n')
    fout.close()
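# --------------------------------------------------------------------------
# Illustrative sketch (toy data, not from the original file): each instance
# written by generate_seq_input() is a blank-line-separated block of
# token<TAB>label pairs, with the focus species wrapped in arg1s/arg1e and
# genes of that species labelled ARG2, e.g.:
#
#   arg1s   O
#   human   O
#   arg1e   O
#   gene1s  ARG2
#   p53     ARG2
#   gene1e  ARG2
#   is      O
#   studied O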

# NOTE: ENTITY_TAG in this module has no 'chemical' entry; the two checkers
# below appear to be carried over from the chemical-gene variant of this
# script and are not called in the __main__ pipeline here.
def check_entity_pos(line,relations):

    seg=line.split(' ')
    stack_ent=[]
    # print(seg)
    entity_num={'arg1':0,'arg2':0, 'gene':0,'chemical':0}

    temp_arg2=[]
    for i in range(0,len(seg)):
        if seg[i].find(ENTITY_TAG['gene'][0])>=0:
            entity_num['gene']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['chemical'][0])>=0:
            entity_num['chemical']+=1
            stack_ent.append(seg[i])
            # print(stack_ent)
        elif seg[i].find(ENTITY_TAG['arg1'][0])>=0:
            entity_num['arg1']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['arg2'][0])>=0:
            entity_num['arg2']+=1
            temp_arg2.append(seg[i].split('|')[0])
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['arg1'][1])>=0 or seg[i].find(ENTITY_TAG['arg2'][1])>=0 or seg[i].find(ENTITY_TAG['gene'][1])>=0 or seg[i].find(ENTITY_TAG['chemical'][1])>=0:
            stack_ent.pop()
    if stack_ent!=[]:
        # print('entity no match!',stack_ent)
        return(-1,seg,entity_num)

    else:
        if entity_num['arg1']!=0:
            for arg2_id in relations.keys():
                if arg2_id not in temp_arg2:
                    # print('\ntemp_arg2:',temp_arg2)
                    # print('\narg2_id:',arg2_id)
                    return(0,seg,entity_num)  #some arg2 not in the sentence
        if entity_num['arg2']!=0 and entity_num['arg1']==0:
            return(0,seg,entity_num)  #only arg2, but no arg1
        return(1,seg,entity_num)

def check_entity_neg(line):

    seg=line.split(' ')
    stack_ent=[]
    # print(seg)
    entity_num={'arg1':0,'gene':0,'chemical':0}
    for i in range(0,len(seg)):
        if seg[i].find(ENTITY_TAG['gene'][0])>=0:
            entity_num['gene']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['chemical'][0])>=0:
            entity_num['chemical']+=1
            stack_ent.append(seg[i])
            # print(stack_ent)
        elif seg[i].find(ENTITY_TAG['arg1'][0])>=0:
            entity_num['arg1']+=1
            stack_ent.append(seg[i])
        elif seg[i].find(ENTITY_TAG['arg1'][1])>=0 or seg[i].find(ENTITY_TAG['gene'][1])>=0 or seg[i].find(ENTITY_TAG['chemical'][1])>=0:
            stack_ent.pop()
    if stack_ent!=[]:
        # print('entity no match!',stack_ent)
        return(-1,seg,entity_num)

    else:
        return(1,seg,entity_num)

def get_one_entity(nest_list,cur_ent,rel_entity2_id):
    max_len=0
    max_entity=[]
    final_entity=[]
    for i in range(0, len(nest_list)):
        if nest_list[i][1]==cur_ent:  #the current entity itself
            final_entity=[]
            max_entity=nest_list[i]
            final_entity.append(nest_list[i])
            return(final_entity)
        if nest_list[i][1] in rel_entity2_id:  #involved in a relation
            final_entity.append(nest_list[i])
            continue
        length=int(nest_list[i][4])-int(nest_list[i][3])
        if max_entity==[]:  #first entity
            max_len=length
            max_entity=nest_list[i]
        else:
            if length>max_len:
                if max_entity[2]==REL_ENT['arg1']:
                    max_len=length
                    max_entity=nest_list[i]
                else:
                    if nest_list[i][2]==REL_ENT['arg2'] and max_entity[1] not in rel_entity2_id:
                        max_len=length
                        max_entity=nest_list[i]

            else:
                if nest_list[i][1] in rel_entity2_id:
                    max_len=length
                    max_entity=nest_list[i]
                elif max_entity[2]==REL_ENT['arg1'] and nest_list[i][2]==REL_ENT['arg2']:
                    max_len=length
                    max_entity=nest_list[i]
    if final_entity==[]:
        final_entity.append(max_entity)
    return final_entity


if __name__=='__main__':

    infile='../../TrainingSet/No505/SA.Train.txt'
    outfile='../../TrainingSet/No505/SA.Train.conll'

    #tokenize and sentence-split
    token_input=ssplit_token(infile)

    #filter nested entities
    nonest_input=corpus_noNest(token_input)

    # convert to CoNLL
    generate_seq_input(nonest_input,outfile)
 
src_python/SpeAss/ml_tagging_score_sa.py CHANGED
@@ -1,220 +1,220 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  7 09:29:46 2022

@author: luol2

machine learning tagging

"""


import time
import io

from src_python.SpeAss.processing_data_sa import ml_intext_fn,out_BIO_BERT_softmax_score_fn
import tensorflow as tf
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
if len(gpu) > 0:
    tf.config.experimental.set_memory_growth(gpu[0], True)
#tf.compat.v1.disable_eager_execution()

REL_ENT={'arg1':'Species',
         'arg2':'Gene'}

entity_tag={'arg1':['arg1s','arg1e'],
            'gene':['gene1s','gene1e'],
            'species':['species1s','species1e']
            }

def input_preprocess_notoken(doc_text):
    final_input=[]
    final_id=[]

    lines=doc_text.split('\n')
    token_text=lines[0]
    pmid=lines[1].split('\t')[0]
    entity_arg1={}  #{species_id:[[spe_sid1,sep_eid1],[...]]}
    entity_all=[]
    for i in range(1,len(lines)):
        seg=lines[i].split('\t')
        if seg[6]==REL_ENT['arg1']:
            if seg[-1] in entity_arg1.keys():
                entity_arg1[seg[-1]].append([seg[3],seg[4]])
            else:
                entity_arg1[seg[-1]]=[[seg[3],seg[4]]]
        entity_all.append(seg)

    #print(token_text)
    #print(entity_chemical)
    #generate input instances
    for cur_ele in entity_arg1:

        #2. ner-label the text
        ner_text=''
        text_sid=0
        #print('nonest:',entity_nonest)
        for ele_nonest in entity_all:
            ent_id=[ele_nonest[3],ele_nonest[4]]
            ent_spe_id=ele_nonest[-1]
            ent_sid=int(ele_nonest[3])
            ent_eid=int(ele_nonest[4])
            # print('sid,eid:',ent_sid,ent_eid)
            ent_text=ele_nonest[5]
            ent_type=ele_nonest[6]
            if ent_sid>=text_sid:
                # if token_text[ent_sid:ent_eid]!=ent_text:
                #     print('error!index_text,entext:',token_text[ent_sid:ent_eid],ent_text)
                if ent_id in entity_arg1[cur_ele]:  #is the focus species
                    ner_text+=token_text[text_sid:ent_sid]+' '+ent_spe_id+'|'+entity_tag['arg1'][0]+' '+ent_text+' '+entity_tag['arg1'][1]+' '
                else:
                    ner_text+=token_text[text_sid:ent_sid]+' '+str(ent_sid)+'-'+str(ent_eid)+'|'+entity_tag[ent_type.lower()][0]+' '+ent_text+' '+entity_tag[ent_type.lower()][1]+' '
                text_sid=ent_eid
        ner_text+=token_text[text_sid:]
        sen_tokens=ner_text.split()
        #print('\nner_text:',ner_text)

        #3. produce input
        temp_input=[]
        temp_id={'species':'','gene':[]}
        for sen_token in sen_tokens:
            if sen_token.find(entity_tag['arg1'][0])>=0:
                en_id=sen_token.split('|')[0]
                temp_id['species']=en_id
                temp_input.append(entity_tag['arg1'][0]+'\tO')
            elif sen_token.find(entity_tag['gene'][0])>=0:
                en_id=sen_token.split('|')[0]
                temp_id['gene'].append(en_id)
                temp_input.append(entity_tag['gene'][0]+'\tO')
            elif sen_token.find(entity_tag['species'][0])>=0:
                en_id=sen_token.split('|')[0]
                # temp_id.append(en_id)
                temp_input.append(entity_tag['species'][0]+'\tO')
            else:
                if sen_token=='':
                    # print('token is none!error!')
                    pass
                else:
                    temp_input.append(sen_token+'\tO')
        final_input.append('\n'.join(temp_input))
        final_id.append(temp_id)

    # print(entity_nonest)
    return final_input,final_id,entity_all,pmid


def ml_tagging(ml_input,nn_model):

    test_set,test_label = ml_intext_fn(ml_input)
    test_x,test_y, test_bert_text_label=nn_model.rep.load_data_hugface(test_set,test_label,word_max_len=nn_model.maxlen,label_type='softmax')
    test_pre = nn_model.model.predict(test_x)
    ml_out=out_BIO_BERT_softmax_score_fn(test_pre,test_bert_text_label,nn_model.rep.index_2_label)
    return ml_out

def output_rel(ml_output,entity_map,pmid):
    fin=io.StringIO(ml_output)
    alltexts=fin.read().strip().split('\n\n')
    fin.close()
    final_out={}  #{'sid-eid':[species id]}
    for sen_id,sentence in enumerate(alltexts):
        tokens=sentence.split('\n')
        gene_entity_id=0
        token_id=0
        arg1=''
        arg2_list=[]  #[[ID, score],[id,score]]
        while (token_id<len(tokens)):
            seg=tokens[token_id].split('\t')
            if seg[0]==entity_tag['arg1'][0]:
                arg1=entity_map[sen_id]['species']
                token_id+=1
                if token_id >=len(tokens):
                    break
                seg=tokens[token_id].split('\t')
                while seg[0]!=entity_tag['arg1'][1]:
                    token_id+=1
                    if token_id >=len(tokens):
                        break
                    seg=tokens[token_id].split('\t')
            elif seg[0]==entity_tag[REL_ENT['arg2'].lower()][0]:
                temp_rel=seg[-2]
                temp_score=seg[-1]
                arg2_id=entity_map[sen_id]['gene'][gene_entity_id]
                gene_entity_id+=1
                token_id+=1
                if token_id >=len(tokens):
                    break
                seg=tokens[token_id].split('\t')
                while seg[0]!=entity_tag[REL_ENT['arg2'].lower()][1]:
                    token_id+=1
                    if token_id >=len(tokens):
                        break
                    seg=tokens[token_id].split('\t')
                    if seg[-2].find('ARG2')>=0 and temp_rel.find('ARG2')<0:
                        temp_rel=seg[-2]
                        temp_score=seg[-1]
                if temp_rel.find('ARG2')>=0:
                    arg2_list.append([arg2_id,temp_score])
            elif seg[0]==entity_tag[REL_ENT['arg1'].lower()][0]:
                token_id+=1
                if token_id >=len(tokens):
                    break
                seg=tokens[token_id].split('\t')
                while seg[0]!=entity_tag[REL_ENT['arg1'].lower()][1]:
                    token_id+=1
                    if token_id >=len(tokens):
                        break
                    seg=tokens[token_id].split('\t')

            else:
                pass
            token_id+=1
        #print(arg1,arg2_list)
        if arg2_list!=[] and arg1!='':
            for arg2_ele in arg2_list:
                if arg2_ele[0] not in final_out.keys():
                    final_out[arg2_ele[0]]=[arg1+'|'+arg2_ele[1]]
                else:
                    final_out[arg2_ele[0]].append(arg1+'|'+arg2_ele[1])
    return(final_out)

def NER_Tag(doc_in,nn_model):

    #1. preprocess the input; input_text is in CoNLL format, entity_index is the corresponding entity list
    #print(doc_in)
    input_text,entity_index,entity_all,pmid=input_preprocess_notoken(doc_in)
    # print('pmid:',pmid)
    # print('\entity_index:',entity_index)


    #2. ml tagging
    if input_text!=[]:
        ml_pre=ml_tagging(input_text,nn_model)
        #print('\noutput:')
        #print(ml_pre)

        #3. generate output
        final_output=output_rel(ml_pre,entity_index,pmid)
    else:
        final_output={}
    return final_output,entity_all
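# --------------------------------------------------------------------------
# Sketch (illustrative only): out_BIO_BERT_softmax_score_fn is expected to
# attach, per token, the predicted label and its softmax probability, which
# output_rel() reads back as the last two tab-separated fields (seg[-2] and
# seg[-1]). Per token, the label/score selection reduces to something like:
import numpy as np

def label_with_score(softmax_row, index_2_label):
    """Pick the most probable label and its confidence for one token."""
    idx = int(np.argmax(softmax_row))
    return index_2_label[idx], float(softmax_row[idx])

# e.g. label_with_score(np.array([0.1, 0.7, 0.2]), {0: 'O', 1: 'ARG2', 2: 'X'})
# -> ('ARG2', 0.7)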
 
src_python/SpeAss/model_sa.py CHANGED
@@ -1,105 +1,105 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Wed Feb 10 09:08:09 2021
4
-
5
- @author: luol2
6
-
7
- Model Architecture
8
-
9
- """
10
- import tensorflow as tf
11
- from src_python.SpeAss.represent_sa import Hugface_RepresentationLayer
12
- from tensorflow.keras.layers import *
13
- from tensorflow.keras.models import Model
14
- from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad,Nadam
15
- from transformers import TFAutoModel
16
- import numpy as np
17
- import sys
18
-
19
-
20
- class LRSchedule_LINEAR(tf.keras.optimizers.schedules.LearningRateSchedule):
21
- def __init__(
22
- self,
23
- init_lr=5e-5,
24
- init_warmup_lr=0.0,
25
- final_lr=5e-7,
26
- warmup_steps=0,
27
- decay_steps=0,
28
- ):
29
- super().__init__()
30
- self.init_lr = init_lr
31
- self.init_warmup_lr=init_warmup_lr
32
- self.final_lr = final_lr
33
- self.warmup_steps = warmup_steps
34
- self.decay_steps = decay_steps
35
-
36
- def __call__(self, step):
37
- """ linear warm up - linear decay """
38
- if self.warmup_steps>0:
39
- warmup_lr = (self.init_lr - self.init_warmup_lr)/self.warmup_steps * step+self.init_warmup_lr
40
- else:
41
- warmup_lr=1000.0 # no warmup: large placeholder so tf.math.minimum below selects decay_lr
42
- #print('\n.......warmup_lr:',warmup_lr)
43
- decay_lr = tf.math.maximum(
44
- self.final_lr,
45
- self.init_lr - (step - self.warmup_steps)/self.decay_steps*(self.init_lr - self.final_lr)
46
- )
47
- #print('\n.....decay_lr:',decay_lr)
48
- return tf.math.minimum(warmup_lr,decay_lr)
49
-
50
-
51
-
52
- class HUGFACE_NER(): #huggingface transformers
53
- def __init__(self, model_files):
54
- self.model_type='HUGFACE'
55
- self.maxlen = 512
56
- self.checkpoint_path = model_files['checkpoint_path']
57
- self.label_file=model_files['labelfile']
58
- self.lowercase=model_files['lowercase']
59
- self.rep = Hugface_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)
60
-
61
-
62
- def build_encoder(self):
63
- print('...vocab len:',self.rep.vocab_len)
64
- plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
65
- plm_model.resize_token_embeddings(self.rep.vocab_len)
66
- x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
67
- x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
68
- x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
69
- x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
70
- #dense = TimeDistributed(Dense(512, activation='relu'), name='dense1')(x)
71
- self.encoder = Model(inputs=[x1_in,x2_in,x3_in], outputs=x, name='hugface_encoder')
72
- self.encoder.summary()
73
-
74
- def build_softmax_decoder(self):
75
-
76
- x1_in = Input(shape=(self.maxlen,),dtype=tf.int32)
77
- x2_in = Input(shape=(self.maxlen,),dtype=tf.int32)
78
- x3_in = Input(shape=(self.maxlen,),dtype=tf.int32)
79
- features = self.encoder([x1_in,x2_in,x3_in])
80
- #features = Dropout(0.4)(features)
81
- #features = TimeDistributed(Dense(128, activation='relu'), name='dense2')(features)
82
- features= Dropout(0.1)(features)
83
- output = TimeDistributed(Dense(self.rep.label_table_size, activation='softmax'), name='softmax')(features)
84
- self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=output, name="hugface_softmax")
85
-
86
- # lr_schedule=LRSchedule_LINEAR(
87
- # init_lr=1e-5,
88
- # init_warmup_lr=1e-7,
89
- # final_lr=1e-6,
90
- # warmup_steps=0,
91
- # decay_steps=40000)
92
-
93
- opt = Adam(learning_rate = 5e-6)
94
- self.model.compile(
95
- optimizer=opt,
96
- loss='sparse_categorical_crossentropy',
97
- metrics=['accuracy'],
98
- )
99
- self.model.summary()
100
-
101
-
102
- def load_model(self,model_file):
103
- self.model.load_weights(model_file)
104
- self.model.summary()
105
- print('load HUGFACE model done!')
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Feb 10 09:08:09 2021
4
+
5
+ @author: luol2
6
+
7
+ Model Architecture
8
+
9
+ """
10
+ import tensorflow as tf
11
+ from src_python.SpeAss.represent_sa import Hugface_RepresentationLayer
12
+ from tensorflow.keras.layers import *
13
+ from tensorflow.keras.models import Model
14
+ from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad,Nadam
15
+ from transformers import TFAutoModel
16
+ import numpy as np
17
+ import sys
18
+
19
+
20
+ class LRSchedule_LINEAR(tf.keras.optimizers.schedules.LearningRateSchedule):
21
+ def __init__(
22
+ self,
23
+ init_lr=5e-5,
24
+ init_warmup_lr=0.0,
25
+ final_lr=5e-7,
26
+ warmup_steps=0,
27
+ decay_steps=0,
28
+ ):
29
+ super().__init__()
30
+ self.init_lr = init_lr
31
+ self.init_warmup_lr=init_warmup_lr
32
+ self.final_lr = final_lr
33
+ self.warmup_steps = warmup_steps
34
+ self.decay_steps = decay_steps
35
+
36
+ def __call__(self, step):
37
+ """ linear warm up - linear decay """
38
+ if self.warmup_steps>0:
39
+ warmup_lr = (self.init_lr - self.init_warmup_lr)/self.warmup_steps * step+self.init_warmup_lr
40
+ else:
41
+ warmup_lr=1000.0 # no warmup: large placeholder so tf.math.minimum below selects decay_lr
42
+ #print('\n.......warmup_lr:',warmup_lr)
43
+ decay_lr = tf.math.maximum(
44
+ self.final_lr,
45
+ self.init_lr - (step - self.warmup_steps)/self.decay_steps*(self.init_lr - self.final_lr)
46
+ )
47
+ #print('\n.....decay_lr:',decay_lr)
48
+ return tf.math.minimum(warmup_lr,decay_lr)
49
+
50
+
51
+
52
+ class HUGFACE_NER(): #huggingface transformers
53
+ def __init__(self, model_files):
54
+ self.model_type='HUGFACE'
55
+ self.maxlen = 512
56
+ self.checkpoint_path = model_files['checkpoint_path']
57
+ self.label_file=model_files['labelfile']
58
+ self.lowercase=model_files['lowercase']
59
+ self.rep = Hugface_RepresentationLayer(self.checkpoint_path, self.label_file, lowercase=self.lowercase)
60
+
61
+
62
+ def build_encoder(self):
63
+ print('...vocab len:',self.rep.vocab_len)
64
+ plm_model = TFAutoModel.from_pretrained(self.checkpoint_path, from_pt=True)
65
+ plm_model.resize_token_embeddings(self.rep.vocab_len)
66
+ x1_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='input_ids')
67
+ x2_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='token_type_ids')
68
+ x3_in = Input(shape=(self.maxlen,),dtype=tf.int32, name='attention_mask')
69
+ x = plm_model(x1_in, token_type_ids=x2_in, attention_mask=x3_in)[0]
70
+ #dense = TimeDistributed(Dense(512, activation='relu'), name='dense1')(x)
71
+ self.encoder = Model(inputs=[x1_in,x2_in,x3_in], outputs=x, name='hugface_encoder')
72
+ self.encoder.summary()
73
+
74
+ def build_softmax_decoder(self):
75
+
76
+ x1_in = Input(shape=(self.maxlen,),dtype=tf.int32)
77
+ x2_in = Input(shape=(self.maxlen,),dtype=tf.int32)
78
+ x3_in = Input(shape=(self.maxlen,),dtype=tf.int32)
79
+ features = self.encoder([x1_in,x2_in,x3_in])
80
+ #features = Dropout(0.4)(features)
81
+ #features = TimeDistributed(Dense(128, activation='relu'), name='dense2')(features)
82
+ features= Dropout(0.1)(features)
83
+ output = TimeDistributed(Dense(self.rep.label_table_size, activation='softmax'), name='softmax')(features)
84
+ self.model = Model(inputs=[x1_in,x2_in,x3_in], outputs=output, name="hugface_softmax")
85
+
86
+ # lr_schedule=LRSchedule_LINEAR(
87
+ # init_lr=1e-5,
88
+ # init_warmup_lr=1e-7,
89
+ # final_lr=1e-6,
90
+ # warmup_steps=0,
91
+ # decay_steps=40000)
92
+
93
+ opt = Adam(learning_rate = 5e-6)
94
+ self.model.compile(
95
+ optimizer=opt,
96
+ loss='sparse_categorical_crossentropy',
97
+ metrics=['accuracy'],
98
+ )
99
+ self.model.summary()
100
+
101
+
102
+ def load_model(self,model_file):
103
+ self.model.load_weights(model_file)
104
+ self.model.summary()
105
+ print('load HUGFACE model done!')
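For intuition, a small sanity check of LRSchedule_LINEAR as defined above (illustrative only; the values are arbitrary and TF2 eager execution is assumed): the schedule returns min(warmup_lr, decay_lr), i.e. a linear ramp from init_warmup_lr up to init_lr over warmup_steps, followed by a linear decay that is floored at final_lr.

from src_python.SpeAss.model_sa import LRSchedule_LINEAR

sched = LRSchedule_LINEAR(init_lr=1e-5, init_warmup_lr=1e-7, final_lr=1e-6,
                          warmup_steps=1000, decay_steps=40000)
for step in (0, 500, 1000, 20000, 41000):
    # lr rises to 1e-5 at step 1000, then decays linearly, flooring at 1e-6
    print(step, float(sched(step)))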
src_python/SpeAss/processing_data_sa.py CHANGED
@@ -1,201 +1,201 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Tue Mar 10 16:34:12 2020
4
-
5
- @author: luol2
6
- """
7
- import numpy as np
8
- import io
9
- import sys
10
- # read NER text (word\tlabel) and generate a nested list [[[w1,label],[w2,label]], ...]
11
- def ml_intext(file):
12
- fin=open(file,'r',encoding='utf-8')
13
- alltexts=fin.read().strip().split('\n\n')
14
- fin.close()
15
- data_list=[]
16
- label_list=[]
17
-
18
- for sents in alltexts:
19
- lines=sents.split('\n')
20
- temp_sentence=[]
21
- for i in range(0,len(lines)):
22
- seg=lines[i].split('\t')
23
- temp_sentence.append(seg[:])
24
- label_list.append(seg[-1])
25
-
26
- data_list.append(temp_sentence)
27
- #print(data_list)
28
- #print(label_list)
29
- return data_list,label_list
30
-
31
- def ml_intext_fn(alltexts):
32
- # fin=io.StringIO(ml_input)
33
- # alltexts=fin.read().strip().split('\n\n')
34
- # fin.close()
35
- data_list=[]
36
- label_list=[]
37
-
38
- for sents in alltexts:
39
- lines=sents.split('\n')
40
- temp_sentence=[]
41
- for i in range(0,len(lines)):
42
- seg=lines[i].split('\t')
43
- temp_sentence.append(seg[:])
44
- label_list.append(seg[-1])
45
-
46
- data_list.append(temp_sentence)
47
- #print(data_list)
48
- #print(label_list)
49
- return data_list,label_list
50
-
51
- # model prediction results to CoNLL evaluation format [token gold prediction]
52
- def out_BIO(file,raw_pre,raw_input,label_set):
53
- fout=open(file,'w',encoding='utf-8')
54
- for i in range(len(raw_input)):
55
-
56
- for j in range(len(raw_input[i])):
57
- if j<len(raw_pre[i]):
58
- label_id = raw_pre[i][j]
59
- label_tag = label_set[str(label_id)]
60
- else:
61
- label_tag='O'
62
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
63
- fout.write('\n')
64
- fout.close()
65
-
66
- def out_BIO_softmax(file,raw_pre,raw_input,label_set):
67
- fout=open(file,'w',encoding='utf-8')
68
- #print(raw_pre[0:2])
69
- for i in range(len(raw_input)):
70
-
71
- for j in range(len(raw_input[i])):
72
- if j<len(raw_pre[i]):
73
- label_id = np.argmax(raw_pre[i][j])
74
- #print(label_id)
75
- label_tag = label_set[str(label_id)]
76
- else:
77
- label_tag='O'
78
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
79
- fout.write('\n')
80
- fout.close()
81
-
82
- def out_BIO_fn(raw_pre,raw_input,label_set):
83
- fout=io.StringIO()
84
- for i in range(len(raw_input)):
85
-
86
- for j in range(len(raw_input[i])):
87
- if j<len(raw_pre[i]):
88
- label_id = raw_pre[i][j]
89
- label_tag = label_set[str(label_id)]
90
- else:
91
- label_tag='O'
92
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
93
- fout.write('\n')
94
- return fout.getvalue()
95
- def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
96
- fout=open(file,'w',encoding='utf-8')
97
- for i in range(len(raw_input)):
98
-
99
- for j in range(len(raw_input[i])):
100
- if j<len(raw_pre[i]):
101
- # label_id = raw_pre[i][j]
102
- label_id = np.argmax(raw_pre[i][j])
103
- label_tag = label_set[str(label_id)]
104
- else:
105
- label_tag='O'
106
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
107
- fout.write('\n')
108
- fout.close()
109
- def out_BIO_BERT(file,raw_pre,raw_input,label_set):
110
- fout=open(file,'w',encoding='utf-8')
111
- for i in range(len(raw_input)):
112
-
113
- for j in range(len(raw_input[i])):
114
- if j<len(raw_pre[i]):
115
- label_id = raw_pre[i][j]
116
- label_tag = label_set[str(label_id)]
117
- else:
118
- label_tag='O'
119
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
120
- fout.write('\n')
121
- fout.close()
122
- def out_BIO_BERT_fn(raw_pre,raw_input,label_set):
123
- fout=io.StringIO()
124
- for i in range(len(raw_input)):
125
-
126
- for j in range(len(raw_input[i])):
127
- if j<len(raw_pre[i]):
128
- label_id = raw_pre[i][j]
129
- label_tag = label_set[str(label_id)]
130
- else:
131
- label_tag='O'
132
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
133
- fout.write('\n')
134
- return fout.getvalue()
135
- def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
136
- fout=io.StringIO()
137
- for i in range(len(raw_input)):
138
-
139
- for j in range(len(raw_input[i])):
140
- if j<len(raw_pre[i]):
141
- #label_id = raw_pre[i][j]
142
- label_id = np.argmax(raw_pre[i][j])
143
- label_tag = label_set[str(label_id)]
144
- else:
145
- label_tag='O'
146
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
147
- fout.write('\n')
148
- return fout.getvalue()
149
- def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
150
- fout=io.StringIO()
151
- for i in range(len(raw_input)):
152
-
153
- for j in range(len(raw_input[i])):
154
- if j<len(raw_pre[i]):
155
- #label_id = raw_pre[i][j]
156
- label_id = np.argmax(raw_pre[i][j])
157
- label_score = round(raw_pre[i][j][label_id],4)
158
- label_tag = label_set[str(label_id)]
159
- else:
160
- label_tag='O'
161
- label_score = 0.0
162
- fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
163
- fout.write('\n')
164
- return fout.getvalue()
165
- #generate char vocab
166
- def char_vocab(infile,outfile_char):
167
- fin=open(infile,'r',encoding='utf-8')
168
- #fout=open(outfile,'w',encoding='utf-8')
169
- fout_char=open(outfile_char,'w',encoding='utf-8')
170
- char_vocab=['oov_char']
171
- max_len=0
172
- for line in fin:
173
- if line.strip()!='':
174
- seg=line.split('\t')
175
- word_len=len(seg[0])
176
- #if word_len<1000:
177
- # fout.write(line)
178
- if word_len>max_len:
179
- max_len=word_len
180
- print(seg[0])
181
- for i in range(word_len):
182
- if seg[0][i] not in char_vocab:
183
- char_vocab.append(seg[0][i])
184
- #else:
185
- # fout.write(line)
186
- fin.close()
187
- #fout.close()
188
- for ele in char_vocab:
189
- fout_char.write(ele+'\n')
190
- fout_char.close()
191
- print('max_len:',max_len)
192
-
193
-
194
- if __name__=='__main__':
195
- # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
196
- # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
197
- # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
198
- # #processing_text(file)
199
- # char_vocab(infile,outfile_char)
200
- a=[1,2,3]
201
- print(a[:-1])
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Mar 10 16:34:12 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import numpy as np
8
+ import io
9
+ import sys
10
+ # read NER text (word\tlabel) and generate a nested list [[[w1,label],[w2,label]], ...]
11
+ def ml_intext(file):
12
+ fin=open(file,'r',encoding='utf-8')
13
+ alltexts=fin.read().strip().split('\n\n')
14
+ fin.close()
15
+ data_list=[]
16
+ label_list=[]
17
+
18
+ for sents in alltexts:
19
+ lines=sents.split('\n')
20
+ temp_sentence=[]
21
+ for i in range(0,len(lines)):
22
+ seg=lines[i].split('\t')
23
+ temp_sentence.append(seg[:])
24
+ label_list.append(seg[-1])
25
+
26
+ data_list.append(temp_sentence)
27
+ #print(data_list)
28
+ #print(label_list)
29
+ return data_list,label_list
30
+
31
+ def ml_intext_fn(alltexts):
32
+ # fin=io.StringIO(ml_input)
33
+ # alltexts=fin.read().strip().split('\n\n')
34
+ # fin.close()
35
+ data_list=[]
36
+ label_list=[]
37
+
38
+ for sents in alltexts:
39
+ lines=sents.split('\n')
40
+ temp_sentence=[]
41
+ for i in range(0,len(lines)):
42
+ seg=lines[i].split('\t')
43
+ temp_sentence.append(seg[:])
44
+ label_list.append(seg[-1])
45
+
46
+ data_list.append(temp_sentence)
47
+ #print(data_list)
48
+ #print(label_list)
49
+ return data_list,label_list
50
+
51
+ # model prediction results to CoNLL evaluation format [token gold prediction]
52
+ def out_BIO(file,raw_pre,raw_input,label_set):
53
+ fout=open(file,'w',encoding='utf-8')
54
+ for i in range(len(raw_input)):
55
+
56
+ for j in range(len(raw_input[i])):
57
+ if j<len(raw_pre[i]):
58
+ label_id = raw_pre[i][j]
59
+ label_tag = label_set[str(label_id)]
60
+ else:
61
+ label_tag='O'
62
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
63
+ fout.write('\n')
64
+ fout.close()
65
+
66
+ def out_BIO_softmax(file,raw_pre,raw_input,label_set):
67
+ fout=open(file,'w',encoding='utf-8')
68
+ #print(raw_pre[0:2])
69
+ for i in range(len(raw_input)):
70
+
71
+ for j in range(len(raw_input[i])):
72
+ if j<len(raw_pre[i]):
73
+ label_id = np.argmax(raw_pre[i][j])
74
+ #print(label_id)
75
+ label_tag = label_set[str(label_id)]
76
+ else:
77
+ label_tag='O'
78
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
79
+ fout.write('\n')
80
+ fout.close()
81
+
82
+ def out_BIO_fn(raw_pre,raw_input,label_set):
83
+ fout=io.StringIO()
84
+ for i in range(len(raw_input)):
85
+
86
+ for j in range(len(raw_input[i])):
87
+ if j<len(raw_pre[i]):
88
+ label_id = raw_pre[i][j]
89
+ label_tag = label_set[str(label_id)]
90
+ else:
91
+ label_tag='O'
92
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
93
+ fout.write('\n')
94
+ return fout.getvalue()
95
+ def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
96
+ fout=open(file,'w',encoding='utf-8')
97
+ for i in range(len(raw_input)):
98
+
99
+ for j in range(len(raw_input[i])):
100
+ if j<len(raw_pre[i]):
101
+ # label_id = raw_pre[i][j]
102
+ label_id = np.argmax(raw_pre[i][j])
103
+ label_tag = label_set[str(label_id)]
104
+ else:
105
+ label_tag='O'
106
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
107
+ fout.write('\n')
108
+ fout.close()
109
+ def out_BIO_BERT(file,raw_pre,raw_input,label_set):
110
+ fout=open(file,'w',encoding='utf-8')
111
+ for i in range(len(raw_input)):
112
+
113
+ for j in range(len(raw_input[i])):
114
+ if j<len(raw_pre[i]):
115
+ label_id = raw_pre[i][j]
116
+ label_tag = label_set[str(label_id)]
117
+ else:
118
+ label_tag='O'
119
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
120
+ fout.write('\n')
121
+ fout.close()
122
+ def out_BIO_BERT_fn(raw_pre,raw_input,label_set):
123
+ fout=io.StringIO()
124
+ for i in range(len(raw_input)):
125
+
126
+ for j in range(len(raw_input[i])):
127
+ if j<len(raw_pre[i]):
128
+ label_id = raw_pre[i][j]
129
+ label_tag = label_set[str(label_id)]
130
+ else:
131
+ label_tag='O'
132
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
133
+ fout.write('\n')
134
+ return fout.getvalue()
135
+ def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
136
+ fout=io.StringIO()
137
+ for i in range(len(raw_input)):
138
+
139
+ for j in range(len(raw_input[i])):
140
+ if j<len(raw_pre[i]):
141
+ #label_id = raw_pre[i][j]
142
+ label_id = np.argmax(raw_pre[i][j])
143
+ label_tag = label_set[str(label_id)]
144
+ else:
145
+ label_tag='O'
146
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\n')
147
+ fout.write('\n')
148
+ return fout.getvalue()
149
+ def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
150
+ fout=io.StringIO()
151
+ for i in range(len(raw_input)):
152
+
153
+ for j in range(len(raw_input[i])):
154
+ if j<len(raw_pre[i]):
155
+ #label_id = raw_pre[i][j]
156
+ label_id = np.argmax(raw_pre[i][j])
157
+ label_score = round(raw_pre[i][j][label_id],4)
158
+ label_tag = label_set[str(label_id)]
159
+ else:
160
+ label_tag='O'
161
+ label_score = 0.0
162
+ fout.write(raw_input[i][j][0]+'\t'+raw_input[i][j][-1]+'\t'+label_tag+'\t'+str(label_score)+'\n')
163
+ fout.write('\n')
164
+ return fout.getvalue()
165
+ #generate char vocab
166
+ def char_vocab(infile,outfile_char):
167
+ fin=open(infile,'r',encoding='utf-8')
168
+ #fout=open(outfile,'w',encoding='utf-8')
169
+ fout_char=open(outfile_char,'w',encoding='utf-8')
170
+ char_vocab=['oov_char']
171
+ max_len=0
172
+ for line in fin:
173
+ if line.strip()!='':
174
+ seg=line.split('\t')
175
+ word_len=len(seg[0])
176
+ #if word_len<1000:
177
+ # fout.write(line)
178
+ if word_len>max_len:
179
+ max_len=word_len
180
+ print(seg[0])
181
+ for i in range(word_len):
182
+ if seg[0][i] not in char_vocab:
183
+ char_vocab.append(seg[0][i])
184
+ #else:
185
+ # fout.write(line)
186
+ fin.close()
187
+ #fout.close()
188
+ for ele in char_vocab:
189
+ fout_char.write(ele+'\n')
190
+ fout_char.close()
191
+ print('max_len:',max_len)
192
+
193
+
194
+ if __name__=='__main__':
195
+ # infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
196
+ # #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
197
+ # outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
198
+ # #processing_text(file)
199
+ # char_vocab(infile,outfile_char)
200
+ a=[1,2,3]
201
+ print(a[:-1])
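To make the four-column format concrete, a toy run of out_BIO_BERT_softmax_score_fn above (illustrative only; the labels and scores are made up): each output line is token, gold label, predicted label, and the softmax probability of the predicted label, tab-separated.

import numpy as np
from src_python.SpeAss.processing_data_sa import out_BIO_BERT_softmax_score_fn

label_set = {'0': 'O', '1': 'ARG2'}
raw_input = [[['Human', 'O'], ['p53', 'ARG2']]]     # one sentence of [token, gold] pairs
raw_pre = [np.array([[0.98, 0.02], [0.10, 0.90]])]  # per-token softmax scores
print(out_BIO_BERT_softmax_score_fn(raw_pre, raw_input, label_set))
# Human	O	O	0.98
# p53	ARG2	ARG2	0.9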