Linhz committed on
Commit
d6c2c07
1 Parent(s): 4c741a4

Upload 34 files

Browse files
Files changed (35) hide show
  1. .gitattributes +3 -0
  2. VnCoreNLP/LICENSE.md +14 -0
  3. VnCoreNLP/Readme.md +136 -0
  4. VnCoreNLP/TagsetDescription.md +67 -0
  5. VnCoreNLP/VLSP2013_POS_tagset.pdf +0 -0
  6. VnCoreNLP/VnCoreNLP-1.1.1.jar +3 -0
  7. VnCoreNLP/VnCoreNLP-1.2.jar +3 -0
  8. VnCoreNLP/VnDT-treebank-description.pdf +0 -0
  9. VnCoreNLP/models/dep/vi-dep.xz +3 -0
  10. VnCoreNLP/models/ner/vi-500brownclusters.xz +3 -0
  11. VnCoreNLP/models/ner/vi-ner.xz +3 -0
  12. VnCoreNLP/models/ner/vi-pretrainedembeddings.xz +3 -0
  13. VnCoreNLP/models/postagger/vi-tagger +3 -0
  14. VnCoreNLP/models/wordsegmenter/vi-vocab +0 -0
  15. VnCoreNLP/models/wordsegmenter/wordsegmenter.rdr +0 -0
  16. VnCoreNLP/pom.xml +103 -0
  17. VnCoreNLP/src/main/java/vn/corenlp/ner/NerRecognizer.java +85 -0
  18. VnCoreNLP/src/main/java/vn/corenlp/parser/DependencyParser.java +74 -0
  19. VnCoreNLP/src/main/java/vn/corenlp/postagger/PosTagger.java +65 -0
  20. VnCoreNLP/src/main/java/vn/corenlp/tokenizer/StringUtils.java +207 -0
  21. VnCoreNLP/src/main/java/vn/corenlp/tokenizer/Tokenizer.java +397 -0
  22. VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/FWObject.java +30 -0
  23. VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/Node.java +85 -0
  24. VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/Utils.java +126 -0
  25. VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/Vocabulary.java +1605 -0
  26. VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/WordSegmenter.java +245 -0
  27. VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/WordTag.java +13 -0
  28. VnCoreNLP/src/main/java/vn/pipeline/Annotation.java +147 -0
  29. VnCoreNLP/src/main/java/vn/pipeline/LexicalInitializer.java +82 -0
  30. VnCoreNLP/src/main/java/vn/pipeline/Sentence.java +110 -0
  31. VnCoreNLP/src/main/java/vn/pipeline/Utils.java +31 -0
  32. VnCoreNLP/src/main/java/vn/pipeline/VnCoreNLP.java +134 -0
  33. VnCoreNLP/src/main/java/vn/pipeline/Word.java +111 -0
  34. VnCoreNLP/src/main/resources/log4j.properties +10 -0
  35. VnCoreNLP/src/test/java/VnCoreNLPExample.java +30 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ VnCoreNLP/models/postagger/vi-tagger filter=lfs diff=lfs merge=lfs -text
37
+ VnCoreNLP/VnCoreNLP-1.1.1.jar filter=lfs diff=lfs merge=lfs -text
38
+ VnCoreNLP/VnCoreNLP-1.2.jar filter=lfs diff=lfs merge=lfs -text
VnCoreNLP/LICENSE.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (C) 2018-2019 vncorenlp
2
+
3
+ This program is free software: you can redistribute it and/or modify
4
+ it under the terms of the GNU General Public License as published by
5
+ the Free Software Foundation, either version 3 of the License, or
6
+ (at your option) any later version.
7
+
8
+ This program is distributed in the hope that it will be useful,
9
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
VnCoreNLP/Readme.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### Table of contents
2
+ 1. [Introduction](#introduction)
3
+ 2. [Installation](#install)
4
+ 3. [Usage for Python users](#python)
5
+ 4. [Usage for Java users](#java)
6
+ 5. [Experimental results](#exp)
7
+
8
+ # VnCoreNLP: A Vietnamese natural language processing toolkit <a name="introduction"></a>
9
+
10
+ VnCoreNLP is a **fast and accurate** NLP annotation pipeline for Vietnamese, providing rich linguistic annotations through key NLP components of **word segmentation**, **POS tagging**, **named entity recognition** (NER) and **dependency parsing**. Users do not have to install external dependencies. Users can run processing pipelines from either the command-line or the API. The general architecture and experimental results of VnCoreNLP can be found in the following related papers:
11
+
12
+ 1. Thanh Vu, Dat Quoc Nguyen, Dai Quoc Nguyen, Mark Dras and Mark Johnson. **2018**. [VnCoreNLP: A Vietnamese Natural Language Processing Toolkit](http://aclweb.org/anthology/N18-5012). In *Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Demonstrations*, [NAACL 2018](http://naacl2018.org), pages 56-60. [[.bib]](http://aclweb.org/anthology/N18-5012.bib)
13
+ 2. Dat Quoc Nguyen, Dai Quoc Nguyen, Thanh Vu, Mark Dras and Mark Johnson. **2018**. [A Fast and Accurate Vietnamese Word Segmenter](http://www.lrec-conf.org/proceedings/lrec2018/summaries/55.html). In *Proceedings of the 11th International Conference on Language Resources and Evaluation*, [LREC 2018](http://lrec2018.lrec-conf.org/en/), pages 2582-2587. [[.bib]](https://dblp.uni-trier.de/rec/bibtex/conf/lrec/NguyenNVDJ18)
14
+ 3. Dat Quoc Nguyen, Thanh Vu, Dai Quoc Nguyen, Mark Dras and Mark Johnson. **2017**. [From Word Segmentation to POS Tagging for Vietnamese](http://aclweb.org/anthology/U17-1013). In *Proceedings of the 15th Annual Workshop of the Australasian Language Technology Association*, [ALTA 2017](http://alta2017.alta.asn.au), pages 108-113. [[.bib]](http://aclweb.org/anthology/U17-1013.bib)
15
+
16
+ Please **CITE** paper [1] whenever VnCoreNLP is used to produce published results or incorporated into other software. If you are dealing in depth with either word segmentation or POS tagging, you are also encouraged to cite paper [2] or [3], respectively.
17
+
18
+ If you are looking for light-weight versions, VnCoreNLP's word segmentation and POS tagging components have also been released as independent packages [RDRsegmenter](https://github.com/datquocnguyen/RDRsegmenter) [2] and [VnMarMoT](https://github.com/datquocnguyen/VnMarMoT) [3], respectively.
19
+
20
+
21
+ ## Installation <a name="install"></a>
22
+
23
+ - `Java 1.8+` (Prerequisite)
24
+ - File `VnCoreNLP-1.2.jar` (27MB) and folder `models` (115MB) are placed in the same working folder.
25
+ - `Python 3.6+` if using [a Python wrapper of VnCoreNLP](https://github.com/thelinhbkhn2014/VnCoreNLP_Wrapper). To install this wrapper, users have to run the following command:
26
+
27
+ `$ pip3 install py_vncorenlp`
28
+
29
+ _A special thanks goes to [Linh The Nguyen](https://github.com/thelinhbkhn2014) for creating this wrapper!_
30
+
31
+
32
+ ## Usage for Python users <a name="python"></a>
33
+
34
+ ```python
35
+ import py_vncorenlp
36
+
37
+ # Automatically download VnCoreNLP components from the original repository
38
+ # and save them in some local working folder
39
+ py_vncorenlp.download_model(save_dir='/absolute/path/to/vncorenlp')
40
+
41
+ # Load VnCoreNLP from the local working folder that contains both `VnCoreNLP-1.2.jar` and `models`
42
+ model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp')
43
+ # Equivalent to: model = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos", "ner", "parse"], save_dir='/absolute/path/to/vncorenlp')
44
+
45
+ # Annotate a raw corpus
46
+ model.annotate_file(input_file="/absolute/path/to/input/file", output_file="/absolute/path/to/output/file")
47
+
48
+ # Annotate a raw text
49
+ model.print_out(model.annotate_text("Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."))
50
+ ```
51
+
52
+ By default, the output is formatted with 6 columns representing word index, word form, POS tag, NER label, head index of the current word and its dependency relation type:
53
+
54
+ ```
55
+ 1 Ông Nc O 4 sub
56
+ 2 Nguyễn_Khắc_Chúc Np B-PER 1 nmod
57
+ 3 đang R O 4 adv
58
+ 4 làm_việc V O 0 root
59
+ 5 tại E O 4 loc
60
+ 6 Đại_học N B-ORG 5 pob
61
+ ...
62
+ ```
63
+
64
+ For users who use VnCoreNLP only for word segmentation:
65
+
66
+ ```python
67
+ rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/absolute/path/to/vncorenlp')
68
+ text = "Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."
69
+ output = rdrsegmenter.word_segment(text)
70
+ print(output)
71
+ # ['Ông Nguyễn_Khắc_Chúc đang làm_việc tại Đại_học Quốc_gia Hà_Nội .', 'Bà Lan , vợ ông Chúc , cũng làm_việc tại đây .']
72
+ ```
73
+
74
+
75
+
76
+ ## Usage for Java users <a name="java"></a>
77
+
78
+ ### Using VnCoreNLP from the command line
79
+
80
+ You can run VnCoreNLP to annotate an input raw text corpus (e.g. a collection of news content) by using the following commands:
81
+
82
+ // To perform word segmentation, POS tagging, NER and then dependency parsing
83
+ $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt
84
+ // To perform word segmentation, POS tagging and then NER
85
+ $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt -annotators wseg,pos,ner
86
+ // To perform word segmentation and then POS tagging
87
+ $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt -annotators wseg,pos
88
+ // To perform word segmentation
89
+ $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt -annotators wseg
90
+
91
+
92
+ ### Using VnCoreNLP from the API
93
+
94
+ The following code is a simple and complete example:
95
+
96
+ ```java
97
+ import vn.pipeline.*;
98
+ import java.io.*;
99
+ public class VnCoreNLPExample {
100
+ public static void main(String[] args) throws IOException {
101
+
102
+ // "wseg", "pos", "ner", and "parse" refer to word segmentation, POS tagging, NER and dependency parsing, respectively.
103
+ String[] annotators = {"wseg", "pos", "ner", "parse"};
104
+ VnCoreNLP pipeline = new VnCoreNLP(annotators);
105
+
106
+ String str = "Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây.";
107
+
108
+ Annotation annotation = new Annotation(str);
109
+ pipeline.annotate(annotation);
110
+
111
+ System.out.println(annotation.toString());
112
+ // 1 Ông Nc O 4 sub
113
+ // 2 Nguyễn_Khắc_Chúc Np B-PER 1 nmod
114
+ // 3 đang R O 4 adv
115
+ // 4 làm_việc V O 0 root
116
+ // ...
117
+
118
+ //Write to file
119
+ PrintStream outputPrinter = new PrintStream("output.txt");
120
+ pipeline.printToFile(annotation, outputPrinter);
121
+
122
+ // You can also get a single sentence to analyze individually
123
+ Sentence firstSentence = annotation.getSentences().get(0);
124
+ System.out.println(firstSentence.toString());
125
+ }
126
+ }
127
+ ```
128
+
129
+ <img width="1039" alt="vncorenlpexample" src="https://user-images.githubusercontent.com/33695776/37561346-aca1fd68-2aa0-11e8-8bd8-530577b0b5cf.png">
130
+
131
+ See VnCoreNLP's open-source code in folder `src` for API details.
132
+
133
+ ## Experimental results <a name="exp"></a>
134
+
135
+ See details in papers [1,2,3] above or at [NLP-progress](http://nlpprogress.com/vietnamese/vietnamese.html).
136
+
VnCoreNLP/TagsetDescription.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## POS tags, NER types and dependency labels in VnCoreNLP
2
+
3
+ The following sections briefly describe [POS tags](https://github.com/vncorenlp/VnCoreNLP/blob/master/VLSP2013_POS_tagset.pdf), [NER types](http://vlsp.org.vn/vlsp2016/eval/ner) and [dependency labels](https://github.com/vncorenlp/VnCoreNLP/blob/master/VnDT-treebank-description.pdf) used in VnCoreNLP. See details in [Link-to-POS-tag-description](https://github.com/vncorenlp/VnCoreNLP/blob/master/VLSP2013_POS_tagset.pdf), [Link-to-NER-type-description](http://vlsp.org.vn/vlsp2016/eval/ner) and [Link-to-dependency-label-description](https://github.com/vncorenlp/VnCoreNLP/blob/master/VnDT-treebank-description.pdf).
4
+
5
+ ### POS tags
6
+
7
+ |Label| Meaning |
8
+ |---|---|
9
+ | Np | Proper noun |
10
+ | Nc | Classifier noun |
11
+ | Nu | Unit noun |
12
+ | N | Noun |
13
+ | Ny | Abbreviated noun |
14
+ | Nb | (Foreign) borrowed noun|
15
+ | V | Verb|
16
+ |Vb |(Foreign) borrowed verb|
17
+ |A| Adjective|
18
+ |P| Pronoun|
19
+ |R |Adverb|
20
+ |L| Determiner|
21
+ |M |Numeral/Quantity|
22
+ |E |Preposition|
23
+ |C |Subordinating conjunction|
24
+ |Cc |Coordinating conjunction|
25
+ |I |Interjection/Exclamation|
26
+ |T |Particle/Auxiliary, modal words|
27
+ |Y |Abbreviation|
28
+ |Z |Bound morpheme|
29
+ |X |Undefined/Other|
30
+ |CH |Punctuation and symbols|
31
+
32
+ ### NER types
33
+
34
+ |Label| Meaning |
35
+ |---|---|
36
+ | PER | Names of persons |
37
+ | LOC | Names of locations |
38
+ | ORG| Names of organizations|
39
+ | MISC|Names of miscellaneous entities|
40
+
41
+ ### Top 21 most frequent dependency labels
42
+
43
+ The following labels have an appearance rate of at least 0.2%:
44
+
45
+ |Label| Meaning |
46
+ |---|---|
47
+ |adv|Adverbial |
48
+ |amod| Adjective modifier |
49
+ |conj| Conjunction |
50
+ |coord| Coordination |
51
+ |dep| Default label |
52
+ |det| Determiner |
53
+ |dir| Direction |
54
+ |dob| Direct object |
55
+ |iob| Indirect object |
56
+ |loc| Location |
57
+ |mnr| Manner |
58
+ |nmod| Noun modifier |
59
+ |pmod| Prepositional modifier |
60
+ |pob| Object of a preposition |
61
+ |prd| Predicate |
62
+ |prp| Purpose |
63
+ |punct| Punctuation |
64
+ |root| Root |
65
+ |sub| Subject |
66
+ |tmp|Temporal|
67
+ |vmod| Verb modifier |
VnCoreNLP/VLSP2013_POS_tagset.pdf ADDED
Binary file (130 kB). View file
 
VnCoreNLP/VnCoreNLP-1.1.1.jar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c356b2baa0b83a287642b29d5c2ec5e9558c84d1c937f0aa88a5eea8748e587e
3
+ size 27412575
VnCoreNLP/VnCoreNLP-1.2.jar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e2811cdbc2ddfc71d04be5dc36e185c88dcd1ad4d5d69e4ff2e1369dccf7793
3
+ size 27412703
VnCoreNLP/VnDT-treebank-description.pdf ADDED
Binary file (251 kB). View file
 
VnCoreNLP/models/dep/vi-dep.xz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266e4a3a55d5edd1607d5f036c2f95b70c0a6c80f58b57fd9962677a6ef331b7
3
+ size 16048864
VnCoreNLP/models/ner/vi-500brownclusters.xz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d30f9cfdf0af193a69e185d1acda0306a9fbe1321f8a700f7c66557a90f92b8c
3
+ size 5599844
VnCoreNLP/models/ner/vi-ner.xz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f04c5e752d7f99a6313b758fc2607a2c3906e58b1d60a37eb0192aead73d61f7
3
+ size 9956876
VnCoreNLP/models/ner/vi-pretrainedembeddings.xz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d3d034f1b23a8bfe5168195741fde845808c212e6dfcd4c94bead1665eb0fc
3
+ size 57313672
VnCoreNLP/models/postagger/vi-tagger ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a95608a5982db89c11353b451154ec396eccc0ff1f5b22874935ecdf4e0ace01
3
+ size 29709468
VnCoreNLP/models/wordsegmenter/vi-vocab ADDED
Binary file (527 kB). View file
 
VnCoreNLP/models/wordsegmenter/wordsegmenter.rdr ADDED
The diff for this file is too large to render. See raw diff
 
VnCoreNLP/pom.xml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project xmlns="http://maven.apache.org/POM/4.0.0"
3
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5
+ <modelVersion>4.0.0</modelVersion>
6
+
7
+ <groupId>VnCoreNLP</groupId>
8
+ <artifactId>VnCoreNLP</artifactId>
9
+ <version>1.2</version>
10
+ <build>
11
+ <plugins>
12
+ <plugin>
13
+ <groupId>org.apache.maven.plugins</groupId>
14
+ <artifactId>maven-compiler-plugin</artifactId>
15
+ <version>3.3</version>
16
+ <configuration>
17
+ <source>1.8</source>
18
+ <target>1.8</target>
19
+ </configuration>
20
+ </plugin>
21
+ <plugin>
22
+ <groupId>org.apache.maven.plugins</groupId>
23
+ <artifactId>maven-shade-plugin</artifactId>
24
+ <version>3.1.0</version>
25
+ <executions>
26
+ <execution>
27
+ <phase>package</phase>
28
+ <goals>
29
+ <goal>shade</goal>
30
+ </goals>
31
+ <configuration>
32
+ <shadedArtifactAttached>false</shadedArtifactAttached>
33
+ <transformers>
34
+ <!-- add Main-Class to manifest file -->
35
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
36
+ <mainClass>vn.pipeline.VnCoreNLP</mainClass>
37
+ </transformer>
38
+ </transformers>
39
+ </configuration>
40
+ </execution>
41
+ </executions>
42
+ </plugin>
43
+ </plugins>
44
+ </build>
45
+
46
+ <properties>
47
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
48
+ </properties>
49
+
50
+ <dependencies>
51
+
52
+ <dependency>
53
+ <groupId>com.optimaize.languagedetector</groupId>
54
+ <artifactId>language-detector</artifactId>
55
+ <version>0.6</version>
56
+ </dependency>
57
+
58
+ <dependency>
59
+ <groupId>vncorenlp</groupId>
60
+ <artifactId>marmot</artifactId>
61
+ <version>1.0</version>
62
+ </dependency>
63
+
64
+ <dependency>
65
+ <groupId>edu.emory.mathcs.nlp</groupId>
66
+ <artifactId>nlp4j-api</artifactId>
67
+ <version>1.1.3</version>
68
+ </dependency>
69
+
70
+ <dependency>
71
+ <groupId>log4j</groupId>
72
+ <artifactId>log4j</artifactId>
73
+ <version>1.2.17</version>
74
+ </dependency>
75
+
76
+ <dependency>
77
+ <groupId>org.slf4j</groupId>
78
+ <artifactId>slf4j-log4j12</artifactId>
79
+ <version>1.7.5</version>
80
+ </dependency>
81
+
82
+ <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
83
+ <dependency>
84
+ <groupId>org.slf4j</groupId>
85
+ <artifactId>slf4j-api</artifactId>
86
+ <version>1.7.25</version>
87
+ </dependency>
88
+
89
+ <!--mvn install:install-file -Dfile=lib/marmot.jar -DgroupId=vncorenlp -DartifactId=marmot -Dversion=1.0 -Dpackaging=jar-->
90
+ </dependencies>
91
+ <repositories>
92
+ <repository>
93
+ <id>vncorenlp</id>
94
+ <name>vncorenlp thirdparty repo</name>
95
+ <url>https://github.com/vncorenlp/thirdparty/raw/repository/</url>
96
+ <snapshots>
97
+ <enabled>true</enabled>
98
+ <updatePolicy>always</updatePolicy>
99
+ </snapshots>
100
+ </repository>
101
+ </repositories>
102
+
103
+ </project>
VnCoreNLP/src/main/java/vn/corenlp/ner/NerRecognizer.java ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.ner;
2
+
3
+ import edu.emory.mathcs.nlp.common.util.NLPUtils;
4
+ import edu.emory.mathcs.nlp.component.template.NLPComponent;
5
+
6
+ import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica;
7
+ import edu.emory.mathcs.nlp.component.template.node.FeatMap;
8
+ import edu.emory.mathcs.nlp.component.template.node.NLPNode;
9
+ import edu.emory.mathcs.nlp.decode.NLPDecoder;
10
+ import org.apache.log4j.Logger;
11
+ import vn.corenlp.wordsegmenter.Vocabulary;
12
+ import vn.pipeline.LexicalInitializer;
13
+ import vn.pipeline.Word;
14
+ import vn.pipeline.Utils;
15
+
16
+ import java.io.File;
17
+ import java.io.IOException;
18
+ import java.util.ArrayList;
19
+ import java.util.List;
20
+
21
+ public class NerRecognizer {
22
+ private NLPDecoder nlpDecoder ;
23
+ public final static Logger LOGGER = Logger.getLogger(NerRecognizer.class);
24
+ private static NerRecognizer nerRecognizer;
25
+ public static NerRecognizer initialize() throws IOException{
26
+ if(nerRecognizer == null) {
27
+ nerRecognizer = new NerRecognizer();
28
+ }
29
+ return nerRecognizer;
30
+ }
31
+
32
+
33
+ public NerRecognizer() throws IOException{
34
+ LOGGER.info("Loading NER model");
35
+ nlpDecoder = new NLPDecoder();
36
+ List<NLPComponent<NLPNode>> components = new ArrayList();
37
+
38
+ String modelPath = Utils.jarDir + "/models/ner/vi-ner.xz";
39
+ if (!new File(modelPath).exists()) throw new IOException("NerRecognizer: " + modelPath + " is not found!");
40
+ GlobalLexica lexica = LexicalInitializer.initialize(true).initializeLexica();
41
+ if(lexica != null) {
42
+ components.add(lexica);
43
+ }
44
+ components.add(NLPUtils.getComponent(modelPath));
45
+ nlpDecoder.setComponents(components);
46
+
47
+ }
48
+
49
+
50
+ public void tagSentence(List<Word> sentenceWords) {
51
+ NLPNode[] decodedNodes = nlpDecoder.decode(toNodeArray(sentenceWords));
52
+ for(int i = 0; i < sentenceWords.size(); i++) {
53
+ Word word = sentenceWords.get(i);
54
+ word.setNerLabel(decodedNodes[i + 1].getNamedEntityTag().replace("U-", "B-").replace("L-", "I-"));
55
+ }
56
+ }
57
+
58
+ private NLPNode[] toNodeArray(List<Word> sentenceWords) {
59
+ NLPNode[] nlpNodes = new NLPNode[sentenceWords.size() + 1];
60
+ nlpNodes[0] = new NLPNode();
61
+ for(int i = 0; i < sentenceWords.size(); i++) {
62
+ Word word = sentenceWords.get(i);
63
+ nlpNodes[i + 1] = new NLPNode(word.getIndex(), word.getForm(), word.getForm(), addLabelForPOSTag(word), new FeatMap());
64
+
65
+ }
66
+ return nlpNodes;
67
+ }
68
+
69
+ public String addLabelForPOSTag(Word word) {
70
+ String[] tokens = word.getForm().split("_");
71
+ String output = word.getPosTag();
72
+ if (word.getPosTag() != null && word.getPosTag().equals("Np")) {
73
+ if (Vocabulary.VN_FAMILY_NAMES.contains(tokens[0].toLowerCase())
74
+ || (tokens.length > 1 && Vocabulary.VN_MIDDLE_NAMES.contains(tokens[1].toLowerCase())))
75
+ output = word.getPosTag() + "-1";
76
+ else output = word.getPosTag() + "-0";
77
+ }
78
+ return output;
79
+ }
80
+
81
+ public static void main(String[] args) {
82
+
83
+
84
+ }
85
+ }
VnCoreNLP/src/main/java/vn/corenlp/parser/DependencyParser.java ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.parser;
2
+
3
+ import edu.emory.mathcs.nlp.common.util.NLPUtils;
4
+ import edu.emory.mathcs.nlp.component.template.NLPComponent;
5
+
6
+ import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica;
7
+ import edu.emory.mathcs.nlp.component.template.node.FeatMap;
8
+ import edu.emory.mathcs.nlp.component.template.node.NLPNode;
9
+ import edu.emory.mathcs.nlp.decode.NLPDecoder;
10
+ import org.apache.log4j.Logger;
11
+ import vn.pipeline.LexicalInitializer;
12
+ import vn.pipeline.Word;
13
+ import vn.pipeline.Utils;
14
+
15
+ import java.io.File;
16
+ import java.io.IOException;
17
+ import java.util.ArrayList;
18
+ import java.util.List;
19
+
20
+ public class DependencyParser {
21
+ private NLPDecoder nlpDecoder ;
22
+ public final static Logger LOGGER = Logger.getLogger(DependencyParser.class);
23
+ private static DependencyParser dependencyParser;
24
+ public static DependencyParser initialize() throws IOException {
25
+ if(dependencyParser == null) {
26
+ dependencyParser = new DependencyParser();
27
+ }
28
+ return dependencyParser;
29
+ }
30
+
31
+ public DependencyParser() throws IOException {
32
+ LOGGER.info("Loading Dependency Parsing model");
33
+ nlpDecoder = new NLPDecoder();
34
+ List<NLPComponent<NLPNode>> components = new ArrayList();
35
+
36
+ String modelPath = Utils.jarDir + "/models/dep/vi-dep.xz";
37
+ if (!new File(modelPath).exists()) throw new IOException("DependencyParser: " + modelPath + " is not found!");
38
+ GlobalLexica lexica = LexicalInitializer.initialize(true).initializeLexica();
39
+ if(lexica != null) {
40
+ components.add(lexica);
41
+ }
42
+ components.add(NLPUtils.getComponent(modelPath));
43
+ nlpDecoder.setComponents(components);
44
+
45
+ }
46
+
47
+ public void tagSentence(List<Word> sentenceWords) {
48
+ NLPNode[] decodedNodes = nlpDecoder.decode(toNodeArray(sentenceWords));
49
+ for(int i = 0; i < sentenceWords.size(); i++) {
50
+ Word word = sentenceWords.get(i);
51
+ word.setHead(decodedNodes[i + 1].getDependencyHead().getID());
52
+ word.setDepLabel(decodedNodes[i + 1].getDependencyLabel());
53
+ if(word.getPosTag() != null && word.getPosTag().equals("CH")) word.setDepLabel("punct");
54
+ }
55
+ }
56
+
57
+ private NLPNode[] toNodeArray(List<Word> sentenceWords) {
58
+ NLPNode[] nlpNodes = new NLPNode[sentenceWords.size() + 1];
59
+ nlpNodes[0] = new NLPNode();
60
+ for(int i = 0; i < sentenceWords.size(); i++) {
61
+ Word word = sentenceWords.get(i);
62
+ //int id, String form, String lemma, String posTag, FeatMap feats
63
+ nlpNodes[i + 1] = new NLPNode(word.getIndex(), word.getForm(), word.getForm(),
64
+ word.getPosTag(), new FeatMap());
65
+
66
+ }
67
+ return nlpNodes;
68
+ }
69
+
70
+ public static void main(String[] args) {
71
+
72
+
73
+ }
74
+ }
VnCoreNLP/src/main/java/vn/corenlp/postagger/PosTagger.java ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.postagger;
2
+
3
+ import marmot.morph.MorphTagger;
4
+ import marmot.morph.Sentence;
5
+ import marmot.morph.Word;
6
+
7
+ import marmot.util.FileUtils;
8
+ import org.apache.log4j.Logger;
9
+
10
+ import java.io.File;
11
+ import java.io.IOException;
12
+ import java.util.ArrayList;
13
+ import java.util.LinkedList;
14
+ import java.util.List;
15
+
16
+ import vn.pipeline.Utils;
17
+
18
+ public class PosTagger {
19
+ private static PosTagger posTagger = null;
20
+ private MorphTagger tagger;
21
+ public final static Logger LOGGER = Logger.getLogger(PosTagger.class);
22
+ public PosTagger() throws IOException {
23
+ LOGGER.info("Loading POS Tagging model");
24
+ String modelPath = Utils.jarDir + "/models/postagger/vi-tagger";
25
+ if (!new File(modelPath).exists()) throw new IOException("PosTagger: " + modelPath + " is not found!");
26
+ tagger = FileUtils.loadFromFile(modelPath);
27
+
28
+ }
29
+
30
+ public static PosTagger initialize() throws IOException {
31
+ if(posTagger == null) {
32
+ posTagger = new PosTagger();
33
+ }
34
+ return posTagger;
35
+ }
36
+
37
+ public List<vn.pipeline.Word> tagSentence(String sentence) throws IOException {
38
+ List<vn.pipeline.Word> output = new ArrayList<>();
39
+ String line = sentence.trim();
40
+ if (line.length() == 0) {
41
+ return output;
42
+ }
43
+ String[] tokenstrs = line.split(" ");
44
+ LinkedList tokens = new LinkedList();
45
+
46
+ for(int i = 0; i < tokenstrs.length; ++i) {
47
+ if (!tokenstrs[i].isEmpty()) {
48
+ Word word = new Word(tokenstrs[i]);
49
+ tokens.add(word);
50
+ }
51
+ }
52
+
53
+ Sentence marmotSentence = new Sentence(tokens);
54
+ Object lemma_tags = tagger.tagWithLemma(marmotSentence);
55
+ for(int i = 0; i < marmotSentence.size(); ++i) {
56
+ List<String> token_lemma_tags = (List)((List)lemma_tags).get(i);
57
+ vn.pipeline.Word word = new vn.pipeline.Word((i + 1), marmotSentence.getWord(i).getWordForm(), (String)token_lemma_tags.get(1));
58
+ output.add(word);
59
+
60
+ }
61
+ return output;
62
+ }
63
+
64
+
65
+ }
VnCoreNLP/src/main/java/vn/corenlp/tokenizer/StringUtils.java ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.tokenizer;
2
+
3
+ import java.util.HashSet;
4
+ import java.util.regex.Matcher;
5
+ import java.util.regex.Pattern;
6
+
7
+ public class StringUtils
8
+ {
9
+
10
+ public static void testFoundByRegex(String s, String regex)
11
+ {
12
+ System.out.println("Test string: " + s);
13
+
14
+ Pattern pattern = Pattern.compile(regex);
15
+ Matcher matcher = pattern.matcher(s);
16
+ if (matcher.find()) {
17
+ System.out.println(s.substring(0, matcher.start()));
18
+ System.out.println(s.substring(matcher.start(), matcher.end()));
19
+ System.out.println(s.substring(matcher.end()));
20
+ }
21
+ }
22
+
23
+ public static String char2Hex(Character c)
24
+ {
25
+ return String.format("\\u%04x", (int) c);
26
+ }
27
+
28
+ public static Character hex2Char(String hex)
29
+ {
30
+ int hexToInt = Integer.parseInt(hex.substring(2), 16);
31
+ return (char) hexToInt;
32
+ }
33
+
34
+ public static boolean hasPunctuation(String s)
35
+ {
36
+ for (int i = 0; i < s.length(); i++) {
37
+ if (!Character.isLetterOrDigit(s.charAt(i)))
38
+ return true;
39
+ }
40
+
41
+ return false;
42
+ }
43
+
44
+ public static boolean isPunctuation(String s)
45
+ {
46
+ for (int i = 0; i < s.length(); i++) {
47
+ if (Character.isLetterOrDigit(s.charAt(i)))
48
+ return false;
49
+ }
50
+
51
+ return true;
52
+ }
53
+
54
+ public static boolean isNumeric(String s) {
55
+ return s != null && s.matches("[-+]?\\d*\\.?\\d+");
56
+ }
57
+
58
+ // Modified by Dat Quoc Nguyen
59
+ public static boolean isBrace(String string)
60
+ {
61
+ if (string.equals("”") || string.equals("�") || string.equals("'") || string.equals(")")
62
+ || string.equals("}") || string.equals("]")) {
63
+ return true;
64
+ }
65
+ return false;
66
+ }
67
+
68
+ public static HashSet<String> VN_abbreviation;
69
+ public static HashSet<String> VN_exception;
70
+ static {
71
+ VN_abbreviation = new HashSet<String>();
72
+ VN_exception = new HashSet<String>();
73
+
74
+ VN_abbreviation.add("M.City");
75
+ VN_abbreviation.add("V.I.P");
76
+ VN_abbreviation.add("PGS.Ts");
77
+ VN_abbreviation.add("MRS.");
78
+ VN_abbreviation.add("Mrs.");
79
+ VN_abbreviation.add("Man.United");
80
+ VN_abbreviation.add("Mr.");
81
+ VN_abbreviation.add("SHB.ĐN");
82
+ VN_abbreviation.add("Gs.Bs");
83
+ VN_abbreviation.add("U.S.A");
84
+ VN_abbreviation.add("TMN.CSG");
85
+ VN_abbreviation.add("Kts.Ts");
86
+ VN_abbreviation.add("R.Madrid");
87
+ VN_abbreviation.add("Tp.");
88
+ VN_abbreviation.add("T.Ư");
89
+ VN_abbreviation.add("D.C");
90
+ VN_abbreviation.add("Gs.Tskh");
91
+ VN_abbreviation.add("PGS.KTS");
92
+ VN_abbreviation.add("GS.BS");
93
+ VN_abbreviation.add("KTS.TS");
94
+ VN_abbreviation.add("PGS-TS");
95
+ VN_abbreviation.add("Co.");
96
+ VN_abbreviation.add("S.H.E");
97
+ VN_abbreviation.add("Ths.Bs");
98
+ VN_abbreviation.add("T&T.HN");
99
+ VN_abbreviation.add("MR.");
100
+ VN_abbreviation.add("Ms.");
101
+ VN_abbreviation.add("T.T.P");
102
+ VN_abbreviation.add("TT.");
103
+ VN_abbreviation.add("TP.");
104
+ VN_abbreviation.add("ĐH.QGHN");
105
+ VN_abbreviation.add("Gs.Kts");
106
+ VN_abbreviation.add("Man.Utd");
107
+ VN_abbreviation.add("GD-ĐT");
108
+ VN_abbreviation.add("T.W");
109
+ VN_abbreviation.add("Corp.");
110
+ VN_abbreviation.add("ĐT.LA");
111
+ VN_abbreviation.add("Dr.");
112
+ VN_abbreviation.add("T&T");
113
+ VN_abbreviation.add("HN.ACB");
114
+ VN_abbreviation.add("GS.KTS");
115
+ VN_abbreviation.add("MS.");
116
+ VN_abbreviation.add("Prof.");
117
+ VN_abbreviation.add("GS.TS");
118
+ VN_abbreviation.add("PGs.Ts");
119
+ VN_abbreviation.add("PGS.BS");
120
+ VN_abbreviation.add("BT.");
121
+ VN_abbreviation.add("Ltd.");
122
+ VN_abbreviation.add("ThS.BS");
123
+ VN_abbreviation.add("Gs.Ts");
124
+ VN_abbreviation.add("SL.NA");
125
+ //VN_abbreviation.add("P.");
126
+ VN_abbreviation.add("Th.S");
127
+ VN_abbreviation.add("Gs.Vs");
128
+ VN_abbreviation.add("PGs.Bs");
129
+ VN_abbreviation.add("T.O.P");
130
+ VN_abbreviation.add("PGS.TS");
131
+ VN_abbreviation.add("HN.T&T");
132
+ VN_abbreviation.add("SG.XT");
133
+ VN_abbreviation.add("O.T.C");
134
+ VN_abbreviation.add("TS.BS");
135
+ VN_abbreviation.add("Yahoo!");
136
+ VN_abbreviation.add("Man.City");
137
+ VN_abbreviation.add("MISS.");
138
+ VN_abbreviation.add("HA.GL");
139
+ VN_abbreviation.add("GS.Ts");
140
+ VN_abbreviation.add("TBT.");
141
+ VN_abbreviation.add("GS.VS");
142
+ VN_abbreviation.add("GS.TSKH");
143
+ VN_abbreviation.add("Ts.Bs");
144
+ VN_abbreviation.add("M.U");
145
+ VN_abbreviation.add("Gs.TSKH");
146
+ VN_abbreviation.add("U.S");
147
+ VN_abbreviation.add("Miss.");
148
+ VN_abbreviation.add("GD.ĐT");
149
+ VN_abbreviation.add("PGs.Kts");
150
+ //VN_abbreviation.add("Q.");
151
+ VN_abbreviation.add("St.");
152
+ VN_abbreviation.add("Ng.");
153
+ VN_abbreviation.add("Inc.");
154
+ VN_abbreviation.add("Th.");
155
+ VN_abbreviation.add("N.O.V.A");
156
+
157
+ VN_exception.add("Wi-fi");
158
+ VN_exception.add("17+");
159
+ VN_exception.add("km/h");
160
+ VN_exception.add("M7");
161
+ VN_exception.add("M8");
162
+ VN_exception.add("21+");
163
+ VN_exception.add("G3");
164
+ VN_exception.add("M9");
165
+ VN_exception.add("G4");
166
+ VN_exception.add("km3");
167
+ VN_exception.add("m/s");
168
+ VN_exception.add("km2");
169
+ VN_exception.add("5g");
170
+ VN_exception.add("4G");
171
+ VN_exception.add("8K");
172
+ VN_exception.add("3g");
173
+ VN_exception.add("E9");
174
+ VN_exception.add("U21");
175
+ VN_exception.add("4K");
176
+ VN_exception.add("U23");
177
+ VN_exception.add("Z1");
178
+ VN_exception.add("Z2");
179
+ VN_exception.add("Z3");
180
+ VN_exception.add("Z4");
181
+ VN_exception.add("Z5");
182
+ VN_exception.add("Jong-un");
183
+ VN_exception.add("u19");
184
+ VN_exception.add("5s");
185
+ VN_exception.add("wi-fi");
186
+ VN_exception.add("18+");
187
+ VN_exception.add("Wi-Fi");
188
+ VN_exception.add("m2");
189
+ VN_exception.add("16+");
190
+ VN_exception.add("m3");
191
+ VN_exception.add("V-League");
192
+ VN_exception.add("Geun-hye");
193
+ VN_exception.add("5G");
194
+ VN_exception.add("4g");
195
+ VN_exception.add("Z3+");
196
+ VN_exception.add("3G");
197
+ VN_exception.add("km/s");
198
+ VN_exception.add("6+");
199
+ VN_exception.add("u21");
200
+ VN_exception.add("WI-FI");
201
+ VN_exception.add("u23");
202
+ VN_exception.add("U19");
203
+ VN_exception.add("6s");
204
+ VN_exception.add("4s");
205
+ }
206
+
207
+ }
VnCoreNLP/src/main/java/vn/corenlp/tokenizer/Tokenizer.java ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.tokenizer;
2
+
3
+ import java.io.IOException;
4
+ import java.util.ArrayList;
5
+ import java.util.HashSet;
6
+ import java.util.List;
7
+ import java.util.regex.Matcher;
8
+ import java.util.regex.Pattern;
9
+
10
+
11
+ /**
12
+ * This class contains methods used for tokenization step.
13
+ *
14
+ * @author tuanphong94
15
+ * @link https://github.com/phongnt570/UETsegmenter/blob/master/src/vn/edu/vnu/uet/nlp/tokenizer/tokenizer.tokenizer.java
16
+ *
17
+ */
18
+ public class Tokenizer {
19
+ /**
20
+ * @param s
21
+ * @return List of tokens from s
22
+ * @throws IOException
23
+ */
24
+ public static List<String> tokenize(String s) throws IOException {
25
+ if (s == null || s.trim().isEmpty()) {
26
+ return new ArrayList<String>();
27
+ }
28
+
29
+ String[] tempTokens = s.trim().split("\\s+");
30
+ if (tempTokens.length == 0) {
31
+ return new ArrayList<String>();
32
+ }
33
+
34
+ List<String> tokens = new ArrayList<String>();
35
+
36
+ for (String token : tempTokens) {
37
+ if (token.length() == 1 || !StringUtils.hasPunctuation(token)) {
38
+ tokens.add(token);
39
+ continue;
40
+ }
41
+
42
+ if (token.endsWith(",")) {
43
+ tokens.addAll(tokenize(token.substring(0, token.length() - 1)));
44
+ tokens.add(",");
45
+ continue;
46
+ }
47
+
48
+ if (StringUtils.VN_abbreviation.contains(token)) {
49
+ tokens.add(token);
50
+ continue;
51
+ }
52
+
53
+
54
+ if (token.endsWith(".") && Character.isAlphabetic(token.charAt(token.length() - 2))) {
55
+ if ((token.length() == 2 && Character.isUpperCase(token.charAt(token.length() - 2))) || (Pattern.compile(Regex.SHORT_NAME).matcher(token).find())) {
56
+ tokens.add(token);
57
+ continue;
58
+ }
59
+ tokens.addAll(tokenize(token.substring(0, token.length() - 1)));
60
+ tokens.add(".");
61
+ continue;
62
+ }
63
+
64
+ if (StringUtils.VN_exception.contains(token)) {
65
+ tokens.add(token);
66
+ continue;
67
+ }
68
+
69
+ boolean tokenContainsAbb = false;
70
+ for (String e : StringUtils.VN_abbreviation) {
71
+ int i = token.indexOf(e);
72
+ if (i < 0)
73
+ continue;
74
+
75
+ tokenContainsAbb = true;
76
+ tokens = recursive(tokens, token, i, i + e.length());
77
+ break;
78
+ }
79
+ if (tokenContainsAbb)
80
+ continue;
81
+
82
+ boolean tokenContainsExp = false;
83
+ for (String e : StringUtils.VN_exception) {
84
+ int i = token.indexOf(e);
85
+ if (i < 0)
86
+ continue;
87
+
88
+ tokenContainsExp = true;
89
+ tokens = recursive(tokens, token, i, i + e.length());
90
+ break;
91
+ }
92
+ if (tokenContainsExp)
93
+ continue;
94
+
95
+ List<String> regexes = Regex.getRegexList();
96
+
97
+ boolean matching = false;
98
+ for (String regex : regexes) {
99
+ if (token.matches(regex)) {
100
+ tokens.add(token);
101
+ matching = true;
102
+ break;
103
+ }
104
+ }
105
+ if (matching) {
106
+ continue;
107
+ }
108
+
109
+ for (int i = 0; i < regexes.size(); i++) {
110
+ Pattern pattern = Pattern.compile(regexes.get(i));
111
+ Matcher matcher = pattern.matcher(token);
112
+
113
+ if (matcher.find()) {
114
+ if (i == Regex.getRegexIndex("url")) {
115
+ String[] elements = token.split(Pattern.quote("."));
116
+ boolean hasURL = true;
117
+ for (String ele : elements) {
118
+ if (ele.length() == 1 && Character.isUpperCase(ele.charAt(0))) {
119
+ hasURL = false;
120
+ break;
121
+ }
122
+ for (int j = 0; j < ele.length(); j++) {
123
+ if (ele.charAt(j) >= 128) {
124
+ hasURL = false;
125
+ break;
126
+ }
127
+ }
128
+ }
129
+ if (hasURL) {
130
+ tokens = recursive(tokens, token, matcher.start(), matcher.end());
131
+ } else {
132
+ continue;
133
+ }
134
+ }
135
+
136
+ else if (i == Regex.getRegexIndex("month")) {
137
+ int start = matcher.start();
138
+
139
+ boolean hasLetter = false;
140
+
141
+ for (int j = 0; j < start; j++) {
142
+ if (Character.isLetter(token.charAt(j))) {
143
+ tokens = recursive(tokens, token, matcher.start(), matcher.end());
144
+ hasLetter = true;
145
+ break;
146
+ }
147
+ }
148
+
149
+ if (!hasLetter) {
150
+ tokens.add(token);
151
+ }
152
+ }
153
+
154
+ else {
155
+ tokens = recursive(tokens, token, matcher.start(), matcher.end());
156
+ }
157
+
158
+ matching = true;
159
+ break;
160
+ }
161
+ }
162
+
163
+ if (matching)
164
+ continue;
165
+ else
166
+ tokens.add(token);
167
+ }
168
+
169
+ return tokens;
170
+ }
171
+
172
+ private static List<String> recursive(List<String> tokens, String token, int beginMatch, int endMatch)
173
+ throws IOException {
174
+ if (beginMatch > 0)
175
+ tokens.addAll(tokenize(token.substring(0, beginMatch)));
176
+ tokens.addAll(tokenize(token.substring(beginMatch, endMatch)));
177
+
178
+ if (endMatch < token.length())
179
+ tokens.addAll(tokenize(token.substring(endMatch)));
180
+
181
+ return tokens;
182
+ }
183
+
184
+ public static List<String> joinSentences(List<String> tokens) {
185
+ List<String> sentences = new ArrayList<>();
186
+
187
+ List<String> sentence = new ArrayList<>();
188
+ for (int i = 0; i < tokens.size(); i++) {
189
+ String token = tokens.get(i);
190
+ String nextToken = null;
191
+ if (i != tokens.size() - 1) {
192
+ nextToken = tokens.get(i + 1);
193
+ }
194
+ String beforeToken = null;
195
+ if (i > 0) {
196
+ beforeToken = tokens.get(i - 1);
197
+ }
198
+
199
+ sentence.add(token);
200
+
201
+ if (i == tokens.size() - 1) {
202
+ sentences.add(joinSentence(sentence));
203
+ return sentences;
204
+ }
205
+
206
+ if (i < tokens.size() - 2 && token.equals(StringConst.COLON)) {
207
+ if (Character.isDigit(nextToken.charAt(0)) && tokens.get(i + 2).equals(StringConst.STOP)
208
+ || tokens.get(i + 2).equals(StringConst.COMMA)) {
209
+ sentences.add(joinSentence(sentence));
210
+ sentence.clear();
211
+ continue;
212
+ }
213
+ }
214
+
215
+ if (token.matches(Regex.EOS_PUNCTUATION)) {
216
+
217
+ // Added by Dat Quoc Nguyen
218
+ if (nextToken.equals("\"") || nextToken.equals("''")) {
219
+ int count = 0;
220
+ for (String senToken : sentence) {
221
+ if (senToken.equals("\"") || senToken.equals("''"))
222
+ count += 1;
223
+ }
224
+ if (count % 2 == 1)
225
+ continue;
226
+ }
227
+
228
+ // If the current sentence is in the quote or in the brace
229
+ if (StringUtils.isBrace(nextToken) || nextToken.isEmpty() || Character.isLowerCase(nextToken.charAt(0))
230
+ || nextToken.equals(StringConst.COMMA) || Character.isDigit(nextToken.charAt(0))) {
231
+ continue;
232
+ }
233
+
234
+ // Sentence starts with its order number
235
+ if (sentence.size() == 2 && token.equals(StringConst.STOP)) {
236
+ if (Character.isDigit(beforeToken.charAt(0))) {
237
+ continue;
238
+ }
239
+ if (Character.isLowerCase(beforeToken.charAt(0))) {
240
+ continue;
241
+ }
242
+ if (Character.isUpperCase(beforeToken.charAt(0))) {
243
+ if (beforeToken.length() == 1) {
244
+ continue;
245
+ }
246
+ }
247
+ }
248
+
249
+ sentences.add(joinSentence(sentence));
250
+ sentence.clear();
251
+ }
252
+ }
253
+
254
+ return sentences;
255
+ }
256
+
257
+ public static String joinSentence(List<String> tokens) {
258
+ StringBuffer sent = new StringBuffer();
259
+ int length = tokens.size();
260
+ String token;
261
+ for (int i = 0; i < length; i++) {
262
+ token = tokens.get(i);
263
+ if (token.isEmpty() || token == null || token.equals(StringConst.SPACE)) {
264
+ continue;
265
+ }
266
+ sent.append(token);
267
+ if (i < length - 1)
268
+ sent.append(StringConst.SPACE);
269
+ }
270
+ return sent.toString().trim();
271
+ }
272
+ }
273
+
274
/**
 * String constants shared by the tokenizer: sentence boundary markers and the
 * single-character separators it compares tokens against.
 * Interface fields are implicitly {@code public static final}.
 */
interface StringConst
{
    /** Begin-of-sentence marker. */
    String BOS = "<s>";
    /** End-of-sentence marker. */
    String EOS = "</s>";

    String SPACE = " ";
    String COMMA = ",";
    String STOP = ".";
    String COLON = ":";
    String UNDERSCORE = "_";
}
285
+
286
+ class Regex
287
+ {
288
+
289
+ public static final String ELLIPSIS = "\\.{2,}";
290
+
291
+ public static final String EMAIL = "([\\w\\d_\\.-]+)@(([\\d\\w-]+)\\.)*([\\d\\w-]+)";
292
+
293
+ public static final String FULL_DATE = "(0?[1-9]|[12][0-9]|3[01])(\\/|-|\\.)(1[0-2]|(0?[1-9]))((\\/|-|\\.)\\d{4})";
294
+
295
+ public static final String MONTH = "(1[0-2]|(0?[1-9]))(\\/)\\d{4}";
296
+
297
+ public static final String DATE = "(0?[1-9]|[12][0-9]|3[01])(\\/)(1[0-2]|(0?[1-9]))";
298
+
299
+ public static final String TIME = "(\\d\\d:\\d\\d:\\d\\d)|((0?\\d|1\\d|2[0-3])(:|h)(0?\\d|[1-5]\\d)(’|'|p|ph)?)";
300
+
301
+ public static final String MONEY = "\\p{Sc}\\d+([\\.,]\\d+)*|\\d+([\\.,]\\d+)*\\p{Sc}";
302
+
303
+ public static final String PHONE_NUMBER = "(\\(?\\+\\d{1,2}\\)?[\\s\\.-]?)?\\d{2,}[\\s\\.-]?\\d{3,}[\\s\\.-]?\\d{3,}";
304
+
305
+ public static final String URL = "(((https?|ftp):\\/\\/|www\\.)[^\\s/$.?#].[^\\s]*)|(https?:\\/\\/)?(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)";
306
+
307
+ public static final String NUMBER = "[-+]?\\d+([\\.,]\\d+)*%?\\p{Sc}?";
308
+
309
+ public static final String PUNCTUATION = ",|\\.|:|\\?|!|;|-|_|\"|'|“|”|\\||\\(|\\)|\\[|\\]|\\{|\\}|⟨|⟩|«|»|\\\\|\\/|\\‘|\\’|\\“|\\â€�|…|…|‘|’|·";
310
+
311
+ public static final String SPECIAL_CHAR = "\\~|\\@|\\#|\\^|\\&|\\*|\\+|\\-|\\–|<|>|\\|";
312
+
313
+ public static final String EOS_PUNCTUATION = "(\\.+|\\?|!|…)";
314
+
315
+ public static final String NUMBERS_EXPRESSION = NUMBER + "([\\+\\-\\*\\/]" + NUMBER + ")*";
316
+
317
+ public static final String SHORT_NAME = "([\\p{L}]+([\\.\\-][\\p{L}]+)+)|([\\p{L}]+-\\d+)";
318
+
319
+ public static final String WORD_WITH_HYPHEN = "\\p{L}+-\\p{L}+(-\\p{L}+)*";
320
+
321
+ public static final String ALLCAP = "[A-Z]+\\.[A-Z]+";
322
+
323
+ private static List<String> regexes = null;
324
+
325
+ private static List<String> regexIndex = null;
326
+
327
+ public static List<String> getRegexList()
328
+ {
329
+ if (regexes == null) {
330
+ regexes = new ArrayList<String>();
331
+ regexIndex = new ArrayList<String>();
332
+
333
+ regexes.add(ELLIPSIS);
334
+ regexIndex.add("ELLIPSIS");
335
+
336
+ regexes.add(EMAIL);
337
+ regexIndex.add("EMAIL");
338
+
339
+ regexes.add(URL);
340
+ regexIndex.add("URL");
341
+
342
+ regexes.add(FULL_DATE);
343
+ regexIndex.add("FULL_DATE");
344
+
345
+ regexes.add(MONTH);
346
+ regexIndex.add("MONTH");
347
+
348
+ regexes.add(DATE);
349
+ regexIndex.add("DATE");
350
+
351
+ regexes.add(TIME);
352
+ regexIndex.add("TIME");
353
+
354
+ regexes.add(MONEY);
355
+ regexIndex.add("MONEY");
356
+
357
+ regexes.add(PHONE_NUMBER);
358
+ regexIndex.add("PHONE_NUMBER");
359
+
360
+ regexes.add(SHORT_NAME);
361
+ regexIndex.add("SHORT_NAME");
362
+
363
+ regexes.add(NUMBERS_EXPRESSION);
364
+ regexIndex.add("NUMBERS_EXPRESSION");
365
+
366
+ regexes.add(NUMBER);
367
+ regexIndex.add("NUMBER");
368
+
369
+ regexes.add(WORD_WITH_HYPHEN);
370
+ regexIndex.add("WORD_WITH_HYPHEN");
371
+
372
+ regexes.add(PUNCTUATION);
373
+ regexIndex.add("PUNCTUATION");
374
+
375
+ regexes.add(SPECIAL_CHAR);
376
+ regexIndex.add("SPECIAL_CHAR");
377
+
378
+ regexes.add(ALLCAP);
379
+ regexIndex.add("ALLCAP");
380
+
381
+ }
382
+
383
+ return regexes;
384
+ }
385
+
386
+ public static int getRegexIndex(String regex)
387
+ {
388
+ return regexIndex.indexOf(regex.toUpperCase());
389
+ }
390
+ public static void main(String[] args) throws IOException {
391
+ List<String> tokens = Tokenizer.tokenize("93% 9-10 anh-yeu-em");
392
+
393
+ for(String token : tokens) {
394
+ System.out.print(token + " ");
395
+ }
396
+ }
397
+ }
VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/FWObject.java ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.wordsegmenter;
2
+
3
/**
 * Holds a 5-word/tag window (ten string slots) that captures the context
 * surrounding a word.
 *
 * @author DatQuocNguyen
 */
public class FWObject {
    private String[] context;

    /**
     * Creates a window with ten slots.
     *
     * @param check when {@code true}, every even slot is pre-filled with
     *        {@code "<W>"} and every odd slot with {@code "<T>"}; when
     *        {@code false}, all slots are left {@code null}
     */
    public FWObject(boolean check) {
        String[] slots = new String[10];
        if (check) {
            for (int pair = 0; pair < 5; pair++) {
                slots[2 * pair] = "<W>";
                slots[2 * pair + 1] = "<T>";
            }
        }
        this.context = slots;
    }

    /** @return the backing slot array itself (not a copy) */
    public String[] getContext() {
        return context;
    }

    /** Replaces the backing slot array with {@code context}. */
    public void setContext(String[] context) {
        this.context = context;
    }
}
VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/Node.java ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.wordsegmenter;
2
+
3
+ /**
4
+ * @author DatQuocNguyen
5
+ */
6
+
7
+ public class Node {
8
+ private FWObject condition;
9
+ private String conclusion;
10
+ private Node exceptNode;
11
+ private Node ifnotNode;
12
+ private Node fatherNode;
13
+ private int depth;
14
+
15
+ public Node(FWObject inCondition, String inConclusion, Node inFatherNode, Node inExceptNode,
16
+ Node inIfnotNode, int inDepth) {
17
+ this.condition = inCondition;
18
+ this.conclusion = inConclusion;
19
+ this.fatherNode = inFatherNode;
20
+ this.exceptNode = inExceptNode;
21
+ this.ifnotNode = inIfnotNode;
22
+ this.depth = inDepth;
23
+ }
24
+
25
+ public void setIfnotNode(Node node) {
26
+ this.ifnotNode = node;
27
+ }
28
+
29
+ public void setExceptNode(Node node) {
30
+ this.exceptNode = node;
31
+ }
32
+
33
+ public void setFatherNode(Node node) {
34
+ this.fatherNode = node;
35
+ }
36
+
37
+ public int countNodes() {
38
+ int count = 1;
39
+ if (exceptNode != null) {
40
+ count += exceptNode.countNodes();
41
+ }
42
+ if (ifnotNode != null) {
43
+ count += ifnotNode.countNodes();
44
+ }
45
+ return count;
46
+ }
47
+
48
+ public boolean satisfy(FWObject object) {
49
+ boolean check = true;
50
+ for (int i = 0; i < 10; i++) {
51
+ String key = condition.getContext()[i];
52
+ if (key != null) {
53
+ if (!key.equals(object.getContext()[i])) {
54
+ check = false;
55
+ break;
56
+ }
57
+ }
58
+ }
59
+ return check;
60
+ }
61
+
62
+ public FWObject getCondition() {
63
+ return condition;
64
+ }
65
+
66
+ public String getConclusion() {
67
+ return conclusion;
68
+ }
69
+
70
+ public Node getExceptNode() {
71
+ return exceptNode;
72
+ }
73
+
74
+ public Node getIfnotNode() {
75
+ return ifnotNode;
76
+ }
77
+
78
+ public Node getFatherNode() {
79
+ return fatherNode;
80
+ }
81
+
82
+ public int getDepth() {
83
+ return depth;
84
+ }
85
+ }
VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/Utils.java ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.wordsegmenter;
2
+
3
+ import java.util.HashMap;
4
+ import java.util.List;
5
+ import java.util.Map;
6
+ import java.util.Set;
7
+
8
+ /**
9
+ * @author DatQuocNguyen
10
+ *
11
+ */
12
+ public class Utils
13
+ {
14
+ public static FWObject getCondition(String strCondition)
15
+ {
16
+ FWObject condition = new FWObject(false);
17
+
18
+ for (String rule : strCondition.split(" and ")) {
19
+ rule = rule.trim();
20
+ String key = rule.substring(rule.indexOf(".") + 1, rule.indexOf(" "));
21
+ String value = getConcreteValue(rule);
22
+
23
+ if (key.equals("prevWord2")) {
24
+ condition.getContext()[4] = value;
25
+ }
26
+ else if (key.equals("prevTag2")) {
27
+ condition.getContext()[5] = value;
28
+ }
29
+ else if (key.equals("prevWord1")) {
30
+ condition.getContext()[2] = value;
31
+ }
32
+ else if (key.equals("prevTag1")) {
33
+ condition.getContext()[3] = value;
34
+ }
35
+ else if (key.equals("word")) {
36
+ condition.getContext()[1] = value;
37
+ }
38
+ else if (key.equals("tag")) {
39
+ condition.getContext()[0] = value;
40
+ }
41
+ else if (key.equals("nextWord1")) {
42
+ condition.getContext()[6] = value;
43
+ }
44
+ else if (key.equals("nextTag1")) {
45
+ condition.getContext()[7] = value;
46
+ }
47
+ else if (key.equals("nextWord2")) {
48
+ condition.getContext()[8] = value;
49
+ }
50
+ else if (key.equals("nextTag2")) {
51
+ condition.getContext()[9] = value;
52
+ }
53
+ }
54
+
55
+ return condition;
56
+ }
57
+
58
+ public static FWObject getObject(List<WordTag> wordtags, int size, int index)
59
+ {
60
+ FWObject object = new FWObject(true);
61
+
62
+ if (index > 1) {
63
+ object.getContext()[4] = wordtags.get(index - 2).word;
64
+ object.getContext()[5] = wordtags.get(index - 2).tag;
65
+ }
66
+
67
+ if (index > 0) {
68
+ object.getContext()[2] = wordtags.get(index - 1).word;
69
+ object.getContext()[3] = wordtags.get(index - 1).tag;
70
+ }
71
+
72
+ String currentWord = wordtags.get(index).word;
73
+ String currentTag = wordtags.get(index).tag;
74
+
75
+ object.getContext()[1] = currentWord;
76
+ object.getContext()[0] = currentTag;
77
+
78
+ if (index < size - 1) {
79
+ object.getContext()[6] = wordtags.get(index + 1).word;
80
+ object.getContext()[7] = wordtags.get(index + 1).tag;
81
+ }
82
+
83
+ if (index < size - 2) {
84
+ object.getContext()[8] = wordtags.get(index + 2).word;
85
+ object.getContext()[9] = wordtags.get(index + 2).tag;
86
+ }
87
+
88
+ return object;
89
+ }
90
+
91
+ public static String getConcreteValue(String str)
92
+ {
93
+ if (str.contains("\"\"")) {
94
+ if (str.contains("Word"))
95
+ return "<W>";
96
+ else
97
+ return "<T>";
98
+ }
99
+ String conclusion = str.substring(str.indexOf("\"") + 1, str.length() - 1);
100
+ return conclusion;
101
+ }
102
+
103
+ public static Map<String, String> NORMALIZER;
104
+ public static Set<String> NORMALIZER_KEYS;
105
+ static {
106
+ NORMALIZER = new HashMap<String, String>();
107
+ NORMALIZER.put("òa", "oà");
108
+ NORMALIZER.put("óa", "oá");
109
+ NORMALIZER.put("ỏa", "oả");
110
+ NORMALIZER.put("õa", "oã");
111
+ NORMALIZER.put("ọa", "oạ");
112
+ NORMALIZER.put("òe", "oè");
113
+ NORMALIZER.put("óe", "oé");
114
+ NORMALIZER.put("ỏe", "oẻ");
115
+ NORMALIZER.put("õe", "oẽ");
116
+ NORMALIZER.put("ọe", "oẹ");
117
+ NORMALIZER.put("ùy", "uỳ");
118
+ NORMALIZER.put("úy", "uý");
119
+ NORMALIZER.put("ủy", "uỷ");
120
+ NORMALIZER.put("ũy", "uỹ");
121
+ NORMALIZER.put("ụy", "uỵ");
122
+ NORMALIZER.put("Ủy", "Uỷ");
123
+ NORMALIZER_KEYS = NORMALIZER.keySet();
124
+ }
125
+
126
+ }
VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/Vocabulary.java ADDED
@@ -0,0 +1,1605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.wordsegmenter;
2
+
3
+ import java.io.File;
4
+ import java.io.FileInputStream;
5
+ import java.io.IOException;
6
+ import java.io.ObjectInputStream;
7
+ import java.util.HashSet;
8
+ import java.util.Set;
9
+
10
+ import vn.pipeline.Utils;
11
+
12
+ @SuppressWarnings("unchecked")
13
+ public class Vocabulary {
14
+ public static Set<String> VN_DICT;
15
+ static {
16
+ VN_DICT = new HashSet<String>();
17
+ try {
18
+ String vocabPath = Utils.jarDir + "/models/wordsegmenter/vi-vocab";
19
+ if (!new File(vocabPath).exists())
20
+ throw new IOException("Vocabulary: " + vocabPath + " is not found!");
21
+ //Vocabulary.class.getClassLoader().getResource("wordsegmenter/vi-vocab").getPath()
22
+ ObjectInputStream ois = new ObjectInputStream(new FileInputStream(vocabPath));
23
+ VN_DICT = (Set<String>) ois.readObject();
24
+ ois.close();
25
+ }
26
+ catch (IOException | ClassNotFoundException e1) {
27
+ // TODO Auto-generated catch block
28
+ e1.printStackTrace();
29
+ }
30
+ // BufferedReader buffer;
31
+ // try {
32
+ // buffer = new BufferedReader(new InputStreamReader(
33
+ // new FileInputStream(new File("VnVocab.txt")), "UTF-8"));
34
+ // for (String line; (line = buffer.readLine()) != null;) {
35
+ // line = line.trim();
36
+ // if (line.contains(" "))
37
+ // VN_DICT.add(line);
38
+ // }
39
+ // buffer.close();
40
+ // }
41
+ // catch (FileNotFoundException e) {
42
+ // // TODO Auto-generated catch block
43
+ // e.printStackTrace();
44
+ // }
45
+ // catch (IOException e) {
46
+ // // TODO Auto-generated catch block
47
+ // e.printStackTrace();
48
+ // }
49
+ }
50
+
51
/** Lower-cased country and territory names; every entry consists of more than one word. */
public static Set<String> COUNTRY_L_NAME;
static {
    // Entries are added in exactly this order.
    String[] names = {
        "na uy",
        "san marino",
        "phần lan",
        "bồ đào nha",
        "ca-ri-bê hà lan",
        "quần đảo bắc mariana",
        "ả rập xê-út",
        "tây ban nha",
        "quần đảo virgin",
        "đảo somoa thuộc mỹ",
        "đông timor",
        "hoa kỳ",
        "quần đảo pitcairn",
        "samoa thuộc mỹ",
        "hàn quốc",
        "đảo ascension",
        "thuỵ sĩ",
        "ai cập",
        "burkina faso",
        "mông cổ",
        "polynesia thuộc pháp",
        "turks và caicos",
        "thổ nhĩ kỳ",
        "liên bang micronesia",
        "đảo man",
        "saint helena",
        "ả rập saudi",
        "ba lan",
        "são tomé và príncipe",
        "đảo norfolk",
        "chdcnd triều tiên",
        "quần đảo canary",
        "guiana thuộc pháp",
        "antigua và barbuda",
        "saint pierre và miquelon",
        "sri lanka",
        "ceuta và melilla",
        "việt nam",
        "bờ biển ngà",
        "thuỵ điển",
        "el salvador",
        "svalbard và jan mayen",
        "saint lucia",
        "diego garcia",
        "ấn độ",
        "tây sahara",
        "quần đảo cook",
        "guinea xích đạo",
        "trung quốc",
        "chdc congo",
        "cộng hoà dominica",
        "cape verde",
        "hà lan",
        "puerto rico",
        "đài loan",
        "cộng hoà séc",
        "costa rica",
        "saint kitts và nevis",
        "nhật bản",
        "quần đảo faroe",
        "đan mạch",
        "turk và caicos",
        "cabo verde",
        "nam sudan",
        "cộng hoà trung phi",
        "trung phi",
        "saint vincent và grenadines",
        "quần đảo cocos",
        "thành vatican",
        "saint barthélemy",
        "nam cực",
        "trinidad và tobago",
        "cộng hoà congo",
        "quần đảo cayman",
        "saint martin",
        "tristan da cunha",
        "bosnia và herzegovina",
        "thái lan",
        "new zealand",
        "hồng kông",
        "wallis và futuna",
        "sierra leone",
        "sint maarten",
        "quần đảo solomon",
        "nam phi",
        "bosna và hercegovina",
        "vương quốc anh",
        "papua new guinea",
        "hy lạp",
        "đảo giáng sinh",
        "triều tiên",
        "quần đảo falkland",
        "miến điện",
        "quần đảo marshall",
        "new caledonia"
    };

    COUNTRY_L_NAME = new HashSet<String>();
    for (String name : names) {
        COUNTRY_L_NAME.add(name);
    }
}
150
+
151
+ public static Set<String> COUNTRY_S_NAME;
152
+ static {
153
+ COUNTRY_S_NAME = new HashSet<String>();
154
+ COUNTRY_S_NAME.add("mỹ");
155
+ COUNTRY_S_NAME.add("belarus");
156
+ COUNTRY_S_NAME.add("guinée");
157
+ COUNTRY_S_NAME.add("gambia");
158
+ COUNTRY_S_NAME.add("cô-oét");
159
+ COUNTRY_S_NAME.add("guinea");
160
+ COUNTRY_S_NAME.add("estonia");
161
+ COUNTRY_S_NAME.add("philippines");
162
+ COUNTRY_S_NAME.add("cuba");
163
+ COUNTRY_S_NAME.add("mauritius");
164
+ COUNTRY_S_NAME.add("mali");
165
+ COUNTRY_S_NAME.add("armenia");
166
+ COUNTRY_S_NAME.add("aruba");
167
+ COUNTRY_S_NAME.add("méxico");
168
+ COUNTRY_S_NAME.add("ukraina");
169
+ COUNTRY_S_NAME.add("bénin");
170
+ COUNTRY_S_NAME.add("congo");
171
+ COUNTRY_S_NAME.add("monaco");
172
+ COUNTRY_S_NAME.add("séc");
173
+ COUNTRY_S_NAME.add("kenya");
174
+ COUNTRY_S_NAME.add("hungary");
175
+ COUNTRY_S_NAME.add("greenland");
176
+ COUNTRY_S_NAME.add("li-băng");
177
+ COUNTRY_S_NAME.add("paraguay");
178
+ COUNTRY_S_NAME.add("palau");
179
+ COUNTRY_S_NAME.add("vanuatu");
180
+ COUNTRY_S_NAME.add("colombia");
181
+ COUNTRY_S_NAME.add("azerbaijan");
182
+ COUNTRY_S_NAME.add("syria");
183
+ COUNTRY_S_NAME.add("rwanda");
184
+ COUNTRY_S_NAME.add("libya");
185
+ COUNTRY_S_NAME.add("guernsey");
186
+ COUNTRY_S_NAME.add("afghanistan");
187
+ COUNTRY_S_NAME.add("guiné-bissau");
188
+ COUNTRY_S_NAME.add("hungari");
189
+ COUNTRY_S_NAME.add("kiribati");
190
+ COUNTRY_S_NAME.add("dominica");
191
+ COUNTRY_S_NAME.add("bulgaria");
192
+ COUNTRY_S_NAME.add("brasil");
193
+ COUNTRY_S_NAME.add("bahrain");
194
+ COUNTRY_S_NAME.add("guatemala");
195
+ COUNTRY_S_NAME.add("ghana");
196
+ COUNTRY_S_NAME.add("somalia");
197
+ COUNTRY_S_NAME.add("jamaica");
198
+ COUNTRY_S_NAME.add("togo");
199
+ COUNTRY_S_NAME.add("liechtenstein");
200
+ COUNTRY_S_NAME.add("serbia");
201
+ COUNTRY_S_NAME.add("ma-rốc");
202
+ COUNTRY_S_NAME.add("bỉ");
203
+ COUNTRY_S_NAME.add("úc");
204
+ COUNTRY_S_NAME.add("senegal");
205
+ COUNTRY_S_NAME.add("montserrat");
206
+ COUNTRY_S_NAME.add("zambia");
207
+ COUNTRY_S_NAME.add("namibia");
208
+ COUNTRY_S_NAME.add("comoros");
209
+ COUNTRY_S_NAME.add("curaçao");
210
+ COUNTRY_S_NAME.add("palestine");
211
+ COUNTRY_S_NAME.add("canada");
212
+ COUNTRY_S_NAME.add("li-bi");
213
+ COUNTRY_S_NAME.add("honduras");
214
+ COUNTRY_S_NAME.add("réunion");
215
+ COUNTRY_S_NAME.add("maldives");
216
+ COUNTRY_S_NAME.add("chile");
217
+ COUNTRY_S_NAME.add("algérie");
218
+ COUNTRY_S_NAME.add("oman");
219
+ COUNTRY_S_NAME.add("timor-leste");
220
+ COUNTRY_S_NAME.add("brazil");
221
+ COUNTRY_S_NAME.add("lesotho");
222
+ COUNTRY_S_NAME.add("guyana");
223
+ COUNTRY_S_NAME.add("peru");
224
+ COUNTRY_S_NAME.add("malaysia");
225
+ COUNTRY_S_NAME.add("jersey");
226
+ COUNTRY_S_NAME.add("síp");
227
+ COUNTRY_S_NAME.add("belize");
228
+ COUNTRY_S_NAME.add("nauru");
229
+ COUNTRY_S_NAME.add("campuchia");
230
+ COUNTRY_S_NAME.add("kuwait");
231
+ COUNTRY_S_NAME.add("slovenia");
232
+ COUNTRY_S_NAME.add("somali");
233
+ COUNTRY_S_NAME.add("haiti");
234
+ COUNTRY_S_NAME.add("zimbabwe");
235
+ COUNTRY_S_NAME.add("macedonia");
236
+ COUNTRY_S_NAME.add("micronesia");
237
+ COUNTRY_S_NAME.add("philippin");
238
+ COUNTRY_S_NAME.add("bolivia");
239
+ COUNTRY_S_NAME.add("brunei");
240
+ COUNTRY_S_NAME.add("israel");
241
+ COUNTRY_S_NAME.add("lào");
242
+ COUNTRY_S_NAME.add("bangladesh");
243
+ COUNTRY_S_NAME.add("ý");
244
+ COUNTRY_S_NAME.add("ireland");
245
+ COUNTRY_S_NAME.add("albania");
246
+ COUNTRY_S_NAME.add("botswana");
247
+ COUNTRY_S_NAME.add("venezuela");
248
+ COUNTRY_S_NAME.add("andorra");
249
+ COUNTRY_S_NAME.add("malawi");
250
+ COUNTRY_S_NAME.add("moldova");
251
+ COUNTRY_S_NAME.add("madagascar");
252
+ COUNTRY_S_NAME.add("turkmenistan");
253
+ COUNTRY_S_NAME.add("iran");
254
+ COUNTRY_S_NAME.add("iraq");
255
+ COUNTRY_S_NAME.add("seychelles");
256
+ COUNTRY_S_NAME.add("indonesia");
257
+ COUNTRY_S_NAME.add("tchad");
258
+ COUNTRY_S_NAME.add("nicaragua");
259
+ COUNTRY_S_NAME.add("gibraltar");
260
+ COUNTRY_S_NAME.add("ethiopia");
261
+ COUNTRY_S_NAME.add("ecuador");
262
+ COUNTRY_S_NAME.add("guinea-bissau");
263
+ COUNTRY_S_NAME.add("mauritania");
264
+ COUNTRY_S_NAME.add("albani");
265
+ COUNTRY_S_NAME.add("algeria");
266
+ COUNTRY_S_NAME.add("mozambique");
267
+ COUNTRY_S_NAME.add("cameroon");
268
+ COUNTRY_S_NAME.add("vatican");
269
+ COUNTRY_S_NAME.add("liban");
270
+ COUNTRY_S_NAME.add("panama");
271
+ COUNTRY_S_NAME.add("uae");
272
+ COUNTRY_S_NAME.add("luxembourg");
273
+ COUNTRY_S_NAME.add("nigeria");
274
+ COUNTRY_S_NAME.add("sudan");
275
+ COUNTRY_S_NAME.add("benin");
276
+ COUNTRY_S_NAME.add("chad");
277
+ COUNTRY_S_NAME.add("liberia");
278
+ COUNTRY_S_NAME.add("djibouti");
279
+ COUNTRY_S_NAME.add("đức");
280
+ COUNTRY_S_NAME.add("tajikistan");
281
+ COUNTRY_S_NAME.add("fiji");
282
+ COUNTRY_S_NAME.add("singapore");
283
+ COUNTRY_S_NAME.add("mexico");
284
+ COUNTRY_S_NAME.add("samoa");
285
+ COUNTRY_S_NAME.add("tunisia");
286
+ COUNTRY_S_NAME.add("bahamas");
287
+ COUNTRY_S_NAME.add("bhutan");
288
+ COUNTRY_S_NAME.add("uganda");
289
+ COUNTRY_S_NAME.add("uruguay");
290
+ COUNTRY_S_NAME.add("gabon");
291
+ COUNTRY_S_NAME.add("bungari");
292
+ COUNTRY_S_NAME.add("niger");
293
+ COUNTRY_S_NAME.add("kyrgyzstan");
294
+ COUNTRY_S_NAME.add("pakistan");
295
+ COUNTRY_S_NAME.add("martinique");
296
+ COUNTRY_S_NAME.add("macao");
297
+ COUNTRY_S_NAME.add("kosovo");
298
+ COUNTRY_S_NAME.add("mayotte");
299
+ COUNTRY_S_NAME.add("yemen");
300
+ COUNTRY_S_NAME.add("georgia");
301
+ COUNTRY_S_NAME.add("pháp");
302
+ COUNTRY_S_NAME.add("ai-len");
303
+ COUNTRY_S_NAME.add("argentina");
304
+ COUNTRY_S_NAME.add("jordan");
305
+ COUNTRY_S_NAME.add("anguilla");
306
+ COUNTRY_S_NAME.add("swaziland");
307
+ COUNTRY_S_NAME.add("burundi");
308
+ COUNTRY_S_NAME.add("slovakia");
309
+ COUNTRY_S_NAME.add("uzbekistan");
310
+ COUNTRY_S_NAME.add("maroc");
311
+ COUNTRY_S_NAME.add("tanzania");
312
+ COUNTRY_S_NAME.add("litva");
313
+ COUNTRY_S_NAME.add("grenada");
314
+ COUNTRY_S_NAME.add("gruzia");
315
+ COUNTRY_S_NAME.add("lít-va");
316
+ COUNTRY_S_NAME.add("guam");
317
+ COUNTRY_S_NAME.add("eritrea");
318
+ COUNTRY_S_NAME.add("áo");
319
+ COUNTRY_S_NAME.add("croatia");
320
+ COUNTRY_S_NAME.add("niue");
321
+ COUNTRY_S_NAME.add("nepal");
322
+ COUNTRY_S_NAME.add("tokelau");
323
+ COUNTRY_S_NAME.add("bermuda");
324
+ COUNTRY_S_NAME.add("i-rắc");
325
+ COUNTRY_S_NAME.add("suriname");
326
+ COUNTRY_S_NAME.add("guadeloupe");
327
+ COUNTRY_S_NAME.add("nga");
328
+ COUNTRY_S_NAME.add("romania");
329
+ COUNTRY_S_NAME.add("angola");
330
+ COUNTRY_S_NAME.add("latvia");
331
+ COUNTRY_S_NAME.add("kazakhstan");
332
+ COUNTRY_S_NAME.add("malta");
333
+ COUNTRY_S_NAME.add("myanmar");
334
+ COUNTRY_S_NAME.add("iceland");
335
+ COUNTRY_S_NAME.add("românia");
336
+ COUNTRY_S_NAME.add("montenegro");
337
+ COUNTRY_S_NAME.add("macau");
338
+ COUNTRY_S_NAME.add("tuvalu");
339
+ COUNTRY_S_NAME.add("qatar");
340
+ COUNTRY_S_NAME.add("tonga");
341
+ COUNTRY_S_NAME.add("barbados");
342
+ }
343
+
344
/**
 * Lower-cased names of internationally known companies and brands
 * (for example {@code "google"}, {@code "coca-cola"}, {@code "t-mobile"}).
 *
 * <p>The set is populated once, when the enclosing class is initialized.
 * It is a plain mutable {@link HashSet}; lookups are exact-match, so callers
 * need to lower-case a token before testing membership.
 */
public static Set<String> WORLD_COMPANY;
static {
    // Table-driven initialization: the entries are listed in their original
    // insertion order and then copied into the set one by one.
    final String[] companyNames = {
        "verizon", "prada", "hp", "walmart", "adidas", "mastercard",
        "digg", "canon", "ikea", "sony", "twitter", "lego",
        "toshiba", "nokia", "bbc", "vmware", "mercedes-benz", "google",
        "intel", "iphone", "rbc", "fedex", "mercedes", "gillette",
        "ups", "carrefour", "lenovo", "loreal", "mcdonald", "coca-cola",
        "guardian", "cisco", "paypal", "cvs", "acer", "cnn",
        "nike", "facebook", "spotify", "adobe", "kfc", "westpac",
        "subway", "ibm", "panasonic", "visa", "motorola", "nissan",
        "citibank", "baidu", "ford", "microsoft", "bmw", "foxconn",
        "yahoo", "hermes", "oracle", "mcdonalds", "tencent", "mtv",
        "zara", "amazon", "toyota", "gucci", "ebay", "kodak",
        "youtube", "android", "linkedin", "myspace", "t-mobile", "apple",
        "samsung", "aldi", "colgate", "starbucks", "pepsi", "honda",
        "dell", "hitachi", "blackberry", "disney", "siemens", "vodafone"
    };

    WORLD_COMPANY = new HashSet<String>();
    for (String companyName : companyNames) {
        WORLD_COMPANY.add(companyName);
    }
}
432
+
433
/**
 * Lower-cased, space-separated Vietnamese place names: the entries read as
 * provinces, cities and districts, plus a few islands and {@code "việt nam"}
 * itself.
 *
 * <p>The set is populated once, when the enclosing class is initialized.
 * It is a plain mutable {@link HashSet}; lookups are exact-match, so callers
 * need to lower-case a phrase before testing membership. Tone marks follow the
 * older placement convention used throughout the list ({@code "hoà"},
 * {@code "thuỷ"}).
 *
 * <p>Every entry of the original list is retained, so anything that matched
 * before still matches. Two district names that were only present in a
 * misspelled form are additionally listed with their correct spelling at the
 * end of the table.
 */
public static Set<String> VN_LOCATIONS;
static {
    final String[] locationNames = {
        "mỹ tho",
        // NOTE(review): "tập cận bình" is a person's name (Xi Jinping), not a
        // location. Kept because removing it could change results for callers
        // that rely on it; confirm whether it belongs in a different list.
        "tập cận bình",
        "nam đông", "kiên lương", "lương sơn", "gò vấp", "quang bình", "ia pa",
        "lạc sơn", "chí linh", "ninh hải", "sơn dương", "quan sơn", "ứng hoà",
        "krông pắk", "tân hưng", "nghệ an", "tân thạnh", "yên định", "mường nhé",
        "ngô quyền", "hàm thuận bắc", "phú tân", "tân hồng", "trà ôn", "từ liêm",
        "bình thuận", "an phú", "duy xuyên", "nam trực", "phù cừ", "mai sơn",
        "thạnh phú", "lộc bình", "kim thành", "cái bè", "hà quảng", "long thành",
        // Misspelling of "đồng phú" (kept; correct form added at the end).
        "đồng phù",
        "bảo yên", "chiêm hoá", "gia nghĩa", "an dương", "phú quý", "quảng trạch",
        "trường sa", "hoàn kiếm", "thủ thừa", "hải lăng", "pleiku", "thanh hoá",
        "bạch thông", "vĩnh phúc",
        // Misspelling of "văn lãng" (kept; correct form added at the end).
        "vãn lãng",
        "bình gia", "sa thầy", "triệu sơn", "yên thuỷ", "văn giang", "hồ chí minh",
        "nga sơn", "gia lâm", "vị thanh", "cái răng", "cao bằng", "hoài ân",
        "vĩnh long", "kim động", "ngân sơn", "lấp vò", "sông công", "hoài nhơn",
        "kim bôi", "bắc ninh", "thái nguyên", "đơn dương", "định quán", "gò công",
        "hà giang", "hoà bình", "mèo vạc", "mộc châu", "quảng ngãi", "cẩm giàng",
        "sông hinh", "thới bình", "phụng hiệp", "ninh hoà", "hậu giang", "cái nước",
        "ô môn", "gia lai", "phổ yên", "quế sơn", "yên thành", "tiên du",
        "an minh", "chợ lách", "phú ninh", "tủa chùa", "hương trà", "thăng bình",
        "vĩnh thuận", "hà tĩnh", "lâm đồng", "phú quốc", "long mỹ", "long an",
        "bình lục", "vĩnh thạnh", "đống đa", "hạ long", "kỳ sơn", "đăk song",
        "lai vung", "ý yên", "xuyên mộc", "vị xuyên", "duy tiên", "khánh sơn",
        "bỉm sơn", "hiệp đức", "kim sơn", "xín mần", "hương thuỷ", "tuy hoà",
        "u minh", "thiệu hoá", "bù đốp", "yên sơn", "quảng xương", "cần đước",
        "thuỷ nguyên", "yên dũng", "yên hưng", "bắc mê", "thọ xuân", "móng cái",
        "lạc dương", "cẩm xuyên", "lâm thao", "bình tân", "phúc yên", "sơn tây",
        "vĩnh châu", "na hang", "chương mỹ", "bảo lộc", "nghi xuân", "lương tài",
        "thoại sơn", "cửa lò", "đông hưng", "lập thạch", "nam định", "quảng nam",
        "kiên hải", "đồng xuân", "phú xuyên", "tiểu cần", "phúc thọ", "đông giang",
        "gò dầu", "giá rai", "tây sơn", "phú hoà", "việt yên", "đak đoa",
        "mường la", "hồng ngự", "bắc bình", "phủ lý", "gio linh", "cồn cỏ",
        "đức linh", "củ chi", "hương sơn", "tịnh biên", "bình thuỷ", "nhà bè",
        "yên thế", "vĩnh tường", "kế sách", "sóc sơn", "chợ đồn", "châu phú",
        "kiến an", "sốp cộp", "lệ thuỷ", "sơn tịnh", "càng long", "vị thuỷ",
        "ea súp", "quảng điền", "nghĩa lộ", "đồ sơn", "krông pa", "việt trì",
        "tân thành", "nghĩa hưng", "bạc liêu", "hưng yên", "hoàng mai", "diên khánh",
        "lăk", "bắc trà my", "tân châu", "tân phú", "bình long", "đông hà",
        "kon plông", "sa đéc", "an lão", "như xuân", "bến lức", "thanh khê",
        "long xuyên", "chợ gạo", "lục nam", "hoà thành", "vũng liêm", "bình định",
        "cẩm mỹ", "mộc hoá", "tánh linh", "đất đỏ", "quế võ", "trấn yên",
        "cầu ngang", "lai châu", "gò công tây", "lý nhân", "bà rịa-vũng tàu", "bình giang",
        "mường khương", "gò quao", "bình đại", "điện bàn", "hải châu", "bắc giang",
        "văn lâm", "ninh thuận", "cô tô", "quảng uyên", "đông hải", "phan thiết",
        "tĩnh gia", "bạch long vĩ", "hoài đức", "la gi", "ngọc hồi", "bình sơn",
        "dương minh châu", "can lộc", "hồng bàng", "thanh miện", "trảng bàng", "thái bình",
        "hải dương", "hà tây", "krông nô", "tam đường", "nguyên bình", "thủ dầu một",
        "vĩnh lộc", "đăk r'lấp", "hai bà trưng", "long khánh", "bình liêu", "đồng hỷ",
        "võ nhai", "lạc thuỷ", "quỳnh phụ", "diễn châu", "cầu giấy", "sơn la",
        "sông mã", "kinh môn", "thạch thành", "ea kar", "krông búk", "gò công đông",
        "phù ninh", "sơn hà", "đạ tẻh", "mộ đức", "cờ đỏ", "hương khê",
        "phú lương", "di linh", "phú vang", "lạng giang", "yên mô", "giao thuỷ",
        "quốc oai", "tuyên quang", "bát xát", "bắc hà", "đắk lắk", "tiên phước",
        "lê chân", "tiên yên", "bến cát", "tây giang", "đà nẵng", "ia grai",
        "tam bình", "thường tín", "vĩnh bảo", "hướng hoá", "sơn trà", "tân uyên",
        "m'đrăk", "quản bạ", "liên chiểu", "tri tôn", "tiên lãng", "biên hoà",
        "hải hậu", "tây ninh", "quỳnh nhai", "thạch hà", "đồng nai", "tuyên hoá",
        "mai châu", "yên bái", "duyên hải", "tháp mười", "phú nhuận", "ân thi",
        "khoái châu", "hòn đất", "thống nhất", "nghĩa đàn", "quế phong", "thủ đức",
        "hạ lang", "vĩnh linh", "yên lạc", "triệu phong", "lâm hà", "bảo lâm",
        "hải phòng", "vũ quang", "cao lộc", "nhơn trạch", "quảng trị", "thạch thất",
        "chơn thành", "tân yên", "thanh hà", "thạnh hoá", "si ma cai", "bác ái",
        "đăk hà", "yên minh", "tân bình", "đại từ", "phục hoà", "ninh sơn",
        "long phú", "hà tiên", "thanh bình", "mỏ cày", "thạnh trị", "trà vinh",
        "dầu tiếng", "bắc kạn", "chư sê", "thanh trì", "ngọc lạc", "từ sơn",
        "gia bình", "pác nặm", "thốt nốt", "trà bồng", "thừa thiên-huế", "phước long",
        "cẩm phả", "kon rẫy", "long biên", "cư m'gar", "cao lãnh", "buôn đôn",
        "đắk nông", "lý sơn", "sóc trăng", "hoằng hoá", "quận 10", "krông ana",
        "quận 11", "quận 12", "phan rang-tháp chàm", "tân kỳ", "tương dương", "đan phượng",
        "anh sơn", "quận 2", "quận 1", "qui nhơn", "tư nghĩa", "bố trạch",
        "quận 9", "thạch an", "bảo thắng", "quận 8", "quận 7", "nghĩa hành",
        "quận 6", "quận 5", "hội an", "quận 4", "quận 3", "phong điền",
        "xuân lộc", "côn đảo", "nha trang", "tân lạc", "hạ hoà", "gia viễn",
        "đồng tháp", "hoành bồ", "bắc quang", "na rì", "sông cầu", "mường tè",
        "yên phong", "tứ kỳ", "vũ thư", "mỹ hào", "chư prông", "hóc môn",
        "châu đốc", "đô lương", "mang thít", "tràng định", "cam ranh", "mang yang",
        "hàm thuận nam", "hưng nguyên", "kiến xương", "ninh phước", "phong thổ", "đức thọ",
        "hồng lĩnh", "khánh vĩnh", "mỹ lộc", "ngọc hiển", "phước sơn", "hà đông",
        "lào cai", "vĩnh yên", "quỳ châu", "sơn động", "bến cầu", "đông anh",
        "kông chro", "trảng bom", "đông triều", "ba tơ", "cù lao dung", "mỹ xuyên",
        "quảng hà", "tân biên", "bá thước", "cà mau", "chi lăng", "yên bình",
        "bình minh", "bình dương", "an nhơn", "chư păh", "việt nam", "giồng riềng",
        "cát tiên", "thuận an", "ngã năm", "cẩm thuỷ", "minh long", "nam đàn",
        "tân hiệp", "thanh sơn", "dĩ an", "thuận thành", "điện biên phủ", "vạn ninh",
        // Truncated duplicate of "hưng yên" (already listed above); kept so
        // the set stays a superset of the original.
        "hưng yê",
        "thái thuỵ", "thanh xuân", "cần giờ", "ngũ hành sơn", "ba tri", "hồng dân",
        "ninh giang", "phan rang tháp chàm", "than uyên", "phú lộc", "thanh chương", "lục ngạn",
        "năm căn", "điện biên đông", "hữu lũng", "hoàng su phì", "tây hồ", "bắc yên",
        "sài gòn", "vĩnh cửu", "bình phước", "nam sách", "hưng hà", "bình chánh",
        "uông bí", "ea h'leo", "tam điệp", "nam giang", "trùng khánh", "gia lộc",
        "tam dương", "hoà an", "thừa thiên huế", "nông cống", "tam kỳ", "đak pơ",
        "bình thạnh", "hà nội", "châu thành", "tiên lữ", "cầu kè", "ninh kiều",
        "buôn ma thuột", "an khê", "đức huệ", "tiền hải", "tuy phước", "bà rịa",
        "đa krông", "đồng xoài", "ba vì", "quảng ninh", "điện biên", "hà trung",
        "thanh oai", "trà cú", "văn yên", "bình xuyên", "hoà vang", "trà lĩnh",
        "yên khánh", "kbang", "hoàng sa", "văn quan", "ba chẽ", "nho quan",
        "khánh hoà", "đăk mil", "kiến thuỵ", "đầm hà", "hàm tân", "phù cát",
        "kim bảng", "vũng tầu", "kiên giang", "long hồ", "mường chà", "thanh ba",
        "đại lộc", "mê linh", "mường lát", "đạ huoai", "huế", "cần thơ",
        "vụ bản", "thanh liêm", "đoan hùng", "hiệp hoà", "bắc sơn", "tân trụ",
        "cần giuộc", "đăk glong", "hậu lộc", "kỳ anh", "cai lậy", "krông bông",
        "yên lập", "mù căng chải", "mỹ tú", "trạm tấu", "cư jút", "quỳ hợp",
        "tân phước", "vĩnh lợi", "đồng văn", "đông sơn", "tây trà", "lộc ninh",
        "sầm sơn", "lạng sơn", "sa pa", "hàm yên", "vân đồn", "đà bắc",
        "vân canh", "sơn hoà", "thuận bắc", "châu đức", "thường xuân", "định hoá",
        "giồng trôm", "núi thành", "rạch giá", "con cuông", "ninh bình", "đồng hới",
        "tân an", "trực ninh", "thuận châu", "vinh", "trần văn thời", "minh hoá",
        "yên mỹ", "quan hoá", "văn bàn", "cam lộ", "lang chánh", "phù yên",
        "đăk tô", "hoa lư", "lục yên", "đức phổ", "hà nam", "tuy an",
        "an giang", "ba bể", "xuân trường", "cát hải", "kon tum", "bù đăng",
        "krông năng", "an biên", "yên châu", "phú thọ", "tam nông", "quỳnh lưu",
        "đình lập", "nghi lộc", "chợ mới", "đức trọng", "đầm dơi", "long đất",
        "mường lay", "tiền giang", "thông nông", "phú yên", "quảng bình", "sìn hồ",
        "tuy phong", "ba đình", "phù mỹ", "đức hoà", "bảo lạc", "đăk glei",
        "bến tre", "như thanh", "thanh thuỷ", "đà lạt", "đức cơ", "văn chấn",
        "bà rịa vũng tàu", "vĩnh hưng", "cao phong", "nam trà my", "phú giáo", "phú bình",
        "ayun pa", "mỹ đức", "tuần giáo",

        // Correct spellings of districts that the original list only
        // contained in misspelled form ("vãn lãng", "đồng phù").
        "văn lãng", "đồng phú"
    };

    VN_LOCATIONS = new HashSet<String>();
    for (String locationName : locationNames) {
        VN_LOCATIONS.add(locationName);
    }
}
1113
+
1114
/**
 * Lower-cased single syllables looked up by the word segmenter: a capitalised
 * token whose lower-cased form is in this set is emitted as a word of its own
 * ("B" tag) instead of being merged with the capitalised tokens that follow.
 */
public static Set<String> VN_FIRST_SENT_WORDS;
static {
    VN_FIRST_SENT_WORDS = new HashSet<String>();
    // Elements are added in the same order as the original one-per-line list.
    java.util.Collections.addAll(VN_FIRST_SENT_WORDS,
            "được", "cty", "mẹ", "trừ", "lên", "trưởng", "là", "chàng",
            "theo", "tên", "giờ", "biết", "già", "những", "thấy", "thương",
            "lang", "gái", "mà", "xóm", "má", "cầu", "khách", "nhánh",
            "hôm", "nhớ", "hạng", "huyện", "vậy", "nhà", "ấp", "sông",
            "thằng", "nài", "ngành", "nếu", "trời", "đảng", "vào", "thầy",
            "hai", "vùng", "chuyện", "nhìn", "tim", "cha", "sang", "bên",
            "đường", "cho", "bảng", "khi", "quận", "biển", "cu", "metro",
            "vốn", "đến", "năm", "khu", "đài", "miền", "việc", "do",
            "lập", "nghe", "mắt", "viện", "cả", "em", "rừng", "liệu",
            "bố", "bộ", "cháu", "riêng", "bà", "số", "chị", "người",
            "bé", "tàu", "làng", "cảng", "sở", "chiếc", "tết", "cậu",
            "luật", "chờ", "rời", "chắc", "hội", "chợ", "viên", "cụ",
            "nay", "thuốc", "bọn", "tờ", "phía", "chữ", "xe", "cò",
            "có", "cô", "dân", "nhóm", "song", "chú", "từ", "như",
            "ngày", "phim", "chính", "tân", "gặp", "các", "quê", "dì",
            "bởi", "quí", "về", "trại", "tại", "lão", "đảo", "nguyên",
            "còn", "tiếng", "dòng", "và", "hiện", "vợ", "thuyền", "vụ",
            "đoàn", "thành", "giới", "bến", "vì", "đi", "sân", "sâm",
            "con", "bác", "cùng", "báo", "chồng", "hàng", "đất", "mỗi",
            "núi", "phòng", "xã", "hồ", "ông", "giọng", "trường", "đèo",
            "trùm", "nhiều", "thư", "cục", "nước", "thôn", "bạn", "nàng",
            "bệnh", "cụm", "tướng", "buôn", "để", "anh", "lính", "với",
            "ngoài", "trên", "hỏi", "sau", "đội", "gọi", "rồi", "một",
            "chúc", "nhưng", "đêm", "phó", "bỗng", "trong", "trước", "bản",
            "cuốn", "chùa", "ban", "giữa", "ngay", "lúc", "tỉnh", "tuy",
            "vẫn",
            "trà", "ôi", "cặp", "taxi", "nhiễm", "virus", "hồi", "nghĩa",
            "đọc", "nhờ", "tới", "ong", "website", "bóng", "quít", "kungfu",
            "ra", "đồng", "băng", "ba", "bầu", "hay", "giải", "giao",
            "cửa", "phần", "sinh", "vietcombank", "vàng", "fred", "tập", "toyota",
            "bế", "tuồng", "nguồn", "phường", "làm", "tuyển", "đền", "mong",
            "nghỉ", "hầm", "trán", "dắt", "sợ", "chỗ", "lái", "xem",
            "chủ", "chứ", "đợt", "đoạn", "đồn", "trục", "tự", "neil",
            "điện", "trạm", "gần", "giặc", "cúng", "dù", "vịnh", "quân",
            "dãy", "pha", "toàn", "tháp", "quĩ", "đĩa", "gà", "lao",
            "bốn", "họ", "họp", "đèn", "cũng", "động", "mặt", "đầm",
            "cống", "nơi", "tùng", "phố", "đầu", "vượt", "sao", "cách",
            "hoặc", "của", "hết", "đỉnh", "kênh", "quyền", "bar", "chống",
            "khắp", "sách", "wikipedia");
}
1419
+
1420
/**
 * Lower-cased syllables treated as Vietnamese middle names. The word segmenter
 * attaches a capitalised token found here to the preceding token ("I" tag)
 * when that preceding token is a known family name.
 */
public static Set<String> VN_MIDDLE_NAMES;
static {
    VN_MIDDLE_NAMES = new HashSet<String>();
    // Elements are added in the same order as the original one-per-line list.
    java.util.Collections.addAll(VN_MIDDLE_NAMES,
            "thúy", "bao", "thùy", "mạnh", "mỹ", "an", "hoa", "nữ",
            "trường", "vĩnh", "đắc", "minh", "thanh", "thi", "thu", "ninh",
            "đình", "hải", "tuấn", "bội", "thuý", "việt", "nguyễn", "bá",
            "phương", "bé", "tố", "quốc", "nguyệt", "tử", "cảnh", "trọng",
            "huy", "nam", "chí", "thái", "thành", "chính", "đinh", "mai",
            "thiên", "tôn", "phi", "hà", "khắc", "trúc", "lan", "doãn",
            "nhất", "huỳnh", "quỳnh", "diễm", "khánh", "hữu", "tấn", "anh",
            "hoành", "hoàng", "diệu", "lê", "phú", "duy", "bảo", "huyền",
            "nguyên", "bích", "ánh", "công", "mộng", "lệnh", "cẩm", "phúc",
            "nhật", "ngọc", "thời", "sơn", "thuỳ", "văn", "vân", "qui",
            "hồng", "thế", "kim", "thị", "danh", "hoài", "tiến", "tú",
            "bửu", "trung", "thiện", "tuyết", "đăng", "như", "hùng", "vô",
            "miên", "quý", "quang", "đức", "ưng", "tường", "kiều", "thảo",
            "xuân", "viết", "vũ", "kế", "gia", "phước", "linh", "cao",
            "lệ");
}
1537
+
1538
/**
 * Lower-cased syllables treated as Vietnamese family names. The word segmenter
 * consults this set for the token that precedes a candidate middle name.
 */
public static Set<String> VN_FAMILY_NAMES;
static {
    VN_FAMILY_NAMES = new HashSet<String>();
    // Elements are added in the same order as the original one-per-line list.
    java.util.Collections.addAll(VN_FAMILY_NAMES,
            "bảo", "phan", "lý", "bao", "huyền", "lưu", "nguyên", "diêu",
            "vĩnh", "ngô", "công", "giang", "đào", "bùi", "hông", "ngọc",
            "chi", "bưu", "tạ", "nguyễn", "văn", "qui", "hồng", "quy",
            "từ", "trân", "hường", "tô", "mạc", "bửu", "đặng", "huyên",
            "lâm", "võ", "đinh", "miên", "mai", "hương", "lương", "hồ",
            "tôn", "ưng", "la", "thân", "hà", "dương", "trịnh", "tằng",
            "lan", "doãn", "vinh", "trần", "huỳnh", "vương", "vũ", "cao",
            "phạm", "hoàng", "đỗ", "trương", "đoàn", "diệp", "lê");
}
1605
+ }
VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/WordSegmenter.java ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.wordsegmenter;
2
+
3
+ import org.apache.log4j.Logger;
4
+
5
+ import java.io.BufferedReader;
6
+ import java.io.File;
7
+ import java.io.FileInputStream;
8
+ import java.io.IOException;
9
+ import java.io.InputStreamReader;
10
+
11
+ import java.util.ArrayList;
12
+ import java.util.Arrays;
13
+ import java.util.List;
14
+
15
+ /**
16
+ * @author DatQuocNguyen
17
+ */
18
+ public class WordSegmenter {
19
+ private Node root;
20
+ private static WordSegmenter wordSegmenter = null;
21
+ public final static Logger LOGGER = Logger.getLogger(WordSegmenter.class);
22
+ public WordSegmenter()
23
+ throws IOException {
24
+ LOGGER.info("Loading Word Segmentation model");
25
+ String modelPath = vn.pipeline.Utils.jarDir + "/models/wordsegmenter/wordsegmenter.rdr";
26
+ if (!new File(modelPath).exists())
27
+ throw new IOException("WordSegmenter: " + modelPath + " is not found!");
28
+
29
+ this.constructTreeFromRulesFile(modelPath);
30
+ }
31
+
32
+ public static WordSegmenter initialize() throws IOException {
33
+ if(wordSegmenter == null) {
34
+ wordSegmenter = new WordSegmenter();
35
+ }
36
+ return wordSegmenter;
37
+ }
38
+ private void constructTreeFromRulesFile(String rulesFilePath)
39
+ throws IOException {
40
+ BufferedReader buffer = new BufferedReader(
41
+ new InputStreamReader(new FileInputStream(new File(rulesFilePath)), "UTF-8"));
42
+ String line = buffer.readLine();
43
+
44
+ this.root = new Node(new FWObject(false), "NN", null, null, null, 0);
45
+
46
+ Node currentNode = this.root;
47
+ int currentDepth = 0;
48
+
49
+ for (; (line = buffer.readLine()) != null; ) {
50
+ int depth = 0;
51
+ for (int i = 0; i <= 6; i++) { // Supposed that the maximum
52
+ // exception level is up to 6.
53
+ if (line.charAt(i) == '\t')
54
+ depth += 1;
55
+ else
56
+ break;
57
+ }
58
+
59
+ line = line.trim();
60
+ if (line.length() == 0)
61
+ continue;
62
+
63
+ if (line.contains("cc:"))
64
+ continue;
65
+
66
+ FWObject condition = Utils.getCondition(line.split(" : ")[0].trim());
67
+ String conclusion = Utils.getConcreteValue(line.split(" : ")[1].trim());
68
+
69
+ Node node = new Node(condition, conclusion, null, null, null, depth);
70
+
71
+ if (depth > currentDepth) {
72
+ currentNode.setExceptNode(node);
73
+ } else if (depth == currentDepth) {
74
+ currentNode.setIfnotNode(node);
75
+ } else {
76
+ while (currentNode.getDepth() != depth)
77
+ currentNode = currentNode.getFatherNode();
78
+ currentNode.setIfnotNode(node);
79
+ }
80
+ node.setFatherNode(currentNode);
81
+
82
+ currentNode = node;
83
+ currentDepth = depth;
84
+ }
85
+ buffer.close();
86
+ }
87
+
88
+ private Node findFiredNode(FWObject object) {
89
+ Node currentN = root;
90
+ Node firedN = null;
91
+ while (true) {
92
+ if (currentN.satisfy(object)) {
93
+ firedN = currentN;
94
+ if (currentN.getExceptNode() == null) {
95
+ break;
96
+ } else {
97
+ currentN = currentN.getExceptNode();
98
+ }
99
+ } else {
100
+ if (currentN.getIfnotNode() == null) {
101
+ break;
102
+ } else {
103
+ currentN = currentN.getIfnotNode();
104
+ }
105
+ }
106
+
107
+ }
108
+
109
+ return firedN;
110
+ }
111
+
112
+ private List<WordTag> getInitialSegmentation(String sentence)
113
+ {
114
+ List<WordTag> wordtags = new ArrayList<>();
115
+
116
+ for (String regex : Utils.NORMALIZER_KEYS)
117
+ if (sentence.contains(regex))
118
+ sentence = sentence.replaceAll(regex, Utils.NORMALIZER.get(regex));
119
+
120
+ List<String> tokens = Arrays.asList(sentence.split("\\s+"));
121
+ List<String> lowerTokens = Arrays.asList(sentence.toLowerCase().split("\\s+"));
122
+
123
+ int senLength = tokens.size();
124
+ int i = 0;
125
+ while (i < senLength) {
126
+ String token = tokens.get(i);
127
+ if (token.chars().allMatch(Character::isLetter)) {
128
+
129
+ if (Character.isLowerCase(token.charAt(0)) && (i + 1) < senLength) {
130
+ if (Character.isUpperCase(tokens.get(i + 1).charAt(0))) {
131
+ wordtags.add(new WordTag(token, "B"));
132
+ i++;
133
+ continue;
134
+ }
135
+ }
136
+
137
+ boolean isSingleSyllabel = true;
138
+ for (int j = Math.min(i + 4, senLength); j > i + 1; j--) {
139
+ String word = String.join(" ", lowerTokens.subList(i, j));
140
+ if (Vocabulary.VN_DICT.contains(word)
141
+ || Vocabulary.VN_LOCATIONS.contains(word) || Vocabulary.COUNTRY_L_NAME.contains(word)) {
142
+
143
+ wordtags.add(new WordTag(token, "B"));
144
+ for (int k = i + 1; k < j; k++)
145
+ wordtags.add(new WordTag(tokens.get(k), "I"));
146
+
147
+ i = j - 1;
148
+
149
+ isSingleSyllabel = false;
150
+ break;
151
+ }
152
+ }
153
+ if (isSingleSyllabel) {
154
+ String lowercasedToken = lowerTokens.get(i);
155
+
156
+ if (Vocabulary.VN_FIRST_SENT_WORDS.contains(lowercasedToken)
157
+ || Character.isLowerCase(token.charAt(0))
158
+ || token.chars().allMatch(Character::isUpperCase)
159
+ || Vocabulary.COUNTRY_S_NAME.contains(lowercasedToken)
160
+ || Vocabulary.WORLD_COMPANY.contains(lowercasedToken)) {
161
+
162
+ wordtags.add(new WordTag(token, "B"));
163
+ i++;
164
+ continue;
165
+
166
+ }
167
+
168
+ // Capitalized
169
+ int ilower = i + 1;
170
+ for (ilower = i + 1; ilower < Math.min(i + 4, senLength); ilower++) {
171
+ String ntoken = tokens.get(ilower);
172
+ if (Character.isLowerCase(ntoken.charAt(0))
173
+ || !ntoken.chars().allMatch(Character::isLetter)
174
+ || ntoken.equals("LBKT") || ntoken.equals("RBKT")) {
175
+ break;
176
+ }
177
+ }
178
+
179
+ if (ilower > i + 1) {
180
+ boolean isNotMiddleName = true;
181
+ if (Vocabulary.VN_MIDDLE_NAMES.contains(lowercasedToken) && (i >= 1)) {
182
+ String prevT = tokens.get(i - 1);
183
+ if (Character.isUpperCase(prevT.charAt(0))) {
184
+ if (Vocabulary.VN_FAMILY_NAMES.contains(prevT.toLowerCase())) {
185
+ wordtags.add(new WordTag(token, "I"));
186
+ isNotMiddleName = false;
187
+ }
188
+ }
189
+ }
190
+ if (isNotMiddleName)
191
+ wordtags.add(new WordTag(token, "B"));
192
+ for (int k = i + 1; k < ilower; k++)
193
+ wordtags.add(new WordTag(tokens.get(k), "I"));
194
+
195
+ i = ilower - 1;
196
+ }
197
+ else {
198
+ wordtags.add(new WordTag(token, "B"));
199
+ }
200
+ }
201
+ }
202
+ else {
203
+ wordtags.add(new WordTag(token, "B"));
204
+ }
205
+
206
+ i++;
207
+ }
208
+
209
+ return wordtags;
210
+
211
+ }
212
+
213
+ public String segmentTokenizedString(String str)
214
+ throws IOException {
215
+ StringBuilder sb = new StringBuilder();
216
+
217
+ String line = str.trim();
218
+ if (line.length() == 0) {
219
+ return "\n";
220
+ }
221
+
222
+ List<WordTag> wordtags = this.getInitialSegmentation(line);
223
+
224
+ int size = wordtags.size();
225
+ for (int i = 0; i < size; i++) {
226
+ FWObject object = Utils.getObject(wordtags, size, i);
227
+ Node firedNode = findFiredNode(object);
228
+ if (firedNode.getDepth() > 0) {
229
+ if (firedNode.getConclusion().equals("B"))
230
+ sb.append(" " + wordtags.get(i).form);
231
+ else
232
+ sb.append("_" + wordtags.get(i).form);
233
+ }
234
+ else {// Fired at root, return initialized tag
235
+ if (wordtags.get(i).tag.equals("B"))
236
+ sb.append(" " + wordtags.get(i).form);
237
+ else
238
+ sb.append("_" + wordtags.get(i).form);
239
+ }
240
+ }
241
+ return sb.toString().trim();
242
+ }
243
+
244
+ }
245
+
VnCoreNLP/src/main/java/vn/corenlp/wordsegmenter/WordTag.java ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.corenlp.wordsegmenter;
2
+
3
/**
 * A token paired with its segmentation tag.
 */
public class WordTag {
    /** The token exactly as given to the constructor. */
    public String form;
    /** Lower-cased copy of {@link #form}. */
    public String word;
    /** Tag assigned to the token. */
    public String tag;

    /**
     * @param iword the token text
     * @param itag  the tag to attach to the token
     */
    public WordTag(String iword, String itag) {
        this.form = iword;
        this.tag = itag;
        this.word = iword.toLowerCase();
    }
}
VnCoreNLP/src/main/java/vn/pipeline/Annotation.java ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.pipeline;
2
+
3
+ import java.io.*;
4
+ import java.util.*;
5
+
6
+ public class Annotation {
7
+ private String rawText;
8
+ private List<String> tokens;
9
+ private String wordSegmentedText;
10
+ private List<Word> words;
11
+ private List<Sentence> sentences;
12
+
13
+ public Annotation(String rawText) {
14
+ this.rawText = rawText.trim();
15
+ this.tokens = new ArrayList<>();
16
+ this.wordSegmentedText = "";
17
+ this.words = new ArrayList<>();
18
+ }
19
+
20
+ public String detectLanguage() {
21
+ try {
22
+ return Utils.detectLanguage(rawText);
23
+ } catch (IOException e) {
24
+ System.err.println("Cannot detect language!");
25
+ }
26
+ // Can't detect language
27
+ return "N/A";
28
+ }
29
+
30
+ public static boolean isAlphabetic(String str) {
31
+ char[] chars = str.toCharArray();
32
+
33
+ for (char c : chars) {
34
+ if (!Character.isLetter(c)) {
35
+ return false;
36
+ }
37
+ }
38
+
39
+ return true;
40
+ }
41
+
42
+ @Override
43
+ public String toString() {
44
+ StringBuffer sb = new StringBuffer();
45
+ if(sentences != null)
46
+ for(Sentence sentence : sentences) {
47
+ sb.append(sentence.toString() + "\n\n");
48
+ }
49
+ else return rawText;
50
+ return sb.toString();
51
+ }
52
+
53
+ // Word count
54
+ public HashMap<String, Integer> wordCount() {
55
+ HashMap<String, Integer> output = new HashMap<>();
56
+ for (Word np : words) {
57
+ String w = np.getForm();
58
+ if (!output.containsKey(w)) output.put(w, 1);
59
+ else output.put(w, output.get(w) + 1);
60
+ }
61
+ return output;
62
+ }
63
+
64
+ public LinkedHashMap<String, Integer> ngrams(int n, boolean isWordLevel) {
65
+ if (!isWordLevel) return ngramAtCharacterLevel(n);
66
+ return ngramAtWordLevel(n);
67
+ }
68
+
69
+ private LinkedHashMap<String, Integer> ngramAtCharacterLevel(int n) {
70
+ LinkedHashMap<String, Integer> output = new LinkedHashMap<>();
71
+ for (int i = 0; i < this.rawText.length(); i++) {
72
+ StringBuffer sb = new StringBuffer();
73
+ if (i + n <= this.rawText.length()) {
74
+ for (int j = i; j < i + n; j++)
75
+ sb.append(this.rawText.charAt(j));
76
+ String ngram = sb.toString();
77
+ if (!output.containsKey(ngram)) output.put(ngram, 1);
78
+ else output.put(ngram, output.get(ngram) + 1);
79
+ }
80
+ }
81
+ return output;
82
+ }
83
+
84
+ private LinkedHashMap<String, Integer> ngramAtWordLevel(int n) {
85
+ LinkedHashMap<String, Integer> output = new LinkedHashMap<>();
86
+ for (int i = 0; i < this.tokens.size(); i++) {
87
+ StringBuffer sb = new StringBuffer();
88
+ if (i + n <= this.tokens.size()) {
89
+ for (int j = i; j < i + n; j++)
90
+ sb.append(this.tokens.get(j) + " ");
91
+ String ngram = sb.toString();
92
+ if (!output.containsKey(ngram)) output.put(ngram, 1);
93
+ else output.put(ngram, output.get(ngram) + 1);
94
+ }
95
+ }
96
+ return output;
97
+ }
98
+
99
+ public String getRawText() {
100
+ return rawText;
101
+ }
102
+
103
+ public List<Sentence> getSentences() {
104
+ return sentences;
105
+ }
106
+
107
+ public List<String> getTokens() {
108
+ return tokens;
109
+ }
110
+
111
+ public String getWordSegmentedText() {
112
+ return wordSegmentedText;
113
+ }
114
+
115
+
116
+ public String getWordSegmentedTaggedText() {
117
+ StringBuffer wordSegmentedTaggedText = new StringBuffer();
118
+ for(Sentence sentence : sentences) {
119
+ wordSegmentedTaggedText.append(sentence.getWordSegmentedTaggedSentence() + " ");
120
+ }
121
+ return wordSegmentedTaggedText.toString().trim();
122
+ }
123
+
124
+ public List<Word> getWords() {
125
+ return words;
126
+ }
127
+
128
+ public void setRawText(String rawText) {
129
+ this.rawText = rawText;
130
+ }
131
+
132
+ public void setTokens(List<String> tokens) {
133
+ this.tokens = tokens;
134
+ }
135
+
136
+ public void setWordSegmentedText(String wordSegmentedText) {
137
+ this.wordSegmentedText = wordSegmentedText;
138
+ }
139
+
140
+ public void setWords(List<Word> words) {
141
+ this.words = words;
142
+ }
143
+
144
+ public void setSentences(List<Sentence> sentences) {
145
+ this.sentences = sentences;
146
+ }
147
+ }
VnCoreNLP/src/main/java/vn/pipeline/LexicalInitializer.java ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.pipeline;
2
+
3
+ import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica;
4
+
5
+ import org.apache.log4j.Logger;
6
+ import org.w3c.dom.Document;
7
+ import org.w3c.dom.Element;
8
+
9
+ import javax.xml.parsers.DocumentBuilder;
10
+ import javax.xml.parsers.DocumentBuilderFactory;
11
+ import java.io.File;
12
+ import java.io.IOException;
13
+ import java.util.HashMap;
14
+ import java.util.logging.Handler;
15
+ import java.util.logging.Level;
16
+
17
+ public class LexicalInitializer {
18
+ private static LexicalInitializer lexicalInitializer;
19
+ private HashMap<String, String> lexicalMap ;
20
+ private boolean initLexica = false;
21
+ private GlobalLexica globalLexica;
22
+
23
+ public final static Logger LOGGER = Logger.getLogger(LexicalInitializer.class);
24
+
25
+ public LexicalInitializer(boolean initLexica) throws IOException {
26
+
27
+ this.initLexica = initLexica;
28
+ this.lexicalMap = new HashMap<>();
29
+
30
+ String lexicalPath = Utils.jarDir + "/models/ner/vi-500brownclusters.xz";
31
+ if (!new File(lexicalPath).exists())
32
+ throw new IOException("LexicalInitializer: " + lexicalPath + " is not found!");
33
+ lexicalMap.put("word_clusters", lexicalPath);
34
+
35
+ lexicalPath = Utils.jarDir + "/models/ner/vi-pretrainedembeddings.xz";
36
+ if (!new File(lexicalPath).exists())
37
+ throw new IOException("LexicalInitializer: " + lexicalPath + " is not found!");
38
+ lexicalMap.put("word_embeddings", lexicalPath);
39
+ }
40
+
41
+ public static LexicalInitializer initialize(boolean initLexica) throws IOException {
42
+ if (lexicalInitializer == null) {
43
+ lexicalInitializer = new LexicalInitializer(initLexica);
44
+ lexicalInitializer.initializeLexica();
45
+ }
46
+ return lexicalInitializer;
47
+ }
48
+
49
+ public GlobalLexica initializeLexica() {
50
+ if (globalLexica == null && initLexica)
51
+ try {
52
+
53
+ DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance();
54
+ DocumentBuilder docBuilder = dbfac.newDocumentBuilder();
55
+ Document xmlDoc = docBuilder.newDocument();
56
+ Element root = xmlDoc.createElement("root");
57
+ Element lexicals = xmlDoc.createElement("lexica");
58
+ for(String lexicalName : lexicalMap.keySet()) {
59
+ Element lexical = xmlDoc.createElement(lexicalName);
60
+ lexical.setAttribute("field", "word_form_lowercase");
61
+ if(!new File(lexicalMap.get(lexicalName)).exists()) return null;
62
+ lexical.appendChild(xmlDoc.createTextNode(lexicalMap.get(lexicalName)));
63
+ lexicals.appendChild(lexical);
64
+ }
65
+ root.appendChild(lexicals);
66
+
67
+ java.util.logging.Logger globalLogger = java.util.logging.Logger.getLogger("");
68
+ globalLogger.setLevel(Level.OFF);
69
+ Handler[] handlers = globalLogger.getHandlers();
70
+ for(Handler handler : handlers) {
71
+ globalLogger.removeHandler(handler);
72
+ }
73
+
74
+ globalLexica = new GlobalLexica<>(root);
75
+ } catch (Exception e) {
76
+ e.printStackTrace();
77
+ }
78
+ return globalLexica;
79
+ }
80
+
81
+
82
+ }
VnCoreNLP/src/main/java/vn/pipeline/Sentence.java ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.pipeline;
2
+
3
+ import vn.corenlp.ner.NerRecognizer;
4
+ import vn.corenlp.parser.DependencyParser;
5
+ import vn.corenlp.postagger.PosTagger;
6
+ import vn.corenlp.wordsegmenter.WordSegmenter;
7
+ import vn.corenlp.tokenizer.Tokenizer;
8
+
9
+ import java.io.IOException;
10
+ import java.util.ArrayList;
11
+ import java.util.List;
12
+
13
+ public class Sentence {
14
+ private String rawSentence;
15
+ private List<String> tokens;
16
+ private String wordSegmentedSentence;
17
+
18
+ private List<Word> words;
19
+
20
+ private WordSegmenter wordSegmenter ;
21
+ private PosTagger posTagger;
22
+ private NerRecognizer nerRecognizer;
23
+ private DependencyParser dependencyParser;
24
+
25
+ public Sentence(String rawSentence, WordSegmenter wordSegmenter, PosTagger tagger, NerRecognizer nerRecognizer, DependencyParser dependencyParser) throws IOException {
26
+ this.posTagger = tagger;
27
+ this.nerRecognizer = nerRecognizer;
28
+ this.dependencyParser = dependencyParser;
29
+ this.wordSegmenter = wordSegmenter;
30
+ init(rawSentence.trim());
31
+ }
32
+
33
+
34
+ public String detectLanguage() {
35
+ try {
36
+ return Utils.detectLanguage(rawSentence);
37
+ } catch (IOException e) {
38
+ System.err.println("Cannot detect language!");
39
+ }
40
+ // Can't detect language
41
+ return "N/A";
42
+ }
43
+
44
+ private void init(String rawSentence) throws IOException {
45
+ this.rawSentence = rawSentence;
46
+ this.tokens = Tokenizer.tokenize(this.rawSentence);
47
+
48
+ if(this.wordSegmenter != null) {
49
+ this.wordSegmentedSentence = this.wordSegmenter.segmentTokenizedString(this.rawSentence);
50
+ }
51
+ else this.wordSegmentedSentence = String.join(" ", this.tokens);
52
+
53
+ this.createWords();
54
+
55
+ }
56
+
57
+ private void createWords() throws IOException {
58
+
59
+ if (this.posTagger != null)
60
+ this.words = posTagger.tagSentence(this.wordSegmentedSentence);
61
+ else {
62
+ this.words = new ArrayList<>();
63
+ String[] segmentedTokens = this.wordSegmentedSentence.split(" ");
64
+ for (int i = 0; i < segmentedTokens.length; i++) {
65
+ Word word = new Word((i+1), segmentedTokens[i]);
66
+ this.words.add(word);
67
+ }
68
+ }
69
+
70
+ if (this.nerRecognizer != null)
71
+ this.nerRecognizer.tagSentence(this.words);
72
+ if (this.dependencyParser != null)
73
+ this.dependencyParser.tagSentence(this.words);
74
+
75
+ }
76
+
77
+ @Override
78
+ public String toString() {
79
+ StringBuffer sb = new StringBuffer();
80
+ for (Word word : words) {
81
+ sb.append(word.toString() + "\n");
82
+ }
83
+ return sb.toString().trim();
84
+ }
85
+
86
+ public String getRawSentence() {
87
+ return rawSentence;
88
+ }
89
+
90
+ public List<String> getTokens() {
91
+ return tokens;
92
+ }
93
+
94
+ public String getWordSegmentedSentence() {
95
+ return wordSegmentedSentence;
96
+ }
97
+
98
+ public List<Word> getWords() {
99
+ return words;
100
+ }
101
+
102
+ public String getWordSegmentedTaggedSentence() {
103
+ StringBuffer wordSegmentedTaggedSentence = new StringBuffer();
104
+ for(Word word : this.words) {
105
+ wordSegmentedTaggedSentence.append(word.toString() + " ");
106
+ }
107
+ return wordSegmentedTaggedSentence.toString().trim();
108
+ }
109
+
110
+ }
VnCoreNLP/src/main/java/vn/pipeline/Utils.java ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.pipeline;
2
+
3
+ import com.optimaize.langdetect.DetectedLanguage;
4
+ import com.optimaize.langdetect.LanguageDetector;
5
+ import com.optimaize.langdetect.LanguageDetectorBuilder;
6
+ import com.optimaize.langdetect.ngram.NgramExtractors;
7
+ import com.optimaize.langdetect.profiles.LanguageProfileReader;
8
+
9
+ import java.io.File;
10
+ import java.io.IOException;
11
+ import java.util.List;
12
+
13
+ public class Utils {
14
+ private static File jarFile = new File(VnCoreNLP.class.getProtectionDomain().getCodeSource().getLocation().getPath());
15
+ public static String jarDir = jarFile.getParentFile().getPath();
16
+
17
+ private static LanguageDetector languageDetector = null;
18
+ public static String detectLanguage(String text) throws IOException{
19
+ if(languageDetector == null) {
20
+ languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
21
+ .shortTextAlgorithm(0)
22
+ .withProfiles(new LanguageProfileReader().readAllBuiltIn())
23
+ .build();
24
+ }
25
+ List<DetectedLanguage> detectedLanguages = languageDetector.getProbabilities(text);
26
+ if(detectedLanguages.size() > 0)
27
+ return detectedLanguages.get(0).getLocale().getLanguage();
28
+ return "N/A";
29
+ }
30
+
31
+ }
VnCoreNLP/src/main/java/vn/pipeline/VnCoreNLP.java ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.pipeline;
2
+
3
+ import org.apache.log4j.Logger;
4
+ import vn.corenlp.ner.NerRecognizer;
5
+ import vn.corenlp.parser.DependencyParser;
6
+ import vn.corenlp.postagger.PosTagger;
7
+ import vn.corenlp.tokenizer.Tokenizer;
8
+ import vn.corenlp.wordsegmenter.WordSegmenter;
9
+
10
+ import java.io.*;
11
+ import java.util.ArrayList;
12
+ import java.util.List;
13
+
14
+
15
+ public class VnCoreNLP {
16
+
17
+ public final static Logger LOGGER = Logger.getLogger(Annotation.class);
18
+
19
+ private WordSegmenter wordSegmenter;
20
+ private PosTagger posTagger;
21
+ private NerRecognizer nerRecognizer;
22
+ private DependencyParser dependencyParser;
23
+
24
+ public VnCoreNLP() throws IOException {
25
+ String[] annotators = {"wseg", "pos", "ner", "parse"};
26
+ initAnnotators(annotators);
27
+ }
28
+
29
+ public VnCoreNLP(String[] annotators) throws IOException {
30
+ initAnnotators(annotators);
31
+
32
+ }
33
+
34
+ public void initAnnotators(String[] annotators) throws IOException{
35
+ for(String annotator : annotators) {
36
+ switch (annotator.trim()) {
37
+ case "parse":
38
+ this.dependencyParser = DependencyParser.initialize();
39
+ break;
40
+ case "ner":
41
+ this.nerRecognizer = NerRecognizer.initialize();
42
+ break;
43
+ case "pos":
44
+ this.posTagger = PosTagger.initialize();
45
+ break;
46
+ case "wseg":
47
+ this.wordSegmenter = WordSegmenter.initialize();
48
+ break;
49
+ }
50
+ }
51
+
52
+ }
53
+
54
+ public void printToFile(Annotation annotation, PrintStream printer) throws IOException {
55
+ for(Sentence sentence : annotation.getSentences()) {
56
+ printer.println(sentence.toString());
57
+ }
58
+ }
59
+
60
+ public void printToFile(Annotation annotation, String fileOut) throws IOException {
61
+ PrintStream printer = new PrintStream(fileOut, "UTF-8");
62
+ for(Sentence sentence : annotation.getSentences()) {
63
+ printer.println(sentence.toString() + "\n");
64
+ }
65
+ }
66
+
67
+ public void annotate(Annotation annotation) throws IOException {
68
+ List<String> rawSentences = Tokenizer.joinSentences(Tokenizer.tokenize(annotation.getRawText()));
69
+ annotation.setSentences(new ArrayList<>());
70
+ for (String rawSentence : rawSentences) {
71
+ if (rawSentence.trim().length() > 0) {
72
+ Sentence sentence = new Sentence(rawSentence, wordSegmenter, posTagger, nerRecognizer, dependencyParser);
73
+ annotation.getSentences().add(sentence);
74
+ annotation.getTokens().addAll(sentence.getTokens());
75
+ annotation.getWords().addAll(sentence.getWords());
76
+ annotation.setWordSegmentedText(annotation.getWordSegmentedTaggedText() + sentence.getWordSegmentedSentence() + " ");
77
+ }
78
+
79
+ }
80
+
81
+ annotation.setWordSegmentedText(annotation.getWordSegmentedTaggedText().trim());
82
+
83
+ }
84
+
85
+ public static void printUsage() {
86
+ System.out.println("Usage: \n\t-fin inputFile (required)\n\t-fout outputFile (optional, default: inputFile.out)\n" +
87
+ "\t-annotators functionNames (optional, default: wseg,pos,ner,parse)" +
88
+ "\nExample 1: -fin sample_input.txt -fout output.txt" +
89
+ "\nExample 2: -fin sample_input.txt -fout output.txt -annotators wseg,pos,ner");
90
+ }
91
+
92
+ public static void processPipeline(String fileIn, String fileOut, String[] annotators) throws IOException{
93
+
94
+ FileInputStream fis = new FileInputStream(new File(fileIn));
95
+ InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
96
+ OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(new File(fileOut)), "UTF-8");
97
+
98
+ BufferedReader br = new BufferedReader(isr);
99
+ VnCoreNLP pipeline = new VnCoreNLP(annotators);
100
+ LOGGER.info("Start processing " + fileIn);
101
+ while(br.ready()) {
102
+ String line = br.readLine();
103
+ if (line.trim().length() > 0) {
104
+ Annotation annotation = new Annotation(line);
105
+ pipeline.annotate(annotation);
106
+ osw.write(annotation.toString());
107
+ }
108
+ }
109
+ br.close();
110
+ isr.close();
111
+ fis.close();
112
+ osw.close();
113
+ LOGGER.info("Wrote output to " + fileOut);
114
+ }
115
+
116
+ public static void main(String[] args) throws IOException {
117
+ String fileIn = null, fileOut = null;
118
+ String[] annotators = {"wseg", "pos", "ner", "parse"};
119
+ for(int i = 0; i < args.length; i++) {
120
+ if (args[i].equals("-fin") && i + 1 < args.length) fileIn = args[i+1];
121
+ else if (args[i].equals("-fout") && i + 1 < args.length) fileOut = args[i+1];
122
+ else if (args[i].equals("-annotators") && i + 1 < args.length) annotators = args[i+1].split(",");
123
+ }
124
+
125
+ if (fileIn == null) {
126
+ printUsage();
127
+ return;
128
+ }
129
+
130
+ if (fileOut == null) fileOut = fileIn + ".out";
131
+ processPipeline(fileIn, fileOut, annotators);
132
+ }
133
+
134
+ }
VnCoreNLP/src/main/java/vn/pipeline/Word.java ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package vn.pipeline;
2
+
3
/**
 * One word of an annotated sentence: its index in the sentence, surface form
 * and the optional POS tag, NER label, dependency head and dependency label.
 *
 * <p>Unset string annotations are {@code null}; an unset index or head is
 * {@code -1}.
 */
public class Word {
    private int index = -1;
    private String form;
    private String posTag;
    private String nerLabel;
    private int head = -1;
    private String depLabel;

    /** Copy constructor: copies every field of {@code word}. */
    public Word(Word word) {
        this.index = word.index;
        this.form = word.form;
        this.posTag = word.posTag;
        this.nerLabel = word.nerLabel;
        this.head = word.head;
        this.depLabel = word.depLabel;
    }

    public Word(int index, String form, String posTag) {
        this.index = index;
        this.form = form;
        this.posTag = posTag;
    }

    public Word(int index, String form) {
        this.index = index;
        this.form = form;
    }

    public Word(int index, String form, String posTag, String nerLabel) {
        this.index = index;
        this.form = form;
        this.posTag = posTag;
        this.nerLabel = nerLabel;
    }

    /**
     * @param chunkingLabel accepted for signature compatibility only; the value is not stored
     */
    public Word(int index, String form, String posTag, String nerLabel, String chunkingLabel) {
        this.index = index;
        this.form = form;
        this.posTag = posTag;
        this.nerLabel = nerLabel;
    }

    /**
     * @param chunkingLabel accepted for signature compatibility only; the value is not stored
     */
    public Word(int index, String form, String posTag, String nerLabel, int head, String depLabel, String chunkingLabel) {
        this.index = index;
        this.form = form;
        this.posTag = posTag;
        this.nerLabel = nerLabel;
        this.head = head;
        this.depLabel = depLabel;
    }

    /**
     * Formats the word as six tab-separated columns:
     * index, form, POS tag, NER label, head, dependency label.
     *
     * <p>A missing annotation ({@code null} label, or head {@code -1}) is
     * written as a single {@code _}, so every line has exactly six columns.
     */
    @Override
    public String toString() {
        return this.getIndex() + "\t" +
                this.getForm() + "\t" +
                (this.getPosTag() == null ? "_" : this.getPosTag()) + "\t" +
                (this.getNerLabel() == null ? "_" : this.getNerLabel()) + "\t" +
                // Plain "_" here: the separator below supplies the tab.
                (this.getHead() == -1 ? "_" : String.valueOf(this.getHead())) + "\t" +
                (this.getDepLabel() == null ? "_" : this.getDepLabel());
    }

    public String getForm() {
        return form;
    }

    public void setForm(String form) {
        this.form = form;
    }

    public String getPosTag() {
        return posTag;
    }

    public void setPosTag(String pos) {
        this.posTag = pos;
    }

    public String getNerLabel() {
        return nerLabel;
    }

    public void setNerLabel(String nerLabel) {
        this.nerLabel = nerLabel;
    }

    public int getIndex() {
        return index;
    }

    public void setIndex(int index) {
        this.index = index;
    }

    public int getHead() {
        return head;
    }

    public void setHead(int head) {
        this.head = head;
    }

    public String getDepLabel() {
        return depLabel;
    }

    public void setDepLabel(String depLabel) {
        this.depLabel = depLabel;
    }

}
VnCoreNLP/src/main/resources/log4j.properties ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Root logger option
2
+
3
+ log4j.rootLogger=INFO, stdout
4
+ log4j.logger.edu.emory.mathcs.nlp=OFF
5
+
6
+ # Direct log messages to stdout
7
+ log4j.appender.stdout=org.apache.log4j.ConsoleAppender
8
+ log4j.appender.stdout.Target=System.out
9
+ log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
10
+ log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
VnCoreNLP/src/test/java/VnCoreNLPExample.java ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import vn.pipeline.*;
2
+ import java.io.*;
3
+ public class VnCoreNLPExample {
4
+ public static void main(String[] args) throws IOException {
5
+
6
+ // "wseg", "pos", "ner", and "parse" refer to as word segmentation, POS tagging, NER and dependency parsing, respectively.
7
+ String[] annotators = {"wseg", "pos", "ner", "parse"};
8
+ VnCoreNLP pipeline = new VnCoreNLP(annotators);
9
+
10
+ String str = "Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. "
11
+ + "Bà Lan, vợ ông Chúc, cũng làm việc tại đây.";
12
+ Annotation annotation = new Annotation(str);
13
+ pipeline.annotate(annotation);
14
+
15
+ System.out.println(annotation.toString());
16
+ // 1 Ông Nc O 4 sub
17
+ // 2 Nguyễn_Khắc_Chúc Np B-PER 1 nmod
18
+ // 3 đang R O 4 adv
19
+ // 4 làm_việc V O 0 root
20
+ // ...
21
+
22
+ //Write to file
23
+ PrintStream outputPrinter = new PrintStream("output.txt");
24
+ pipeline.printToFile(annotation, outputPrinter);
25
+
26
+ // You can also get a single sentence to analyze individually
27
+ Sentence firstSentence = annotation.getSentences().get(0);
28
+ System.out.println(firstSentence.toString());
29
+ }
30
+ }