saif committed
Commit ba83523
1 Parent(s): e235524

Update model by training for 25 epochs and two more datasets, i.e., mit restaurant and mit movie trivia.

Files changed:
- README.md +131 -39
- added_tokens.json +1 -1
- config.json +4 -2
- pytorch_model.bin +2 -2
- tokenizer.json +0 -0
- tokenizer_config.json +1 -1
README.md
CHANGED
@@ -17,59 +17,151 @@ The FSNER model was proposed in [Example-Based Named Entity Recognition](https:/
## Model Training Details
-----

-| identifier | epochs
-| ----------
-| [sayef/fsner-bert-base-uncased](https://huggingface.co/sayef/fsner-bert-base-uncased) |
## Installation and Example Usage
------

-from fsner import FSNERModel, FSNERTokenizerUtils
-
-model
-
-'I would like to order a computer from eBay.',
-]
-
-# wrap entities around with [E] and [/E] in the examples
-
-supports = [
-[
-'Horizontal flow wrapper [E] Pack 403 [/E] features the new retrofit-kit „paper-ON-form“',
-'[E] Paloma Pick-and-Place-Roboter [/E] arranges the bakery products for the downstream tray-forming equipment',
-'Finally, the new [E] Kliklok ACE [/E] carton former forms cartons and trays without the use of glue',
-'We set up our pilot plant with the right [E] FibreForm® [/E] configuration to make prototypes for your marketing tests and package validation',
-'The [E] CAR-T5 [/E] is a reliable, purely mechanically driven cartoning machine for versatile application fields'
-],
-[
-"[E] Walmart [/E] is a leading e-commerce company",
-"I recently ordered a book from [E] Amazon [/E]",
-"I ordered this from [E] ShopClues [/E]",
-"Fridge can be ordered in [E] Amazon [/E]",
-"[E] Flipkart [/E] started it's journey from zero"
-]
-]
-
-device = 'cpu'
-
-output = tokenizer.extract_entity_from_scores(query, W_query, start_prob, end_prob, thresh=0.50)
```
## Model Training Details
-----

+| identifier | epochs | datasets |
+| ---------- |:------:|:-----------------------------------------------------------------------------------------------:|
+| [sayef/fsner-bert-base-uncased](https://huggingface.co/sayef/fsner-bert-base-uncased) | 25 | ontonotes5, conll2003, wnut2017, mit_movie_trivia, mit_restaurant and fin (Alvarado et al.). |

## Installation and Example Usage
------

+You can use the FSNER model in 3 ways:
+
+1. Install directly from PyPI: `pip install fsner` and import the model as shown in the code example below
+
+   or
+
+2. Install from source: `python setup.py install` and import the model as shown in the code example below
+
+   or
+
+3. Clone the [repo](https://github.com/sayef/fsner), add the absolute path of the `fsner/src` directory to your PYTHONPATH, and import the model as shown in the code example below (a small path sketch follows this list)
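If option 3 is used, the import only resolves once the clone is on the module search path. A minimal sketch of doing that from inside Python (equivalent to setting PYTHONPATH; the clone location below is an assumed placeholder, not part of the repo):

```python
import sys

# Assumed location of the cloned repository; replace with your own absolute path.
sys.path.insert(0, "/absolute/path/to/fsner/src")

from fsner import FSNERModel, FSNERTokenizerUtils  # should now resolve from the clone
```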
+```python
+import json
+
+from fsner import FSNERModel, FSNERTokenizerUtils, pretty_embed
+
+query_texts = [
+    "Does Luke's serve lunch?",
+    "Chang does not speak Taiwanese very well.",
+    "I like Berlin."
+]
+
+# Each list in supports contains the examples of one entity type.
+# Wrap entities with [E] and [/E] in the examples.
+# Each sentence should have only one pair of [E] ... [/E].
+
+support_texts = {
+    "Restaurant": [
+        "What time does [E] Subway [/E] open for breakfast?",
+        "Is there a [E] China Garden [/E] restaurant in newark?",
+        "Does [E] Le Cirque [/E] have valet parking?",
+        "Is there a [E] McDonalds [/E] on main street?",
+        "Does [E] Mike's Diner [/E] offer huge portions and outdoor dining?"
+    ],
+    "Language": [
+        "Although I understood no [E] French [/E] in those days , I was prepared to spend the whole day with Chien - chien .",
+        "like what the hell 's that called in [E] English [/E] ? I have to register to be here like since I 'm a foreigner .",
+        "So , I 'm also working on an [E] English [/E] degree because that 's my real interest .",
+        "Al - Jazeera TV station , established in November 1996 in Qatar , is an [E] Arabic - language [/E] news TV station broadcasting global news and reports nonstop around the clock .",
+        "They think it 's far better for their children to be here improving their [E] English [/E] than sitting at home in front of a TV . \"",
+        "The only solution seemed to be to have her learn [E] French [/E] .",
+        "I have to read sixty pages of [E] Russian [/E] today ."
+    ]
+}
+
+device = 'cpu'
+
+tokenizer = FSNERTokenizerUtils("checkpoints/model")
+queries = tokenizer.tokenize(query_texts).to(device)
+supports = tokenizer.tokenize(list(support_texts.values())).to(device)
+
+model = FSNERModel("checkpoints/model")
+model.to(device)
+
+p_starts, p_ends = model.predict(queries, supports)
+
+# One can prepare supports once and reuse them multiple times with different queries:
+# ------------------------------------------------------------------------------
+# start_token_embeddings, end_token_embeddings = model.prepare_supports(supports)
+# p_starts, p_ends = model.predict(queries, start_token_embeddings=start_token_embeddings,
+#                                  end_token_embeddings=end_token_embeddings)
+
+output = tokenizer.extract_entity_from_scores(query_texts, queries, p_starts, p_ends,
+                                              entity_keys=list(support_texts.keys()), thresh=0.50)
+
+print(json.dumps(output, indent=2))
+
+# install displacy for pretty_embed
+pretty_embed(query_texts, output, list(support_texts.keys()))
+```
+
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>displaCy</title>
+</head>
+<body style="font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; padding: 4rem 2rem; direction: ltr">
+<figure style="margin-bottom: 6rem">
+<div class="entities" style="line-height: 2.5; direction: ltr">
+
+    <div class="entities" style="line-height: 2.5; direction: ltr">Does
+        <mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+            Luke's
+            <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">Restaurant</span>
+        </mark>
+        serve lunch?</div>
+
+    <div class="entities" style="line-height: 2.5; direction: ltr">Chang does not speak
+        <mark class="entity" style="background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+            Taiwanese
+            <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">Language</span>
+        </mark>
+        very well.</div>
+
+    <div class="entities" style="line-height: 2.5; direction: ltr">I like Berlin.</div>
+</div>
+</figure>
+</body>
+</html>
+
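As the commented-out lines in the example above indicate, support embeddings can be computed once with `model.prepare_supports` and reused for every new batch of queries. A minimal sketch of that pattern, continuing the example above and reusing its names (the extra query texts are made up for illustration):

```python
# New queries, same supports as in the example above.
more_query_texts = [
    "Is there a vegan restaurant near the station?",
    "She answered in fluent Spanish."
]
more_queries = tokenizer.tokenize(more_query_texts).to(device)

# Compute support embeddings once ...
start_token_embeddings, end_token_embeddings = model.prepare_supports(supports)

# ... and reuse them for each query batch.
p_starts, p_ends = model.predict(more_queries,
                                 start_token_embeddings=start_token_embeddings,
                                 end_token_embeddings=end_token_embeddings)

output = tokenizer.extract_entity_from_scores(more_query_texts, more_queries, p_starts, p_ends,
                                              entity_keys=list(support_texts.keys()), thresh=0.50)
print(json.dumps(output, indent=2))
```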
+## Datasets preparation
+
+1. We need to convert the dataset into the following format. Let's say we have a dataset file `train.json` like the following (a conversion sketch is given after this list).
+
+```json
+{
+    "CARDINAL_NUMBER": [
+        "Washington , cloudy , [E] 2 [/E] to 6 degrees .",
+        "New Dehli , sunny , [E] 6 [/E] to 19 degrees .",
+        "Well this is number [E] two [/E] .",
+        "....."
+    ],
+    "LANGUAGE": [
+        "They do n't have the Quicken [E] Dutch [/E] version ?",
+        "they learned a lot of [E] German [/E] .",
+        "and then [E] Dutch [/E] it 's Mifrau",
+        "...."
+    ],
+    "MONEY": [
+        "Per capita personal income ranged from $ [E] 11,116 [/E] in Mississippi to $ 23,059 in Connecticut ... .",
+        "The trade surplus was [E] 582 million US dollars [/E] .",
+        "It settled with a loss of 4.95 cents at $ [E] 1.3210 [/E] a pound .",
+        "...."
+    ]
+}
+```
+
+2. The converted ontonotes5 dataset can be found here:
+    1. [train](https://gist.githubusercontent.com/sayef/46deaf7e6c6e1410b430ddc8aff9c557/raw/ea7ae2ae933bfc9c0daac1aa52a9dc093d5b36f4/ontonotes5.train.json)
+    2. [dev](https://gist.githubusercontent.com/sayef/46deaf7e6c6e1410b430ddc8aff9c557/raw/ea7ae2ae933bfc9c0daac1aa52a9dc093d5b36f4/ontonotes5.dev.json)
+
+3. Then one can use the `examples/train.py` script to train/evaluate your fsner model.
+
+```bash
+python train.py --pretrained-model bert-base-uncased --mode train --train-data train.json --val-data val.json \
+                --train-batch-size 6 --val-batch-size 6 --n-examples-per-entity 10 --neg-example-batch-ratio 1/3 --max-epochs 25 --device gpu \
+                --gpus -1 --strategy ddp
```
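The commit does not include a conversion script for step 1, so here is a minimal sketch of producing the `train.json` format above from token/BIO-tagged sentences; `bio_to_fsner` and the sample tags are illustrative assumptions, not part of fsner or of this commit:

```python
import json
from collections import defaultdict

def bio_to_fsner(sentences):
    """Convert (tokens, BIO tags) pairs into {entity_type: ["... [E] span [/E] ..."]}.

    A sentence with several entities is emitted once per entity, so every
    example keeps exactly one [E] ... [/E] pair, as required above.
    """
    dataset = defaultdict(list)
    for tokens, tags in sentences:
        spans = []  # each span: [entity_type, start, end_exclusive]
        for i, tag in enumerate(tags):
            if tag.startswith("B-"):
                spans.append([tag[2:], i, i + 1])
            elif tag.startswith("I-") and spans and spans[-1][0] == tag[2:] and spans[-1][2] == i:
                spans[-1][2] = i + 1
        for ent_type, start, end in spans:
            marked = tokens[:start] + ["[E]"] + tokens[start:end] + ["[/E]"] + tokens[end:]
            dataset[ent_type].append(" ".join(marked))
    return dict(dataset)

if __name__ == "__main__":
    # Tiny made-up example in the spirit of the ontonotes5 samples above.
    sentences = [
        (["Washington", ",", "cloudy", ",", "2", "to", "6", "degrees", "."],
         ["O", "O", "O", "O", "B-CARDINAL_NUMBER", "O", "B-CARDINAL_NUMBER", "O", "O"]),
    ]
    with open("train.json", "w") as f:
        json.dump(bio_to_fsner(sentences), f, indent=2)
```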
added_tokens.json
CHANGED
@@ -1 +1 @@
-{"[
+{"[E]": 30522, "[/E]": 30523}
config.json
CHANGED
@@ -1,9 +1,10 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "checkpoints/model4",
   "architectures": [
     "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
@@ -17,7 +18,8 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
-  "
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30524
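For context on these values: `bert-base-uncased` ships with a vocabulary of 30522 tokens, and the two `[E]`/`[/E]` markers in `added_tokens.json` account for the `vocab_size` of 30524. A minimal sketch of how such markers are typically registered with the `transformers` API (illustrative only, not necessarily the exact code used to produce this checkpoint):

```python
from transformers import BertModel, BertTokenizer

# Start from the base checkpoint and register the two entity markers.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

num_added = tokenizer.add_tokens(["[E]", "[/E]"], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))

print(num_added, len(tokenizer))  # 2 30524
```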
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c2a2401a91d2bf80826341c52a0c1f8b6814f36c1b7852d4c93482a13041260f
+size 438017329
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "
+{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "checkpoints/model4", "tokenizer_class": "BertTokenizer"}