Spaces:
Runtime error
Runtime error
add train
Browse files
source/services/ner/awscomprehend_2_ner_format.py
CHANGED
@@ -83,7 +83,8 @@ if __name__ == '__main__':
|
|
83 |
for file in Path(r'data/raw_data/annotations/').glob('**/*'):
|
84 |
comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
|
85 |
dataset_lst.append(comprehend2NERFormat())
|
86 |
-
|
|
|
87 |
with open('data/ner_input_data/ner_dataset.json', 'w') as f:
|
88 |
json.dump(dataset_lst, f)
|
89 |
|
|
|
83 |
for file in Path(r'data/raw_data/annotations/').glob('**/*'):
|
84 |
comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
|
85 |
dataset_lst.append(comprehend2NERFormat())
|
86 |
+
print(" ".join(dataset_lst[0]['tokens']))
|
87 |
+
|
88 |
with open('data/ner_input_data/ner_dataset.json', 'w') as f:
|
89 |
json.dump(dataset_lst, f)
|
90 |
|
source/services/ner/train/train.py
CHANGED
@@ -1,33 +1,37 @@
|
|
1 |
-
|
2 |
-
|
3 |
|
4 |
-
|
|
|
5 |
|
6 |
-
|
7 |
-
https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section2_pt.ipynb
|
8 |
|
9 |
-
|
|
|
10 |
|
11 |
-
Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
|
12 |
"""
|
|
|
13 |
|
14 |
-
|
15 |
-
!pip install accelerate
|
16 |
-
|
17 |
-
"""You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."""
|
18 |
-
|
19 |
-
from huggingface_hub import notebook_login
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
|
24 |
|
25 |
-
|
26 |
|
27 |
-
raw_datasets
|
|
|
28 |
|
29 |
raw_datasets["train"][0]["tokens"]
|
30 |
-
|
31 |
raw_datasets["train"][0]["ner_tags"]
|
32 |
|
33 |
ner_feature = raw_datasets["train"].features["ner_tags"]
|
@@ -117,7 +121,6 @@ batch["labels"]
|
|
117 |
for i in range(2):
|
118 |
print(tokenized_datasets["train"][i]["labels"])
|
119 |
|
120 |
-
!pip install seqeval
|
121 |
|
122 |
import evaluate
|
123 |
|
@@ -165,14 +168,12 @@ model = AutoModelForTokenClassification.from_pretrained(
|
|
165 |
|
166 |
model.config.num_labels
|
167 |
|
168 |
-
from huggingface_hub import notebook_login
|
169 |
|
170 |
-
notebook_login()
|
171 |
|
172 |
from transformers import TrainingArguments
|
173 |
|
174 |
args = TrainingArguments(
|
175 |
-
"bert-finetuned-ner",
|
176 |
evaluation_strategy="epoch",
|
177 |
save_strategy="epoch",
|
178 |
learning_rate=2e-5,
|
|
|
1 |
+
"""
|
2 |
+
https://github.com/huggingface/transformers/tree/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification
|
3 |
|
4 |
+
Run this command in a terminal to log in to the Hugging Face Hub:
|
5 |
+
huggingface-cli login
|
6 |
|
7 |
+
instead of calling the following from Python:
|
|
|
8 |
|
9 |
+
from huggingface_hub import notebook_login
|
10 |
+
notebook_login()
|
11 |
|
|
|
12 |
"""
|
13 |
+
import datasets
|
14 |
|
15 |
+
dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
+
# Convert the "ner_tags" column from lists of strings to a Sequence of ClassLabel values, the format Hugging Face expects for the target variable. See https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
|
18 |
+
def get_label_list(labels):
|
19 |
+
# copied from https://github.com/huggingface/transformers/blob/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification/run_ner.py#L320
|
20 |
+
unique_labels = set()
|
21 |
+
for label in labels:
|
22 |
+
unique_labels = unique_labels | set(label)
|
23 |
+
label_list = list(unique_labels)
|
24 |
+
label_list.sort()
|
25 |
+
return label_list
|
26 |
|
27 |
+
all_labels = get_label_list(dataset['train']["ner_tags"])
|
28 |
|
29 |
+
dataset = dataset.cast_column("ner_tags", datasets.Sequence(datasets.ClassLabel(names=all_labels)))
|
30 |
|
31 |
+
raw_datasets = dataset["train"].train_test_split(train_size=0.8, seed=20)
|
32 |
+
raw_datasets["validation"] = raw_datasets.pop("test")
|
33 |
|
34 |
raw_datasets["train"][0]["tokens"]
|
|
|
35 |
raw_datasets["train"][0]["ner_tags"]
|
36 |
|
37 |
ner_feature = raw_datasets["train"].features["ner_tags"]
|
|
|
121 |
for i in range(2):
|
122 |
print(tokenized_datasets["train"][i]["labels"])
|
123 |
|
|
|
124 |
|
125 |
import evaluate
|
126 |
|
|
|
168 |
|
169 |
model.config.num_labels
|
170 |
|
|
|
171 |
|
|
|
172 |
|
173 |
from transformers import TrainingArguments
|
174 |
|
175 |
args = TrainingArguments(
|
176 |
+
"bert-finetuned-legalentity-ner",
|
177 |
evaluation_strategy="epoch",
|
178 |
save_strategy="epoch",
|
179 |
learning_rate=2e-5,
|