aimlnerd commited on
Commit
a2d1297
1 Parent(s): 5348cff
source/services/ner/awscomprehend_2_ner_format.py CHANGED
@@ -83,7 +83,8 @@ if __name__ == '__main__':
83
  for file in Path(r'data/raw_data/annotations/').glob('**/*'):
84
  comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
85
  dataset_lst.append(comprehend2NERFormat())
86
-
 
87
  with open('data/ner_input_data/ner_dataset.json', 'w') as f:
88
  json.dump(dataset_lst, f)
89
 
 
83
  for file in Path(r'data/raw_data/annotations/').glob('**/*'):
84
  comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
85
  dataset_lst.append(comprehend2NERFormat())
86
+ print(" ".join(dataset_lst[0]['tokens']))
87
+
88
  with open('data/ner_input_data/ner_dataset.json', 'w') as f:
89
  json.dump(dataset_lst, f)
90
 
source/services/ner/train/train.py CHANGED
@@ -1,33 +1,37 @@
1
- # -*- coding: utf-8 -*-
2
- """Token classification (PyTorch)
3
 
4
- Automatically generated by Colaboratory.
 
5
 
6
- Original file is located at
7
- https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section2_pt.ipynb
8
 
9
- # Token classification (PyTorch)
 
10
 
11
- Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
12
  """
 
13
 
14
- !pip install datasets evaluate transformers[sentencepiece]
15
- !pip install accelerate
16
-
17
- """You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."""
18
-
19
- from huggingface_hub import notebook_login
20
 
21
- notebook_login()
 
 
 
 
 
 
 
 
22
 
23
- from datasets import load_dataset
24
 
25
- raw_datasets = load_dataset("conll2003")
26
 
27
- raw_datasets
 
28
 
29
  raw_datasets["train"][0]["tokens"]
30
-
31
  raw_datasets["train"][0]["ner_tags"]
32
 
33
  ner_feature = raw_datasets["train"].features["ner_tags"]
@@ -117,7 +121,6 @@ batch["labels"]
117
  for i in range(2):
118
  print(tokenized_datasets["train"][i]["labels"])
119
 
120
- !pip install seqeval
121
 
122
  import evaluate
123
 
@@ -165,14 +168,12 @@ model = AutoModelForTokenClassification.from_pretrained(
165
 
166
  model.config.num_labels
167
 
168
- from huggingface_hub import notebook_login
169
 
170
- notebook_login()
171
 
172
  from transformers import TrainingArguments
173
 
174
  args = TrainingArguments(
175
- "bert-finetuned-ner",
176
  evaluation_strategy="epoch",
177
  save_strategy="epoch",
178
  learning_rate=2e-5,
 
1
+ """
2
+ https://github.com/huggingface/transformers/tree/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification
3
 
4
+ run this command in terminal to login to huggingface hub
5
+ huggingface-cli login
6
 
7
+ instead of
 
8
 
9
+ from huggingface_hub import notebook_login
10
+ notebook_login()
11
 
 
12
  """
13
+ import datasets
14
 
15
+ dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
 
 
 
 
 
16
 
17
+ # Convert ner_tag list of string to sequence of classlabels as expected by hugging face for target var https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
18
def get_label_list(labels):
    """Return the sorted list of unique NER tags across all examples.

    Adapted from the Hugging Face ``run_ner.py`` token-classification
    example.  Sorting makes the tag -> id mapping deterministic across
    runs, which matters when the result feeds a ``ClassLabel`` feature.

    Parameters
    ----------
    labels : iterable of iterables of str
        Per-example tag sequences (e.g. the ``ner_tags`` column of the
        dataset).

    Returns
    -------
    list of str
        All distinct tags, sorted ascending.
    """
    unique_labels = set()
    for label in labels:
        # update() mutates in place; the original rebuilt the whole set
        # each iteration via `unique_labels | set(label)`.
        unique_labels.update(label)
    return sorted(unique_labels)
26
 
27
+ all_labels = get_label_list(dataset['train']["ner_tags"])
28
 
29
+ dataset = dataset.cast_column("ner_tags", datasets.Sequence(datasets.ClassLabel(names=all_labels)))
30
 
31
+ raw_datasets = dataset["train"].train_test_split(train_size=0.8, seed=20)
32
+ raw_datasets["validation"] = raw_datasets.pop("test")
33
 
34
  raw_datasets["train"][0]["tokens"]
 
35
  raw_datasets["train"][0]["ner_tags"]
36
 
37
  ner_feature = raw_datasets["train"].features["ner_tags"]
 
121
  for i in range(2):
122
  print(tokenized_datasets["train"][i]["labels"])
123
 
 
124
 
125
  import evaluate
126
 
 
168
 
169
  model.config.num_labels
170
 
 
171
 
 
172
 
173
  from transformers import TrainingArguments
174
 
175
  args = TrainingArguments(
176
+ "bert-finetuned-legalentity-ner",
177
  evaluation_strategy="epoch",
178
  save_strategy="epoch",
179
  learning_rate=2e-5,