aimlnerd commited on
Commit
a2d1297
1 Parent(s): 5348cff
source/services/ner/awscomprehend_2_ner_format.py CHANGED
@@ -83,7 +83,8 @@ if __name__ == '__main__':
83
  for file in Path(r'data/raw_data/annotations/').glob('**/*'):
84
  comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
85
  dataset_lst.append(comprehend2NERFormat())
86
-
 
87
  with open('data/ner_input_data/ner_dataset.json', 'w') as f:
88
  json.dump(dataset_lst, f)
89
 
 
83
  for file in Path(r'data/raw_data/annotations/').glob('**/*'):
84
  comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
85
  dataset_lst.append(comprehend2NERFormat())
86
+ print(" ".join(dataset_lst[0]['tokens']))
87
+
88
  with open('data/ner_input_data/ner_dataset.json', 'w') as f:
89
  json.dump(dataset_lst, f)
90
 
source/services/ner/train/train.py CHANGED
@@ -1,33 +1,37 @@
1
- # -*- coding: utf-8 -*-
2
- """Token classification (PyTorch)
3
 
4
- Automatically generated by Colaboratory.
 
5
 
6
- Original file is located at
7
- https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section2_pt.ipynb
8
 
9
- # Token classification (PyTorch)
 
10
 
11
- Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
12
  """
 
13
 
14
- !pip install datasets evaluate transformers[sentencepiece]
15
- !pip install accelerate
16
-
17
- """You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."""
18
-
19
- from huggingface_hub import notebook_login
20
 
21
- notebook_login()
 
 
 
 
 
 
 
 
22
 
23
- from datasets import load_dataset
24
 
25
- raw_datasets = load_dataset("conll2003")
26
 
27
- raw_datasets
 
28
 
29
  raw_datasets["train"][0]["tokens"]
30
-
31
  raw_datasets["train"][0]["ner_tags"]
32
 
33
  ner_feature = raw_datasets["train"].features["ner_tags"]
@@ -117,7 +121,6 @@ batch["labels"]
117
  for i in range(2):
118
  print(tokenized_datasets["train"][i]["labels"])
119
 
120
- !pip install seqeval
121
 
122
  import evaluate
123
 
@@ -165,14 +168,12 @@ model = AutoModelForTokenClassification.from_pretrained(
165
 
166
  model.config.num_labels
167
 
168
- from huggingface_hub import notebook_login
169
 
170
- notebook_login()
171
 
172
  from transformers import TrainingArguments
173
 
174
  args = TrainingArguments(
175
- "bert-finetuned-ner",
176
  evaluation_strategy="epoch",
177
  save_strategy="epoch",
178
  learning_rate=2e-5,
 
1
+ """
2
+ https://github.com/huggingface/transformers/tree/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification
3
 
4
+ run this command in terminal to login to huggingface hub
5
+ huggingface-cli login
6
 
7
+ instead of
 
8
 
9
+ from huggingface_hub import notebook_login
10
+ notebook_login()
11
 
 
12
  """
13
+ import datasets
14
 
15
+ dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
 
 
 
 
 
16
 
17
+ # Convert ner_tag list of string to sequence of classlabels as expected by hugging face for target var https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
18
def get_label_list(labels):
    """Return the sorted list of unique NER tags across all examples.

    Adapted from the Hugging Face ``run_ner.py`` token-classification
    example.  Sorting makes the tag -> id mapping deterministic across
    runs, which matters when the result feeds a ``ClassLabel`` feature.

    Parameters
    ----------
    labels : iterable of iterables of str
        Per-example tag sequences (e.g. the ``ner_tags`` column of the
        dataset).

    Returns
    -------
    list of str
        All distinct tags, sorted ascending.
    """
    unique_labels = set()
    for label in labels:
        # update() mutates in place; the original rebuilt the whole set
        # each iteration via `unique_labels | set(label)`.
        unique_labels.update(label)
    return sorted(unique_labels)
26
 
27
+ all_labels = get_label_list(dataset['train']["ner_tags"])
28
 
29
+ dataset = dataset.cast_column("ner_tags", datasets.Sequence(datasets.ClassLabel(names=all_labels)))
30
 
31
+ raw_datasets = dataset["train"].train_test_split(train_size=0.8, seed=20)
32
+ raw_datasets["validation"] = raw_datasets.pop("test")
33
 
34
  raw_datasets["train"][0]["tokens"]
 
35
  raw_datasets["train"][0]["ner_tags"]
36
 
37
  ner_feature = raw_datasets["train"].features["ner_tags"]
 
121
  for i in range(2):
122
  print(tokenized_datasets["train"][i]["labels"])
123
 
 
124
 
125
  import evaluate
126
 
 
168
 
169
  model.config.num_labels
170
 
 
171
 
 
172
 
173
  from transformers import TrainingArguments
174
 
175
  args = TrainingArguments(
176
+ "bert-finetuned-legalentity-ner",
177
  evaluation_strategy="epoch",
178
  save_strategy="epoch",
179
  learning_rate=2e-5,