emanuelaboros committed on
Commit
13ca8b7
·
1 Parent(s): caf5c55

Initial commit including model and configuration

Browse files
Files changed (3) hide show
  1. __init__.py +0 -0
  2. generic_ner.py +0 -4
  3. push_to_hf.py +142 -0
__init__.py ADDED
File without changes
generic_ner.py CHANGED
@@ -7,10 +7,6 @@ from nltk.tree import Tree
7
  import string
8
  import torch.nn.functional as F
9
  import re
10
- from models import ExtendedMultitaskModelForTokenClassification
11
-
12
- # Register the custom pipeline
13
- from transformers import pipeline
14
 
15
 
16
  def tokenize(text):
 
7
  import string
8
  import torch.nn.functional as F
9
  import re
 
 
 
 
10
 
11
 
12
  def tokenize(text):
push_to_hf.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import argparse
4
+ from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
5
+ from huggingface_hub import HfApi, Repository
6
+ import json
7
+ from configuration_extended_multitask import ImpressoConfig
8
+ from models import ExtendedMultitaskModelForTokenClassification
9
+ import subprocess
10
+
11
+
12
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Only sub-directories of ``checkpoint_dir`` whose name starts with
    ``"checkpoint-"`` are considered. The step is the integer found after the
    last ``-`` in the name; directories whose suffix is not an integer
    (e.g. ``checkpoint-best``) are skipped instead of aborting the lookup.

    Args:
        checkpoint_dir: Directory that contains the checkpoint folders.

    Returns:
        ``os.path.join(checkpoint_dir, <name of the latest checkpoint>)``.

    Raises:
        FileNotFoundError: If no numbered checkpoint directory exists.
    """
    latest_name = None
    latest_step = None
    for name in os.listdir(checkpoint_dir):
        if not name.startswith("checkpoint-"):
            continue
        if not os.path.isdir(os.path.join(checkpoint_dir, name)):
            continue
        try:
            step = int(name.split("-")[-1])
        except ValueError:
            # Not a numbered checkpoint; it cannot be ranked, so ignore it.
            continue
        # Strict comparison keeps the first-listed directory on ties.
        if latest_step is None or step > latest_step:
            latest_name, latest_step = name, step

    if latest_name is None:
        raise FileNotFoundError(
            f"No 'checkpoint-<step>' directory found in {checkpoint_dir!r}"
        )
    return os.path.join(checkpoint_dir, latest_name)
21
+
22
+
23
def get_info(label_map):
    """Map every task in ``label_map`` to the number of labels it defines.

    Args:
        label_map: Mapping of task name to a sized collection of labels.

    Returns:
        A new dict ``{task: len(labels)}`` in the same key order.
    """
    label_counts = {}
    for task_name, task_labels in label_map.items():
        label_counts[task_name] = len(task_labels)
    return label_counts
26
+
27
+
28
def push_model_to_hub(checkpoint_dir, repo_name, script_path):
    """Publish the latest checkpoint in ``checkpoint_dir`` to the Hub repo ``repo_name``.

    The function, in order:
      1. picks the latest checkpoint folder and reads ``label_map.json`` from
         ``checkpoint_dir``;
      2. loads the config, attaches the base model config, and round-trips it
         through the local ``stacked_bert`` directory;
      3. loads the model and tokenizer from the checkpoint;
      4. creates (or reuses) the Hub repo and clones it into ``./repo``;
      5. copies every ``.py`` file located next to this script into the clone;
      6. registers the custom config/model classes with the Auto* classes;
      7. saves model and tokenizer into the clone, then commits and pushes;
      8. calls ``push_to_hub`` on the model and the tokenizer.

    Args:
        checkpoint_dir: Directory with ``checkpoint-*`` folders and
            ``label_map.json``.
        repo_name: Hub repository id passed to ``create_repo`` / ``push_to_hub``.
        script_path: Accepted for CLI compatibility; not used by this function.

    Raises:
        subprocess.CalledProcessError: If ``git reset``, ``git add``,
            ``git commit`` or ``git push`` exits with a non-zero status.
    """
    checkpoint_path = get_latest_checkpoint(checkpoint_dir)

    # Context manager so the label-map file handle is closed deterministically.
    with open(os.path.join(checkpoint_dir, "label_map.json"), "r") as label_file:
        label_map = json.load(label_file)
    num_token_labels_dict = get_info(label_map)

    config = ImpressoConfig.from_pretrained(checkpoint_path)
    config.pretrained_config = AutoConfig.from_pretrained(config.name_or_path)
    # The config is written to ./stacked_bert and reloaded from there.
    # NOTE(review): presumably this normalises the nested pretrained_config
    # through serialisation — confirm the round trip is still required.
    config.save_pretrained("stacked_bert")

    config = ImpressoConfig.from_pretrained("stacked_bert")

    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
        checkpoint_path, config=config, num_token_labels_dict=num_token_labels_dict
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

    local_repo_path = "./repo"
    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
    # Instantiated only for its effect of cloning repo_url into local_repo_path;
    # the returned object is not used afterwards.
    Repository(local_dir=local_repo_path, clone_from=repo_url)

    try:
        # Try to pull the latest changes from the remote repository.
        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
    except subprocess.CalledProcessError:
        # The pull failed (e.g. fast-forward not possible): discard local
        # state and make the working tree match the remote main branch.
        subprocess.run(
            ["git", "reset", "--hard", "origin/main"],
            check=True,
            cwd=local_repo_path,
        )

    # Copy all Python files that live next to this script into the clone.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    for filename in os.listdir(current_dir):
        if filename.endswith(".py"):
            shutil.copy(
                os.path.join(current_dir, filename),
                os.path.join(local_repo_path, filename),
            )

    # Register the custom classes with the Auto* factories before saving.
    # NOTE(review): presumably this must precede save_pretrained so the saved
    # config references the custom code — confirm against transformers docs.
    ImpressoConfig.register_for_auto_class()
    AutoConfig.register("stacked_bert", ImpressoConfig)
    AutoModelForTokenClassification.register(
        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
    )
    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
        "AutoModelForTokenClassification"
    )

    model.save_pretrained(local_repo_path)
    tokenizer.save_pretrained(local_repo_path)

    # Add, commit and push the changes to the repository.
    # NOTE(review): `git commit` exits non-zero when there is nothing to
    # commit, so re-running with no changes raises CalledProcessError here.
    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
    subprocess.run(
        ["git", "commit", "-m", "Initial commit including model and configuration"],
        check=True,
        cwd=local_repo_path,
    )
    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)

    # Push the model to the hub (this includes the README template)
    model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    print(f"Model and repo pushed to: {repo_url}")
92
+
93
+
94
if __name__ == "__main__":
    # Command-line entry point: every option is a required string.
    parser = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
    required_options = (
        ("--model_type", "Type of the model (e.g., stacked-bert)"),
        ("--language", "Language of the model (e.g., multilingual)"),
        ("--checkpoint_dir", "Directory containing checkpoint folders"),
        ("--script_path", "Path to the models.py script"),
    )
    for option_flag, option_help in required_options:
        parser.add_argument(option_flag, type=str, required=True, help=option_help)
    args = parser.parse_args()

    # The target repository id is derived from the model type and language.
    repo_name = f"impresso-project/ner-{args.model_type}-{args.language}"
    push_model_to_hub(args.checkpoint_dir, repo_name, args.script_path)

    # NOTE: disabled pipeline-registration experiment, kept for reference.
    # It would not run here as written: model, tokenizer, label_map and repo
    # are not defined in this scope.
    # PIPELINE_REGISTRY.register_pipeline(
    #     "generic-ner",
    #     pipeline_class=MultitaskTokenClassificationPipeline,
    #     pt_model=ExtendedMultitaskModelForTokenClassification,
    # )
    # model.config.custom_pipelines = {
    #     "generic-ner": {
    #         "impl": "generic_ner.MultitaskTokenClassificationPipeline",
    #         "pt": ["ExtendedMultitaskModelForTokenClassification"],
    #         "tf": [],
    #     }
    # }
    # classifier = pipeline(
    #     "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
    # )
    # from pprint import pprint
    #
    # pprint(
    #     classifier(
    #         "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
    #     )
    # )
    # repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")