import os
import shutil
import argparse
import json
import subprocess

from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from huggingface_hub import HfApi, Repository

from configuration_extended_multitask import ImpressoConfig
from models import ExtendedMultitaskModelForTokenClassification


def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the checkpoint folder with the highest step number."""
    checkpoints = [
        d
        for d in os.listdir(checkpoint_dir)
        if os.path.isdir(os.path.join(checkpoint_dir, d))
        and d.startswith("checkpoint-")
    ]
    checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]), reverse=True)
    return os.path.join(checkpoint_dir, checkpoints[0])


def get_info(label_map):
    """Return the number of token-level labels per task in the label map."""
    num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
    return num_token_labels_dict


def push_model_to_hub(checkpoint_dir, repo_name, script_path):
    checkpoint_path = get_latest_checkpoint(checkpoint_dir)
    with open(os.path.join(checkpoint_dir, "label_map.json"), "r") as f:
        label_map = json.load(f)
    num_token_labels_dict = get_info(label_map)

    # Rebuild the custom config: attach the underlying pretrained config and
    # round-trip it through save/load so the serialized form is what gets pushed
    config = ImpressoConfig.from_pretrained(checkpoint_path)
    config.pretrained_config = AutoConfig.from_pretrained(config.name_or_path)
    config.save_pretrained("stacked_bert")
    config = ImpressoConfig.from_pretrained("stacked_bert")

    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
        checkpoint_path, config=config, num_token_labels_dict=num_token_labels_dict
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

    # Create (or reuse) the Hub repo and clone it locally
    local_repo_path = "./repo"
    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
    repo = Repository(local_dir=local_repo_path, clone_from=repo_url)

    try:
        # Try to pull the latest changes from the remote repository
        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
    except subprocess.CalledProcessError:
        # If fast-forward is not possible, reset the local branch to match the remote branch
        subprocess.run(
            ["git", "reset", "--hard", "origin/main"],
            check=True,
            cwd=local_repo_path,
        )

    # Copy all Python files to the local repository directory so the custom
    # model and configuration code is shipped alongside the weights
    current_dir = os.path.dirname(os.path.abspath(__file__))
    for filename in os.listdir(current_dir):
        if filename.endswith(".py"):
            shutil.copy(
                os.path.join(current_dir, filename),
                os.path.join(local_repo_path, filename),
            )

    # Register the custom config and model so they are resolvable through the
    # Auto* classes (and recorded in the auto_map of the pushed config)
    ImpressoConfig.register_for_auto_class()
    AutoConfig.register("stacked_bert", ImpressoConfig)
    AutoModelForTokenClassification.register(
        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
    )
    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
        "AutoModelForTokenClassification"
    )

    model.save_pretrained(local_repo_path)
    tokenizer.save_pretrained(local_repo_path)

    # Add, commit and push the changes to the repository
    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
    subprocess.run(
        ["git", "commit", "-m", "Initial commit including model and configuration"],
        check=True,
        cwd=local_repo_path,
    )
    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)

    # Push the model to the hub (this includes the README template)
    model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    print(f"Model and repo pushed to: {repo_url}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        help="Type of the model (e.g., stacked-bert)",
    )
    parser.add_argument(
        "--language",
        type=str,
        required=True,
        help="Language of the model (e.g., multilingual)",
    )
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        required=True,
        help="Directory containing checkpoint folders",
    )
    parser.add_argument(
        "--script_path", type=str, required=True, help="Path to the models.py script"
    )
    args = parser.parse_args()

    repo_name = f"impresso-project/ner-{args.model_type}-{args.language}"
    push_model_to_hub(args.checkpoint_dir, repo_name, args.script_path)
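
# Example invocation (the script name and local paths below are placeholders):
#   python push_model_to_hub.py \
#       --model_type stacked-bert \
#       --language multilingual \
#       --checkpoint_dir ./experiments \
#       --script_path ./models.py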

# PIPELINE_REGISTRY.register_pipeline(
#     "generic-ner",
#     pipeline_class=MultitaskTokenClassificationPipeline,
#     pt_model=ExtendedMultitaskModelForTokenClassification,
# )
# model.config.custom_pipelines = {
#     "generic-ner": {
#         "impl": "generic_ner.MultitaskTokenClassificationPipeline",
#         "pt": ["ExtendedMultitaskModelForTokenClassification"],
#         "tf": [],
#     }
# }
# classifier = pipeline(
#     "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
# )
# from pprint import pprint
#
# pprint(
#     classifier(
#         "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
#     )
# )
# repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")