impresso-project
/

ner-stacked-bert-multilingual

@@ -7,10 +7,6 @@ from nltk.tree import Tree
 import string
 import torch.nn.functional as F
 import re
-from models import ExtendedMultitaskModelForTokenClassification
-# Register the custom pipeline
-from transformers import pipeline
 def tokenize(text):

 import string
 import torch.nn.functional as F
 import re
 def tokenize(text):

push_to_hf.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import os
+import shutil
+import argparse
+from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
+from huggingface_hub import HfApi, Repository
+import json
+from configuration_extended_multitask import ImpressoConfig
+from models import ExtendedMultitaskModelForTokenClassification
+import subprocess
+def get_latest_checkpoint(checkpoint_dir):
+    """Return the path of the most recent ``checkpoint-<step>`` subdirectory.
+
+    Args:
+        checkpoint_dir: Directory that contains the ``checkpoint-<step>``
+            folders.
+
+    Returns:
+        ``checkpoint_dir`` joined with the name of the subdirectory whose
+        numeric step suffix is the largest.
+
+    Raises:
+        FileNotFoundError: If ``checkpoint_dir`` holds no subdirectory named
+            ``checkpoint-<integer>``.
+    """
+    steps_by_name = {}
+    for name in os.listdir(checkpoint_dir):
+        if not name.startswith("checkpoint-"):
+            continue
+        if not os.path.isdir(os.path.join(checkpoint_dir, name)):
+            continue
+        suffix = name.split("-")[-1]
+        # Skip folders such as "checkpoint-best": their suffix is not a step
+        # number and would make int() raise.
+        if suffix.isdecimal():
+            steps_by_name[name] = int(suffix)
+    if not steps_by_name:
+        raise FileNotFoundError(
+            f"No 'checkpoint-<step>' directory found in {checkpoint_dir!r}"
+        )
+    latest = max(steps_by_name, key=steps_by_name.get)
+    return os.path.join(checkpoint_dir, latest)
+def get_info(label_map):
+    """Map each task name in ``label_map`` to the number of labels it has.
+
+    Args:
+        label_map: Mapping from task name to a sized collection of labels.
+
+    Returns:
+        A dict with the same keys as ``label_map`` whose values are the
+        ``len()`` of the corresponding label collection.
+    """
+    label_counts = {}
+    for task_name, task_labels in label_map.items():
+        label_counts[task_name] = len(task_labels)
+    return label_counts
+def push_model_to_hub(checkpoint_dir, repo_name, script_path):
+    """Publish the latest checkpoint, its tokenizer and the local code to the Hub.
+
+    The newest ``checkpoint-<step>`` folder under ``checkpoint_dir`` is loaded,
+    the Hub repository is cloned into ``./repo``, every ``.py`` file that sits
+    next to this script is copied into the clone, and the result is committed
+    and pushed with git before ``push_to_hub`` is called on the model and the
+    tokenizer.
+
+    Args:
+        checkpoint_dir: Directory holding the ``checkpoint-<step>`` folders and
+            ``label_map.json``.
+        repo_name: Hub repository id to create or update.
+        script_path: Accepted for command-line compatibility; not used here.
+
+    Raises:
+        subprocess.CalledProcessError: If a git command run with
+            ``check=True`` fails.
+    """
+    checkpoint_path = get_latest_checkpoint(checkpoint_dir)
+    # The label map is read from the run directory, not from the checkpoint
+    # folder itself. A context manager closes the handle deterministically.
+    label_map_path = os.path.join(checkpoint_dir, "label_map.json")
+    with open(label_map_path, "r", encoding="utf-8") as label_map_file:
+        label_map = json.load(label_map_file)
+    num_token_labels_dict = get_info(label_map)
+
+    # Attach the backbone configuration, then round-trip the config through
+    # disk ("stacked_bert" in the working directory) before loading the model.
+    config = ImpressoConfig.from_pretrained(checkpoint_path)
+    config.pretrained_config = AutoConfig.from_pretrained(config.name_or_path)
+    config.save_pretrained("stacked_bert")
+    config = ImpressoConfig.from_pretrained("stacked_bert")
+    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
+        checkpoint_path, config=config, num_token_labels_dict=num_token_labels_dict
+    )
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
+
+    local_repo_path = "./repo"
+    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
+    # NOTE(review): `Repository` appears to be deprecated in recent
+    # huggingface_hub releases in favour of the HTTP upload helpers - confirm
+    # against the installed version before upgrading.
+    repo = Repository(local_dir=local_repo_path, clone_from=repo_url)
+    try:
+        # Try to pull the latest changes from the remote repository using subprocess
+        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
+    except subprocess.CalledProcessError:
+        # If the pull fails (e.g. fast-forward is not possible), reset the
+        # local branch to match the remote branch. This discards local commits.
+        subprocess.run(
+            ["git", "reset", "--hard", "origin/main"],
+            check=True,
+            cwd=local_repo_path,
+        )
+
+    # Copy all Python files to the local repository directory
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    for filename in os.listdir(current_dir):
+        if filename.endswith(".py"):
+            shutil.copy(
+                os.path.join(current_dir, filename),
+                os.path.join(local_repo_path, filename),
+            )
+
+    # Register the custom classes with the Auto* factories before saving; the
+    # registration order is kept exactly as it was.
+    ImpressoConfig.register_for_auto_class()
+    AutoConfig.register("stacked_bert", ImpressoConfig)
+    AutoModelForTokenClassification.register(
+        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
+    )
+    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
+        "AutoModelForTokenClassification"
+    )
+    model.save_pretrained(local_repo_path)
+    tokenizer.save_pretrained(local_repo_path)
+
+    # Add, commit and push the changes to the repository
+    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
+    # `git commit` exits non-zero when nothing is staged (e.g. the script is
+    # re-run on an unchanged checkpoint), which used to abort the whole run.
+    # `git diff --cached --quiet` exits 0 only when the index matches HEAD; any
+    # other exit code falls through to the commit so real errors still surface.
+    staged_check = subprocess.run(
+        ["git", "diff", "--cached", "--quiet"], cwd=local_repo_path
+    )
+    if staged_check.returncode != 0:
+        subprocess.run(
+            ["git", "commit", "-m", "Initial commit including model and configuration"],
+            check=True,
+            cwd=local_repo_path,
+        )
+    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)
+    # Push the model to the hub (this includes the README template)
+    model.push_to_hub(repo_name)
+    tokenizer.push_to_hub(repo_name)
+    print(f"Model and repo pushed to: {repo_url}")
+if __name__ == "__main__":
+    # Command-line entry point: all four options are mandatory strings.
+    cli = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
+    required_options = (
+        ("--model_type", "Type of the model (e.g., stacked-bert)"),
+        ("--language", "Language of the model (e.g., multilingual)"),
+        ("--checkpoint_dir", "Directory containing checkpoint folders"),
+        ("--script_path", "Path to the models.py script"),
+    )
+    for flag, help_text in required_options:
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+    options = cli.parse_args()
+    # The target repository id is derived from the model type and language.
+    push_model_to_hub(
+        options.checkpoint_dir,
+        f"impresso-project/ner-{options.model_type}-{options.language}",
+        options.script_path,
+    )
+    # Disabled experiment, kept for reference: registering a custom
+    # "generic-ner" pipeline and running it on a sample sentence.
+    # PIPELINE_REGISTRY.register_pipeline(
+    #     "generic-ner",
+    #     pipeline_class=MultitaskTokenClassificationPipeline,
+    #     pt_model=ExtendedMultitaskModelForTokenClassification,
+    # )
+    # model.config.custom_pipelines = {
+    #     "generic-ner": {
+    #         "impl": "generic_ner.MultitaskTokenClassificationPipeline",
+    #         "pt": ["ExtendedMultitaskModelForTokenClassification"],
+    #         "tf": [],
+    #     }
+    # }
+    # classifier = pipeline(
+    #     "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
+    # )
+    # from pprint import pprint
+    #
+    # pprint(
+    #     classifier(
+    #         "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
+    #     )
+    # )
+    # repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")