File size: 5,556 Bytes
13ca8b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import shutil
import argparse
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from huggingface_hub import HfApi, Repository
import json
from configuration_extended_multitask import ImpressoConfig
from models import ExtendedMultitaskModelForTokenClassification
import subprocess


def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Only immediate sub-directories of ``checkpoint_dir`` whose name starts with
    ``checkpoint-`` and ends in an integer after the last ``-`` are considered.
    Entries with a non-numeric suffix (e.g. ``checkpoint-best``) are ignored
    instead of aborting the lookup.

    Args:
        checkpoint_dir: Directory that contains the checkpoint folders.

    Returns:
        ``os.path.join(checkpoint_dir, name)`` for the candidate with the
        largest integer step.

    Raises:
        FileNotFoundError: If no ``checkpoint-<integer>`` sub-directory exists
            (``os.listdir`` raises the same type when ``checkpoint_dir`` itself
            is missing).
    """
    step_by_name = {}
    for entry in os.listdir(checkpoint_dir):
        if not entry.startswith("checkpoint-"):
            continue
        if not os.path.isdir(os.path.join(checkpoint_dir, entry)):
            continue
        try:
            step_by_name[entry] = int(entry.split("-")[-1])
        except ValueError:
            # No numeric step to rank this folder by; skip it.
            continue

    if not step_by_name:
        raise FileNotFoundError(
            f"No 'checkpoint-<step>' directory found in {checkpoint_dir!r}"
        )

    # Only the maximum is needed, so avoid sorting the whole list.
    latest = max(step_by_name, key=step_by_name.get)
    return os.path.join(checkpoint_dir, latest)


def get_info(label_map):
    """Count how many labels each task defines.

    Args:
        label_map: Mapping from a task name to a sized collection of labels.

    Returns:
        A dict with the same keys as ``label_map`` whose values are
        ``len(labels)`` for the corresponding task.
    """
    label_counts = {}
    for task_name, task_labels in label_map.items():
        label_counts[task_name] = len(task_labels)
    return label_counts


def push_model_to_hub(checkpoint_dir, repo_name, script_path):
    """Publish the latest checkpoint, its tokenizer and the local code to the Hub.

    Steps, in order:
      1. Pick the newest ``checkpoint-<step>`` folder in ``checkpoint_dir`` and
         read ``label_map.json`` from ``checkpoint_dir`` itself.
      2. Rebuild the ``ImpressoConfig`` (round-tripped through the local
         ``stacked_bert`` directory) and load the model and tokenizer.
      3. Create the Hub repo if needed and sync a working copy in ``./repo``.
      4. Copy every ``*.py`` file that sits next to this script into the
         working copy, register the custom classes with the Auto* factories,
         and save model + tokenizer there.
      5. ``git add`` / ``commit`` / ``push`` the working copy, then call
         ``push_to_hub`` on the model and tokenizer.

    Args:
        checkpoint_dir: Directory containing ``label_map.json`` and the
            ``checkpoint-<step>`` folders.
        repo_name: Hub repository id, e.g. ``"org/model-name"``.
        script_path: Currently unused; kept so existing callers and the
            command-line interface keep working.

    Raises:
        subprocess.CalledProcessError: If one of the checked git commands
            fails.
    """
    checkpoint_path = get_latest_checkpoint(checkpoint_dir)

    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(checkpoint_dir, "label_map.json"), "r") as label_file:
        label_map = json.load(label_file)
    num_token_labels_dict = get_info(label_map)

    config = ImpressoConfig.from_pretrained(checkpoint_path)
    config.pretrained_config = AutoConfig.from_pretrained(config.name_or_path)
    # Round-trip through disk so the reloaded config carries pretrained_config.
    config.save_pretrained("stacked_bert")
    config = ImpressoConfig.from_pretrained("stacked_bert")

    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
        checkpoint_path, config=config, num_token_labels_dict=num_token_labels_dict
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

    local_repo_path = "./repo"
    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
    # Instantiated only for its side effect on local_repo_path (presumably it
    # clones repo_url there -- confirm against the huggingface_hub docs); the
    # returned object itself is not needed below.
    Repository(local_dir=local_repo_path, clone_from=repo_url)

    try:
        # Try to pull the latest changes from the remote repository
        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
    except subprocess.CalledProcessError:
        # If the pull fails (e.g. fast-forward is not possible), reset the
        # local branch to match the remote branch
        subprocess.run(
            ["git", "reset", "--hard", "origin/main"],
            check=True,
            cwd=local_repo_path,
        )

    # Copy all Python files to the local repository directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    for filename in os.listdir(current_dir):
        if filename.endswith(".py"):
            shutil.copy(
                os.path.join(current_dir, filename),
                os.path.join(local_repo_path, filename),
            )

    ImpressoConfig.register_for_auto_class()
    AutoConfig.register("stacked_bert", ImpressoConfig)
    AutoModelForTokenClassification.register(
        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
    )
    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
        "AutoModelForTokenClassification"
    )

    model.save_pretrained(local_repo_path)
    tokenizer.save_pretrained(local_repo_path)

    # Add, commit and push the changes to the repository
    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
    # `git commit` exits non-zero when nothing is staged, which would abort a
    # re-run that produced identical files. `git diff --cached --quiet` exits
    # with 1 when there are staged changes and 0 when there are none.
    staged = subprocess.run(
        ["git", "diff", "--cached", "--quiet"], cwd=local_repo_path
    )
    if staged.returncode == 1:
        subprocess.run(
            ["git", "commit", "-m", "Initial commit including model and configuration"],
            check=True,
            cwd=local_repo_path,
        )
    elif staged.returncode != 0:
        # Any other exit status is a genuine git failure; surface it.
        staged.check_returncode()
    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)

    # Push the model to the hub (this includes the README template)
    model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    print(f"Model and repo pushed to: {repo_url}")


if __name__ == "__main__":
    # Command-line entry point: derive the Hub repo id from the model type and
    # language, then publish the newest checkpoint found in --checkpoint_dir.
    cli = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")

    # (flag, help text) pairs; every option is a required string.
    required_options = (
        ("--model_type", "Type of the model (e.g., stacked-bert)"),
        ("--language", "Language of the model (e.g., multilingual)"),
        ("--checkpoint_dir", "Directory containing checkpoint folders"),
        ("--script_path", "Path to the models.py script"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    args = cli.parse_args()
    repo_name = f"impresso-project/ner-{args.model_type}-{args.language}"
    push_model_to_hub(args.checkpoint_dir, repo_name, args.script_path)
    # PIPELINE_REGISTRY.register_pipeline(
    #     "generic-ner",
    #     pipeline_class=MultitaskTokenClassificationPipeline,
    #     pt_model=ExtendedMultitaskModelForTokenClassification,
    # )
    # model.config.custom_pipelines = {
    #     "generic-ner": {
    #         "impl": "generic_ner.MultitaskTokenClassificationPipeline",
    #         "pt": ["ExtendedMultitaskModelForTokenClassification"],
    #         "tf": [],
    #     }
    # }
    # classifier = pipeline(
    #     "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
    # )
    # from pprint import pprint
    #
    # pprint(
    #     classifier(
    #         "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
    #     )
    # )
    # repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")