antoinelouis committed · Commit 9fb9957 · 1 Parent(s): d48edf5
Update app.py
app.py CHANGED
@@ -1,8 +1,10 @@
 import os
+import re
 import csv
 import json
 import torch
 import shutil
+import tempfile
 import textwrap
 import numpy as np
 import pandas as pd
@@ -10,7 +12,7 @@ import streamlit as st
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
-from huggingface_hub import whoami, HfApi
+from huggingface_hub import whoami, HfApi, snapshot_download
 from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizerFast, pipeline
 
 
@@ -206,49 +208,83 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     fig.update_traces(texttemplate='%{text:.1f}M', textposition='inside', insidetextanchor='middle')
     st.plotly_chart(fig)
 
-    # Add a README to the pruned model repo
-    new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
-    readme_content = textwrap.dedent(f"""
-    ---
-    pipeline_tag: sentence-similarity
-    language: {LANGUAGES[language]['hf_code']}
-    license: mit
-    tags:
-    - passage-retrieval
-    - sentence-similarity
-    - pruned
-    library_name: sentence-transformers
-    base_model: {model_name}
-    base_model_relation: quantized
-    ---
-    # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
-
-    This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
-    for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
-
-    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
-    memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
-    commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
-
-    ## Usage
-
-    You can use this model with the Transformers library:
-
-    ```python
-    from transformers import AutoModel, AutoTokenizer
-
-    model_name = "{new_model_name}"
-    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
-    ```
-
-    **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
-    """)
-    with open(os.path.join(outdir, "README.md"), "w") as f:
-        f.write(readme_content)
-
     with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
+        st.write("- *Adding sentence-transformers files*")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            snapshot_download(repo_id=model_name, local_dir=tmpdirname, token=hf_token)
+
+            src_modules_json = os.path.join(tmpdirname, "modules.json")
+            if os.path.exists(src_modules_json):
+                shutil.copy2(src_modules_json, os.path.join(outdir, "modules.json"))
+
+            src_sentence_bert_config = os.path.join(tmpdirname, "sentence_bert_config.json")
+            if os.path.exists(src_sentence_bert_config):
+                shutil.copy2(src_sentence_bert_config, os.path.join(outdir, "sentence_bert_config.json"))
+
+            src_pooling_folder = os.path.join(tmpdirname, "1_Pooling")
+            if os.path.exists(src_pooling_folder):
+                shutil.copytree(src_pooling_folder, os.path.join(outdir, "1_Pooling"), dirs_exist_ok=True)
+
+            src_readme = os.path.join(tmpdirname, "README.md")
+            if os.path.exists(src_readme):
+                with open(src_readme, 'r', encoding='utf-8') as file:
+                    content = file.read()
+                match = re.search(r'license:\s*(\S+)', content, re.IGNORECASE)
+                if match:
+                    original_license = match.group(1)
+
+        st.write("- *Adding a README*")
+        new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
+        readme_content = textwrap.dedent(f"""
+        ---
+        pipeline_tag: sentence-similarity
+        language: {LANGUAGES[language]['hf_code']}
+        license: {original_license}
+        tags:
+        - passage-retrieval
+        - sentence-similarity
+        - pruned
+        library_name: sentence-transformers
+        base_model: {model_name}
+        base_model_relation: quantized
+        ---
+        # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
+
+        This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
+        for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
+
+        This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
+        memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
+        commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
+
+        ## Usage
+
+        You can use this model with the Transformers library:
+
+        ```python
+        from transformers import AutoModel, AutoTokenizer
+
+        model_name = "{new_model_name}"
+        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
+        ```
+
+        Or with the sentence-transformers library:
+
+        ```python
+        from sentence_transformers import SentenceTransformer
+
+        model = SentenceTransformer("{new_model_name}")
+        ```
+
+        **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
+        """)
+        with open(os.path.join(outdir, "README.md"), "w") as f:
+            f.write(readme_content)
+
+        st.write("- *Pushing to Hub*")
         push_to_hub(hf_username, hf_token, outdir)
+
         shutil.rmtree(outdir)
         status.update(state="complete", expanded=False)
 
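In short, the new block in `prune_model` downloads a snapshot of the base model repository, copies its sentence-transformers artifacts (`modules.json`, `sentence_bert_config.json`, the `1_Pooling/` folder) into the pruned output directory, and reuses the base model's declared license in the generated README instead of hardcoding `mit`. A minimal standalone sketch of that step; the function name and arguments here are chosen for illustration and are not part of the app:

```python
import os
import re
import shutil
import tempfile
from typing import Optional

from huggingface_hub import snapshot_download


def copy_sentence_transformers_files(base_repo: str, outdir: str, token: Optional[str] = None) -> Optional[str]:
    """Copy sentence-transformers config files from `base_repo` into `outdir`.

    Returns the license declared in the base model's README front matter, if any.
    """
    original_license = None
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Download the base repository (model card, configs, pooling module, ...).
        snapshot_download(repo_id=base_repo, local_dir=tmpdirname, token=token)

        # Plain JSON configs are copied one-to-one when present.
        for fname in ("modules.json", "sentence_bert_config.json"):
            src = os.path.join(tmpdirname, fname)
            if os.path.exists(src):
                shutil.copy2(src, os.path.join(outdir, fname))

        # The pooling module lives in its own folder.
        src_pooling = os.path.join(tmpdirname, "1_Pooling")
        if os.path.exists(src_pooling):
            shutil.copytree(src_pooling, os.path.join(outdir, "1_Pooling"), dirs_exist_ok=True)

        # Pick up the license declared in the base model's README, if any.
        src_readme = os.path.join(tmpdirname, "README.md")
        if os.path.exists(src_readme):
            with open(src_readme, encoding="utf-8") as f:
                match = re.search(r"license:\s*(\S+)", f.read(), re.IGNORECASE)
            if match:
                original_license = match.group(1)
    return original_license
```

Carrying these files over is what lets the pruned checkpoint load directly with `SentenceTransformer(...)`, and reading the license back from the base model's README keeps the pruned repo's metadata consistent with whatever license the original authors chose.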