antoinelouis committed · Commit 9fb9957 · 1 Parent(s): d48edf5
Update app.py
app.py CHANGED
@@ -1,8 +1,10 @@
 import os
+import re
 import csv
 import json
 import torch
 import shutil
+import tempfile
 import textwrap
 import numpy as np
 import pandas as pd
@@ -10,7 +12,7 @@ import streamlit as st
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
-from huggingface_hub import whoami, HfApi
+from huggingface_hub import whoami, HfApi, snapshot_download
 from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizerFast, pipeline
 
 
@@ -206,49 +208,83 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     fig.update_traces(texttemplate='%{text:.1f}M', textposition='inside', insidetextanchor='middle')
     st.plotly_chart(fig)
 
-    # Add a README to the pruned model repo
-    new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
-    readme_content = textwrap.dedent(f"""
-    ---
-    pipeline_tag: sentence-similarity
-    language: {LANGUAGES[language]['hf_code']}
-    license: mit
-    tags:
-    - passage-retrieval
-    - sentence-similarity
-    - pruned
-    library_name: sentence-transformers
-    base_model: {model_name}
-    base_model_relation: quantized
-    ---
-    # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
-
-    This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
-    for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
-
-    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
-    memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
-    commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
-
-    ## Usage
-
-    You can use this model with the Transformers library:
-
-    ```python
-    from transformers import AutoModel, AutoTokenizer
-
-    model_name = "{new_model_name}"
-    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
-    ```
-
-    **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
-    """)
-    with open(os.path.join(outdir, "README.md"), "w") as f:
-        f.write(readme_content)
-
     with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
+        st.write("- *Adding sentence-transformers files*")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            snapshot_download(repo_id=model_name, local_dir=tmpdirname, token=hf_token)
+
+            src_modules_json = os.path.join(tmpdirname, "modules.json")
+            if os.path.exists(src_modules_json):
+                shutil.copy2(src_modules_json, os.path.join(outdir, "modules.json"))
+
+            src_sentence_bert_config = os.path.join(tmpdirname, "sentence_bert_config.json")
+            if os.path.exists(src_sentence_bert_config):
+                shutil.copy2(src_sentence_bert_config, os.path.join(outdir, "sentence_bert_config.json"))
+
+            src_pooling_folder = os.path.join(tmpdirname, "1_Pooling")
+            if os.path.exists(src_pooling_folder):
+                shutil.copytree(src_pooling_folder, os.path.join(outdir, "1_Pooling"), dirs_exist_ok=True)
+
+            src_readme = os.path.join(tmpdirname, "README.md")
+            if os.path.exists(src_readme):
+                with open(src_readme, 'r', encoding='utf-8') as file:
+                    content = file.read()
+                match = re.search(r'license:\s*(\S+)', content, re.IGNORECASE)
+                if match:
+                    original_license = match.group(1)
+
+        st.write("- *Adding a README*")
+        new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
+        readme_content = textwrap.dedent(f"""
+        ---
+        pipeline_tag: sentence-similarity
+        language: {LANGUAGES[language]['hf_code']}
+        license: {original_license}
+        tags:
+        - passage-retrieval
+        - sentence-similarity
+        - pruned
+        library_name: sentence-transformers
+        base_model: {model_name}
+        base_model_relation: quantized
+        ---
+        # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
+
+        This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
+        for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
+
+        This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
+        memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
+        commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
+
+        ## Usage
+
+        You can use this model with the Transformers library:
+
+        ```python
+        from transformers import AutoModel, AutoTokenizer
+
+        model_name = "{new_model_name}"
+        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
+        ```
+
+        Or with the sentence-transformers library:
+
+        ```python
+        from sentence_transformers import SentenceTransformer
+
+        model = SentenceTransformer("{new_model_name}")
+        ```
+
+        **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
+        """)
+        with open(os.path.join(outdir, "README.md"), "w") as f:
+            f.write(readme_content)
+
+        st.write("- *Pushing to Hub*")
         push_to_hub(hf_username, hf_token, outdir)
+
         shutil.rmtree(outdir)
         status.update(state="complete", expanded=False)
 
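In short, the new block in `prune_model` downloads a snapshot of the base model repository, copies its sentence-transformers artifacts (`modules.json`, `sentence_bert_config.json`, the `1_Pooling/` folder) into the pruned output directory, and reuses the base model's declared license in the generated README instead of hardcoding `mit`. A minimal standalone sketch of that step; the function name and arguments here are chosen for illustration and are not part of the app:

```python
import os
import re
import shutil
import tempfile
from typing import Optional

from huggingface_hub import snapshot_download


def copy_sentence_transformers_files(base_repo: str, outdir: str, token: Optional[str] = None) -> Optional[str]:
    """Copy sentence-transformers config files from `base_repo` into `outdir`.

    Returns the license declared in the base model's README front matter, if any.
    """
    original_license = None
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Download the base repository (model card, configs, pooling module, ...).
        snapshot_download(repo_id=base_repo, local_dir=tmpdirname, token=token)

        # Plain JSON configs are copied one-to-one when present.
        for fname in ("modules.json", "sentence_bert_config.json"):
            src = os.path.join(tmpdirname, fname)
            if os.path.exists(src):
                shutil.copy2(src, os.path.join(outdir, fname))

        # The pooling module lives in its own folder.
        src_pooling = os.path.join(tmpdirname, "1_Pooling")
        if os.path.exists(src_pooling):
            shutil.copytree(src_pooling, os.path.join(outdir, "1_Pooling"), dirs_exist_ok=True)

        # Pick up the license declared in the base model's README, if any.
        src_readme = os.path.join(tmpdirname, "README.md")
        if os.path.exists(src_readme):
            with open(src_readme, encoding="utf-8") as f:
                match = re.search(r"license:\s*(\S+)", f.read(), re.IGNORECASE)
            if match:
                original_license = match.group(1)
    return original_license
```

Carrying these files over is what lets the pruned checkpoint load directly with `SentenceTransformer(...)`, and reading the license back from the base model's README keeps the pruned repo's metadata consistent with whatever license the original authors chose.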