antoinelouis committed
Commit 9fb9957 · 1 parent: d48edf5

Update app.py

Files changed (1): app.py (+78 -42)
app.py CHANGED
@@ -1,8 +1,10 @@
 import os
+import re
 import csv
 import json
 import torch
 import shutil
+import tempfile
 import textwrap
 import numpy as np
 import pandas as pd
@@ -10,7 +12,7 @@ import streamlit as st
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
-from huggingface_hub import whoami, HfApi
+from huggingface_hub import whoami, HfApi, snapshot_download
 from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizerFast, pipeline
 
 
@@ -206,49 +208,83 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     fig.update_traces(texttemplate='%{text:.1f}M', textposition='inside', insidetextanchor='middle')
     st.plotly_chart(fig)
 
-    # Add a README to the pruned model repo
-    new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
-    readme_content = textwrap.dedent(f"""
-    ---
-    pipeline_tag: sentence-similarity
-    language: {LANGUAGES[language]['hf_code']}
-    license: mit
-    tags:
-    - passage-retrieval
-    - sentence-similarity
-    - pruned
-    library_name: sentence-transformers
-    base_model: {model_name}
-    base_model_relation: quantized
-    ---
-    # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
-
-    This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
-    for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
-
-    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
-    memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
-    commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
-
-    ## Usage
-
-    You can use this model with the Transformers library:
-
-    ```python
-    from transformers import AutoModel, AutoTokenizer
-
-    model_name = "{new_model_name}"
-    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
-    ```
-
-    **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
-    """)
-    with open(os.path.join(outdir, "README.md"), "w") as f:
-        f.write(readme_content)
-
     with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
+        st.write("- *Adding sentence-transformers files*")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            snapshot_download(repo_id=model_name, local_dir=tmpdirname, token=hf_token)
+
+            src_modules_json = os.path.join(tmpdirname, "modules.json")
+            if os.path.exists(src_modules_json):
+                shutil.copy2(src_modules_json, os.path.join(outdir, "modules.json"))
+
+            src_sentence_bert_config = os.path.join(tmpdirname, "sentence_bert_config.json")
+            if os.path.exists(src_sentence_bert_config):
+                shutil.copy2(src_sentence_bert_config, os.path.join(outdir, "sentence_bert_config.json"))
+
+            src_pooling_folder = os.path.join(tmpdirname, "1_Pooling")
+            if os.path.exists(src_pooling_folder):
+                shutil.copytree(src_pooling_folder, os.path.join(outdir, "1_Pooling"), dirs_exist_ok=True)
+
+            src_readme = os.path.join(tmpdirname, "README.md")
+            if os.path.exists(src_readme):
+                with open(src_readme, 'r', encoding='utf-8') as file:
+                    content = file.read()
+                    match = re.search(r'license:\s*(\S+)', content, re.IGNORECASE)
+                    if match:
+                        original_license = match.group(1)
+
+        st.write("- *Adding a README*")
+        new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
+        readme_content = textwrap.dedent(f"""
+        ---
+        pipeline_tag: sentence-similarity
+        language: {LANGUAGES[language]['hf_code']}
+        license: {original_license}
+        tags:
+        - passage-retrieval
+        - sentence-similarity
+        - pruned
+        library_name: sentence-transformers
+        base_model: {model_name}
+        base_model_relation: quantized
+        ---
+        # {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
+
+        This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
+        for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
+
+        This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
+        memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
+        commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
+
+        ## Usage
+
+        You can use this model with the Transformers library:
+
+        ```python
+        from transformers import AutoModel, AutoTokenizer
+
+        model_name = "{new_model_name}"
+        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
+        ```
+
+        Or with the sentence-transformers library:
+
+        ```python
+        from sentence_transformers import SentenceTransformer
+
+        model = SentenceTransformer("{new_model_name}")
+        ```
+
+        **Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
+        """)
+        with open(os.path.join(outdir, "README.md"), "w") as f:
+            f.write(readme_content)
+
+        st.write("- *Pushing to Hub*")
         push_to_hub(hf_username, hf_token, outdir)
+
         shutil.rmtree(outdir)
         status.update(state="complete", expanded=False)
 