antoinelouis committed on
Commit
0209ea7
·
verified ·
1 Parent(s): 2075e20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -33
app.py CHANGED
@@ -41,21 +41,15 @@ def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str
41
  sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
42
  if os.path.exists(sentences_file):
43
  df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
 
44
  counter = Counter(tokenizer.all_special_tokens)
45
- counter.update(tok for t in tqdm(df.text) for tok in tokenizer.tokenize(t))
46
- with open(f"data.nosync/{language}_filtered_tokens.txt", "w") as f:
47
- f.write("\n".join(map(str, set(counter))))
 
48
  else:
49
  raise FileNotFoundError
50
 
51
- def get_pruned_vocabulary(language: str):
52
- filtered_tokens_file = f"data.nosync/{language}_filtered_tokens.txt"
53
- if os.path.exists(filtered_tokens_file):
54
- with open(filtered_tokens_file, "r") as f:
55
- return set(f.read().splitlines())
56
- else:
57
- raise FileNotFoundError(f"No filtered tokens file found for language {language}. Please run `estimate_pruned_vocabulary` first.")
58
-
59
  @st.cache_resource
60
  def load_model_and_tokenizer(model_name: str):
61
  model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
@@ -78,14 +72,12 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
78
  return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
79
 
80
  def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
81
- print(f"'{hf_token}'")
82
- _ = whoami(token=hf_token)
83
  api = HfApi(endpoint="https://huggingface.co", token=hf_token)
84
  repo_id = f"{hf_username}/{model_dir.split('/')[-1]}"
85
  api.create_repo(repo_id=repo_id, repo_type="model", private=private)
86
  api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
87
 
88
- def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
89
  st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
90
 
91
  # Load the model and its tokenizer
@@ -103,13 +95,23 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
103
  f"with **{encoder_params/1e6:.1f}M** parameters only."
104
  )
105
 
106
- # Estimate the most used tokens in the language.
107
- filtered_tokens = get_pruned_vocabulary(language)
108
- st.markdown(
109
- f"- {language.capitalize()} seems to only use **{len(filtered_tokens)/tokenizer.vocab_size*100:.0f}%** "+
110
- f"of the model vocabulary (i.e., {len(filtered_tokens)} out of the original {tokenizer.vocab_size} tokens)."
111
- )
112
-
 
 
 
 
 
 
 
 
 
 
113
  with st.status("Pruning the model...", expanded=True) as status:
114
  st.write("- *Updating the tokenizer*")
115
  outdir = f"{language}-{model_name.split('/')[-1]}"
@@ -165,7 +167,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
165
 
166
  with st.status("Testing the conversion...", expanded=True) as status:
167
  st.write(f"- *Checking the pruned tokenizer*")
168
- assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
169
  assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match number of the filtered tokens"
170
 
171
  st.write(f"- *Checking the pruned model*")
@@ -247,7 +249,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
247
  f.write(readme_content)
248
 
249
  with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
250
- #push_to_hub(hf_username, hf_token, outdir)
251
  shutil.rmtree(outdir)
252
  status.update(state="complete", expanded=False)
253
 
@@ -279,22 +281,31 @@ def main():
279
  """)
280
 
281
  model_name = st.selectbox("Choose a multilingual model", MODELS)
282
- language = st.selectbox(
283
- "Pick your target language",
284
- options=list(LANGUAGES.keys()),
285
- format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
286
- )
287
- col1, col2 = st.columns(2)
288
  with col1:
289
- hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
 
 
 
 
290
  with col2:
 
 
 
 
 
 
 
 
291
  hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
292
 
293
- if st.button("Prune Model"):
294
  if not hf_username or not hf_token:
295
- st.error("Your HF username and access token is required to save the pruned model on your account.")
296
  else:
297
- prune_model(model_name, language, hf_username, hf_token)
 
298
 
299
  st.markdown(
300
  """
 
41
  sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
42
  if os.path.exists(sentences_file):
43
  df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
44
+ my_bar = st.progress(0)
45
  counter = Counter(tokenizer.all_special_tokens)
46
+ for i, text in enumerate(df.text):
47
+ counter.update(tok for tok in tokenizer.tokenize(text))
48
+ my_bar.progress(i/len(df)+1, text=f"{i/len(df)*100:.0f}%")
49
+ return set(counter)
50
  else:
51
  raise FileNotFoundError
52
 
 
 
 
 
 
 
 
 
53
  @st.cache_resource
54
  def load_model_and_tokenizer(model_name: str):
55
  model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
 
72
  return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
73
 
74
  def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
 
 
75
  api = HfApi(endpoint="https://huggingface.co", token=hf_token)
76
  repo_id = f"{hf_username}/{model_dir.split('/')[-1]}"
77
  api.create_repo(repo_id=repo_id, repo_type="model", private=private)
78
  api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
79
 
80
+ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str, keep_english: bool):
81
  st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
82
 
83
  # Load the model and its tokenizer
 
95
  f"with **{encoder_params/1e6:.1f}M** parameters only."
96
  )
97
 
98
+ with st.status(f"Computing the {language.capitalize()} vocabulary...", expanded=True) as status:
99
+ filtered_tokens = estimate_pruned_vocabulary(tokenizer, language)
100
+ num_filtered_tokens = len(filtered_tokens)
101
+ st.write(
102
+ f"{language.capitalize()} only uses **{num_filtered_tokens/tokenizer.vocab_size*100:.0f}%** "+
103
+ f"of the model vocabulary (i.e., {num_filtered_tokens} out of the original {tokenizer.vocab_size} tokens)."
104
+ )
105
+ status.update(state="complete", expanded=True)
106
+
107
+ if keep_english:
108
+ with st.status(f"Computing the English vocabulary...", expanded=True) as status:
109
+ english_tokens = estimate_pruned_vocabulary(tokenizer, "english")
110
+ filtered_tokens.update(english_tokens)
111
+ st.write(f"Considering the **English** tokens adds **{len(filtered_tokens) - num_filtered_tokens}** tokens to the vocabulary.")
112
+ num_filtered_tokens = len(filtered_tokens)
113
+ status.update(state="complete", expanded=True)
114
+
115
  with st.status("Pruning the model...", expanded=True) as status:
116
  st.write("- *Updating the tokenizer*")
117
  outdir = f"{language}-{model_name.split('/')[-1]}"
 
167
 
168
  with st.status("Testing the conversion...", expanded=True) as status:
169
  st.write(f"- *Checking the pruned tokenizer*")
170
+ assert len(new_tokenizer) == num_filtered_tokens, f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({num_filtered_tokens})"
171
  assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match number of the filtered tokens"
172
 
173
  st.write(f"- *Checking the pruned model*")
 
249
  f.write(readme_content)
250
 
251
  with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
252
+ push_to_hub(hf_username, hf_token, outdir)
253
  shutil.rmtree(outdir)
254
  status.update(state="complete", expanded=False)
255
 
 
281
  """)
282
 
283
  model_name = st.selectbox("Choose a multilingual model", MODELS)
284
+
285
+ col1, col2 = st.columns([3, 1])
 
 
 
 
286
  with col1:
287
+ language = st.selectbox(
288
+ "Pick your target language",
289
+ options=list(LANGUAGES.keys()),
290
+ format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
291
+ )
292
  with col2:
293
+ st.write("")
294
+ st.write("")
295
+ keep_english = st.checkbox("Keep English", value=False, help="Keep English tokens in addition to the selected language")
296
+
297
+ col3, col4 = st.columns(2)
298
+ with col3:
299
+ hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
300
+ with col4:
301
  hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
302
 
303
+ if st.button("Prune model"):
304
  if not hf_username or not hf_token:
305
+ st.error("Your HF username and access token are required to save the pruned model on your account.")
306
  else:
307
+ _ = whoami(token=hf_token)
308
+ prune_model(model_name, language, hf_username, hf_token, keep_english)
309
 
310
  st.markdown(
311
  """