Spaces:
Running
Running
antoinelouis
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -41,21 +41,15 @@ def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str
|
|
41 |
sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
|
42 |
if os.path.exists(sentences_file):
|
43 |
df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
|
|
|
44 |
counter = Counter(tokenizer.all_special_tokens)
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
48 |
else:
|
49 |
raise FileNotFoundError
|
50 |
|
51 |
-
def get_pruned_vocabulary(language: str) -> set:
    """Load the set of filtered tokens previously saved for `language`.

    The tokens are read from `data.nosync/{language}_filtered_tokens.txt`
    (resolved against the current working directory), one token per line.

    Args:
        language: Language name used to build the token file name.

    Returns:
        The set of distinct lines found in the token file.

    Raises:
        FileNotFoundError: If no token file exists for `language`.
    """
    filtered_tokens_file = f"data.nosync/{language}_filtered_tokens.txt"
    try:
        # Decode explicitly as UTF-8: the vocabulary is multilingual, so relying
        # on the platform default encoding would make reads locale-dependent.
        # NOTE(review): assumes the file was written as UTF-8 - confirm against the writer.
        with open(filtered_tokens_file, "r", encoding="utf-8") as f:
            return set(f.read().splitlines())
    except FileNotFoundError:
        # Opening directly (instead of checking os.path.exists first) avoids a
        # race between the check and the read; the user-facing hint is kept.
        raise FileNotFoundError(
            f"No filtered tokens file found for language {language}. Please run `estimate_pruned_vocabulary` first."
        ) from None
|
58 |
-
|
59 |
@st.cache_resource
|
60 |
def load_model_and_tokenizer(model_name: str):
|
61 |
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
|
@@ -78,14 +72,12 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
|
|
78 |
return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
|
79 |
|
80 |
def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
    """Create a model repository on the Hugging Face Hub and upload `model_dir` to it.

    The repository id is `{hf_username}/{name}`, where `name` is the last path
    component of `model_dir`.

    Args:
        hf_username: Account under which the repository is created.
        hf_token: Access token passed to `whoami` and `HfApi`. It is a secret
            and must never be printed or logged.
        model_dir: Local folder whose content is uploaded.
        private: Forwarded to `create_repo` as the repository visibility.
    """
    # Check the token before doing any work.
    # NOTE(review): presumably raises on an invalid token - confirm against huggingface_hub.
    _ = whoami(token=hf_token)
    api = HfApi(endpoint="https://huggingface.co", token=hf_token)
    # Strip trailing slashes so "out/dir/" still yields "dir" and not an empty name.
    repo_name = model_dir.rstrip('/').split('/')[-1]
    repo_id = f"{hf_username}/{repo_name}"
    api.create_repo(repo_id=repo_id, repo_type="model", private=private)
    api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
|
87 |
|
88 |
-
def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
|
89 |
st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
|
90 |
|
91 |
# Load the model and its tokenizer
|
@@ -103,13 +95,23 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
|
|
103 |
f"with **{encoder_params/1e6:.1f}M** parameters only."
|
104 |
)
|
105 |
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
with st.status("Pruning the model...", expanded=True) as status:
|
114 |
st.write("- *Updating the tokenizer*")
|
115 |
outdir = f"{language}-{model_name.split('/')[-1]}"
|
@@ -165,7 +167,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
|
|
165 |
|
166 |
with st.status("Testing the conversion...", expanded=True) as status:
|
167 |
st.write(f"- *Checking the pruned tokenizer*")
|
168 |
-
assert len(new_tokenizer) ==
|
169 |
assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match number of the filtered tokens"
|
170 |
|
171 |
st.write(f"- *Checking the pruned model*")
|
@@ -247,7 +249,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
|
|
247 |
f.write(readme_content)
|
248 |
|
249 |
with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
|
250 |
-
|
251 |
shutil.rmtree(outdir)
|
252 |
status.update(state="complete", expanded=False)
|
253 |
|
@@ -279,22 +281,31 @@ def main():
|
|
279 |
""")
|
280 |
|
281 |
model_name = st.selectbox("Choose a multilingual model", MODELS)
|
282 |
-
|
283 |
-
|
284 |
-
options=list(LANGUAGES.keys()),
|
285 |
-
format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
|
286 |
-
)
|
287 |
-
col1, col2 = st.columns(2)
|
288 |
with col1:
|
289 |
-
|
|
|
|
|
|
|
|
|
290 |
with col2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
|
292 |
|
293 |
-
if st.button("Prune
|
294 |
if not hf_username or not hf_token:
|
295 |
-
st.error("Your HF username and access token
|
296 |
else:
|
297 |
-
|
|
|
298 |
|
299 |
st.markdown(
|
300 |
"""
|
|
|
41 |
sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
|
42 |
if os.path.exists(sentences_file):
|
43 |
df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
|
44 |
+
my_bar = st.progress(0)
|
45 |
counter = Counter(tokenizer.all_special_tokens)
|
46 |
+
for i, text in enumerate(df.text):
|
47 |
+
counter.update(tok for tok in tokenizer.tokenize(text))
|
48 |
+
my_bar.progress(i/len(df)+1, text=f"{i/len(df)*100:.0f}%")
|
49 |
+
return set(counter)
|
50 |
else:
|
51 |
raise FileNotFoundError
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
@st.cache_resource
|
54 |
def load_model_and_tokenizer(model_name: str):
|
55 |
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
|
|
|
72 |
return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
|
73 |
|
74 |
def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
    """Create a model repository on the Hugging Face Hub and upload `model_dir` to it.

    The repository id is `{hf_username}/{name}`, where `name` is the last path
    component of `model_dir`.

    Args:
        hf_username: Account under which the repository is created.
        hf_token: Access token handed to `HfApi`.
        model_dir: Local folder whose content is uploaded.
        private: Forwarded to `create_repo` as the repository visibility.
    """
    api = HfApi(endpoint="https://huggingface.co", token=hf_token)
    # Strip trailing slashes so "out/dir/" still yields "dir" and not an empty name.
    repo_name = model_dir.rstrip('/').split('/')[-1]
    repo_id = f"{hf_username}/{repo_name}"
    # NOTE(review): create_repo is called without exist_ok, so re-running for an
    # existing repository presumably fails - confirm whether that is intended.
    api.create_repo(repo_id=repo_id, repo_type="model", private=private)
    api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
|
79 |
|
80 |
+
def prune_model(model_name: str, language: str, hf_username: str, hf_token: str, keep_english: bool):
|
81 |
st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
|
82 |
|
83 |
# Load the model and its tokenizer
|
|
|
95 |
f"with **{encoder_params/1e6:.1f}M** parameters only."
|
96 |
)
|
97 |
|
98 |
+
with st.status(f"Computing the {language.capitalize()} vocabulary...", expanded=True) as status:
|
99 |
+
filtered_tokens = estimate_pruned_vocabulary(tokenizer, language)
|
100 |
+
num_filtered_tokens = len(filtered_tokens)
|
101 |
+
st.write(
|
102 |
+
f"{language.capitalize()} only uses **{num_filtered_tokens/tokenizer.vocab_size*100:.0f}%** "+
|
103 |
+
f"of the model vocabulary (i.e., {num_filtered_tokens} out of the original {tokenizer.vocab_size} tokens)."
|
104 |
+
)
|
105 |
+
status.update(state="complete", expanded=True)
|
106 |
+
|
107 |
+
if keep_english:
|
108 |
+
with st.status(f"Computing the English vocabulary...", expanded=True) as status:
|
109 |
+
english_tokens = estimate_pruned_vocabulary(tokenizer, "english")
|
110 |
+
filtered_tokens.update(english_tokens)
|
111 |
+
st.write(f"Considering the **English** tokens adds **{len(filtered_tokens) - num_filtered_tokens}** tokens to the vocabulary.")
|
112 |
+
num_filtered_tokens = len(filtered_tokens)
|
113 |
+
status.update(state="complete", expanded=True)
|
114 |
+
|
115 |
with st.status("Pruning the model...", expanded=True) as status:
|
116 |
st.write("- *Updating the tokenizer*")
|
117 |
outdir = f"{language}-{model_name.split('/')[-1]}"
|
|
|
167 |
|
168 |
with st.status("Testing the conversion...", expanded=True) as status:
|
169 |
st.write(f"- *Checking the pruned tokenizer*")
|
170 |
+
assert len(new_tokenizer) == num_filtered_tokens, f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({num_filtered_tokens})"
|
171 |
assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match number of the filtered tokens"
|
172 |
|
173 |
st.write(f"- *Checking the pruned model*")
|
|
|
249 |
f.write(readme_content)
|
250 |
|
251 |
with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
|
252 |
+
push_to_hub(hf_username, hf_token, outdir)
|
253 |
shutil.rmtree(outdir)
|
254 |
status.update(state="complete", expanded=False)
|
255 |
|
|
|
281 |
""")
|
282 |
|
283 |
model_name = st.selectbox("Choose a multilingual model", MODELS)
|
284 |
+
|
285 |
+
col1, col2 = st.columns([3, 1])
|
|
|
|
|
|
|
|
|
286 |
with col1:
|
287 |
+
language = st.selectbox(
|
288 |
+
"Pick your target language",
|
289 |
+
options=list(LANGUAGES.keys()),
|
290 |
+
format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
|
291 |
+
)
|
292 |
with col2:
|
293 |
+
st.write("")
|
294 |
+
st.write("")
|
295 |
+
keep_english = st.checkbox("Keep English", value=False, help="Keep English tokens in addition to the selected language")
|
296 |
+
|
297 |
+
col3, col4 = st.columns(2)
|
298 |
+
with col3:
|
299 |
+
hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
|
300 |
+
with col4:
|
301 |
hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
|
302 |
|
303 |
+
if st.button("Prune model"):
|
304 |
if not hf_username or not hf_token:
|
305 |
+
st.error("Your HF username and access token are required to save the pruned model on your account.")
|
306 |
else:
|
307 |
+
_ = whoami(token=hf_token)
|
308 |
+
prune_model(model_name, language, hf_username, hf_token, keep_english)
|
309 |
|
310 |
st.markdown(
|
311 |
"""
|