antoinelouis committed on
Commit
525264a
1 Parent(s): df42891

Create app.py

Files changed (1)
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
import os
import csv
import json
import torch
import shutil
import textwrap
import numpy as np
import pandas as pd
import streamlit as st
from tqdm.auto import tqdm
from collections import Counter
from tokenizers import Tokenizer
import plotly.graph_objects as go
from huggingface_hub import whoami, HfApi
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizerFast, pipeline


LANGUAGES = {
    "french": {"emoji": "🇫🇷", "nllb_code": "fra_Latn", "hf_code": "fr"},
    "english": {"emoji": "🇬🇧", "nllb_code": "eng_Latn", "hf_code": "en"},
    "german": {"emoji": "🇩🇪", "nllb_code": "deu_Latn", "hf_code": "de"},
    "italian": {"emoji": "🇮🇹", "nllb_code": "ita_Latn", "hf_code": "it"},
    "spanish": {"emoji": "🇪🇸", "nllb_code": "spa_Latn", "hf_code": "es"},
    "portuguese": {"emoji": "🇵🇹", "nllb_code": "por_Latn", "hf_code": "pt"},
}

MODELS = [
    "intfloat/multilingual-e5-small",
    "intfloat/multilingual-e5-base",
    "intfloat/multilingual-e5-large",
    "BAAI/bge-m3",
    "Alibaba-NLP/gte-multilingual-base",
    # "jinaai/jina-embeddings-v3",  # TODO: uses ParametrizedEmbedding
]

def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str):
    """
    Estimate the set of tokens actually used in the language. You should first download the
    1M-sentence corpus for the desired language.
    Source: https://wortschatz.uni-leipzig.de/en/download/English
    """
    sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
    if os.path.exists(sentences_file):
        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
        # Always keep the special tokens, then add every token observed in the corpus.
        counter = Counter(tokenizer.all_special_tokens)
        counter.update(tok for t in tqdm(df.text) for tok in tokenizer.tokenize(t))
        with open(f"data.nosync/{language}_filtered_tokens.txt", "w") as f:
            f.write("\n".join(map(str, set(counter))))
    else:
        raise FileNotFoundError(f"Sentences file not found: {sentences_file}")
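
# A minimal usage sketch (hypothetical model choice; assumes the Leipzig corpus file has been
# downloaded to data.nosync/ beforehand):
#   tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small", use_fast=True)
#   estimate_pruned_vocabulary(tokenizer, "french")  # writes data.nosync/french_filtered_tokens.txt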

def get_pruned_vocabulary(language: str):
    filtered_tokens_file = f"data.nosync/{language}_filtered_tokens.txt"
    if os.path.exists(filtered_tokens_file):
        with open(filtered_tokens_file, "r") as f:
            return set(f.read().splitlines())
    else:
        raise FileNotFoundError(f"No filtered tokens file found for language {language}. Please run `estimate_pruned_vocabulary` first.")

@st.cache_resource
def load_model_and_tokenizer(model_name: str):
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
    return model, tokenizer

def count_parameters(model, layer_name: str = None):
    return sum(p.numel() for name, p in model.named_parameters() if layer_name is None or name.startswith(layer_name))
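
# Note: for the encoders listed in MODELS, named_parameters() exposes prefixes like "embeddings." and
# "encoder.", so count_parameters(model, "embeddings") isolates the embedding matrix. This prefix
# convention is an assumption about those architectures, not a general transformers guarantee.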

@st.cache_resource
def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
    text = """
    Alan Mathison Turing (23 June 1912 - 7 June 1954) was an English mathematician,
    computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.
    """
    if target_lang == "eng_Latn":
        return text
    model_name = "facebook/nllb-200-distilled-600M"
    translator = pipeline(task="translation", tokenizer=model_name, model=model_name)
    return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']

def push_to_hub(username: str, token: str, model_dir: str, private: bool = False):
    _ = whoami(token=token)  # fails early if the token is invalid
    api = HfApi(endpoint="https://huggingface.co", token=token)
    repo_id = f"{username}/{model_dir.split('/')[-1]}"
    api.create_repo(repo_id=repo_id, repo_type="model", private=private, exist_ok=True)
    api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")

def prune_model(model_name: str, language: str, username: str, token: str):
    st.markdown(f"- Pruning the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only. *Let's go!*")

    # Load the model and its tokenizer
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Calculate parameters for the original model
    all_params = count_parameters(model)
    encoder_params = count_parameters(model, layer_name="encoder")
    embedding_params = count_parameters(model, layer_name="embeddings")

    st.markdown(
        f"- The model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
        f"(i.e., {embedding_params/1e6:.1f}M params) come from the *embedding matrix* and its {tokenizer.vocab_size} token entries. "+
        f"This means that the contextualization of text sequences is actually done by a *{model.config.num_hidden_layers}-layer Transformer encoder* "+
        f"with only **{encoder_params/1e6:.1f}M** parameters."
    )

    # Load the precomputed set of tokens used in the language
    filtered_tokens = get_pruned_vocabulary(language)
    st.markdown(
        f"- {language.capitalize()} seems to use only **{len(filtered_tokens)/tokenizer.vocab_size*100:.0f}%** "+
        f"of the model's vocabulary (i.e., {len(filtered_tokens)} out of the original {tokenizer.vocab_size} tokens)."
    )

    st.markdown("- *Updating the tokenizer...*")
    outdir = f"{language}-{model_name.split('/')[-1]}"

    # Export the tokenizer to a JSON string and access its vocabulary (a list of [token, score] pairs)
    tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
    original_vocab = tokenizer_json['model']['vocab']

    # Build a mapping from tokens to their original IDs
    original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}

    # Filter out the tokens to remove and reassign new IDs
    new_id = 0
    new_token_to_id = {}
    new_id_to_original_id = {}
    filtered_vocab_entries = []

    for token, score in original_vocab:
        if token in filtered_tokens:
            filtered_vocab_entries.append([token, score])
            new_token_to_id[token] = new_id
            new_id_to_original_id[new_id] = original_token_to_id[token]
            new_id += 1
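
    # Illustration with hypothetical IDs: if "▁bonjour" had ID 48721 in the original vocabulary and is
    # the 313th token kept, then new_token_to_id["▁bonjour"] == 312 and new_id_to_original_id[312] == 48721.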

    # Update the vocab in the tokenizer JSON and rebuild the tokenizer from the modified JSON
    tokenizer_json['model']['vocab'] = filtered_vocab_entries
    new_backend_tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))

    # Create a new tokenizer instance and save it
    new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_backend_tokenizer, **tokenizer.init_kwargs)
    new_tokenizer.save_pretrained(outdir)

    st.markdown("- *Updating the embedding matrix...*")
    new_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # Create a new embedding matrix and map the original vectors to their new IDs
    original_embeddings = new_model.get_input_embeddings().weight.data
    new_embeddings = torch.nn.Embedding(
        num_embeddings=new_tokenizer.vocab_size,
        embedding_dim=model.config.hidden_size,
        padding_idx=new_tokenizer.pad_token_id,
    )

    for new_id in range(new_tokenizer.vocab_size):
        original_id = new_id_to_original_id[new_id]
        new_embeddings.weight.data[new_id] = original_embeddings[original_id]
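
    # Each row of the smaller embedding matrix is a verbatim copy of the corresponding row of the
    # original matrix, so every kept token keeps exactly the embedding vector it had before pruning.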

    new_model.set_input_embeddings(new_embeddings)
    new_model.config.vocab_size = new_tokenizer.vocab_size
    new_model.save_pretrained(outdir)

    # Test the conversion
    test_sentence = get_test_sentence(LANGUAGES[language]['nllb_code'])
    st.markdown(f"""- *Verifying everything worked as expected with the following test sentence: "{test_sentence}"*""")

    assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
    assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), "ERROR: the new tokenizer's vocabulary doesn't match the set of filtered tokens"

    with torch.inference_mode():
        emb1 = model(**tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
        emb2 = new_model(**new_tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
        diff = np.abs(emb1 - emb2).max()
        assert diff < 1e-6, f"ERROR: some dimensions of the two vectors have a non-negligible difference ({diff})"
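
    # Because the pruned model reuses the original encoder weights and embedding rows, both models should
    # produce numerically identical [CLS] embeddings for any sentence composed only of kept tokens.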

    st.success("The conversion **succeeded**! You can verify it by looking at the output *[CLS]* token embedding:")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("Original model:")
        st.code(f"{emb1.tolist()}")
    with col2:
        st.markdown("Pruned model:")
        st.code(f"{emb2.tolist()}")

    # Show the result of the pruning process visually
    pruned_all_params = count_parameters(new_model)
    pruned_encoder_params = count_parameters(new_model, layer_name="encoder")
    pruned_embedding_params = count_parameters(new_model, layer_name="embeddings")
    st.markdown(f"The pruned model is **{pruned_all_params/all_params*100:.1f}%** of the original model's size.")
    data = {
        'Model': ['Original', 'Pruned'],
        'Embedding': [embedding_params / 1e6, pruned_embedding_params / 1e6],
        'Encoder': [encoder_params / 1e6, pruned_encoder_params / 1e6]
    }
    fig = go.Figure(data=[
        go.Bar(name='Embedding matrix', x=data['Model'], y=data['Embedding'], text=data['Embedding'], textposition='inside', marker_color='#E5B4B4'),
        go.Bar(name='Transformer encoder', x=data['Model'], y=data['Encoder'], text=data['Encoder'], textposition='inside', marker_color='#7FBFE0')
    ])
    fig.update_layout(barmode='stack', yaxis_title='# Params (M)', height=400, margin=dict(t=10, b=10))
    fig.update_traces(texttemplate='%{text:.1f}M', textposition='inside', insidetextanchor='middle')
    st.plotly_chart(fig)

    # Add a README to the pruned model repo
    new_model_name = f"{username}/{outdir.split('/')[-1]}"
    readme_content = textwrap.dedent(f"""
    ---
    pipeline_tag: sentence-similarity
    language: {LANGUAGES[language]['hf_code']}
    license: mit
    tags:
    - passage-retrieval
    - sentence-similarity
    - pruned
    library_name: sentence-transformers
    base_model: {model_name}
    base_model_relation: pruned
    ---
    # {new_model_name.split('/')[-1]}

    This model is a pruned version of [{model_name}](https://huggingface.co/{model_name}) for the {language.capitalize()} language.

    It was created by the [Multilingual Text Embedding Model Pruner](https://huggingface.co/spaces/antoinelouis/mteb-pruner) space,
    which removed tokens not commonly used in {language.capitalize()} from the original multilingual model's vocabulary and adjusted
    the model's embedding matrix accordingly.

    This pruned model should perform similarly to the original model for {language.capitalize()} language tasks, but with a much smaller
    memory footprint ({100 - pruned_all_params/all_params*100:.1f}% smaller). However, it may not perform well for other languages present
    in the original multilingual model.

    ## Usage

    You can use this model with the Transformers library:

    ```python
    from transformers import AutoModel, AutoTokenizer

    model_name = "{new_model_name}"
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
    ```
    """)
    with open(os.path.join(outdir, "README.md"), "w") as f:
        f.write(readme_content)

    st.markdown("- *Pushing the pruned model to your Hugging Face account...*")
    push_to_hub(username, token, outdir)
    shutil.rmtree(outdir)

    st.markdown("Done! You can now load your pruned model like this:")
    st.code(f"""
    from transformers import AutoModel, AutoTokenizer

    model_name = "{new_model_name}"
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
    """, language="python")


def main():
    st.header("Multilingual Text Embedding Model Pruner")
    st.markdown("""
    This space helps you create a smaller, language-specific version of a multilingual text embedding model. Here's what it does:

    1. 🌎 Takes a popular text embedding model that was trained on many languages
    2. ✂️ Trims it down to focus on just one language by removing unused tokens from its vocabulary
    3. 🚀 Gives you a smaller model that works just as well for your chosen language

    #### Why is this useful?

    - 💾 Get the same performance in your language with a much smaller model size
    - 🌐 Great for low-resource environments with limited RAM

    Ready to shrink your model? Let's get started!
    """)

    model_name = st.selectbox("Choose a multilingual model", MODELS)
    language = st.selectbox(
        "Pick your target language",
        options=list(LANGUAGES.keys()),
        format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
    )
    username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
    token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

    if st.button("Prune Model"):
        if not username or not token:
            st.error("Your HF username and access token are required to save the pruned model on your account.")
        else:
            prune_model(model_name, language, username, token)

    st.markdown(
        """
        <style>
        .credits {
            position: fixed;
            right: 10px;
            bottom: 10px;
            color: #888888;
            font-size: 11px;
        }
        </style>
        <div class="credits">
        Credits to <a href="https://gist.github.com/avidale/44cd35bfcdaf8bedf51d97c468cc8001" target="_blank">@avidale</a> for inspiration.
        </div>
        """,
        unsafe_allow_html=True
    )

if __name__ == "__main__":
    main()