raphaelsty
/

distilbert-sparsembed

@@ -1,60 +0,0 @@
----
-language:
-- en
----
-This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed). You can find details on how to use it in the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository.
-```sh
-pip install sparsembed
-```
-```python
-from sparsembed import model, retrieve
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-device = "cuda" # cpu
-batch_size = 10
-# List documents to index:
-documents = [
- {'id': 0,
-  'title': 'Paris',
-  'url': 'https://en.wikipedia.org/wiki/Paris',
-  'text': 'Paris is the capital and most populous city of France.'},
- {'id': 1,
-  'title': 'Paris',
-  'url': 'https://en.wikipedia.org/wiki/Paris',
-  'text': "Since the 17th century, Paris has been one of Europe's major centres of science, and arts."},
- {'id': 2,
-  'title': 'Paris',
-  'url': 'https://en.wikipedia.org/wiki/Paris',
-  'text': 'The City of Paris is the centre and seat of government of the region and province of Île-de-France.'
-}]
-model = model.SparsEmbed(
-    model=AutoModelForMaskedLM.from_pretrained("raphaelsty/distilbert-sparsembed").to(device),
-    tokenizer=AutoTokenizer.from_pretrained("raphaelsty/distilbert-sparsembed"),
-    device=device
-)
-retriever = retrieve.SpladeRetriever(
-    key="id", # Key identifier of each document.
-    on=["title", "text"], # Fields to search.
-    model=model # Splade retriever.
-)
-retriever = retriever.add(
-    documents=documents,
-    batch_size=batch_size,
-    k_tokens=256, # Number of activated tokens.
-)
-retriever(
-    ["paris", "Toulouse"], # Queries
-    k_tokens=20, # Maximum number of activated tokens.
-    k=100, # Number of documents to retrieve.
-    batch_size=batch_size
-)
-```

README.md CHANGED Viewed

@@ -1,3 +1,60 @@
 ---
-license: mit
 ---

 ---
+language:
+- en
 ---
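+This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed). Details on how to use it are available in the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository. To try it, install the package and run the example below, which indexes a few documents and then queries them.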
+```sh
+pip install sparsembed
+```
+```python
+from sparsembed import model, retrieve
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+device = "cuda" # Set to "cpu" if no GPU is available.
+batch_size = 10
+# List documents to index:
+documents = [
+ {'id': 0,
+  'title': 'Paris',
+  'url': 'https://en.wikipedia.org/wiki/Paris',
+  'text': 'Paris is the capital and most populous city of France.'},
+ {'id': 1,
+  'title': 'Paris',
+  'url': 'https://en.wikipedia.org/wiki/Paris',
+  'text': "Since the 17th century, Paris has been one of Europe's major centres of science, and arts."},
+ {'id': 2,
+  'title': 'Paris',
+  'url': 'https://en.wikipedia.org/wiki/Paris',
+  'text': 'The City of Paris is the centre and seat of government of the region and province of Île-de-France.'
+}]
+model = model.SparsEmbed(
+    model=AutoModelForMaskedLM.from_pretrained("raphaelsty/distilbert-sparsembed").to(device),
+    tokenizer=AutoTokenizer.from_pretrained("raphaelsty/distilbert-sparsembed"),
+    device=device
+)
+retriever = retrieve.SpladeRetriever(
+    key="id", # Key identifier of each document.
+    on=["title", "text"], # Fields to search.
+    model=model # SparsEmbed model instantiated above.
+)
+retriever = retriever.add(
+    documents=documents,
+    batch_size=batch_size,
+    k_tokens=256, # Number of activated tokens.
+)
+retriever(
+    ["paris", "Toulouse"], # Queries
+    k_tokens=20, # Maximum number of activated tokens.
+    k=100, # Number of documents to retrieve.
+    batch_size=batch_size
+)
+```