Raphael Sourty committed on
Commit
a160321
1 Parent(s): c07c3a1
Files changed (2) hide show
  1. README copy.md +0 -60
  2. README.md +58 -1
README copy.md DELETED
@@ -1,60 +0,0 @@
1
- ---
2
- language:
3
- - en
4
- ---
5
-
6
- This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed). You can find details on how to use it in the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository.
7
-
8
- ```sh
9
- pip install sparsembed
10
- ```
11
-
12
- ```python
13
- from sparsembed import model, retrieve
14
- from transformers import AutoModelForMaskedLM, AutoTokenizer
15
-
16
- device = "cuda" # cpu
17
-
18
- batch_size = 10
19
-
20
- # List documents to index:
21
- documents = [
22
- {'id': 0,
23
- 'title': 'Paris',
24
- 'url': 'https://en.wikipedia.org/wiki/Paris',
25
- 'text': 'Paris is the capital and most populous city of France.'},
26
- {'id': 1,
27
- 'title': 'Paris',
28
- 'url': 'https://en.wikipedia.org/wiki/Paris',
29
- 'text': "Since the 17th century, Paris has been one of Europe's major centres of science, and arts."},
30
- {'id': 2,
31
- 'title': 'Paris',
32
- 'url': 'https://en.wikipedia.org/wiki/Paris',
33
- 'text': 'The City of Paris is the centre and seat of government of the region and province of Île-de-France.'
34
- }]
35
-
36
- model = model.SparsEmbed(
37
- model=AutoModelForMaskedLM.from_pretrained("raphaelsty/distilbert-sparsembed").to(device),
38
- tokenizer=AutoTokenizer.from_pretrained("raphaelsty/distilbert-sparsembed"),
39
- device=device
40
- )
41
-
42
- retriever = retrieve.SpladeRetriever(
43
- key="id", # Key identifier of each document.
44
- on=["title", "text"], # Fields to search.
45
- model=model # Splade retriever.
46
- )
47
-
48
- retriever = retriever.add(
49
- documents=documents,
50
- batch_size=batch_size,
51
- k_tokens=256, # Number of activated tokens.
52
- )
53
-
54
- retriever(
55
- ["paris", "Toulouse"], # Queries
56
- k_tokens=20, # Maximum number of activated tokens.
57
- k=100, # Number of documents to retrieve.
58
- batch_size=batch_size
59
- )
60
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,3 +1,60 @@
1
  ---
2
- license: mit
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - en
4
  ---
5
+
6
+ This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed). You can find details on how to use it in the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository.
7
+
8
+ ```sh
9
+ pip install sparsembed
10
+ ```
11
+
12
+ ```python
13
+ from sparsembed import model, retrieve
14
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
15
+
16
+ device = "cuda" # cpu
17
+
18
+ batch_size = 10
19
+
20
+ # List documents to index:
21
+ documents = [
22
+ {'id': 0,
23
+ 'title': 'Paris',
24
+ 'url': 'https://en.wikipedia.org/wiki/Paris',
25
+ 'text': 'Paris is the capital and most populous city of France.'},
26
+ {'id': 1,
27
+ 'title': 'Paris',
28
+ 'url': 'https://en.wikipedia.org/wiki/Paris',
29
+ 'text': "Since the 17th century, Paris has been one of Europe's major centres of science, and arts."},
30
+ {'id': 2,
31
+ 'title': 'Paris',
32
+ 'url': 'https://en.wikipedia.org/wiki/Paris',
33
+ 'text': 'The City of Paris is the centre and seat of government of the region and province of Île-de-France.'
34
+ }]
35
+
36
+ model = model.SparsEmbed(
37
+ model=AutoModelForMaskedLM.from_pretrained("raphaelsty/distilbert-sparsembed").to(device),
38
+ tokenizer=AutoTokenizer.from_pretrained("raphaelsty/distilbert-sparsembed"),
39
+ device=device
40
+ )
41
+
42
+ retriever = retrieve.SpladeRetriever(
43
+ key="id", # Key identifier of each document.
44
+ on=["title", "text"], # Fields to search.
45
+ model=model # Splade retriever.
46
+ )
47
+
48
+ retriever = retriever.add(
49
+ documents=documents,
50
+ batch_size=batch_size,
51
+ k_tokens=256, # Number of activated tokens.
52
+ )
53
+
54
+ retriever(
55
+ ["paris", "Toulouse"], # Queries
56
+ k_tokens=20, # Maximum number of activated tokens.
57
+ k=100, # Number of documents to retrieve.
58
+ batch_size=batch_size
59
+ )
60
+ ```