Update README.md
Browse files
README.md
CHANGED
@@ -48,9 +48,19 @@ Then you can use the model like this:
|
|
48 |
```python
|
49 |
from sentence_transformers import SentenceTransformer
|
50 |
sentences = ["样例数据-1", "样例数据-2"]
|
51 |
-
model = SentenceTransformer('BAAI/baai-general-embedding-large-
|
52 |
embeddings = model.encode(sentences, normalize_embeddings=True)
|
53 |
print(embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
```
|
55 |
|
56 |
|
@@ -62,16 +72,22 @@ from transformers import AutoTokenizer, AutoModel
|
|
62 |
import torch
|
63 |
# Sentences we want sentence embeddings for
|
64 |
sentences = ["样例数据-1", "样例数据-2"]
|
|
|
65 |
# Load model from HuggingFace Hub
|
66 |
tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
|
67 |
model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
|
|
|
68 |
# Tokenize sentences
|
69 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
|
|
|
|
|
|
70 |
# Compute token embeddings
|
71 |
with torch.no_grad():
|
72 |
model_output = model(**encoded_input)
|
73 |
# Perform pooling. In this case, cls pooling.
|
74 |
sentence_embeddings = model_output[0][:, 0]
|
|
|
75 |
# normalize embeddings
|
76 |
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
|
77 |
print("Sentence embeddings:")
|
|
|
48 |
```python
|
49 |
from sentence_transformers import SentenceTransformer
|
50 |
sentences = ["样例数据-1", "样例数据-2"]
|
51 |
+
model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
|
52 |
embeddings = model.encode(sentences, normalize_embeddings=True)
|
53 |
print(embeddings)
|
54 |
+
|
55 |
+
# For retrieval tasks, when you use a model whose name ends with `-instruction`,
|
56 |
+
# each query should start with an instruction.
|
57 |
+
queries = ["手机开不了机怎么办?"]
|
58 |
+
passages = ["样例段落-1", "样例段落-2"]
|
59 |
+
instruction = "为这个句子生成表示以用于检索相关文章:"
|
60 |
+
model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
|
61 |
+
q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
|
62 |
+
p_embeddings = model.encode(passages, normalize_embeddings=True)
|
63 |
+
scores = q_embeddings @ p_embeddings.T
|
64 |
```
|
65 |
|
66 |
|
|
|
72 |
import torch
|
73 |
# Sentences we want sentence embeddings for
|
74 |
sentences = ["样例数据-1", "样例数据-2"]
|
75 |
+
|
76 |
# Load model from HuggingFace Hub
|
77 |
tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
|
78 |
model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
|
79 |
+
|
80 |
# Tokenize sentences
|
81 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
82 |
+
# For retrieval tasks, add an instruction to each query when using a "*-instruction" model.
|
83 |
+
# encoded_input = tokenizer(["为这个句子生成表示以用于检索相关文章:" + query for query in queries], padding=True, truncation=True, return_tensors='pt')
|
84 |
+
|
85 |
# Compute token embeddings
|
86 |
with torch.no_grad():
|
87 |
model_output = model(**encoded_input)
|
88 |
# Perform pooling. In this case, cls pooling.
|
89 |
sentence_embeddings = model_output[0][:, 0]
|
90 |
+
|
91 |
# normalize embeddings
|
92 |
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
|
93 |
print("Sentence embeddings:")
|