Tom Aarsen committed on
Commit
6d20221
·
1 Parent(s): 541c5a6

Fix various snippets; add required safe_serialization

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. sentence_bert_config.json +4 -1
README.md CHANGED
@@ -2675,9 +2675,9 @@ from sentence_transformers import SentenceTransformer
2675
 
2676
  matryoshka_dim = 512
2677
 
2678
- model = SentenceTransformer(".", trust_remote_code=True)
2679
  sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2680
- embeddings = model.encode(sentences)
2681
  embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
2682
  embeddings = embeddings[:, :matryoshka_dim]
2683
  embeddings = F.normalize(embeddings, p=2, dim=1)
@@ -2699,7 +2699,7 @@ def mean_pooling(model_output, attention_mask):
2699
  sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2700
 
2701
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2702
- model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2703
  model.eval()
2704
 
2705
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
 
2675
 
2676
  matryoshka_dim = 512
2677
 
2678
+ model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
2679
  sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2680
+ embeddings = model.encode(sentences, convert_to_tensor=True)
2681
  embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
2682
  embeddings = embeddings[:, :matryoshka_dim]
2683
  embeddings = F.normalize(embeddings, p=2, dim=1)
 
2699
  sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2700
 
2701
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2702
+ model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, safe_serialization=True)
2703
  model.eval()
2704
 
2705
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
sentence_bert_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
2
  "max_seq_length": 8192,
3
- "do_lower_case": false
 
 
 
4
  }
 
1
  {
2
  "max_seq_length": 8192,
3
+ "do_lower_case": false,
4
+ "model_args": {
5
+ "safe_serialization": true
6
+ }
7
  }