bwang0911 committed
Commit a05fa3e (parent: dcc1fee)

Update README.md

Files changed (1): README.md (+14 -7)
README.md CHANGED
@@ -111,10 +111,13 @@ def mean_pooling(model_output, attention_mask):
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-sentences = ['How is the weather today?', 'What is the current weather like today?']
+sentences = [
+    'Save model to a pickle located at `path`',
+    'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+]
 
-tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en')
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
 
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
 
@@ -135,16 +138,20 @@ from transformers import AutoModel
 from numpy.linalg import norm
 
 cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True) # trust_remote_code is needed to use the encode method
-embeddings = model.encode(['How is the weather today?', 'What is the current weather like today?'])
-print(cos_sim(embeddings[0], embeddings[1]))
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
+embeddings = model.encode(
+    [
+        'Save model to a pickle located at `path`',
+        'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+    ]
+)
 ```
 
 If you only want to handle shorter sequence, such as 2k, pass the `max_length` parameter to the `encode` function:
 
 ```python
 embeddings = model.encode(
-    ['Very long ... document'],
+    ['Very long ... code'],
     max_length=2048
 )
 ```
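The transformers snippet in the first hunk ends at the tokenization step. A minimal sketch of how such a snippet typically continues, assuming the `mean_pooling` helper, `model`, and `encoded_input` from the hunk above; the `torch.no_grad` context and `F.normalize` call are standard mean-pooling practice, not lines from this commit:

```python
# Sketch only; assumes `model`, `encoded_input`, and `mean_pooling`
# from the README snippet above.
import torch
import torch.nn.functional as F

# Run the model without tracking gradients (inference only).
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool token embeddings into one vector per input, then L2-normalize
# so a plain dot product equals cosine similarity.
embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)

# Similarity between the docstring and the code snippet.
print(embeddings[0] @ embeddings[1])
```

Normalizing first makes the dot product equivalent to the `cos_sim` helper used in the second hunk.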