intfloat committed
Commit 34f5747
1 parent: 39139e8

update README.md

Files changed (1): README.md (+19, -2)
README.md CHANGED
@@ -2614,7 +2614,6 @@ import torch.nn.functional as F
 
 from torch import Tensor
 from transformers import AutoTokenizer, AutoModel
-from transformers.modeling_outputs import BaseModelOutput
 
 
 def average_pool(last_hidden_states: Tensor,
@@ -2636,7 +2635,7 @@ model = AutoModel.from_pretrained('intfloat/e5-small')
 # Tokenize the input texts
 batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
 
-outputs: BaseModelOutput = model(**batch_dict)
+outputs = model(**batch_dict)
 embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
 # (Optionally) normalize embeddings
@@ -2653,3 +2652,21 @@ Please refer to our paper at [https://arxiv.org/pdf/2212.03533.pdf](https://arxiv.org/pdf/2212.03533.pdf)
 
 Check out [unilm/e5](https://github.com/microsoft/unilm/tree/master/e5) to reproduce evaluation results
 on the [BEIR](https://arxiv.org/abs/2104.08663) and [MTEB benchmark](https://arxiv.org/abs/2210.07316).
+
+## Citation
+
+If you find our paper or models helpful, please consider citing as follows:
+
+```
+@article{wang2022text,
+  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
+  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
+  journal={arXiv preprint arXiv:2212.03533},
+  year={2022}
+}
+```
+
+## Limitations
+
+This model only works for English texts. Long texts will be truncated to at most 512 tokens.
+
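For reference, a minimal sketch of the full snippet as it reads after this commit. The diff only shows fragments, so the body of `average_pool` and the example `input_texts` (including the `query: `/`passage: ` prefixes) are reconstructed from the surrounding README and should be treated as assumptions rather than part of this change:

```python
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    # Zero out padding positions, then mean-pool over the sequence dimension.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


# Hypothetical example inputs; E5 models expect "query: " / "passage: " prefixes.
input_texts = ['query: how much protein should a female eat',
               'passage: As a general guideline, adults need roughly 0.8 g of protein per kg of body weight per day.']

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small')
model = AutoModel.from_pretrained('intfloat/e5-small')

# Tokenize the input texts (truncated to the 512-token limit noted under Limitations)
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# After this commit the forward pass is used without the BaseModelOutput annotation
outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
```

Behaviorally this change is a no-op: the removed `BaseModelOutput` import was only used as a type annotation on `outputs`, so dropping it leaves the example's output unchanged.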