---
# Model card metadata
language:
- zh
pipeline_tag: sentence-similarity
tags:
- PEG
- feature-extraction
- sentence-similarity
- transformers
- mteb
model-index:
- name: PEG
results:
- task:
type: Reranking
dataset:
type: C-MTEB/CMedQAv1-reranking
name: MTEB CMedQAv1
config: default
split: test
revision: None
metrics:
- type: map
value: 84.09137463267582
- type: mrr
value: 86.6288888888889
- task:
type: Reranking
dataset:
type: C-MTEB/CMedQAv2-reranking
name: MTEB CMedQAv2
config: default
split: test
revision: None
metrics:
- type: map
value: 86.55765031914974
- type: mrr
value: 89.4325396825397
- task:
type: Retrieval
dataset:
type: C-MTEB/CmedqaRetrieval
name: MTEB CmedqaRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 26.101000000000003
- type: map_at_10
value: 38.239000000000004
- type: map_at_100
value: 40.083
- type: map_at_1000
value: 40.205
- type: map_at_3
value: 34.386
- type: map_at_5
value: 36.425999999999995
- type: mrr_at_1
value: 39.434999999999995
- type: mrr_at_10
value: 46.967999999999996
- type: mrr_at_100
value: 47.946
- type: mrr_at_1000
value: 47.997
- type: mrr_at_3
value: 44.803
- type: mrr_at_5
value: 45.911
- type: ndcg_at_1
value: 39.434999999999995
- type: ndcg_at_10
value: 44.416
- type: ndcg_at_100
value: 51.773
- type: ndcg_at_1000
value: 53.888000000000005
- type: ndcg_at_3
value: 39.816
- type: ndcg_at_5
value: 41.467999999999996
- type: precision_at_1
value: 39.434999999999995
- type: precision_at_10
value: 9.786999999999999
- type: precision_at_100
value: 1.5810000000000002
- type: precision_at_1000
value: 0.184
- type: precision_at_3
value: 22.414
- type: precision_at_5
value: 15.943999999999999
- type: recall_at_1
value: 26.101000000000003
- type: recall_at_10
value: 53.82900000000001
- type: recall_at_100
value: 84.63199999999999
- type: recall_at_1000
value: 98.782
- type: recall_at_3
value: 39.585
- type: recall_at_5
value: 45.141
- task:
type: Retrieval
dataset:
type: C-MTEB/CovidRetrieval
name: MTEB CovidRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 70.39
- type: map_at_10
value: 78.93599999999999
- type: map_at_100
value: 79.202
- type: map_at_1000
value: 79.205
- type: map_at_3
value: 77.538
- type: map_at_5
value: 78.312
- type: mrr_at_1
value: 70.706
- type: mrr_at_10
value: 79.018
- type: mrr_at_100
value: 79.28399999999999
- type: mrr_at_1000
value: 79.288
- type: mrr_at_3
value: 77.713
- type: mrr_at_5
value: 78.462
- type: ndcg_at_1
value: 70.601
- type: ndcg_at_10
value: 82.555
- type: ndcg_at_100
value: 83.718
- type: ndcg_at_1000
value: 83.855
- type: ndcg_at_3
value: 79.779
- type: ndcg_at_5
value: 81.149
- type: precision_at_1
value: 70.601
- type: precision_at_10
value: 9.463000000000001
- type: precision_at_100
value: 0.9979999999999999
- type: precision_at_1000
value: 0.101
- type: precision_at_3
value: 28.871999999999996
- type: precision_at_5
value: 18.019
- type: recall_at_1
value: 70.39
- type: recall_at_10
value: 93.572
- type: recall_at_100
value: 98.736
- type: recall_at_1000
value: 99.895
- type: recall_at_3
value: 86.091
- type: recall_at_5
value: 89.384
- task:
type: Retrieval
dataset:
type: C-MTEB/DuRetrieval
name: MTEB DuRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 26.147
- type: map_at_10
value: 80.205
- type: map_at_100
value: 82.96
- type: map_at_1000
value: 82.999
- type: map_at_3
value: 55.16799999999999
- type: map_at_5
value: 69.798
- type: mrr_at_1
value: 89.8
- type: mrr_at_10
value: 93.16799999999999
- type: mrr_at_100
value: 93.22500000000001
- type: mrr_at_1000
value: 93.228
- type: mrr_at_3
value: 92.85
- type: mrr_at_5
value: 93.067
- type: ndcg_at_1
value: 89.8
- type: ndcg_at_10
value: 87.668
- type: ndcg_at_100
value: 90.16
- type: ndcg_at_1000
value: 90.505
- type: ndcg_at_3
value: 85.842
- type: ndcg_at_5
value: 85.101
- type: precision_at_1
value: 89.8
- type: precision_at_10
value: 42.225
- type: precision_at_100
value: 4.8149999999999995
- type: precision_at_1000
value: 0.48900000000000005
- type: precision_at_3
value: 76.967
- type: precision_at_5
value: 65.32
- type: recall_at_1
value: 26.147
- type: recall_at_10
value: 89.30399999999999
- type: recall_at_100
value: 97.609
- type: recall_at_1000
value: 99.409
- type: recall_at_3
value: 57.56
- type: recall_at_5
value: 74.78200000000001
- task:
type: Retrieval
dataset:
type: C-MTEB/EcomRetrieval
name: MTEB EcomRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 53.300000000000004
- type: map_at_10
value: 62.507000000000005
- type: map_at_100
value: 63.068000000000005
- type: map_at_1000
value: 63.08200000000001
- type: map_at_3
value: 60.050000000000004
- type: map_at_5
value: 61.41
- type: mrr_at_1
value: 53.300000000000004
- type: mrr_at_10
value: 62.507000000000005
- type: mrr_at_100
value: 63.068000000000005
- type: mrr_at_1000
value: 63.08200000000001
- type: mrr_at_3
value: 60.050000000000004
- type: mrr_at_5
value: 61.41
- type: ndcg_at_1
value: 53.300000000000004
- type: ndcg_at_10
value: 67.31700000000001
- type: ndcg_at_100
value: 69.862
- type: ndcg_at_1000
value: 70.231
- type: ndcg_at_3
value: 62.222
- type: ndcg_at_5
value: 64.66300000000001
- type: precision_at_1
value: 53.300000000000004
- type: precision_at_10
value: 8.260000000000002
- type: precision_at_100
value: 0.941
- type: precision_at_1000
value: 0.097
- type: precision_at_3
value: 22.833000000000002
- type: precision_at_5
value: 14.879999999999999
- type: recall_at_1
value: 53.300000000000004
- type: recall_at_10
value: 82.6
- type: recall_at_100
value: 94.1
- type: recall_at_1000
value: 97
- type: recall_at_3
value: 68.5
- type: recall_at_5
value: 74.4
- task:
type: Retrieval
dataset:
type: C-MTEB/MMarcoRetrieval
name: MTEB MMarcoRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 70.68799999999999
- type: map_at_10
value: 79.28399999999999
- type: map_at_100
value: 79.537
- type: map_at_1000
value: 79.545
- type: map_at_3
value: 77.643
- type: map_at_5
value: 78.694
- type: mrr_at_1
value: 73.05199999999999
- type: mrr_at_10
value: 79.794
- type: mrr_at_100
value: 80.024
- type: mrr_at_1000
value: 80.03099999999999
- type: mrr_at_3
value: 78.441
- type: mrr_at_5
value: 79.29
- type: ndcg_at_1
value: 73.05199999999999
- type: ndcg_at_10
value: 82.627
- type: ndcg_at_100
value: 83.737
- type: ndcg_at_1000
value: 83.946
- type: ndcg_at_3
value: 79.585
- type: ndcg_at_5
value: 81.306
- type: precision_at_1
value: 73.05199999999999
- type: precision_at_10
value: 9.835
- type: precision_at_100
value: 1.038
- type: precision_at_1000
value: 0.106
- type: precision_at_3
value: 29.756
- type: precision_at_5
value: 18.788
- type: recall_at_1
value: 70.68799999999999
- type: recall_at_10
value: 92.38300000000001
- type: recall_at_100
value: 97.347
- type: recall_at_1000
value: 98.992
- type: recall_at_3
value: 84.37
- type: recall_at_5
value: 88.434
- task:
type: Retrieval
dataset:
type: C-MTEB/MedicalRetrieval
name: MTEB MedicalRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 53.1
- type: map_at_10
value: 58.36599999999999
- type: map_at_100
value: 58.939
- type: map_at_1000
value: 58.99100000000001
- type: map_at_3
value: 57.15
- type: map_at_5
value: 57.794999999999995
- type: mrr_at_1
value: 53.2
- type: mrr_at_10
value: 58.416000000000004
- type: mrr_at_100
value: 58.989999999999995
- type: mrr_at_1000
value: 59.041
- type: mrr_at_3
value: 57.199999999999996
- type: mrr_at_5
value: 57.845
- type: ndcg_at_1
value: 53.1
- type: ndcg_at_10
value: 60.989000000000004
- type: ndcg_at_100
value: 63.967
- type: ndcg_at_1000
value: 65.436
- type: ndcg_at_3
value: 58.425000000000004
- type: ndcg_at_5
value: 59.583
- type: precision_at_1
value: 53.1
- type: precision_at_10
value: 6.93
- type: precision_at_100
value: 0.8370000000000001
- type: precision_at_1000
value: 0.096
- type: precision_at_3
value: 20.7
- type: precision_at_5
value: 12.98
- type: recall_at_1
value: 53.1
- type: recall_at_10
value: 69.3
- type: recall_at_100
value: 83.7
- type: recall_at_1000
value: 95.5
- type: recall_at_3
value: 62.1
- type: recall_at_5
value: 64.9
- task:
type: Reranking
dataset:
type: C-MTEB/Mmarco-reranking
name: MTEB MMarcoReranking
config: default
split: dev
revision: None
metrics:
- type: map
value: 33.548800108363665
- type: mrr
value: 32.529761904761905
- task:
type: Reranking
dataset:
type: C-MTEB/T2Reranking
name: MTEB T2Reranking
config: default
split: dev
revision: None
metrics:
- type: map
value: 69.43381583724414
- type: mrr
value: 80.47879657392181
- task:
type: Retrieval
dataset:
type: C-MTEB/T2Retrieval
name: MTEB T2Retrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 28.116000000000003
- type: map_at_10
value: 80.026
- type: map_at_100
value: 83.541
- type: map_at_1000
value: 83.592
- type: map_at_3
value: 56.092
- type: map_at_5
value: 69.114
- type: mrr_at_1
value: 91.557
- type: mrr_at_10
value: 93.73700000000001
- type: mrr_at_100
value: 93.808
- type: mrr_at_1000
value: 93.811
- type: mrr_at_3
value: 93.384
- type: mrr_at_5
value: 93.614
- type: ndcg_at_1
value: 91.553
- type: ndcg_at_10
value: 87.003
- type: ndcg_at_100
value: 90.128
- type: ndcg_at_1000
value: 90.615
- type: ndcg_at_3
value: 88.205
- type: ndcg_at_5
value: 86.978
- type: precision_at_1
value: 91.553
- type: precision_at_10
value: 43.25
- type: precision_at_100
value: 5.067
- type: precision_at_1000
value: 0.518
- type: precision_at_3
value: 77.25
- type: precision_at_5
value: 64.902
- type: recall_at_1
value: 28.116000000000003
- type: recall_at_10
value: 85.994
- type: recall_at_100
value: 96.345
- type: recall_at_1000
value: 98.867
- type: recall_at_3
value: 57.67099999999999
- type: recall_at_5
value: 72.26
- task:
type: Retrieval
dataset:
type: C-MTEB/VideoRetrieval
name: MTEB VideoRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 64.9
- type: map_at_10
value: 73.763
- type: map_at_100
value: 74.116
- type: map_at_1000
value: 74.12100000000001
- type: map_at_3
value: 72.15
- type: map_at_5
value: 73.25
- type: mrr_at_1
value: 64.9
- type: mrr_at_10
value: 73.763
- type: mrr_at_100
value: 74.116
- type: mrr_at_1000
value: 74.12100000000001
- type: mrr_at_3
value: 72.15
- type: mrr_at_5
value: 73.25
- type: ndcg_at_1
value: 64.9
- type: ndcg_at_10
value: 77.639
- type: ndcg_at_100
value: 79.396
- type: ndcg_at_1000
value: 79.554
- type: ndcg_at_3
value: 74.406
- type: ndcg_at_5
value: 76.385
- type: precision_at_1
value: 64.9
- type: precision_at_10
value: 8.959999999999999
- type: precision_at_100
value: 0.979
- type: precision_at_1000
value: 0.099
- type: precision_at_3
value: 26.967000000000002
- type: precision_at_5
value: 17.14
- type: recall_at_1
value: 64.9
- type: recall_at_10
value: 89.60000000000001
- type: recall_at_100
value: 97.89999999999999
- type: recall_at_1000
value: 99.2
- type: recall_at_3
value: 80.9
- type: recall_at_5
value: 85.7
license: apache-2.0
library_name: transformers
---
PEG: Towards Robust Text Retrieval with Progressive Learning
Model Details
We propose the PEG model (a Progressively Learned Textual Embedding), which progressively adjusts the weights of samples contributing to the loss within an extremely large batch, based on the difficulty levels of negative samples. We have amassed an extensive collection of over 110 million data samples, spanning a wide range of fields such as general knowledge, finance, tourism, medicine, and more.
Our technical report is available on arXiv: [Paper](https://arxiv.org/abs/2311.11691).
Usage (HuggingFace Transformers)
Install transformers:
pip install transformers
Then load model and predict:
# Usage example: embed two Chinese sentences with PEG and print the vectors.
from transformers import AutoModel, AutoTokenizer
import torch

# Load the tokenizer and the model from the HuggingFace Hub.
tokenizer = AutoTokenizer.from_pretrained('TownsWu/PEG')
model = AutoModel.from_pretrained('TownsWu/PEG')

sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']

# Tokenize the sentences into one padded, truncated batch of PyTorch tensors.
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings; gradients are not needed for inference.
with torch.no_grad():
    last_hidden_state = model(**inputs, return_dict=True).last_hidden_state
    # Use the hidden state at token position 0 of every sentence as its
    # sentence embedding.
    embeddings = last_hidden_state[:, 0]

print("embeddings:")
print(embeddings)
Contact
If you have any questions or suggestions related to this project, feel free to open an issue or pull request. You can also email Tong Wu (townswu@tencent.com).
Citation
If you find our work helpful for your research, please consider citing the following BibTeX entry:
@article{wu2023towards,
title={Towards Robust Text Retrieval with Progressive Learning},
author={Wu, Tong and Qin, Yulei and Zhang, Enwei and Xu, Zihan and Gao, Yuting and Li, Ke and Sun, Xing},
journal={arXiv preprint arXiv:2311.11691},
year={2023}
}