File size: 6,493 Bytes
90e5585 fe3937f 90e5585 60218f2 fe3937f 60218f2 48f9fa0 90e5585 fe3937f 90e5585 f6e521c 1b45b2a f6e521c db119c0 f6e521c 5b12381 f6e521c 2d12088 f6e521c 2d12088 f6e521c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
---
language:
- en
thumbnail: "https://pbs.twimg.com/media/FThx_rEWAAEoujW?format=jpg&name=medium"
tags:
- t5
- contrastive learning
- ranking
- decoding
- metric learning
- pytorch
- text generation
- retrieval
license: "apache-2.0"
datasets:
- Wikipedia
- PG19
- C4
- relic
- ChapterBreak
- HellaSwag
- ROCStories
metrics:
- MAUVE
- human
---
## Main repository
https://github.com/martiansideofthemoon/rankgen
## What is RankGen?
RankGen is a suite of encoder models (100M-1.2B parameters) which map prefixes and generations from any pretrained English language model to a shared vector space. RankGen can be used to rerank multiple full-length samples from an LM, and it can also be incorporated as a scoring function into beam search to significantly improve generation quality (0.85 vs 0.77 MAUVE, 75% preference according to human annotators who are English writers). RankGen can also be used like a dense retriever, and achieves state-of-the-art performance on [literary retrieval](https://relic.cs.umass.edu/leaderboard.html).
## Setup
**Requirements** (`pip` will install these dependencies for you)
Python 3.7+, `torch` (CUDA recommended), `transformers`
**Installation**
```
python3.7 -m virtualenv rankgen-venv
source rankgen-venv/bin/activate
pip install rankgen
```
Get the data [here](https://drive.google.com/drive/folders/1DRG2ess7fK3apfB-6KoHb_azMuHbsIv4?usp=sharing) and place the folder in the root directory. Alternatively, use `gdown` as shown below:
```
gdown --folder https://drive.google.com/drive/folders/1DRG2ess7fK3apfB-6KoHb_azMuHbsIv4
```
Run the test script to make sure the RankGen checkpoint has loaded correctly,
```
python -m rankgen.test_rankgen_encoder --model_path kalpeshk2011/rankgen-t5-base-all
### Expected output
0.0009239262409127233
0.0011521980725477804
```
## Using RankGen
Loading RankGen is simple using the HuggingFace APIs (see Method-2 below), but we suggest using [`RankGenEncoder`](https://github.com/martiansideofthemoon/rankgen/blob/master/rankgen/rankgen_encoder.py), which is a small wrapper around the HuggingFace APIs for correctly preprocessing data and doing tokenization automatically. You can either download [our repository](https://github.com/martiansideofthemoon/rankgen) and install the API, or copy the implementation from [below](#rankgenencoder-implementation).
#### [SUGGESTED] Method-1: Loading the model with RankGenEncoder
```
# Example: encode text with RankGenEncoder, then use the encoder to guide generation.
from rankgen import RankGenEncoder, RankGenGenerator
rankgen_encoder = RankGenEncoder("kalpeshk2011/rankgen-t5-base-all")
# Encoding vectors
# `vectors_type` selects whether the text is encoded as a prefix or as a suffix.
# In the RankGenEncoder implementation shown in this document, `encode` returns a dict
# whose "embeddings" entry holds the vectors.
prefix_vectors = rankgen_encoder.encode(["This is a prefix sentence."], vectors_type="prefix")
suffix_vectors = rankgen_encoder.encode(["This is a suffix sentence."], vectors_type="suffix")
# Generating text
# use a HuggingFace compatible language model
generator = RankGenGenerator(rankgen_encoder=rankgen_encoder, language_model="gpt2-medium")
inputs = ["Whatever might be the nature of the tragedy it would be over with long before this, and those moving black spots away yonder to the west, that he had discerned from the bluff, were undoubtedly the departing raiders. There was nothing left for Keith to do except determine the fate of the unfortunates, and give their bodies decent burial. That any had escaped, or yet lived, was altogether unlikely, unless, perchance, women had been in the party, in which case they would have been borne away prisoners."]
# NOTE(review): the `[0][0]` indexing below presumably selects the first generation for the
# first input -- confirm against the RankGenGenerator return format.
# Baseline nucleus sampling
print(generator.generate_single(inputs, top_p=0.9)[0][0])
# Over-generate and re-rank
print(generator.overgenerate_rerank(inputs, top_p=0.9, num_samples=10)[0][0])
# Beam search
print(generator.beam_search(inputs, top_p=0.9, num_samples=10, beam_size=2)[0][0])
```
#### Method-2: Loading the model with HuggingFace APIs
```
from transformers import T5Tokenizer, AutoModel
# The tokenizer is loaded from the T5 v1.1 checkpoint of the matching size ("base" here).
tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base")
# trust_remote_code=True is needed because the model class is loaded through AutoModel
# from code shipped with the checkpoint.
model = AutoModel.from_pretrained("kalpeshk2011/rankgen-t5-base-all", trust_remote_code=True)
# NOTE: when calling `model` directly you bypass RankGenEncoder.encode, which prepends
# 'pre ' (prefixes) or 'suffi ' (suffixes) to every input and truncates the token ids to
# 512 / 128 positions respectively -- reproduce that preprocessing yourself.
```
### RankGenEncoder Implementation
```
import torch
import tqdm
from transformers import T5Tokenizer, T5EncoderModel, AutoModel
class RankGenEncoder():
    """Load a RankGen checkpoint and encode prefixes / suffixes into vectors.

    The wrapper owns a T5 tokenizer and the RankGen model, places the model on
    GPU when one is available, and exposes :meth:`encode` which takes care of
    the input markers, batching and truncation.
    """

    def __init__(self, model_path, max_batch_size=32, model_size=None, cache_dir=None):
        """Load the tokenizer and model for ``model_path``.

        Args:
            model_path: One of the released RankGen checkpoint names listed in
                the assertion below.
            max_batch_size: Maximum number of texts sent through the model in
                one forward pass by :meth:`encode`.
            model_size: ``"base"``, ``"large"`` or ``"xl"``; selects which
                ``google/t5-v1_1-*`` tokenizer is loaded. When ``None`` it is
                inferred from ``model_path``.
            cache_dir: Optional download/cache directory, forwarded to both
                the tokenizer and the model loader.

        Raises:
            AssertionError: If ``model_path`` is not a supported checkpoint.
        """
        supported_checkpoints = [
            "kalpeshk2011/rankgen-t5-xl-all",
            "kalpeshk2011/rankgen-t5-xl-pg19",
            "kalpeshk2011/rankgen-t5-base-all",
            "kalpeshk2011/rankgen-t5-large-all",
        ]
        assert model_path in supported_checkpoints, (
            f"Unsupported model_path {model_path!r}; expected one of {supported_checkpoints}"
        )
        self.max_batch_size = max_batch_size
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if model_size is None:
            # Infer the size from the checkpoint name; anything that is not
            # recognisably "large" or "xl" is treated as "base".
            if "t5-large" in model_path or "t5_large" in model_path:
                self.model_size = "large"
            elif "t5-xl" in model_path or "t5_xl" in model_path:
                self.model_size = "xl"
            else:
                self.model_size = "base"
        else:
            self.model_size = model_size
        self.tokenizer = T5Tokenizer.from_pretrained(f"google/t5-v1_1-{self.model_size}", cache_dir=cache_dir)
        # Forward cache_dir here too; previously only the tokenizer honoured it
        # and the model weights always went to the default cache location.
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True, cache_dir=cache_dir)
        self.model.to(self.device)
        self.model.eval()

    def encode(self, inputs, vectors_type="prefix", verbose=False, return_input_ids=False):
        """Encode one or more texts with the RankGen model.

        Args:
            inputs: A single string or a sequence of strings.
            vectors_type: ``"prefix"`` encodes the texts as prefixes (marker
                ``'pre '``, at most 512 token positions). Any other value
                encodes them as suffixes (marker ``'suffi '``, at most 128
                token positions).
            verbose: Show a tqdm progress bar over the batches.
            return_input_ids: Also collect the (truncated) token ids that were
                fed to the model.

        Returns:
            A dict with ``"embeddings"`` (the per-batch model outputs
            concatenated along dimension 0) and ``"input_ids"`` (a list of
            token-id lists, empty unless ``return_input_ids`` is true).

        Note:
            An empty ``inputs`` sequence produces no batches, so ``torch.cat``
            is called on an empty list, which PyTorch rejects.
        """
        if isinstance(inputs, str):
            inputs = [inputs]
        # The marker strings are runtime input to the model -- presumably they
        # match what the checkpoints were trained with, so keep them verbatim.
        if vectors_type == 'prefix':
            marker, max_length = 'pre ', 512
        else:
            marker, max_length = 'suffi ', 128
        inputs = [marker + text for text in inputs]

        batch_starts = range(0, len(inputs), self.max_batch_size)
        if verbose:
            # tqdm reads the number of batches from len(range); the previous
            # hand-computed total was one too many for exact multiples of the
            # batch size.
            batch_starts = tqdm.tqdm(batch_starts, desc=f"Encoding {vectors_type} inputs:")

        all_embeddings = []
        all_input_ids = []
        for start in batch_starts:
            tokenized = self.tokenizer(inputs[start:start + self.max_batch_size], return_tensors="pt", padding=True)
            # Truncate every tokenizer output along the sequence axis and move
            # it to the model's device.
            batch = {key: value[:, :max_length].to(self.device) for key, value in tokenized.items()}
            with torch.inference_mode():
                batch_embeddings = self.model(**batch)
            all_embeddings.append(batch_embeddings)
            if return_input_ids:
                all_input_ids.extend(batch["input_ids"].cpu().tolist())
        return {
            "embeddings": torch.cat(all_embeddings, dim=0),
            "input_ids": all_input_ids
        }
``` |