kalpeshk2011 committed on
Commit
f6e521c
1 Parent(s): 2d12088

Update README.md

Files changed (1)
  1. README.md +92 -3
README.md CHANGED
## Main repository

https://github.com/martiansideofthemoon/rankgen

## What is RankGen?

RankGen is a suite of encoder models (100M-1.2B parameters) that map prefixes and generations from any pretrained English language model into a shared vector space. RankGen can be used to rerank multiple full-length samples from a language model, and it can also be incorporated as a scoring function into beam search to significantly improve generation quality (0.85 vs. 0.77 MAUVE, 75% preference according to human annotators who are English writers).
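To make the reranking idea concrete, here is a minimal sketch of the scoring step: candidate continuations are embedded as suffixes, the prefix is embedded separately, and candidates are ranked by the dot product between the two vectors in the shared space. It assumes the `RankGenEncoder` wrapper defined later in this README; the example strings are made up.

```
import torch

encoder = RankGenEncoder("kalpeshk2011/rankgen-t5-base-all")

prefix = "He stared at the door, wondering who could be knocking this late."
candidates = [
    "Slowly, he reached for the rusted handle and pulled it open.",
    "The recipe calls for two cups of flour and a pinch of salt.",
]

# Embed the prefix and all candidate suffixes in the shared vector space
prefix_vec = encoder.encode([prefix], vectors_type="prefix")["embeddings"]     # shape (1, d)
suffix_vecs = encoder.encode(candidates, vectors_type="suffix")["embeddings"]  # shape (n, d)

# Rank candidates by dot-product compatibility with the prefix
scores = suffix_vecs @ prefix_vec.squeeze(0)  # shape (n,)
print(candidates[scores.argmax().item()])
```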
## Using RankGen

Loading RankGen is simple using the HuggingFace APIs (see Method-2 below), but we suggest using [`RankGenEncoder`](https://github.com/martiansideofthemoon/rankgen/blob/master/rankgen/rankgen_encoder.py), a small wrapper around the HuggingFace APIs that correctly preprocesses data and handles tokenization automatically. You can either download the repository and install the API, or copy the implementation from [below](#rankgenencoder-implementation).

#### [SUGGESTED] Method-1: Loading the model with RankGenEncoder

```
from rankgen import RankGenEncoder, RankGenGenerator

rankgen_encoder = RankGenEncoder("kalpeshk2011/rankgen-t5-base-all")

# Encoding vectors
prefix_vectors = rankgen_encoder.encode(["This is a prefix sentence."], vectors_type="prefix")
suffix_vectors = rankgen_encoder.encode(["This is a suffix sentence."], vectors_type="suffix")

# Generating text
# use a HuggingFace compatible language model
generator = RankGenGenerator(rankgen_encoder=rankgen_encoder, language_model="gpt2-medium")

inputs = ["Whatever might be the nature of the tragedy it would be over with long before this, and those moving black spots away yonder to the west, that he had discerned from the bluff, were undoubtedly the departing raiders. There was nothing left for Keith to do except determine the fate of the unfortunates, and give their bodies decent burial. That any had escaped, or yet lived, was altogether unlikely, unless, perchance, women had been in the party, in which case they would have been borne away prisoners."]

# Baseline nucleus sampling
print(generator.generate_single(inputs, top_p=0.9)[0][0])
# Over-generate and re-rank
print(generator.overgenerate_rerank(inputs, top_p=0.9, num_samples=10)[0][0])
# Beam search
print(generator.beam_search(inputs, top_p=0.9, num_samples=10, beam_size=2)[0][0])
```

#### Method-2: Loading the model with HuggingFace APIs

```
from transformers import T5Tokenizer, AutoModel
tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base")
model = AutoModel.from_pretrained("kalpeshk2011/rankgen-t5-base-all", trust_remote_code=True)
```

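When the model is loaded this way, the `pre `/`suffi ` markers and the length truncation that `RankGenEncoder` normally applies have to be done by hand. A minimal sketch, mirroring the `encode` logic in the implementation below (the example string is made up):

```
import torch

# Prefixes are marked with "pre " and truncated to 512 tokens
# (suffixes would use "suffi " and 128 tokens instead; see encode() below)
batch = tokenizer(["pre This is a prefix sentence."], return_tensors="pt", padding=True)
batch = {k: v[:, :512] for k, v in batch.items()}

with torch.inference_mode():
    prefix_vector = model(**batch)  # the RankGen model returns the embedding directly
```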
### RankGenEncoder Implementation

```
import torch
import tqdm
from transformers import T5Tokenizer, T5EncoderModel, AutoModel

class RankGenEncoder():
    def __init__(self, model_path, max_batch_size=32, model_size=None, cache_dir=None):
        assert model_path in ["kalpeshk2011/rankgen-t5-xl-all", "kalpeshk2011/rankgen-t5-xl-pg19", "kalpeshk2011/rankgen-t5-base-all", "kalpeshk2011/rankgen-t5-large-all"]
        self.max_batch_size = max_batch_size
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Infer the T5 size from the checkpoint name unless it is given explicitly
        if model_size is None:
            if "t5-large" in model_path or "t5_large" in model_path:
                self.model_size = "large"
            elif "t5-xl" in model_path or "t5_xl" in model_path:
                self.model_size = "xl"
            else:
                self.model_size = "base"
        else:
            self.model_size = model_size

        self.tokenizer = T5Tokenizer.from_pretrained(f"google/t5-v1_1-{self.model_size}", cache_dir=cache_dir)
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        self.model.to(self.device)
        self.model.eval()

    def encode(self, inputs, vectors_type="prefix", verbose=False, return_input_ids=False):
        tokenizer = self.tokenizer
        max_batch_size = self.max_batch_size
        if isinstance(inputs, str):
            inputs = [inputs]
        # Prefixes and suffixes are marked with different special tokens
        # and truncated to different maximum lengths
        if vectors_type == 'prefix':
            inputs = ['pre ' + input for input in inputs]
            max_length = 512
        else:
            inputs = ['suffi ' + input for input in inputs]
            max_length = 128

        all_embeddings = []
        all_input_ids = []
        for i in tqdm.tqdm(range(0, len(inputs), max_batch_size), total=(len(inputs) // max_batch_size) + 1, disable=not verbose, desc=f"Encoding {vectors_type} inputs:"):
            tokenized_inputs = tokenizer(inputs[i:i + max_batch_size], return_tensors="pt", padding=True)
            # Truncate each batch to the maximum length for this vector type
            for k, v in tokenized_inputs.items():
                tokenized_inputs[k] = v[:, :max_length]
            tokenized_inputs = tokenized_inputs.to(self.device)
            with torch.inference_mode():
                batch_embeddings = self.model(**tokenized_inputs)
            all_embeddings.append(batch_embeddings)
            if return_input_ids:
                all_input_ids.extend(tokenized_inputs.input_ids.cpu().tolist())
        return {
            "embeddings": torch.cat(all_embeddings, dim=0),
            "input_ids": all_input_ids
        }
```
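
As a quick sanity check after copying the class, one can encode a single string (the `encode` method accepts a bare string as well as a list) and inspect the output; this mirrors the Method-1 usage above:

```
encoder = RankGenEncoder("kalpeshk2011/rankgen-t5-base-all")
out = encoder.encode("A single string works too.", vectors_type="prefix", return_input_ids=True)
print(out["embeddings"].shape)  # (1, embedding_dim)
print(len(out["input_ids"]))    # 1 tokenized input
```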