orionweller committed on
Commit
b7e679e
·
1 Parent(s): a907241
Files changed (3) hide show
  1. __pycache__/app.cpython-310.pyc +0 -0
  2. app.py +113 -97
  3. requirements.txt +2 -1
__pycache__/app.cpython-310.pyc ADDED
Binary file (7.39 kB). View file
 
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pickle
3
  import numpy as np
4
  import glob
5
- from tqdm import tqdm
6
  import torch
7
  import torch.nn.functional as F
8
  from transformers import AutoTokenizer, AutoModel
@@ -13,50 +13,81 @@ import os
13
  import json
14
  import spaces
15
  import ir_datasets
16
- import subprocess
 
17
 
18
  # Set up logging
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
 
 
 
22
  # Global variables
23
  CUR_MODEL = "orionweller/repllama-instruct-hard-positives-v2-joint"
24
- base_model = "meta-llama/Llama-2-7b-hf"
25
  tokenizer = None
26
  model = None
27
- retriever = None
28
- corpus_lookup = None
29
- queries = None
30
- q_lookup = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def load_model():
33
  global tokenizer, model
34
- tokenizer = AutoTokenizer.from_pretrained(base_model)
35
  tokenizer.pad_token_id = tokenizer.eos_token_id
36
  tokenizer.pad_token = tokenizer.eos_token
37
  tokenizer.padding_side = "right"
38
 
39
- base_model_instance = AutoModel.from_pretrained("meta-llama/Llama-2-7b-hf")
40
  model = PeftModel.from_pretrained(base_model_instance, CUR_MODEL)
41
  model = model.merge_and_unload()
42
  model.eval()
43
  model.cuda()
44
 
45
  def load_corpus_embeddings(dataset_name):
46
- global retriever, corpus_lookup
47
- corpus_path = f"{dataset_name}/corpus_emb*"
48
  index_files = glob.glob(corpus_path)
49
- logger.info(f'Pattern match found {len(index_files)} files; loading them into index.')
50
 
51
  p_reps_0, p_lookup_0 = pickle_load(index_files[0])
52
- retriever = FaissFlatSearcher(p_reps_0)
53
 
54
  shards = [(p_reps_0, p_lookup_0)] + [pickle_load(f) for f in index_files[1:]]
55
- corpus_lookup = []
56
 
57
- for p_reps, p_lookup in tqdm(shards, desc='Loading shards into index', total=len(index_files)):
58
- retriever.add(p_reps)
59
- corpus_lookup += p_lookup
60
 
61
  def pickle_load(path):
62
  with open(path, 'rb') as f:
@@ -64,119 +95,104 @@ def pickle_load(path):
64
  return np.array(reps), lookup
65
 
66
  def load_queries(dataset_name):
67
- global queries, q_lookup
68
- dataset = ir_datasets.load(f"beir/{dataset_name.lower()}/test")
69
 
70
- queries = []
71
- q_lookup = {}
 
72
  for query in dataset.queries_iter():
73
- queries.append(query.text)
74
- q_lookup[query.query_id] = query.text
 
 
 
 
 
75
 
76
- def encode_queries(prefix, postfix):
77
- global queries
78
- input_texts = [f"{prefix}Query: {query} {postfix}".strip() for query in queries]
 
 
79
 
80
  encoded_embeds = []
81
- batch_size = 32 # Adjust as needed
82
 
83
- for start_idx in range(0, len(input_texts), batch_size):
84
  batch_input_texts = input_texts[start_idx: start_idx + batch_size]
85
 
86
- inputs = tokenizer(batch_input_texts, padding=True, truncation=True, return_tensors="pt").to(model.device)
87
-
88
- with torch.no_grad():
89
- outputs = model(**inputs)
90
- embeds = outputs.last_hidden_state[:, 0, :] # Use [CLS] token embedding
 
91
  embeds = F.normalize(embeds, p=2, dim=-1)
92
  encoded_embeds.append(embeds.cpu().numpy())
93
 
94
  return np.concatenate(encoded_embeds, axis=0)
95
 
96
- def search_queries(q_reps, depth=1000):
97
- all_scores, all_indices = retriever.search(q_reps, depth)
98
- psg_indices = [[str(corpus_lookup[x]) for x in q_dd] for q_dd in all_indices]
99
  return all_scores, np.array(psg_indices)
100
 
101
- def write_ranking(corpus_indices, corpus_scores, ranking_save_file):
102
- with open(ranking_save_file, 'w') as f:
103
- for qid, q_doc_scores, q_doc_indices in zip(q_lookup.keys(), corpus_scores, corpus_indices):
104
- score_list = [(s, idx) for s, idx in zip(q_doc_scores, q_doc_indices)]
105
- score_list = sorted(score_list, key=lambda x: x[0], reverse=True)
106
- for rank, (s, idx) in enumerate(score_list, 1):
107
- f.write(f'{qid} Q0 {idx} {rank} {s} pyserini\n')
108
-
109
- def evaluate_with_subprocess(dataset, ranking_file):
110
- # Convert to TREC format
111
- trec_file = f"rank.{dataset}.trec"
112
- convert_cmd = [
113
- "python", "-m", "tevatron.utils.format.convert_result_to_trec",
114
- "--input", ranking_file,
115
- "--output", trec_file,
116
- "--remove_query"
117
- ]
118
- subprocess.run(convert_cmd, check=True)
119
-
120
- # Evaluate using trec_eval
121
- eval_cmd = [
122
- "python", "-m", "pyserini.eval.trec_eval",
123
- "-c", "-mrecall.100", "-mndcg_cut.10",
124
- f"beir-v1.0.0-{dataset}-test", trec_file
125
- ]
126
- result = subprocess.run(eval_cmd, capture_output=True, text=True, check=True)
127
-
128
- # Parse the output
129
- lines = result.stdout.strip().split('\n')
130
- ndcg_10 = float(lines[0].split()[-1])
131
- recall_100 = float(lines[1].split()[-1])
132
-
133
- # Clean up temporary files
134
- os.remove(ranking_file)
135
- os.remove(trec_file)
136
-
137
- return f"nDCG@10: {ndcg_10:.4f}, Recall@100: {recall_100:.4f}"
138
 
139
- @spaces.GPU
140
- def run_evaluation(dataset, prefix, postfix):
141
- global queries, q_lookup
 
 
 
 
 
 
142
 
143
- # Load corpus embeddings and queries if not already loaded
144
- if retriever is None or queries is None:
145
  load_corpus_embeddings(dataset)
146
  load_queries(dataset)
147
 
148
- # Encode queries
149
- q_reps = encode_queries(prefix, postfix)
150
 
151
- # Search
152
- all_scores, psg_indices = search_queries(q_reps)
153
 
154
- # Write ranking
155
- ranking_file = f"temp_ranking_{dataset}.txt"
156
- write_ranking(psg_indices, all_scores, ranking_file)
157
 
158
- # Evaluate
159
- results = evaluate_with_subprocess(dataset, ranking_file)
160
 
161
- return results
 
 
 
162
 
163
- def gradio_interface(dataset, prefix, postfix):
164
- return run_evaluation(dataset, prefix, postfix)
165
 
166
- # Load model
167
  load_model()
 
 
 
 
168
 
169
  # Create Gradio interface
170
  iface = gr.Interface(
171
  fn=gradio_interface,
172
  inputs=[
173
- gr.Dropdown(choices=["scifact", "arguana"], label="Dataset"),
174
- gr.Textbox(label="Prefix prompt"),
175
- gr.Textbox(label="Postfix prompt")
176
  ],
177
- outputs=gr.Textbox(label="Evaluation Results"),
178
- title="Query Evaluation with Custom Prompts",
179
- description="Select a dataset and enter prefix and postfix prompts to evaluate queries using Pyserini."
180
  )
181
 
182
  # Launch the interface
 
2
  import pickle
3
  import numpy as np
4
  import glob
5
+ import tqdm
6
  import torch
7
  import torch.nn.functional as F
8
  from transformers import AutoTokenizer, AutoModel
 
13
  import json
14
  import spaces
15
  import ir_datasets
16
+ import pytrec_eval
17
+ from huggingface_hub import login
18
 
19
  # Set up logging
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
23
+ # Authenticate with HF_TOKEN
24
+ login(token=os.environ['HF_TOKEN'])
25
+
26
  # Global variables
27
  CUR_MODEL = "orionweller/repllama-instruct-hard-positives-v2-joint"
28
+ BASE_MODEL = "meta-llama/Llama-2-7b-hf"
29
  tokenizer = None
30
  model = None
31
+ retrievers = {}
32
+ corpus_lookups = {}
33
+ queries = {}
34
+ q_lookups = {}
35
+ qrels = {}
36
+ datasets = ["scifact", "arguana"]
37
+ current_dataset = "scifact"
38
+
def pool(last_hidden_states, attention_mask):
    """Pick, for every sequence, the hidden state of its last attended token.

    Positions where ``attention_mask`` is 0 are zeroed first; the vector at
    index ``attention_mask.sum(dim=1) - 1`` is then gathered per row.

    NOTE(review): the index arithmetic only lands on the final real token
    when padding is on the right — confirm callers never left-pad.
    """
    is_padding = attention_mask.unsqueeze(-1).bool().logical_not()
    zeroed = last_hidden_states.masked_fill(is_padding, 0.0)
    last_positions = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(zeroed.size(0), device=zeroed.device)
    return zeroed[row_ids, last_positions]
44
+
def create_batch_dict(tokenizer, input_texts, max_length=512):
    """Tokenize ``input_texts``, append EOS to each, and pad into tensors.

    Texts are truncated to ``max_length - 1`` tokens so that the EOS id
    appended afterwards keeps every sequence within ``max_length``. The
    batch is then padded (to a multiple of 8) via ``tokenizer.pad`` and
    returned as PyTorch tensors with an attention mask.
    """
    encoded = tokenizer(
        input_texts,
        truncation=True,
        padding=False,
        max_length=max_length - 1,  # reserve one slot for the EOS token
        return_attention_mask=False,
        return_token_type_ids=False,
    )

    with_eos = []
    for token_ids in encoded['input_ids']:
        with_eos.append(token_ids + [tokenizer.eos_token_id])
    encoded['input_ids'] = with_eos

    padded = tokenizer.pad(
        encoded,
        padding=True,
        pad_to_multiple_of=8,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return padded
62
 
def load_model():
    """Populate the module-level ``tokenizer`` and ``model``.

    The tokenizer comes from ``BASE_MODEL`` and is set to pad on the right
    with the EOS token. The model is ``BASE_MODEL`` with the ``CUR_MODEL``
    PEFT adapter merged in, switched to eval mode and moved to CUDA.
    """
    global tokenizer, model

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    # The base tokenizer is given EOS as its padding token.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    backbone = AutoModel.from_pretrained(BASE_MODEL)
    model = PeftModel.from_pretrained(backbone, CUR_MODEL)
    # Fold the adapter weights into the backbone.
    model = model.merge_and_unload()
    model.eval()
    model.cuda()
75
 
def load_corpus_embeddings(dataset_name):
    """Build the search index and id lookup for ``dataset_name``.

    Reads every pickle shard matching ``{dataset_name}/corpus_emb.*.pkl``,
    adds its embeddings to a ``FaissFlatSearcher`` and concatenates the shard
    lookups in the same order, then stores both in the module-level
    ``retrievers`` and ``corpus_lookups`` dicts.

    Raises:
        FileNotFoundError: if no shard file matches the pattern.
    """
    global retrievers, corpus_lookups
    corpus_path = f"{dataset_name}/corpus_emb.*.pkl"
    # Sort so the index layout does not depend on directory listing order.
    index_files = sorted(glob.glob(corpus_path))
    if not index_files:
        raise FileNotFoundError(
            f"No corpus embedding shards found matching '{corpus_path}'."
        )
    logger.info(f'Loading {len(index_files)} files into index for {dataset_name}.')

    p_reps_0, p_lookup_0 = pickle_load(index_files[0])
    # NOTE(review): the first shard is passed to the constructor and added
    # again in the loop below; this assumes the constructor only uses it to
    # size the index — confirm against FaissFlatSearcher.
    retriever = FaissFlatSearcher(p_reps_0)

    shards = [(p_reps_0, p_lookup_0)] + [pickle_load(f) for f in index_files[1:]]
    lookup = []

    for p_reps, p_lookup in tqdm.tqdm(shards, desc=f'Loading shards into index for {dataset_name}', total=len(index_files)):
        retriever.add(p_reps)
        lookup.extend(p_lookup)

    # Publish only after every shard loaded, so a failure part-way cannot
    # leave a partially filled entry that looks "already loaded".
    retrievers[dataset_name] = retriever
    corpus_lookups[dataset_name] = lookup
91
 
92
  def pickle_load(path):
93
  with open(path, 'rb') as f:
 
95
  return np.array(reps), lookup
96
 
def load_queries(dataset_name):
    """Load query texts and relevance judgments for ``dataset_name``.

    Fills the module-level dicts under the key ``dataset_name``:
    ``queries`` (list of query texts), ``q_lookups`` (query id -> text) and
    ``qrels`` (query id -> {doc id -> relevance}).
    """
    global queries, q_lookups, qrels
    # Normalise once so the dataset id and the split decision agree on case.
    name = dataset_name.lower()
    # Only scifact is loaded through an explicit "/test" split.
    split = "/test" if name == "scifact" else ""
    dataset = ir_datasets.load(f"beir/{name}{split}")

    texts = []
    text_by_id = {}
    for query in dataset.queries_iter():
        texts.append(query.text)
        text_by_id[query.query_id] = query.text

    judgments = {}
    for qrel in dataset.qrels_iter():
        judgments.setdefault(qrel.query_id, {})[qrel.doc_id] = qrel.relevance

    # Publish together once everything has been read successfully.
    queries[dataset_name] = texts
    q_lookups[dataset_name] = text_by_id
    qrels[dataset_name] = judgments
112
 
@spaces.GPU
def encode_queries(dataset_name, postfix):
    """Embed every query of ``dataset_name`` with ``postfix`` appended.

    Each query is rendered as ``"query: <text> <postfix>"`` (stripped),
    encoded in batches of 32, pooled with ``pool`` and L2-normalised.

    Returns:
        A NumPy array with one row per query, in the order of
        ``queries[dataset_name]``.
    """
    global queries, tokenizer, model
    model = model.cuda()
    input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[dataset_name]]

    encoded_embeds = []
    batch_size = 32

    for start_idx in tqdm.tqdm(range(0, len(input_texts), batch_size), desc="Encoding queries"):
        batch_input_texts = input_texts[start_idx: start_idx + batch_size]

        batch_dict = create_batch_dict(tokenizer, batch_input_texts)
        batch_dict = {k: v.to(model.device) for k, v in batch_dict.items()}

        # Inference only: disable autograd so no graph or activations are
        # retained, in addition to running under mixed precision.
        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = model(**batch_dict)
            embeds = pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            embeds = F.normalize(embeds, p=2, dim=-1)
        encoded_embeds.append(embeds.cpu().numpy())

    return np.concatenate(encoded_embeds, axis=0)
135
 
def search_queries(dataset_name, q_reps, depth=1000):
    """Retrieve the top ``depth`` passages for each query embedding.

    Returns the score matrix from the dataset's retriever together with an
    array of the matching passage ids (as strings), looked up through
    ``corpus_lookups[dataset_name]``.

    NOTE(review): if the index can return placeholder indices such as -1
    when fewer than ``depth`` hits exist, they would map to the last lookup
    entry — confirm against the searcher.
    """
    all_scores, all_indices = retrievers[dataset_name].search(q_reps, depth)
    psg_indices = []
    for hit_row in all_indices:
        row_ids = []
        for position in hit_row:
            row_ids.append(str(corpus_lookups[dataset_name][position]))
        psg_indices.append(row_ids)
    return all_scores, np.array(psg_indices)
140
 
def evaluate(qrels, results, k_values):
    """Compute mean nDCG@k and Recall@k over all evaluated queries.

    Args:
        qrels: query id -> {doc id -> relevance}.
        results: query id -> {doc id -> score}.
        k_values: cut-offs to report.

    Returns:
        Dict with ``"NDCG@k"`` and ``"Recall@k"`` entries for each k,
        rounded to three decimals.
    """
    ndcg_measures = {f"ndcg_cut.{k}" for k in k_values}
    recall_measures = {f"recall.{k}" for k in k_values}
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, ndcg_measures | recall_measures)
    scores = evaluator.evaluate(results)

    metrics = {}
    for k in k_values:
        # The per-query result keys use "_" where the measure names use ".".
        ndcg_per_query = [per_query[f"ndcg_cut_{k}"] for per_query in scores.values()]
        recall_per_query = [per_query[f"recall_{k}"] for per_query in scores.values()]
        metrics[f"NDCG@{k}"] = round(np.mean(ndcg_per_query), 3)
        metrics[f"Recall@{k}"] = round(np.mean(recall_per_query), 3)

    return metrics
153
+
def run_evaluation(dataset, postfix):
    """Encode, search and score all queries of ``dataset`` for one prompt.

    Loads the dataset's index and queries on first use, encodes the queries
    with ``postfix`` appended, retrieves passages and evaluates the ranking.

    Returns:
        Dict with the ``"NDCG@10"`` and ``"Recall@100"`` scores.
    """
    global current_dataset

    if dataset not in retrievers or dataset not in queries:
        load_corpus_embeddings(dataset)
        load_queries(dataset)

    current_dataset = dataset

    q_reps = encode_queries(dataset, postfix)
    all_scores, psg_indices = search_queries(dataset, q_reps)

    # Drop any hit whose id equals the query id, mirroring the
    # `--remove_query` step of the earlier TREC-conversion pipeline;
    # otherwise a corpus that contains the query text itself skews metrics.
    # NOTE(review): pairing relies on q_lookups keys being in the same order
    # (and count) as the encoded queries — holds while query ids are unique.
    results = {
        qid: {
            str(doc_id): float(score)
            for doc_id, score in zip(doc_ids, scores)
            if str(doc_id) != qid
        }
        for qid, scores, doc_ids in zip(q_lookups[dataset], all_scores, psg_indices)
    }

    metrics = evaluate(qrels[dataset], results, k_values=[10, 100])

    return {
        "NDCG@10": metrics["NDCG@10"],
        "Recall@100": metrics["Recall@100"]
    }
175
 
def gradio_interface(dataset, postfix):
    """Gradio callback: forward the form inputs to ``run_evaluation``."""
    evaluation_result = run_evaluation(dataset, postfix)
    return evaluation_result
178
 
179
+ # Load model and initial datasets
180
  load_model()
181
+ for dataset in datasets:
182
+ print(f"Loading dataset: {dataset}")
183
+ load_corpus_embeddings(dataset)
184
+ load_queries(dataset)
185
 
186
  # Create Gradio interface
187
  iface = gr.Interface(
188
  fn=gradio_interface,
189
  inputs=[
190
+ gr.Dropdown(choices=datasets, label="Dataset", value="scifact"),
191
+ gr.Textbox(label="Prompt")
 
192
  ],
193
+ outputs=gr.JSON(label="Evaluation Results"),
194
+ title="Promptriever Demo",
195
+ description="Select a dataset and enter a postfix prompt to evaluate the model's performance. Note: it takes about **ten seconds** for each dataset."
196
  )
197
 
198
  # Launch the interface
requirements.txt CHANGED
@@ -1,8 +1,9 @@
1
- gradio==4.39.0
2
  pyserini==0.23.0
3
  faiss-cpu==1.7.4
4
  torch==2.1.0
5
  ir_datasets
6
  peft==0.12.0
7
  ir_datasets==0.5.8
 
8
  tevatron @ git+https://github.com/texttron/tevatron@7d298b4
 
1
+ gradio==4.43.0
2
  pyserini==0.23.0
3
  faiss-cpu==1.7.4
4
  torch==2.1.0
5
  ir_datasets
6
  peft==0.12.0
7
  ir_datasets==0.5.8
8
+ pytrec_eval==0.5
9
  tevatron @ git+https://github.com/texttron/tevatron@7d298b4