Spaces (status: Sleeping)

orionweller committed · Commit b7e679e · Parent(s): a907241

working

Browse files:
- __pycache__/app.cpython-310.pyc +0 -0
- app.py +113 -97
- requirements.txt +2 -1
__pycache__/app.cpython-310.pyc ADDED
Binary file (7.39 kB)
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import pickle
 import numpy as np
 import glob
-from tqdm import tqdm
+import tqdm
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModel
@@ -13,50 +13,81 @@ import os
 import json
 import spaces
 import ir_datasets
-import subprocess
+import pytrec_eval
+from huggingface_hub import login

 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+# Authenticate with HF_TOKEN
+login(token=os.environ['HF_TOKEN'])
+
 # Global variables
 CUR_MODEL = "orionweller/repllama-instruct-hard-positives-v2-joint"
-…
+BASE_MODEL = "meta-llama/Llama-2-7b-hf"
 tokenizer = None
 model = None
-…
-…
-queries = …
-…
+retrievers = {}
+corpus_lookups = {}
+queries = {}
+q_lookups = {}
+qrels = {}
+datasets = ["scifact", "arguana"]
+current_dataset = "scifact"
+
+def pool(last_hidden_states, attention_mask):
+    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    sequence_lengths = attention_mask.sum(dim=1) - 1
+    batch_size = last_hidden.shape[0]
+    return last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
+
+def create_batch_dict(tokenizer, input_texts, max_length=512):
+    batch_dict = tokenizer(
+        input_texts,
+        max_length=max_length - 1,
+        return_token_type_ids=False,
+        return_attention_mask=False,
+        padding=False,
+        truncation=True
+    )
+    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
+    return tokenizer.pad(
+        batch_dict,
+        padding=True,
+        pad_to_multiple_of=8,
+        return_attention_mask=True,
+        return_tensors="pt",
+    )

 def load_model():
     global tokenizer, model
-    tokenizer = AutoTokenizer.from_pretrained(…
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
     tokenizer.pad_token_id = tokenizer.eos_token_id
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "right"

-    base_model_instance = AutoModel.from_pretrained(…
+    base_model_instance = AutoModel.from_pretrained(BASE_MODEL)
     model = PeftModel.from_pretrained(base_model_instance, CUR_MODEL)
     model = model.merge_and_unload()
     model.eval()
     model.cuda()

 def load_corpus_embeddings(dataset_name):
-    global …
-    corpus_path = f"{dataset_name}/corpus_emb…
+    global retrievers, corpus_lookups
+    corpus_path = f"{dataset_name}/corpus_emb.*.pkl"
     index_files = glob.glob(corpus_path)
-    logger.info(f'…
+    logger.info(f'Loading {len(index_files)} files into index for {dataset_name}.')

     p_reps_0, p_lookup_0 = pickle_load(index_files[0])
-    …
+    retrievers[dataset_name] = FaissFlatSearcher(p_reps_0)

     shards = [(p_reps_0, p_lookup_0)] + [pickle_load(f) for f in index_files[1:]]
-    …
+    corpus_lookups[dataset_name] = []

-    for p_reps, p_lookup in tqdm(shards, desc='Loading shards into index', total=len(index_files)):
-        …
-        …
+    for p_reps, p_lookup in tqdm.tqdm(shards, desc=f'Loading shards into index for {dataset_name}', total=len(index_files)):
+        retrievers[dataset_name].add(p_reps)
+        corpus_lookups[dataset_name] += p_lookup

 def pickle_load(path):
     with open(path, 'rb') as f:
@@ -64,119 +95,104 @@ def pickle_load(path):
     return np.array(reps), lookup

 def load_queries(dataset_name):
-    global queries, …
-    dataset = ir_datasets.load(f"beir/{dataset_name.lower()}/test")
+    global queries, q_lookups, qrels
+    dataset = ir_datasets.load(f"beir/{dataset_name.lower()}" + ("/test" if dataset_name == "scifact" else ""))

-    queries = []
-    …
+    queries[dataset_name] = []
+    q_lookups[dataset_name] = {}
+    qrels[dataset_name] = {}
     for query in dataset.queries_iter():
-        queries.append(query.text)
-        …
+        queries[dataset_name].append(query.text)
+        q_lookups[dataset_name][query.query_id] = query.text
+
+    for qrel in dataset.qrels_iter():
+        if qrel.query_id not in qrels[dataset_name]:
+            qrels[dataset_name][qrel.query_id] = {}
+        qrels[dataset_name][qrel.query_id][qrel.doc_id] = qrel.relevance

-…
-…
-…
+@spaces.GPU
+def encode_queries(dataset_name, postfix):
+    global queries, tokenizer, model
+    model = model.cuda()
+    input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[dataset_name]]

     encoded_embeds = []
-    batch_size = 32
+    batch_size = 32

-    for start_idx in range(0, len(input_texts), batch_size):
+    for start_idx in tqdm.tqdm(range(0, len(input_texts), batch_size), desc="Encoding queries"):
         batch_input_texts = input_texts[start_idx: start_idx + batch_size]

-        …
-        …
-        …
-        …
-        …
+        batch_dict = create_batch_dict(tokenizer, batch_input_texts)
+        batch_dict = {k: v.to(model.device) for k, v in batch_dict.items()}
+
+        with torch.cuda.amp.autocast():
+            outputs = model(**batch_dict)
+            embeds = pool(outputs.last_hidden_state, batch_dict['attention_mask'])
             embeds = F.normalize(embeds, p=2, dim=-1)
             encoded_embeds.append(embeds.cpu().numpy())

     return np.concatenate(encoded_embeds, axis=0)

-def search_queries(q_reps, depth=1000):
-    all_scores, all_indices = …
-    psg_indices = [[str(…
+def search_queries(dataset_name, q_reps, depth=1000):
+    all_scores, all_indices = retrievers[dataset_name].search(q_reps, depth)
+    psg_indices = [[str(corpus_lookups[dataset_name][x]) for x in q_dd] for q_dd in all_indices]
     return all_scores, np.array(psg_indices)

-def …
-    …
-    …
-    …
-    …
-        for rank, (s, idx) in enumerate(score_list, 1):
-            f.write(f'{qid} Q0 {idx} {rank} {s} pyserini\n')
-
-def evaluate_with_subprocess(dataset, ranking_file):
-    # Convert to TREC format
-    trec_file = f"rank.{dataset}.trec"
-    convert_cmd = [
-        "python", "-m", "tevatron.utils.format.convert_result_to_trec",
-        "--input", ranking_file,
-        "--output", trec_file,
-        "--remove_query"
-    ]
-    subprocess.run(convert_cmd, check=True)
-
-    # Evaluate using trec_eval
-    eval_cmd = [
-        "python", "-m", "pyserini.eval.trec_eval",
-        "-c", "-mrecall.100", "-mndcg_cut.10",
-        f"beir-v1.0.0-{dataset}-test", trec_file
-    ]
-    result = subprocess.run(eval_cmd, capture_output=True, text=True, check=True)
-
-    # Parse the output
-    lines = result.stdout.strip().split('\n')
-    ndcg_10 = float(lines[0].split()[-1])
-    recall_100 = float(lines[1].split()[-1])
-
-    # Clean up temporary files
-    os.remove(ranking_file)
-    os.remove(trec_file)
-
-    return f"nDCG@10: {ndcg_10:.4f}, Recall@100: {recall_100:.4f}"
+def evaluate(qrels, results, k_values):
+    evaluator = pytrec_eval.RelevanceEvaluator(
+        qrels, {f"ndcg_cut.{k}" for k in k_values} | {f"recall.{k}" for k in k_values}
+    )
+    scores = evaluator.evaluate(results)

-
-
-…
+    metrics = {}
+    for k in k_values:
+        metrics[f"NDCG@{k}"] = round(np.mean([query_scores[f"ndcg_cut_{k}"] for query_scores in scores.values()]), 3)
+        metrics[f"Recall@{k}"] = round(np.mean([query_scores[f"recall_{k}"] for query_scores in scores.values()]), 3)
+
+    return metrics
+
+def run_evaluation(dataset, postfix):
+    global current_dataset

-…
-    if retriever is None or queries is None:
+    if dataset not in retrievers or dataset not in queries:
         load_corpus_embeddings(dataset)
         load_queries(dataset)

-    q_reps = encode_queries(prefix, postfix)
+    current_dataset = dataset

-    all_scores, psg_indices = search_queries(q_reps)
+    q_reps = encode_queries(dataset, postfix)
+    all_scores, psg_indices = search_queries(dataset, q_reps)

-…
-    write_ranking(psg_indices, all_scores, ranking_file)
+    results = {qid: dict(zip(doc_ids, map(float, scores)))
+               for qid, scores, doc_ids in zip(q_lookups[dataset].keys(), all_scores, psg_indices)}

-    results = evaluate_with_subprocess(dataset, ranking_file)
+    metrics = evaluate(qrels[dataset], results, k_values=[10, 100])

-    return …
+    return {
+        "NDCG@10": metrics["NDCG@10"],
+        "Recall@100": metrics["Recall@100"]
+    }

-def gradio_interface(dataset, …
-    return run_evaluation(dataset, …
+def gradio_interface(dataset, postfix):
+    return run_evaluation(dataset, postfix)

-# Load model
+# Load model and initial datasets
 load_model()
+for dataset in datasets:
+    print(f"Loading dataset: {dataset}")
+    load_corpus_embeddings(dataset)
+    load_queries(dataset)

 # Create Gradio interface
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.Dropdown(choices=…
-        gr.Textbox(label="…
-        gr.Textbox(label="Postfix prompt")
+        gr.Dropdown(choices=datasets, label="Dataset", value="scifact"),
+        gr.Textbox(label="Prompt")
     ],
-    outputs=gr.…
-    title="…
-    description="Select a dataset and enter …
+    outputs=gr.JSON(label="Evaluation Results"),
+    title="Promptriever Demo",
+    description="Select a dataset and enter a postfix prompt to evaluate the model's performance. Note: it takes about **ten seconds** for each dataset."
 )

 # Launch the interface
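Note on the new pool() helper: with right-side padding and the EOS token that create_batch_dict() appends, attention_mask.sum(dim=1) - 1 is the index of each sequence's last real token, so the final hidden state at that (EOS) position becomes the embedding. A minimal sketch with toy tensors (shapes are illustrative, not the app's):

import torch

# Two right-padded sequences: 1 marks a real token, 0 marks padding.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])
hidden = torch.arange(2 * 4 * 3, dtype=torch.float).reshape(2, 4, 3)

# Same logic as pool(): zero the padded positions, then gather the
# hidden state at each sequence's last real (EOS) position.
last_hidden = hidden.masked_fill(~attention_mask[..., None].bool(), 0.0)
sequence_lengths = attention_mask.sum(dim=1) - 1         # tensor([2, 1])
pooled = last_hidden[torch.arange(2), sequence_lengths]  # shape (2, 3)
print(pooled)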
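Note on create_batch_dict(): tokenizing to max_length - 1 and then appending tokenizer.eos_token_id guarantees every sequence still ends with EOS after truncation, which is exactly the position pool() reads; pad_to_multiple_of=8 keeps padded shapes tensor-core friendly. A hedged usage sketch, assuming create_batch_dict from this commit is in scope and that you can load a Llama-style tokenizer (BASE_MODEL is gated):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # gated; any Llama-style tokenizer behaves the same
tok.pad_token = tok.eos_token
tok.padding_side = "right"

batch = create_batch_dict(tok, ["query: effect of aspirin", "query: scifact claim"])
# input_ids is right-padded to a multiple of 8; each row's last real token is EOS.
print(batch["input_ids"].shape, batch["attention_mask"].shape)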
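FaissFlatSearcher itself is not defined in this diff (it comes from the unchanged imports, presumably tevatron's searcher built on the pinned faiss-cpu). Under that assumption, its role here is an exact inner-product index, which plain faiss reproduces; since the query embeddings are L2-normalized, inner product equals cosine similarity:

import numpy as np
import faiss  # faiss-cpu==1.7.4 is pinned in requirements.txt

dim = 16  # toy dimensionality; the real corpus embeddings are much wider
p_reps = np.random.rand(1000, dim).astype("float32")  # passage embeddings
q_reps = np.random.rand(8, dim).astype("float32")     # query embeddings

index = faiss.IndexFlatIP(dim)  # exact (flat) inner-product search
index.add(p_reps)               # mirrors retrievers[name].add(p_reps)
scores, indices = index.search(q_reps, 100)  # top-100 docs per query
print(scores.shape, indices.shape)           # (8, 100), (8, 100)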
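The reworked load_queries() leans on ir_datasets' BEIR wrappers, with a dataset-id quirk the new code handles inline: beir/scifact exposes a /test split while beir/arguana is a single split. A small standalone sketch of the same iteration (the first run downloads the data):

import ir_datasets

dataset = ir_datasets.load("beir/scifact/test")  # "beir/arguana" takes no /test suffix

queries = {q.query_id: q.text for q in dataset.queries_iter()}
qrels = {}
for qrel in dataset.qrels_iter():
    qrels.setdefault(qrel.query_id, {})[qrel.doc_id] = qrel.relevance

print(len(queries), "queries,", len(qrels), "judged queries")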
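The subprocess round-trip through tevatron's TREC converter and pyserini's trec_eval is replaced by in-process pytrec_eval. One detail worth knowing: measures are requested with dots (ndcg_cut.10) but returned with underscores (ndcg_cut_10), which is why evaluate() reads f"ndcg_cut_{k}". A toy qrels/run pair:

import pytrec_eval

qrels = {"q1": {"d1": 1, "d2": 0}}               # graded relevance judgments
run = {"q1": {"d1": 0.9, "d2": 0.4, "d3": 0.1}}  # retrieval scores per doc

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10", "recall.100"})
scores = evaluator.evaluate(run)
print(scores["q1"]["ndcg_cut_10"], scores["q1"]["recall_100"])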
requirements.txt CHANGED
@@ -1,8 +1,9 @@
-gradio==4.…
+gradio==4.43.0
 pyserini==0.23.0
 faiss-cpu==1.7.4
 torch==2.1.0
 ir_datasets
 peft==0.12.0
 ir_datasets==0.5.8
+pytrec_eval==0.5
 tevatron @ git+https://github.com/texttron/tevatron@7d298b4