Miaoran000 committed
Commit dcf13df
1 Parent(s): 9c08956

upload csv to leaderboard_results

.gitignore CHANGED
@@ -13,6 +13,7 @@ eval-results/
  auto_evals/
  eval-queue-bk/
  eval-results-bk/
+ eval-results-bk_hhem21/
 
  src/assets/model_counts.html
 
src/backend/evaluate_model.py CHANGED
@@ -112,13 +112,13 @@ class Evaluator:
 
          #update leaderboard_summaries.csv
          #first remove previous results for the current model
-         # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8')
-         # mask = existing_df['model'] == self.model
-         # existing_df = existing_df[~mask]
-         # print(existing_df.shape)
-         # summary_doc = set(existing_df['model'].values.tolist())
-         # print(summary_doc)
-         # # get new result
+         existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8')
+         mask = existing_df['model'] == self.model
+         existing_df = existing_df[~mask]
+         print(existing_df.shape)
+         summary_doc = set(existing_df['model'].values.tolist())
+         print(summary_doc)
+         # get new result
          leaderboard_summaries_df = source_summary_df
          leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
          leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
@@ -126,23 +126,17 @@
 
          # update leaderboard_summaries_with_scores.csv
          # BUG: get error when opening the file
-         # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
-         #                           encoding='utf-8', sep=",", quotechar='"', quoting=2)
-         # print(existing_df.shape)
-         # score_doc = set(existing_df['model'].values.tolist())
-         # print(score_doc)
-         # mask = existing_df['model'] == self.model
-         # existing_df = existing_df[~mask]
-         # # get new result
+         existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
+                                   encoding='utf-8', sep=",", quotechar='"', quoting=2)
+         print(existing_df.shape)
+         score_doc = set(existing_df['model'].values.tolist())
+         print(score_doc)
+         mask = existing_df['model'] == self.model
+         existing_df = existing_df[~mask]
+         # get new result
          leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
          leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
          leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
          print('leaderboard_summaries_with_scores.csv has been updated')
 
-         # for model in summary_doc:
-         #     if model not in score_doc:
-         #         print(f"{model} records missing in leaderboard_summaries_with_scores.csv")
-
-         # for model in score_doc:
-         #     if model not in summary_doc:
-         #         print(f"{model} records missing in leaderboard_summaries.csv")
+
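
Note on the hunks above: the re-enabled block reads leaderboard_summaries.csv and filters out the current model's rows, but the filtered frame is never written back before the new rows are appended with mode='a'. A minimal standalone sketch of the full remove-then-append pattern (illustrative only, not part of this commit; the helper name is made up):

    import os
    import pandas as pd

    def refresh_model_rows(working_path: str, model: str, new_rows: pd.DataFrame,
                           filename: str = "leaderboard_summaries.csv") -> None:
        """Drop any existing rows for `model`, then append the fresh results."""
        csv_path = os.path.join(working_path, filename)
        existing_df = pd.read_csv(csv_path, encoding="utf-8")
        existing_df = existing_df[existing_df["model"] != model]         # drop stale rows
        existing_df.to_csv(csv_path, index=False)                        # rewrite without them
        new_rows.to_csv(csv_path, mode="a", index=False, header=False)   # append the new rows
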
src/backend/model_operations.py CHANGED
@@ -27,7 +27,7 @@ import google.generativeai as genai
  import src.backend.util as util
  import src.envs as envs
 
- litellm.set_verbose=True
+ litellm.set_verbose=False
 
  # Set up basic configuration for logging
  logging.basicConfig(level=logging.INFO,
@@ -95,6 +95,7 @@ class SummaryGenerator:
          self.answer_rate = None
          self.exceptions = None
          self.local_model = None
+         self.local_pipeline = None
 
      def generate_summaries(self, df, save_path=None):
          """Generate summaries for a given DataFrame of source docs.
@@ -118,8 +119,9 @@
 
              system_prompt = envs.SYSTEM_PROMPT
              user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
+             _summary = None
 
-             while True:
+             while not _summary:
                  try:
                      _summary = self.generate_summary(system_prompt, user_prompt)
                      # print(f"Finish index {index}")
@@ -169,11 +171,22 @@
      def generate_summary(self, system_prompt: str, user_prompt: str):
          # Using Together AI API
          using_together_api = False
-         together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3', 'qwen'] #, 'mistralai'
-         for together_ai_api_model in together_ai_api_models:
-             if together_ai_api_model in self.model_id.lower():
-                 using_together_api = True
+         together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3-', 'qwen'] #, 'mistralai'
+         using_replicate_api = False
+         replicate_api_models = ['snowflake', 'llama-3.1-405b']
+
+         for replicate_api_model in replicate_api_models:
+             if replicate_api_model in self.model_id.lower():
+                 using_replicate_api = True
                  break
+
+         if not using_replicate_api:
+             for together_ai_api_model in together_ai_api_models:
+                 if together_ai_api_model in self.model_id.lower():
+                     using_together_api = True
+                     break
+
+
          # if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
          if using_together_api:
              # print('using together api')
@@ -269,24 +282,33 @@
              print(result)
              return result
 
-         elif 'snowflake' in self.model_id.lower():
+         elif using_replicate_api:
              print("using replicate")
-             input = {
-                 "prompt": user_prompt,
-                 "temperature": 0,
-                 "max_new_tokens": 250,
-                 "stop_sequences": "<|im_end|>",
-                 "prompt_template": f"<|im_start|>system\n{system_prompt}<|im_end|>\n" + "<|im_start|>user\n{prompt}<|im_end|>\n\n<|im_start|>assistant\n",
-             }
+             if 'snowflake' in self.model_id.lower():
+                 input = {
+                     "prompt": user_prompt,
+                     "temperature": 0,
+                     "max_new_tokens": 250,
+                     "stop_sequences": "<|im_end|>",
+                     "prompt_template": f"<|im_start|>system\n{system_prompt}<|im_end|>\n" + "<|im_start|>user\n{prompt}<|im_end|>\n\n<|im_start|>assistant\n",
+                 }
+             else:
+                 input = {
+                     "prompt": user_prompt,
+                     "system_prompt": system_prompt,
+                     "temperature": 0,
+                     "max_new_tokens": 250
+                 }
              response = replicate.run(
-                 self.model_id.lower(),
+                 self.model_id,
                  input=input
              )
+             # print(response)
              if isinstance(response, list):
                  response = ''.join(response)
-             print(response)
-             print()
-
+             # print(response)
+             # print()
+             print(response)
              return response
 
          elif 'claude' in self.model_id.lower(): # using anthropic api
@@ -313,22 +335,11 @@
              return result
 
          # Using HF API or download checkpoints
-         elif self.local_model is None:
-             # response = litellm.completion(
-             #     model='command-r-plus' if 'command' in self.model else self.model,
-             #     messages=[{"role": "system", "content": system_prompt},
-             #               {"role": "user", "content": user_prompt}],
-             #     temperature=0.0,
-             #     max_tokens=256,
-             #     api_base=self.api_base,
-             # )
-             # result = response['choices'][0]['message']['content']
-             # print(result)
-             # return result
+         elif self.local_model is None and self.local_pipeline is None:
              try: # try use HuggingFace API
-                 print('using huggingface api')
+                 print('** using huggingface api')
                  response = litellm.completion(
-                     model='command-r-plus' if 'command' in self.model_id else self.model_id,
+                     model=self.model,
                      messages=[{"role": "system", "content": system_prompt},
                                {"role": "user", "content": user_prompt}],
                      temperature=0.0,
@@ -345,13 +356,35 @@
                      print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
                      time.sleep(wait_time)
                  else:
-                     self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
-                     print("Tokenizer loaded")
-                     self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
-                     print("Local model loaded")
+                     try:
+                         self.local_pipeline = pipeline(
+                             "text-generation",
+                             model=self.model_id,
+                             model_kwargs={"torch_dtype": torch.bfloat16},
+                             device_map="auto",
+                         )
+                     except:
+                         self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
+                         print("Tokenizer loaded")
+                         self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
+                         print("Local model loaded")
 
-         # Using local model
-         if self.local_model: # cannot call API. using local model
+
+         # Using local model/pipeline
+         if self.local_pipeline:
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt}
+             ]
+             outputs = self.local_pipeline(
+                 messages,
+                 max_new_tokens=250,
+             )
+             result = outputs[0]["generated_text"][-1]['content']
+             print(result)
+             return result
+
+         elif self.local_model: # cannot call API. using local model / pipeline
              if 'gemma' in self.model_id.lower() or 'mistral-7b' in self.model_id.lower():
                  messages=[
                      # gemma-1.1, mistral-7b does not accept system role
@@ -361,10 +394,10 @@
 
              elif 'phi-2' in self.model_id.lower():
                  prompt = system_prompt + '\n' + user_prompt
-
+
              else:
                  messages=[
-                     {"role": "system", "content": system_prompt}, # gemma-1.1, mistral-7b does not accept system role
+                     {"role": "system", "content": system_prompt},
                      {"role": "user", "content": user_prompt}
                  ]
                  prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
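
The local-generation fallback above now tries a transformers text-generation pipeline first and only drops back to AutoTokenizer/AutoModelForCausalLM if building the pipeline fails. A minimal sketch of the pipeline path in isolation (illustrative only; the model id is just an example, and a recent transformers release that accepts chat-style message lists is assumed):

    import torch
    from transformers import pipeline

    generator = pipeline(
        "text-generation",
        model="HuggingFaceH4/zephyr-7b-beta",            # example model id, not from this commit
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )

    messages = [
        {"role": "system", "content": "You are a concise summarizer."},
        {"role": "user", "content": "Summarize the passage: ..."},
    ]
    outputs = generator(messages, max_new_tokens=250)
    # For chat-style input the pipeline returns the full message list;
    # the last entry is the newly generated assistant turn.
    print(outputs[0]["generated_text"][-1]["content"])
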
src/backend/run_eval_suite.py CHANGED
@@ -50,6 +50,14 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
          results = evaluator.evaluate()
          if write_results:
              evaluator.write_results()
+         # upload leaderboard_summaries.csv to HF
+         envs.API.upload_file(
+             path_or_fileobj=envs.LEADERBOARD_DATASET_PATH,
+             path_in_repo=envs.LEADERBOARD_DATASET_PATH.split('/')[-1],
+             repo_id=envs.LEADERBOARD_DATASET_REPO,
+             repo_type="dataset",
+         )
+
      except Exception as e:
          logging.error(f"Error during evaluation: {e}")
          raise
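
The new block uploads the refreshed CSV to the leaderboard_results dataset repo after each evaluation. A small companion sketch for pulling that file back down for inspection (illustrative only; repo id and filename follow the constants added in src/envs.py below):

    import pandas as pd
    from huggingface_hub import hf_hub_download

    # The upload above places the file at the repo root under its bare filename.
    local_csv = hf_hub_download(
        repo_id="vectara/leaderboard_results",
        filename="leaderboard_summaries.csv",
        repo_type="dataset",
    )
    df = pd.read_csv(local_csv, encoding="utf-8")
    print(df["model"].value_counts())
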
src/envs.py CHANGED
@@ -10,6 +10,7 @@ OWNER = "vectara"
  REPO_ID = f"{OWNER}/leaderboard"
  QUEUE_REPO = f"{OWNER}/requests"
  RESULTS_REPO = f"{OWNER}/results"
+ LEADERBOARD_DATASET_REPO = f"{OWNER}/leaderboard_results"
 
  CACHE_PATH=os.getenv("HF_HOME", ".")
 
@@ -22,6 +23,7 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
  API = HfApi(token=TOKEN)
 
+ LEADERBOARD_DATASET_PATH = "Hallucination Leaderboard Results/leaderboard_summaries.csv"
  DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
  SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
  HEM_PATH = 'vectara/hallucination_evaluation_model'
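
The two added constants are what the upload call in run_eval_suite.py resolves at runtime. A quick sanity-check sketch (illustrative only; assumes it runs from the repo root so src.envs imports):

    import src.envs as envs

    # path_in_repo is derived from the local path's bare filename.
    assert envs.LEADERBOARD_DATASET_REPO == "vectara/leaderboard_results"
    assert envs.LEADERBOARD_DATASET_PATH.split('/')[-1] == "leaderboard_summaries.csv"
    print(f"{envs.LEADERBOARD_DATASET_PATH} -> {envs.LEADERBOARD_DATASET_REPO}")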