# judge-arena/gen_api_answer.py
from openai import OpenAI
import anthropic
from together import Together
import cohere
import json
import re
import os
import requests
from prompts import (
JUDGE_SYSTEM_PROMPT,
PROMETHEUS_PROMPT,
PROMETHEUS_PROMPT_WITH_REFERENCE,
ATLA_PROMPT,
ATLA_PROMPT_WITH_REFERENCE,
FLOW_JUDGE_PROMPT
)
from transformers import AutoTokenizer
# Initialize clients
anthropic_client = anthropic.Anthropic()
openai_client = OpenAI()
together_client = Together()
hf_api_key = os.getenv("HF_API_KEY")
flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
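# Note: the OpenAI, Anthropic, and Together clients read their keys from the
# OPENAI_API_KEY, ANTHROPIC_API_KEY, and TOGETHER_API_KEY environment variables
# by default; only the HF, Flow Judge, and Cohere keys are fetched explicitly.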
def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
"""Get response from OpenAI API"""
try:
response = openai_client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
],
max_completion_tokens=max_tokens,
temperature=temperature,
)
return response.choices[0].message.content
except Exception as e:
return f"Error with OpenAI model {model_name}: {str(e)}"
def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
"""Get response from Anthropic API"""
try:
response = anthropic_client.messages.create(
model=model_name,
max_tokens=max_tokens,
temperature=temperature,
system=system_prompt,
messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
)
return response.content[0].text
except Exception as e:
return f"Error with Anthropic model {model_name}: {str(e)}"
def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
"""Get response from Together API"""
try:
response = together_client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
],
max_tokens=max_tokens,
temperature=temperature,
stream=False,
)
return response.choices[0].message.content
except Exception as e:
return f"Error with Together model {model_name}: {str(e)}"
def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
"""Get response from Hugging Face model"""
try:
headers = {
"Accept": "application/json",
"Authorization": f"Bearer {hf_api_key}",
"Content-Type": "application/json"
}
# Create messages list for chat template
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
# Apply chat template
model_id = "prometheus-eval/prometheus-7b-v2.0"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
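        # The rendered prompt now carries the model's chat markup (e.g. Mistral-style
        # [INST] tags for Prometheus), matching the format the model saw in training.
        # The tokenizer is re-loaded per call; after the first download it is served
        # from the local HF cache, though a module-level cache would avoid the reload.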
payload = {
"inputs": formatted_prompt,
"parameters": {
"max_new_tokens": max_tokens,
"return_full_text": False,
"temperature": temperature
}
}
        response = requests.post(
            "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
            headers=headers,
            json=payload
        )
        response.raise_for_status()
        return response.json()[0]["generated_text"]
except Exception as e:
return f"Error with Hugging Face model {model_name}: {str(e)}"
def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
"""Get response from HF endpoint for Atla model"""
try:
headers = {
"Accept": "application/json",
"Authorization": f"Bearer {hf_api_key}",
"Content-Type": "application/json"
}
# Create messages list for chat template
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
# Apply chat template
model_id = "AtlaAI/Atla-8B-preview"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        payload = {
            "inputs": formatted_prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "return_full_text": False,
                "temperature": temperature,
                "seed": 42
            }
        }
        response = requests.post(
            "https://azk0vbxyrc64s2v2.us-east-1.aws.endpoints.huggingface.cloud",
            headers=headers,
            json=payload
        )
        response.raise_for_status()
        return response.json()[0]["generated_text"]
except Exception as e:
return f"Error with Atla model {model_name}: {str(e)}"
def get_flow_judge_response(model_name, prompt, max_tokens=2048, temperature=0.1, top_p=0.95) -> str:
"""Get response from Flow Judge"""
try:
response = requests.post(
"https://arena.flow-ai.io/v1/chat/completions",
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {flow_judge_api_key}"
},
json={
"model": model_name,
"messages": [
{"role": "user", "content": prompt}
],
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"stop": None
}
)
response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
except Exception as e:
return f"Error with Flow Judge completions model {model_name}: {str(e)}"
def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
"""Get response from Cohere API"""
try:
response = cohere_client.chat(
model=model_name,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt}
],
max_tokens=max_tokens,
temperature=temperature
)
# Extract the text from the content items
content_items = response.message.content
if isinstance(content_items, list):
# Get the text from the first content item
return content_items[0].text
return str(content_items) # Fallback if it's not a list
except Exception as e:
return f"Error with Cohere model {model_name}: {str(e)}"
def get_model_response(
model_name,
model_info,
prompt_data,
use_reference=False,
max_tokens=500,
temperature=0
):
"""Get response from appropriate API based on model organization"""
if not model_info:
return "Model not found or unsupported."
api_model = model_info["api_model"]
organization = model_info["organization"]
    # Determine whether the model is Prometheus, Atla, or Flow Judge
    is_prometheus = (organization == "Prometheus")
    is_atla = (organization == "Atla")
    is_flow_judge = (organization == "Flow AI")
    # All models except Prometheus, Atla, and Flow Judge use the shared judge system prompt
    system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
# Select the appropriate base prompt
if is_atla:
base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
elif is_flow_judge:
base_prompt = FLOW_JUDGE_PROMPT
else:
base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
    # For models other than Prometheus, Atla, and Flow Judge, swap the output-format
    # instruction so the judge returns JSON instead of the "[RESULT]" format
if not (is_prometheus or is_atla or is_flow_judge):
base_prompt = base_prompt.replace(
'3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
'3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
)
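        # After this substitution the judge is expected to reply with, e.g.:
        #     {"feedback": "The answer is concise and correct.", "result": 5}
        # which is the shape parse_model_response() looks for.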
try:
if not is_flow_judge:
# Format the prompt with the provided data, only using available keys
final_prompt = base_prompt.format(
human_input=prompt_data['human_input'],
ai_response=prompt_data['ai_response'],
ground_truth_input=prompt_data.get('ground_truth_input', ''),
eval_criteria=prompt_data['eval_criteria'],
score1_desc=prompt_data['score1_desc'],
score2_desc=prompt_data['score2_desc'],
score3_desc=prompt_data['score3_desc'],
score4_desc=prompt_data['score4_desc'],
score5_desc=prompt_data['score5_desc']
)
else:
human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
            ground_truth = prompt_data.get('ground_truth_input', '')
if ground_truth:
response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
else:
response_reference = ""
eval_criteria = prompt_data['eval_criteria']
score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
if response_reference:
                inputs = human_input + "\n" + response_reference
else:
inputs = human_input
final_prompt = base_prompt.format(
INPUTS=inputs,
OUTPUT=ai_response,
EVALUATION_CRITERIA=eval_criteria,
RUBRIC=rubric
)
except KeyError as e:
return f"Error formatting prompt: Missing required field {str(e)}"
try:
if organization == "OpenAI":
return get_openai_response(
api_model, final_prompt, system_prompt, max_tokens, temperature
)
elif organization == "Anthropic":
return get_anthropic_response(
api_model, final_prompt, system_prompt, max_tokens, temperature
)
elif organization == "Prometheus":
return get_prometheus_response(
                api_model, final_prompt, system_prompt, max_tokens, temperature=0.01
)
elif organization == "Atla":
return get_atla_response(
                api_model, final_prompt, system_prompt, max_tokens, temperature=0.01
)
elif organization == "Cohere":
return get_cohere_response(
api_model, final_prompt, system_prompt, max_tokens, temperature
)
elif organization == "Flow AI":
return get_flow_judge_response(
                api_model, final_prompt  # keep the Flow Judge defaults for max_tokens, temperature, and top_p
)
else:
# All other organizations use Together API
return get_together_response(
api_model, final_prompt, system_prompt, max_tokens, temperature
)
except Exception as e:
return f"Error with {organization} model {model_name}: {str(e)}"
def parse_model_response(response):
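    """Parse a JSON-formatted judge response into a (score, feedback) tuple.

    Example (illustrative):
        parse_model_response('{"feedback": "Clear and correct.", "result": 5}')
        -> ("5", "Clear and correct.")
    """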
try:
# Debug print
print(f"Raw model response: {response}")
# If response is already a dictionary, use it directly
if isinstance(response, dict):
return str(response.get("result", "N/A")), response.get("feedback", "N/A")
# First try to parse the entire response as JSON
try:
data = json.loads(response)
return str(data.get("result", "N/A")), data.get("feedback", "N/A")
except json.JSONDecodeError:
# If that fails (typically for smaller models), try to find JSON within the response
json_match = re.search(r"{.*}", response, re.DOTALL)
if json_match:
data = json.loads(json_match.group(0))
return str(data.get("result", "N/A")), data.get("feedback", "N/A")
else:
return "Error", f"Invalid response format returned - here is the raw model response: {response}"
except Exception as e:
# Debug print for error case
print(f"Failed to parse response: {str(e)}")
# If the error message itself contains valid JSON, try to parse that
try:
error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
if error_json_match:
data = json.loads(error_json_match.group(0))
return str(data.get("result", "N/A")), data.get("feedback", "N/A")
        except Exception:
            pass
return "Error", f"Failed to parse response: {response}"
def prometheus_parse_model_response(output):
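    """Parse Prometheus-style "feedback ... [RESULT] score" output into (score, feedback).

    Example (illustrative):
        prometheus_parse_model_response("The response is accurate. [RESULT] 4")
        -> ("4", "The response is accurate.")
    """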
try:
print(f"Raw model response: {output}")
output = output.strip()
# Remove "Feedback:" prefix if present (case insensitive)
output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
# New pattern to match [RESULT] X at the beginning
begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
if begin_match:
score = int(begin_match.group(1))
feedback = begin_match.group(2).strip()
return str(score), feedback
# Existing patterns for end-of-string results...
pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
if match:
feedback = match.group(1).strip()
score = int(match.group(2))
return str(score), feedback
# If no match, try to match "... Score: X"
pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
if match:
feedback = match.group(1).strip()
score = int(match.group(2))
return str(score), feedback
# Pattern to handle [Score X] at the end
pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
match = re.search(pattern, output, re.DOTALL)
if match:
feedback = match.group(1).strip()
score = int(match.group(2))
return str(score), feedback
# Final fallback attempt
pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
match = re.search(pattern, output)
if match:
score = int(match.group(1))
feedback = output[:match.start()].rstrip()
# Remove any trailing brackets from feedback
feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
return str(score), feedback
return "Error", f"Failed to parse response: {output}"
except Exception as e:
print(f"Failed to parse response: {str(e)}")
return "Error", f"Exception during parsing: {str(e)}"
def atla_parse_model_response(output):
"""Parse response from ATLA model"""
try:
print(f"Raw Atla model response: {output}")
output = output.strip()
# Look for the Reasoning and Result sections
reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
if reasoning_match and result_match:
feedback = reasoning_match.group(1).strip()
score = result_match.group(1)
return str(score), feedback
return "Error", f"Failed to parse ATLA response format: {output}"
except Exception as e:
print(f"Failed to parse ATLA response: {str(e)}")
return "Error", f"Exception during parsing: {str(e)}"
def flow_judge_parse_model_response(output):
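    """Parse Flow Judge's <feedback>...</feedback> <score>N</score> output into (score, feedback).

    Example (illustrative):
        flow_judge_parse_model_response("<feedback>Good.</feedback> <score>4</score>")
        -> ("4", "Good.")
    """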
try:
print(f"Raw model response: {output}")
# Convert multiple line breaks to single ones and strip whitespace
output = re.sub(r'\n{2,}', '\n', output.strip())
# Compile regex patterns
feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
feedback_match = feedback_pattern.search(output)
score_match = score_pattern.search(output)
        # Both tags must be present; `or not score_match` was a bug that let a
        # missing score fall through to score_match.group() and raise
        if feedback_match and score_match:
            feedback = feedback_match.group(1).strip()
            score = int(score_match.group(1).strip())
            return str(score), feedback
return "Error", f"Failed to parse response: {output}"
except Exception as e:
print(f"Failed to parse response: {str(e)}")
return "Error", f"Exception during parsing: {str(e)}"