Petr Tsvetkov
Add noref gpt-eval to the pipeline
073db2c
raw
history blame
2.43 kB
from api_wrappers import grazie_wrapper
def build_prompt_ref(prediction, reference):
    """Build the reference-based evaluation prompt for a commit message.

    The prompt embeds the commit message to be rated together with a
    reference commit message presented as a good example, and asks for a
    single rating on a scale from 1 to 10 with no extra text.

    Args:
        prediction: The commit message to evaluate.
        reference: The reference commit message for the same commit.

    Returns:
        The fully formatted prompt string.
    """
    # The template lines deliberately start at column 0: any leading
    # whitespace inside the literal would become part of the prompt.
    prompt = f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
providing any additional feedback or commentary:
START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
{prediction}
END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
For reference, consider this as an example of a good commit message for the same commit that is both concise and
specific:
START OF THE REFERENCE COMMIT MESSAGE
{reference}
END OF THE REFERENCE COMMIT MESSAGE
YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
"""
    return prompt
def build_prompt_noref(prediction, diff):
    """Build the reference-free evaluation prompt for a commit message.

    The prompt embeds the commit message to be rated together with the code
    changes of the commit, and asks for a single rating on a scale from
    1 to 10 with no extra text.

    Args:
        prediction: The commit message to evaluate.
        diff: The code changes included in the commit.

    Returns:
        The fully formatted prompt string.
    """
    # The template lines deliberately start at column 0: any leading
    # whitespace inside the literal would become part of the prompt.
    prompt = f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
providing any additional feedback or commentary:
START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
{prediction}
END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
These are the code changes included in the commit:
START OF THE CODE CHANGES
{diff}
END OF THE CODE CHANGES
YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
"""
    return prompt
# Maximum number of LLM requests made for one prompt before giving up.
N_RETRIES = 3


def get_number_for_prompt(prompt):
    """Query the LLM until its reply ends in an integer and return it.

    The last whitespace-separated token of each reply is parsed with
    ``int``. A reply whose last token is not an integer, or a reply that is
    empty or whitespace-only, counts as a failed attempt and triggers
    another request, up to ``N_RETRIES`` attempts in total.

    Args:
        prompt: Prompt text passed to ``grazie_wrapper.generate_for_prompt``.

    Returns:
        The integer parsed from the first reply that yields one.

    Raises:
        RuntimeError: If none of the ``N_RETRIES`` attempts yields an
            integer. The message lists the tokens that failed to parse.
    """
    outputs = []
    for _ in range(N_RETRIES):
        try:
            tokens = grazie_wrapper.generate_for_prompt(prompt).strip().split()
            # An empty reply has no last token. Record it as "" so that
            # int() raises ValueError and the attempt is retried, instead of
            # an IndexError escaping the retry loop.
            output = tokens[-1] if tokens else ""
            outputs.append(output)
            return int(output)
        except ValueError:
            continue
    raise RuntimeError(f"LLM cannot generate a number. Its outputs were: {outputs}")
def compute_ref(prediction, reference, n_requests):
    """Return the mean LLM rating of a commit message, using a reference.

    Builds the reference-based prompt once and requests a rating for it
    ``n_requests`` times via ``get_number_for_prompt``.

    Args:
        prediction: The commit message to evaluate.
        reference: The reference commit message for the same commit.
        n_requests: Number of ratings to request. If no rating is collected
            (e.g. ``n_requests`` is 0) the division below raises
            ``ZeroDivisionError``.

    Returns:
        The arithmetic mean of the collected ratings.
    """
    prompt = build_prompt_ref(prediction, reference)
    total = 0
    count = 0
    for _ in range(n_requests):
        total += get_number_for_prompt(prompt)
        count += 1
    return total / count
def compute_noref(prediction, diff, n_requests):
    """Return the mean LLM rating of a commit message, using the diff.

    Builds the reference-free prompt once and requests a rating for it
    ``n_requests`` times via ``get_number_for_prompt``.

    Args:
        prediction: The commit message to evaluate.
        diff: The code changes included in the commit.
        n_requests: Number of ratings to request. If no rating is collected
            (e.g. ``n_requests`` is 0) the division below raises
            ``ZeroDivisionError``.

    Returns:
        The arithmetic mean of the collected ratings.
    """
    prompt = build_prompt_noref(prediction, diff)
    total = 0
    count = 0
    for _ in range(n_requests):
        total += get_number_for_prompt(prompt)
        count += 1
    return total / count