from api_wrappers import grazie_wrapper


def build_prompt_ref(prediction, reference):
    """Build a reference-based judging prompt: the model rates `prediction`
    against a human-written reference message for the same commit."""
    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
providing any additional feedback or commentary:

START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
{prediction}
END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE

For reference, consider this as an example of a good commit message for the same commit that is both concise and
specific:
START OF THE REFERENCE COMMIT MESSAGE
{reference}
END OF THE REFERENCE COMMIT MESSAGE

YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
"""


def build_prompt_noref(prediction, diff):
    """Build a reference-free judging prompt: the model rates `prediction`
    against the code changes (diff) it is supposed to describe."""
    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
providing any additional feedback or commentary:

START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
{prediction}
END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE

These are the code changes included in the commit:
START OF THE CODE CHANGES
{diff}
END OF THE CODE CHANGES

YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
"""


# Number of attempts before giving up on extracting a numeric rating.
N_RETRIES = 3


def get_number_for_prompt(prompt):
    """Query the LLM with `prompt` and parse its reply as an integer rating,
    retrying up to N_RETRIES times on unparsable output."""
    outputs = []
    result = None

    for _ in range(N_RETRIES):
        try:
            # Take the last whitespace-separated token: the model sometimes
            # prepends text despite being told to answer with a number only.
            output = grazie_wrapper.generate_for_prompt(prompt).strip().split()[-1]
            outputs.append(output)

            result = int(output)
            break
        except (ValueError, IndexError):
            # ValueError: the token is not an integer; IndexError: empty reply.
            continue

    if result is None:
        raise RuntimeError(f"LLM cannot generate a number. Its outputs were: {outputs}")

    return result


def compute_ref(prediction, reference, n_requests):
    """Average the reference-based rating of `prediction` over `n_requests` queries."""
    prompt = build_prompt_ref(prediction, reference)
    results = [get_number_for_prompt(prompt) for _ in range(n_requests)]

    return sum(results) / len(results)


def compute_noref(prediction, diff, n_requests):
    """Average the reference-free rating of `prediction` over `n_requests` queries."""
    prompt = build_prompt_noref(prediction, diff)
    results = [get_number_for_prompt(prompt) for _ in range(n_requests)]

    return sum(results) / len(results)
|
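
# Minimal usage sketch (the commit message, reference, and diff below are
# hypothetical placeholders; assumes `grazie_wrapper` is configured and
# reachable). Averaging several judge calls per message smooths out the
# sampling noise of a single LLM rating.
if __name__ == "__main__":
    prediction = "Fix off-by-one error in pagination logic"
    reference = "Fix off-by-one bug when computing the last page index"
    diff = (
        "- last_page = count / page_size\n"
        "+ last_page = (count + page_size - 1) // page_size"
    )

    print("ref-based score:", compute_ref(prediction, reference, n_requests=3))
    print("ref-free score: ", compute_noref(prediction, diff, n_requests=3))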