from api_wrappers import grazie_wrapper


def build_prompt_ref(prediction, reference):
    """Build a prompt asking the LLM to rate a commit message against a reference message."""
    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without providing any additional feedback or commentary:

START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
{prediction}
END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE

For reference, consider this example of a good commit message for the same commit that is both concise and specific:

START OF THE REFERENCE COMMIT MESSAGE
{reference}
END OF THE REFERENCE COMMIT MESSAGE

YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
"""


def build_prompt_noref(prediction, diff):
    """Build a prompt asking the LLM to rate a commit message against the commit's diff."""
    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without providing any additional feedback or commentary:

START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
{prediction}
END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE

These are the code changes included in the commit:

START OF THE CODE CHANGES
{diff}
END OF THE CODE CHANGES

YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
"""


N_RETRIES = 3


def get_number_for_prompt(prompt):
    """Query the LLM and parse its answer as an integer rating, retrying up to N_RETRIES times."""
    outputs = []
    result = None
    for _ in range(N_RETRIES):
        try:
            # Take the last whitespace-separated token: models sometimes
            # prepend prose before the number.
            output = grazie_wrapper.generate_for_prompt(prompt).strip().split()[-1]
            outputs.append(output)
            result = int(output)
            break
        except (ValueError, IndexError):
            # ValueError: the last token is not an integer;
            # IndexError: the completion was empty. Retry either way.
            continue
    if result is None:
        raise RuntimeError(f"LLM could not generate a number. Its outputs were: {outputs}")
    return result


def compute_ref(prediction, reference, n_requests):
    """Average rating over n_requests reference-based evaluations."""
    prompt = build_prompt_ref(prediction, reference)
    results = [get_number_for_prompt(prompt) for _ in range(n_requests)]
    return sum(results) / len(results)


def compute_noref(prediction, diff, n_requests):
    """Average rating over n_requests diff-based evaluations."""
    prompt = build_prompt_noref(prediction, diff)
    results = [get_number_for_prompt(prompt) for _ in range(n_requests)]
    return sum(results) / len(results)
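

# Usage sketch (illustrative only): the sample commit message and diff below
# are hypothetical, and `grazie_wrapper` must be configured in the environment
# for this to actually run against the LLM.
if __name__ == "__main__":
    sample_prediction = "Fix off-by-one error in pagination logic"
    sample_diff = (
        "--- a/paginator.py\n"
        "+++ b/paginator.py\n"
        "@@ -10,7 +10,7 @@\n"
        "-    last_page = total // page_size\n"
        "+    last_page = (total + page_size - 1) // page_size\n"
    )
    # Averaging over several independent LLM ratings reduces sampling noise.
    print("diff-based score:", compute_noref(sample_prediction, sample_diff, n_requests=3))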