Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

commit-message-editing-visualization / custom_metrics /gpt_eval.py

Petr Tsvetkov

Add noref gpt-eval to the pipeline

073db2c 6 months ago

2.43 kB

	from api_wrappers import grazie_wrapper


	def build_prompt_ref(prediction, reference):
	    """Build the reference-based evaluation prompt for a commit message.

	    The prompt asks the model to rate ``prediction`` on a scale from 1 to 10
	    and to answer with a single number, showing ``reference`` as an example
	    of a good commit message for the same commit.

	    Args:
	        prediction: The commit message to be rated.
	        reference: The commit message presented to the model as a good example.

	    Returns:
	        The complete prompt text.
	    """
	    # The wording below is sent to the model as-is; only the two
	    # placeholders are substituted.
	    prompt = f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
	providing any additional feedback or commentary:

	START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
	{prediction}
	END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE

	For reference, consider this as an example of a good commit message for the same commit that is both concise and
	specific:
	START OF THE REFERENCE COMMIT MESSAGE
	{reference}
	END OF THE REFERENCE COMMIT MESSAGE

	YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
	lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
	"""
	    return prompt


	def build_prompt_noref(prediction, diff):
	    """Build the reference-free evaluation prompt for a commit message.

	    The prompt asks the model to rate ``prediction`` on a scale from 1 to 10
	    and to answer with a single number, showing ``diff`` as the code changes
	    included in the commit instead of a reference message.

	    Args:
	        prediction: The commit message to be rated.
	        diff: The code changes presented to the model alongside the message.

	    Returns:
	        The complete prompt text.
	    """
	    # The wording below is sent to the model as-is; only the two
	    # placeholders are substituted.
	    prompt = f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
	providing any additional feedback or commentary:

	START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
	{prediction}
	END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE

	These are the code changes included in the commit:
	START OF THE CODE CHANGES
	{diff}
	END OF THE CODE CHANGES

	YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
	lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
	"""
	    return prompt


	# Upper bound on how many times get_number_for_prompt queries the LLM for a
	# single prompt while waiting for a reply that parses as an integer.
	N_RETRIES = 3


	def get_number_for_prompt(prompt) -> int:
	    """Query the LLM until its reply ends with an integer and return that integer.

	    The last whitespace-separated token of each reply is parsed with ``int``.
	    A reply whose last token is not an integer, or that contains no tokens at
	    all, counts as a failed attempt and triggers another request, up to
	    ``N_RETRIES`` requests in total.

	    Args:
	        prompt: The prompt passed to ``grazie_wrapper.generate_for_prompt``.

	    Returns:
	        The integer parsed from the first reply that could be parsed.

	    Raises:
	        RuntimeError: If none of the ``N_RETRIES`` replies yields an integer.
	            The message lists the token that was rejected on each attempt.
	    """
	    outputs = []

	    for _ in range(N_RETRIES):
	        tokens = grazie_wrapper.generate_for_prompt(prompt).split()
	        # An empty or whitespace-only reply has no last token; indexing it
	        # would raise IndexError and abort without retrying. Record it as an
	        # empty output so it is handled like any other unparsable reply.
	        output = tokens[-1] if tokens else ""
	        outputs.append(output)

	        try:
	            return int(output)
	        except ValueError:
	            continue

	    raise RuntimeError(f"LLM cannot generate a number. Its outputs were: {str(outputs)}")


	def compute_ref(prediction, reference, n_requests):
	    """Return the mean LLM rating of ``prediction`` judged against ``reference``.

	    One reference-based prompt is built and submitted ``n_requests`` times;
	    the numeric ratings obtained are averaged.

	    Args:
	        prediction: The commit message to be rated.
	        reference: The reference commit message included in the prompt.
	        n_requests: How many ratings to request for the prompt.

	    Returns:
	        The arithmetic mean of the collected ratings.
	    """
	    prompt = build_prompt_ref(prediction, reference)

	    total = 0
	    n_scores = 0
	    for _ in range(n_requests):
	        total += get_number_for_prompt(prompt)
	        n_scores += 1

	    return total / n_scores


	def compute_noref(prediction, diff, n_requests):
	    """Return the mean LLM rating of ``prediction`` judged against ``diff``.

	    One reference-free prompt is built and submitted ``n_requests`` times;
	    the numeric ratings obtained are averaged.

	    Args:
	        prediction: The commit message to be rated.
	        diff: The code changes included in the prompt.
	        n_requests: How many ratings to request for the prompt.

	    Returns:
	        The arithmetic mean of the collected ratings.
	    """
	    prompt = build_prompt_noref(prediction, diff)

	    total = 0
	    n_scores = 0
	    for _ in range(n_requests):
	        total += get_number_for_prompt(prompt)
	        n_scores += 1

	    return total / n_scores