File size: 2,433 Bytes
e027012
 
 
f5faae7
 
 
e027012
f5faae7
e027012
f5faae7
e027012
f5faae7
 
 
e027012
 
 
f5faae7
 
e027012
 
 
073db2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e027012
 
 
f5faae7
e027012
f5faae7
e027012
 
 
f5faae7
e027012
f5faae7
 
 
e027012
 
 
f5faae7
 
 
 
 
 
 
 
 
 
 
 
 
 
073db2c
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from api_wrappers import grazie_wrapper


def build_prompt_ref(prediction, reference):
    """Build the reference-based rating prompt for a commit message.

    Args:
        prediction: Commit message to be rated; interpolated verbatim.
        reference: Example of a good commit message for the same commit;
            interpolated verbatim.

    Returns:
        The prompt text, terminated by a newline, asking for a single
        1-10 rating of ``prediction``.
    """
    # Several lines intentionally end in a space: the prompt text is kept
    # character-for-character, so those spaces live inside the literals.
    prompt_lines = (
        "Evaluate the following commit message based on clarity, specificity, context, and conciseness without ",
        "providing any additional feedback or commentary:",
        "",
        "START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE",
        f"{prediction}",
        "END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE",
        "",
        "For reference, consider this as an example of a good commit message for the same commit that is both concise and ",
        "specific: ",
        "START OF THE REFERENCE COMMIT MESSAGE ",
        f"{reference}",
        "END OF THE REFERENCE COMMIT MESSAGE",
        "",
        "YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the ",
        "lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.",
        "",  # yields the trailing newline
    )
    return "\n".join(prompt_lines)


def build_prompt_noref(prediction, diff):
    """Build the reference-free rating prompt for a commit message.

    Args:
        prediction: Commit message to be rated; interpolated verbatim.
        diff: Code changes of the commit; interpolated verbatim.

    Returns:
        The prompt text, terminated by a newline, asking for a single
        1-10 rating of ``prediction``.
    """
    # Several lines intentionally end in a space: the prompt text is kept
    # character-for-character, so those spaces live inside the literals.
    prompt_lines = (
        "Evaluate the following commit message based on clarity, specificity, context, and conciseness without ",
        "providing any additional feedback or commentary:",
        "",
        "START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE",
        f"{prediction}",
        "END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE",
        "",
        "These are the code changes included in the commit: ",
        "START OF THE CODE CHANGES",
        f"{diff}",
        "END OF THE CODE CHANGES",
        "",
        "YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the ",
        "lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.",
        "",  # yields the trailing newline
    )
    return "\n".join(prompt_lines)


# Maximum number of times a prompt is sent before giving up on getting a number.
N_RETRIES = 3


def get_number_for_prompt(prompt):
    """Query the LLM and parse the last token of its reply as an integer.

    The prompt is sent up to ``N_RETRIES`` times. The first reply whose final
    whitespace-separated token is accepted by ``int()`` determines the result.

    Args:
        prompt: Text passed to ``grazie_wrapper.generate_for_prompt``.

    Returns:
        The integer parsed from the last token of the first usable reply.

    Raises:
        RuntimeError: If none of the ``N_RETRIES`` replies ended in a token
            that parses as an integer (including empty replies). The message
            lists the last token of every reply, ``''`` for empty ones.
    """
    outputs = []

    for _ in range(N_RETRIES):
        tokens = grazie_wrapper.generate_for_prompt(prompt).split()
        if not tokens:
            # An empty or whitespace-only reply has no last token. Count it
            # as a failed attempt and retry instead of letting an IndexError
            # escape the retry loop.
            outputs.append("")
            continue

        last_token = tokens[-1]
        outputs.append(last_token)
        try:
            return int(last_token)
        except ValueError:
            # Not a number (e.g. "8/10" or prose); ask again.
            continue

    raise RuntimeError(f"LLM cannot generate a number. Its outputs were: {outputs}")


def compute_ref(prediction, reference, n_requests):
    """Return the mean LLM rating of ``prediction`` judged against ``reference``.

    The reference-based prompt is built once and submitted ``n_requests``
    times through ``get_number_for_prompt``; the collected ratings are
    averaged.

    Args:
        prediction: Commit message to be rated.
        reference: Reference commit message shown to the LLM.
        n_requests: How many ratings to request.

    Returns:
        The arithmetic mean of the collected ratings.
    """
    ref_prompt = build_prompt_ref(prediction, reference)

    ratings = []
    for _ in range(n_requests):
        ratings.append(get_number_for_prompt(ref_prompt))

    # Dividing by the number of collected ratings means an empty run raises
    # ZeroDivisionError.
    return sum(ratings) / len(ratings)


def compute_noref(prediction, diff, n_requests):
    """Return the mean LLM rating of ``prediction`` judged against ``diff``.

    The reference-free prompt is built once and submitted ``n_requests``
    times through ``get_number_for_prompt``; the collected ratings are
    averaged.

    Args:
        prediction: Commit message to be rated.
        diff: Code changes of the commit shown to the LLM.
        n_requests: How many ratings to request.

    Returns:
        The arithmetic mean of the collected ratings.
    """
    noref_prompt = build_prompt_noref(prediction, diff)

    ratings = []
    for _ in range(n_requests):
        ratings.append(get_number_for_prompt(noref_prompt))

    # Dividing by the number of collected ratings means an empty run raises
    # ZeroDivisionError.
    return sum(ratings) / len(ratings)