Commit 3b598e6
Parent(s): 21514b1

Speed up parallel tests execution

Files changed:
- tests/candidate.py +43 -19
- tests/test_e2e.py +65 -24
tests/candidate.py
CHANGED

@@ -53,8 +53,13 @@ def complete_interview(
     topic = topic or random.choice(topic_lists[interview_type])
     difficulty = difficulty or random.choice(["easy", "medium", "hard"])
 
-
-
+    # Fix: Iterate over all elements and keep the last one
+    problem_statement_text = None
+    for text in llm.get_problem(requirements, difficulty, topic, interview_type):
+        problem_statement_text = text
+
+    if problem_statement_text is None:
+        raise ValueError("Failed to get problem statement")
 
     interview_data = defaultdict(
         lambda: None,
@@ -98,19 +103,27 @@ def complete_interview(
         elif mode == "repeat":
             candidate_message = chat_display[-1][1]
         else:
-            response = client.chat.completions.create(
-                model=model, messages=messages_candidate, temperature=1, response_format={"type": "json_object"}, stream=False
-            )
             try:
-
-
-
-
-
-
-
-
-
+                response = client.chat.completions.create(
+                    model=model,
+                    messages=messages_candidate,
+                    temperature=1,
+                    response_format={"type": "json_object"},
+                    timeout=30,  # Add a timeout to prevent indefinite waiting
+                )
+                try:
+                    response_json = json.loads(response.choices[0].message.content)
+                    candidate_message = response_json.get("message", "")
+                    code = response_json.get("code_and_notes", "")
+                    finished = response_json.get("finished", False)
+                    question = response_json.get("question", False)
+
+                    if finished and not question and not code:
+                        break
+                except:
+                    continue
+            except Exception as e:
+                print(f"Error in API call: {str(e)}, skipping this iteration")
                 continue
 
         if not candidate_message and not code and mode != "empty":
@@ -127,10 +140,17 @@ def complete_interview(
         chat_display.append([candidate_message, None])
 
        send_time = time.time()
-
-
-
-
+
+        # Fix: Iterate over all elements and keep the last one
+        last_result = None
+        for result in send_request(code, previous_code, messages_interviewer, chat_display, llm, tts=None, silent=True):
+            last_result = result
+
+        if last_result is not None:
+            messages_interviewer, chat_display, previous_code, _ = last_result
+        else:
+            print("send_request did not return any results, skipping this iteration")
+            continue
 
         response_times.append(time.time() - send_time)
 
@@ -144,8 +164,12 @@ def complete_interview(
 
     time.sleep(pause)  # to prevent exceeding rate limits
 
+    # Fix: Iterate over all elements and keep the last one
+    feedback = None
     for fb in llm.end_interview(problem_statement_text, messages_interviewer, interview_type):
-
+        feedback = fb
+
+    interview_data["feedback"] = feedback
 
     interview_data["average_response_time_seconds"] = round(sum(response_times) / len(response_times), 2) if response_times else 0
 
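The candidate.py changes apply one idiom three times: fully drain a generator-style helper (llm.get_problem, send_request, llm.end_interview), keep only its last yielded value, and fail or skip explicitly when nothing was yielded. The sketch below isolates that idiom with a toy generator; drain_last and token_stream are illustrative names for this note only and are not part of the commit.

from typing import Iterable, Optional, TypeVar

T = TypeVar("T")


def drain_last(items: Iterable[T]) -> Optional[T]:
    """Consume an iterable completely and return its last item, or None if it was empty."""
    last: Optional[T] = None
    for item in items:  # every intermediate chunk is still produced; only the final one is kept
        last = item
    return last


def token_stream():
    """Toy stand-in for a streaming helper that yields partial results before the final one."""
    yield "partial answer"
    yield "final answer"


result = drain_last(token_stream())
if result is None:
    raise ValueError("stream produced no output")  # mirrors the explicit failure path in the diff
print(result)  # -> "final answer"

Keeping the plain loop, rather than materializing the whole stream, still exercises the full streaming path while retaining only the final value, which is all the test needs.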
tests/test_e2e.py
CHANGED

@@ -1,46 +1,87 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tests.candidate import complete_interview
 from tests.grader import grade
-from concurrent.futures import ThreadPoolExecutor
 import random
 import logging
-from typing import List
+from typing import List, Dict, Any, Tuple
 
+# Constants
+INTERVIEW_TYPES = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
+EDGE_CASE_MODES = ["empty", "gibberish", "repeat"]
+MIN_AVERAGE_SCORE = 0.7
+MIN_INTERVIEW_SCORE = 0.3
+MAX_WORKERS = 5
 
-
+
+def complete_and_grade_interview(interview_type: str, mode: str = "normal") -> Dict[str, Any]:
     """
-    Complete an interview and return the overall score.
+    Complete an interview and return the overall score and metadata.
+
+    Args:
+        interview_type (str): Type of the interview.
+        mode (str): Mode of the interview ("normal", "empty", "gibberish", "repeat").
 
-
-
-
+    Returns:
+        Dict[str, Any]: Dictionary containing interview metadata and score.
+
+    Raises:
+        AssertionError: If the overall score is below the minimum score.
     """
     file_path, _ = complete_interview(interview_type, "test", model="gpt-4o-mini", mode=mode)
     feedback = grade(file_path, model="gpt-4o")
+    score = feedback["overall_score"]
+
+    assert (
+        score > MIN_INTERVIEW_SCORE
+    ), f"Score {score} is below minimum {MIN_INTERVIEW_SCORE} for {interview_type} interview in {mode} mode"
 
-
-    assert feedback["overall_score"] > min_score
-    return feedback["overall_score"]
+    return {"interview_type": interview_type, "mode": mode, "score": score}
 
 
 def test_complete_interview() -> None:
     """
     Test the complete interview process for various interview types, including edge cases.
+    Runs interviews concurrently using a thread pool and checks the average score.
     """
-
-
+    interview_configs: List[Tuple[str, str]] = [(it, "normal") for it in INTERVIEW_TYPES] + [
+        (random.choice(INTERVIEW_TYPES), mode) for mode in EDGE_CASE_MODES
+    ]
+
+    valid_results: List[Dict[str, Any]] = []
 
-
-
-
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        future_to_config = {
+            executor.submit(complete_and_grade_interview, interview_type, mode): (interview_type, mode)
+            for interview_type, mode in interview_configs
+        }
+
+        for future in as_completed(future_to_config):
+            interview_type, mode = future_to_config[future]
+            try:
+                result = future.result()
+                valid_results.append(result)
+                logging.info(f"Interview completed - Type: {result['interview_type']}, Mode: {result['mode']}, Score: {result['score']}")
+            except Exception as e:
+                logging.error(f"Interview failed - Type: {interview_type}, Mode: {mode}, Error: {str(e)}")
 
-
-
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="gibberish"))
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="repeat"))
+    # Calculate and log average score
+    average_score = sum(result["score"] for result in valid_results) / len(valid_results)
+    logging.info(f"Average score across all interviews: {average_score:.2f}")
 
-
-
-
+    # Assert on the average score
+    assert average_score > MIN_AVERAGE_SCORE, f"Average score {average_score:.2f} is below minimum {MIN_AVERAGE_SCORE}"
+
+    # Log summary of results
+    for interview_type in INTERVIEW_TYPES:
+        type_scores = [r["score"] for r in valid_results if r["interview_type"] == interview_type]
+        if type_scores:
+            avg_type_score = sum(type_scores) / len(type_scores)
+            logging.info(f"Average score for {interview_type}: {avg_type_score:.2f}")
 
-
-
+    # Check that we have results for all interview types and edge cases
+    tested_types = {r["interview_type"] for r in valid_results}
+    tested_modes = {r["mode"] for r in valid_results}
+    assert tested_types == set(INTERVIEW_TYPES), f"Not all interview types were tested. Missing: {set(INTERVIEW_TYPES) - tested_types}"
+    assert tested_modes == set(
+        EDGE_CASE_MODES + ["normal"]
+    ), f"Not all modes were tested. Missing: {set(EDGE_CASE_MODES + ['normal']) - tested_modes}"
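The rewritten test fans all interview/mode combinations out through a ThreadPoolExecutor and gathers them with as_completed, so each result is handled as soon as its interview finishes rather than in submission order, which is where the speed-up comes from. Below is a minimal, self-contained sketch of that submit/as_completed idiom; slow_task is a toy stand-in for complete_and_grade_interview and is not code from this repository.

import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def slow_task(name: str, seconds: float) -> str:
    """Toy stand-in for complete_and_grade_interview: sleeps, then returns its name."""
    time.sleep(seconds)
    return name


jobs = [("slow", 2.0), ("fast", 0.5), ("medium", 1.0)]

with ThreadPoolExecutor(max_workers=3) as executor:
    # Map each future back to its input so failures can be attributed, as the test does.
    future_to_name = {executor.submit(slow_task, name, secs): name for name, secs in jobs}
    for future in as_completed(future_to_name):
        # Futures are yielded in completion order: fast, medium, slow.
        print(future_to_name[future], "->", future.result())

With MAX_WORKERS = 5 and the nine configurations built above (six normal interviews plus three edge-case modes), wall-clock time is bounded by roughly two waves of the slowest interviews instead of the sum of all of them.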