Commit 3b598e6
Parent(s): 21514b1

Speed up parallel tests execution

Files changed:
- tests/candidate.py +43 -19
- tests/test_e2e.py +65 -24
tests/candidate.py
CHANGED

@@ -53,8 +53,13 @@ def complete_interview(
     topic = topic or random.choice(topic_lists[interview_type])
     difficulty = difficulty or random.choice(["easy", "medium", "hard"])
 
-
-
+    # Fix: Iterate over all elements and keep the last one
+    problem_statement_text = None
+    for text in llm.get_problem(requirements, difficulty, topic, interview_type):
+        problem_statement_text = text
+
+    if problem_statement_text is None:
+        raise ValueError("Failed to get problem statement")
 
     interview_data = defaultdict(
         lambda: None,
@@ -98,19 +103,27 @@ def complete_interview(
         elif mode == "repeat":
             candidate_message = chat_display[-1][1]
         else:
-            response = client.chat.completions.create(
-                model=model, messages=messages_candidate, temperature=1, response_format={"type": "json_object"}, stream=False
-            )
             try:
-
-
-
-
-
-
-
-
-
+                response = client.chat.completions.create(
+                    model=model,
+                    messages=messages_candidate,
+                    temperature=1,
+                    response_format={"type": "json_object"},
+                    timeout=30,  # Add a timeout to prevent indefinite waiting
+                )
+                try:
+                    response_json = json.loads(response.choices[0].message.content)
+                    candidate_message = response_json.get("message", "")
+                    code = response_json.get("code_and_notes", "")
+                    finished = response_json.get("finished", False)
+                    question = response_json.get("question", False)
+
+                    if finished and not question and not code:
+                        break
+                except:
+                    continue
+            except Exception as e:
+                print(f"Error in API call: {str(e)}, skipping this iteration")
                 continue
 
         if not candidate_message and not code and mode != "empty":
@@ -127,10 +140,17 @@ def complete_interview(
         chat_display.append([candidate_message, None])
 
        send_time = time.time()
-
-
-
-
+
+        # Fix: Iterate over all elements and keep the last one
+        last_result = None
+        for result in send_request(code, previous_code, messages_interviewer, chat_display, llm, tts=None, silent=True):
+            last_result = result
+
+        if last_result is not None:
+            messages_interviewer, chat_display, previous_code, _ = last_result
+        else:
+            print("send_request did not return any results, skipping this iteration")
+            continue
 
         response_times.append(time.time() - send_time)
 
@@ -144,8 +164,12 @@ def complete_interview(
 
     time.sleep(pause)  # to prevent exceeding rate limits
 
+    # Fix: Iterate over all elements and keep the last one
+    feedback = None
     for fb in llm.end_interview(problem_statement_text, messages_interviewer, interview_type):
-
+        feedback = fb
+
+    interview_data["feedback"] = feedback
 
     interview_data["average_response_time_seconds"] = round(sum(response_times) / len(response_times), 2) if response_times else 0
 
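The candidate.py changes apply one idiom three times: fully drain a generator-style helper (llm.get_problem, send_request, llm.end_interview), keep only its last yielded value, and fail or skip explicitly when nothing was yielded. The sketch below isolates that idiom with a toy generator; drain_last and token_stream are illustrative names for this note only and are not part of the commit.

from typing import Iterable, Optional, TypeVar

T = TypeVar("T")


def drain_last(items: Iterable[T]) -> Optional[T]:
    """Consume an iterable completely and return its last item, or None if it was empty."""
    last: Optional[T] = None
    for item in items:  # every intermediate chunk is still produced; only the final one is kept
        last = item
    return last


def token_stream():
    """Toy stand-in for a streaming helper that yields partial results before the final one."""
    yield "partial answer"
    yield "final answer"


result = drain_last(token_stream())
if result is None:
    raise ValueError("stream produced no output")  # mirrors the explicit failure path in the diff
print(result)  # -> "final answer"

Keeping the plain loop, rather than materializing the whole stream, still exercises the full streaming path while retaining only the final value, which is all the test needs.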
tests/test_e2e.py
CHANGED

@@ -1,46 +1,87 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tests.candidate import complete_interview
 from tests.grader import grade
-from concurrent.futures import ThreadPoolExecutor
 import random
 import logging
-from typing import List
+from typing import List, Dict, Any, Tuple
 
+# Constants
+INTERVIEW_TYPES = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
+EDGE_CASE_MODES = ["empty", "gibberish", "repeat"]
+MIN_AVERAGE_SCORE = 0.7
+MIN_INTERVIEW_SCORE = 0.3
+MAX_WORKERS = 5
 
-
+
+def complete_and_grade_interview(interview_type: str, mode: str = "normal") -> Dict[str, Any]:
     """
-    Complete an interview and return the overall score.
+    Complete an interview and return the overall score and metadata.
+
+    Args:
+        interview_type (str): Type of the interview.
+        mode (str): Mode of the interview ("normal", "empty", "gibberish", "repeat").
 
-
-
-
+    Returns:
+        Dict[str, Any]: Dictionary containing interview metadata and score.
+
+    Raises:
+        AssertionError: If the overall score is below the minimum score.
     """
     file_path, _ = complete_interview(interview_type, "test", model="gpt-4o-mini", mode=mode)
     feedback = grade(file_path, model="gpt-4o")
+    score = feedback["overall_score"]
+
+    assert (
+        score > MIN_INTERVIEW_SCORE
+    ), f"Score {score} is below minimum {MIN_INTERVIEW_SCORE} for {interview_type} interview in {mode} mode"
 
-
-    assert feedback["overall_score"] > min_score
-    return feedback["overall_score"]
+    return {"interview_type": interview_type, "mode": mode, "score": score}
 
 
 def test_complete_interview() -> None:
     """
     Test the complete interview process for various interview types, including edge cases.
+    Runs interviews concurrently using a thread pool and checks the average score.
     """
-
-
+    interview_configs: List[Tuple[str, str]] = [(it, "normal") for it in INTERVIEW_TYPES] + [
+        (random.choice(INTERVIEW_TYPES), mode) for mode in EDGE_CASE_MODES
+    ]
+
+    valid_results: List[Dict[str, Any]] = []
 
-
-
-
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        future_to_config = {
+            executor.submit(complete_and_grade_interview, interview_type, mode): (interview_type, mode)
+            for interview_type, mode in interview_configs
+        }
+
+        for future in as_completed(future_to_config):
+            interview_type, mode = future_to_config[future]
+            try:
+                result = future.result()
+                valid_results.append(result)
+                logging.info(f"Interview completed - Type: {result['interview_type']}, Mode: {result['mode']}, Score: {result['score']}")
+            except Exception as e:
+                logging.error(f"Interview failed - Type: {interview_type}, Mode: {mode}, Error: {str(e)}")
 
-
-
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="gibberish"))
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="repeat"))
+    # Calculate and log average score
+    average_score = sum(result["score"] for result in valid_results) / len(valid_results)
+    logging.info(f"Average score across all interviews: {average_score:.2f}")
 
-
-
-
+    # Assert on the average score
+    assert average_score > MIN_AVERAGE_SCORE, f"Average score {average_score:.2f} is below minimum {MIN_AVERAGE_SCORE}"
+
+    # Log summary of results
+    for interview_type in INTERVIEW_TYPES:
+        type_scores = [r["score"] for r in valid_results if r["interview_type"] == interview_type]
+        if type_scores:
+            avg_type_score = sum(type_scores) / len(type_scores)
+            logging.info(f"Average score for {interview_type}: {avg_type_score:.2f}")
 
-
-
+    # Check that we have results for all interview types and edge cases
+    tested_types = {r["interview_type"] for r in valid_results}
+    tested_modes = {r["mode"] for r in valid_results}
+    assert tested_types == set(INTERVIEW_TYPES), f"Not all interview types were tested. Missing: {set(INTERVIEW_TYPES) - tested_types}"
+    assert tested_modes == set(
+        EDGE_CASE_MODES + ["normal"]
+    ), f"Not all modes were tested. Missing: {set(EDGE_CASE_MODES + ['normal']) - tested_modes}"
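The rewritten test fans all interview/mode combinations out through a ThreadPoolExecutor and gathers them with as_completed, so each result is handled as soon as its interview finishes rather than in submission order, which is where the speed-up comes from. Below is a minimal, self-contained sketch of that submit/as_completed idiom; slow_task is a toy stand-in for complete_and_grade_interview and is not code from this repository.

import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def slow_task(name: str, seconds: float) -> str:
    """Toy stand-in for complete_and_grade_interview: sleeps, then returns its name."""
    time.sleep(seconds)
    return name


jobs = [("slow", 2.0), ("fast", 0.5), ("medium", 1.0)]

with ThreadPoolExecutor(max_workers=3) as executor:
    # Map each future back to its input so failures can be attributed, as the test does.
    future_to_name = {executor.submit(slow_task, name, secs): name for name, secs in jobs}
    for future in as_completed(future_to_name):
        # Futures are yielded in completion order: fast, medium, slow.
        print(future_to_name[future], "->", future.result())

With MAX_WORKERS = 5 and the nine configurations built above (six normal interviews plus three edge-case modes), wall-clock time is bounded by roughly two waves of the slowest interviews instead of the sum of all of them.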