from typing import Tuple
from app_utils import load_problems, SESSION_ID
from openai import OpenAI
from anthropic import Anthropic
from dotenv import load_dotenv
import os, re
from collections import defaultdict
import json
import logging
from datetime import datetime

# Configure logging
DEBUG = True  # Set to False to disable debug logging


def setup_logging():
    """Configure logging settings."""
    log_filename = f"grading_logs_{datetime.now().strftime('%Y%m%d')}.log"
    logging.basicConfig(
        level=logging.DEBUG if DEBUG else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler()
        ]
    )


setup_logging()
logger = logging.getLogger(__name__)

load_dotenv()

LLMs = {
    "DeepInfra": {
        "API_KEY": os.getenv('DEEPINFRA_API_KEY'),
        "BASEURL": "https://api.deepinfra.com/v1/openai",
        "MODEL": "nvidia/Llama-3.1-Nemotron-70B-Instruct",
    },
    "ClaudeAI": {
        "API_KEY": os.getenv('ClaudeAI_API_KEY'),
        "MODEL": "claude-3-5-sonnet-latest",
    },
}

PROVIDER = "ClaudeAI"  # or "DeepInfra"


def get_client():
    """Create and return the appropriate client for the selected provider."""
    if PROVIDER == "DeepInfra":
        return OpenAI(
            api_key=LLMs[PROVIDER]["API_KEY"],
            base_url=LLMs[PROVIDER]["BASEURL"],
        )
    elif PROVIDER == "ClaudeAI":
        return Anthropic(api_key=LLMs[PROVIDER]["API_KEY"])
    else:
        raise ValueError("Invalid LLM provider selected")


def call_llm(client, prompt):
    """Make an API call to the selected LLM provider and return the response text."""
    if PROVIDER == "DeepInfra":
        chat_completion = client.chat.completions.create(
            model=LLMs[PROVIDER]["MODEL"],
            messages=[{"role": "user", "content": prompt}],
        )
        return chat_completion.choices[0].message.content
    else:  # ClaudeAI
        message = client.messages.create(
            model=LLMs[PROVIDER]["MODEL"],
            max_tokens=1000,
            temperature=0,
            messages=[{"role": "user", "content": prompt}],
        )
        return message.content[0].text if isinstance(message.content, list) else message.content


# Per-problem rubric cache; unknown problem IDs resolve to None
Grading_rubric = defaultdict(lambda: None)


def LLM_output_to_dict(LLM_output):
    """Parse the LLM's delimited response into a dict with Score, Feedback, and Rubric."""
    # Regex patterns for each field
    score_pattern = r"##Score:\s*(\d+)"
    feedback_pattern = r"##Feedback:\s*(.*?)\s*##Rubric:"
    rubric_pattern = r"##Rubric:\s*(.*)"

    # Extract the values using regex
    score_match = re.search(score_pattern, LLM_output, re.DOTALL)
    feedback_match = re.search(feedback_pattern, LLM_output, re.DOTALL)
    rubric_match = re.search(rubric_pattern, LLM_output, re.DOTALL)

    # Use the matched values, falling back to defaults when a field is missing
    score = int(score_match.group(1)) if score_match else 0
    feedback = feedback_match.group(1).strip() if feedback_match else ""
    rubric = rubric_match.group(1).strip() if rubric_match else ""

    return {
        "Score": score,
        "Feedback": feedback,
        "Rubric": rubric,
    }


def grade_submission(student_code: str, problem_id: str, problem_description: str,
                     student_answer: str, correct_answer: str) -> Tuple[float, str, str]:
    """Call the LLM to grade a student submission. Returns score, feedback, and rubric."""
    client = get_client()
    rubric = Grading_rubric[problem_id]
    prompt = f"""You are a programming assignment grader. Evaluate the student's answer based on the problem description, grading rubric, and correct answer, and provide a score and brief feedback. Note that if a problem asks an essay or short-answer question, the student's answer may be in comments like "# answer" or in triple quotes. If a grading rubric is not provided, develop your own rubric based on the problem description and correct answer. Do not include readability or structure criteria in the rubric.
Please be tolerant of minor syntax errors (deduct at most 1 point when there are fewer than 2 minor syntax errors). Focus on the logic of the code.

##Problem ID: {problem_id}
##Problem Description: {problem_description}
##Grading rubric: {rubric}
##Student Code: {student_code}
##Student Answer: {student_answer}
##Correct Answer: {correct_answer}

Provide your response delimited with the 3 fields below and nothing else. For the "Rubric" field, supply your grading rubric if you did not receive one; otherwise leave it empty. Here are the 3 fields you need to provide:
##Score: (int) a number between 0 and the highest score specified in the problem description
##Feedback: (str) Your code is mostly correct, but ... (your brief feedback in markdown, only if the student's answer is not 100% correct.)
##Rubric: (str) - 1. first (2 pts) - 2. second (3 pts) (your grading rubric for this problem in markdown format with numbered bullet points, if you did not receive one; otherwise leave it empty.)
"""

    logger.info(f"Problem ID: {problem_id}")
    if DEBUG:
        logger.debug("Prompt sent to LLM:")
        logger.debug(prompt)

    try:
        response = call_llm(client, prompt)
        if DEBUG:
            logger.debug("Raw LLM Response:")
            logger.debug(response)

        # Strip surrounding whitespace before parsing
        response = response.strip()
        try:
            result = LLM_output_to_dict(response)
            score = float(result.get('Score', 0.0))
            feedback = str(result.get('Feedback', ""))
            rubric = str(result.get('Rubric', ""))
            if DEBUG:
                logger.debug(f"Parsed Results - Score: {score}, Feedback: {feedback}")
                if rubric and rubric != "None":
                    logger.debug(f"New rubric created for problem {problem_id}")
            # Cache a newly generated rubric so later submissions for the same problem reuse it
            if rubric and rubric != "None":
                Grading_rubric[problem_id] = rubric
            return score, feedback, rubric
        except Exception as e:
            logger.error(f"Failed to parse LLM response: {e}")
            return 0.0, "Failed to parse grading", rubric
    except Exception as e:
        logger.error(f"Error during grading: {str(e)}")
        return 0.0, f"Grading error: {str(e)}", rubric
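

# Example usage (illustrative sketch only): the problem ID, description, and answers
# below are hypothetical placeholders showing how grade_submission is typically invoked;
# a real caller would likely obtain them via load_problems(). Running this requires the
# selected provider's API key to be set in the environment.
if __name__ == "__main__":
    sample_score, sample_feedback, sample_rubric = grade_submission(
        student_code="def add(a, b):\n    return a + b",
        problem_id="demo-01",
        problem_description="Write a function add(a, b) that returns the sum of a and b. (5 pts)",
        student_answer="",
        correct_answer="def add(a, b):\n    return a + b",
    )
    print(f"Score: {sample_score}")
    print(f"Feedback: {sample_feedback}")
    print(f"Rubric: {sample_rubric}")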