from typing import Tuple
from app_utils import load_problems, SESSION_ID
from openai import OpenAI
from anthropic import Anthropic
from dotenv import load_dotenv
import os, re
from collections import defaultdict
import json
import logging
from datetime import datetime
# Configure logging
DEBUG = True # Set to False to disable debug logging
def setup_logging():
    """Configure logging settings"""
    log_filename = f"grading_logs_{datetime.now().strftime('%Y%m%d')}.log"
    logging.basicConfig(
        level=logging.DEBUG if DEBUG else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler()
        ]
    )
setup_logging()
logger = logging.getLogger(__name__)
load_dotenv()
LLMs = {
    "DeepInfra": {
        "API_KEY": os.getenv('DEEPINFRA_API_KEY'),
        "BASEURL": "https://api.deepinfra.com/v1/openai",
        "MODEL": "nvidia/Llama-3.1-Nemotron-70B-Instruct",
    },
    "ClaudeAI": {
        "API_KEY": os.getenv('ClaudeAI_API_KEY'),
        "MODEL": "claude-3-5-sonnet-latest"
    }
}
PROVIDER = "ClaudeAI"  # or "DeepInfra"
def get_client():
    """Create and return appropriate client based on provider"""
    if PROVIDER == "DeepInfra":
        return OpenAI(
            api_key=LLMs[PROVIDER]["API_KEY"],
            base_url=LLMs[PROVIDER]["BASEURL"],
        )
    elif PROVIDER == "ClaudeAI":
        return Anthropic(api_key=LLMs[PROVIDER]["API_KEY"])
    else:
        raise ValueError("Invalid LLM provider selected")
def call_llm(client, prompt):
    """Make API call to the selected LLM provider"""
    if PROVIDER == "DeepInfra":
        chat_completion = client.chat.completions.create(
            model=LLMs[PROVIDER]["MODEL"],
            messages=[{"role": "user", "content": prompt}],
        )
        return chat_completion.choices[0].message.content
    else:  # ClaudeAI
        message = client.messages.create(
            model=LLMs[PROVIDER]["MODEL"],
            max_tokens=1000,
            temperature=0,
            messages=[{
                "role": "user",
                "content": prompt
            }]
        )
        return message.content[0].text if isinstance(message.content, list) else message.content
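# Illustrative one-off usage of the two helpers above (requires a valid API key
# for the selected PROVIDER in .env):
#   client = get_client()
#   reply = call_llm(client, "Say hello")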
# Cache of grading rubrics keyed by problem ID; None until one is provided or generated
Grading_rubric = defaultdict(lambda: None)
def LLM_output_to_dict(LLM_output):
    """Parse the LLM's ##Score / ##Feedback / ##Rubric fields into a dict."""
    # Define the regex patterns for each field
    score_pattern = r"##Score:\s*(\d+)"
    feedback_pattern = r"##Feedback:\s*(.*?)\s*##Rubric:"
    rubric_pattern = r"##Rubric:\s*(.*)"
    # Extract the values using regex
    score_match = re.search(score_pattern, LLM_output, re.DOTALL)
    feedback_match = re.search(feedback_pattern, LLM_output, re.DOTALL)
    rubric_match = re.search(rubric_pattern, LLM_output, re.DOTALL)
    # Get the matched values, or fall back to defaults if not found
    score = int(score_match.group(1)) if score_match else 0
    feedback = feedback_match.group(1).strip() if feedback_match else ""
    rubric = rubric_match.group(1).strip() if rubric_match else ""
    # Return the dictionary
    return {
        "Score": score,
        "Feedback": feedback,
        "Rubric": rubric
    }
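# Illustrative parse of a well-formed response (format assumed from the patterns above):
#   LLM_output_to_dict("##Score: 8\n##Feedback: Correct logic; one minor syntax error.\n##Rubric: ")
#   -> {"Score": 8, "Feedback": "Correct logic; one minor syntax error.", "Rubric": ""}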
def grade_submission(student_code: str, problem_id: str,
                     problem_description: str,
                     student_answer: str, correct_answer: str) -> Tuple[float, str, str]:
    """Call LLM model to grade student submission. Returns score, feedback, and rubric."""
    client = get_client()
    rubric = Grading_rubric[problem_id]
    prompt = f"""You are a programming assignment grader.
Evaluate the student's answer based on the problem description, grading rubric, and correct answer, then provide a score and brief feedback. Note that if a problem asks an essay or short-answer question, the student's answer may be in comments like "# answer" or in triple quotes.
If no grading rubric is provided, develop your own rubric based on the problem description and correct answer. Do not include readability or structure criteria in the rubric. Be tolerant of minor syntax errors (deduct at most 1 point for fewer than 2 minor syntax errors) and focus on the logic of the code.
##Problem ID: {problem_id}
##Problem Description: {problem_description}
##Grading rubric: {rubric}
##Student Code:
{student_code}
##Student Answer: {student_answer}
##Correct Answer: {correct_answer}
Provide your response using only the 3 fields below and nothing else. For the "Rubric" field, supply your grading rubric if you did not receive one; otherwise leave it empty. Here are the 3 fields you need to provide:
##Score: <integer score>
##Feedback: <brief feedback>
##Rubric: <your rubric, or empty if one was provided>
"""
    logger.info(f"Problem ID: {problem_id}")
    if DEBUG:
        logger.debug("Prompt sent to LLM:")
        logger.debug(prompt)
    try:
        response = call_llm(client, prompt)
        if DEBUG:
            logger.debug("Raw LLM Response:")
            logger.info(response)
        # Strip surrounding whitespace before parsing
        response = response.strip()
        try:
            result = LLM_output_to_dict(response)
            score = float(result.get('Score', 0.0))
            feedback = str(result.get('Feedback', ""))
            rubric = str(result.get('Rubric', ""))
            if DEBUG:
                logger.debug(f"Parsed Results - Score: {score}, Feedback: {feedback}")
                if rubric and rubric != "None":
                    logger.debug(f"New rubric created for problem {problem_id}")
            if rubric and rubric != "None":
                Grading_rubric[problem_id] = rubric
            return score, feedback, rubric
        except Exception as e:
            logger.error(f"Failed to parse LLM response: {str(e)}")
            return 0.0, "Failed to parse grading", rubric
    except Exception as e:
        logger.error(f"Error during grading: {str(e)}")
        return 0.0, f"Grading error: {str(e)}", rubric