Spaces:
Sleeping
Sleeping
Create question_generator.py
Browse files- question_generator.py +183 -0
question_generator.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import csv
|
3 |
+
import os
|
4 |
+
import logging
|
5 |
+
import hashlib
|
6 |
+
from typing import List, Dict
|
7 |
+
from datetime import datetime
|
8 |
+
from mistralai import Mistral
|
9 |
+
|
10 |
+
# Configure root logging once: timestamp, severity, then the message text.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The Mistral API key is read from the environment; the module refuses to
# load without it so that a missing key is reported immediately.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    logging.error("MISTRAL_API_KEY environment variable is not set.")
    raise ValueError("MISTRAL_API_KEY environment variable is not set.")

# Model identifier passed to the chat completion call.
model = "mistral-large-latest"

# Mistral client used by the question generator below.
client = Mistral(api_key=api_key)
|
23 |
+
|
24 |
+
def load_csv_data(file_path: str) -> List[Dict[str, str]]:
    """Load every data row of a CSV file as a list of dictionaries.

    The file is read with ``csv.DictReader``, so the first row supplies the
    column names and each later row becomes one dictionary.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        One dictionary per data row, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        csv.Error: If the ``csv`` module cannot parse the file.
    """
    logging.info(f"Loading data from {file_path}...")
    try:
        # newline='' hands line-ending handling to the csv module; without it
        # newlines embedded in quoted fields are translated and corrupted.
        with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
            data = list(csv.DictReader(csvfile))
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except csv.Error as e:
        logging.error(f"Error reading CSV file {file_path}: {e}")
        raise

    logging.info(f"Loaded {len(data)} rows from {file_path}")
    return data
|
39 |
+
|
40 |
+
# Read both source datasets at import time. Any failure is logged here and
# re-raised, so the module cannot be used without its data.
try:
    detailed_cases = load_csv_data('processed_medical_history.csv')
    infectious_diseases = load_csv_data('infectious_diseases.csv')
except Exception as load_error:
    logging.error(f"Failed to load CSV data: {load_error}")
    raise
|
47 |
+
|
48 |
+
def hash_question(question: str) -> str:
    """Return the hexadecimal MD5 digest of a question's text.

    The digest is used only as a fingerprint for spotting repeated
    questions; the text is encoded with the default ``str.encode()``
    encoding before hashing.
    """
    digest = hashlib.md5()
    digest.update(question.encode())
    return digest.hexdigest()
|
51 |
+
|
52 |
+
def load_generated_questions() -> set:
    """Read the stored question hashes from ``generated_questions.txt``.

    Returns:
        A set with one whitespace-stripped entry per line of the file, or
        an empty set when the file does not exist.
    """
    try:
        handle = open('generated_questions.txt', 'r')
    except FileNotFoundError:
        # Nothing has been recorded yet.
        return set()
    with handle:
        return {entry.strip() for entry in handle}
|
59 |
+
|
60 |
+
def save_generated_question(question_hash: str):
    """Append one question hash, on its own line, to ``generated_questions.txt``.

    Args:
        question_hash: The hash string to record.
    """
    record = question_hash + '\n'
    with open('generated_questions.txt', 'a') as hash_log:
        hash_log.write(record)
|
64 |
+
|
65 |
+
# In-memory set of question hashes, seeded from generated_questions.txt when
# the module is loaded.
generated_questions = load_generated_questions()
|
66 |
+
|
67 |
+
def _build_question_context(question_type: str) -> str:
    """Return background facts for the prompt, taken from one random CSV row."""
    if question_type == "clinical_vignette":
        case = random.choice(detailed_cases)
        return f"""
        Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})
        Key Symptoms: {case['Key_Symptoms']}
        Physical Findings: {case['Physical_Findings']}
        Lab Results: {case['Lab_Results']}
        Patient Demographics: {case['Patient_Demographics']}
        """

    disease = random.choice(infectious_diseases)
    return f"""
        Infectious Agent: {disease['infectious_agent']}
        Diagnosis: {disease['diagnosis']}
        Treatment: {disease['treatment']}
        """


def _build_question_prompt(question_type: str, context: str) -> str:
    """Return the user prompt asking the model for one question as JSON."""
    # The JSON template's literal braces are doubled: inside an f-string a
    # single "{" would start a replacement field and break the prompt.
    return f"""
    Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
    Use the following information as inspiration, but feel free to expand or modify:

    {context}

    Generate a question based on the following template, depending on the question type:

    1. Clinical Vignette with Pathogen Identification:
    A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?

    2. Mechanism of Pathogenesis:
    [Description of a pathogen or clinical scenario]
    Which of the following best describes the mechanism by which this organism causes disease?

    3. Laboratory Diagnosis:
    A patient presents with [symptoms]. [Description of laboratory findings or test results].
    Which of the following is the most likely diagnosis based on these laboratory findings?

    4. Antimicrobial Mechanism and Resistance:
    A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?

    5. Vaccine-Preventable Disease:
    A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?

    6. Microbial Physiology and Genetics:
    An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?

    7. Epidemiology and Transmission:
    A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?

    Include:
    1. The question based on the selected template
    2. Five possible answer options (A through E)
    3. The correct answer
    4. A brief explanation of why the correct answer is right and why the other options are incorrect
    5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.

    Format the response as a JSON object with the following keys:

    {{
        "question": "The question text",
        "options": {{
            "A": "Option A text",
            "B": "Option B text",
            "C": "Option C text",
            "D": "Option D text",
            "E": "Option E text"
        }},
        "correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
        "explanation": "The explanation text",
        "medical_reasoning": "The detailed medical reasoning text"
    }}
    """


def _extract_json_text(raw: str) -> str:
    """Return the outermost ``{...}`` span of *raw*, or *raw* stripped if none."""
    text = raw.strip()
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        # Drops Markdown code fences or prose wrapped around the object.
        return text[start:end + 1]
    return text


def generate_microbiology_question() -> Dict[str, str]:
    """Generate one microbiology exam question with the Mistral chat API.

    A question type is picked at random. A random row from the matching
    dataset (``detailed_cases`` for clinical vignettes, otherwise
    ``infectious_diseases``) is placed in the prompt as inspiration, and the
    model is asked to answer with a single JSON object. The hash of the
    question text is recorded in ``generated_questions`` and appended to the
    hash file when it has not been seen before; a repeated question is
    returned anyway, with a warning logged.

    Returns:
        The parsed JSON reply. The prompt asks for the keys ``question``,
        ``options`` (a nested A-E mapping), ``correct_answer``,
        ``explanation`` and ``medical_reasoning``; the reply is not
        validated beyond containing ``question``.

    Raises:
        ValueError: If the reply has no text content, or if that text is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If the parsed reply has no ``question`` key, or a dataset
            row lacks a column used in the prompt.
        IndexError: If the dataset being sampled is empty.
    """
    # Function-local because the module's top-level imports do not include json.
    import json

    question_types = [
        "clinical_vignette",
        "mechanism_of_pathogenesis",
        "laboratory_diagnosis",
        "antimicrobial_resistance",
        "vaccine_preventable_disease",
        "microbial_physiology_genetics",
        "epidemiology_transmission"
    ]
    question_type = random.choice(question_types)
    logging.info(f"Generating {question_type} question...")

    context = _build_question_context(question_type)
    prompt = _build_question_prompt(question_type, context)

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a medical educator creating unique microbiology questions for the NBME exam. Ensure each question is distinct from previously generated ones and follows the specified template."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        # Ask the API for JSON mode so the reply is a bare JSON object.
        response_format={"type": "json_object"},
    )

    response_content = chat_response.choices[0].message.content
    if not isinstance(response_content, str) or not response_content.strip():
        raise ValueError("Mistral returned no text content to parse as JSON.")

    # Tolerate code fences or stray prose around the JSON object.
    question_data = json.loads(_extract_json_text(response_content))

    # Record the question's hash so repeats can be recognised later.
    question_hash = hash_question(question_data['question'])
    if question_hash in generated_questions:
        logging.warning("Generated question duplicates a previously generated one.")
    else:
        generated_questions.add(question_hash)
        save_generated_question(question_hash)

    return question_data
|
179 |
+
|
180 |
+
# Example usage: generate a single question and pretty-print it as JSON.
if __name__ == "__main__":
    # json is not imported at module level, so bring it into scope here;
    # without this the json.dumps call below raises NameError.
    import json

    question = generate_microbiology_question()
    print(json.dumps(question, indent=2))
|