ambrosfitz committed on
Commit
b9549f1
·
verified ·
1 Parent(s): 6d85b30

Create question_generator.py

Browse files
Files changed (1) hide show
  1. question_generator.py +183 -0
question_generator.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import csv
import hashlib
import json
import logging
import os
import random
from datetime import datetime
from typing import Dict, List

from mistralai import Mistral
9
+
10
# Configure root logging once for the whole script.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The Mistral API key has to come from the environment; when it is missing
# or empty, log the problem and abort immediately.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    _missing_key_message = "MISTRAL_API_KEY environment variable is not set."
    logging.error(_missing_key_message)
    raise ValueError(_missing_key_message)

# Name of the chat model passed to every completion request.
model = "mistral-large-latest"

# Shared Mistral client used by the question generator.
client = Mistral(api_key=api_key)
23
+
24
def load_csv_data(file_path: str) -> List[Dict[str, str]]:
    """Load every data row of a CSV file as a dict keyed by column header.

    Args:
        file_path: Path of the CSV file to read; it is decoded as UTF-8.

    Returns:
        One dictionary per data row, in file order, as produced by
        ``csv.DictReader`` (the first row supplies the keys).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, re-raised).
        csv.Error: If the csv module cannot parse the file (logged, re-raised).
    """
    logging.info(f"Loading data from {file_path}...")
    try:
        # newline='' hands raw line endings to the csv module, as its
        # documentation requires; without it, newlines embedded in quoted
        # fields are rewritten by universal-newline translation.
        with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            data = list(reader)
            logging.info(f"Loaded {len(data)} rows from {file_path}")
            return data
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except csv.Error as e:
        logging.error(f"Error reading CSV file {file_path}: {e}")
        raise
39
+
40
# Read both source datasets up front; any failure is logged and re-raised so
# the script stops before attempting to generate questions.
try:
    detailed_cases, infectious_diseases = [
        load_csv_data(csv_path)
        for csv_path in ('processed_medical_history.csv', 'infectious_diseases.csv')
    ]
except Exception as load_error:
    logging.error(f"Failed to load CSV data: {load_error}")
    raise
47
+
48
def hash_question(question: str) -> str:
    """Return the hexadecimal MD5 digest of the question text.

    The digest is used as a compact fingerprint for duplicate detection.
    """
    fingerprint = hashlib.md5()
    fingerprint.update(question.encode())
    return fingerprint.hexdigest()
51
+
52
def load_generated_questions() -> set:
    """Read the stored question hashes from 'generated_questions.txt'.

    Returns:
        A set with one entry per line of the file (surrounding whitespace
        stripped), or an empty set when the file does not exist.
    """
    known_hashes = set()
    try:
        with open('generated_questions.txt', 'r') as hash_file:
            for raw_line in hash_file:
                known_hashes.add(raw_line.strip())
    except FileNotFoundError:
        # No questions have been recorded yet.
        return set()
    return known_hashes
59
+
60
def save_generated_question(question_hash: str):
    """Append one question hash, followed by a newline, to 'generated_questions.txt'."""
    with open('generated_questions.txt', 'a') as hash_file:
        hash_file.writelines((question_hash, '\n'))
64
+
65
# Hashes of questions recorded by earlier runs, loaded once at start-up and
# consulted by the generator to recognise repeated questions.
generated_questions: set = load_generated_questions()
66
+
67
def _build_question_context(question_type: str) -> str:
    """Pick a random source record and render it as background text for the prompt."""
    if question_type == "clinical_vignette":
        # Vignettes draw on the detailed case records.
        case = random.choice(detailed_cases)
        return f"""
    Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})
    Key Symptoms: {case['Key_Symptoms']}
    Physical Findings: {case['Physical_Findings']}
    Lab Results: {case['Lab_Results']}
    Patient Demographics: {case['Patient_Demographics']}
    """

    # Every other question type draws on the infectious-disease table.
    disease = random.choice(infectious_diseases)
    return f"""
    Infectious Agent: {disease['infectious_agent']}
    Diagnosis: {disease['diagnosis']}
    Treatment: {disease['treatment']}
    """


def _build_question_prompt(question_type: str, context: str) -> str:
    """Render the user prompt asking the model for one question of the given type."""
    # The JSON skeleton at the end uses doubled braces ({{ and }}): inside an
    # f-string a single brace opens a replacement field, so the literal JSON
    # example would otherwise be evaluated as an expression instead of text.
    return f"""
    Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
    Use the following information as inspiration, but feel free to expand or modify:

    {context}

    Generate a question based on the following template, depending on the question type:

    1. Clinical Vignette with Pathogen Identification:
    A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?

    2. Mechanism of Pathogenesis:
    [Description of a pathogen or clinical scenario]
    Which of the following best describes the mechanism by which this organism causes disease?

    3. Laboratory Diagnosis:
    A patient presents with [symptoms]. [Description of laboratory findings or test results].
    Which of the following is the most likely diagnosis based on these laboratory findings?

    4. Antimicrobial Mechanism and Resistance:
    A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?

    5. Vaccine-Preventable Disease:
    A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?

    6. Microbial Physiology and Genetics:
    An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?

    7. Epidemiology and Transmission:
    A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?

    Include:
    1. The question based on the selected template
    2. Five possible answer options (A through E)
    3. The correct answer
    4. A brief explanation of why the correct answer is right and why the other options are incorrect
    5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.

    Format the response as a JSON object with the following keys:

    {{
        "question": "The question text",
        "options": {{
            "A": "Option A text",
            "B": "Option B text",
            "C": "Option C text",
            "D": "Option D text",
            "E": "Option E text"
        }},
        "correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
        "explanation": "The explanation text",
        "medical_reasoning": "The detailed medical reasoning text"
    }}
    """


def _parse_question_response(response_content: str):
    """Decode the model reply as JSON, tolerating Markdown fences or prose around the object."""
    try:
        return json.loads(response_content)
    except json.JSONDecodeError:
        # Replies are sometimes wrapped in ```json fences or framed by a
        # sentence; retry on the outermost {...} span before giving up.
        start = response_content.find("{")
        end = response_content.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(response_content[start:end + 1])


def generate_microbiology_question() -> Dict[str, str]:
    """Generate one NBME-style microbiology question with the Mistral chat API.

    A question type is chosen at random, a random record from the loaded CSV
    data is rendered as context, and the model is asked to answer with a JSON
    object. The MD5 hash of the returned question text is recorded (in memory
    and in 'generated_questions.txt') the first time it is seen; a repeated
    question is still returned, but a warning is logged.

    Returns:
        The parsed JSON reply. The prompt requests the keys "question",
        "options" (a nested mapping A-E), "correct_answer", "explanation"
        and "medical_reasoning".

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
        KeyError: If the reply lacks a "question" key, or the selected CSV row
            lacks one of the columns used to build the context.
    """
    question_types = [
        "clinical_vignette",
        "mechanism_of_pathogenesis",
        "laboratory_diagnosis",
        "antimicrobial_resistance",
        "vaccine_preventable_disease",
        "microbial_physiology_genetics",
        "epidemiology_transmission"
    ]
    question_type = random.choice(question_types)
    logging.info(f"Generating {question_type} question...")

    context = _build_question_context(question_type)
    prompt = _build_question_prompt(question_type, context)

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a medical educator creating unique microbiology questions for the NBME exam. Ensure each question is distinct from previously generated ones and follows the specified template."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    response_content = chat_response.choices[0].message.content
    question_data = _parse_question_response(response_content)

    # Record the question fingerprint so later runs can recognise repeats.
    question_hash = hash_question(question_data['question'])
    if question_hash in generated_questions:
        logging.warning("Generated question duplicates a previously generated one.")
    else:
        generated_questions.add(question_hash)
        save_generated_question(question_hash)

    return question_data
179
+
180
# Example usage: generate a single question and pretty-print it as JSON.
if __name__ == "__main__":
    question = generate_microbiology_question()
    # Uses the module-level `json` import; a function-local import is not
    # visible here and would make this line raise NameError.
    print(json.dumps(question, indent=2))