import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use the GPU when available; CPU inference for a model this size will be slow.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print("Warning: CUDA is not available. Running on CPU, which may be slow.")

# Load the tokenizer and the model in 8-bit precision. device_map="auto" already places
# the quantized weights on the available device, so no extra .to(device) call is needed
# (moving an 8-bit model with .to() raises an error).
model_name = "ruslanmv/ai-medical-model-32bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
)
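
# Note: recent transformers releases deprecate passing load_in_8bit directly and expect a
# BitsAndBytesConfig instead. A minimal sketch of that variant (assuming bitsandbytes is
# installed) would be:
#
#     from transformers import BitsAndBytesConfig
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         device_map="auto",
#         quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#     )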


def ask_medical_question(question):
    # Build a Llama-3-style prompt: a system instruction followed by the user's question.
    prompt = f"<|start_header_id|>system<|end_header_id|> You are a Medical AI chatbot assistant. <|eot_id|><|start_header_id|>User: <|end_header_id|>This is the question: {question}<|eot_id|>"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        top_k=30,
    )
    # Decode only the newly generated tokens so the prompt is not echoed in the answer.
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )
    return response


# Expose the chatbot through a simple text-in / text-out Gradio interface.
iface = gr.Interface(fn=ask_medical_question, inputs="text", outputs="text")
iface.launch()
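# Usage note: iface.launch(share=True) would additionally create a temporary public link,
# which is convenient for testing the demo from another machine.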