from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import locale

# Ensure a UTF-8 locale (needed in some notebook/container environments).
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Use bitsandbytes to load the model in 8-bit precision.
# Note: double quantization, an "nf4" quant type, and a compute dtype only
# apply to 4-bit loading; for 8-bit, load_in_8bit=True is all that is needed.
# Loading with quantization_config directly avoids a second, full-precision load.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto",
)

# Load the adapters into the quantized base model.
# PeftModel.from_pretrained takes the base model object first, then the
# directory containing the adapter weights and configuration.
adapter_config_dir = "adapter_config"
model = PeftModel.from_pretrained(model, adapter_config_dir)

# FastAPI app
app = FastAPI()

class Question(BaseModel):
    question: str

class Answer(BaseModel):
    answer: str

@app.post("/ask", response_model=Answer)
async def ask_question(question: Question):
    try:
        # Move the tokenized inputs onto the same device as the model.
        inputs = tokenizer(question.question, return_tensors="pt").to(model.device)
        # Cap generation length so a single request cannot run unbounded.
        with torch.inference_mode():
            outputs = model.generate(**inputs, max_new_tokens=256)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return Answer(answer=answer)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# To run the FastAPI app, use the following command:
# uvicorn app:app --host 0.0.0.0 --port 8000
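
# A quick way to exercise the /ask endpoint once the server is up
# (a sketch assuming the server is reachable on localhost:8000; the
# question text is just a placeholder):
#
# curl -X POST http://localhost:8000/ask \
#   -H "Content-Type: application/json" \
#   -d '{"question": "What is 8-bit quantization?"}'
#
# The response is a JSON object matching the Answer model, e.g.
# {"answer": "..."}.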