import gradio as gr
import torch
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "inception-mbzuai/jais-13b-chat"

# Instruction templates expected by the Jais chat model (English and Arabic variants).
prompt_eng = "### Instruction: \n\nComplete the conversation below between [|Human|] and [|AI|]:\n### Input: [|Human|] {Question}\n### Response: [|AI|]"
prompt_ar = "### Instruction: \n\nأكمل المحادثة أدناه بين [|Human|] و [|AI|]:\n### Input: [|Human|] {Question}\n### Response: [|AI|]"

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Alternative loading strategies, kept for reference:
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, load_in_8bit=True, device_map="auto", trust_remote_code=True
)
model = prepare_model_for_kbit_training(model)


def get_response(text, tokenizer=tokenizer, model=model):
    """Generate a completion for an already-formatted Jais chat prompt."""
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    inputs = input_ids.to(device)
    input_len = inputs.shape[-1]
    generate_ids = model.generate(
        inputs,
        top_p=0.9,
        temperature=0.3,
        max_length=2048 - input_len,
        min_length=input_len + 4,
        repetition_penalty=1.2,
        do_sample=True,
    )
    response = tokenizer.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]
    # Keep only the model's reply, i.e. the text after the response marker
    # (splitting without indexing would return a list instead of a string).
    response = response.split("### Response: [|AI|]")[-1]
    return response


def greet():
    # Read a question from stdin, wrap it in the Arabic prompt template, and generate a reply.
    ques = input()
    text = prompt_ar.format_map({"Question": ques})
    return get_response(text)
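
# The script imports gradio but never builds or launches an interface. Below is a
# minimal sketch of how the pieces above could be exposed through a Gradio UI,
# assuming the intended flow is: user question -> Arabic prompt template -> get_response.
# The wrapper name `ask`, the textbox labels, and the title are placeholders introduced
# here for illustration; they are not part of the original script.

def ask(question: str) -> str:
    # Format the user's question with the Arabic instruction template and generate a reply.
    text = prompt_ar.format_map({"Question": question})
    return get_response(text)


demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(label="Question"),
    outputs=gr.Textbox(label="Answer"),
    title="Jais 13B Chat",
)

if __name__ == "__main__":
    demo.launch()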