Update README.md
Browse files
README.md
CHANGED
@@ -33,10 +33,7 @@ datasets:
|
|
33 |
max_seq_length = 4096
|
34 |
dtype = None
|
35 |
load_in_4bit = True # Use 4bit quantization to reduce memory usage.
|
36 |
-
```
|
37 |
-
|
38 |
|
39 |
-
```py
|
40 |
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
41 |
|
42 |
### Instruction:
|
@@ -49,31 +46,53 @@ alpaca_prompt = """Below is an instruction that describes a task, paired with an
|
|
49 |
{}"""
|
50 |
```
|
51 |
|
|
|
52 |
```py
|
53 |
-
|
54 |
-
|
55 |
-
model, tokenizer = FastLanguageModel.from_pretrained(
|
56 |
model_name = "Svngoku/Llama-3.1-8B-AlpaCare-MedInstruct",
|
57 |
max_seq_length = max_seq_length,
|
58 |
dtype = dtype,
|
59 |
load_in_4bit = load_in_4bit,
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
# alpaca_prompt = You MUST copy from above!
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
77 |
```
|
78 |
|
79 |
This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
|
|
|
33 |
max_seq_length = 4096
|
34 |
dtype = None
|
35 |
load_in_4bit = True # Use 4bit quantization to reduce memory usage.
|
|
|
|
|
36 |
|
|
|
37 |
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
38 |
|
39 |
### Instruction:
|
|
|
46 |
{}"""
|
47 |
```
|
48 |
|
49 |
+
|
50 |
```py
|
51 |
+
from unsloth import FastLanguageModel
|
52 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
|
53 |
model_name = "Svngoku/Llama-3.1-8B-AlpaCare-MedInstruct",
|
54 |
max_seq_length = max_seq_length,
|
55 |
dtype = dtype,
|
56 |
load_in_4bit = load_in_4bit,
|
57 |
+
)
|
58 |
+
FastLanguageModel.for_inference(model)
|
59 |
+
```
|
|
|
60 |
|
61 |
+
```py
|
62 |
+
def generate_medical_answer(input: str = "", instruction: str = ""):
|
63 |
+
inputs = tokenizer(
|
64 |
+
[
|
65 |
+
alpaca_prompt.format(
|
66 |
+
instruction,
|
67 |
+
input,
|
68 |
+
"",
|
69 |
+
)
|
70 |
+
], return_tensors = "pt").to("cuda")
|
71 |
+
text_streamer = TextStreamer(tokenizer)
|
72 |
+
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 800)
|
73 |
+
# Generate the response
|
74 |
+
output = model.generate(**inputs, max_new_tokens=1024)
|
75 |
+
|
76 |
+
# Decode the generated response
|
77 |
+
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
|
78 |
+
|
79 |
+
# Extract the response part if needed (assuming the response starts after "### Response:")
|
80 |
+
response_start = generated_text.find("### Response:") + len("### Response:")
|
81 |
+
response = generated_text[response_start:].strip()
|
82 |
+
|
83 |
+
# Format the response in Markdown
|
84 |
+
# markdown_response = f"{response}"
|
85 |
+
|
86 |
+
# Render the markdown response
|
87 |
+
# display(Markdown(markdown_response))
|
88 |
+
return response
|
89 |
+
```
|
90 |
|
91 |
+
```py
|
92 |
+
generate_medical_answer(
|
93 |
+
instruction = "What are the pharmacodynamics of Omeprazole?",
|
94 |
+
input="Write the text in plain markdown."
|
95 |
+
)
|
96 |
```
|
97 |
|
98 |
This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
|