Update README.md
README.md (changed)
````diff
@@ -91,8 +91,12 @@ Replace `model_repo_id` and `filename` with the desired model repository ID and
 ```python
 from llama_cpp import Llama
 
-llm = Llama(
-
+llm = Llama(
+    model_path = filepath, # Download the model file first
+    n_ctx = 32768, # The max sequence length to use - note that longer sequence lengths require much more resources
+    n_threads = 8, # The number of CPU threads to use, tailor to your system and the resulting performance
+    n_gpu_layers = 35 # The number of layers to offload to GPU, if you have GPU acceleration available
+)
 # Defining the Alpaca prompt template
 alpaca_prompt = """
 ### Instruction:
@@ -118,12 +122,12 @@ output = llm(
         "In how many phases will the general elections in India be held?", # input
         "", # output - leave this blank for generation!
     ), #Alpaca Prompt
-    max_tokens=512, # Generate up to 512 tokens
-    stop=["<eos>"], #stop token
-    echo=True # Whether to echo the prompt
+    max_tokens = 512, # Generate up to 512 tokens
+    stop = ["<eos>"], #stop token
+    echo = True # Whether to echo the prompt
 )
 
-output_text=output['choices'][0]['text']
+output_text = output['choices'][0]['text']
 start_marker = "### Response:"
 end_marker = "<eos>"
 start_pos = output_text.find(start_marker) + len(start_marker)
@@ -133,6 +137,7 @@ end_pos = output_text.find(end_marker, start_pos)
 response_text = output_text[start_pos:end_pos].strip()
 
 print(response_text)
+
 ```
 
````
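Pieced together from the hunks above, the updated completion example in the README reads roughly as follows. The parts that fall outside the diff context are assumptions and are flagged in the comments: the value of `filepath`, the body of `alpaca_prompt` (assumed to be the usual three-slot Alpaca layout), and the instruction string passed to `.format(...)`.

```python
from llama_cpp import Llama

# Assumed: a GGUF file downloaded beforehand (the README obtains it from the
# model repository referenced by `model_repo_id` and `filename`).
filepath = "model.gguf"

llm = Llama(
    model_path = filepath,  # Download the model file first
    n_ctx = 32768,          # The max sequence length to use - longer sequences require much more resources
    n_threads = 8,          # The number of CPU threads to use, tailor to your system
    n_gpu_layers = 35       # The number of layers to offload to GPU, if GPU acceleration is available
)

# Defining the Alpaca prompt template (the template body below the first line
# is not visible in the diff; this is the usual three-slot layout).
alpaca_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

output = llm(
    alpaca_prompt.format(
        "Answer the question using the given input.",  # instruction (assumed; not visible in the diff)
        "In how many phases will the general elections in India be held?",  # input
        "",  # output - leave this blank for generation!
    ),  # Alpaca Prompt
    max_tokens = 512,   # Generate up to 512 tokens
    stop = ["<eos>"],   # stop token
    echo = True         # Whether to echo the prompt
)

# Extract the generated answer from the echoed prompt plus completion.
output_text = output['choices'][0]['text']
start_marker = "### Response:"
end_marker = "<eos>"
start_pos = output_text.find(start_marker) + len(start_marker)
end_pos = output_text.find(end_marker, start_pos)
response_text = output_text[start_pos:end_pos].strip()

print(response_text)
```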
#### Simple llama-cpp-python Chat Completion API Example Code
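The diff ends before the example that belongs under this heading, so it is not reproduced here. As a minimal sketch, llama-cpp-python's chat-completion API can be called on the same `llm` instance roughly as below; the message contents are illustrative, not taken from the README.

```python
# Minimal chat-completion sketch (illustrative; not the README's exact example).
response = llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "In how many phases will the general elections in India be held?"}
    ],
    max_tokens = 512  # Generate up to 512 tokens
)

# The result follows the OpenAI-style chat completion layout.
print(response["choices"][0]["message"]["content"])
```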