as-cle-bert committed · verified
Commit dfbb358 · 1 Parent(s): 823ae95

Create llama_cpp_inf.py

Files changed (1):
  1. llama_cpp_inf.py
llama_cpp_inf.py ADDED
+ ## Imports
+ from llama_cpp import Llama
+ import re
+ import json
+ from huggingface_hub import hf_hub_download
+
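+ # The imports above require the `llama-cpp-python` and `huggingface-hub` packages
+ # (e.g. `pip install llama-cpp-python huggingface-hub`).
+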
+ ## Download the GGUF model
+ model_name = "microsoft/Phi-3-mini-4k-instruct-gguf"
+ model_file = "Phi-3-mini-4k-instruct-q4.gguf"  # The specific model file we'll use in this example: a 4-bit quant, though other quantization levels are available in the model repo if preferred
+ model_path = hf_hub_download(model_name, filename=model_file)
+
+ ## Instantiate the model from the downloaded file
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=4096,  # Context length to use
+     n_threads=14,  # Number of CPU threads to use
+     n_gpu_layers=3  # Number of model layers to offload to GPU
+ )
+
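+ # Note: n_gpu_layers only takes effect if llama-cpp-python was built with GPU support
+ # (e.g. CUDA or Metal); on a CPU-only build, all layers run on the CPU.
+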
+ ## Generation kwargs
+ generation_kwargs = {
+     "max_tokens": 1024,  # Maximum number of tokens to generate
+     "stop": ["<|end|>"],  # Stop generating at Phi-3's end-of-turn token
+     "echo": False,  # Don't echo the prompt in the output
+     "top_k": 1  # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
+ }
+
+ def run_inference_lcpp(jsonstr, user_search):
+     prompt = f"""Instructions for the assistant: Starting from the URLs and the keywords derived from Google search results and provided to you in JSON format, generate a meaningful summary of the search results that satisfies the user's query.
+     URLs and keywords in JSON format: {jsonstr}.
+     User's query to satisfy: {user_search}"""
+     res = llm(prompt, **generation_kwargs)
+     response = res["choices"][0]["text"]
+     jsondict = json.loads(jsonstr)  # Parse the JSON string safely (eval would execute arbitrary code)
+     addon = "Reference websites:\n- " + '\n- '.join(list(jsondict.keys()))
+     input_string = response.replace("<|assistant|>", "") + "\n\n" + addon
+     frag_res = re.findall(r'\w+|\s+|[^\w\s]', input_string)  # Split into word, whitespace and punctuation fragments
+     for word in frag_res:
+         yield word  # Yield fragment by fragment so the caller can stream the output
+
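+ # A minimal usage sketch for run_inference_lcpp (illustrative; the JSON string below
+ # assumes the expected input shape of a dict mapping result URLs to keyword lists):
+ #
+ #   example_json = '{"https://en.wikipedia.org/wiki/Vector_database": ["vector", "ANN"]}'
+ #   for fragment in run_inference_lcpp(example_json, "What is a vector database?"):
+ #       print(fragment, end="", flush=True)
+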
+ if __name__ == "__main__":
+     prompt = """Context: A vector database, vector store or vector search engine is a database that can store vectors (fixed-length lists of numbers) along with other data items. Vector databases typically implement one or more Approximate Nearest Neighbor (ANN) algorithms,[1][2] so that one can search the database with a query vector to retrieve the closest matching database records.
+
+     Vectors are mathematical representations of data in a high-dimensional space. In this space, each dimension corresponds to a feature of the data, with the number of dimensions ranging from a few hundred to tens of thousands, depending on the complexity of the data being represented. A vector's position in this space represents its characteristics. Words, phrases, or entire documents, as well as images, audio, and other types of data, can all be vectorized; Prompt: Describe what a vector database is"""
+     res = llm(prompt, **generation_kwargs)  # res (short for result) is a dictionary
+
+     ## Unpack the generated text from the LLM response dictionary and print it
+     print(res["choices"][0]["text"])
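+
+ # Running this file directly (`python llama_cpp_inf.py`) downloads the GGUF model on
+ # first use (hf_hub_download caches it locally) and prints the model's completion.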