from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Load the model: download the GGUF weights from the Hugging Face Hub and
# keep the local path so it can be passed to the Llama loader below
model_path = hf_hub_download(
    repo_id="bevangelista/Llama-2-7b-chat-hf-GGUF-Q4_K_M",
    filename="Llama-2-7b-chat-hf-GGUF-Q4_K_M.gguf",
)

app = FastAPI()


@app.get("/")
def llama():
    llm = Llama(
        model_path=model_path,  # Path returned by hf_hub_download above
        # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
        # seed=1337,        # Uncomment to set a specific seed
        # n_ctx=2048,       # Uncomment to increase the context window
    )
    output = llm(
        "Q: Name the planets in the solar system? A: ",  # Prompt
        max_tokens=32,      # Generate up to 32 tokens; set to None to generate up to the end of the context window
        stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
        echo=True,          # Echo the prompt back in the output
    )  # Generate a completion; can also call create_completion
    return {"output": output}
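
# Usage sketch (not part of the original snippet): launch the API with
# uvicorn when this file is executed directly. Assumes the file is saved
# as main.py and that uvicorn is installed (`pip install uvicorn`); the
# host/port values are illustrative. Once running, the endpoint can be
# queried with e.g. `curl http://127.0.0.1:8000/`.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="127.0.0.1", port=8000)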