import os
from typing import Iterator

import gradio as gr
import spaces
import torch
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaTokenizerFast, TextIteratorStreamer

from llama_index.core import (
    ChatPromptTemplate,
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.instructor import InstructorEmbedding

from interface import GemmaLLMInterface
# Authenticate with the Hugging Face Hub using the token from the HUGGINGFACE_TOKEN environment variable.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
login(huggingface_token)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_id = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    token=True,
)
model.tokenizer = AutoTokenizer.from_pretrained(model_id)  # load the tokenizer that matches the model above
model.eval()
# Models that LlamaIndex will use for embeddings and generation:
Settings.embed_model = InstructorEmbedding(model_name="hkunlp/instructor-base")
Settings.llm = GemmaLLMInterface()
############################---------------------------------
# Sentence splitter used to chunk documents into nodes
parser = SentenceSplitter.from_defaults(
    chunk_size=256, chunk_overlap=64, paragraph_separator="\n\n"
)
def build_index():
    # Load documents from a file
    documents = SimpleDirectoryReader(input_files=["data/blockchainprova.txt"]).load_data()
    # Parse the documents into nodes
    nodes = parser.get_nodes_from_documents(documents)
    # Build the vector store index from the nodes
    index = VectorStoreIndex(nodes)
    return index
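
# NOTE: the helper below is an addition, not part of the original Space. It is a
# minimal sketch of how the imported load_index_from_storage could be used to
# persist the index to a local "./storage" directory (an assumed path) and reload
# it, instead of rebuilding the index on every request as handle_query() does.
def build_or_load_index(persist_dir: str = "./storage"):
    from llama_index.core import StorageContext

    if os.path.isdir(persist_dir):
        # Reload a previously persisted index from disk.
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        return load_index_from_storage(storage_context)
    # First run: build the index and persist it for later reuse.
    index = build_index()
    index.storage_context.persist(persist_dir=persist_dir)
    return index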
@spaces.GPU(duration=20)
def handle_query(query_str, chathistory) -> Iterator[str]:
    # Rebuild the index for every request (the source document is small).
    index = build_index()
    try:
        memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
        chat_engine = index.as_chat_engine(
            chat_mode="context",
            memory=memory,
            # System prompt (Italian). In English: "You are an Italian Q&A assistant
            # named Odi, who answers only pertinent questions or requests, precisely.
            # You have a default answer for when a user asks about you or your creator,
            # namely: 'I am a research assistant created by Osservatori Digitali'."
            system_prompt=(
                "Sei un assistente Q&A italiano di nome Odi, che risponde solo alle domande o richieste pertinenti in modo preciso. Hai una risposta predefinita per quando un utente ti chiede informazioni su di te o sul tuo creatore, ovvero: 'Sono un assistente ricercatore creato dagli Osservatori Digitali'."
            ),
        )

        outputs = []
        # Stream the answer token by token, yielding the accumulated text each time.
        response = chat_engine.stream_chat(query_str)
        # response = chat_engine.chat(query_str)  # non-streaming alternative
        for token in response.response_gen:
            if not token.startswith("system:") and not token.startswith("user:"):
                outputs.append(str(token))
                print(f"Generated token: {token}")
                yield "".join(outputs)
    except Exception as e:
        yield f"Error processing query: {str(e)}"
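
# NOTE: the Gradio wiring below is an addition, not part of the original file.
# It is a minimal sketch of how handle_query could be exposed as a chat UI,
# assuming the Space uses gr.ChatInterface (gradio is imported above but never
# used in the original snippet); the title string is illustrative.
demo = gr.ChatInterface(
    fn=handle_query,
    title="Odi - Osservatori Digitali assistant",
    chatbot=gr.Chatbot(height=500),
)

if __name__ == "__main__":
    demo.launch()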