import torch
import os
from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer, AutoTokenizer
from interface import GemmaLLMInterface
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.instructor import InstructorEmbedding
import gradio as gr
from llama_index.core import ChatPromptTemplate
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, PromptTemplate, load_index_from_storage
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
import spaces

# Authenticate against the Hugging Face Hub; the token is read from the environment.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
login(huggingface_token)

# Alternative llama.cpp (GGUF) backend, currently disabled:
"""
hf_hub_download(
    repo_id="google/gemma-2-2b-it-GGUF",
    filename="2b_it_v2.gguf",
    local_dir="./models",
    token=huggingface_token,
)
llm = Llama(
    model_path="models/2b_it_v2.gguf",
    flash_attn=True,
    n_gpu_layers=81,
    n_batch=1024,
    n_ctx=8192,
)
"""

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the Gemma 2 2B instruct model and its tokenizer.
model_id = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    token=True,
)
model.eval()

# Models used by LlamaIndex: Instructor embeddings and the Gemma wrapper as the LLM.
Settings.embed_model = InstructorEmbedding(model_name="hkunlp/instructor-base")
Settings.llm = GemmaLLMInterface(model=model, tokenizer=tokenizer)
# Settings.llm = llm

############################---------------------------------

@spaces.GPU(duration=120)
def handle_query(query_str, chathistory):
    # Read the documents from disk
    documents = SimpleDirectoryReader(input_files=["data/blockchainprova.txt"]).load_data()

    # Split the documents into chunks
    parser = SentenceSplitter.from_defaults(
        chunk_size=256,
        chunk_overlap=64,
        paragraph_separator="\n\n",
    )
    nodes = parser.get_nodes_from_documents(documents)

    # Build a vector store index over the chunks
    index = VectorStoreIndex(nodes)

    qa_prompt_str = (
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given the context information and not prior knowledge, "
        "answer the question: {query_str}\n"
    )

    # Text QA prompt
    chat_text_qa_msgs = [
        (
            "system",
            "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti. ",
        ),
        ("user", qa_prompt_str),
    ]
    text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)

    try:
        result = index.as_query_engine(text_qa_template=text_qa_template).query(query_str)
        response_text = result.response

        # Remove any unwanted tokens from the response
        cleaned_result = response_text.replace("", "").strip()
        yield cleaned_result
    except Exception as e:
        yield f"Error processing query: {str(e)}"
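
# Note: gradio is imported above and handle_query streams its answer as a generator,
# but no UI is built in this file. A minimal sketch of how the function could be
# exposed through gr.ChatInterface follows; the title and the __main__ guard are
# assumptions, not part of the original script.
demo = gr.ChatInterface(
    fn=handle_query,  # receives (message, history) and yields the cleaned answer
    title="Ossy",     # hypothetical title for the chat UI
)

if __name__ == "__main__":
    demo.launch()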