import torch
import os
from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer, AutoTokenizer
from interface import GemmaLLMInterface
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.instructor import InstructorEmbedding
import gradio as gr
from llama_index.core import ChatPromptTemplate
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, PromptTemplate, load_index_from_storage
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import spaces
from huggingface_hub import login
from llama_index.core.memory import ChatMemoryBuffer
from typing import Iterator



# Authenticate with the Hugging Face Hub (required for the gated Gemma weights).
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
login(huggingface_token)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_id = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    token=True,
)

# Use the tokenizer that matches the Gemma 2 checkpoint loaded above.
model.tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()

# Models LlamaIndex will use for embeddings and generation:
Settings.embed_model = InstructorEmbedding(model_name="hkunlp/instructor-base")
Settings.llm = GemmaLLMInterface()

############################---------------------------------

# Get the parser
parser = SentenceSplitter.from_defaults(
    chunk_size=256, chunk_overlap=64, paragraph_separator="\n\n"
)

def build_index():
    # Load documents from a file
    documents = SimpleDirectoryReader(input_files=["data/blockchainprova.txt"]).load_data()
    # Parse the documents into nodes
    nodes = parser.get_nodes_from_documents(documents)
    # Build the vector store index from the nodes
    index = VectorStoreIndex(nodes)
    
    return index
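

# NOTE (sketch): load_index_from_storage is imported above but never used in this
# file. The optional helper below shows how it could be combined with build_index()
# so the documents are not re-embedded on every request. The persist_dir value and
# the name get_or_build_index are illustrative assumptions, not part of the original
# app, and nothing below calls this helper.
def get_or_build_index(persist_dir: str = "storage"):
    from llama_index.core import StorageContext
    if os.path.isdir(persist_dir):
        # Reload a previously persisted index instead of re-embedding.
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        return load_index_from_storage(storage_context)
    index = build_index()
    index.storage_context.persist(persist_dir=persist_dir)
    return index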


@spaces.GPU(duration=20)
def handle_query(query_str, chathistory) -> Iterator[str]:

    index = build_index()

    try:
        memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
        chat_engine = index.as_chat_engine(
            chat_mode="context",
            memory=memory,
            # Italian system prompt: "You are an Italian Q&A assistant named Odi,
            # who answers only relevant questions or requests, precisely. You have
            # a default reply for when a user asks about you or your creator:
            # 'I am a research assistant created by Osservatori Digitali'."
            system_prompt=(
                "Sei un assistente Q&A italiano di nome Odi, che risponde solo alle domande o richieste pertinenti in modo preciso. Hai una risposta predefinita per quando un utente ti chiede informazioni su di te o sul tuo creatore, ovvero: 'Sono un assistente ricercatore creato dagli Osservatori Digitali'."
            ),
        )

        outputs = []
        response = chat_engine.stream_chat(query_str)
        # response = chat_engine.chat(query_str)
        for token in response.response_gen:
            # Skip any role markers that leak into the generated stream.
            if not token.startswith("system:") and not token.startswith("user:"):
                outputs.append(str(token))
                print(f"Generated token: {token}")
                yield "".join(outputs)

    except Exception as e:
        yield f"Error processing query: {str(e)}"