emar committed
Commit
6c79873
1 Parent(s): 4b52a00

reduce to simplicity?

Files changed (1)
  app.py  +18 -42
app.py CHANGED
@@ -1,67 +1,43 @@
-import spaces
+import os
 import gradio as gr
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.core import StorageContext, load_index_from_storage, Settings
+from llama_index.core import (
+    StorageContext,
+    load_index_from_storage, Settings,
+)
 from llama_index.llms.huggingface import HuggingFaceLLM
 import torch
-from pydantic import BaseModel
-
 PERSIST_DIR = './storage'
 
 # Configure the settings
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Pydantic config to avoid protected namespace warning
-class Config(BaseModel):
-    model_config = {'protected_namespaces': ()}
-
-# @spaces.GPU(duration=240)
-def setup():
-    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device=DEVICE)
-    Settings.llm = HuggingFaceLLM(
-        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        context_window=2048,
-        max_new_tokens=256,
-        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
-        device_map="auto",
-    )
 
-setup()
+Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device="cpu")
 
-# Load the existing index
-# @spaces.GPU
-def load_context():
-    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
-    index = load_index_from_storage(storage_context)
-    query_engine = index.as_query_engine()
-    return query_engine
-
-query_engine = None
+Settings.llm = HuggingFaceLLM(
+    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    context_window=2048,
+    max_new_tokens=256,
+    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+    device_map="auto",
+)
 
-def initialize_query_engine():
-    global query_engine
-    query_engine = load_context()
+storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
+index = load_index_from_storage(storage_context)
+query_engine = index.as_query_engine()
 
-# Initialize query engine at the start
-initialize_query_engine()
 
-# Chatbot response function
 @spaces.GPU
 def chatbot_response(message, history):
-    if query_engine is None:
-        initialize_query_engine()
     response = query_engine.query(message)
     return str(response)
 
-# Initialize Gradio interface
 iface = gr.ChatInterface(
     fn=chatbot_response,
     title="UESP Lore Chatbot: CPU bound version of Phi-3-mini",
-    description=(
-        "Low quality and extremely slow version of the ones you can find on the github page: "
-        "https://github.com/emarron/UESP-lore. I am not paying to have Llama3 on here."
-    ),
+    description="Low quality and extremely slow version of the ones you can find on the github page.: https://github.com/emarron/UESP-lore I am not paying to have Llama3 on here.",
    examples=["Who is Zaraphus?"],
    cache_examples=True,
 )
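
Note on the new app.py: the commit drops the `import spaces` line (replacing it with `import os`, which is not used anywhere in the file) while keeping the `@spaces.GPU` decorator on `chatbot_response`, so the module as committed would raise `NameError: name 'spaces' is not defined` when the Space starts. A minimal sketch of a possible follow-up, assuming the app is still meant to run on a ZeroGPU Space (the decorated function body is unchanged; the rest of the file stays as in this commit):

# sketch of a follow-up fix, not part of this commit
import spaces          # restores the name used by the @spaces.GPU decorator below
import gradio as gr    # unchanged
# ... remaining imports, Settings, and index/query_engine setup as in the new app.py ...

@spaces.GPU            # resolves only if `spaces` is imported; otherwise NameError at import time
def chatbot_response(message, history):
    response = query_engine.query(message)
    return str(response)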