heikowagner committed on
Commit
1f84a9a
·
1 Parent(s): 8d717c1
app/VectorStore/chroma-collections.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:485e9d46361ec332e64d9b50063f7b958cfd7bf015931232033e05d34e3474d2
3
- size 712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6500348785bdf69480c86a933feaa0dd3328a9acffda71e251ca9928c6813627
3
+ size 957
app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3fd923d38dbc7773fa8ddd035a3a12b35b36c0596120795d5441fa2631aa500
3
- size 7657
 
 
 
 
app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8012c468a836e45dec5264f07e79a82dd9b0cfbd57b7db82ab3e5f87659e004
3
- size 779728
 
 
 
 
app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe883ac5dc1e9c3d5b56fe942e1fef13b990df4e9b32e59c5eb7b12bba00e7c0
3
- size 73
 
 
 
 
app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d94d83b22ad6a388ffd24e1151e31ff2b22aaee250d0a8e442f0744bc00cffda
3
- size 8970
 
 
 
 
app/app.py CHANGED
@@ -42,9 +42,9 @@ else:
42
  'Select the Documents to be used to answer your question',
43
  collections )
44
 
45
- st.write('You selected:', option)
46
 
47
- chain = load_model.create_chain(llm, collection=option, model_name="hkunlp/instructor-large ")
48
  try:
49
  query = st.text_area('Ask a question:', 'Hallo how are you today?')
50
  result = chain({"query": query})
 
42
  'Select the Documents to be used to answer your question',
43
  collections )
44
 
45
+ st.write('You selected:', option['name'])
46
 
47
+ chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'])
48
  try:
49
  query = st.text_area('Ask a question:', 'Hallo how are you today?')
50
  result = chain({"query": query})
app/exploration.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+
3
+ from utils import retrieve_collections, get_chroma_client
4
+
5
+
6
+ from load_model import load_embedding
7
+
8
+ #retrieve_collections()
9
+
10
+ client = get_chroma_client()
11
+
12
+ # %%
13
+ client.reset()
14
+ # %%
15
+ collections = tuple( [collection.name for collection in client.list_collections()] ) ## No embedding function has been set up in the collection...
16
+
17
+ ef = load_embedding("hkunlp/instructor-large")
18
+ collection="heikostest2"
19
+ client.create_collection(collection, embedding_function=ef, metadata={"loaded_docs":[]})
20
+
21
+
22
+ # %%
23
+ client.list_collections()
app/load_model.py CHANGED
@@ -97,9 +97,8 @@ def load_embedding(model_name):
97
  )
98
  return embeddings
99
 
100
- def load_vectorstore(model_name, collection):
101
  embeddings = load_embedding(model_name)
102
-
103
  client_settings = Settings(
104
  chroma_db_impl="duckdb+parquet",
105
  persist_directory=persist_directory,
@@ -110,11 +109,12 @@ def load_vectorstore(model_name, collection):
110
  embedding_function=embeddings,
111
  client_settings=client_settings,
112
  persist_directory=persist_directory,
 
113
  )
114
  return vectorstore
115
 
116
- def create_chain(_llm, collection, model_name):
117
- vectorstore = load_vectorstore(model_name, collection)
118
  retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
119
  chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
120
  return chain
 
97
  )
98
  return embeddings
99
 
100
+ def load_vectorstore(model_name, collection, metadata):
101
  embeddings = load_embedding(model_name)
 
102
  client_settings = Settings(
103
  chroma_db_impl="duckdb+parquet",
104
  persist_directory=persist_directory,
 
109
  embedding_function=embeddings,
110
  client_settings=client_settings,
111
  persist_directory=persist_directory,
112
+ collection_metadata=metadata
113
  )
114
  return vectorstore
115
 
116
+ def create_chain(_llm, collection, model_name, metadata=None):
117
+ vectorstore = load_vectorstore(model_name, collection, metadata=metadata)
118
  retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
119
  chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
120
  return chain
app/load_test.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ # %%
3
+ import os
4
+ import pathlib
5
+
6
+ from load_model import load_embedding
7
+ from utils import get_chroma_client
8
+ from load_vectors import load_from_web, create_and_add, load_and_split
9
+
10
+ collection="axaterms"
11
+ client = get_chroma_client()
12
+ # Load collection to get metadata
13
+ loaded_collection = client.get_collection(collection)
14
+
15
+ # %%
16
+ model_name = loaded_collection.metadata['model_name']
17
+
18
+ # %%
19
+ print( loaded_collection.json() )
20
+
21
+
22
+ # %%
23
+ client.get_collection(collection).json() # adding documents may destroy the collection metadata (unconfirmed)
24
+ # %%
25
+
26
+ #loaded_collection.modify(metadata={"Test":99})
27
+
28
+ # %%
29
+ loaded_collection.json()
app/load_vectors.py CHANGED
@@ -41,10 +41,10 @@ def create_collection(collection_name, model_name, client):
41
  client.get_or_create_collection(collection_name, embedding_function=ef)
42
  return True
43
 
44
- def create_and_add(collection_name, sub_docs, model_name):
45
  logging.info(f"Adding documents to {collection_name}")
46
- embeddings = load_embedding(model_name)
47
- vectorstore = load_vectorstore(model_name, collection_name)
48
  vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
49
  vectorstore.persist()
50
 
 
41
  client.get_or_create_collection(collection_name, embedding_function=ef)
42
  return True
43
 
44
+ def create_and_add(collection_name, sub_docs, model_name, metadata):
45
  logging.info(f"Adding documents to {collection_name}")
46
+ embeddings = load_embedding(model_name)
47
+ vectorstore = load_vectorstore(model_name, collection_name, metadata = metadata)
48
  vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
49
  vectorstore.persist()
50
 
app/run.py CHANGED
@@ -1,17 +1,49 @@
 
1
  # %%
2
  import os
3
  import pathlib
 
 
 
 
 
4
  current_path = str( pathlib.Path(__file__).parent.resolve() )
5
  with open(current_path+'/.openaiapikey', 'r') as reader:
6
  os.environ['OPENAI_API_KEY']=reader.read()
7
  import load_model
8
- import cloudpickle
9
 
10
  # %%
11
- # llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
12
  llm= load_model.load_openai_model()
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # %%
15
- chain = load_model.create_chain(llm, collection="heikospaper", model_name="hkunlp/instructor-large")
16
- result = chain({"query": "What are AXAs green Goals?"})
17
- print(result)
 
1
+ # This script initializes the models and adds an example collection to the vector store
2
  # %%
3
  import os
4
  import pathlib
5
+
6
+ from load_model import load_embedding
7
+ from utils import get_chroma_client
8
+ from load_vectors import load_from_web, create_and_add, load_and_split
9
+
10
  current_path = str( pathlib.Path(__file__).parent.resolve() )
11
  with open(current_path+'/.openaiapikey', 'r') as reader:
12
  os.environ['OPENAI_API_KEY']=reader.read()
13
  import load_model
 
14
 
15
  # %%
16
+ #load_model.load_gpu_model("decapoda-research/llama-7b-hf") #Download local model
17
  llm= load_model.load_openai_model()
18
 
19
+ # %%
20
+ #Load example Data
21
+ client = get_chroma_client()
22
+ client.reset()
23
+ ef = load_embedding("hkunlp/instructor-large")
24
+ collection_name="axaterms"
25
+ metadata= {"loaded_docs":[], "Subject":"AXA Terms", "model_name": ef.model_name}
26
+ selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
27
+
28
+ docs_tarifs= [
29
+ "https://www.axa.de/site/axa-de/get/documents_E1805589786/axade/medien/privatkunden/fahrzeugversicherungen/kfz-versicherung/start-and-drive/start-and-drive-versicherungsbedingungen.pdf",
30
+ "https://www.axa.de/site/axa-de/get/documents_E-298610932/axade/medien/privatkunden/haftpflicht-und-recht/rechtsschutz/versicherungsbedingungen-roland-rechtsschutz.pdf",
31
+ "https://www.axa.de/site/axa-de/get/documents_E1450059874/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-L.pdf",
32
+ "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
33
+ ]
34
+
35
+ # %%
36
+ # Load collection to get metadata
37
+ loaded_collection = client.get_collection(collection_name)
38
+ model_name = loaded_collection.metadata['model_name']
39
+
40
+ # %%
41
+
42
+ docs = load_from_web(docs_tarifs)
43
+ sub_docs = load_and_split(docs, chunk_size=1000)
44
+ create_and_add(collection_name, sub_docs, model_name, metadata)
45
+
46
  # %%
47
+ chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name)
48
+ #result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
49
+ #print(result)
app/utils.py CHANGED
@@ -4,6 +4,7 @@ from langchain.docstore.document import Document
4
  import chromadb
5
  from chromadb.config import Settings
6
  import load_model
 
7
  from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
8
  persist_directory = load_model.persist_directory
9
 
@@ -21,15 +22,18 @@ def format_result_set(result):
21
  for document in source_documents:
22
  st.write(format_document(document))
23
 
24
- @st.cache_resource
25
  def get_chroma_client():
26
  return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
27
  persist_directory=persist_directory
28
  ))
29
- @st.cache_data
30
  def retrieve_collections():
31
  client = get_chroma_client()
32
- collections = tuple( [collection.name for collection in client.list_collections()] )
 
 
 
33
  return collections
34
 
35
  def load_files():
@@ -64,7 +68,7 @@ def load_files():
64
  if st.button('Upload'):
65
  docs = load_from_file(uploaded_files)
66
  sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
67
- create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
68
  uploaded_files=None
69
  else:
70
  st.write('Urls of Source Documents (Comma separated):')
@@ -75,12 +79,14 @@ def load_files():
75
  if st.button('Upload'):
76
  docs = load_from_web(urls)
77
  sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
78
- create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
79
  uploaded_files=None
80
  else:
81
  collection = st.text_area('Name of your new collection:', '')
 
82
  if st.button('Create'):
83
  if len(collection)>3:
84
- client.create_collection(collection) #collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name) --Problem i added the model to the name -> Better use Metadata :)
 
85
  retrieve_collections.clear()
86
  st.write("Collection " +collection+" succesfully created.")
 
4
  import chromadb
5
  from chromadb.config import Settings
6
  import load_model
7
+ from load_model import load_embedding
8
  from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
9
  persist_directory = load_model.persist_directory
10
 
 
22
  for document in source_documents:
23
  st.write(format_document(document))
24
 
25
+ #@st.cache_resource
26
  def get_chroma_client():
27
  return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
28
  persist_directory=persist_directory
29
  ))
30
+ #@st.cache_data
31
  def retrieve_collections():
32
  client = get_chroma_client()
33
+ all_collections = client.list_collections()
34
+ print(all_collections)
35
+ print(all_collections[0].metadata)
36
+ collections = tuple( [{'name': collection.name, 'model_name': collection.metadata['model_name']} for collection in all_collections] )
37
  return collections
38
 
39
  def load_files():
 
68
  if st.button('Upload'):
69
  docs = load_from_file(uploaded_files)
70
  sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
71
+ create_and_add(selected_collection, sub_docs, None)
72
  uploaded_files=None
73
  else:
74
  st.write('Urls of Source Documents (Comma separated):')
 
79
  if st.button('Upload'):
80
  docs = load_from_web(urls)
81
  sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
82
+ create_and_add(selected_collection, sub_docs, None)
83
  uploaded_files=None
84
  else:
85
  collection = st.text_area('Name of your new collection:', '')
86
+ model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
87
  if st.button('Create'):
88
  if len(collection)>3:
89
+ ef = load_embedding(model_name)
90
+ client.create_collection(collection, embedding_function=ef)
91
  retrieve_collections.clear()
92
  st.write("Collection " +collection+" succesfully created.")