lfoppiano committed
Commit 0188e45
1 Parent(s): a5de09e

add query analyzer with min and avg similarity

document_qa/document_qa_engine.py CHANGED
@@ -1,35 +1,23 @@
 import copy
 import os
 from pathlib import Path
-from typing import Union, Any, Optional, List, Dict, Tuple, ClassVar, Collection
+from typing import Union, Any, List

 import tiktoken
 from langchain.chains import create_extraction_chain
 from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
     map_rerank_prompt
+from langchain.evaluation import PairwiseEmbeddingDistanceEvalChain, load_evaluator, EmbeddingDistance
 from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
 from langchain.retrievers import MultiQueryRetriever
 from langchain.schema import Document
-from langchain_community.vectorstores.chroma import Chroma, DEFAULT_K
-from langchain_community.vectorstores.faiss import FAISS
-from langchain_core.callbacks import CallbackManagerForRetrieverRun
-from langchain_core.utils import xor_args
-from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
+from langchain_community.vectorstores.chroma import Chroma
+from langchain_core.vectorstores import VectorStore
 from tqdm import tqdm

+# from document_qa.embedding_visualiser import QueryVisualiser
 from document_qa.grobid_processors import GrobidProcessor
-
-
-def _results_to_docs_scores_and_embeddings(results: Any) -> List[Tuple[Document, float, List[float]]]:
-    return [
-        (Document(page_content=result[0], metadata=result[1] or {}), result[2], result[3])
-        for result in zip(
-            results["documents"][0],
-            results["metadatas"][0],
-            results["distances"][0],
-            results["embeddings"][0],
-        )
-    ]
+from document_qa.langchain import ChromaAdvancedRetrieval


 class TextMerger:
@@ -117,135 +105,6 @@ class BaseRetrieval:
         self.persist_directory = persist_directory


-class AdvancedVectorStoreRetriever(VectorStoreRetriever):
-    allowed_search_types: ClassVar[Collection[str]] = (
-        "similarity",
-        "similarity_score_threshold",
-        "mmr",
-        "similarity_with_embeddings"
-    )
-
-    def _get_relevant_documents(
-            self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
-        if self.search_type == "similarity":
-            docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
-        elif self.search_type == "similarity_score_threshold":
-            docs_and_similarities = (
-                self.vectorstore.similarity_search_with_relevance_scores(
-                    query, **self.search_kwargs
-                )
-            )
-            for doc, similarity in docs_and_similarities:
-                if '__similarity' not in doc.metadata.keys():
-                    doc.metadata['__similarity'] = similarity
-
-            docs = [doc for doc, _ in docs_and_similarities]
-        elif self.search_type == "mmr":
-            docs = self.vectorstore.max_marginal_relevance_search(
-                query, **self.search_kwargs
-            )
-        elif self.search_type == "similarity_with_embeddings":
-            docs_scores_and_embeddings = (
-                self.vectorstore.advanced_similarity_search(
-                    query, **self.search_kwargs
-                )
-            )
-
-            for doc, score, embeddings in docs_scores_and_embeddings:
-                if '__embeddings' not in doc.metadata.keys():
-                    doc.metadata['__embeddings'] = embeddings
-                if '__similarity' not in doc.metadata.keys():
-                    doc.metadata['__similarity'] = score
-
-            docs = [doc for doc, _, _ in docs_scores_and_embeddings]
-        else:
-            raise ValueError(f"search_type of {self.search_type} not allowed.")
-        return docs
-
-
-class AdvancedVectorStore(VectorStore):
-    def as_retriever(self, **kwargs: Any) -> AdvancedVectorStoreRetriever:
-        tags = kwargs.pop("tags", None) or []
-        tags.extend(self._get_retriever_tags())
-        return AdvancedVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
-
-
-class ChromaAdvancedRetrieval(Chroma, AdvancedVectorStore):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    @xor_args(("query_texts", "query_embeddings"))
-    def __query_collection(
-            self,
-            query_texts: Optional[List[str]] = None,
-            query_embeddings: Optional[List[List[float]]] = None,
-            n_results: int = 4,
-            where: Optional[Dict[str, str]] = None,
-            where_document: Optional[Dict[str, str]] = None,
-            **kwargs: Any,
-    ) -> List[Document]:
-        """Query the chroma collection."""
-        try:
-            import chromadb  # noqa: F401
-        except ImportError:
-            raise ValueError(
-                "Could not import chromadb python package. "
-                "Please install it with `pip install chromadb`."
-            )
-        return self._collection.query(
-            query_texts=query_texts,
-            query_embeddings=query_embeddings,
-            n_results=n_results,
-            where=where,
-            where_document=where_document,
-            **kwargs,
-        )
-
-    def advanced_similarity_search(
-            self,
-            query: str,
-            k: int = DEFAULT_K,
-            filter: Optional[Dict[str, str]] = None,
-            **kwargs: Any,
-    ) -> [List[Document], float, List[float]]:
-        docs_scores_and_embeddings = self.similarity_search_with_scores_and_embeddings(query, k, filter=filter)
-        return docs_scores_and_embeddings
-
-    def similarity_search_with_scores_and_embeddings(
-            self,
-            query: str,
-            k: int = DEFAULT_K,
-            filter: Optional[Dict[str, str]] = None,
-            where_document: Optional[Dict[str, str]] = None,
-            **kwargs: Any,
-    ) -> List[Tuple[Document, float, List[float]]]:
-
-        if self._embedding_function is None:
-            results = self.__query_collection(
-                query_texts=[query],
-                n_results=k,
-                where=filter,
-                where_document=where_document,
-                include=['metadatas', 'documents', 'embeddings', 'distances']
-            )
-        else:
-            query_embedding = self._embedding_function.embed_query(query)
-            results = self.__query_collection(
-                query_embeddings=[query_embedding],
-                n_results=k,
-                where=filter,
-                where_document=where_document,
-                include=['metadatas', 'documents', 'embeddings', 'distances']
-            )
-
-        return _results_to_docs_scores_and_embeddings(results)
-
-
-class FAISSAdvancedRetrieval(FAISS):
-    pass
-
-
 class NER_Retrival(VectorStore):
     """
     This class implement a retrieval based on NER models.
@@ -256,7 +115,6 @@ class NER_Retrival(VectorStore):

 engines = {
     'chroma': ChromaAdvancedRetrieval,
-    'faiss': FAISSAdvancedRetrieval,
     'ner': NER_Retrival
 }

@@ -409,7 +267,7 @@ class DocumentQAEngine:
         context_as_text = [doc.page_content for doc in documents]
         return context_as_text, coordinates

-    def query_storage_and_embeddings(self, query: str, doc_id, context_size=4):
+    def query_storage_and_embeddings(self, query: str, doc_id, context_size=4) -> List[Document]:
         """
         Returns both the context and the embedding information from a given query
         """
@@ -417,10 +275,35 @@
         retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
         relevant_documents = retriever.get_relevant_documents(query)

-        context_as_text = [doc.page_content for doc in relevant_documents]
-        return context_as_text
+        return relevant_documents
+
+    def analyse_query(self, query, doc_id, context_size=4):
+        db = self.data_storage.embeddings_dict[doc_id]
+        # retriever = db.as_retriever(
+        #     search_kwargs={"k": context_size, 'score_threshold': 0.0},
+        #     search_type="similarity_score_threshold"
+        # )
+        retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
+        relevant_documents = retriever.get_relevant_documents(query)
+        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
+                                         for doc in
+                                         relevant_documents]
+        all_documents = db.get(include=['documents', 'metadatas', 'embeddings'])
+        # all_documents_embeddings = all_documents["embeddings"]
+        # query_embedding = db._embedding_function.embed_query(query)
+
+        # distance_evaluator = load_evaluator("pairwise_embedding_distance",
+        #                                     embeddings=db._embedding_function,
+        #                                     distance_metric=EmbeddingDistance.EUCLIDEAN)

-        # chroma_collection.get(include=['embeddings'])['embeddings']
+        # distance_evaluator.evaluate_string_pairs(query=query_embedding, documents="")
+
+        similarities = [doc.metadata['__similarity'] for doc in relevant_documents]
+        min_similarity = min(similarities)
+        mean_similarity = sum(similarities) / len(similarities)
+        coefficient = min_similarity - mean_similarity
+
+        return f"Coefficient: {coefficient}, (Min similarity {min_similarity}, Mean similarity: {mean_similarity})", relevant_document_coordinates

     def _parse_json(self, response, output_parser):
         system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
@@ -444,10 +327,7 @@ class DocumentQAEngine:
         return parsed_output

     def _run_query(self, doc_id, query, context_size=4) -> (List[Document], list):
-        relevant_documents = self._get_context(doc_id, query, context_size)
-        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
-                                         for doc in
-                                         relevant_documents]
+        relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
         response = self.chain.run(input_documents=relevant_documents,
                                   question=query)
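The new analyse_query method condenses retrieval quality into a single coefficient: the minimum similarity of the top-k retrieved chunks minus their mean similarity. A minimal sketch of that computation on invented scores (the numbers below are illustrative only, not produced by the engine):

# Illustrative only: similarity values as analyse_query reads them from
# doc.metadata['__similarity']; the numbers are made up for this example.
similarities = [0.82, 0.74, 0.69, 0.55]

min_similarity = min(similarities)                       # 0.55
mean_similarity = sum(similarities) / len(similarities)  # 0.70
coefficient = min_similarity - mean_similarity           # -0.15

# The coefficient is never positive (the minimum cannot exceed the mean);
# values near zero indicate uniformly scored chunks, while strongly negative
# values flag a weak outlier among the retrieved passages.
print(f"Coefficient: {coefficient}, (Min similarity {min_similarity}, Mean similarity: {mean_similarity})")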
document_qa/langchain.py ADDED
@@ -0,0 +1,141 @@
+from pathlib import Path
+from typing import Any, Optional, List, Dict, Tuple, ClassVar, Collection
+
+from langchain.schema import Document
+from langchain_community.vectorstores.chroma import Chroma, DEFAULT_K
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.utils import xor_args
+from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
+
+
+class AdvancedVectorStoreRetriever(VectorStoreRetriever):
+    allowed_search_types: ClassVar[Collection[str]] = (
+        "similarity",
+        "similarity_score_threshold",
+        "mmr",
+        "similarity_with_embeddings"
+    )
+
+    def _get_relevant_documents(
+            self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        if self.search_type == "similarity_with_embeddings":
+            docs_scores_and_embeddings = (
+                self.vectorstore.advanced_similarity_search(
+                    query, **self.search_kwargs
+                )
+            )
+
+            for doc, score, embeddings in docs_scores_and_embeddings:
+                if '__embeddings' not in doc.metadata.keys():
+                    doc.metadata['__embeddings'] = embeddings
+                if '__similarity' not in doc.metadata.keys():
+                    doc.metadata['__similarity'] = score
+
+            docs = [doc for doc, _, _ in docs_scores_and_embeddings]
+        elif self.search_type == "similarity_score_threshold":
+            docs_and_similarities = (
+                self.vectorstore.similarity_search_with_relevance_scores(
+                    query, **self.search_kwargs
+                )
+            )
+            for doc, similarity in docs_and_similarities:
+                if '__similarity' not in doc.metadata.keys():
+                    doc.metadata['__similarity'] = similarity
+
+            docs = [doc for doc, _ in docs_and_similarities]
+        else:
+            docs = super()._get_relevant_documents(query, run_manager=run_manager)
+
+        return docs
+
+
+class AdvancedVectorStore(VectorStore):
+    def as_retriever(self, **kwargs: Any) -> AdvancedVectorStoreRetriever:
+        tags = kwargs.pop("tags", None) or []
+        tags.extend(self._get_retriever_tags())
+        return AdvancedVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
+
+
+class ChromaAdvancedRetrieval(Chroma, AdvancedVectorStore):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @xor_args(("query_texts", "query_embeddings"))
+    def __query_collection(
+            self,
+            query_texts: Optional[List[str]] = None,
+            query_embeddings: Optional[List[List[float]]] = None,
+            n_results: int = 4,
+            where: Optional[Dict[str, str]] = None,
+            where_document: Optional[Dict[str, str]] = None,
+            **kwargs: Any,
+    ) -> List[Document]:
+        """Query the chroma collection."""
+        try:
+            import chromadb  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "Could not import chromadb python package. "
+                "Please install it with `pip install chromadb`."
+            )
+        return self._collection.query(
+            query_texts=query_texts,
+            query_embeddings=query_embeddings,
+            n_results=n_results,
+            where=where,
+            where_document=where_document,
+            **kwargs,
+        )
+
+    def advanced_similarity_search(
+            self,
+            query: str,
+            k: int = DEFAULT_K,
+            filter: Optional[Dict[str, str]] = None,
+            **kwargs: Any,
+    ) -> [List[Document], float, List[float]]:
+        docs_scores_and_embeddings = self.similarity_search_with_scores_and_embeddings(query, k, filter=filter)
+        return docs_scores_and_embeddings
+
+    def similarity_search_with_scores_and_embeddings(
+            self,
+            query: str,
+            k: int = DEFAULT_K,
+            filter: Optional[Dict[str, str]] = None,
+            where_document: Optional[Dict[str, str]] = None,
+            **kwargs: Any,
+    ) -> List[Tuple[Document, float, List[float]]]:
+
+        if self._embedding_function is None:
+            results = self.__query_collection(
+                query_texts=[query],
+                n_results=k,
+                where=filter,
+                where_document=where_document,
+                include=['metadatas', 'documents', 'embeddings', 'distances']
+            )
+        else:
+            query_embedding = self._embedding_function.embed_query(query)
+            results = self.__query_collection(
+                query_embeddings=[query_embedding],
+                n_results=k,
+                where=filter,
+                where_document=where_document,
+                include=['metadatas', 'documents', 'embeddings', 'distances']
+            )
+
+        return _results_to_docs_scores_and_embeddings(results)
+
+
+def _results_to_docs_scores_and_embeddings(results: Any) -> List[Tuple[Document, float, List[float]]]:
+    return [
+        (Document(page_content=result[0], metadata=result[1] or {}), result[2], result[3])
+        for result in zip(
+            results["documents"][0],
+            results["metadatas"][0],
+            results["distances"][0],
+            results["embeddings"][0],
+        )
+    ]
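As a usage sketch of the relocated retrieval classes (the embedding model, sample texts, and collection name below are placeholders chosen for illustration, not part of this commit), the "similarity_with_embeddings" search type attaches the score and the stored vector to each document's metadata:

# Hypothetical example; ChromaAdvancedRetrieval comes from this new module,
# everything else (model, texts, collection name) is invented for illustration.
from langchain_community.embeddings import HuggingFaceEmbeddings
from document_qa.langchain import ChromaAdvancedRetrieval

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = ChromaAdvancedRetrieval.from_texts(
    ["Perovskite solar cells degrade under humidity.",
     "GROBID extracts structured text from PDF documents."],
    embedding=embeddings,
    collection_name="example",
)

retriever = db.as_retriever(search_kwargs={"k": 2}, search_type="similarity_with_embeddings")
for doc in retriever.get_relevant_documents("What affects solar cell stability?"):
    # '__similarity' and '__embeddings' are filled in by AdvancedVectorStoreRetriever
    print(doc.metadata['__similarity'], len(doc.metadata['__embeddings']))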
requirements.txt CHANGED
@@ -24,4 +24,6 @@ typing-inspect==0.9.0
 typing_extensions==4.11.0
 pydantic==2.6.4
 sentence_transformers==2.6.1
-streamlit-pdf-viewer
+streamlit-pdf-viewer
+umap-learn
+plotly