hlydecker's picture
Duplicate from hlydecker/Augmented-Retrieval-qa-ChatGPT
1ce95c4
raw
history blame contribute delete
No virus
3.91 kB
# import hashlib
from langchain.vectorstores.faiss import *
from langchain.vectorstores.faiss import FAISS as OriginalFAISS
from streamlit_langchain_chat.customized_langchain.docstore.in_memory import InMemoryDocstore
class FAISS(OriginalFAISS):
    """FAISS vector store that keeps its documents in the project's docstore.

    Subclass of langchain's ``FAISS`` whose ``from_texts`` constructor embeds
    the texts one call at a time and stores the resulting documents in the
    ``InMemoryDocstore`` imported from ``streamlit_langchain_chat``.

    NOTE(review): ``__add`` and ``__from`` are name-mangled with the class
    name (``_FAISS__add`` / ``_FAISS__from``). The parent class is imported
    from langchain under the name ``FAISS`` as well, so these definitions
    presumably replace the parent's private methods of the same name; that
    only holds while this class keeps the name ``FAISS``. Confirm against the
    installed langchain version before renaming anything here.
    """

    def __add(
        self,
        texts: Iterable[str],
        embeddings: Iterable[List[float]],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add texts and their precomputed embeddings to the store.

        Args:
            texts: Texts to store, in the same order as ``embeddings``.
            embeddings: One embedding vector per text.
            metadatas: Optional metadata dicts, looked up by text position.
                When omitted (or empty) every document gets ``{}``.
            **kwargs: Accepted for signature compatibility; unused here.

        Returns:
            The newly generated docstore ids, in input order.

        Raises:
            ValueError: If the docstore is not an ``AddableMixin``.
            IndexError: If ``metadatas`` is non-empty but shorter than
                ``texts``.
        """
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )

        # Build every document before touching the index, so a bad
        # ``metadatas`` list fails without leaving the index half-updated.
        documents = [
            Document(
                page_content=text,
                metadata=metadatas[i] if metadatas else {},
            )
            for i, text in enumerate(texts)
        ]

        # New vectors are numbered from the current size of the id mapping.
        starting_len = len(self.index_to_docstore_id)
        # ``embeddings`` is only promised to be iterable; materialise it so a
        # generator is converted element-wise instead of failing in np.array.
        self.index.add(np.array(list(embeddings), dtype=np.float32))

        # One fresh random id per document, registered in both the docstore
        # and the index-position -> id mapping.
        new_ids = [str(uuid.uuid4()) for _ in documents]
        self.docstore.add(dict(zip(new_ids, documents)))
        self.index_to_docstore_id.update(
            {
                starting_len + offset: doc_id
                for offset, doc_id in enumerate(new_ids)
            }
        )
        return new_ids

    @classmethod
    def __from(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "FAISS":
        """Build a store from texts and their precomputed embeddings.

        Args:
            texts: Texts to store, in the same order as ``embeddings``.
            embeddings: One embedding vector per text. The index
                dimensionality is taken from the first vector.
            embedding: Embeddings object; its ``embed_query`` is handed to
                the constructor.
            metadatas: Optional metadata dicts, looked up by text position.
                When omitted (or empty) every document gets ``{}``.
            **kwargs: Accepted for signature compatibility; unused here.

        Returns:
            A new instance wrapping a flat L2 index and an
            ``InMemoryDocstore``.

        Raises:
            IndexError: If ``embeddings`` is empty, or if ``metadatas`` is
                non-empty but shorter than ``texts``.
        """
        faiss = dependable_faiss_import()
        index = faiss.IndexFlatL2(len(embeddings[0]))
        index.add(np.array(embeddings, dtype=np.float32))

        documents = [
            Document(
                page_content=text,
                metadata=metadatas[i] if metadatas else {},
            )
            for i, text in enumerate(texts)
        ]

        # TODO: derive the ids from a content hash (e.g. sha256 of
        # ``page_content``) instead of random UUIDs, and decide where that
        # should happen so that a chunk is not loaded into the dataset.
        doc_ids = [str(uuid.uuid4()) for _ in documents]
        index_to_id = dict(enumerate(doc_ids))
        docstore = InMemoryDocstore(dict(zip(doc_ids, documents)))
        return cls(embedding.embed_query, index, docstore, index_to_id)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "FAISS":
        """Construct FAISS wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the FAISS database

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain import FAISS
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                faiss = FAISS.from_texts(texts, embeddings)
        """
        # NOTE(review): each text is embedded in its own call instead of one
        # batched ``embedding.embed_documents(texts)``. This looks deliberate
        # (the batched call had been commented out) -- confirm before
        # switching back to batching.
        embeddings = [embedding.embed_documents([text])[0] for text in texts]
        return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)