# import hashlib  # only needed if the commented-out content-hash id scheme below is enabled
from __future__ import annotations

import uuid
from typing import Any, Iterable, List, Optional

import numpy as np
from langchain.docstore.base import AddableMixin
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.faiss import FAISS as OriginalFAISS
from langchain.vectorstores.faiss import dependable_faiss_import

from streamlit_langchain_chat.customized_langchain.docstore.in_memory import InMemoryDocstore


class FAISS(OriginalFAISS):
    # This class is deliberately given the same name as its parent: Python
    # mangles double-underscore method names with the class name (both become
    # `_FAISS__add` / `_FAISS__from`), which is what lets these private
    # methods actually override the originals.

    def __add(
        self,
        texts: Iterable[str],
        embeddings: Iterable[List[float]],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )
        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))
        # Add to the index, the index_to_id mapping, and the docstore.
        starting_len = len(self.index_to_docstore_id)
        self.index.add(np.array(embeddings, dtype=np.float32))
        # Build (index position, docstore id, document) triples.
        full_info = [
            (starting_len + i, str(uuid.uuid4()), doc)
            for i, doc in enumerate(documents)
        ]
        # Add information to the docstore and the index-to-id mapping.
        self.docstore.add({_id: doc for _, _id, doc in full_info})
        index_to_id = {index: _id for index, _id, _ in full_info}
        self.index_to_docstore_id.update(index_to_id)
        return [_id for _, _id, _ in full_info]

    @classmethod
    def __from(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> FAISS:
        faiss = dependable_faiss_import()
        index = faiss.IndexFlatL2(len(embeddings[0]))
        index.add(np.array(embeddings, dtype=np.float32))
        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))
        index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}

        # # TODO: switch to using the content hash as the id, and work out
        # # where that would go so the chunk is not loaded into the dataset.
        # index_to_id_2 = dict()
        # for i in range(len(documents)):
        #     h = hashlib.new('sha256')
        #     text_ = documents[i].page_content
        #     h.update(text_.encode())
        #     index_to_id_2[i] = str(h.hexdigest())

        docstore = InMemoryDocstore(
            {index_to_id[i]: doc for i, doc in enumerate(documents)}
        )
        return cls(embedding.embed_query, index, docstore, index_to_id)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> FAISS:
        """Construct a FAISS wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates an in-memory docstore.
            3. Initializes the FAISS database.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain import FAISS
                from langchain.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                faiss = FAISS.from_texts(texts, embeddings)
        """
        # embeddings = embedding.embed_documents(texts)
        print(f"len(texts): {len(texts)}")  # TODO: remove
        # Embed one text at a time instead of batching the whole list.
        embeddings = [embedding.embed_documents([text])[0] for text in texts]
        print(f"len(embeddings): {len(embeddings)}")  # TODO: remove
        return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
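

# A minimal sketch of the content-hash id scheme the TODO in __from refers
# to, assuming the intent is deterministic ids derived from the chunk text
# so that re-adding the same chunk maps to the same docstore entry instead
# of a fresh uuid4. The helper name `_content_hash_id` is hypothetical, not
# a langchain API.
import hashlib


def _content_hash_id(text: str) -> str:
    # sha256 of the chunk text yields a stable, content-addressed id.
    return hashlib.sha256(text.encode()).hexdigest()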
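

if __name__ == "__main__":
    # A quick usage sketch, assuming an OpenAI API key is configured; any
    # Embeddings implementation would work. This mirrors the docstring
    # example but exercises this subclass directly.
    from langchain.embeddings import OpenAIEmbeddings

    store = FAISS.from_texts(
        ["hello world", "goodbye world"],
        OpenAIEmbeddings(),
        metadatas=[{"source": "a"}, {"source": "b"}],
    )
    print(store.similarity_search("hello", k=1))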