# Source: transformers-chat / ingest.py
# Last commit: "Update ingest.py" (b00f9c3) by enoreyes
# File size: 1.45 kB
"""Load Markdown files, clean them up, split them, and ingest them into a FAISS index."""
import os
from pathlib import Path
from markdown import markdown
import pickle
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from InstructorEmbedding import INSTRUCTOR
# Fail fast with a KeyError when the key is not configured, but never echo the
# secret itself: anything written to stdout can end up in build or hosting logs.
# NOTE(review): the variable name is spelled "HUGGINFACE" (missing a "G"); it is
# kept as-is so existing deployments keep working -- confirm before renaming.
_ = os.environ["HUGGINFACE_APIKEY"]
print("HUGGINFACE_APIKEY is set")
def clean_data(data: str) -> str:
    """Convert Markdown source to plain text with blank lines removed.

    The Markdown is rendered to HTML, the markup is discarded by collecting
    the parsed document's text nodes, and empty or whitespace-only lines are
    dropped.

    Args:
        data: Raw Markdown (or MDX) file contents.

    Returns:
        The remaining text lines joined with newlines.
    """
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # `find_all(string=True)` is the current spelling of the legacy
    # `findAll(text=True)` call and selects the same text nodes.
    text = "".join(soup.find_all(string=True))
    # `strip()` so that lines holding only spaces or tabs count as blank too.
    return "\n".join(line for line in text.split("\n") if line.strip())
# Parallel lists: docs[i] is the cleaned text of a file and metadatas[i]
# records where it came from.
docs = []
metadatas = []
# Sort the traversal so files are ingested in the same order on every run;
# `rglob` yields entries in an arbitrary, filesystem-dependent order.
for p in sorted(Path("docs").rglob("*")):
    if p.is_dir():
        continue
    if str(p).lower().endswith(('.md', '.mdx')):
        # Decode explicitly as UTF-8 instead of the platform's locale
        # encoding, so non-ASCII docs are read the same way on every machine.
        with open(p, encoding="utf-8") as f:
            print(p)
            docs.append(clean_data(f.read()))
        # "source" is the file path with its extension removed.
        filename = os.path.splitext(p)[0]
        metadatas.append({"source": filename})
# Splitter configuration: newline separator, chunk size 512, overlap 64,
# with lengths measured by the built-in `len`.
splitter_settings = {
    "separator": "\n",
    "chunk_size": 512,
    "chunk_overlap": 64,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)

# Split every cleaned text, passing the per-file metadata list alongside it.
documents = text_splitter.create_documents(docs, metadatas=metadatas)
# Step 1: create the embedding object (library default configuration).
print("making embedding")
embedding = HuggingFaceEmbeddings()

# Step 2: build the FAISS vector store from the split documents.
print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)

# Step 3: serialize the whole vector store to docs.pkl in the working directory.
print("beginning pickle")
with Path("docs.pkl").open("wb") as pickle_file:
    pickle.dump(search_index, pickle_file)
print("Pickle complete")