"""Load markdown files, clean them up, split, and index into FAISS.

Reads every .md/.mdx file under docs/, strips the markup, chunks the
text, embeds the chunks with a HuggingFace model, and pickles the
resulting FAISS index to docs.pkl.
"""
import os
from pathlib import Path
from markdown import markdown
import pickle
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import FAISS
# Kept even though unused directly: importing it here may be required so the
# instructor-embedding classes are registered/importable -- TODO confirm.
from InstructorEmbedding import INSTRUCTOR


def clean_data(data: str) -> str:
    """Render markdown *data* to HTML, strip all markup, drop blank lines.

    Returns the plain text with empty lines removed, one paragraph/line
    per output line.
    """
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # get_text() is the supported replacement for the deprecated
    # ''.join(soup.findAll(text=True)) idiom and yields the same string.
    text = soup.get_text()
    return "\n".join(line for line in text.split("\n") if line)


def main() -> None:
    """Collect, clean, split, embed, and pickle every markdown doc."""
    # NOTE(review): env var name looks like a typo for HUGGINGFACE_APIKEY --
    # kept byte-identical because deployments already set it. The original
    # printed the raw secret to stdout (credential leak); we now only fail
    # fast if it is missing, raising the same KeyError the original did.
    if "HUGGINFACE_APIKEY" not in os.environ:
        raise KeyError("HUGGINFACE_APIKEY")

    docs = []       # cleaned plain-text contents, one entry per file
    metadatas = []  # parallel list of {"source": path-without-extension}
    for p in Path("docs").rglob("*"):
        if p.is_dir():
            continue
        if str(p).lower().endswith((".md", ".mdx")):
            print(p)
            # Explicit encoding so decoding does not depend on the platform
            # default; markdown docs are assumed UTF-8 -- TODO confirm.
            with open(p, encoding="utf-8") as f:
                docs.append(clean_data(f.read()))
            # Source metadata is the path with its extension stripped,
            # matching the original script's behavior.
            metadatas.append({"source": os.path.splitext(p)[0]})

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=512,
        chunk_overlap=64,
        length_function=len,
    )
    documents = text_splitter.create_documents(docs, metadatas=metadatas)

    print("making embedding")
    embedding = HuggingFaceEmbeddings()
    print("beginning construction of faiss")
    search_index = FAISS.from_documents(documents, embedding)
    print("beginning pickle")
    # NOTE(review): unpickling this file later executes arbitrary code if the
    # pickle is untrusted -- only load docs.pkl from a trusted source.
    with open("docs.pkl", "wb") as f:
        pickle.dump(search_index, f)
    print("Pickle complete")


if __name__ == "__main__":
    main()