Spaces:
Runtime error
Runtime error
"""Load html from files, clean up, split, ingest into Weaviate.""" | |
import os | |
from pathlib import Path | |
from markdown import markdown | |
import pickle | |
from bs4 import BeautifulSoup | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings | |
from langchain.vectorstores import FAISS | |
from InstructorEmbedding import INSTRUCTOR | |
# Report only whether the key is configured: echoing the value would leak the
# secret into the logs, and indexing os.environ directly aborts the script
# with a KeyError when the variable is unset.
print("HUGGINFACE_APIKEY set:", "HUGGINFACE_APIKEY" in os.environ)
def clean_data(data: str) -> str:
    """Convert Markdown source to plain text with empty lines removed.

    The Markdown is rendered to HTML, the HTML is parsed, and every text node
    is concatenated so that no markup remains.

    Args:
        data: Raw Markdown text.

    Returns:
        The extracted text with zero-length lines dropped and the remaining
        lines joined by newline characters.
    """
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it returns the text nodes of the parsed document.
    text = "".join(soup.find_all(string=True))
    # Only zero-length lines are dropped; whitespace-only lines are kept.
    return "\n".join(line for line in text.split("\n") if line)
# Collect the cleaned text of every Markdown file under docs/, together with
# a parallel list of metadata recording where each text came from.
docs = []
metadatas = []
for p in Path("docs").rglob("*"):
    if p.is_dir():
        continue
    if not str(p).lower().endswith((".md", ".mdx")):
        continue
    # Decode explicitly as UTF-8 so reading does not depend on the platform's
    # default locale encoding.
    with open(p, encoding="utf-8") as f:
        print(p)
        # The recorded source is the file path with its extension removed.
        filename = os.path.splitext(p)[0]
        docs.append(clean_data(f.read()))
        metadatas.append({"source": filename})

# Split on newlines; chunk_size and chunk_overlap are measured with len,
# i.e. in characters.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=512,
    chunk_overlap=64,
    length_function=len,
)
# docs and metadatas are index-aligned: one metadata dict per input text.
documents = text_splitter.create_documents(docs, metadatas=metadatas)

print("making embedding")
# No arguments are passed, so the embedding class uses its own defaults.
embedding = HuggingFaceEmbeddings()

print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)

print("beginning pickle")
# NOTE(review): unpickling executes arbitrary code, so docs.pkl must only be
# loaded from a trusted location.
with open("docs.pkl", "wb") as f:
    pickle.dump(search_index, f)
print("Pickle complete")