Spaces:
Runtime error
Runtime error
File size: 1,947 Bytes
9d38059 b00f9c3 9d38059 b00f9c3 a56c9af 9d38059 a56c9af b00f9c3 9d38059 b00f9c3 9d38059 b00f9c3 a56c9af 9d38059 b00f9c3 9d38059 b00f9c3 fa8c8ef 9d38059 fa8c8ef 9d38059 b00f9c3 fa8c8ef 9d38059 b00f9c3 9d38059 b00f9c3 9d38059 b00f9c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
"""Load Markdown docs from files, clean them up, split them, embed them, and store them in a FAISS index."""
import os
from pathlib import Path
from markdown import markdown
import pickle
import re
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from InstructorEmbedding import INSTRUCTOR
# Confirm the API key is configured without echoing the secret itself: anything
# printed here may end up in captured logs. Indexing os.environ (rather than
# using .get) keeps the fail-fast KeyError when the variable is missing.
# NOTE(review): the variable name is spelled "HUGGINFACE" (no second "G");
# it is kept as-is -- confirm it matches the configured secret name.
if os.environ["HUGGINFACE_APIKEY"]:
    print("HUGGINFACE_APIKEY is set")
def clean_data(data):
    """Convert Markdown source into plain text with empty lines removed.

    The Markdown is rendered to HTML, the HTML is parsed with BeautifulSoup,
    and the text nodes are concatenated. HTML comments are dropped.

    Args:
        data: Markdown source text.

    Returns:
        The extracted text, with empty lines removed and the remaining
        lines joined by newline characters.
    """
    # Local import so this fix does not depend on editing the file's imports.
    from bs4 import Comment

    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # The parser turns "<!-- ... -->" into Comment nodes whose string is only
    # the inner text, so the regex below can never match them. Skip them here,
    # otherwise their content leaks into the extracted text.
    text = "".join(
        node
        for node in soup.find_all(string=True)
        if not isinstance(node, Comment)
    )
    # Still remove comment markup that survived as literal text (presumably
    # escaped "<!-- ... -->" sequences, e.g. inside code samples).
    cleaned_text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Kept from the original: echoes the cleaned document to stdout.
    print(cleaned_text)
    return "\n".join(line for line in cleaned_text.split("\n") if line)
# Walk docs/ recursively and collect the cleaned text of every .md/.mdx file,
# together with a matching {"source": ...} metadata entry at the same index.
docs = []
metadatas = []
docs_root = Path("docs")
for p in docs_root.rglob("*"):
    if p.is_dir() or not str(p).lower().endswith((".md", ".mdx")):
        continue
    # Decode explicitly as UTF-8: the platform default encoding varies
    # (notably on Windows, whose backslash paths this script already handles).
    with open(p, encoding="utf-8") as f:
        docs.append(clean_data(f.read()))
    # Source id: path relative to docs/, extension removed, "/" separators.
    newfile_name = p.relative_to(docs_root).with_suffix("").as_posix()
    print("file:" + newfile_name)
    metadatas.append({"source": newfile_name})
# Splitter configuration: split on newlines; chunk_size and chunk_overlap are
# measured with len(), i.e. in characters.
splitter_kwargs = dict(
    separator="\n",
    chunk_size=768,
    chunk_overlap=128,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_kwargs)

# Build the chunked documents, passing the collected metadata alongside
# the texts.
documents = text_splitter.create_documents(docs, metadatas=metadatas)
print("making embedding")

# Instruction prompts handed to the embedding wrapper: one for the text being
# indexed, one for queries.
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
model_name = "hkunlp/instructor-large"

embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)

print("beginning construction of faiss")
# Build the FAISS vector store from the chunked documents.
search_index = FAISS.from_documents(documents, embedding)
# Serialize the FAISS search index to docs.pkl in the working directory.
# NOTE: unpickling executes arbitrary code, so docs.pkl must only ever be
# loaded from a trusted location.
print("beginning pickle")
with open("docs.pkl", "wb") as f:
    pickle.dump(search_index, f)
print("Pickle complete")