File size: 1,947 Bytes
9d38059
 
 
b00f9c3
9d38059
b00f9c3
a56c9af
9d38059
 
a56c9af
b00f9c3
 
9d38059
b00f9c3
9d38059
 
b00f9c3
 
 
a56c9af
 
 
9d38059
 
 
b00f9c3
9d38059
 
b00f9c3
 
 
 
fa8c8ef
 
 
9d38059
 
 
fa8c8ef
 
9d38059
 
 
 
 
b00f9c3
fa8c8ef
 
 
 
9d38059
b00f9c3
 
9d38059
b00f9c3
 
 
9d38059
b00f9c3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Load html from files, clean up, split, ingest into Weaviate."""
import os
from pathlib import Path
from markdown import markdown

import pickle
import re
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from InstructorEmbedding import INSTRUCTOR

# Fail fast if the key is unset (KeyError, same as the original direct lookup),
# but never echo the secret value itself to stdout — the original printed the
# raw API key, which leaks it into logs and CI output.
# NOTE(review): "HUGGINFACE" looks like a typo for "HUGGINGFACE" — confirm the
# env var name actually used in deployment before renaming it here.
if "HUGGINFACE_APIKEY" not in os.environ:
    raise KeyError("HUGGINFACE_APIKEY")
print("HUGGINFACE_APIKEY is set")

def clean_data(data: str) -> str:
    """Convert a markdown document to cleaned plain text.

    Renders *data* to HTML, strips all tags, removes any surviving HTML
    comments, and collapses blank lines.

    Args:
        data: Raw markdown source text.

    Returns:
        The extracted plain text with all empty lines removed.
    """
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it yields every text node, including comment nodes.
    text = "".join(soup.find_all(string=True))
    # Comment nodes come back as bare "<!-- ... -->" text, so strip them;
    # DOTALL lets a comment span multiple lines.
    cleaned_text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # The original printed the entire cleaned document here — pure debug
    # noise on a corpus-sized run, so it is removed.
    return "\n".join(line for line in cleaned_text.split("\n") if line)

# Walk the docs/ tree, cleaning each markdown file into `docs` and recording
# its normalized relative path in `metadatas` (used as the "source" field).
docs = []
metadatas = []
for p in Path("docs").rglob("*"):
    if p.is_dir():
        continue
    # Only markdown sources (.md / .mdx) are ingested.
    if str(p).lower().endswith(('.md', '.mdx')):
        # Explicit encoding: markdown docs are assumed UTF-8; relying on the
        # platform default (e.g. cp1252 on Windows) would mangle or reject
        # non-ASCII content. -- TODO confirm the corpus is UTF-8.
        with open(p, encoding="utf-8") as f:
            filename = os.path.splitext(p)[0]
            docs.append(clean_data(f.read()))
            # Normalize Windows separators, then drop the leading "docs/"
            # prefix (5 characters) so sources are repo-relative.
            newfile_name = filename.replace("\\", "/")[5:]
            print("file:" + newfile_name)
            metadatas.append({"source": newfile_name})

# Split each cleaned document on newlines into ~768-character chunks with a
# 128-character overlap, measured in plain characters; the per-file metadata
# is carried through onto every resulting chunk.
splitter_config = {
    "separator": "\n",
    "chunk_size": 768,
    "chunk_overlap": 128,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_config)

documents = text_splitter.create_documents(docs, metadatas=metadatas)

print("making embedding")
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
embedding = HuggingFaceInstructEmbeddings(model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction)

print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)

print("beginning pickle")
with open("docs.pkl", 'wb') as f:
    pickle.dump(search_index, f)

print("Pickle complete")