Spaces:
Running
Running
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/free-speech-stores.ipynb. | |
# %% auto 0 | |
__all__ = ['setup_openai_api_key', 'setup_db'] | |
# %% ../nbs/free-speech-stores.ipynb 4 | |
# libraries required for functionality | |
import os | |
from getpass import getpass | |
from langchain.chains import RetrievalQA | |
from langchain.llms import OpenAI | |
from langchain.prompts import PromptTemplate | |
from langchain.document_loaders import UnstructuredFileLoader | |
from langchain.document_loaders.merge import MergedDataLoader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
# %% ../nbs/free-speech-stores.ipynb 12 | |
def setup_openai_api_key():
    """Prompt for the OpenAI API key and export it to the environment.

    The key is read interactively with ``getpass()`` (using its default
    prompt) and stored in ``os.environ["OPENAI_API_KEY"]`` for the current
    process. Nothing is returned.
    """
    os.environ["OPENAI_API_KEY"] = getpass()
# %% ../nbs/free-speech-stores.ipynb 15 | |
# NOTE: module-level side effect - this runs whenever the module is imported
# and asks NLTK to download the 'averaged_perceptron_tagger' resource.
# Presumably required by the unstructured file loaders used in setup_db -
# TODO confirm.
import nltk

nltk.download("averaged_perceptron_tagger")
# %% ../nbs/free-speech-stores.ipynb 27 | |
def setup_db(local_path: str, hub_path: str, chunk_size: int = 1000, chunk_overlap: int = 5):
    """Build a DeepLake vector store from the files in a local directory.

    Every regular file directly inside ``local_path`` is wrapped in an
    ``UnstructuredFileLoader``; the loaders are merged, the resulting
    documents are split with ``CharacterTextSplitter`` and embedded with
    ``OpenAIEmbeddings`` into a DeepLake dataset at ``hub_path``.

    Args:
        local_path: Directory containing the source files. A trailing path
            separator is optional.
        hub_path: DeepLake dataset path to write to, e.g. one ending in
            ``counterspeech-resources`` or ``hatespeech-background``.
            ``overwrite=True`` is passed, so an existing dataset at this
            path is presumably replaced - confirm against the DeepLake docs.
        chunk_size: ``chunk_size`` forwarded to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``CharacterTextSplitter``.

    Returns:
        The vector store object produced by ``DeepLake.from_documents``.

    Raises:
        OSError: If ``local_path`` cannot be listed (raised by ``os.listdir``).
    """
    # Imported here because the module-level imports bring in Chroma but not
    # DeepLake, which is the store this function actually uses.
    from langchain.vectorstores import DeepLake

    # os.path.join works whether or not local_path ends with a separator;
    # sorting makes the document order independent of os.listdir ordering.
    candidate_paths = [
        os.path.join(local_path, entry) for entry in sorted(os.listdir(local_path))
    ]
    # Skip sub-directories: each loader is given a single file path.
    loaders = [
        UnstructuredFileLoader(path)
        for path in candidate_paths
        if os.path.isfile(path)
    ]
    loader_all = MergedDataLoader(loaders=loaders)

    # Split and embed docs
    documents = loader_all.load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    # OpenAIEmbeddings() is built without arguments, so credentials presumably
    # come from the environment (see setup_openai_api_key) - confirm.
    embeddings = OpenAIEmbeddings()

    db = DeepLake.from_documents(
        texts,
        dataset_path=hub_path,
        embedding=embeddings,
        overwrite=True,
    )
    # Hand the store back so callers can query it; previously it was discarded.
    return db