# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/free-speech-stores.ipynb.

# %% auto 0
__all__ = ['setup_openai_api_key', 'setup_db']

# %% ../nbs/free-speech-stores.ipynb 4
# libraries required for functionality
import os
from getpass import getpass

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.merge import MergedDataLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
# DeepLake is used in setup_db() below; the original file imported only Chroma,
# which made setup_db() raise NameError at runtime.
from langchain.vectorstores import Chroma, DeepLake

# %% ../nbs/free-speech-stores.ipynb 12
def setup_openai_api_key():
    """Prompt for an OpenAI API key (input hidden) and export it as OPENAI_API_KEY."""
    openai_api_key = getpass()
    os.environ["OPENAI_API_KEY"] = openai_api_key

# %% ../nbs/free-speech-stores.ipynb 15
# POS tagger data needed by UnstructuredFileLoader's text partitioning.
import nltk
nltk.download('averaged_perceptron_tagger')

# %% ../nbs/free-speech-stores.ipynb 27
def setup_db(local_path, hub_path, chunk_size=1000, chunk_overlap=5):
    """Load every file under `local_path`, split it into chunks, embed the
    chunks with OpenAI embeddings, and write them to a DeepLake dataset.

    Parameters
    ----------
    local_path : str
        Directory containing the source documents (every entry is loaded).
    hub_path : str
        DeepLake dataset path, e.g. "hub://<org>/counterspeech-resources"
        or "hub://<org>/hatespeech-background". Existing data is overwritten.
    chunk_size : int, default 1000
        Maximum characters per chunk passed to CharacterTextSplitter.
    chunk_overlap : int, default 5
        Characters of overlap between consecutive chunks.

    Returns
    -------
    DeepLake
        The populated vector store (the original version dropped it and
        returned None).
    """
    # One UnstructuredFileLoader per file; os.path.join works whether or not
    # local_path carries a trailing separator (the old string concatenation
    # required one).
    loaders_list = [
        UnstructuredFileLoader(os.path.join(local_path, file_name))
        for file_name in os.listdir(local_path)
    ]
    loader_all = MergedDataLoader(loaders=loaders_list)

    # Split and embed docs
    documents = loader_all.load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings()
    # Replace dataset path with relevant dataset name - counterspeech-resources or hatespeech-background
    db = DeepLake.from_documents(texts, dataset_path=hub_path, embedding=embeddings, overwrite=True)
    return db