Umang Chaudhry
Merge pull request #98 from vanderbilt-data-science/hf-struggle
8e04782
raw
history blame
1.74 kB
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/free-speech-stores.ipynb.
# %% auto 0
__all__ = ['setup_openai_api_key', 'setup_db']
# %% ../nbs/free-speech-stores.ipynb 4
# libraries required for functionality
import os
from getpass import getpass

from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.merge import MergedDataLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, DeepLake
# %% ../nbs/free-speech-stores.ipynb 12
def setup_openai_api_key():
    """Prompt the user for their OpenAI API key and export it to the environment.

    Reads the key without echoing it to the terminal and stores it as
    ``OPENAI_API_KEY`` so downstream OpenAI/langchain calls can authenticate.
    """
    # Explicit prompt: a bare getpass() displays "Password:", which is
    # misleading when what we actually want is an API key.
    openai_api_key = getpass("Enter your OpenAI API key: ")
    os.environ["OPENAI_API_KEY"] = openai_api_key
# %% ../nbs/free-speech-stores.ipynb 15
import nltk
# Download the POS tagger model at import time — presumably required by the
# unstructured/UnstructuredFileLoader document-partitioning pipeline used in
# setup_db below (TODO confirm). nltk.download is a no-op if already cached.
nltk.download('averaged_perceptron_tagger')
# %% ../nbs/free-speech-stores.ipynb 27
def setup_db(local_path, hub_path, chunk_size=1000, chunk_overlap=5):
    """Build a DeepLake vector store from every file in a local directory.

    Parameters
    ----------
    local_path : str
        Directory containing the source documents to embed.
    hub_path : str
        DeepLake dataset path to write to — e.g. the counterspeech-resources
        or hatespeech-background dataset.
    chunk_size : int, default 1000
        Maximum characters per text chunk.
    chunk_overlap : int, default 5
        Characters of overlap between consecutive chunks.

    Returns
    -------
    DeepLake
        The populated vector store (also persisted at ``hub_path``).
    """
    # One loader per file. os.path.join fixes the original string
    # concatenation, which produced broken paths whenever local_path
    # lacked a trailing separator.
    loaders_list = [
        UnstructuredFileLoader(os.path.join(local_path, file_name))
        for file_name in os.listdir(local_path)
    ]
    loader_all = MergedDataLoader(loaders=loaders_list)

    # Split the merged documents and embed each chunk with OpenAI embeddings.
    documents = loader_all.load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings()

    # overwrite=True rebuilds the dataset on every call instead of appending.
    db = DeepLake.from_documents(texts, dataset_path=hub_path, embedding=embeddings, overwrite=True)
    # Bug fix: the original `return` discarded the store it just built.
    return db