# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/free-speech-stores.ipynb.

# %% auto 0
__all__ = ['setup_openai_api_key', 'setup_db']

# %% ../nbs/free-speech-stores.ipynb 4
# libraries required for functionality
import os
from getpass import getpass

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.merge import MergedDataLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma, DeepLake

# %% ../nbs/free-speech-stores.ipynb 12
def setup_openai_api_key():
    "Prompt for an OpenAI API key and store it in the `OPENAI_API_KEY` environment variable."
    openai_api_key = getpass("OpenAI API key: ")
    os.environ["OPENAI_API_KEY"] = openai_api_key

# %% ../nbs/free-speech-stores.ipynb 15
# The unstructured file loaders rely on NLTK's part-of-speech tagger when partitioning documents
import nltk
nltk.download('averaged_perceptron_tagger')

# %% ../nbs/free-speech-stores.ipynb 27
def setup_db(local_path, hub_path, chunk_size=1000, chunk_overlap=5):
    "Load every file in `local_path`, split and embed the text, and write a Deep Lake dataset to `hub_path`."
    file_list = os.listdir(local_path)

    # Set up one UnstructuredFileLoader per file, then merge them into a single loader
    loaders_list = []
    for file_name in file_list:
        file_path = os.path.join(local_path, file_name)
        loaders_list.append(UnstructuredFileLoader(file_path))

    loader_all = MergedDataLoader(loaders=loaders_list)

    # Split and embed docs
    documents = loader_all.load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings()

    # hub_path points at the target dataset, e.g. counterspeech-resources or hatespeech-background
    db = DeepLake.from_documents(texts, dataset_path=hub_path, embedding=embeddings, overwrite=True)

    return db
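# Illustrative usage sketch (not exported from the notebook). The local folder and the
# Deep Lake hub path below are placeholder assumptions, not values from the original
# notebook; the RetrievalQA step simply mirrors the imports at the top of this module.
if __name__ == "__main__":
    setup_openai_api_key()

    # Build (or overwrite) a Deep Lake dataset from a local folder of documents
    db = setup_db(
        local_path="data/counterspeech-resources/",           # hypothetical local folder
        hub_path="hub://<your-org>/counterspeech-resources",  # hypothetical Deep Lake path
    )

    # Query the resulting store with a simple retrieval QA chain
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(temperature=0),
        chain_type="stuff",
        retriever=db.as_retriever(),
    )
    print(qa.run("What resources are available for countering hate speech?"))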