File size: 4,439 Bytes
c43278c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from dotenv import load_dotenv
import os
import subprocess

# # Load environment variables from .env file
# load_dotenv()
# # Access environment variables
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")

# Fetch the Shakespeare HTML corpus into ./shakespeare.
# `git clone` refuses to write into an existing, non-empty directory, and with
# check=True that failure surfaces as CalledProcessError -- so only clone when
# the checkout is not already present (keeps the script re-runnable).
if not os.path.isdir("./shakespeare"):
    subprocess.run(
        ["git", "clone", "https://github.com/TheMITTech/shakespeare"],
        check=True,
    )

from glob import glob
import shutil
import os

# Collect the HTML pages from the cloned repository. glob() is called without
# recursive=True, so "**" behaves like a single "*": only files exactly one
# directory below ./shakespeare are matched.
files = glob("./shakespeare/**/*.html")

# Flatten the matched pages into ./data/, which is the directory the document
# loader reads from. exist_ok=True keeps a second run from failing with
# FileExistsError when the folder was created by an earlier run.
destination_folder = './data/'
os.makedirs(destination_folder, exist_ok=True)
for html_file in files:
    # os.path.basename handles the platform's path separator, unlike a
    # hard-coded split on "/".
    # NOTE(review): pages that share a basename in different source folders
    # end up at the same destination path -- confirm that is acceptable.
    shutil.move(html_file, os.path.join(destination_folder, os.path.basename(html_file)))


from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
from bs4 import BeautifulSoup
from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter

# Load every file found under ./data/ through the BSHTMLLoader class.
bshtml_dir_loader = DirectoryLoader(
    './data/',
    loader_cls=BSHTMLLoader,
)
data = bshtml_dir_loader.load()

# Tokenizer of the "bigscience/bloomz-1b7" checkpoint; it is handed to the
# text splitter below.
bloomz_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-1b7")

# Split the loaded documents on newlines, with chunk_size=100 and no overlap
# between consecutive chunks.
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    bloomz_tokenizer,
    chunk_size=100,
    chunk_overlap=0,
    separator="\n",
)
documents = text_splitter.split_documents(data)

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Embedding function with the library's default settings.
embeddings = HuggingFaceEmbeddings()

# Build a Chroma store from the chunked documents, backed by ./vector_db.
persist_directory = "vector_db"
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=persist_directory,
)

# Write the store out, drop the in-memory handle, then reopen the store from
# the same directory with the same embedding function.
vectordb.persist()
vectordb = None
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Text-generation pipeline around the "bigscience/bloomz-1b7" checkpoint,
# configured with temperature 0 and max_length 500 via model_kwargs.
llm = HuggingFacePipeline.from_model_id(
    model_id="bigscience/bloomz-1b7",
    task="text-generation",
    model_kwargs={
        "temperature": 0,
        "max_length": 500,
    },
)

# Expose the vector store as a retriever for the QA chain.
doc_retriever = vectordb.as_retriever()

# Retrieval-QA chain of type "stuff" combining the LLM with the retriever.
shakespeare_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_retriever,
)

# from langchain.utilities import SerpAPIWrapper
# search = SerpAPIWrapper()

# from langchain.agents import initialize_agent, Tool
# from langchain.agents import AgentType
# from langchain.tools import BaseTool
# from langchain.llms import OpenAI
# from langchain import LLMMathChain, SerpAPIWrapper

# tools = [
#     Tool(
#         name = "Shakespeare QA System",
#         func=shakespeare_qa.run,
#         description="useful for when you need to answer questions about Shakespeare's works. Input should be a fully formed question."
#     ),
#     Tool(
#         name = "SERP API Search",
#         func=search.run,
#         description="useful for when you need to answer questions about ruff (a python linter). Input should be a fully formed question."
#     ),
# ]

# from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
# memory = ConversationBufferMemory(memory_key="chat_history")
# readonlymemory = ReadOnlySharedMemory(memory=memory)

# from langchain.agents import ZeroShotAgent, Tool, AgentExecutor

# prefix = """Have a conversation with a human, answering the following questions as best you can. You have access to the following tools:"""
# suffix = """Begin!"

# {chat_history}
# Question: {input}
# {agent_scratchpad}"""

# prompt = ZeroShotAgent.create_prompt(
#     tools, 
#     prefix=prefix, 
#     suffix=suffix, 
#     input_variables=["input", "chat_history", "agent_scratchpad"]
# )


# from langchain import OpenAI, LLMChain, PromptTemplate
# llm_chain = LLMChain(llm=llm, prompt=prompt)
# agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
# agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True, memory=memory)

def make_inference(query: str):
    """Answer a question with the Shakespeare retrieval-QA chain.

    Args:
        query: The question text to send to the chain.

    Returns:
        The value returned by ``shakespeare_qa.run`` for the question.
    """
    # Pass the question positionally. ``run`` forwards keyword arguments to
    # the chain as its input mapping, and RetrievalQA expects the key "query",
    # so the previous ``input=query`` was rejected as a missing input key.
    return shakespeare_qa.run(query)

if __name__ == "__main__":
    # Build and launch the Gradio interface. Gradio is imported inside the
    # guard, so it is only needed when the file is run as a script.
    import gradio as gr

    # gr.Textbox is used for both input and output: the gr.inputs.* and
    # gr.outputs.* namespaces were deprecated in Gradio 3 and removed in
    # Gradio 4.
    demo = gr.Interface(
        fn=make_inference,
        inputs=[
            gr.Textbox(lines=2, label="Query"),
        ],
        outputs=gr.Textbox(label="Response"),
        title="🗣️QuestionMyDoc-Bloomz1b7📄",
        description="🗣️QuestionMyDoc-Bloomz1b7📄 is a tool that allows you to ask questions about a document. In this case - Shakespears.",
    )
    demo.launch()