Spaces:
Running
Running
robertselvam
committed on
Create dspy_qa.py
Browse files- dspy_qa.py +78 -0
dspy_qa.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import dspy
|
4 |
+
from dsp.utils import deduplicate
|
5 |
+
from dspy.retrieve.faiss_rm import FaissRM
|
6 |
+
from langchain_community.document_loaders import PyPDFLoader
|
7 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
8 |
+
|
9 |
+
# os.environ["AZURE_OPENAI_API_KEY"] = ""
|
10 |
+
|
11 |
+
# Signature for the query-writing step: given a question and whatever context
# is available, the model produces a search query.
# The docstring is part of the signature itself (DSPy reads it as the task
# instructions), so its wording is intentionally left untouched.
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    # --- inputs -----------------------------------------------------------
    context = dspy.InputField(desc="may contain relevant content")
    question = dspy.InputField()

    # --- output -----------------------------------------------------------
    query = dspy.OutputField()
|
18 |
+
|
19 |
+
# Signature for the answering step: given retrieved context and the user's
# question, the model produces an answer.
# The docstring is part of the signature itself (DSPy reads it as the task
# instructions), so its wording is intentionally left untouched.
class GenerateAnswer(dspy.Signature):
    """give me a answer for user question based on context"""

    # --- inputs -----------------------------------------------------------
    context = dspy.InputField(desc="may contain relevant content")
    question = dspy.InputField()

    # --- output -----------------------------------------------------------
    answer = dspy.OutputField()
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
class DocQA(dspy.Module):
    """Answer questions about a single PDF with DSPy.

    Constructing an instance loads the PDF at ``file_path``, splits it into
    text chunks and wraps those chunks in a ``FaissRM`` retriever.  ``run``
    then retrieves passages for a question and asks the language model to
    answer from them.

    NOTE(review): ``generate_query``, ``max_hops`` and ``cache`` are assigned
    in ``__init__`` but nothing in this class reads them, so ``run`` performs
    exactly one retrieval per question -- confirm whether multi-hop search
    was intended.
    """

    def __init__(self, file_path, passages_per_hop=3, max_hops=2):
        """Set up the language model, DSPy sub-modules and the retriever.

        Args:
            file_path: Path of the PDF to index; forwarded to
                ``create_knowledge_base``.
            passages_per_hop: Passed as ``k`` to ``dspy.Retrieve``.
            max_hops: Number of ``GenerateSearchQuery`` predictors to create;
                also stored on the instance.
        """
        super().__init__()
        self.cache = "cache.json"

        # No api_key is passed explicitly here.
        # NOTE(review): presumably the key comes from the environment
        # (AZURE_OPENAI_API_KEY) -- confirm before deploying.
        self.llm = dspy.AzureOpenAI(
            api_base="https://azureadople.openai.azure.com/",
            api_version="2023-09-15-preview",
            model="GPT-3",
        )

        # One query-writing predictor per hop.
        self.generate_query = [
            dspy.ChainOfThought(GenerateSearchQuery) for _hop in range(max_hops)
        ]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

        # Built last: this reads the PDF and creates the retriever.
        self.knowledge_base = self.create_knowledge_base(file_path)

    def load_documents(self, file_path):
        """Load the PDF at ``file_path`` via ``PyPDFLoader`` and return the result."""
        print("file_path", file_path)
        return PyPDFLoader(file_path).load()

    def split_documents(self, documents):
        """Split loaded documents into chunks and return their text.

        Chunks are capped at 6000 units as measured by ``len`` (i.e.
        characters), with no overlap between consecutive chunks.

        Args:
            documents: Documents as returned by ``load_documents``.

        Returns:
            A list holding the ``page_content`` of every chunk.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=6000,
            chunk_overlap=0,
            length_function=len,
            is_separator_regex=False,
        )

        chunk_texts = [
            piece.page_content for piece in splitter.split_documents(documents)
        ]
        print("input context Ready")
        return chunk_texts

    def create_knowledge_base(self, file_path):
        """Load and split the PDF, then wrap the chunk texts in a ``FaissRM``."""
        print("file_path", file_path)
        pages = self.load_documents(file_path)
        return FaissRM(self.split_documents(pages))

    def run(self, question):
        """Answer ``question`` from passages retrieved out of the PDF.

        Points ``dspy.settings`` at this instance's language model and
        retriever, retrieves passages for the raw question, passes them
        through ``deduplicate`` and feeds them to the answer predictor.

        Args:
            question: The user's question.

        Returns:
            The ``answer`` field of the generated prediction.
        """
        dspy.settings.configure(lm=self.llm, rm=self.knowledge_base)

        retrieved = self.retrieve(question)
        context = deduplicate(retrieved.passages)

        prediction = self.generate_answer(context=context, question=question)
        return prediction.answer
|