Spaces:
Sleeping
Sleeping
Commit
·
4823e70
1
Parent(s):
8832a9e
Upload 12 files
Browse files- hay/__pycache__/model.cpython-310.pyc +0 -0
- hay/__pycache__/pipeline.cpython-310.pyc +0 -0
- hay/__pycache__/retriever.cpython-310.pyc +0 -0
- hay/model.py +19 -6
- hay/pipeline.py +45 -5
- hay/retriever.py +9 -20
- outputs/docs-data/data-00000-of-00001.arrow +3 -0
- outputs/docs-data/dataset_info.json +60 -0
- outputs/docs-data/state.json +13 -0
- outputs/docs-data2/data-00000-of-00001.arrow +3 -0
- outputs/docs-data2/dataset_info.json +60 -0
- outputs/docs-data2/state.json +13 -0
hay/__pycache__/model.cpython-310.pyc
CHANGED
Binary files a/hay/__pycache__/model.cpython-310.pyc and b/hay/__pycache__/model.cpython-310.pyc differ
|
|
hay/__pycache__/pipeline.cpython-310.pyc
CHANGED
Binary files a/hay/__pycache__/pipeline.cpython-310.pyc and b/hay/__pycache__/pipeline.cpython-310.pyc differ
|
|
hay/__pycache__/retriever.cpython-310.pyc
CHANGED
Binary files a/hay/__pycache__/retriever.cpython-310.pyc and b/hay/__pycache__/retriever.cpython-310.pyc differ
|
|
hay/model.py
CHANGED
@@ -1,22 +1,35 @@
|
|
1 |
from haystack.nodes import PromptNode, PromptTemplate
|
2 |
from haystack.nodes import AnswerParser
|
3 |
from haystack.nodes import TransformersSummarizer
|
4 |
-
from haystack import Document
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
|
9 |
def prompting_model():
|
10 |
'''
|
11 |
Define a prompt node in haystack pipeline
|
12 |
'''
|
13 |
|
14 |
-
prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m", default_prompt_template="deepset/question-answering-per-document")
|
15 |
|
16 |
-
|
17 |
|
18 |
return prompt_node
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
def summarize():
|
22 |
|
|
|
1 |
from haystack.nodes import PromptNode, PromptTemplate
|
2 |
from haystack.nodes import AnswerParser
|
3 |
from haystack.nodes import TransformersSummarizer
|
|
|
|
|
|
|
|
|
4 |
|
5 |
def prompting_model():
|
6 |
'''
|
7 |
Define a prompt node in haystack pipeline
|
8 |
'''
|
9 |
|
10 |
+
# prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m", default_prompt_template="deepset/question-answering-per-document")
|
11 |
|
12 |
+
prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m")
|
13 |
|
14 |
return prompt_node
|
15 |
|
16 |
+
def prompting_model_2():
|
17 |
+
'''
|
18 |
+
Define a prompt node in haystack pipeline, with detailed prompt
|
19 |
+
'''
|
20 |
+
|
21 |
+
custom_prompt = PromptTemplate(prompt = """ You are a helpful and knowledgeable agent. To achieve your goal of answering complex questions,
|
22 |
+
you have access to the following paragraph :
|
23 |
+
{join(documents)}
|
24 |
+
|
25 |
+
Your output should be a detailed summary of the paragraph
|
26 |
+
""")
|
27 |
+
|
28 |
+
summarization_template = PromptTemplate("deepset/summarization")
|
29 |
+
|
30 |
+
prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m", default_prompt_template=custom_prompt)
|
31 |
+
|
32 |
+
return prompt_node
|
33 |
|
34 |
def summarize():
|
35 |
|
hay/pipeline.py
CHANGED
@@ -1,14 +1,18 @@
|
|
1 |
from hay.model import prompting_model, summarize
|
|
|
2 |
from haystack.pipelines import Pipeline, SearchSummarizationPipeline
|
|
|
|
|
|
|
3 |
from hay.retriever import retriever1
|
4 |
|
5 |
-
def rg_pipeline(question):
|
6 |
'''
|
7 |
Defines a pipeline of retriever and generator and generates output for the given question
|
8 |
'''
|
9 |
|
10 |
prompt_node = prompting_model()
|
11 |
-
retriever = retriever1()
|
12 |
|
13 |
pipe = Pipeline()
|
14 |
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
|
@@ -23,17 +27,17 @@ def rg_pipeline(question):
|
|
23 |
return None
|
24 |
|
25 |
|
26 |
-
def rs_pipeline(question):
|
27 |
'''
|
28 |
Defines a pipeline of retriever and summarizer and generates output for the given question
|
29 |
'''
|
30 |
|
31 |
-
retriever = retriever1()
|
32 |
summarizer = summarize()
|
33 |
|
34 |
# Get top 10 results from the retriever and summarize them
|
35 |
pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
|
36 |
-
result = pipeline.run(query=question, params={"Retriever": {"top_k":
|
37 |
|
38 |
output = ''
|
39 |
for i in range(len(result['documents'])):
|
@@ -43,6 +47,42 @@ def rs_pipeline(question):
|
|
43 |
|
44 |
return output
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
|
|
|
1 |
from hay.model import prompting_model, summarize
|
2 |
+
from hay.model import prompting_model_2
|
3 |
from haystack.pipelines import Pipeline, SearchSummarizationPipeline
|
4 |
+
from haystack.agents.memory import ConversationSummaryMemory
|
5 |
+
# from haystack.agents.conversational import
|
6 |
+
from haystack import Document
|
7 |
from hay.retriever import retriever1
|
8 |
|
9 |
+
def rg_pipeline(question, d):
|
10 |
'''
|
11 |
Defines a pipeline of retriever and generator and generates output for the given question
|
12 |
'''
|
13 |
|
14 |
prompt_node = prompting_model()
|
15 |
+
retriever = retriever1(d)
|
16 |
|
17 |
pipe = Pipeline()
|
18 |
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
|
|
|
27 |
return None
|
28 |
|
29 |
|
30 |
+
def rs_pipeline(question, d):
|
31 |
'''
|
32 |
Defines a pipeline of retriever and summarizer and generates output for the given question
|
33 |
'''
|
34 |
|
35 |
+
retriever = retriever1(d)
|
36 |
summarizer = summarize()
|
37 |
|
38 |
# Get top 10 results from the retriever and summarize them
|
39 |
pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
|
40 |
+
result = pipeline.run(query=question, params={"Retriever": {"top_k": 2}})
|
41 |
|
42 |
output = ''
|
43 |
for i in range(len(result['documents'])):
|
|
|
47 |
|
48 |
return output
|
49 |
|
50 |
+
# Try this later
|
51 |
+
def conv_agent(question="How to reduce carbon emissions?"):
|
52 |
+
# '''
|
53 |
+
# Defines a pipeline using the conversational agent class
|
54 |
+
# '''
|
55 |
+
# prompt_node = prompting_model()
|
56 |
+
# summary_memory = ConversationSummaryMemory(prompt_node=prompt_node)
|
57 |
+
# conversational_agent = ConversationalAgent(prompt_node=prompt_node, memory=summary_memory)
|
58 |
+
|
59 |
+
|
60 |
+
output = None
|
61 |
+
return output
|
62 |
+
|
63 |
+
def rsg_pipeline(question, d):
|
64 |
+
|
65 |
+
'''
|
66 |
+
Defines a pipeline using the summarization pipeline with an additional prompt node
|
67 |
+
'''
|
68 |
+
# retriever = retriever1(d)
|
69 |
+
# summarizer = summarize()
|
70 |
+
# pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
|
71 |
+
# result = pipeline.run(query=question, params={"Retriever": {"top_k": 5}})
|
72 |
+
|
73 |
+
# output = ''
|
74 |
+
# for i in range(len(result['documents'])):
|
75 |
+
# output += result['documents'][i].meta['summary']
|
76 |
+
|
77 |
+
output = "In recent years, a number of papers have examined the impact of incentives on firms to reduce carbon emissions.The European Union's (EU) Emissions Trading Scheme (ETS) aims to reduce greenhouse gas (GHG) emissions by trading carbon dioxide emissions from major emitters.The aim of this article is to provide a reference for managers to improve the attractiveness of their stores to consumers and for the gov- ernment to design carbon policy.In this paper, we discuss how the number of retail stores in a market affects the carbon emissions in the supply chain, and present our mathematical models to illustrate how retail store density can affect the carbon emissions in the supply chain, accounting for consumers’ emissions and transportation cost.KeyTakeaways:"
|
78 |
+
node = prompting_model_2()
|
79 |
+
pipe = Pipeline()
|
80 |
+
pipe.add_node(component=node, name="prompt_node", inputs = ["Query"])
|
81 |
+
|
82 |
+
f_output = pipe.run(query=question, documents=[Document(output)])
|
83 |
+
# op = [a.answer for a in f_output["answers"]]
|
84 |
+
|
85 |
+
return f_output
|
86 |
|
87 |
|
88 |
|
hay/retriever.py
CHANGED
@@ -18,13 +18,14 @@ from haystack.nodes import TfidfRetriever
|
|
18 |
import warnings
|
19 |
warnings.filterwarnings('ignore')
|
20 |
|
21 |
-
def generate_docs(overlap, length):
|
22 |
|
23 |
'''
|
24 |
Takes in split length and split overlap
|
25 |
Saves the docs in a pandas dataframe
|
26 |
'''
|
27 |
-
|
|
|
28 |
|
29 |
preprocessor = PreProcessor(
|
30 |
clean_empty_lines=True,
|
@@ -42,35 +43,23 @@ def generate_docs(overlap, length):
|
|
42 |
|
43 |
df = pd.DataFrame(docs)
|
44 |
dataset = Dataset(pa.Table.from_pandas(df))
|
45 |
-
dataset.save_to_disk('outputs/docs-dataset')
|
|
|
46 |
|
47 |
return None
|
48 |
|
49 |
|
50 |
-
def retriever1():
|
51 |
'''
|
52 |
Use BM25 Retriever to retrieve data
|
53 |
'''
|
54 |
|
55 |
-
dataset = load_from_disk('outputs/docs-dataset')
|
|
|
56 |
|
57 |
# BM25Retriever with InMemoryDocumentStore
|
58 |
document_store = InMemoryDocumentStore(use_bm25=True)
|
59 |
document_store.write_documents(dataset)
|
60 |
-
retriever = BM25Retriever(document_store=document_store, top_k=
|
61 |
|
62 |
return retriever
|
63 |
-
|
64 |
-
|
65 |
-
# def retriever2():
|
66 |
-
# document_store = FAISSDocumentStore(similarity="dot_product")
|
67 |
-
# retriever = DensePassageRetriever(
|
68 |
-
# document_store=document_store,
|
69 |
-
# query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
70 |
-
# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
|
71 |
-
# )
|
72 |
-
# document_store.update_embeddings(retriever)
|
73 |
-
|
74 |
-
# return retriever
|
75 |
-
# generate_docs(20, 250)
|
76 |
-
# ret = retriever2()
|
|
|
18 |
import warnings
|
19 |
warnings.filterwarnings('ignore')
|
20 |
|
21 |
+
def generate_docs(overlap, length, d='data'):
|
22 |
|
23 |
'''
|
24 |
Takes in split length and split overlap
|
25 |
Saves the docs in a pandas dataframe
|
26 |
'''
|
27 |
+
|
28 |
+
all_docs = convert_files_to_docs(dir_path=d)
|
29 |
|
30 |
preprocessor = PreProcessor(
|
31 |
clean_empty_lines=True,
|
|
|
43 |
|
44 |
df = pd.DataFrame(docs)
|
45 |
dataset = Dataset(pa.Table.from_pandas(df))
|
46 |
+
# dataset.save_to_disk('outputs/docs-dataset')
|
47 |
+
dataset.save_to_disk('outputs/docs-'+d)
|
48 |
|
49 |
return None
|
50 |
|
51 |
|
52 |
+
def retriever1(d):
|
53 |
'''
|
54 |
Use BM25 Retriever to retrieve data
|
55 |
'''
|
56 |
|
57 |
+
# dataset = load_from_disk('outputs/docs-dataset')
|
58 |
+
dataset = load_from_disk('outputs/docs-'+d)
|
59 |
|
60 |
# BM25Retriever with InMemoryDocumentStore
|
61 |
document_store = InMemoryDocumentStore(use_bm25=True)
|
62 |
document_store.write_documents(dataset)
|
63 |
+
retriever = BM25Retriever(document_store=document_store, top_k=10)
|
64 |
|
65 |
return retriever
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
outputs/docs-data/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d48ece13d37530a44060ad19d05e96a34d048d5de9403947fd50235c9b6a254
|
3 |
+
size 3583736
|
outputs/docs-data/dataset_info.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"citation": "",
|
3 |
+
"description": "",
|
4 |
+
"features": {
|
5 |
+
"id": {
|
6 |
+
"dtype": "string",
|
7 |
+
"_type": "Value"
|
8 |
+
},
|
9 |
+
"content": {
|
10 |
+
"dtype": "string",
|
11 |
+
"_type": "Value"
|
12 |
+
},
|
13 |
+
"content_type": {
|
14 |
+
"dtype": "string",
|
15 |
+
"_type": "Value"
|
16 |
+
},
|
17 |
+
"meta": {
|
18 |
+
"_split_id": {
|
19 |
+
"dtype": "int64",
|
20 |
+
"_type": "Value"
|
21 |
+
},
|
22 |
+
"_split_overlap": [
|
23 |
+
{
|
24 |
+
"doc_id": {
|
25 |
+
"dtype": "string",
|
26 |
+
"_type": "Value"
|
27 |
+
},
|
28 |
+
"range": {
|
29 |
+
"feature": {
|
30 |
+
"dtype": "int64",
|
31 |
+
"_type": "Value"
|
32 |
+
},
|
33 |
+
"_type": "Sequence"
|
34 |
+
}
|
35 |
+
}
|
36 |
+
],
|
37 |
+
"name": {
|
38 |
+
"dtype": "string",
|
39 |
+
"_type": "Value"
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"id_hash_keys": {
|
43 |
+
"feature": {
|
44 |
+
"dtype": "string",
|
45 |
+
"_type": "Value"
|
46 |
+
},
|
47 |
+
"_type": "Sequence"
|
48 |
+
},
|
49 |
+
"score": {
|
50 |
+
"dtype": "null",
|
51 |
+
"_type": "Value"
|
52 |
+
},
|
53 |
+
"embedding": {
|
54 |
+
"dtype": "null",
|
55 |
+
"_type": "Value"
|
56 |
+
}
|
57 |
+
},
|
58 |
+
"homepage": "",
|
59 |
+
"license": ""
|
60 |
+
}
|
outputs/docs-data/state.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00001.arrow"
|
5 |
+
}
|
6 |
+
],
|
7 |
+
"_fingerprint": "737b5afa18cfd1c6",
|
8 |
+
"_format_columns": null,
|
9 |
+
"_format_kwargs": {},
|
10 |
+
"_format_type": null,
|
11 |
+
"_output_all_columns": false,
|
12 |
+
"_split": null
|
13 |
+
}
|
outputs/docs-data2/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0dc7265bdf1990c6ee2e9cfdb4a11583e5d8e4d8149239d0f4b32412f6677beb
|
3 |
+
size 6516160
|
outputs/docs-data2/dataset_info.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"citation": "",
|
3 |
+
"description": "",
|
4 |
+
"features": {
|
5 |
+
"id": {
|
6 |
+
"dtype": "string",
|
7 |
+
"_type": "Value"
|
8 |
+
},
|
9 |
+
"content": {
|
10 |
+
"dtype": "string",
|
11 |
+
"_type": "Value"
|
12 |
+
},
|
13 |
+
"content_type": {
|
14 |
+
"dtype": "string",
|
15 |
+
"_type": "Value"
|
16 |
+
},
|
17 |
+
"meta": {
|
18 |
+
"_split_id": {
|
19 |
+
"dtype": "int64",
|
20 |
+
"_type": "Value"
|
21 |
+
},
|
22 |
+
"_split_overlap": [
|
23 |
+
{
|
24 |
+
"doc_id": {
|
25 |
+
"dtype": "string",
|
26 |
+
"_type": "Value"
|
27 |
+
},
|
28 |
+
"range": {
|
29 |
+
"feature": {
|
30 |
+
"dtype": "int64",
|
31 |
+
"_type": "Value"
|
32 |
+
},
|
33 |
+
"_type": "Sequence"
|
34 |
+
}
|
35 |
+
}
|
36 |
+
],
|
37 |
+
"name": {
|
38 |
+
"dtype": "string",
|
39 |
+
"_type": "Value"
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"id_hash_keys": {
|
43 |
+
"feature": {
|
44 |
+
"dtype": "string",
|
45 |
+
"_type": "Value"
|
46 |
+
},
|
47 |
+
"_type": "Sequence"
|
48 |
+
},
|
49 |
+
"score": {
|
50 |
+
"dtype": "null",
|
51 |
+
"_type": "Value"
|
52 |
+
},
|
53 |
+
"embedding": {
|
54 |
+
"dtype": "null",
|
55 |
+
"_type": "Value"
|
56 |
+
}
|
57 |
+
},
|
58 |
+
"homepage": "",
|
59 |
+
"license": ""
|
60 |
+
}
|
outputs/docs-data2/state.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00001.arrow"
|
5 |
+
}
|
6 |
+
],
|
7 |
+
"_fingerprint": "87b3b11c03e41dc7",
|
8 |
+
"_format_columns": null,
|
9 |
+
"_format_kwargs": {},
|
10 |
+
"_format_type": null,
|
11 |
+
"_output_all_columns": false,
|
12 |
+
"_split": null
|
13 |
+
}
|