carbonnnnn committed on
Commit
4823e70
·
1 Parent(s): 8832a9e

Upload 12 files

Browse files
hay/__pycache__/model.cpython-310.pyc CHANGED
Binary files a/hay/__pycache__/model.cpython-310.pyc and b/hay/__pycache__/model.cpython-310.pyc differ
 
hay/__pycache__/pipeline.cpython-310.pyc CHANGED
Binary files a/hay/__pycache__/pipeline.cpython-310.pyc and b/hay/__pycache__/pipeline.cpython-310.pyc differ
 
hay/__pycache__/retriever.cpython-310.pyc CHANGED
Binary files a/hay/__pycache__/retriever.cpython-310.pyc and b/hay/__pycache__/retriever.cpython-310.pyc differ
 
hay/model.py CHANGED
@@ -1,22 +1,35 @@
1
  from haystack.nodes import PromptNode, PromptTemplate
2
  from haystack.nodes import AnswerParser
3
  from haystack.nodes import TransformersSummarizer
4
- from haystack import Document
5
-
6
-
7
-
8
 
9
  def prompting_model():
10
  '''
11
  Define a prompt node in haystack pipeline
12
  '''
13
 
14
- prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m", default_prompt_template="deepset/question-answering-per-document")
15
 
16
- # prompt_node = PromptNode(model_name_or_path="facebook/opt-350m", default_prompt_template=lfqa_prompt)
17
 
18
  return prompt_node
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def summarize():
22
 
 
1
  from haystack.nodes import PromptNode, PromptTemplate
2
  from haystack.nodes import AnswerParser
3
  from haystack.nodes import TransformersSummarizer
 
 
 
 
4
 
5
  def prompting_model():
6
  '''
7
  Define a prompt node in haystack pipeline
8
  '''
9
 
10
+ # prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m", default_prompt_template="deepset/question-answering-per-document")
11
 
12
+ prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m")
13
 
14
  return prompt_node
15
 
16
+ def prompting_model_2():
17
+ '''
18
+ Define a prompt node in haystack pipeline, with detailed prompt
19
+ '''
20
+
21
+ custom_prompt = PromptTemplate(prompt = """ You are a helpful and knowledgeable agent. To achieve your goal of answering complex questions,
22
+ you have access to the following paragraph :
23
+ {join(documents)}
24
+
25
+ Your output should be a detailed summary of the paragraph
26
+ """)
27
+
28
+ summarization_template = PromptTemplate("deepset/summarization")
29
+
30
+ prompt_node = PromptNode(model_name_or_path="facebook/galactica-125m", default_prompt_template=custom_prompt)
31
+
32
+ return prompt_node
33
 
34
  def summarize():
35
 
hay/pipeline.py CHANGED
@@ -1,14 +1,18 @@
1
  from hay.model import prompting_model, summarize
 
2
  from haystack.pipelines import Pipeline, SearchSummarizationPipeline
 
 
 
3
  from hay.retriever import retriever1
4
 
5
- def rg_pipeline(question):
6
  '''
7
  Defines a pipeline of retriever and generator and generates output for the given question
8
  '''
9
 
10
  prompt_node = prompting_model()
11
- retriever = retriever1()
12
 
13
  pipe = Pipeline()
14
  pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
@@ -23,17 +27,17 @@ def rg_pipeline(question):
23
  return None
24
 
25
 
26
- def rs_pipeline(question):
27
  '''
28
  Defines a pipeline of retriever and summarizer and generates output for the given question
29
  '''
30
 
31
- retriever = retriever1()
32
  summarizer = summarize()
33
 
34
  # Get top 10 results from the retriever and summarize them
35
  pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
36
- result = pipeline.run(query=question, params={"Retriever": {"top_k": 5}})
37
 
38
  output = ''
39
  for i in range(len(result['documents'])):
@@ -43,6 +47,42 @@ def rs_pipeline(question):
43
 
44
  return output
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
 
 
1
  from hay.model import prompting_model, summarize
2
+ from hay.model import prompting_model_2
3
  from haystack.pipelines import Pipeline, SearchSummarizationPipeline
4
+ from haystack.agents.memory import ConversationSummaryMemory
5
+ # from haystack.agents.conversational import
6
+ from haystack import Document
7
  from hay.retriever import retriever1
8
 
9
+ def rg_pipeline(question, d):
10
  '''
11
  Defines a pipeline of retriever and generator and generates output for the given question
12
  '''
13
 
14
  prompt_node = prompting_model()
15
+ retriever = retriever1(d)
16
 
17
  pipe = Pipeline()
18
  pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
 
27
  return None
28
 
29
 
30
+ def rs_pipeline(question, d):
31
  '''
32
  Defines a pipeline of retriever and summarizer and generates output for the given question
33
  '''
34
 
35
+ retriever = retriever1(d)
36
  summarizer = summarize()
37
 
38
  # Get top 10 results from the retriever and summarize them
39
  pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
40
+ result = pipeline.run(query=question, params={"Retriever": {"top_k": 2}})
41
 
42
  output = ''
43
  for i in range(len(result['documents'])):
 
47
 
48
  return output
49
 
50
+ # Try this later
51
+ def conv_agent(question="How to reduce carbon emissions?"):
52
+ # '''
53
+ # Defines a pipeline using the conversational agent class
54
+ # '''
55
+ # prompt_node = prompting_model()
56
+ # summary_memory = ConversationSummaryMemory(prompt_node=prompt_node)
57
+ # conversational_agent = ConversationalAgent(prompt_node=prompt_node, memory=summary_memory)
58
+
59
+
60
+ output = None
61
+ return output
62
+
63
+ def rsg_pipeline(question, d):
64
+
65
+ '''
66
+ Defines a pipeline using the summarization pipeline with an additional prompt node
67
+ '''
68
+ # retriever = retriever1(d)
69
+ # summarizer = summarize()
70
+ # pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever)
71
+ # result = pipeline.run(query=question, params={"Retriever": {"top_k": 5}})
72
+
73
+ # output = ''
74
+ # for i in range(len(result['documents'])):
75
+ # output += result['documents'][i].meta['summary']
76
+
77
+ output = "In recent years, a number of papers have examined the impact of incentives on firms to reduce carbon emissions.The European Union's (EU) Emissions Trading Scheme (ETS) aims to reduce greenhouse gas (GHG) emissions by trading carbon dioxide emissions from major emitters.The aim of this article is to provide a reference for managers to improve the attractiveness of their stores to consumers and for the gov- ernment to design carbon policy.In this paper, we discuss how the number of retail stores in a market affects the carbon emissions in the supply chain, and present our mathematical models to illustrate how retail store density can affect the carbon emissions in the supply chain, accounting for consumers’ emissions and transportation cost.KeyTakeaways:"
78
+ node = prompting_model_2()
79
+ pipe = Pipeline()
80
+ pipe.add_node(component=node, name="prompt_node", inputs = ["Query"])
81
+
82
+ f_output = pipe.run(query=question, documents=[Document(output)])
83
+ # op = [a.answer for a in f_output["answers"]]
84
+
85
+ return f_output
86
 
87
 
88
 
hay/retriever.py CHANGED
@@ -18,13 +18,14 @@ from haystack.nodes import TfidfRetriever
18
  import warnings
19
  warnings.filterwarnings('ignore')
20
 
21
- def generate_docs(overlap, length):
22
 
23
  '''
24
  Takes in split length and split overlap
25
  Saves the docs in a pandas dataframe
26
  '''
27
- all_docs = convert_files_to_docs(dir_path='data')
 
28
 
29
  preprocessor = PreProcessor(
30
  clean_empty_lines=True,
@@ -42,35 +43,23 @@ def generate_docs(overlap, length):
42
 
43
  df = pd.DataFrame(docs)
44
  dataset = Dataset(pa.Table.from_pandas(df))
45
- dataset.save_to_disk('outputs/docs-dataset')
 
46
 
47
  return None
48
 
49
 
50
- def retriever1():
51
  '''
52
  Use BM25 Retriever to retrieve data
53
  '''
54
 
55
- dataset = load_from_disk('outputs/docs-dataset')
 
56
 
57
  # BM25Retriever with InMemoryDocumentStore
58
  document_store = InMemoryDocumentStore(use_bm25=True)
59
  document_store.write_documents(dataset)
60
- retriever = BM25Retriever(document_store=document_store, top_k=5)
61
 
62
  return retriever
63
-
64
-
65
- # def retriever2():
66
- # document_store = FAISSDocumentStore(similarity="dot_product")
67
- # retriever = DensePassageRetriever(
68
- # document_store=document_store,
69
- # query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
70
- # passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
71
- # )
72
- # document_store.update_embeddings(retriever)
73
-
74
- # return retriever
75
- # generate_docs(20, 250)
76
- # ret = retriever2()
 
18
  import warnings
19
  warnings.filterwarnings('ignore')
20
 
21
+ def generate_docs(overlap, length, d='data'):
22
 
23
  '''
24
  Takes in split length and split overlap
25
  Saves the docs in a pandas dataframe
26
  '''
27
+
28
+ all_docs = convert_files_to_docs(dir_path=d)
29
 
30
  preprocessor = PreProcessor(
31
  clean_empty_lines=True,
 
43
 
44
  df = pd.DataFrame(docs)
45
  dataset = Dataset(pa.Table.from_pandas(df))
46
+ # dataset.save_to_disk('outputs/docs-dataset')
47
+ dataset.save_to_disk('outputs/docs-'+d)
48
 
49
  return None
50
 
51
 
52
+ def retriever1(d):
53
  '''
54
  Use BM25 Retriever to retrieve data
55
  '''
56
 
57
+ # dataset = load_from_disk('outputs/docs-dataset')
58
+ dataset = load_from_disk('outputs/docs-'+d)
59
 
60
  # BM25Retriever with InMemoryDocumentStore
61
  document_store = InMemoryDocumentStore(use_bm25=True)
62
  document_store.write_documents(dataset)
63
+ retriever = BM25Retriever(document_store=document_store, top_k=10)
64
 
65
  return retriever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
outputs/docs-data/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d48ece13d37530a44060ad19d05e96a34d048d5de9403947fd50235c9b6a254
3
+ size 3583736
outputs/docs-data/dataset_info.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "id": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "content": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "content_type": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "meta": {
18
+ "_split_id": {
19
+ "dtype": "int64",
20
+ "_type": "Value"
21
+ },
22
+ "_split_overlap": [
23
+ {
24
+ "doc_id": {
25
+ "dtype": "string",
26
+ "_type": "Value"
27
+ },
28
+ "range": {
29
+ "feature": {
30
+ "dtype": "int64",
31
+ "_type": "Value"
32
+ },
33
+ "_type": "Sequence"
34
+ }
35
+ }
36
+ ],
37
+ "name": {
38
+ "dtype": "string",
39
+ "_type": "Value"
40
+ }
41
+ },
42
+ "id_hash_keys": {
43
+ "feature": {
44
+ "dtype": "string",
45
+ "_type": "Value"
46
+ },
47
+ "_type": "Sequence"
48
+ },
49
+ "score": {
50
+ "dtype": "null",
51
+ "_type": "Value"
52
+ },
53
+ "embedding": {
54
+ "dtype": "null",
55
+ "_type": "Value"
56
+ }
57
+ },
58
+ "homepage": "",
59
+ "license": ""
60
+ }
outputs/docs-data/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "737b5afa18cfd1c6",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
outputs/docs-data2/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dc7265bdf1990c6ee2e9cfdb4a11583e5d8e4d8149239d0f4b32412f6677beb
3
+ size 6516160
outputs/docs-data2/dataset_info.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "id": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "content": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "content_type": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "meta": {
18
+ "_split_id": {
19
+ "dtype": "int64",
20
+ "_type": "Value"
21
+ },
22
+ "_split_overlap": [
23
+ {
24
+ "doc_id": {
25
+ "dtype": "string",
26
+ "_type": "Value"
27
+ },
28
+ "range": {
29
+ "feature": {
30
+ "dtype": "int64",
31
+ "_type": "Value"
32
+ },
33
+ "_type": "Sequence"
34
+ }
35
+ }
36
+ ],
37
+ "name": {
38
+ "dtype": "string",
39
+ "_type": "Value"
40
+ }
41
+ },
42
+ "id_hash_keys": {
43
+ "feature": {
44
+ "dtype": "string",
45
+ "_type": "Value"
46
+ },
47
+ "_type": "Sequence"
48
+ },
49
+ "score": {
50
+ "dtype": "null",
51
+ "_type": "Value"
52
+ },
53
+ "embedding": {
54
+ "dtype": "null",
55
+ "_type": "Value"
56
+ }
57
+ },
58
+ "homepage": "",
59
+ "license": ""
60
+ }
outputs/docs-data2/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "87b3b11c03e41dc7",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }