Jorge Henao committed on
Commit
312fcc5
·
1 Parent(s): 3a0a9ab

openapi generative pipeline integration

Browse files
Files changed (6) hide show
  1. .vscode/launch.json +26 -9
  2. about.py +1 -1
  3. hallazgos.py +50 -8
  4. main_page.py +7 -2
  5. pinecode_quieries.py +28 -14
  6. reformas.py +61 -10
.vscode/launch.json CHANGED
@@ -1,16 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
  // Use IntelliSense to learn about possible attributes.
3
  // Hover to view descriptions of existing attributes.
4
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
  "version": "0.2.0",
6
  "configurations": [
7
- {
8
- "name": "Python: Current File",
9
- "type": "python",
10
- "request": "launch",
11
- "program": "${file}",
12
- "console": "integratedTerminal",
13
- "justMyCode": true
14
- }
 
15
  ]
16
  }
 
1
+ // {
2
+ // // Use IntelliSense to learn about possible attributes.
3
+ // // Hover to view descriptions of existing attributes.
4
+ // // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ // "version": "0.2.0",
6
+ // "configurations": [
7
+ // {
8
+ // "name": "Python: Current File",
9
+ // "type": "python",
10
+ // "request": "launch",
11
+ // "program": "${file}",
12
+ // "console": "integratedTerminal",
13
+ // "justMyCode": true
14
+ // }
15
+ // ]
16
+ // }
17
  {
18
  // Use IntelliSense to learn about possible attributes.
19
  // Hover to view descriptions of existing attributes.
20
+ // For more information, visit: Debugging in Visual Studio Code
21
  "version": "0.2.0",
22
  "configurations": [
23
+ {
24
+ "name": "debug streamlit",
25
+ "type": "python",
26
+ "request": "launch",
27
+ "program": "/Users/jorge.henao/oosource/ask2democracycol/venv/bin/streamlit",
28
+ "args": [
29
+ "run",
30
+ "main_page.py"]
31
+ }
32
  ]
33
  }
about.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
 
3
- def about_ask2democracy():
4
  st.markdown("""
5
  <h1 style='
6
  text-align: center;
 
1
  import streamlit as st
2
 
3
+ def about_ask2democracy(api_key):
4
  st.markdown("""
5
  <h1 style='
6
  text-align: center;
hallazgos.py CHANGED
@@ -10,14 +10,15 @@ import logging
10
  logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
11
  logging.getLogger("haystack").setLevel(logging.INFO)
12
 
13
- extractive_query = PinecodeProposalQueries (index_name= Config.index_name,
 
 
 
14
  api_key = Config.es_password,
15
  environment = Config.pinecone_environment,
16
  embedding_dim = Config.embedding_dim,
17
  reader_name_or_path = Config.reader_model_name_or_path,
18
- use_gpu = Config.use_gpu)
19
-
20
- def hallazgos_comision_verdad_2022():
21
  title = """
22
  <h1 style='
23
  text-align: center;
@@ -55,8 +56,7 @@ def hallazgos_comision_verdad_2022():
55
 
56
  def search(question, retriever_top_k, reader_top_k):
57
  filters = {"source_title": "Hallazgos y recomendaciones - 28 de Junio 2022"}
58
-
59
- query_result = extractive_query.search_by_query(query = question,
60
  retriever_top_k = retriever_top_k,
61
  reader_top_k = reader_top_k,
62
  filters = filters)
@@ -80,7 +80,7 @@ def hallazgos_comision_verdad_2022():
80
  elapsed_time = round(ent - stt, 2)
81
 
82
  # show which query was entered, and what was searching time
83
- st.write(f"**Resultados relacionados con:** \"{query}\" ({elapsed_time} sec.)")
84
  # then we use loop to show results
85
  for i, answer in enumerate(results):
86
  # answer starts with header
@@ -92,9 +92,49 @@ def hallazgos_comision_verdad_2022():
92
 
93
  st.markdown("---")
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  #results = search("que es el adres", retriever_top_k=5, reader_top_k=3)
96
 
97
- st.markdown(f"""<br><p>Cuanto más contexto le des a la pregunta mejor funciona el sistema.
 
98
  No es un sistema basado en palabras claves, puedes escribir preguntas elaboradas.
99
  Una serie de modelos de lenguaje transformers intervienen en cada consulta para ayudar a entenderlas.""",
100
  unsafe_allow_html=True)
@@ -106,4 +146,6 @@ def hallazgos_comision_verdad_2022():
106
  st.error("¡escribe una pregunta!")
107
  else:
108
  st.session_state["submit"] = True
 
 
109
  search_and_show_results()
 
10
  logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
11
  logging.getLogger("haystack").setLevel(logging.INFO)
12
 
13
+
14
+ def hallazgos_comision_verdad_2022(api_key):
15
+
16
+ queries = PinecodeProposalQueries (index_name= Config.index_name,
17
  api_key = Config.es_password,
18
  environment = Config.pinecone_environment,
19
  embedding_dim = Config.embedding_dim,
20
  reader_name_or_path = Config.reader_model_name_or_path,
21
+ use_gpu = Config.use_gpu, OPENAI_key= api_key)
 
 
22
  title = """
23
  <h1 style='
24
  text-align: center;
 
56
 
57
  def search(question, retriever_top_k, reader_top_k):
58
  filters = {"source_title": "Hallazgos y recomendaciones - 28 de Junio 2022"}
59
+ query_result = queries.search_by_query(query = question,
 
60
  retriever_top_k = retriever_top_k,
61
  reader_top_k = reader_top_k,
62
  filters = filters)
 
80
  elapsed_time = round(ent - stt, 2)
81
 
82
  # show which query was entered, and what was searching time
83
+ st.write(f"**Resultados encontrados de las fuentes** \"{query}\" ({elapsed_time} sec.):")
84
  # then we use loop to show results
85
  for i, answer in enumerate(results):
86
  # answer starts with header
 
92
 
93
  st.markdown("---")
94
 
95
+ def search_and_generate_answer(question, retriever_top_k, generator_top_k):
96
+ filters = {"source_title": "Hallazgos y recomendaciones - 28 de Junio 2022"}
97
+ query_result = queries.genenerate_answer_OpenAI(query = question,
98
+ retriever_top_k = retriever_top_k,
99
+ generator_top_k = generator_top_k,
100
+ filters = filters, OPENAI_key = st.session_state.get("OPENAI_API_KEY"))
101
+
102
+ result = []
103
+ for i in range(0, len(query_result)):
104
+ item = query_result[i]
105
+ source_title = item.meta['doc_metas'][0]['source_title']
106
+ source_url = item.meta['doc_metas'][0]['source_url']
107
+ chapter_titles = [source['title'] for source in item.meta['doc_metas']]
108
+ result.append([[i+1], item.answer.replace("\n",""),
109
+ source_title, source_url, str(chapter_titles)])
110
+
111
+ def search_and_show_generative_results():
112
+ # set start time
113
+ stt = time.time()
114
+
115
+ # retrieve top 5 documents
116
+ results = search_and_generate_answer(query, retriever_top_k=5, generator_top_k=1)
117
+ # set endtime
118
+ ent = time.time()
119
+ # measure resulting time
120
+ elapsed_time = round(ent - stt, 2)
121
+
122
+ # show which query was entered, and what was searching time
123
+ st.write(f"**Respuesta generada a partir de los resultados** \"{query}\" ({elapsed_time} sec.):")
124
+ # then we use loop to show results
125
+ if results != None:
126
+ for i, answer in enumerate(results):
127
+ # answer starts with header
128
+ st.subheader(f"{answer[1]}")
129
+ st.markdown(f"[**Lee más aquí**]({answer[3]})")
130
+ st.caption(f"Fuentes: {answer[2]} - {answer[4]}")
131
+
132
+ st.markdown("---")
133
+
134
  #results = search("que es el adres", retriever_top_k=5, reader_top_k=3)
135
 
136
+
137
+ st.markdown(f"""<br><p>Cuanto más contexto le des a la pregunta mejores resultados se obtienen.
138
  No es un sistema basado en palabras claves, puedes escribir preguntas elaboradas.
139
  Una serie de modelos de lenguaje transformers intervienen en cada consulta para ayudar a entenderlas.""",
140
  unsafe_allow_html=True)
 
146
  st.error("¡escribe una pregunta!")
147
  else:
148
  st.session_state["submit"] = True
149
+ if api_key:
150
+ search_and_show_generative_results()
151
  search_and_show_results()
main_page.py CHANGED
@@ -6,7 +6,11 @@ from pinecode_quieries import PinecodeProposalQueries
6
  from config import Config
7
 
8
  # Define the sidebar
9
- api_key = st.sidebar.text_input("OpenAI API Key", type="password")
 
 
 
 
10
 
11
  # Define the navigation between pages
12
  page_options = {
@@ -19,4 +23,5 @@ page_options = {
19
  selected_page = st.sidebar.radio("Selecciona la página que deseas explorar:", list(page_options.keys()))
20
 
21
  # Render the selected page
22
- page_options[selected_page]()
 
 
6
  from config import Config
7
 
8
  # Define the sidebar
9
+ api_key = st.sidebar.text_input("OpenAI API Key", type="password",
10
+ value=st.session_state.get("OPENAI_API_KEY", ""))
11
+ if api_key:
12
+ st.session_state["OPENAI_API_KEY"] = api_key
13
+
14
 
15
  # Define the navigation between pages
16
  page_options = {
 
23
  selected_page = st.sidebar.radio("Selecciona la página que deseas explorar:", list(page_options.keys()))
24
 
25
  # Render the selected page
26
+ #print("key: " + api_key)
27
+ page_options[selected_page](api_key)
pinecode_quieries.py CHANGED
@@ -76,24 +76,24 @@ class DocumentQueries(ABC):
76
  class PinecodeProposalQueries(DocumentQueries):
77
 
78
  def __init__(self, index_name: str, api_key, reader_name_or_path: str, use_gpu = True,
79
- embedding_dim = 384, environment = "us-east1-gcp") -> None:
80
 
81
  reader = FARMReader(model_name_or_path = reader_name_or_path,
82
  use_gpu = use_gpu, num_processes = 1,
83
  context_window_size = 200)
84
 
85
  self._initialize_pipeline(index_name, api_key, reader = reader, embedding_dim=
86
- embedding_dim, environment = environment)
87
  #self.log = Log(es_host= es_host, es_index="log", es_user = es_user, es_password= es_password)
88
 
89
  def _initialize_pipeline(self, index_name, api_key, similarity = "cosine",
90
  embedding_dim = 384, reader = None,
91
  environment = "us-east1-gcp",
92
- metadata_config = {"indexed": ["title", "source_title"]}):
 
93
  if reader is not None:
94
  self.reader = reader
95
- self.OPENAI_generator = OpenAIAnswerGenerator(api_key="",
96
- model="text-davinci-003", temperature=.5, max_tokens=60)
97
  #pinecone.init(api_key=es_password, environment="us-east1-gcp")
98
 
99
  self.document_store = PineconeDocumentStore(
@@ -113,9 +113,14 @@ class PinecodeProposalQueries(DocumentQueries):
113
 
114
  self.extractive_pipe = ExtractiveQAPipeline (reader = self.reader,
115
  retriever = self.retriever)
116
- self.generative_OPENAI_pipe = GenerativeQAPipeline(generator = self.OPENAI_generator,
117
- retriever = self.retriever)
118
 
 
 
 
 
 
 
 
119
  def search_by_query(self, query : str, retriever_top_k: int, reader_top_k: int, index_name: str = None, filters = None):
120
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
121
  params = {"Retriever": {"top_k": retriever_top_k,
@@ -124,19 +129,28 @@ class PinecodeProposalQueries(DocumentQueries):
124
  prediction = self.extractive_pipe.run( query = query, params = params, debug = True)
125
  return prediction["answers"]
126
 
127
- def genenerate_answer_OpenAI(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None, filters = None) :
128
- params = {"Retriever": {"top_k": retriever_top_k,
129
- "filters": filters},
130
- "Generator": {"top_k": reader_top_k}}
131
- prediction = self.generative_OPENAI_pipe.run( query = query, params = params)
132
- return prediction
 
 
 
 
 
 
 
 
 
133
 
134
  def genenerate_answer_HF(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None, filters = None) :
135
  params = {"Retriever": {"top_k": retriever_top_k,
136
  "filters": filters},
137
  "Generator": {"top_k": reader_top_k}}
138
  prediction = self.generative_HF_pipe.run( query = query, params = params)
139
- return prediction
140
 
141
  class Log():
142
 
 
76
  class PinecodeProposalQueries(DocumentQueries):
77
 
78
  def __init__(self, index_name: str, api_key, reader_name_or_path: str, use_gpu = True,
79
+ embedding_dim = 384, environment = "us-east1-gcp", OPENAI_key = None) -> None:
80
 
81
  reader = FARMReader(model_name_or_path = reader_name_or_path,
82
  use_gpu = use_gpu, num_processes = 1,
83
  context_window_size = 200)
84
 
85
  self._initialize_pipeline(index_name, api_key, reader = reader, embedding_dim=
86
+ embedding_dim, environment = environment, OPENAI_key= OPENAI_key)
87
  #self.log = Log(es_host= es_host, es_index="log", es_user = es_user, es_password= es_password)
88
 
89
  def _initialize_pipeline(self, index_name, api_key, similarity = "cosine",
90
  embedding_dim = 384, reader = None,
91
  environment = "us-east1-gcp",
92
+ metadata_config = {"indexed": ["title", "source_title"]},
93
+ OPENAI_key = None):
94
  if reader is not None:
95
  self.reader = reader
96
+
 
97
  #pinecone.init(api_key=es_password, environment="us-east1-gcp")
98
 
99
  self.document_store = PineconeDocumentStore(
 
113
 
114
  self.extractive_pipe = ExtractiveQAPipeline (reader = self.reader,
115
  retriever = self.retriever)
 
 
116
 
117
+ self.generative_OPENAI_pipe = None
118
+ if (OPENAI_key != None and OPENAI_key != ""):
119
+ OPENAI_generator = OpenAIAnswerGenerator(api_key = OPENAI_key,
120
+ model="text-davinci-003", temperature=.5, max_tokens=60)
121
+ self.generative_OPENAI_pipe = GenerativeQAPipeline(generator = OPENAI_generator,
122
+ retriever = self.retriever)
123
+
124
  def search_by_query(self, query : str, retriever_top_k: int, reader_top_k: int, index_name: str = None, filters = None):
125
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
126
  params = {"Retriever": {"top_k": retriever_top_k,
 
129
  prediction = self.extractive_pipe.run( query = query, params = params, debug = True)
130
  return prediction["answers"]
131
 
132
+ def genenerate_answer_OpenAI(self, query : str, retriever_top_k: int, generator_top_k: int, filters = None, OPENAI_key = None):
133
+ # if (OPENAI_key != None and OPENAI_key != ""):
134
+ # OPENAI_generator = OpenAIAnswerGenerator(api_key=OPENAI_key,
135
+ # model="text-davinci-003", temperature=.5, max_tokens=60)
136
+ # self.generative_OPENAI_pipe = GenerativeQAPipeline(generator = OPENAI_generator,
137
+ # retriever = self.retriever)
138
+
139
+ if (self.generative_OPENAI_pipe != None):
140
+ params = {"Retriever": {"top_k": retriever_top_k,
141
+ "filters": filters},
142
+ "Generator": {"top_k": generator_top_k}}
143
+ prediction = self.generative_OPENAI_pipe.run( query = query, params = params)
144
+ return prediction["answers"]
145
+ else:
146
+ return None
147
 
148
  def genenerate_answer_HF(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None, filters = None) :
149
  params = {"Retriever": {"top_k": retriever_top_k,
150
  "filters": filters},
151
  "Generator": {"top_k": reader_top_k}}
152
  prediction = self.generative_HF_pipe.run( query = query, params = params)
153
+ return prediction["answers"]
154
 
155
  class Log():
156
 
reformas.py CHANGED
@@ -10,14 +10,16 @@ import logging
10
  logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
11
  logging.getLogger("haystack").setLevel(logging.INFO)
12
 
13
- extractive_query = PinecodeProposalQueries (index_name= Config.index_name,
 
 
14
  api_key = Config.es_password,
15
  environment = Config.pinecone_environment,
16
  embedding_dim = Config.embedding_dim,
17
  reader_name_or_path = Config.reader_model_name_or_path,
18
- use_gpu = Config.use_gpu)
 
19
 
20
- def reforma_salud_febrero_2023():
21
  title = """
22
  <h1 style='
23
  text-align: center;
@@ -74,7 +76,7 @@ def reforma_salud_febrero_2023():
74
  def search(question, retriever_top_k, reader_top_k):
75
  filters = {"source_title": "Reforma de la salud 13 Febrero 2023"}
76
 
77
- query_result = extractive_query.search_by_query(query = question,
78
  retriever_top_k = retriever_top_k,
79
  reader_top_k = reader_top_k,
80
  filters = filters)
@@ -86,7 +88,7 @@ def reforma_salud_febrero_2023():
86
  int(item.meta['page']), item.meta['source_url']])
87
  #result.append([[i+1], item.answer, item.context[:200], item.meta['title']])
88
  return result
89
-
90
  def search_and_show_results():
91
  # set start time
92
  stt = time.time()
@@ -98,7 +100,7 @@ def reforma_salud_febrero_2023():
98
  elapsed_time = round(ent - stt, 2)
99
 
100
  # show which query was entered, and what was searching time
101
- st.write(f"**Resultados relacionados con:** \"{query}\" ({elapsed_time} sec.)")
102
  # then we use loop to show results
103
  for i, answer in enumerate(results):
104
  # answer starts with header
@@ -106,12 +108,54 @@ def reforma_salud_febrero_2023():
106
  # cropped answer
107
  doc = answer[2][:250] + "..."
108
  st.markdown(f"{doc}[**Lee más aquí**]({answer[6]})")
109
- st.caption(f"Fuente: {answer[4]} - Capítulo: {answer[3]} - Página: {answer[5]}")
110
 
111
- st.markdown("---")
112
 
113
- #results = search("que es el adres", retriever_top_k=5, reader_top_k=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  st.write(f"""<br><p>Cuanto más contexto le des a la pregunta mejores resultados se obtienen.
116
  No es un sistema basado en palabras claves, puedes escribir preguntas elaboradas.
117
  Una serie de modelos de lenguaje transformers intervienen en cada consulta para ayudar a entenderlas."""
@@ -124,4 +168,11 @@ def reforma_salud_febrero_2023():
124
  st.error("¡escribe una pregunta!")
125
  else:
126
  st.session_state["submit"] = True
127
- search_and_show_results()
 
 
 
 
 
 
 
 
10
  logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
11
  logging.getLogger("haystack").setLevel(logging.INFO)
12
 
13
+ def reforma_salud_febrero_2023(api_key):
14
+
15
+ queries = PinecodeProposalQueries (index_name= Config.index_name,
16
  api_key = Config.es_password,
17
  environment = Config.pinecone_environment,
18
  embedding_dim = Config.embedding_dim,
19
  reader_name_or_path = Config.reader_model_name_or_path,
20
+ use_gpu = Config.use_gpu,
21
+ OPENAI_key=api_key)
22
 
 
23
  title = """
24
  <h1 style='
25
  text-align: center;
 
76
  def search(question, retriever_top_k, reader_top_k):
77
  filters = {"source_title": "Reforma de la salud 13 Febrero 2023"}
78
 
79
+ query_result = queries.search_by_query(query = question,
80
  retriever_top_k = retriever_top_k,
81
  reader_top_k = reader_top_k,
82
  filters = filters)
 
88
  int(item.meta['page']), item.meta['source_url']])
89
  #result.append([[i+1], item.answer, item.context[:200], item.meta['title']])
90
  return result
91
+
92
  def search_and_show_results():
93
  # set start time
94
  stt = time.time()
 
100
  elapsed_time = round(ent - stt, 2)
101
 
102
  # show which query was entered, and what was searching time
103
+ st.write(f"**Resultados encontrados de las fuentes** \"{query}\" ({elapsed_time} sec.):")
104
  # then we use loop to show results
105
  for i, answer in enumerate(results):
106
  # answer starts with header
 
108
  # cropped answer
109
  doc = answer[2][:250] + "..."
110
  st.markdown(f"{doc}[**Lee más aquí**]({answer[6]})")
111
+ st.caption(f"Fuente: {answer[4]} - Artículo: {answer[3]} - Página: {answer[5]}")
112
 
113
+ #st.markdown("---")
114
 
115
+ def search_and_generate_answer(question, retriever_top_k, generator_top_k):
116
+ filters = {"source_title": "Reforma de la salud 13 Febrero 2023"}
117
+
118
+ query_result = queries.genenerate_answer_OpenAI(query = question,
119
+ retriever_top_k = retriever_top_k,
120
+ generator_top_k = generator_top_k,
121
+ filters = filters)
122
+
123
+ result = []
124
+ for i in range(0, len(query_result)):
125
+ item = query_result[i]
126
+ source_title = item.meta['doc_metas'][0]['source_title']
127
+ source_url = item.meta['doc_metas'][0]['source_url']
128
+ chapter_titles = [source['title'] for source in item.meta['doc_metas']]
129
+ result.append([[i+1], item.answer.replace("\n",""),
130
+ source_title, source_url, str(chapter_titles)])
131
+ return result
132
 
133
+ def search_and_show_generative_results():
134
+ # set start time
135
+ stt = time.time()
136
+
137
+ # retrieve top 5 documents
138
+ results = search_and_generate_answer(query, retriever_top_k=5, generator_top_k=1)
139
+ # set endtime
140
+ ent = time.time()
141
+ # measure resulting time
142
+ elapsed_time = round(ent - stt, 2)
143
+
144
+ # show which query was entered, and what was searching time
145
+ st.write(f"**Respuesta generada a partir de los resultados** \"{query}\" ({elapsed_time} sec.):")
146
+ if results != None:
147
+ for i, answer in enumerate(results):
148
+ # answer starts with header
149
+ st.subheader(f"{answer[1]}")
150
+ st.markdown(f"[**Lee más aquí**]({answer[3]})")
151
+ st.caption(f"Fuentes: {answer[2]} - {answer[4]}")
152
+
153
+ st.markdown("---")
154
+
155
+ #st.markdown("---")
156
+
157
+ #results = search("que es el adres", retriever_top_k=5, reader_top_k=3)
158
+
159
  st.write(f"""<br><p>Cuanto más contexto le des a la pregunta mejores resultados se obtienen.
160
  No es un sistema basado en palabras claves, puedes escribir preguntas elaboradas.
161
  Una serie de modelos de lenguaje transformers intervienen en cada consulta para ayudar a entenderlas."""
 
168
  st.error("¡escribe una pregunta!")
169
  else:
170
  st.session_state["submit"] = True
171
+ #if not st.session_state.get("OPENAI_API_KEY"):
172
+ if api_key:
173
+ search_and_show_generative_results()
174
+ search_and_show_results()
175
+
176
+ #r = search_and_generate_answer("que es el ADRES?", retriever_top_k = 5, generator_top_k = 1)
177
+
178
+