Commit 4b4bf28
Théo ALVES DA COSTA committed
1 Parent(s): 24f8d00

Updated v1.3 with images

app.py CHANGED
@@ -9,6 +9,10 @@ import os
 import time
 import re
 import json
+
+from io import BytesIO
+import base64
+
 from datetime import datetime
 from azure.storage.fileshare import ShareServiceClient
 
@@ -64,8 +68,6 @@ file_share_name = "climateqa"
 service = ShareServiceClient(account_url=account_url, credential=credential)
 share_client = service.get_share_client(file_share_name)
 
-print("YO",account_url,credential)
-
 user_id = create_user_id()
 
 
@@ -145,18 +147,12 @@ async def chat(query,history,audience,sources,reports):
         reports = []
 
 
-    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,reports = reports,k_summary = 3,k_total = 10,threshold=0.5)
+    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
     rag_chain = make_rag_chain(retriever,llm)
 
-    source_string = ""
-
-
     # gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
-
     # history = history + [(query,"")]
-
     # print(history)
-
     # print(gradio_format)
 
     # # reset memory
@@ -227,7 +223,7 @@ async def chat(query,history,audience,sources,reports):
                 output_language = op['value']["language"] # str
                 output_query = op["value"]["question"]
             except Exception as e:
-                raise gr.Error(f"ClimateQ&A Error: {e}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
+                raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
 
         elif op['path'] == retriever_path_id: # documents
             try:
@@ -267,8 +263,7 @@ async def chat(query,history,audience,sources,reports):
             yield history,docs_html,output_query,output_language,gallery
 
         except Exception as e:
-            print(f"Error in fallback iterator: {e}")
-            raise gr.Error(f"ClimateQ&A Error: {e}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
+            raise gr.Error(f"ClimateQ&A Error: {e}</br>The error has been noted, try another question and if the error remains, you can contact us :)")
 
 
     try:
@@ -282,6 +277,7 @@ async def chat(query,history,audience,sources,reports):
             "prompt": prompt,
             "query": prompt,
             "question":output_query,
+            "sources":sources,
             "docs":serialize_docs(docs),
             "answer": history[-1][1],
             "time": timestamp,
@@ -289,8 +285,43 @@ async def chat(query,history,audience,sources,reports):
         log_on_azure(file, logs, share_client)
     except Exception as e:
         print(f"Error logging on Azure Blob Storage: {e}")
-        raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
+        raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]}</br>The error has been noted, try another question and if the error remains, you can contact us :)")
 
+    image_dict = {}
+    for i,doc in enumerate(docs):
+
+        if doc.metadata["chunk_type"] == "image":
+            try:
+                key = f"Image {i}"
+                image_path = doc.metadata["image_path"].split("documents/")[1]
+                img = get_image_from_azure_blob_storage(image_path)
+
+                # Convert the image to a byte buffer
+                buffered = BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode()
+
+                # Embedding the base64 string in Markdown
+                markdown_image = f"![Alt text](data:image/png;base64,{img_str})"
+                image_dict[key] = {"img":img,"md":markdown_image,"caption":doc.page_content,"key":key,"figure_code":doc.metadata["figure_code"]}
+            except Exception as e:
+                print(f"Skipped adding image {i} because of {e}")
+
+    if len(image_dict) > 0:
+
+        gallery = [x["img"] for x in list(image_dict.values())]
+        img = list(image_dict.values())[0]
+        img_md = img["md"]
+        img_caption = img["caption"]
+        img_code = img["figure_code"]
+        if img_code != "N/A":
+            img_name = f"{img['key']} - {img['figure_code']}"
+        else:
+            img_name = f"{img['key']}"
+
+        answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
+        history[-1] = (history[-1][0],answer_yet)
+        history = [tuple(x) for x in history]
 
     # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
     # if len(gallery) > 0:
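
The base64 embedding added in this hunk is the standard PIL-to-data-URI round trip. A minimal, self-contained sketch of the same technique, with a generated placeholder image standing in for the figure fetched from Azure:

    import base64
    from io import BytesIO

    from PIL import Image

    # Stand-in for the figure fetched from Azure file storage.
    img = Image.new("RGB", (64, 64), color="steelblue")

    # Serialize the image into an in-memory PNG buffer.
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()

    # Embed the payload directly in Markdown; the chatbot can render it
    # inline without needing a hosted file.
    markdown_image = f"![Alt text](data:image/png;base64,{img_str})"
    print(markdown_image[:60] + "...")
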
@@ -334,21 +365,66 @@ def make_html_source(source,i):
     meta = source.metadata
     # content = source.page_content.split(":",1)[1].strip()
     content = source.page_content.strip()
-    return f"""
-    <div class="card">
-        <div class="card-content">
-            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
-            <p>{content}</p>
-        </div>
-        <div class="card-footer">
-            <span>{meta['name']}</span>
-            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
+
+    toc_levels = []
+    for j in range(2):
+        level = meta[f"toc_level{j}"]
+        if level != "N/A":
+            toc_levels.append(level)
+        else:
+            break
+    toc_levels = " > ".join(toc_levels)
+    print(toc_levels)
+
+    if len(toc_levels) > 0:
+        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+    else:
+        name = meta['name']
+
+    print(name)
+
+
+    if meta["chunk_type"] == "text":
+
+        card = f"""
+        <div class="card">
+            <div class="card-content">
+                <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+                <p>{content}</p>
+            </div>
+            <div class="card-footer">
+                <span>{name}</span>
+                <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                    <span role="img" aria-label="Open PDF">🔗</span>
+                </a>
+            </div>
+        </div>
+        """
+
+    else:
+
+        if meta["figure_code"] != "N/A":
+            title = f"{meta['figure_code']} - {meta['short_name']}"
+        else:
+            title = f"{meta['short_name']}"
+
+        card = f"""
+        <div class="card card-image">
+            <div class="card-content">
+                <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+                <p>{content}</p>
+                <p class='ai-generated'>AI-generated description</p>
+            </div>
+            <div class="card-footer">
+                <span>{name}</span>
+                <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                    <span role="img" aria-label="Open PDF">🔗</span>
+                </a>
+            </div>
+        </div>
+        """
+
+    return card
 
 
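For reference, a hedged sketch of the metadata the reworked make_html_source now reads; every value below is invented for illustration, the real fields come from the vectorstore chunks:

    from langchain_core.documents import Document

    doc = Document(
        page_content="Observed warming is driven by emissions from human activities.",
        metadata={
            "chunk_type": "text",        # "text" renders a doc card, anything else an image card
            "short_name": "IPCC AR6 WGI",
            "name": "Climate Change 2021: The Physical Science Basis",
            "url": "https://example.org/report.pdf",  # placeholder
            "page_number": 12,
            "toc_level0": "Chapter 1",   # joined into the "Chapter 1 > ..." breadcrumb
            "toc_level1": "N/A",         # "N/A" stops the breadcrumb early
            "figure_code": "N/A",
        },
    )

    card_html = make_html_source(doc, 1)  # assumes make_html_source is in scope
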
@@ -501,71 +577,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
         output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
         output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
 
-    with gr.Tab("Figures",elem_id = "tab-images",id = 3):
-        gallery = gr.Gallery()
-
-
-    def start_chat(query,history):
-        history = history + [(query,"")]
-        history = [tuple(x) for x in history]
-        print(history)
-        return (gr.update(interactive = False),gr.update(selected=1),history)
-
-    def finish_chat():
-        return (gr.update(interactive = True,value = ""))
-
-    (textbox
-        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery],concurrency_limit = 8,api_name = "chat_textbox")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
-    )
-
-    (examples_hidden
-        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery],concurrency_limit = 8,api_name = "chat_examples")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
-    )
-
-
-    def change_sample_questions(key):
-        index = list(QUESTIONS.keys()).index(key)
-        visible_bools = [False] * len(samples)
-        visible_bools[index] = True
-        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-
-
-    dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-
-    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
-    # (textbox
-    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-    #     .success(change_tab,None,tabs)
-    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
-    #     .success(lambda x : textbox,[textbox],[textbox])
-    # )
-
-    # (examples_hidden
-    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-    #     .success(change_tab,None,tabs)
-    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
-    #     .success(lambda x : textbox,[textbox],[textbox])
-    # )
-    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
-    #     answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
-    # )
-
-
 
 
@@ -575,6 +586,9 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
    #---------------------------------------------------------------------------------------
 
 
+    with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
+        gallery_component = gr.Gallery()
+
    with gr.Tab("About ClimateQ&A",elem_classes = "max-height other-tabs"):
        with gr.Row():
            with gr.Column(scale=1):
@@ -758,6 +772,62 @@ Or around 2 to 4 times more than a typical Google search.
     """
     )
 
+
+
+    def start_chat(query,history):
+        history = history + [(query,"")]
+        history = [tuple(x) for x in history]
+        print(history)
+        return (gr.update(interactive = False),gr.update(selected=1),history)
+
+    def finish_chat():
+        return (gr.update(interactive = True,value = ""))
+
+    (textbox
+        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
+    )
+
+    (examples_hidden
+        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
+    )
+
+
+    def change_sample_questions(key):
+        index = list(QUESTIONS.keys()).index(key)
+        visible_bools = [False] * len(samples)
+        visible_bools[index] = True
+        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
+
+
+    dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
+
+    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
+    # (textbox
+    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+
+    # (examples_hidden
+    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
+    #     answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
+    # )
+
+
 demo.queue()
 
 demo.launch()
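
The relocated wiring above streams because chat is an async generator: every yield pushes a partial (history, sources, query, language, gallery) update to the five bound components. A stripped-down sketch of the same pattern, not the app's real code:

    import asyncio
    import gradio as gr

    async def fake_chat(query, history):
        history = history + [(query, "")]
        # Each yield re-renders the bound Chatbot with a longer partial answer.
        for token in ["Climate ", "change ", "is ", "real."]:
            await asyncio.sleep(0.1)
            history[-1] = (history[-1][0], history[-1][1] + token)
            yield history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        textbox = gr.Textbox()
        textbox.submit(fake_chat, [textbox, chatbot], [chatbot])

    # demo.launch()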
 
climateqa/engine/prompts.py CHANGED
@@ -49,6 +49,7 @@ Guidelines:
 - You do not need to use every passage. Only use the ones that help answer the question.
 - If the documents do not have the information needed to answer the question, just say you do not have enough information.
 - Consider by default that the question is about the past century unless it is specified otherwise.
+- If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
 
 -----------------------
 Passages:
@@ -59,18 +60,58 @@ Question: {question} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 
+answer_prompt_images_template = """
+You are ClimateQ&A, an AI Assistant created by Ekimetrics.
+You are given the answer to a environmental question based on passages from the IPCC and IPBES reports and image captions.
+
+Generate a follow-up and illustrated explanation to the existing answer using the content of the image caption.
+The actual images will be inserted in the user interface afterward.
+
+
+Guidelines:
+- Don't summarize the previous answer or make an introduction, you only need to illustrate with the images.
+- Mention the image using similar sentence : "Indeed, as we see in this picture ...", "In the following image, it is shown that ...", but without mentioning the Image number
+- Insert a placeholder like this [Image i] and by skipping to a new line before and after, where the image will be displayed within your explanation
+
+For example :
+```
+Sea rise is projected to endanger isolated islands by 2050.
+In the figure below, we see an projection of sea level rise:
+[Image 1]
+This image is depicting the urgency depicted in the passages.
+```
+
+-----------------------
+Image captions:
+{images}
+
+-----------------------
+Question:
+{question}
+
+-----------------------
+Answer:
+{answer}
+
+-----------------------
+
+Follow-up explanation in {language} explained to {audience}:
+"""
+
 
 answer_prompt_without_docs_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. Your role is to explain climate-related questions using info from the IPCC and/or IPBES reports.
 Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.
 
 Guidelines:
-- Start by explaining clearly that you could not find the answer in the IPCC/IPBES reports, so your answer is based on your own knowledge and must be taken with great caution because it's AI generated.
+- If it's a conversational question such as "hello", "who are you", ..., you can answer directly
+- Start by explaining clearly that you could not find any passages to answer in the IPCC/IPBES reports, but it can be because of the search engine, and not because it's not there. So your answer is based on your own knowledge and must be taken with great caution because it's AI generated and you prefer to use sources to answer.
 - Also explain that the user should reformulate the question to be more specific to the IPCC/IPBES reports
 - Because your role is to explain climate science, you can never be skeptical about climate change.
-- If the question is not related to environmental issues, never never answer it. Say it's not your role.
+- If the question is not related to environmental issues, never never answer it. Say it's not your role.
+- Make paragraphs by starting new lines to make your answers more readable.
 
-Question: {question} - Explained to {audience}
+Question: {question}
 Answer in {language}:
 """
 
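The new answer_prompt_images_template is a plain format template; a quick sketch of how it would render, with invented values for every variable and the template assumed imported from climateqa.engine.prompts:

    from langchain_core.prompts import ChatPromptTemplate

    prompt = ChatPromptTemplate.from_template(answer_prompt_images_template)

    messages = prompt.format_messages(
        images="Doc 1: Projected sea level rise under an intermediate emissions scenario.",
        question="How fast is sea level rising?",
        answer="Global mean sea level rose by about 20 cm over the last century.",
        language="English",
        audience="the general public",
    )
    print(messages[0].content)
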
climateqa/engine/rag.py CHANGED
@@ -7,7 +7,7 @@ from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
 
 from climateqa.engine.reformulation import make_reformulation_chain
-from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template
+from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
 from climateqa.engine.utils import pass_values, flatten_dict
 
 
@@ -16,10 +16,26 @@ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}"
 def _combine_documents(
     docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
 ):
-    doc_strings = [f"Doc {i+1}: " + format_document(doc, document_prompt) for i,doc in enumerate(docs)]
+
+    doc_strings = []
+
+    for i,doc in enumerate(docs):
+        # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
+        chunk_type = "Doc"
+        doc_string = f"{chunk_type} {i+1}: " + format_document(doc, document_prompt)
+        doc_string = doc_string.replace("\n"," ")
+        doc_strings.append(doc_string)
+
     return sep.join(doc_strings)
 
 
+def get_text_docs(x):
+    return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
+
+def get_image_docs(x):
+    return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
+
+
 def make_rag_chain(retriever,llm):
 
 
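A small sketch of what the reworked helpers produce on toy documents; the documents are invented, and the helpers above are assumed in scope:

    from langchain_core.documents import Document

    docs = [
        Document(page_content="Warming of 1.1°C\nsince 1850-1900.", metadata={"chunk_type": "text"}),
        Document(page_content="Caption: observed temperature anomaly.", metadata={"chunk_type": "image"}),
    ]

    print(get_image_docs(docs))     # keeps only the image-caption chunk
    print(_combine_documents(docs))
    # Doc 1: Warming of 1.1°C since 1850-1900.
    #
    # Doc 2: Caption: observed temperature anomaly.
    # (newlines inside each doc are flattened to spaces before joining)
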
@@ -51,22 +67,29 @@ def make_rag_chain(retriever,llm):
         **pass_values(["question","audience","language"])
     }
 
-    # Generate the answer
-
+    # ------- CHAIN 3
+    # Bot answer
 
 
     answer_with_docs = {
         "answer": input_documents | prompt | llm | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs"])
+        **pass_values(["question","audience","language","query","docs"]),
     }
 
     answer_without_docs = {
         "answer": prompt_without_docs | llm | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs"])
+        **pass_values(["question","audience","language","query","docs"]),
     }
 
+    # def has_images(x):
+    #     image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
+    #     return len(image_docs) > 0
+
+    def has_docs(x):
+        return len(x["docs"]) > 0
+
     answer = RunnableBranch(
-        (lambda x: len(x["docs"]) > 0, answer_with_docs),
+        (lambda x: has_docs(x), answer_with_docs),
         answer_without_docs,
     )
 
@@ -77,3 +100,16 @@ def make_rag_chain(retriever,llm):
 
     return rag_chain
 
+
+
+def make_illustration_chain(llm):
+
+    prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
+
+    input_description_images = {
+        "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
+        **pass_values(["question","audience","language","answer"]),
+    }
+
+    illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
+    return illustration_chain
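make_illustration_chain does not appear to be wired into the app in this commit; a hedged sketch of how it could be invoked, given the keys it passes through (llm is any LangChain chat model and docs is a list of retrieved documents, both stand-ins):

    illustration_chain = make_illustration_chain(llm)

    followup = illustration_chain.invoke({
        "docs": docs,          # must include documents with metadata["chunk_type"] == "image"
        "question": "How fast is sea level rising?",
        "answer": "Global mean sea level rose by about 20 cm over the last century.",
        "language": "English",
        "audience": "the general public",
    })
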
climateqa/engine/retriever.py CHANGED
@@ -18,7 +18,8 @@ class ClimateQARetriever(BaseRetriever):
     threshold:float = 0.6
     k_summary:int = 3
     k_total:int = 10
-    namespace:str = "vectors"
+    namespace:str = "vectors",
+    min_size:int = 200,
 
 
     def _get_relevant_documents(
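
One caveat on the hunk above: the trailing commas on the two new field declarations make Python bind one-element tuples rather than scalars, since the comma turns the right-hand side into a tuple. A minimal demonstration:

    # The trailing comma makes each default a one-element tuple, not a scalar.
    class Example:
        namespace: str = "vectors",   # binds ('vectors',)
        min_size: int = 200,          # binds (200,)

    print(Example.namespace)  # ('vectors',)
    print(Example.min_size)   # (200,)
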
@@ -31,8 +32,8 @@ class ClimateQARetriever(BaseRetriever):
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
 
         # Prepare base search kwargs
-
         filters = {}
+
         if len(self.reports) > 0:
             filters["short_name"] = {"$in":self.reports}
         else:
@@ -59,14 +60,14 @@ class ClimateQARetriever(BaseRetriever):
         docs = docs_summaries + docs_full
 
         # Filter if scores are below threshold
-        docs = [x for x in docs if x[1] > self.threshold]
+        # docs = [x for x in docs if x[1] > self.threshold and len(x[0].page_content) > self.min_size]
 
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
-            doc.metadata["page_number"] = int(doc.metadata["page_number"])
+            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
             # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
             results.append(doc)
 
climateqa/engine/vectorstore.py CHANGED
@@ -2,8 +2,8 @@
 # More info at https://docs.pinecone.io/docs/langchain
 # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
 import os
-import pinecone
-from langchain_community.vectorstores import Pinecone
+from pinecone import Pinecone
+from langchain_community.vectorstores import Pinecone as PineconeVectorstore
 
 # LOAD ENVIRONMENT VARIABLES
 try:
@@ -13,20 +13,29 @@ except:
     pass
 
 
-def get_pinecone_vectorstore(embeddings,text_key = "text"):
+def get_pinecone_vectorstore(embeddings,text_key = "content"):
 
-    # initialize pinecone
-    pinecone.init(
-        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
-        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
-    )
+    # # initialize pinecone
+    # pinecone.init(
+    #     api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
+    #     environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
+    # )
+
+    # index_name = os.getenv("PINECONE_API_INDEX")
+    # vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
+
+    # return vectorstore
 
-    index_name = os.getenv("PINECONE_API_INDEX")
-    vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
+    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+    index = pc.Index(os.getenv("PINECONE_API_INDEX"))
 
+    vectorstore = PineconeVectorstore(
+        index, embeddings, text_key,
+    )
     return vectorstore
 
 
+
 # def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
 
 #     assert isinstance(sources,list)
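
A hedged sketch of calling the migrated helper; it assumes the v3-style pinecone client shown above, that the two environment variables are set, and an example embeddings model that is not part of this commit:

    import os
    from langchain_community.embeddings import HuggingFaceEmbeddings

    os.environ.setdefault("PINECONE_API_KEY", "...")    # placeholder
    os.environ.setdefault("PINECONE_API_INDEX", "...")  # placeholder

    # Any LangChain embeddings object works here; this model is just an example.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    vectorstore = get_pinecone_vectorstore(embeddings)  # text_key now defaults to "content"
    docs = vectorstore.similarity_search("sea level rise", k=4)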
 
climateqa/sample_questions.py CHANGED
@@ -73,6 +73,12 @@ QUESTIONS = {
         "What are the impacts of invasive alien species on Indigenous Peoples and local communities?",
         "What technologies and tools are available for managing invasive alien species?",
         "How do economic and land-use changes facilitate the introduction and spread of invasive alien species?"
+    ],
+    "Experimental images":[
+        "Is warming unprecedented in the past 200 years ?",
+        "Are human activities causing global warming?",
+        "What is the distribution of uncertainty in projected precipitation changes across different time frames ?",
+        "What are the anticipated changes in the global water cycle by the end of the 21st century under an intermediate emissions scenario ?",
     ]
 
 }
climateqa/utils.py CHANGED
@@ -15,7 +15,7 @@ def get_file_from_azure_blob_storage(path):
 
 
 def get_image_from_azure_blob_storage(path):
-    base_path = "search_demo/climateq&a/processed_image/"
+    base_path = "climateqa/documents/"
     path = os.path.join(base_path, path)
     file_object = get_file_from_azure_blob_storage(path)
     image = Image.open(file_object)
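
The new base_path pairs with the split("documents/") added in app.py above; a short sketch of the round trip, with an invented path:

    import os

    metadata_image_path = "climateqa/documents/IPCC_AR6_WGI/figure_1.png"  # invented example

    # app.py keeps only the part after "documents/" ...
    relative = metadata_image_path.split("documents/")[1]   # "IPCC_AR6_WGI/figure_1.png"

    # ... and get_image_from_azure_blob_storage re-prefixes the new base path.
    base_path = "climateqa/documents/"
    print(os.path.join(base_path, relative))  # climateqa/documents/IPCC_AR6_WGI/figure_1.png
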
style.css CHANGED
@@ -295,4 +295,28 @@ body.dark .card-footer span {
     white-space: normal !important; /* Allow the text to wrap */
     word-break: break-word !important; /* Break words to prevent overflow */
     overflow-wrap: break-word !important; /* Break long words if necessary */
-}
+}
+
+span.chatbot > p > img{
+    margin-top:40px !important;
+    max-height: none !important;
+    max-width: 80% !important;
+    border-radius:0px !important;
+}
+
+
+.chatbot-caption{
+    font-size:11px;
+    font-style:italic;
+    color:#508094;
+}
+
+.ai-generated{
+    font-size:11px!important;
+    font-style:italic;
+    color:#73b8d4 !important;
+}
+
+.card-image > .card-content{
+    background-color:#f1f7fa !important;
+}