Spaces:

Ekimetrics
/

climate-question-answering

Running

@@ -1,12 +1,13 @@
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
 from sentence_transformers import CrossEncoder
-# reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
 import gradio as gr
-from gradio_modal import Modal
 import pandas as pd
 import numpy as np
 import os
@@ -14,8 +15,6 @@ import time
 import re
 import json
-from gradio import ChatMessage
 # from gradio_modal import Modal
 from io import BytesIO
@@ -26,29 +25,20 @@ from azure.storage.fileshare import ShareServiceClient
 from utils import create_user_id
-from gradio_modal import Modal
-from PIL import Image
-from langchain_core.runnables.schema import StreamEvent
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
-# from climateqa.knowledge.retriever import ClimateQARetriever
-from climateqa.engine.reranker import get_reranker
 from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.chains.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
-from climateqa.constants import POSSIBLE_REPORTS, OWID_CATEGORIES
 from climateqa.utils import get_image_from_azure_blob_storage
-from climateqa.engine.graph import make_graph_agent
-from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.chains.retrieve_papers import find_papers
-from front.utils import serialize_docs,process_figures
-from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
 # Load environment variables in local mode
 try:
@@ -57,8 +47,6 @@ try:
 except Exception as e:
     pass
-import requests
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
@@ -92,114 +80,134 @@ share_client = service.get_share_client(file_share_name)
 user_id = create_user_id()
-CITATION_LABEL = "BibTeX citation for ClimateQ&A"
-CITATION_TEXT = r"""@misc{climateqa,
-    author={Théo Alves Da Costa, Timothée Bohe},
-    title={ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
-    year={2024},
-    howpublished= {\url{https://climateqa.com}},
-}
-@software{climateqa,
-    author = {Théo Alves Da Costa, Timothée Bohe},
-    publisher = {ClimateQ&A},
-    title = {ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
-}
-"""
 # Create vectorstore and retriever
-vectorstore = get_pinecone_vectorstore(embeddings_function, index_name = os.getenv("PINECONE_API_INDEX"))
-vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name = os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
-reranker = get_reranker("large")
-agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
-def update_config_modal_visibility(config_open):
-    new_config_visibility_status = not config_open
-    return gr.update(visible=new_config_visibility_status), new_config_visibility_status
-async def chat(query, history, audience, sources, reports, relevant_content_sources, search_only):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
-    date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f">> NEW QUESTION ({date_now}) : {query}")
-    audience_prompt = init_audience(audience)
     # Prepare default values
-    if sources is None or len(sources) == 0:
-        sources = ["IPCC", "IPBES", "IPOS"]
-    if reports is None or len(reports) == 0:
         reports = []
-    inputs = {"user_input": query,"audience": audience_prompt,"sources_input":sources, "relevant_content_sources" : relevant_content_sources, "search_only": search_only, "reports": reports}
-    result = agent.astream_events(inputs,version = "v1")
-    docs = []
-    related_contents = []
     docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
-    start_streaming = False
-    graphs_html = ""
-    figures = '<div class="figures-container"><p></p> </div>'
-    steps_display = {
-        "categorize_intent":("🔄️ Analyzing user message",True),
-        "transform_query":("🔄️ Thinking step by step to answer the question",True),
-        "retrieve_documents":("🔄️ Searching in the knowledge base",False),
-    }
-    used_documents = []
-    answer_message_content = ""
     try:
-        async for event in result:
-            if "langgraph_node" in event["metadata"]:
-                node = event["metadata"]["langgraph_node"]
-                if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" and event["data"]["output"] != None:# when documents are retrieved
-                    docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(event, history, used_documents)
-                elif event["event"] == "on_chain_end" and node == "categorize_intent" and event["name"] == "_write": # when the query is transformed
-                    intent = event["data"]["output"]["intent"]
-                    if "language" in event["data"]["output"]:
-                        output_language = event["data"]["output"]["language"]
-                    else :
-                        output_language = "English"
-                    history[-1].content = f"Language identified : {output_language} \n Intent identified : {intent}"
-                elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
-                    event_description, display_output = steps_display[node]
-                    if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
-                        history.append(ChatMessage(role="assistant", content = "", metadata={'title' :event_description}))
-                elif event["name"] != "transform_query" and event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_rag_no_docs","answer_search","answer_chitchat"]:# if streaming answer
-                    history, start_streaming, answer_message_content = stream_answer(history, event, start_streaming, answer_message_content)
-                elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
-                    graphs_html = handle_retrieved_owid_graphs(event, graphs_html)
-                if event["name"] == "transform_query" and event["event"] =="on_chain_end":
-                    if hasattr(history[-1],"content"):
-                        history[-1].content += "Decompose question into sub-questions: \n\n - " + "\n - ".join([q["question"] for q in event["data"]["output"]["remaining_questions"]])
-                if event["name"] == "categorize_intent" and event["event"] == "on_chain_start":
-                    print("X")
-            yield history, docs_html, output_query, output_language, related_contents , graphs_html,  #,output_query,output_keywords
     except Exception as e:
-        print(event, "has failed")
         raise gr.Error(f"{e}")
@@ -208,7 +216,7 @@ async def chat(query, history, audience, sources, reports, relevant_content_sour
         if os.getenv("GRADIO_ENV") != "local":
             timestamp = str(datetime.now().timestamp())
             file = timestamp + ".json"
-            prompt = history[1]["content"]
             logs = {
                 "user_id": str(user_id),
                 "prompt": prompt,
@@ -216,7 +224,7 @@ async def chat(query, history, audience, sources, reports, relevant_content_sour
                 "question":output_query,
                 "sources":sources,
                 "docs":serialize_docs(docs),
-                "answer": history[-1].content,
                 "time": timestamp,
             }
             log_on_azure(file, logs, share_client)
@@ -224,7 +232,119 @@ async def chat(query, history, audience, sources, reports, relevant_content_sour
         print(f"Error logging on Azure Blob Storage: {e}")
         raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)")
-    yield history, docs_html, output_query, output_language, related_contents, graphs_html
 def save_feedback(feed: str, user_id):
@@ -248,7 +368,77 @@ def log_on_azure(file, logs, share_client):
     file_client.upload_file(logs)
 # --------------------------------------------------------------------
@@ -263,15 +453,10 @@ Hello, I am ClimateQ&A, a conversational assistant designed to help you understa
 - **Language**: You can ask me your questions in any language.
 - **Audience**: You can specify your audience (children, general public, experts) to get a more adapted answer.
 - **Sources**: You can choose to search in the IPCC or IPBES reports, or both.
-- **Relevant content sources**: You can choose to search for figures, papers, or graphs that can be relevant for your question.
 ⚠️ Limitations
 *Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
-🛈 Information
-Please note that we log your questions for meta-analysis purposes, so avoid sharing any sensitive or personal information.
 What do you want to learn ?
 """
@@ -282,64 +467,35 @@ def vote(data: gr.LikeData):
     else:
         print(data)
-def save_graph(saved_graphs_state, embedding, category):
-    print(f"\nCategory:\n{saved_graphs_state}\n")
-    if category not in saved_graphs_state:
-        saved_graphs_state[category] = []
-    if embedding not in saved_graphs_state[category]:
-        saved_graphs_state[category].append(embedding)
-    return saved_graphs_state, gr.Button("Graph Saved")
-with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
-    chat_completed_state = gr.State(0)
-    current_graphs = gr.State([])
-    saved_graphs = gr.State({})
-    config_open = gr.State(False)
     with gr.Tab("ClimateQ&A"):
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
                 chatbot = gr.Chatbot(
-                    value = [ChatMessage(role="assistant", content=init_prompt)],
-                    type = "messages",
-                    show_copy_button=True,
-                    show_label = False,
-                    elem_id="chatbot",
-                    layout = "panel",
                     avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
-                    max_height="80vh",
-                    height="100vh"
-                )
                 # bot.like(vote,None,None)
                 with gr.Row(elem_id = "input-message"):
-                    textbox = gr.Textbox(
-                        placeholder="Ask me anything here!",
-                        show_label=False,
-                        scale=12,
-                        lines=1,
-                        interactive=True,
-                        elem_id="input-textbox"
-                    )
-                    config_button = gr.Button(
-                        "",
-                        elem_id="config-button"
-                    )
-            with gr.Column(scale=2, variant="panel",elem_id = "right-panel"):
-                with gr.Tabs(elem_id = "right_panel_tab") as tabs:
                     with gr.TabItem("Examples",elem_id = "tab-examples",id = 0):
                         examples_hidden = gr.Textbox(visible = False)
@@ -365,305 +521,102 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                                 )
                             samples.append(group_examples)
-                    # with gr.Tab("Configuration", id = 10, ) as tab_config:
-                    #         # gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
-                    #     pass
-                            # with gr.Row():
-                            #     dropdown_sources = gr.CheckboxGroup(
-                            #         ["IPCC", "IPBES","IPOS"],
-                            #         label="Select source",
-                            #         value=["IPCC"],
-                            #         interactive=True,
-                            #     )
-                            #     dropdown_external_sources = gr.CheckboxGroup(
-                            #         ["IPCC figures","OpenAlex", "OurWorldInData"],
-                            #         label="Select database to search for relevant content",
-                            #         value=["IPCC figures"],
-                            #         interactive=True,
-                            #     )
-                            # dropdown_reports = gr.Dropdown(
-                            #     POSSIBLE_REPORTS,
-                            #     label="Or select specific reports",
-                            #     multiselect=True,
-                            #     value=None,
-                            #     interactive=True,
-                            # )
-                            # search_only = gr.Checkbox(label="Search only without chating", value=False, interactive=True, elem_id="checkbox-chat")
-                            # dropdown_audience = gr.Dropdown(
-                            #     ["Children","General public","Experts"],
-                            #     label="Select audience",
-                            #     value="Experts",
-                            #     interactive=True,
-                            # )
-                            # after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers", visible=False)
-                            # output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False, visible= False)
-                            # output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False, visible= False)
-                            # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
-                            # # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
-                    with gr.Tab("Sources",elem_id = "tab-sources",id = 1) as tab_sources:
-                        sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
-                    with gr.Tab("Recommended content", elem_id="tab-recommended_content",id=2) as tab_recommended_content:
-                        with gr.Tabs(elem_id = "group-subtabs") as tabs_recommended_content:
-                            with gr.Tab("Figures",elem_id = "tab-figures",id = 3) as tab_figures:
-                                sources_raw = gr.State([])
-                                new_figures = gr.State([])
-                                used_figures = gr.State([])
-                                with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
-                                    gallery_component = gr.Gallery(object_fit='scale-down',elem_id="gallery-component", height="80vh")
-                                show_full_size_figures = gr.Button("Show figures in full size",elem_id="show-figures",interactive=True)
-                                show_full_size_figures.click(lambda : Modal(visible=True),None,figure_modal)
-                                figures_cards = gr.HTML(show_label=False, elem_id="sources-figures")
-                            with gr.Tab("Papers",elem_id = "tab-citations",id = 4) as tab_papers:
-                                # btn_summary = gr.Button("Summary")
-                                # Fenêtre simulée pour le Summary
-                                with gr.Accordion(visible=True, elem_id="papers-summary-popup", label= "See summary of relevant papers", open= False) as summary_popup:
-                                    papers_summary = gr.Markdown("", visible=True, elem_id="papers-summary")
-                                # btn_relevant_papers = gr.Button("Relevant papers")
-                                # Fenêtre simulée pour les Relevant Papers
-                                with gr.Accordion(visible=True, elem_id="papers-relevant-popup",label= "See relevant papers", open= False) as relevant_popup:
-                                    papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
-                                btn_citations_network = gr.Button("Explore papers citations network")
-                                # Fenêtre simulée pour le Citations Network
-                                with Modal(visible=False) as papers_modal:
-                                    citations_network = gr.HTML("<h3>Citations Network Graph</h3>", visible=True, elem_id="papers-citations-network")
-                                btn_citations_network.click(lambda: Modal(visible=True), None, papers_modal)
-                            with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
-                                graphs_container = gr.HTML("<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",elem_id="graphs-container")
-                                current_graphs.change(lambda x : x, inputs=[current_graphs], outputs=[graphs_container])
-            with Modal(visible=False,elem_id="modal-config") as config_modal:
-                gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
-                # with gr.Row():
-                dropdown_sources = gr.CheckboxGroup(
-                    ["IPCC", "IPBES","IPOS"],
-                    label="Select source (by default search in all sources)",
-                    value=["IPCC"],
-                    interactive=True,
-                )
-                dropdown_reports = gr.Dropdown(
-                    POSSIBLE_REPORTS,
-                    label="Or select specific reports",
-                    multiselect=True,
-                    value=None,
-                    interactive=True,
-                )
-                dropdown_external_sources = gr.CheckboxGroup(
-                    ["Figures (IPCC/IPBES)","Papers (OpenAlex)", "Graphs (OurWorldInData)"],
-                    label="Select database to search for relevant content",
-                    value=["Figures (IPCC/IPBES)"],
-                    interactive=True,
-                )
-                search_only = gr.Checkbox(label="Search only for recommended content without chating", value=False, interactive=True, elem_id="checkbox-chat")
-                dropdown_audience = gr.Dropdown(
-                    ["Children","General public","Experts"],
-                    label="Select audience",
-                    value="Experts",
-                    interactive=True,
-                )
-                after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers", visible=False)
-                output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False, visible= False)
-                output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False, visible= False)
-                dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "Papers (OpenAlex)" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
-                close_config_modal = gr.Button("Validate and Close",elem_id="close-config-modal")
-                close_config_modal.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
-                # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
-            config_button.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
-                    # with gr.Tab("OECD",elem_id = "tab-oecd",id = 6):
-                    #     oecd_indicator = "RIVER_FLOOD_RP100_POP_SH"
-                    #     oecd_topic = "climate"
-                    #     oecd_latitude = "46.8332"
-                    #     oecd_longitude = "5.3725"
-                    #     oecd_zoom = "5.6442"
-                    #     # Create the HTML content with the iframe
-                    #     iframe_html = f"""
-                    #     <iframe src="https://localdataportal.oecd.org/maps.html?indicator={oecd_indicator}&topic={oecd_topic}&latitude={oecd_latitude}&longitude={oecd_longitude}&zoom={oecd_zoom}"
-                    #             width="100%" height="600" frameborder="0" style="border:0;" allowfullscreen></iframe>
-                    #     """
-                    #     oecd_textbox = gr.HTML(iframe_html, show_label=False, elem_id="oecd-textbox")
 #---------------------------------------------------------------------------------------
 # OTHER TABS
 #---------------------------------------------------------------------------------------
-    # with gr.Tab("Settings",elem_id = "tab-config",id = 2):
-    #     gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
-    #     dropdown_sources = gr.CheckboxGroup(
-    #         ["IPCC", "IPBES","IPOS", "OpenAlex"],
-    #         label="Select source",
-    #         value=["IPCC"],
-    #         interactive=True,
-    #     )
-    #     dropdown_reports = gr.Dropdown(
-    #         POSSIBLE_REPORTS,
-    #         label="Or select specific reports",
-    #         multiselect=True,
-    #         value=None,
-    #         interactive=True,
-    #     )
-    #     dropdown_audience = gr.Dropdown(
-    #         ["Children","General public","Experts"],
-    #         label="Select audience",
-    #         value="Experts",
-    #         interactive=True,
-    #     )
-    #     output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
-    #     output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
     with gr.Tab("About",elem_classes = "max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown(
-                    """
-                    ### More info
-                    - See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)
-                    - Feedbacks on this [form](https://forms.office.com/e/1Yzgxm6jbp)
-                    ### Citation
-                    """
-                )
-                with gr.Accordion(CITATION_LABEL,elem_id="citation", open = False,):
-                    # # Display citation label and text)
-                    gr.Textbox(
-                        value=CITATION_TEXT,
-                        label="",
-                        interactive=False,
-                        show_copy_button=True,
-                        lines=len(CITATION_TEXT.split('\n')),
-                    )
-    def start_chat(query,history,search_only):
-        history = history + [ChatMessage(role="user", content=query)]
-        if not search_only:
-            return (gr.update(interactive = False),gr.update(selected=1),history, [])
-        else:
-            return (gr.update(interactive = False),gr.update(selected=2),history, [])
     def finish_chat():
-        return gr.update(interactive = True,value = "")
-    # Initialize visibility states
-    summary_visible = False
-    relevant_visible = False
-    # Functions to toggle visibility
-    def toggle_summary_visibility():
-        global summary_visible
-        summary_visible = not summary_visible
-        return gr.update(visible=summary_visible)
-    def toggle_relevant_visibility():
-        global relevant_visible
-        relevant_visible = not relevant_visible
-        return gr.update(visible=relevant_visible)
-    def change_completion_status(current_state):
-        current_state = 1 - current_state
-        return current_state
-    def update_sources_number_display(sources_textbox, figures_cards, current_graphs, papers_html):
-        sources_number = sources_textbox.count("<h2>")
-        figures_number = figures_cards.count("<h2>")
-        graphs_number = current_graphs.count("<iframe")
-        papers_number = papers_html.count("<h2>")
-        sources_notif_label = f"Sources ({sources_number})"
-        figures_notif_label = f"Figures ({figures_number})"
-        graphs_notif_label = f"Graphs ({graphs_number})"
-        papers_notif_label = f"Papers ({papers_number})"
-        recommended_content_notif_label = f"Recommended content ({figures_number + graphs_number + papers_number})"
-        return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
     (textbox
-        .submit(start_chat, [textbox, chatbot, search_only],
-                [textbox, tabs, chatbot, sources_raw],
-                queue=False,
-                api_name="start_chat_textbox")
-        .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources,
-                     dropdown_reports, dropdown_external_sources, search_only],
-              [chatbot, sources_textbox, output_query, output_language,
-               new_figures, current_graphs],
-              concurrency_limit=8,
-              api_name="chat_textbox")
-        .then(finish_chat, None, [textbox],
-              api_name="finish_chat_textbox")
     )
     (examples_hidden
-        .change(start_chat, [examples_hidden,chatbot, search_only], [textbox,tabs,chatbot, sources_raw],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, new_figures, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
-        # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
     )
@@ -674,26 +627,51 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
         return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-    new_figures.change(process_figures, inputs=[sources_raw, new_figures], outputs=[sources_raw, figures_cards, gallery_component])
-    # update sources numbers
-    sources_textbox.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-    figures_cards.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-    current_graphs.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-    papers_html.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-    # other questions examples
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-    # search for papers
-    textbox.submit(find_papers,[textbox,after, dropdown_external_sources], [papers_html,citations_network,papers_summary])
-    examples_hidden.change(find_papers,[examples_hidden,after,dropdown_external_sources], [papers_html,citations_network,papers_summary])
-    # btn_summary.click(toggle_summary_visibility, outputs=summary_popup)
-    # btn_relevant_papers.click(toggle_relevant_visibility, outputs=relevant_popup)
-    demo.queue()
-demo.launch(ssr_mode=False)

 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
+from climateqa.papers.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
+reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
+oa = OpenAlex()
 import gradio as gr
 import pandas as pd
 import numpy as np
 import os
 import re
 import json
 # from gradio_modal import Modal
 from io import BytesIO
 from utils import create_user_id
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
+from climateqa.engine.rag import make_rag_chain
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
+from climateqa.engine.retriever import ClimateQARetriever
 from climateqa.engine.embeddings import get_embeddings_function
+from climateqa.engine.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
+from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
+from climateqa.engine.keywords import make_keywords_chain
+from climateqa.engine.rag import make_rag_papers_chain
 # Load environment variables in local mode
 try:
 except Exception as e:
     pass
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
 user_id = create_user_id()
+def parse_output_llm_with_sources(output):
+    # Split the content into a list of text and "[Doc X]" references
+    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
+    parts = []
+    for part in content_parts:
+        if part.startswith("Doc"):
+            subparts = part.split(",")
+            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
+            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
+            parts.append("".join(subparts))
+        else:
+            parts.append(part)
+    content_parts = "".join(parts)
+    return content_parts
 # Create vectorstore and retriever
+vectorstore = get_pinecone_vectorstore(embeddings_function)
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
+def make_pairs(lst):
+    """from a list of even lenght, make tupple pairs"""
+    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
+def serialize_docs(docs):
+    new_docs = []
+    for doc in docs:
+        new_doc = {}
+        new_doc["page_content"] = doc.page_content
+        new_doc["metadata"] = doc.metadata
+        new_docs.append(new_doc)
+    return new_docs
+async def chat(query,history,audience,sources,reports):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
+    print(f">> NEW QUESTION : {query}")
+    if audience == "Children":
+        audience_prompt = audience_prompts["children"]
+    elif audience == "General public":
+        audience_prompt = audience_prompts["general"]
+    elif audience == "Experts":
+        audience_prompt = audience_prompts["experts"]
+    else:
+        audience_prompt = audience_prompts["experts"]
     # Prepare default values
+    if len(sources) == 0:
+        sources = ["IPCC"]
+    if len(reports) == 0:
         reports = []
+    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
+    rag_chain = make_rag_chain(retriever,llm)
+    inputs = {"query": query,"audience": audience_prompt}
+    result = rag_chain.astream_log(inputs) #{"callbacks":[MyCustomAsyncHandler()]})
+    # result = rag_chain.stream(inputs)
+    path_reformulation = "/logs/reformulation/final_output"
+    path_keywords = "/logs/keywords/final_output"
+    path_retriever = "/logs/find_documents/final_output"
+    path_answer = "/logs/answer/streamed_output_str/-"
     docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
+    gallery = []
     try:
+        async for op in result:
+            op = op.ops[0]
+            if op['path'] == path_reformulation: # reforulated question
+                try:
+                    output_language = op['value']["language"] # str
+                    output_query = op["value"]["question"]
+                except Exception as e:
+                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
+            if op["path"] == path_keywords:
+                try:
+                    output_keywords = op['value']["keywords"] # str
+                    output_keywords = " AND ".join(output_keywords)
+                except Exception as e:
+                    pass
+            elif op['path'] == path_retriever: # documents
+                try:
+                    docs = op['value']['docs'] # List[Document]
+                    docs_html = []
+                    for i, d in enumerate(docs, 1):
+                        docs_html.append(make_html_source(d, i))
+                    docs_html = "".join(docs_html)
+                except TypeError:
+                    print("No documents found")
+                    print("op: ",op)
+                    continue
+            elif op['path'] == path_answer: # final answer
+                new_token = op['value'] # str
+                # time.sleep(0.01)
+                previous_answer = history[-1][1]
+                previous_answer = previous_answer if previous_answer is not None else ""
+                answer_yet = previous_answer + new_token
+                answer_yet = parse_output_llm_with_sources(answer_yet)
+                history[-1] = (query,answer_yet)
+            else:
+                continue
+            history = [tuple(x) for x in history]
+            yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
     except Exception as e:
         raise gr.Error(f"{e}")
         if os.getenv("GRADIO_ENV") != "local":
             timestamp = str(datetime.now().timestamp())
             file = timestamp + ".json"
+            prompt = history[-1][0]
             logs = {
                 "user_id": str(user_id),
                 "prompt": prompt,
                 "question":output_query,
                 "sources":sources,
                 "docs":serialize_docs(docs),
+                "answer": history[-1][1],
                 "time": timestamp,
             }
             log_on_azure(file, logs, share_client)
         print(f"Error logging on Azure Blob Storage: {e}")
         raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)")
+    image_dict = {}
+    for i,doc in enumerate(docs):
+        if doc.metadata["chunk_type"] == "image":
+            try:
+                key = f"Image {i+1}"
+                image_path = doc.metadata["image_path"].split("documents/")[1]
+                img = get_image_from_azure_blob_storage(image_path)
+                # Convert the image to a byte buffer
+                buffered = BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode()
+                # Embedding the base64 string in Markdown
+                markdown_image = f"![Alt text](data:image/png;base64,{img_str})"
+                image_dict[key] = {"img":img,"md":markdown_image,"caption":doc.page_content,"key":key,"figure_code":doc.metadata["figure_code"]}
+            except Exception as e:
+                print(f"Skipped adding image {i} because of {e}")
+    if len(image_dict) > 0:
+        gallery = [x["img"] for x in list(image_dict.values())]
+        img = list(image_dict.values())[0]
+        img_md = img["md"]
+        img_caption = img["caption"]
+        img_code = img["figure_code"]
+        if img_code != "N/A":
+            img_name = f"{img['key']} - {img['figure_code']}"
+        else:
+            img_name = f"{img['key']}"
+        answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
+        history[-1] = (history[-1][0],answer_yet)
+        history = [tuple(x) for x in history]
+    # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
+    # if len(gallery) > 0:
+    #     gallery = list(set("|".join(gallery).split("|")))
+    #     gallery = [get_image_from_azure_blob_storage(x) for x in gallery]
+    yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
+def make_html_source(source,i):
+    meta = source.metadata
+    # content = source.page_content.split(":",1)[1].strip()
+    content = source.page_content.strip()
+    toc_levels = []
+    for j in range(2):
+        level = meta[f"toc_level{j}"]
+        if level != "N/A":
+            toc_levels.append(level)
+        else:
+            break
+    toc_levels = " > ".join(toc_levels)
+    if len(toc_levels) > 0:
+        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+    else:
+        name = meta['name']
+    if meta["chunk_type"] == "text":
+        card = f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+    else:
+        if meta["figure_code"] != "N/A":
+            title = f"{meta['figure_code']} - {meta['short_name']}"
+        else:
+            title = f"{meta['short_name']}"
+        card = f"""
+    <div class="card card-image">
+        <div class="card-content">
+            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+            <p class='ai-generated'>AI-generated description</p>
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+    return card
+#     else:
+#         docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
+#         complete_response = "**No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
+#         messages.append({"role": "assistant", "content": complete_response})
+#         gradio_format = make_pairs([a["content"] for a in messages[1:]])
+#         yield gradio_format, messages, docs_string
 def save_feedback(feed: str, user_id):
     file_client.upload_file(logs)
+def generate_keywords(query):
+    chain = make_keywords_chain(llm)
+    keywords = chain.invoke(query)
+    keywords = " AND ".join(keywords["keywords"])
+    return keywords
+papers_cols_widths = {
+    "doc":50,
+    "id":100,
+    "title":300,
+    "doi":100,
+    "publication_year":100,
+    "abstract":500,
+    "rerank_score":100,
+    "is_oa":50,
+}
+papers_cols = list(papers_cols_widths.keys())
+papers_cols_widths = list(papers_cols_widths.values())
+async def find_papers(query, keywords,after):
+    summary = ""
+    df_works = oa.search(keywords,after = after)
+    df_works = df_works.dropna(subset=["abstract"])
+    df_works = oa.rerank(query,df_works,reranker)
+    df_works = df_works.sort_values("rerank_score",ascending=False)
+    G = oa.make_network(df_works)
+    height = "750px"
+    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
+    network_html = network.generate_html()
+    network_html = network_html.replace("'", "\"")
+    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
+    network_html = network_html + css_to_inject
+    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
+    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+    allow-scripts allow-same-origin allow-popups
+    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
+    docs = df_works["content"].head(15).tolist()
+    df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
+    df_works["doc"] = df_works["doc"] + 1
+    df_works = df_works[papers_cols]
+    yield df_works,network_html,summary
+    chain = make_rag_papers_chain(llm)
+    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
+    path_answer = "/logs/StrOutputParser/streamed_output/-"
+    async for op in result:
+        op = op.ops[0]
+        if op['path'] == path_answer: # reforulated question
+            new_token = op['value'] # str
+            summary += new_token
+        else:
+            continue
+        yield df_works,network_html,summary
 # --------------------------------------------------------------------
 - **Language**: You can ask me your questions in any language.
 - **Audience**: You can specify your audience (children, general public, experts) to get a more adapted answer.
 - **Sources**: You can choose to search in the IPCC or IPBES reports, or both.
 ⚠️ Limitations
 *Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
 What do you want to learn ?
 """
     else:
         print(data)
+with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main-component") as demo:
+    # user_id_state = gr.State([user_id])
     with gr.Tab("ClimateQ&A"):
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
+                # state = gr.State([system_template])
                 chatbot = gr.Chatbot(
+                    value=[(None,init_prompt)],
+                    show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",
                     avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
+                )#,avatar_images = ("assets/logo4.png",None))
                 # bot.like(vote,None,None)
                 with gr.Row(elem_id = "input-message"):
+                    textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
+                    # submit = gr.Button("",elem_id = "submit-button",scale = 1,interactive = True,icon = "https://static-00.iconduck.com/assets.00/settings-icon-2048x2046-cw28eevx.png")
+            with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
+                with gr.Tabs() as tabs:
                     with gr.TabItem("Examples",elem_id = "tab-examples",id = 0):
                         examples_hidden = gr.Textbox(visible = False)
                                 )
                             samples.append(group_examples)
+                    with gr.Tab("Sources",elem_id = "tab-citations",id = 1):
+                        sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
+                        docs_textbox = gr.State("")
+                    # with Modal(visible = False) as config_modal:
+                    with gr.Tab("Configuration",elem_id = "tab-config",id = 2):
+                        gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
+                        dropdown_sources = gr.CheckboxGroup(
+                            ["IPCC", "IPBES","IPOS"],
+                            label="Select source",
+                            value=["IPCC"],
+                            interactive=True,
+                        )
+                        dropdown_reports = gr.Dropdown(
+                            POSSIBLE_REPORTS,
+                            label="Or select specific reports",
+                            multiselect=True,
+                            value=None,
+                            interactive=True,
+                        )
+                        dropdown_audience = gr.Dropdown(
+                            ["Children","General public","Experts"],
+                            label="Select audience",
+                            value="Experts",
+                            interactive=True,
+                        )
+                        output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
+                        output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
 #---------------------------------------------------------------------------------------
 # OTHER TABS
 #---------------------------------------------------------------------------------------
+    with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
+        gallery_component = gr.Gallery()
+    with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
+                keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
+                after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
+                search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
+            with gr.Column(scale=7):
+                with gr.Tab("Summary",elem_id="papers-summary-tab"):
+                    papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
+                with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
+                    papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
+                with gr.Tab("Citations network",elem_id="papers-network-tab"):
+                    citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
     with gr.Tab("About",elem_classes = "max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)")
+    def start_chat(query,history):
+        history = history + [(query,None)]
+        history = [tuple(x) for x in history]
+        return (gr.update(interactive = False),gr.update(selected=1),history)
     def finish_chat():
+        return (gr.update(interactive = True,value = ""))
     (textbox
+        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
     )
     (examples_hidden
+        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
     )
         return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
+    query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
+    search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
+    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
+    # (textbox
+    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+    # (examples_hidden
+    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
+    #         answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
+    #     )
+    # with Modal(visible=True) as first_modal:
+    #     gr.Markdown("# Welcome to ClimateQ&A !")
+    #     gr.Markdown("### Examples")
+    #     examples = gr.Examples(
+    #         ["Yo ça roule","ça boume"],
+    #         [examples_hidden],
+    #         examples_per_page=8,
+    #         run_on_click=False,
+    #         elem_id="examples",
+    #         api_name="examples",
+    #     )
+    # submit.click(lambda: Modal(visible=True), None, config_modal)
+    demo.queue()
+demo.launch()

climateqa/constants.py CHANGED Viewed

@@ -1,6 +1,4 @@
 POSSIBLE_REPORTS = [
-    "IPBES IABWFH SPM",
-    "IPBES CBL SPM",
     "IPCC AR6 WGI SPM",
     "IPCC AR6 WGI FR",
     "IPCC AR6 WGI TS",
@@ -44,25 +42,4 @@ POSSIBLE_REPORTS = [
     "IPBES IAS A C5",
     "IPBES IAS A C6",
     "IPBES IAS A SPM"
-]
-OWID_CATEGORIES = ['Access to Energy', 'Agricultural Production',
-       'Agricultural Regulation & Policy', 'Air Pollution',
-       'Animal Welfare', 'Antibiotics', 'Biodiversity', 'Biofuels',
-       'Biological & Chemical Weapons', 'CO2 & Greenhouse Gas Emissions',
-       'COVID-19', 'Clean Water', 'Clean Water & Sanitation',
-       'Climate Change', 'Crop Yields', 'Diet Compositions',
-       'Electricity', 'Electricity Mix', 'Energy', 'Energy Efficiency',
-       'Energy Prices', 'Environmental Impacts of Food Production',
-       'Environmental Protection & Regulation', 'Famines', 'Farm Size',
-       'Fertilizers', 'Fish & Overfishing', 'Food Supply', 'Food Trade',
-       'Food Waste', 'Food and Agriculture', 'Forests & Deforestation',
-       'Fossil Fuels', 'Future Population Growth',
-       'Hunger & Undernourishment', 'Indoor Air Pollution', 'Land Use',
-       'Land Use & Yields in Agriculture', 'Lead Pollution',
-       'Meat & Dairy Production', 'Metals & Minerals',
-       'Natural Disasters', 'Nuclear Energy', 'Nuclear Weapons',
-       'Oil Spills', 'Outdoor Air Pollution', 'Ozone Layer', 'Pandemics',
-       'Pesticides', 'Plastic Pollution', 'Renewable Energy', 'Soil',
-       'Transport', 'Urbanization', 'Waste Management', 'Water Pollution',
-       'Water Use & Stress', 'Wildfires']

 POSSIBLE_REPORTS = [
     "IPCC AR6 WGI SPM",
     "IPCC AR6 WGI FR",
     "IPCC AR6 WGI TS",
     "IPBES IAS A C5",
     "IPBES IAS A C6",
     "IPBES IAS A SPM"
+]

climateqa/engine/chains/__init__.py DELETED Viewed

File without changes

climateqa/engine/chains/answer_ai_impact.py DELETED Viewed

@@ -1,46 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-prompt_template = """
-You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
-Always stay true to climate and nature science and do not make up information.
-If you do not know the answer, just say you do not know.
-## Guidelines
-- Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
-- Answer the question in the original language of the question
-## Sources
-- You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
-- You can recommend to look at the work of the AI & climate expert scientist Sasha Luccioni with in in particular those papers
-    - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
-    - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
-    - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
-- You can also recommend the following tools to calculate the carbon footprint of AI models
-    - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
-    - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLMs APIs such
-"""
-def make_ai_impact_chain(llm):
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", prompt_template),
-        ("user", "{question}")
-    ])
-    chain = prompt | llm | StrOutputParser()
-    chain = chain.with_config({"run_name":"ai_impact_chain"})
-    return chain
-def make_ai_impact_node(llm):
-    ai_impact_chain = make_ai_impact_chain(llm)
-    async def answer_ai_impact(state,config):
-        answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
-        return {"answer":answer}
-    return answer_ai_impact

climateqa/engine/chains/answer_chitchat.py DELETED Viewed

@@ -1,56 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-chitchat_prompt_template = """
-You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
-Always stay true to climate and nature science and do not make up information.
-If you do not know the answer, just say you do not know.
-## Guidelines
-- If it's a conversational question, you can normally chat with the user
-- If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
-- If the user ask if you speak any language, you can say you speak all languages :)
-- If the user ask about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
-- If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
-- Precise that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific litterature
-- If relevant you can propose up to 3 example of questions they could ask from the IPCC or IPBES reports from the examples below
-- Always answer in the original language of the question
-## Examples of questions you can suggest (in the original language of the question)
-    "What evidence do we have of climate change?",
-    "Are human activities causing global warming?",
-    "What are the impacts of climate change?",
-    "Can climate change be reversed?",
-    "What is the difference between climate change and global warming?",
-"""
-def make_chitchat_chain(llm):
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", chitchat_prompt_template),
-        ("user", "{question}")
-    ])
-    chain = prompt | llm | StrOutputParser()
-    chain = chain.with_config({"run_name":"chitchat_chain"})
-    return chain
-def make_chitchat_node(llm):
-    chitchat_chain = make_chitchat_chain(llm)
-    async def answer_chitchat(state,config):
-        print("---- Answer chitchat ----")
-        answer = await chitchat_chain.ainvoke({"question":state["user_input"]},config)
-        state["answer"] = answer
-        return state
-        # return {"answer":answer}
-    return answer_chitchat

climateqa/engine/chains/chitchat_categorization.py DELETED Viewed

@@ -1,43 +0,0 @@
-from langchain_core.pydantic_v1 import BaseModel, Field
-from typing import List
-from typing import Literal
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.utils.function_calling import convert_to_openai_function
-from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# Schema converted with convert_to_openai_function in make_chitchat_intent_categorization_chain.
-# NOTE(review): the docstring and the field description presumably end up in the
-# function definition sent to the model - treat them as prompt text, not as comments.
-class IntentCategorizer(BaseModel):
-    """Analyzing the user message input"""
-    # Read by the categorization node as output["environment"]
-    environment: bool = Field(
-        description="Return 'True' if the question relates to climate change, the environment, nature, etc. (Example: should I eat fish?). Return 'False' if the question is just chit chat or not related to the environment or climate change.",
-    )
-def make_chitchat_intent_categorization_chain(llm):
-    """Build the chain that classifies a message through the IntentCategorizer function.
-    The llm is bound to that single function (forced via `function_call`) and the
-    result goes through JsonOutputFunctionsParser.
-    """
-    function_schema = convert_to_openai_function(IntentCategorizer)
-    bound_llm = llm.bind(
-        functions = [function_schema],
-        function_call={"name":"IntentCategorizer"},
-    )
-    messages = [
-        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-        ("user", "input: {input}"),
-    ]
-    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
-def make_chitchat_intent_categorization_node(llm):
-    """Return a node that stores the chain's "environment" verdict in state["search_graphs_chitchat"]."""
-    chain = make_chitchat_intent_categorization_chain(llm)
-    def categorize_message(state):
-        """Classify state["user_input"], log the result and return the mutated state."""
-        payload = {"input": state["user_input"]}
-        output = chain.invoke(payload)
-        print(f"\n\nChit chat output intent categorization: {output}\n")
-        state["search_graphs_chitchat"] = output["environment"]
-        print(f"\n\nChit chat output intent categorization: {state}\n")
-        return state
-    return categorize_message

climateqa/engine/chains/graph_retriever.py DELETED Viewed

@@ -1,128 +0,0 @@
-import sys
-import os
-from contextlib import contextmanager
-from ..reranker import rerank_docs
-from ..graph_retriever import retrieve_graphs # GraphRetriever
-from ...utils import remove_duplicates_keep_highest_score
-def divide_into_parts(target, parts):
-    """Split `target` into `parts` integers whose sum is `target`.
-    The first `target % parts` entries get one extra unit,
-    e.g. divide_into_parts(10, 3) -> [4, 3, 3].
-    """
-    quotient, extra = divmod(target, parts)
-    return [quotient + 1 if position < extra else quotient for position in range(parts)]
-@contextmanager
-def suppress_output():
-    """Context manager that points sys.stdout and sys.stderr at os.devnull inside the block."""
-    saved_streams = sys.stdout, sys.stderr
-    with open(os.devnull, 'w') as sink:
-        sys.stdout = sys.stderr = sink
-        try:
-            yield
-        finally:
-            # Put the original streams back even when the body raised
-            sys.stdout, sys.stderr = saved_streams
-def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
-    """Build the async node that retrieves graph documents for the current questions.
-    Args:
-        vectorstore: store handed to `retrieve_graphs`.
-        reranker: passed to `rerank_docs`; when None the similarity score is copied into the reranking score.
-        rerank_by_question: when True, each question keeps only its share of `k_final` (see `divide_into_parts`).
-        k_final: maximum number of documents kept in the result.
-        k_before_reranking: `k_total` requested from `retrieve_graphs` for each question.
-    Returns:
-        The `node_retrieve_graphs` coroutine function; it returns {"recommended_content": docs}.
-    """
-    async def node_retrieve_graphs(state):
-        print("---- Retrieving graphs ----")
-        POSSIBLE_SOURCES = ["IEA", "OWID"]
-        # Use the remaining sub-questions when there are some, otherwise the raw query
-        questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[]  else [state["query"]]
-        # sources_input = state["sources_input"]
-        # Sources are hard-coded to "auto", so auto_mode is always True here
-        sources_input = ["auto"]
-        auto_mode = "auto" in sources_input
-        # There are several options to get the final top k
-        # Option 1 - Get 100 documents by question and rerank by question
-        # Option 2 - Get 100/n documents by question and rerank the total
-        if rerank_by_question:
-            k_by_question = divide_into_parts(k_final,len(questions))
-        docs = []
-        for i,q in enumerate(questions):
-            # Questions may be dicts with a "question" key or plain strings
-            question = q["question"] if isinstance(q, dict) else q
-            print(f"Subquestion {i}: {question}")
-            # If auto mode, we use all sources
-            if auto_mode:
-                sources = POSSIBLE_SOURCES
-            # Otherwise, we use the config
-            else:
-                sources = sources_input
-            if any([x in POSSIBLE_SOURCES for x in sources]):
-                # Drop any requested source that is not a graph source
-                sources = [x for x in sources if x in POSSIBLE_SOURCES]
-                # Search the document store using the retriever
-                docs_question = await retrieve_graphs(
-                    query = question,
-                    vectorstore = vectorstore,
-                    sources = sources,
-                    k_total = k_before_reranking,
-                    threshold = 0.5,
-                    )
-                # docs_question = retriever.get_relevant_documents(question)
-                # Rerank (stdout/stderr are silenced while the reranker runs)
-                if reranker is not None and docs_question!=[]:
-                    with suppress_output():
-                        docs_question = rerank_docs(reranker,docs_question,question)
-                else:
-                    # Add a default reranking score
-                    for doc in docs_question:
-                        doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-                # If rerank by question we select the top documents for each question
-                if rerank_by_question:
-                    docs_question = docs_question[:k_by_question[i]]
-                # Add sources used in the metadata
-                for doc in docs_question:
-                    doc.metadata["sources_used"] = sources
-                print(f"{len(docs_question)} graphs retrieved for subquestion {i + 1}: {docs_question}")
-                docs.extend(docs_question)
-            else:
-                print(f"There are no graphs which match the sources filtered on. Sources filtered on: {sources}. Sources available: {POSSIBLE_SOURCES}.")
-            # NOTE(review): the three steps below run once per question (they sit inside the loop),
-            # so the running list is deduplicated, sorted and cut to k_final after every question
-            # Remove duplicates and keep the duplicate document with the highest reranking score
-            docs = remove_duplicates_keep_highest_score(docs)
-            # Sorting the list in descending order by rerank_score
-            # Then select the top k
-            docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
-            docs = docs[:k_final]
-        return {"recommended_content": docs}
-    return node_retrieve_graphs

climateqa/engine/chains/intent_categorization.py DELETED Viewed

@@ -1,90 +0,0 @@
-from langchain_core.pydantic_v1 import BaseModel, Field
-from typing import List
-from typing import Literal
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.utils.function_calling import convert_to_openai_function
-from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# Schema converted with convert_to_openai_function in make_intent_categorization_chain.
-# NOTE(review): the docstring and field descriptions presumably end up in the function
-# definition sent to the model - treat them as prompt text, not as comments.
-class IntentCategorizer(BaseModel):
-    """Analyzing the user message input"""
-    # Language of the message written out in full; "English" is the declared default
-    language: str = Field(
-        description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
-        default="English",
-    )
-    # Category of the message; the geo_info and esg categories are currently disabled
-    intent: str = Field(
-        enum=[
-            "ai_impact",
-            # "geo_info",
-            # "esg",
-            "search",
-            "chitchat",
-        ],
-        description="""
-            Categorize the user input in one of the following category
-            Any question
-            Examples:
-            - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
-            - search = Searching for any quesiton about climate change, energy, biodiversity, nature, and everything we can find the IPCC or IPBES reports or scientific papers,
-            - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
-        """,
-            # Disabled category descriptions, kept for reference:
-            # - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
-            # - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
-    )
-def make_intent_categorization_chain(llm):
-    """Build the intent chain: chat prompt -> llm forced to call IntentCategorizer -> JSON function parser."""
-    function_schema = convert_to_openai_function(IntentCategorizer)
-    bound_llm = llm.bind(
-        functions = [function_schema],
-        function_call={"name":"IntentCategorizer"},
-    )
-    messages = [
-        ("system", "You are a helpful assistant, you will analyze, translate and categorize the user input message using the function provided. Categorize the user input as ai ONLY if it is related to Artificial Intelligence, search if it is related to the environment, climate change, energy, biodiversity, nature, etc. and chitchat if it is just general conversation."),
-        ("user", "input: {input}"),
-    ]
-    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
-def make_intent_categorization_node(llm):
-    """Return a node that categorizes state["user_input"] and returns the chain output as the state update."""
-    chain = make_intent_categorization_chain(llm)
-    def categorize_message(state):
-        """Run the chain, default the language to "English" and add the raw input as "query"."""
-        print("---- Categorize_message ----")
-        user_input = state["user_input"]
-        output = chain.invoke({"input": user_input})
-        print(f"\n\nOutput intent categorization: {output}\n")
-        if "language" not in output:
-            output["language"] = "English"
-        output["query"] = state["user_input"]
-        return output
-    return categorize_message
-# SAMPLE_QUESTIONS = [
-#     "Est-ce que l'IA a un impact sur l'environnement ?",
-#     "Que dit le GIEC sur l'impact de l'IA",
-#     "Qui sont les membres du GIEC",
-#     "What is the impact of El Nino ?",
-#     "Yo",
-#     "Hello ça va bien ?",
-#     "Par qui as tu été créé ?",
-#     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
-#     "Which industries have the highest GHG emissions?",
-#     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
-#     "Are human activities causing global warming?",
-#     "What is the motivation behind mining the deep seabed?",
-#     "Tu peux m'écrire un poème sur le changement climatique ?",
-#     "Tu peux m'écrire un poème sur les bonbons ?",
-#     "What will be the temperature in 2100 in Strasbourg?",
-#     "C'est quoi le lien entre biodiversity and changement climatique ?",
-# ]

climateqa/engine/chains/keywords_extraction.py DELETED Viewed

@@ -1,40 +0,0 @@
-from langchain_core.pydantic_v1 import BaseModel, Field
-from typing import List
-from typing import Literal
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.utils.function_calling import convert_to_openai_function
-from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# Schema converted with convert_to_openai_function in make_keywords_extraction_chain.
-# NOTE(review): the docstring and field description presumably end up in the function
-# definition sent to the model - treat them as prompt text, not as comments.
-class KeywordExtraction(BaseModel):
-    """
-    Analyzing the user query to extract keywords to feed a search engine
-    """
-    # List of search keywords; the description asks the model for at most 3 general keywords
-    keywords: List[str] = Field(
-        description="""
-        Extract the keywords from the user query to feed a search engine as a list
-        Avoid adding super specific keywords to prefer general keywords
-        Maximum 3 keywords
-        Examples:
-        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
-        - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
-        - "Is climate change a hoax" -> ["climate change","hoax"]
-        """
-    )
-def make_keywords_extraction_chain(llm):
-    """Build the keyword chain: chat prompt -> llm forced to call KeywordExtraction -> JSON function parser."""
-    function_schema = convert_to_openai_function(KeywordExtraction)
-    bound_llm = llm.bind(
-        functions = [function_schema],
-        function_call={"name":"KeywordExtraction"},
-    )
-    messages = [
-        ("system", "You are a helpful assistant"),
-        ("user", "input: {input}"),
-    ]
-    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()

climateqa/engine/chains/query_transformation.py DELETED Viewed

@@ -1,201 +0,0 @@
-from langchain_core.pydantic_v1 import BaseModel, Field
-from typing import List
-from typing import Literal
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.utils.function_calling import convert_to_openai_function
-from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# Maps each index name to the sources it serves; transform_query emits one
-# question entry per index that shares at least one source with the question
-ROUTING_INDEX = {
-    "Vector":["IPCC","IPBES","IPOS"],
-    "OpenAlex":["OpenAlex"],
-}
-# Flat list of every source name declared in ROUTING_INDEX
-POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
-# Prompt from the original paper https://arxiv.org/pdf/2305.14283
-# Query Rewriting for Retrieval-Augmented Large Language Models
-# Schema converted with convert_to_openai_function in make_query_decomposition_chain.
-# NOTE(review): the docstring and field description presumably end up in the function
-# definition sent to the model - treat them as prompt text, not as comments.
-class QueryDecomposition(BaseModel):
-    """
-    Decompose the user query into smaller parts to think step by step to answer this question
-    Act as a simple planning agent
-    """
-    # Sub-questions; transform_query iterates over this list as output["questions"]
-    questions: List[str] = Field(
-        description="""
-        Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
-        Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
-        - If it's already a standalone and explicit question, just return the reformulated question for the search engine
-        - If you need to decompose the question, output a list of maximum 2 to 3 questions
-    """
-    )
-# Country / place pair. In the code visible here it is only referenced by the
-# commented-out `location` field of QueryAnalysis.
-class Location(BaseModel):
-    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
-    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
-# Schema converted with convert_to_openai_function in make_query_rewriter_chain.
-# NOTE(review): only `sources` is active. The other outputs the docstring mentions
-# (topics, date, query expansion, keywords) are commented out below, yet the docstring
-# is presumably still sent to the model as the function description.
-class QueryAnalysis(BaseModel):
-    """
-    Analyzing the user query to extract topics, sources and date
-    Also do query expansion to get alternative search queries
-    Also provide simple keywords to feed a search engine
-    """
-    # Disabled field: keywords
-    # keywords: List[str] = Field(
-    #     description="""
-    #     Extract the keywords from the user query to feed a search engine as a list
-    #     Maximum 3 keywords
-    #     Examples:
-    #     - "What is the impact of deep sea mining ?" -> deep sea mining
-    #     - "How will El Nino be impacted by climate change" -> el nino;climate change
-    #     - "Is climate change a hoax" -> climate change;hoax
-    #     """
-    # )
-    # Disabled field: alternative_queries
-    # alternative_queries: List[str] = Field(
-    #     description="""
-    #     Generate alternative search questions from the user query to feed a search engine
-    #     """
-    # )
-    # Disabled field: step_back_question
-    # step_back_question: str = Field(
-    #     description="""
-    #     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
-    #     This questions should help you get more context and information about the user query
-    #     """
-    # )
-    # Active field: sources to search for this question (OpenAlex is disabled in the Literal)
-    sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field( #,"OpenAlex"]] = Field(
-        ...,
-        description="""
-            Given a user question choose which documents would be most relevant for answering their question,
-            - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
-            - IPBES is for questions about biodiversity and nature
-            - IPOS is for questions about the ocean and deep sea mining
-        """,
-            # - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
-    )
-    # Disabled field: topics
-    # topics: List[Literal[
-    #     "Climate change",
-    #     "Biodiversity",
-    #     "Energy",
-    #     "Decarbonization",
-    #     "Climate science",
-    #     "Nature",
-    #     "Climate policy and justice",
-    #     "Oceans",
-    #     "Deep sea mining",
-    #     "ESG and regulations",
-    #     "CSRD",
-    # ]] = Field(
-    #     ...,
-    #     description = """
-    #         Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
-    #     """,
-    # )
-    # Disabled fields: date and location
-    # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
-    # location:Location
-def make_query_decomposition_chain(llm):
-    """Build the decomposition chain: chat prompt -> llm forced to call QueryDecomposition -> JSON function parser."""
-    function_schema = convert_to_openai_function(QueryDecomposition)
-    bound_llm = llm.bind(
-        functions = [function_schema],
-        function_call={"name":"QueryDecomposition"},
-    )
-    messages = [
-        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-        ("user", "input: {input}"),
-    ]
-    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
-def make_query_rewriter_chain(llm):
-    """Build the query analysis chain: chat prompt -> llm forced to call QueryAnalysis -> JSON function parser."""
-    function_schema = convert_to_openai_function(QueryAnalysis)
-    bound_llm = llm.bind(
-        functions = [function_schema],
-        function_call={"name":"QueryAnalysis"},
-    )
-    messages = [
-        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-        ("user", "input: {input}"),
-    ]
-    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
-def make_query_transform_node(llm,k_final=15):
-    """Build the node that decomposes the user query and routes each sub-question to sources.
-    Args:
-        llm: chat model given to the decomposition and query analysis chains.
-        k_final: not used by this node; kept so existing callers keep working.
-    Returns:
-        The `transform_query(state)` function. It reads state["query"] plus the optional
-        state["sources_auto"] and state["sources_input"], and returns
-        {"remaining_questions": [{"question", "sources", "index"}, ...], "n_questions": int}.
-    """
-    decomposition_chain = make_query_decomposition_chain(llm)
-    rewriter_chain = make_query_rewriter_chain(llm)
-    # Sources the query analysis is allowed to return
-    valid_sources = ROUTING_INDEX["Vector"]
-    def transform_query(state):
-        print("---- Transform query ----")
-        # Auto mode is on unless the flag is missing, None or False
-        sources_auto = state["sources_auto"] if "sources_auto" in state else None
-        auto_mode = not (sources_auto is None or sources_auto is False)
-        sources_input = state.get("sources_input")
-        if sources_input is None: sources_input = ROUTING_INDEX["Vector"]
-        # Decomposition
-        decomposition_output = decomposition_chain.invoke({"input":state["query"]})
-        # Query Analysis
-        questions = []
-        for question in decomposition_output["questions"]:
-            analysis_output = rewriter_chain.invoke({"input":question})
-            # Fall back to every vector source when the llm returns no source or an unknown one.
-            # The earlier check compared against the misspelled "IPBS", so every answer that
-            # selected "IPBES" was discarded and replaced by the full list.
-            selected = analysis_output.get("sources")
-            if not selected or not all(source in valid_sources for source in selected):
-                analysis_output["sources"] = list(valid_sources)
-            questions.append({"question":question, **analysis_output})
-        # Explode the questions into one entry per index serving at least one selected source
-        new_questions = []
-        for q in questions:
-            question,sources = q["question"],q["sources"]
-            # If not auto mode we take the configuration
-            if not auto_mode:
-                sources = sources_input
-            for index,index_sources in ROUTING_INDEX.items():
-                # Follow ROUTING_INDEX order so the result is deterministic (a set intersection is not)
-                selected_sources = [source for source in index_sources if source in sources]
-                if selected_sources:
-                    new_questions.append({"question":question,"sources":selected_sources,"index":index})
-        return {
-            "remaining_questions":new_questions,
-            "n_questions":len(new_questions),
-        }
-    return transform_query

climateqa/engine/chains/retrieve_documents.py DELETED Viewed

@@ -1,310 +0,0 @@
-import sys
-import os
-from contextlib import contextmanager
-from langchain_core.tools import tool
-from langchain_core.runnables import chain
-from langchain_core.runnables import RunnableParallel, RunnablePassthrough
-from langchain_core.runnables import RunnableLambda
-from ..reranker import rerank_docs
-# from ...knowledge.retriever import ClimateQARetriever
-from ...knowledge.openalex import OpenAlexRetriever
-from .keywords_extraction import make_keywords_extraction_chain
-from ..utils import log_event
-from langchain_core.vectorstores import VectorStore
-from typing import List
-from langchain_core.documents.base import Document
-def divide_into_parts(target, parts):
-    """Split `target` into `parts` integers whose sum is `target`.
-    The first `target % parts` entries get one extra unit,
-    e.g. divide_into_parts(10, 3) -> [4, 3, 3].
-    """
-    quotient, extra = divmod(target, parts)
-    return [quotient + 1 if position < extra else quotient for position in range(parts)]
-@contextmanager
-def suppress_output():
-    """Context manager that points sys.stdout and sys.stderr at os.devnull inside the block."""
-    saved_streams = sys.stdout, sys.stderr
-    with open(os.devnull, 'w') as sink:
-        sys.stdout = sys.stderr = sink
-        try:
-            yield
-        finally:
-            # Put the original streams back even when the body raised
-            sys.stdout, sys.stderr = saved_streams
-# Placeholder tool: returns its argument unchanged.
-# NOTE(review): the docstring is presumably used by @tool as the tool description - keep it in sync if edited.
-@tool
-def query_retriever(question):
-    """Just a dummy tool to simulate the retriever query"""
-    return question
-def _add_sources_used_in_metadata(docs,sources,question,index):
-    """Record the sources, question and index used for retrieval in every doc's metadata.
-    The documents are mutated in place and the same list is returned.
-    """
-    provenance = {
-        "sources_used": sources,
-        "question_used": question,
-        "index_used": index,
-    }
-    for document in docs:
-        document.metadata.update(provenance)
-    return docs
-def _get_k_summary_by_question(n_questions):
-    """Return how many summary documents to fetch per question: 0, 5, 3, 2 for 0-3 questions, else 1."""
-    for count, k_summary in ((0, 0), (1, 5), (2, 3), (3, 2)):
-        if n_questions == count:
-            return k_summary
-    return 1
-def _get_k_images_by_question(n_questions):
-    """Return how many image documents to fetch per question: 0, 7, 5, 3 for 0-3 questions, else 1."""
-    for count, k_images in ((0, 0), (1, 7), (2, 5), (3, 3)):
-        if n_questions == count:
-            return k_images
-    return 1
-def _add_metadata_and_score(docs: List) -> List[Document]:
-    """Copy the similarity score and normalized page info into each retrieved document.
-    Args:
-        docs: (document, score) pairs, the shape this module gets from
-            `vectorstore.similarity_search_with_score`.
-    Returns:
-        The same document objects, mutated in place, in input order.
-    """
-    docs_with_metadata = []
-    for doc, score in docs:
-        # Flatten Windows line breaks so the content reads as a single paragraph
-        doc.page_content = doc.page_content.replace("\r\n"," ")
-        doc.metadata["similarity_score"] = score
-        doc.metadata["content"] = doc.page_content
-        # Stored page numbers are shifted by +1; "N/A" falls back to page 1
-        # NOTE(review): presumably the stored value is 0-based - confirm against the index
-        if doc.metadata["page_number"] != "N/A":
-            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
-        else:
-            doc.metadata["page_number"] = 1
-        docs_with_metadata.append(doc)
-    return docs_with_metadata
-async def get_IPCC_relevant_documents(
-    query: str,
-    vectorstore:VectorStore,
-    sources:list = ["IPCC","IPBES","IPOS"],
-    search_figures:bool = False,
-    reports:list = [],
-    threshold:float = 0.6,
-    k_summary:int = 3,
-    k_total:int = 10,
-    k_images: int = 5,
-    namespace:str = "vectors",
-    min_size:int = 200,
-    search_only:bool = False,
-) :
-    """Search the vector store for summary chunks, full-report chunks and images.
-    Args:
-        query: text used for every similarity search.
-        vectorstore: store queried through `similarity_search_with_score`.
-        sources: non-empty list drawn from IPCC / IPBES / IPOS; used as filter when `reports` is empty.
-        search_figures: when True, also search chunks whose chunk_type is "image".
-        reports: when not empty, filter on these `short_name` values instead of `sources`.
-        threshold: minimum score kept for summary hits (not applied to full-report or image hits).
-        k_summary: number of summary ("SPM") chunks requested.
-        k_total: total number of text chunks targeted; must be greater than `k_summary`.
-        k_images: number of image chunks requested.
-        namespace: not used in this function body.
-        min_size: text chunks whose content is not longer than this are dropped (not applied to images).
-        search_only: when True, skip the text searches and only search images (if `search_figures`).
-    Returns:
-        dict with the lists "docs_summaries", "docs_full" and "docs_images".
-    Raises:
-        AssertionError: when `sources` is not a non-empty list of known sources, or k_total <= k_summary.
-    """
-    # The list defaults above are only read here, never mutated
-    # Check that sources is a non-empty list containing only IPCC, IPBES or IPOS
-    assert isinstance(sources,list)
-    assert sources
-    assert all([x in ["IPCC","IPBES","IPOS"] for x in sources])
-    assert k_total > k_summary, "k_total should be greater than k_summary"
-    # Prepare base search kwargs: an explicit report list takes precedence over sources
-    filters = {}
-    if len(reports) > 0:
-        filters["short_name"] = {"$in":reports}
-    else:
-        filters["source"] = { "$in": sources}
-    # INIT
-    docs_summaries = []
-    docs_full = []
-    docs_images = []
-    if search_only:
-        # Only search for images if search_only is True
-        if search_figures:
-            filters_image = {
-                **filters,
-                "chunk_type":"image"
-            }
-            docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
-            docs_images = _add_metadata_and_score(docs_images)
-    else:
-        # Regular search flow for text and optionally images
-        # Search for k_summary documents in the summaries dataset
-        filters_summaries = {
-            **filters,
-            "chunk_type":"text",
-            "report_type": { "$in":["SPM"]},
-        }
-        docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
-        # Items are (document, score) pairs here; keep the summaries scoring above the threshold
-        docs_summaries = [x for x in docs_summaries if x[1] > threshold]
-        # Search the full reports for the remainder: k_total minus the summaries that were kept
-        filters_full = {
-            **filters,
-            "chunk_type":"text",
-            "report_type": { "$nin":["SPM"]},
-        }
-        k_full = k_total - len(docs_summaries)
-        docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
-        if search_figures:
-            # Images
-            filters_image = {
-                **filters,
-                "chunk_type":"image"
-            }
-            docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
-        # Turn the (document, score) pairs into documents carrying the score in their metadata
-        docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
-        # Drop text chunks whose content is not longer than min_size
-        docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
-        docs_full = [x for x in docs_full if len(x.page_content) > min_size]
-    return {
-        "docs_summaries" : docs_summaries,
-        "docs_full" : docs_full,
-        "docs_images" : docs_images,
-    }
-# The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
-# @chain
-async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5, k_images=5):
-    """
-    Retrieve and rerank documents for the first remaining question in the state.
-    Args:
-        state (dict): The current state containing documents, related content, relevant content sources, remaining questions and n_questions.
-        config (dict): Configuration passed to `log_event`.
-        vectorstore (object): The vector store used to retrieve relevant documents.
-        reranker (object): The reranker used to rerank the retrieved documents. When None, the similarity score is used as reranking score.
-        llm (object): Not used by this function; kept for interface compatibility.
-        rerank_by_question (bool, optional): Whether to sort documents by reranking score for the question. Defaults to True.
-        k_final (int, optional): The final number of documents to retrieve, shared between the questions. Defaults to 15.
-        k_before_reranking (int, optional): The number of documents to retrieve before reranking. Defaults to 100.
-        k_summary (int, optional): Not used; the number of summaries is derived from state["n_questions"]. Defaults to 5.
-        k_images (int, optional): The maximum number of image documents kept for the question. Defaults to 5.
-    Returns:
-        dict: State update with the keys "documents", "related_contents" and "remaining_questions".
-    Raises:
-        ValueError: If the current question targets an index other than "Vector".
-    """
-    print("---- Retrieve documents ----")
-    # Documents accumulated by the previous calls (one call per sub-question)
-    if "documents" in state and state["documents"] is not None:
-        docs = state["documents"]
-    else:
-        docs = []
-    # Related content accumulated by the previous calls.
-    # NOTE(review): this function returns the key "related_contents" but used to read
-    # "related_content", so earlier images were never found again. Both spellings are
-    # accepted here - confirm which key the graph state declares.
-    related_content = []
-    for key in ("related_contents", "related_content"):
-        if key in state and state[key] is not None:
-            related_content = state[key]
-            break
-    search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources"]
-    search_only = state["search_only"]
-    reports = state["reports"]
-    # Get the current question
-    current_question = state["remaining_questions"][0]
-    remaining_questions = state["remaining_questions"][1:]
-    k_by_question = k_final // state["n_questions"]
-    k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
-    k_images_by_question = _get_k_images_by_question(state["n_questions"])
-    sources = current_question["sources"]
-    question = current_question["question"]
-    index = current_question["index"]
-    print(f"Retrieve documents for question: {question}")
-    await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
-    # Only the vector index is implemented; fail with a clear message instead of an unbound variable
-    if index != "Vector":
-        raise ValueError(f"Unsupported retrieval index: {index!r}")
-    docs_question_dict = await get_IPCC_relevant_documents(
-        query  = question,
-        vectorstore=vectorstore,
-        search_figures = search_figures,
-        sources = sources,
-        min_size = 200,
-        k_summary = k_summary_by_question,
-        k_total = k_before_reranking,
-        k_images = k_images_by_question,
-        threshold = 0.5,
-        search_only = search_only,
-        reports = reports,
-    )
-    docs_question_summary_reranked = docs_question_dict["docs_summaries"]
-    docs_question_fulltext_reranked = docs_question_dict["docs_full"]
-    docs_question_images_reranked = docs_question_dict["docs_images"]
-    def by_reranking_score(doc):
-        return doc.metadata["reranking_score"]
-    # Rerank
-    if reranker is not None:
-        with suppress_output():
-            docs_question_summary_reranked = rerank_docs(reranker,docs_question_summary_reranked,question)
-            docs_question_fulltext_reranked = rerank_docs(reranker,docs_question_fulltext_reranked,question)
-            docs_question_images_reranked = rerank_docs(reranker,docs_question_images_reranked,question)
-            if rerank_by_question:
-                docs_question_summary_reranked = sorted(docs_question_summary_reranked, key=by_reranking_score, reverse=True)
-                docs_question_fulltext_reranked = sorted(docs_question_fulltext_reranked, key=by_reranking_score, reverse=True)
-                docs_question_images_reranked = sorted(docs_question_images_reranked, key=by_reranking_score, reverse=True)
-    else:
-        # No reranker: keep the retrieval order and add a default reranking score
-        # (this branch used to leave the *_reranked names undefined and crashed below)
-        for doc in docs_question_summary_reranked + docs_question_fulltext_reranked + docs_question_images_reranked:
-            doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-    # Summaries come first, then full-text chunks, cut to this question's share of k_final
-    docs_question = docs_question_summary_reranked + docs_question_fulltext_reranked
-    docs_question = docs_question[:k_by_question]
-    images_question = docs_question_images_reranked[:k_images]
-    if reranker is not None and rerank_by_question:
-        docs_question = sorted(docs_question, key=by_reranking_score, reverse=True)
-    # Add sources used in the metadata
-    docs_question = _add_sources_used_in_metadata(docs_question,sources,question,index)
-    images_question = _add_sources_used_in_metadata(images_question,sources,question,index)
-    # Add to the list of docs
-    docs.extend(docs_question)
-    related_content.extend(images_question)
-    new_state = {"documents":docs, "related_contents": related_content,"remaining_questions":remaining_questions}
-    return new_state
-def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-    """Wrap `retrieve_documents` into a @chain runnable bound to the given vectorstore, reranker and settings."""
-    @chain
-    async def retrieve_docs(state, config):
-        """Run one retrieval step and return the state update produced by `retrieve_documents`."""
-        return await retrieve_documents(
-            state, config, vectorstore, reranker, llm,
-            rerank_by_question, k_final, k_before_reranking, k_summary,
-        )
-    return retrieve_docs

climateqa/engine/chains/retrieve_papers.py DELETED Viewed

@@ -1,95 +0,0 @@
-from climateqa.engine.keywords import make_keywords_chain
-from climateqa.engine.llm import get_llm
-from climateqa.knowledge.openalex import OpenAlex
-from climateqa.engine.chains.answer_rag import make_rag_papers_chain
-from front.utils import make_html_papers
-from climateqa.engine.reranker import get_reranker
-# Module-level objects created once at import time and used by the functions below.
-oa = OpenAlex()
-llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
-reranker = get_reranker("nano")
-# Column name -> width mapping; split into two parallel lists right after.
-papers_cols_widths = {
-    "id":100,
-    "title":300,
-    "doi":100,
-    "publication_year":100,
-    "abstract":500,
-    "is_oa":50,
-}
-papers_cols = list(papers_cols_widths.keys())
-# NOTE: rebinds papers_cols_widths from the dict above to a plain list of its values.
-papers_cols_widths = list(papers_cols_widths.values())
-def generate_keywords(query):
-    """Turn a user query into a single " AND "-joined keyword string."""
-    # The keywords chain is built on the module-level llm for each call.
-    extracted = make_keywords_chain(llm).invoke(query)
-    return " AND ".join(extracted["keywords"])
-async def find_papers(query,after, relevant_content_sources, reranker= reranker):
-    """Stream OpenAlex search results and an LLM summary for ``query``.
-
-    Async generator yielding ``(docs_html, network_html, summary)`` tuples: once
-    with an empty summary as soon as the papers are rendered, then once per
-    streamed answer token with the summary accumulated so far. A single
-    ``("", "", "")`` tuple is yielded when "Papers (OpenAlex)" is not selected or
-    when no paper with an abstract is found, so consumers always get an update.
-
-    Args:
-        query: question used for keyword generation, reranking and the summary.
-        after: forwarded to ``oa.search`` as ``after``.
-        relevant_content_sources: selected content sources; the search only runs
-            when it contains "Papers (OpenAlex)".
-        reranker: reranker forwarded to ``oa.rerank``.
-    """
-    if "Papers (OpenAlex)" not in relevant_content_sources:
-        yield "", "", ""
-        return
-    summary = ""
-    keywords = generate_keywords(query)
-    df_works = oa.search(keywords,after = after)
-    print(f"Found {len(df_works)} papers")
-    if not df_works.empty:
-        # Papers without an abstract are dropped before reranking.
-        df_works = df_works.dropna(subset=["abstract"])
-        df_works = df_works[df_works["abstract"] != ""].reset_index(drop = True)
-    # Checked after filtering too: every hit may have lacked an abstract.
-    if df_works.empty:
-        print("No papers found")
-        yield "", "", ""
-        return
-    df_works = oa.rerank(query,df_works,reranker)
-    df_works = df_works.sort_values("rerank_score",ascending=False)
-    # Render at most 10 cards, and never more than the papers actually available.
-    n_cards = min(10, len(df_works))
-    docs_html = "".join(make_html_papers(df_works, i) for i in range(n_cards))
-    G = oa.make_network(df_works)
-    height = "750px"
-    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
-    network_html = network.generate_html()
-    # The page is embedded in a single-quoted srcdoc attribute below, so single quotes are replaced.
-    network_html = network_html.replace("'", "\"")
-    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-    network_html = network_html + css_to_inject
-    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-            display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-            allow-scripts allow-same-origin allow-popups
-            allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-            allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-    docs = df_works["content"].head(10).tolist()
-    yield docs_html, network_html, summary
-    chain = make_rag_papers_chain(llm)
-    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
-    path_answer = "/logs/StrOutputParser/streamed_output/-"
-    async for op in result:
-        op = op.ops[0]
-        # Only operations on the answer path carry summary tokens.
-        if op['path'] != path_answer:
-            continue
-        summary += op['value'] # newly streamed token (str)
-        yield docs_html, network_html, summary

climateqa/engine/chains/retriever.py DELETED Viewed

@@ -1,126 +0,0 @@
-# import sys
-# import os
-# from contextlib import contextmanager
-# from ..reranker import rerank_docs
-# from ...knowledge.retriever import ClimateQARetriever
-# def divide_into_parts(target, parts):
-#     # Base value for each part
-#     base = target // parts
-#     # Remainder to distribute
-#     remainder = target % parts
-#     # List to hold the result
-#     result = []
-#     for i in range(parts):
-#         if i < remainder:
-#             # These parts get base value + 1
-#             result.append(base + 1)
-#         else:
-#             # The rest get the base value
-#             result.append(base)
-#     return result
-# @contextmanager
-# def suppress_output():
-#     # Open a null device
-#     with open(os.devnull, 'w') as devnull:
-#         # Store the original stdout and stderr
-#         old_stdout = sys.stdout
-#         old_stderr = sys.stderr
-#         # Redirect stdout and stderr to the null device
-#         sys.stdout = devnull
-#         sys.stderr = devnull
-#         try:
-#             yield
-#         finally:
-#             # Restore stdout and stderr
-#             sys.stdout = old_stdout
-#             sys.stderr = old_stderr
-# def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-#     def retrieve_documents(state):
-#         POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS"] # ,"OpenAlex"]
-#         questions = state["questions"]
-#         # Use sources from the user input or from the LLM detection
-#         if "sources_input" not in state or state["sources_input"] is None:
-#             sources_input = ["auto"]
-#         else:
-#             sources_input = state["sources_input"]
-#         auto_mode = "auto" in sources_input
-#         # There are several options to get the final top k
-#         # Option 1 - Get 100 documents by question and rerank by question
-#         # Option 2 - Get 100/n documents by question and rerank the total
-#         if rerank_by_question:
-#             k_by_question = divide_into_parts(k_final,len(questions))
-#         docs = []
-#         for i,q in enumerate(questions):
-#             sources = q["sources"]
-#             question = q["question"]
-#             # If auto mode, we use the sources detected by the LLM
-#             if auto_mode:
-#                 sources = [x for x in sources if x in POSSIBLE_SOURCES]
-#             # Otherwise, we use the config
-#             else:
-#                 sources = sources_input
-#             # Search the document store using the retriever
-#             # Configure high top k for further reranking step
-#             retriever = ClimateQARetriever(
-#                 vectorstore=vectorstore,
-#                 sources = sources,
-#                 # reports = ias_reports,
-#                 min_size = 200,
-#                 k_summary = k_summary,
-#                 k_total = k_before_reranking,
-#                 threshold = 0.5,
-#             )
-#             docs_question = retriever.get_relevant_documents(question)
-#             # Rerank
-#             if reranker is not None:
-#                 with suppress_output():
-#                     docs_question = rerank_docs(reranker,docs_question,question)
-#             else:
-#                 # Add a default reranking score
-#                 for doc in docs_question:
-#                     doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-#             # If rerank by question we select the top documents for each question
-#             if rerank_by_question:
-#                 docs_question = docs_question[:k_by_question[i]]
-#             # Add sources used in the metadata
-#             for doc in docs_question:
-#                 doc.metadata["sources_used"] = sources
-#             # Add to the list of docs
-#             docs.extend(docs_question)
-#         # Sorting the list in descending order by rerank_score
-#         # Then select the top k
-#         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
-#         docs = docs[:k_final]
-#         new_state = {"documents":docs}
-#         return new_state
-#     return retrieve_documents

climateqa/engine/chains/sample_router.py DELETED Viewed

@@ -1,66 +0,0 @@
-# from typing import List
-# from typing import Literal
-# from langchain.prompts import ChatPromptTemplate
-# from langchain_core.utils.function_calling import convert_to_openai_function
-# from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
-# class Location(BaseModel):
-#     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
-#     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
-# class QueryAnalysis(BaseModel):
-#     """Analyzing the user query"""
-#     language: str = Field(
-#         description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
-#     )
-#     intent: str = Field(
-#         enum=[
-#             "Environmental impacts of AI",
-#             "Geolocated info about climate change",
-#             "Climate change",
-#             "Biodiversity",
-#             "Deep sea mining",
-#             "Chitchat",
-#         ],
-#         description="""
-#             Categorize the user query in one of the following category,
-#             Examples:
-#             - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
-#             - Climate change: "What is radiative forcing", "How much will
-#         """,
-#     )
-#     sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
-#         ...,
-#         description="""
-#             Given a user question choose which documents would be most relevant for answering their question,
-#             - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
-#             - IPBES is for questions about biodiversity and nature
-#             - IPOS is for questions about the ocean and deep sea mining
-#         """,
-#     )
-#     date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
-#     location:Location
-#     # query: str = Field(
-#     #     description = """
-#     #         Translate to english and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
-#     #         The reformulated question will used in a search engine
-#     #         By default, assume that the user is asking information about the last century,
-#     #         Use the following examples
-#     #         ### Examples:
-#     #         La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
-#     #         what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
-#     #         what are the main causes of climate change? -> What are the main causes of climate change in the last century?
-#     #         Question in English:
-#     #     """
-#     # )
-# openai_functions = [convert_to_openai_function(QueryAnalysis)]
-# llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})

climateqa/engine/chains/set_defaults.py DELETED Viewed

@@ -1,13 +0,0 @@
-def set_defaults(state):
-    """Fill in defaults for missing or empty entries of the graph state.
-
-    ``audience`` falls back to "experts" when it is absent, ``None`` or empty.
-    ``sources_input`` falls back to ``["auto"]`` only when the key is absent; an
-    explicitly provided value is kept as is. The state is updated in place and
-    returned.
-    """
-    print("---- Setting defaults ----")
-    # .get() tolerates a state without "audience"; None and "" are both falsy,
-    # so a single truthiness test covers every "not set" case.
-    if not state.get("audience"):
-        state["audience"] = "experts"
-    state.setdefault("sources_input", ["auto"])
-    return state

climateqa/engine/chains/translation.py DELETED Viewed

@@ -1,42 +0,0 @@
-from langchain_core.pydantic_v1 import BaseModel, Field
-from typing import List
-from typing import Literal
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.utils.function_calling import convert_to_openai_function
-from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# Schema passed to convert_to_openai_function in make_translation_chain.
-# NOTE(review): the class docstring and the Field description are presumably sent to
-# the model as the function/parameter descriptions, so rewording them would change
-# the prompt — confirm against the LangChain docs before editing either.
-class Translation(BaseModel):
-    """Analyzing the user message input"""
-    translation: str = Field(
-        description="Translate the message input to English",
-    )
-def make_translation_chain(llm):
-    """Compose prompt -> function-calling LLM -> JSON parser for English translation.
-
-    The LLM is bound to the single ``Translation`` function and forced to call it.
-    """
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
-        ("user", "input: {input}")
-    ])
-    translation_function = convert_to_openai_function(Translation)
-    bound_llm = llm.bind(functions = [translation_function],function_call={"name":"Translation"})
-    return prompt | bound_llm | JsonOutputFunctionsParser()
-def make_translation_node(llm):
-    """Create the graph node that stores the English translation of ``user_input`` under ``query``."""
-    chain = make_translation_chain(llm)
-    def translate_query(state):
-        print("---- Translate query ----")
-        result = chain.invoke({"input": state["user_input"]})
-        return {"query": result["translation"]}
-    return translate_query

climateqa/engine/embeddings.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
-def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
     if version == "v1.2":
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this
         # Best embedding model at a reasonable size at the moment (2023-11-22)
         model_name = "BAAI/bge-base-en-v1.5"
-        encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
-            query_instruction=query_instruction,
         )
     else:
@@ -23,6 +23,3 @@ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
     return embeddings_function

 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
+def get_embeddings_function(version = "v1.2"):
     if version == "v1.2":
         # Best embedding model at a reasonable size at the moment (2023-11-22)
         model_name = "BAAI/bge-base-en-v1.5"
+        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
+            query_instruction="Represent this sentence for searching relevant passages: "
         )
     else:
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
     return embeddings_function

climateqa/engine/graph.py DELETED Viewed

@@ -1,192 +0,0 @@
-import sys
-import os
-from contextlib import contextmanager
-from langchain.schema import Document
-from langgraph.graph import END, StateGraph
-from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
-from typing_extensions import TypedDict
-from typing import List, Dict
-from IPython.display import display, HTML, Image
-from .chains.answer_chitchat import make_chitchat_node
-from .chains.answer_ai_impact import make_ai_impact_node
-from .chains.query_transformation import make_query_transform_node
-from .chains.translation import make_translation_node
-from .chains.intent_categorization import make_intent_categorization_node
-from .chains.retrieve_documents import make_retriever_node
-from .chains.answer_rag import make_rag_node
-from .chains.graph_retriever import make_graph_retriever_node
-from .chains.chitchat_categorization import make_chitchat_intent_categorization_node
-# from .chains.set_defaults import set_defaults
-class GraphState(TypedDict):
-    """
-    Represents the state of our graph.
-    """
-    # NOTE(review): plain TypedDict semantics do not fill in the "= ..." values below
-    # when a state dict is created; they read as documentation of the intended
-    # defaults — confirm where the real defaults are set.
-    user_input : str
-    language : str
-    intent : str
-    search_graphs_chitchat : bool
-    query: str
-    remaining_questions : List[dict]
-    n_questions : int
-    answer: str
-    audience: str = "experts"
-    sources_input: List[str] = ["IPCC","IPBES"]
-    relevant_content_sources: List[str] = ["Figures (IPCC/IPBES)"]
-    sources_auto: bool = True
-    min_year: int = 1960
-    max_year: int = None
-    documents: List[Document]
-    related_contents : Dict[str,Document]
-    recommended_content : List[Document]
-    search_only : bool = False
-    reports : List[str] = []
-def search(state): #TODO
-    """Placeholder node: hands the state back unchanged."""
-    unchanged_state = state
-    return unchanged_state
-def answer_search(state):#TODO
-    """Placeholder node: hands the state back unchanged."""
-    unchanged_state = state
-    return unchanged_state
-def route_intent(state):
-    """Send "chitchat" and "esg" intents to the chitchat answer; every other intent goes to search."""
-    conversational_intents = ("chitchat", "esg")
-    return "answer_chitchat" if state["intent"] in conversational_intents else "search"
-def chitchat_route_intent(state):
-    """Decide whether graphs are retrieved after a chitchat answer.
-
-    Returns "retrieve_graphs_chitchat" only when ``search_graphs_chitchat`` is
-    exactly ``True``; any other value ends the run.
-    """
-    # Every outcome must be a declared route: falling through and returning None
-    # for a value that is neither True nor False would match no edge target.
-    if state["search_graphs_chitchat"] is True:
-        return "retrieve_graphs_chitchat"
-    return END
-def route_translation(state):
-    """Skip translation when the detected language is English (case-insensitive)."""
-    is_english = state["language"].lower() == "english"
-    return "transform_query" if is_english else "translate_query"
-def route_based_on_relevant_docs(state,threshold_docs=0.2):
-    """Answer with documents only if at least one reranking score is strictly above the threshold."""
-    # Every document is inspected, so the count reflects the whole list.
-    n_relevant = sum(doc.metadata["reranking_score"] > threshold_docs for doc in state["documents"])
-    return "answer_rag" if n_relevant > 0 else "answer_rag_no_docs"
-def route_retrieve_documents(state):
-    """Keep retrieving while questions remain; afterwards either stop or move on to answering."""
-    if len(state["remaining_questions"]) > 0:
-        return "retrieve_documents"
-    # Nothing left to retrieve: a search-only run ends here, otherwise the answer step follows.
-    return END if state["search_only"] else "answer_search"
-def make_id_dict(values):
-    """Map every value to itself, e.g. ["a", "b"] -> {"a": "a", "b": "b"}."""
-    identity = {}
-    for value in values:
-        identity[value] = value
-    return identity
-def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, threshold_docs=0.2):
-    """Assemble and compile the question-answering workflow graph.
-
-    Args:
-        llm: passed to every node factory that needs a language model.
-        vectorstore_ipcc: passed to ``make_retriever_node`` for document retrieval.
-        vectorstore_graphs: passed to ``make_graph_retriever_node`` for graph retrieval.
-        reranker: passed to both retriever node factories.
-        threshold_docs: score threshold handed to ``route_based_on_relevant_docs``.
-
-    Returns:
-        The compiled workflow (``workflow.compile()``).
-    """
-    workflow = StateGraph(GraphState)
-    # Define the node functions
-    categorize_intent = make_intent_categorization_node(llm)
-    transform_query = make_query_transform_node(llm)
-    translate_query = make_translation_node(llm)
-    answer_chitchat = make_chitchat_node(llm)
-    # NOTE(review): answer_ai_impact is built but never registered as a node below.
-    answer_ai_impact = make_ai_impact_node(llm)
-    retrieve_documents = make_retriever_node(vectorstore_ipcc, reranker, llm)
-    retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
-    answer_rag = make_rag_node(llm, with_docs=True)
-    answer_rag_no_docs = make_rag_node(llm, with_docs=False)
-    chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
-    # Define the nodes
-    # workflow.add_node("set_defaults", set_defaults)
-    workflow.add_node("categorize_intent", categorize_intent)
-    workflow.add_node("search", search)
-    workflow.add_node("answer_search", answer_search)
-    workflow.add_node("transform_query", transform_query)
-    workflow.add_node("translate_query", translate_query)
-    workflow.add_node("answer_chitchat", answer_chitchat)
-    workflow.add_node("chitchat_categorize_intent", chitchat_categorize_intent)
-    # The same graph-retrieval runnable is registered under two node names,
-    # one for the search path and one for the chitchat path.
-    workflow.add_node("retrieve_graphs", retrieve_graphs)
-    workflow.add_node("retrieve_graphs_chitchat", retrieve_graphs)
-    workflow.add_node("retrieve_documents", retrieve_documents)
-    workflow.add_node("answer_rag", answer_rag)
-    workflow.add_node("answer_rag_no_docs", answer_rag_no_docs)
-    # Entry point
-    workflow.set_entry_point("categorize_intent")
-    # CONDITIONAL EDGES
-    workflow.add_conditional_edges(
-        "categorize_intent",
-        route_intent,
-        make_id_dict(["answer_chitchat","search"])
-    )
-    workflow.add_conditional_edges(
-        "chitchat_categorize_intent",
-        chitchat_route_intent,
-        make_id_dict(["retrieve_graphs_chitchat", END])
-    )
-    workflow.add_conditional_edges(
-        "search",
-        route_translation,
-        make_id_dict(["translate_query","transform_query"])
-    )
-    # retrieve_documents routes back to itself while questions remain.
-    workflow.add_conditional_edges(
-        "retrieve_documents",
-        # lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
-        route_retrieve_documents,
-        make_id_dict([END,"retrieve_documents","answer_search"])
-    )
-    workflow.add_conditional_edges(
-        "answer_search",
-        lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
-        make_id_dict(["answer_rag","answer_rag_no_docs"])
-    )
-    # NOTE(review): transform_query gets this conditional edge AND the plain edge to
-    # retrieve_documents added below — presumably an intended fan-out; confirm.
-    workflow.add_conditional_edges(
-        "transform_query",
-        lambda state : "retrieve_graphs" if "Graphs (OurWorldInData)" in state["relevant_content_sources"]  else END,
-        make_id_dict(["retrieve_graphs", END])
-    )
-    # Define the edges
-    workflow.add_edge("translate_query", "transform_query")
-    workflow.add_edge("transform_query", "retrieve_documents")
-    workflow.add_edge("retrieve_graphs", END)
-    workflow.add_edge("answer_rag", END)
-    workflow.add_edge("answer_rag_no_docs", END)
-    workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
-    workflow.add_edge("retrieve_graphs_chitchat", END)
-    # Compile
-    app = workflow.compile()
-    return app
-def display_graph(app):
-    """Draw the graph of ``app`` as a Mermaid PNG (API draw method, xray view) and display it."""
-    graph = app.get_graph(xray = True)
-    png_bytes = graph.draw_mermaid_png(draw_method=MermaidDrawMethod.API)
-    display(Image(png_bytes))

climateqa/engine/graph_retriever.py DELETED Viewed

@@ -1,88 +0,0 @@
-from langchain_core.retrievers import BaseRetriever
-from langchain_core.documents.base import Document
-from langchain_core.vectorstores import VectorStore
-from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-from typing import List
-# class GraphRetriever(BaseRetriever):
-#     vectorstore:VectorStore
-#     sources:list = ["OWID"] # plus tard ajouter OurWorldInData # faudra integrate avec l'autre retriever
-#     threshold:float = 0.5
-#     k_total:int = 10
-#     def _get_relevant_documents(
-#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-#     ) -> List[Document]:
-#         # Check if all elements in the list are IEA or OWID
-#         assert isinstance(self.sources,list)
-#         assert self.sources
-#         assert any([x in ["OWID"] for x in self.sources])
-#         # Prepare base search kwargs
-#         filters = {}
-#         filters["source"] = {"$in": self.sources}
-#         docs = self.vectorstore.similarity_search_with_score(query=query, filter=filters, k=self.k_total)
-#         # Filter if scores are below threshold
-#         docs = [x for x in docs if x[1] > self.threshold]
-#         # Remove duplicate documents
-#         unique_docs = []
-#         seen_docs = []
-#         for i, doc in enumerate(docs):
-#             if doc[0].page_content not in seen_docs:
-#                 unique_docs.append(doc)
-#                 seen_docs.append(doc[0].page_content)
-#         # Add score to metadata
-#         results = []
-#         for i,(doc,score) in enumerate(unique_docs):
-#             doc.metadata["similarity_score"] = score
-#             doc.metadata["content"] = doc.page_content
-#             results.append(doc)
-#         return results
-async def retrieve_graphs(
-    query: str,
-    vectorstore:VectorStore,
-    sources:list = ["OWID"], # TODO: later add OurWorldInData; will need to be integrated with the other retriever
-    threshold:float = 0.5,
-    k_total:int = 10,
-)-> List[Document]:
-    """Search ``vectorstore`` for graph documents matching ``query``.
-
-    Hits are restricted to ``sources``, kept only when their score is strictly
-    above ``threshold``, de-duplicated on page content (first hit wins) and
-    annotated with ``similarity_score`` and ``content`` metadata.
-
-    Args:
-        query: text to search for.
-        vectorstore: store queried through ``similarity_search_with_score``.
-        sources: values allowed for the ``source`` filter; the default list is
-            never mutated here.
-        threshold: minimum score (exclusive) for a hit to be kept.
-        k_total: number of hits requested from the store.
-
-    Raises:
-        AssertionError: if ``sources`` is not a non-empty list containing "OWID".
-    """
-    # At least one requested source must be "OWID".
-    assert isinstance(sources,list)
-    assert sources
-    assert any([x in ["OWID"] for x in sources])
-    # Prepare base search kwargs
-    filters = {"source": {"$in": sources}}
-    docs = vectorstore.similarity_search_with_score(query=query, filter=filters, k=k_total)
-    results = []
-    seen_contents = set() # set membership keeps de-duplication linear
-    for doc, score in docs:
-        # Drop hits that are not strictly above the threshold, and repeated page contents.
-        if not score > threshold or doc.page_content in seen_contents:
-            continue
-        seen_contents.add(doc.page_content)
-        # Add score to metadata
-        doc.metadata["similarity_score"] = score
-        doc.metadata["content"] = doc.page_content
-        results.append(doc)
-    return results

climateqa/engine/keywords.py CHANGED Viewed

@@ -11,12 +11,10 @@ class KeywordsOutput(BaseModel):
     keywords: list = Field(
         description="""
-        Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers. Answer only with English keywords.
-        Do not use special characters or accents.
         Example:
         - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
-        - "Quel est l'impact de l'exploitation minière en haute mer ?" -> ["deep sea mining"]
         - "How will El Nino be impacted by climate change" -> ["el nino"]
         - "Is climate change a hoax" -> [Climate change","hoax"]
         """

     keywords: list = Field(
         description="""
+        Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers.
         Example:
         - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
         - "How will El Nino be impacted by climate change" -> ["el nino"]
         - "Is climate change a hoax" -> [Climate change","hoax"]
         """

climateqa/engine/llm/__init__.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
-from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
 def get_llm(provider="openai",**kwargs):
@@ -9,8 +8,6 @@ def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
-    elif provider == "ollama":
-        return  get_ollama_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")

 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
 def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")

climateqa/engine/llm/ollama.py DELETED Viewed

@@ -1,6 +0,0 @@
-from langchain_community.llms import Ollama
-def get_llm(model="llama3", **kwargs):
-    """Return an ``Ollama`` LLM for ``model``; extra keyword arguments are passed to the constructor unchanged."""
-    ollama_llm = Ollama(model=model, **kwargs)
-    return ollama_llm

climateqa/engine/llm/openai.py CHANGED Viewed

@@ -7,7 +7,7 @@ try:
 except Exception:
     pass
-def get_llm(model="gpt-4o-mini",max_tokens=1024, temperature=0.0, streaming=True,timeout=30, **kwargs):
     llm = ChatOpenAI(
         model=model,

 except Exception:
     pass
+def get_llm(model="gpt-3.5-turbo-0125",max_tokens=1024, temperature=0.0, streaming=True,timeout=30, **kwargs):
     llm = ChatOpenAI(
         model=model,

climateqa/engine/{chains/prompts.py → prompts.py} RENAMED Viewed

@@ -56,7 +56,7 @@ Passages:
 {context}
 -----------------------
-Question: {query} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
@@ -137,7 +137,7 @@ Guidelines:
 - If the question is not related to environmental issues, never never answer it. Say it's not your role.
 - Make paragraphs by starting new lines to make your answers more readable.
-Question: {query}
 Answer in {language}:
 """
@@ -147,27 +147,4 @@ audience_prompts = {
     "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
     "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
     "experts": "expert and climate scientists that are not afraid of technical terms",
-}
-answer_prompt_graph_template = """
-Given the user question and a list of graphs which are related to the question, rank the graphs based on relevance to the user question. ALWAYS follow the guidelines given below.
-### Guidelines ###
-- Keep all the graphs that are given to you.
-- NEVER modify the graph HTML embedding, the category or the source leave them exactly as they are given.
-- Return the ranked graphs as a list of dictionaries with keys 'embedding', 'category', and 'source'.
-- Return a valid JSON output.
------------------------
-User question:
-{query}
-Graphs and their HTML embedding:
-{recommended_content}
------------------------
-{format_instructions}
-Output the result as json with a key "graphs" containing a list of dictionaries of the relevant graphs with keys 'embedding', 'category', and 'source'. Do not modify the graph HTML embedding, the category or the source. Do not put any message or text before or after the JSON output.
-"""

 {context}
 -----------------------
+Question: {question} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 - If the question is not related to environmental issues, never never answer it. Say it's not your role.
 - Make paragraphs by starting new lines to make your answers more readable.
+Question: {question}
 Answer in {language}:
 """
     "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
     "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
     "experts": "expert and climate scientists that are not afraid of technical terms",
+}

climateqa/engine/{chains/answer_rag.py → rag.py} RENAMED Viewed

@@ -2,14 +2,15 @@ from operator import itemgetter
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
-from climateqa.engine.chains.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
-from climateqa.engine.chains.prompts import papers_prompt_template
-import time
-from ..utils import rename_chain, pass_values
 DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
@@ -39,51 +40,72 @@ def get_text_docs(x):
 def get_image_docs(x):
     return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
-def make_rag_chain(llm):
     prompt = ChatPromptTemplate.from_template(answer_prompt_template)
-    chain = ({
-        "context":lambda x : _combine_documents(x["documents"]),
-        "context_length":lambda x : print("CONTEXT LENGTH : " , len(_combine_documents(x["documents"]))),
-        "query":itemgetter("query"),
-        "language":itemgetter("language"),
-        "audience":itemgetter("audience"),
-    } | prompt | llm | StrOutputParser())
-    return chain
-def make_rag_chain_without_docs(llm):
-    """Chain answering without retrieved passages: prompt -> llm -> plain string."""
-    no_docs_prompt = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
-    return no_docs_prompt | llm | StrOutputParser()
-def make_rag_node(llm,with_docs = True):
-    """Create the async graph node that produces the final answer.
-
-    ``with_docs`` selects the chain: the passage-based one when true, the
-    document-free one otherwise. The node returns ``{"answer": <text>}`` and
-    prints the elapsed time and the answer length.
-    """
-    build_chain = make_rag_chain if with_docs else make_rag_chain_without_docs
-    rag_chain = build_chain(llm)
-    async def answer_rag(state,config):
-        print("---- Answer RAG ----")
-        started_at = time.time()
-        answer = await rag_chain.ainvoke(state,config)
-        print("RAG elapsed time: ", time.time() - started_at)
-        print("Answer size : ", len(answer))
-        return {"answer":answer}
-    return answer_rag
 def make_rag_papers_chain(llm):
     prompt = ChatPromptTemplate.from_template(papers_prompt_template)
     input_documents = {
         "context":lambda x : _combine_documents(x["docs"]),
         **pass_values(["question","language"])
@@ -109,4 +131,4 @@ def make_illustration_chain(llm):
     }
     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
-    return illustration_chain

 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
+from climateqa.engine.reformulation import make_reformulation_chain
+from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
+from climateqa.engine.prompts import papers_prompt_template
+from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
+from climateqa.engine.keywords import make_keywords_chain
 DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
 def get_image_docs(x):
     """Return the documents in ``x`` whose ``chunk_type`` metadata equals ``"image"``."""
     return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
+def make_rag_chain(retriever,llm):
+    """Assemble the end-to-end question-answering chain.
+
+    The returned chain runs four steps in order: reformulation, keyword
+    extraction, document retrieval, and answer generation. The answer step
+    uses the with-documents prompt when the ``docs`` value is non-empty and
+    the without-documents prompt otherwise.
+
+    Args:
+        retriever: Runnable fed with the ``"question"`` value; its output is
+            stored under ``"docs"``.
+        llm: Model handed to the reformulation and keyword chains and used
+            (renamed to ``"answer"``) to produce the final answer.
+
+    Returns:
+        The composed chain ``reformulation | keywords | find_documents | answer``.
+    """
+    # NOTE(review): prepare_chain, rename_chain, pass_values and
+    # _combine_documents are defined outside this function; presumably
+    # prepare_chain names a step and merges its output with the incoming
+    # values -- confirm against climateqa.engine.utils.
+    # Construct the prompt
     prompt = ChatPromptTemplate.from_template(answer_prompt_template)
+    prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
+    # ------- CHAIN 0 - Reformulation
+    reformulation = make_reformulation_chain(llm)
+    reformulation = prepare_chain(reformulation,"reformulation")
+    # ------- Find all keywords from the reformulated query
+    keywords = make_keywords_chain(llm)
+    # The keywords chain only receives the "question" value of the incoming dict
+    keywords = {"keywords":itemgetter("question") | keywords}
+    keywords = prepare_chain(keywords,"keywords")
+    # ------- CHAIN 1
+    # Retrieved documents
+    find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
+    find_documents = prepare_chain(find_documents,"find_documents")
+    # ------- CHAIN 2
+    # Construct inputs for the llm
+    input_documents = {
+        "context":lambda x : _combine_documents(x["docs"]),
+        **pass_values(["question","audience","language","keywords"])
+    }
+    # ------- CHAIN 3
+    # Bot answer
+    llm_final = rename_chain(llm,"answer")
+    # Both variants forward the same set of keys alongside the generated "answer"
+    answer_with_docs = {
+        "answer": input_documents | prompt | llm_final | StrOutputParser(),
+        **pass_values(["question","audience","language","query","docs","keywords"]),
+    }
+    answer_without_docs = {
+        "answer":  prompt_without_docs | llm_final | StrOutputParser(),
+        **pass_values(["question","audience","language","query","docs","keywords"]),
+    }
+    # def has_images(x):
+    #     image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
+    #     return len(image_docs) > 0
+    # True when the retrieval step produced at least one document
+    def has_docs(x):
+        return len(x["docs"]) > 0
+    # First matching condition wins; the last positional argument is the fallback branch
+    answer = RunnableBranch(
+        (lambda x: has_docs(x), answer_with_docs),
+        answer_without_docs,
+    )
+    # ------- FINAL CHAIN
+    # Build the final chain
+    rag_chain = reformulation | keywords | find_documents | answer
+    return rag_chain
 def make_rag_papers_chain(llm):
     prompt = ChatPromptTemplate.from_template(papers_prompt_template)
     input_documents = {
         "context":lambda x : _combine_documents(x["docs"]),
         **pass_values(["question","language"])
     }
     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
+    return illustration_chain

climateqa/engine/{chains/reformulation.py → reformulation.py} RENAMED Viewed

@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, Response
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
-from climateqa.engine.chains.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict

 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
+from climateqa.engine.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict

climateqa/engine/reranker.py DELETED Viewed

@@ -1,50 +0,0 @@
-import os
-from dotenv import load_dotenv
-from scipy.special import expit, logit
-from rerankers import Reranker
-from sentence_transformers import CrossEncoder
-load_dotenv()
-def get_reranker(model = "nano", cohere_api_key = None):
-    assert model in ["nano","tiny","small","large", "jina"]
-    if model == "nano":
-        reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
-    elif model == "tiny":
-        reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
-    elif model == "small":
-        reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
-    elif model == "large":
-        if cohere_api_key is None:
-            cohere_api_key = os.environ["COHERE_API_KEY"]
-        reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
-    elif model == "jina":
-        # Reached token quota so does not work
-        reranker = Reranker("jina-reranker-v2-base-multilingual", api_key = os.getenv("JINA_RERANKER_API_KEY"))
-        # marche pas sans gpu ? et anyways returns with another structure donc faudrait changer le code du retriever node
-        # reranker = CrossEncoder("jinaai/jina-reranker-v2-base-multilingual", automodel_args={"torch_dtype": "auto"}, trust_remote_code=True,)
-    return reranker
-def rerank_docs(reranker,docs,query):
-    if docs == []:
-        return []
-    # Get a list of texts from langchain docs
-    input_docs = [x.page_content for x in docs]
-    # Rerank using rerankers library
-    results = reranker.rank(query=query, docs=input_docs)
-    # Prepare langchain list of docs
-    docs_reranked = []
-    for result in results.results:
-        doc_id = result.document.doc_id
-        doc = docs[doc_id]
-        doc.metadata["reranking_score"] = result.score
-        doc.metadata["query_used_for_retrieval"] = query
-        docs_reranked.append(doc)
-    return docs_reranked

climateqa/engine/retriever.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# https://github.com/langchain-ai/langchain/issues/8623
+import pandas as pd
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from typing import List
+from pydantic import Field
+class ClimateQARetriever(BaseRetriever):
+    """Retriever mixing summary (SPM) chunks with chunks from the rest of the reports.
+
+    Two similarity searches are run against ``vectorstore``: one restricted to
+    ``report_type`` "SPM" (at most ``k_summary`` hits, kept only when their
+    score exceeds ``threshold``) and one excluding "SPM" that fills the
+    remaining ``k_total`` slots. Chunks whose text is not longer than
+    ``min_size`` characters are dropped from the merged result.
+    """
+    vectorstore:VectorStore
+    # Used as the ``source`` filter when ``reports`` is empty; validated on every query.
+    sources:list = ["IPCC","IPBES","IPOS"]
+    # When non-empty, results are filtered on these ``short_name`` values instead of ``sources``.
+    reports:list = []
+    # Summary hits must score strictly above this value to be kept.
+    threshold:float = 0.6
+    k_summary:int = 3
+    k_total:int = 10
+    # No trailing comma: ``"vectors",`` would silently make the default a 1-tuple.
+    namespace:str = "vectors"
+    # Same here: a tuple default makes the length comparison below raise TypeError.
+    min_size:int = 200
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Search summaries first, then the other report types, and merge the hits.
+
+        Args:
+            query: Text used for both similarity searches.
+            run_manager: Callback manager supplied by the caller; not used here.
+
+        Returns:
+            The kept documents, summaries first, each with ``similarity_score``,
+            ``content`` and an incremented ``page_number`` written into its metadata.
+
+        Raises:
+            AssertionError: If ``sources`` is not a list, contains a value other
+                than "IPCC", "IPBES" or "IPOS", or ``k_total <= k_summary``.
+        """
+        # Every requested source must be one of the supported names
+        assert isinstance(self.sources,list)
+        assert all(x in ["IPCC","IPBES","IPOS"] for x in self.sources)
+        assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
+        # Prepare base search kwargs: an explicit report list takes precedence over sources
+        filters = {}
+        if len(self.reports) > 0:
+            filters["short_name"] = {"$in":self.reports}
+        else:
+            filters["source"] = { "$in":self.sources}
+        # Search for k_summary documents in the summaries dataset
+        filters_summaries = {
+            **filters,
+            "report_type": { "$in":["SPM"]},
+        }
+        docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
+        # Only summary hits are thresholded on score
+        docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+        # Fill the remaining slots (k_total minus the kept summaries) from the other report types
+        filters_full = {
+            **filters,
+            "report_type": { "$nin":["SPM"]},
+        }
+        k_full = self.k_total - len(docs_summaries)
+        docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
+        # Concatenate documents, summaries first
+        docs = docs_summaries + docs_full
+        # Drop chunks whose text is too short (this is a length filter, not a score filter)
+        docs = [x for x in docs if len(x[0].page_content) > self.min_size]
+        # Copy score and text into the metadata of each document (mutates the documents in place)
+        results = []
+        for doc,score in docs:
+            doc.metadata["similarity_score"] = score
+            doc.metadata["content"] = doc.page_content
+            # NOTE(review): presumably stored page numbers are 0-based -- confirm against the index
+            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+            results.append(doc)
+        # Results are returned in retrieval order, not re-sorted by score
+        return results
+# def filter_summaries(df,k_summary = 3,k_total = 10):
+#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
+#     # # Filter by source
+#     # if source == "IPCC":
+#     #     df = df.loc[df["source"]=="IPCC"]
+#     # elif source == "IPBES":
+#     #     df = df.loc[df["source"]=="IPBES"]
+#     # else:
+#     #     pass
+#     # Separate summaries and full reports
+#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
+#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
+#     # Find passages from summaries dataset
+#     passages_summaries = df_summaries.head(k_summary)
+#     # Find passages from full reports dataset
+#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
+#     # Concatenate passages
+#     passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
+#     return passages
+# def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
+#     assert max_k > k_total
+#     validated_sources = ["IPCC","IPBES"]
+#     sources = [x for x in sources if x in validated_sources]
+#     filters = {
+#         "source": { "$in": sources },
+#     }
+#     print(filters)
+#     # Retrieve documents
+#     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
+#     # Filter by score
+#     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
+#     if len(docs) == 0:
+#         return []
+#     res = pd.DataFrame(docs)
+#     passages_df = filter_summaries(res,k_summary,k_total)
+#     if as_dict:
+#         contents = passages_df["content"].tolist()
+#         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
+#         passages = []
+#         for i in range(len(contents)):
+#             passages.append({"content":contents[i],"meta":meta[i]})
+#         return passages
+#     else:
+#         return passages_df
+# def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
+#     print("hellooooo")
+#     # Reformulate queries
+#     reformulated_query,language = reformulate(query)
+#     print(reformulated_query)
+#     # Retrieve documents
+#     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
+#     response = {
+#       "query":query,
+#       "reformulated_query":reformulated_query,
+#       "language":language,
+#       "sources":passages,
+#       "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
+#     }
+#     return response

climateqa/engine/utils.py CHANGED Viewed

@@ -1,15 +1,8 @@
 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
-import tiktoken
 from langchain_core.runnables import RunnablePassthrough
-def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    return num_tokens
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
@@ -74,13 +67,3 @@ def flatten_dict(
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict
-async def log_event(info,name,config):
-    """Helper function that will run a dummy chain with the given info
-    The astream_event function will catch this chain and stream the dict info to the logger
-    """
-    chain = RunnablePassthrough().with_config(run_name=name)
-    _ = await chain.ainvoke(info,config)

 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
 from langchain_core.runnables import RunnablePassthrough
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict

climateqa/engine/vectorstore.py CHANGED Viewed

@@ -13,9 +13,7 @@ except:
     pass
-def get_pinecone_vectorstore(embeddings,text_key = "content", index_name = os.getenv("PINECONE_API_INDEX")):
     # # initialize pinecone
     # pinecone.init(
@@ -29,7 +27,7 @@ def get_pinecone_vectorstore(embeddings,text_key = "content", index_name = os.ge
     # return vectorstore
     pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-    index = pc.Index(index_name)
     vectorstore = PineconeVectorstore(
         index, embeddings, text_key,

     pass
+def get_pinecone_vectorstore(embeddings,text_key = "content"):
     # # initialize pinecone
     # pinecone.init(
     # return vectorstore
     pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+    index = pc.Index(os.getenv("PINECONE_API_INDEX"))
     vectorstore = PineconeVectorstore(
         index, embeddings, text_key,

climateqa/event_handler.py DELETED Viewed

@@ -1,123 +0,0 @@
-from langchain_core.runnables.schema import StreamEvent
-from gradio import ChatMessage
-from climateqa.engine.chains.prompts import audience_prompts
-from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox,generate_html_graphs
-import numpy as np
-def init_audience(audience :str) -> str:
-    if audience == "Children":
-        audience_prompt = audience_prompts["children"]
-    elif audience == "General public":
-        audience_prompt = audience_prompts["general"]
-    elif audience == "Experts":
-        audience_prompt = audience_prompts["experts"]
-    else:
-        audience_prompt = audience_prompts["experts"]
-    return audience_prompt
-def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage], used_documents : list[str]) -> tuple[str, list[ChatMessage], list[str]]:
-    """
-    Handles the retrieved documents and returns the HTML representation of the documents
-    Args:
-        event (StreamEvent): The event containing the retrieved documents
-        history (list[ChatMessage]): The current message history
-        used_documents (list[str]): The list of used documents
-    Returns:
-        tuple[str, list[ChatMessage], list[str]]: The updated HTML representation of the documents, the updated message history and the updated list of used documents
-    """
-    try:
-        docs = event["data"]["output"]["documents"]
-        docs_html = []
-        textual_docs = [d for d in docs if d.metadata["chunk_type"] == "text"]
-        for i, d in enumerate(textual_docs, 1):
-            if d.metadata["chunk_type"] == "text":
-                docs_html.append(make_html_source(d, i))
-        used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
-        if used_documents!=[]:
-            history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
-        docs_html = "".join(docs_html)
-        related_contents = event["data"]["output"]["related_contents"]
-    except Exception as e:
-        print(f"Error getting documents: {e}")
-        print(event)
-    return docs, docs_html, history, used_documents, related_contents
-def stream_answer(history: list[ChatMessage], event : StreamEvent, start_streaming : bool, answer_message_content : str)-> tuple[list[ChatMessage], bool, str]:
-    """
-    Handles the streaming of the answer and updates the history with the new message content
-    Args:
-        history (list[ChatMessage]): The current message history
-        event (StreamEvent): The event containing the streamed answer
-        start_streaming (bool): A flag indicating if the streaming has started
-        new_message_content (str): The content of the new message
-    Returns:
-        tuple[list[ChatMessage], bool, str]: The updated history, the updated streaming flag and the updated message content
-    """
-    if start_streaming == False:
-        start_streaming = True
-        history.append(ChatMessage(role="assistant", content = ""))
-    answer_message_content +=  event["data"]["chunk"].content
-    answer_message_content = parse_output_llm_with_sources(answer_message_content)
-    history[-1] = ChatMessage(role="assistant", content = answer_message_content)
-    # history.append(ChatMessage(role="assistant", content = new_message_content))
-    return history, start_streaming, answer_message_content
-def handle_retrieved_owid_graphs(event :StreamEvent, graphs_html: str) -> str:
-    """
-    Handles the retrieved OWID graphs and returns the HTML representation of the graphs
-    Args:
-        event (StreamEvent): The event containing the retrieved graphs
-        graphs_html (str): The current HTML representation of the graphs
-    Returns:
-        str: The updated HTML representation
-    """
-    try:
-        recommended_content = event["data"]["output"]["recommended_content"]
-        unique_graphs = []
-        seen_embeddings = set()
-        for x in recommended_content:
-            embedding = x.metadata["returned_content"]
-            # Check if the embedding has already been seen
-            if embedding not in seen_embeddings:
-                unique_graphs.append({
-                    "embedding": embedding,
-                    "metadata": {
-                        "source": x.metadata["source"],
-                        "category": x.metadata["category"]
-                    }
-                })
-                # Add the embedding to the seen set
-                seen_embeddings.add(embedding)
-        categories = {}
-        for graph in unique_graphs:
-            category = graph['metadata']['category']
-            if category not in categories:
-                categories[category] = []
-            categories[category].append(graph['embedding'])
-        for category, embeddings in categories.items():
-            graphs_html += f"<h3>{category}</h3>"
-            for embedding in embeddings:
-                graphs_html += f"<div>{embedding}</div>"
-    except Exception as e:
-        print(f"Error getting graphs: {e}")
-    return graphs_html

climateqa/knowledge/__init__.py DELETED Viewed

File without changes

climateqa/knowledge/retriever.py DELETED Viewed

@@ -1,102 +0,0 @@
-# # https://github.com/langchain-ai/langchain/issues/8623
-# import pandas as pd
-# from langchain_core.retrievers import BaseRetriever
-# from langchain_core.vectorstores import VectorStoreRetriever
-# from langchain_core.documents.base import Document
-# from langchain_core.vectorstores import VectorStore
-# from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-# from typing import List
-# from pydantic import Field
-# def _add_metadata_and_score(docs: List) -> Document:
-#     # Add score to metadata
-#     docs_with_metadata = []
-#     for i,(doc,score) in enumerate(docs):
-#         doc.page_content = doc.page_content.replace("\r\n"," ")
-#         doc.metadata["similarity_score"] = score
-#         doc.metadata["content"] = doc.page_content
-#         doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
-#         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
-#         docs_with_metadata.append(doc)
-#     return docs_with_metadata
-# class ClimateQARetriever(BaseRetriever):
-#     vectorstore:VectorStore
-#     sources:list = ["IPCC","IPBES","IPOS"]
-#     reports:list = []
-#     threshold:float = 0.6
-#     k_summary:int = 3
-#     k_total:int = 10
-#     namespace:str = "vectors",
-#     min_size:int = 200,
-#     def _get_relevant_documents(
-#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-#     ) -> List[Document]:
-#         # Check if all elements in the list are either IPCC or IPBES
-#         assert isinstance(self.sources,list)
-#         assert self.sources
-#         assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
-#         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
-#         # Prepare base search kwargs
-#         filters = {}
-#         if len(self.reports) > 0:
-#             filters["short_name"] = {"$in":self.reports}
-#         else:
-#             filters["source"] = { "$in":self.sources}
-#         # Search for k_summary documents in the summaries dataset
-#         filters_summaries = {
-#             **filters,
-#             "chunk_type":"text",
-#             "report_type": { "$in":["SPM"]},
-#         }
-#         docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
-#         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
-#         # docs_summaries = []
-#         # Search for k_total - k_summary documents in the full reports dataset
-#         filters_full = {
-#             **filters,
-#             "chunk_type":"text",
-#             "report_type": { "$nin":["SPM"]},
-#         }
-#         k_full = self.k_total - len(docs_summaries)
-#         docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
-#         # Images
-#         filters_image = {
-#             **filters,
-#             "chunk_type":"image"
-#         }
-#         docs_images = self.vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_full)
-#         # docs_images = []
-#         # Concatenate documents
-#         # docs = docs_summaries + docs_full + docs_images
-#         # Filter if scores are below threshold
-#         # docs = [x for x in docs if x[1] > self.threshold]
-#         docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
-#         # Filter if length are below threshold
-#         docs_summaries = [x for x in docs_summaries if len(x.page_content) > self.min_size]
-#         docs_full = [x for x in docs_full if len(x.page_content) > self.min_size]
-#         return {
-#             "docs_summaries" : docs_summaries,
-#             "docs_full" : docs_full,
-#             "docs_images" : docs_images,
-#         }

climateqa/papers/__init__.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import pandas as pd
+from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
+import pyalex
+pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
+class OpenAlex():
+    """Small helper around the pyalex ``Works`` endpoint."""
+    def __init__(self):
+        pass
+    def search(self,keywords,n_results = 100,after = None,before = None):
+        """Search OpenAlex works for ``keywords``.
+
+        ``after`` and ``before`` are accepted but not used by this implementation.
+        """
+        # NOTE(review): ``.get()` is called before ``.paginate()``; if ``.get()``
+        # returns a plain list of results (as pyalex documents), ``.paginate``
+        # will raise AttributeError -- confirm. ``df_works`` is built but never
+        # used: the object returned is ``works``.
+        works = Works().search(keywords).get()
+        for page in works.paginate(per_page=n_results):
+            # Only the first page is needed
+            break
+        df_works = pd.DataFrame(page)
+        return works
+    def make_network(self):
+        pass
+    def get_abstract_from_inverted_index(self,index):
+        """Rebuild a text from an inverted index mapping each token to its positions.
+
+        Args:
+            index: Mapping ``token -> list of positions``. ``None`` or an empty
+                mapping (no abstract available) is accepted.
+
+        Returns:
+            The tokens joined by single spaces in position order; positions that
+            no token claims are left as empty strings. Returns ``""`` when there
+            is nothing to rebuild.
+        """
+        # Missing abstracts would otherwise crash on ``.values()``
+        if not index:
+            return ""
+        # Determine the maximum index to know the length of the reconstructed array
+        max_index = max(
+            (position for positions in index.values() for position in positions),
+            default = -1,
+        )
+        # Every token had an empty position list
+        if max_index < 0:
+            return ""
+        # Initialize a list with placeholders for all positions
+        reconstructed = [''] * (max_index + 1)
+        # Iterate through the inverted index and place each token at its respective position(s)
+        for token, positions in index.items():
+            for position in positions:
+                reconstructed[position] = token
+        # Join the tokens to form the reconstructed sentence(s)
+        return ' '.join(reconstructed)

climateqa/{knowledge → papers}/openalex.py RENAMED Viewed

@@ -3,32 +3,18 @@ import networkx as nx
 import matplotlib.pyplot as plt
 from pyvis.network import Network
-from langchain_core.retrievers import BaseRetriever
-from langchain_core.vectorstores import VectorStoreRetriever
-from langchain_core.documents.base import Document
-from langchain_core.vectorstores import VectorStore
-from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-from ..engine.utils import num_tokens_from_string
-from typing import List
-from pydantic import Field
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
-def replace_nan_with_empty_dict(x):
-    return x if pd.notna(x) else {}
 class OpenAlex():
     def __init__(self):
         pass
-    def search(self,keywords:str,n_results = 100,after = None,before = None):
         if isinstance(keywords,str):
             works = Works().search(keywords)
@@ -41,36 +27,29 @@ class OpenAlex():
                 break
             df_works = pd.DataFrame(page)
-            if df_works.empty:
-                return df_works
-            df_works = df_works.dropna(subset = ["title"])
-            df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
-            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
-            df_works["url"] = df_works["id"]
-            df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
-            df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
-            df_works = df_works.drop(columns = ["abstract_inverted_index"])
-            df_works["display_name"] = df_works["primary_location"].apply(lambda x :x["source"] if type(x) == dict and 'source' in x else "").apply(lambda x : x["display_name"] if type(x) == dict and "display_name" in x else "")
-            df_works["subtitle"] = df_works["title"].astype(str) + " - " + df_works["display_name"].astype(str) + " - " + df_works["publication_year"].astype(str)
-            return df_works
         else:
-           raise Exception("Keywords must be a string")
     def rerank(self,query,df,reranker):
         scores = reranker.rank(
             query,
-            df["content"].tolist()
         )
-        scores = sorted(scores.results, key = lambda x : x.document.doc_id)
-        scores = [x.score for x in scores]
         df["rerank_score"] = scores
         return df
@@ -160,36 +139,4 @@ class OpenAlex():
                     reconstructed[position] = token
             # Join the tokens to form the reconstructed sentence(s)
-            return ' '.join(reconstructed)
-class OpenAlexRetriever(BaseRetriever):
-    min_year:int = 1960
-    max_year:int = None
-    k:int = 100
-    def _get_relevant_documents(
-        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
-        openalex = OpenAlex()
-        # Search for documents
-        df_docs = openalex.search(query,n_results=self.k,after = self.min_year,before = self.max_year)
-        docs = []
-        for i,row in df_docs.iterrows():
-            num_tokens = row["num_tokens"]
-            if num_tokens < 50 or num_tokens > 1000:
-                continue
-            doc = Document(
-                page_content = row["content"],
-                metadata = row.to_dict()
-            )
-            docs.append(doc)
-        return docs

 import matplotlib.pyplot as plt
 from pyvis.network import Network
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
 class OpenAlex():
     def __init__(self):
         pass
+    def search(self,keywords,n_results = 100,after = None,before = None):
         if isinstance(keywords,str):
             works = Works().search(keywords)
                 break
             df_works = pd.DataFrame(page)
+            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
+            df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
         else:
+            df_works = []
+            for keyword in keywords:
+                df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
+                df_works.append(df_keyword)
+            df_works = pd.concat(df_works,ignore_index=True,axis = 0)
+        return df_works
     def rerank(self,query,df,reranker):
         scores = reranker.rank(
             query,
+            df["content"].tolist(),
+            top_k = len(df),
         )
+        scores.sort(key = lambda x : x["corpus_id"])
+        scores = [x["score"] for x in scores]
         df["rerank_score"] = scores
         return df
                     reconstructed[position] = token
             # Join the tokens to form the reconstructed sentence(s)
+            return ' '.join(reconstructed)

climateqa/utils.py CHANGED Viewed

@@ -20,16 +20,3 @@ def get_image_from_azure_blob_storage(path):
     file_object = get_file_from_azure_blob_storage(path)
     image = Image.open(file_object)
     return image
-def remove_duplicates_keep_highest_score(documents):
-    unique_docs = {}
-    for doc in documents:
-        doc_id = doc.metadata.get('doc_id')
-        if doc_id in unique_docs:
-            if doc.metadata['reranking_score'] > unique_docs[doc_id].metadata['reranking_score']:
-                unique_docs[doc_id] = doc
-        else:
-            unique_docs[doc_id] = doc
-    return list(unique_docs.values())

     file_object = get_file_from_azure_blob_storage(path)
     image = Image.open(file_object)
     return image

front/__init__.py DELETED Viewed

File without changes

front/callbacks.py DELETED Viewed

File without changes

front/utils.py DELETED Viewed

@@ -1,341 +0,0 @@
-import re
-from collections import defaultdict
-from climateqa.utils import get_image_from_azure_blob_storage
-from climateqa.engine.chains.prompts import audience_prompts
-from PIL import Image
-from io import BytesIO
-import base64
-def make_pairs(lst:list)->list:
-    """from a list of even lenght, make tupple pairs"""
-    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
-def serialize_docs(docs:list)->list:
-    new_docs = []
-    for doc in docs:
-        new_doc = {}
-        new_doc["page_content"] = doc.page_content
-        new_doc["metadata"] = doc.metadata
-        new_docs.append(new_doc)
-    return new_docs
-def parse_output_llm_with_sources(output:str)->str:
-    # Split the content into a list of text and "[Doc X]" references
-    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
-    parts = []
-    for part in content_parts:
-        if part.startswith("Doc"):
-            subparts = part.split(",")
-            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
-            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
-            parts.append("".join(subparts))
-        else:
-            parts.append(part)
-    content_parts = "".join(parts)
-    return content_parts
-def process_figures(docs:list, new_figures:list)->tuple:
-    docs = docs + new_figures
-    figures = '<div class="figures-container"><p></p> </div>'
-    gallery = []
-    used_figures = []
-    if docs == []:
-        return docs, figures, gallery
-    docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
-    for i_doc, doc in enumerate(docs_figures):
-        if doc.metadata["chunk_type"] == "image":
-            path = doc.metadata["image_path"]
-            if path not in used_figures:
-                used_figures.append(path)
-                figure_number = len(used_figures)
-                try:
-                    key = f"Image {figure_number}"
-                    image_path = doc.metadata["image_path"].split("documents/")[1]
-                    img = get_image_from_azure_blob_storage(image_path)
-                    # Convert the image to a byte buffer
-                    buffered = BytesIO()
-                    max_image_length = 500
-                    img_resized = img.resize((max_image_length, int(max_image_length * img.size[1]/img.size[0])))
-                    img_resized.save(buffered, format="PNG")
-                    img_str = base64.b64encode(buffered.getvalue()).decode()
-                    figures = figures + make_html_figure_sources(doc, figure_number, img_str)
-                    gallery.append(img)
-                except Exception as e:
-                    print(f"Skipped adding image {figure_number} because of {e}")
-    return docs, figures, gallery
-def generate_html_graphs(graphs:list)->str:
-    # Organize graphs by category
-    categories = defaultdict(list)
-    for graph in graphs:
-        category = graph['metadata']['category']
-        categories[category].append(graph['embedding'])
-    # Begin constructing the HTML
-    html_code = '''
-                <!DOCTYPE html>
-                <html lang="en">
-                <head>
-                    <meta charset="UTF-8">
-                    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-                    <title>Graphs by Category</title>
-                    <style>
-                        .tab-content {
-                            display: none;
-                        }
-                        .tab-content.active {
-                            display: block;
-                        }
-                        .tabs {
-                            margin-bottom: 20px;
-                        }
-                        .tab-button {
-                            background-color: #ddd;
-                            border: none;
-                            padding: 10px 20px;
-                            cursor: pointer;
-                            margin-right: 5px;
-                        }
-                        .tab-button.active {
-                            background-color: #ccc;
-                        }
-                    </style>
-                    <script>
-                        function showTab(tabId) {
-                            var contents = document.getElementsByClassName('tab-content');
-                            var buttons = document.getElementsByClassName('tab-button');
-                            for (var i = 0; i < contents.length; i++) {
-                                contents[i].classList.remove('active');
-                                buttons[i].classList.remove('active');
-                            }
-                            document.getElementById(tabId).classList.add('active');
-                            document.querySelector('button[data-tab="'+tabId+'"]').classList.add('active');
-                        }
-                    </script>
-                </head>
-                <body>
-                    <div class="tabs">
-                '''
-    # Add buttons for each category
-    for i, category in enumerate(categories.keys()):
-        active_class = 'active' if i == 0 else ''
-        html_code += f'<button class="tab-button {active_class}" onclick="showTab(\'tab-{i}\')" data-tab="tab-{i}">{category}</button>'
-    html_code += '</div>'
-    # Add content for each category
-    for i, (category, embeds) in enumerate(categories.items()):
-        active_class = 'active' if i == 0 else ''
-        html_code += f'<div id="tab-{i}" class="tab-content {active_class}">'
-        for embed in embeds:
-            html_code += embed
-        html_code += '</div>'
-    html_code += '''
-                </body>
-                </html>
-                '''
-    return html_code
-def make_html_source(source,i):
-    meta = source.metadata
-    # content = source.page_content.split(":",1)[1].strip()
-    content = source.page_content.strip()
-    toc_levels = []
-    for j in range(2):
-        level = meta[f"toc_level{j}"]
-        if level != "N/A":
-            toc_levels.append(level)
-        else:
-            break
-    toc_levels = " > ".join(toc_levels)
-    if len(toc_levels) > 0:
-        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
-    else:
-        name = meta['name']
-    score = meta['reranking_score']
-    if score > 0.8:
-        color = "score-green"
-    elif score > 0.5:
-        color = "score-orange"
-    else:
-        color = "score-red"
-    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"
-    if meta["chunk_type"] == "text":
-        card = f"""
-    <div class="card" id="doc{i}">
-        <div class="card-content">
-            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
-            <p>{content}</p>
-            {relevancy_score}
-        </div>
-        <div class="card-footer">
-            <span>{name}</span>
-            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
-    else:
-        if meta["figure_code"] != "N/A":
-            title = f"{meta['figure_code']} - {meta['short_name']}"
-        else:
-            title = f"{meta['short_name']}"
-        card = f"""
-    <div class="card card-image">
-        <div class="card-content">
-            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
-            <p class='ai-generated'>AI-generated description</p>
-            <p>{content}</p>
-            {relevancy_score}
-        </div>
-        <div class="card-footer">
-            <span>{name}</span>
-            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
-    return card
-def make_html_papers(df,i):
-    title = df['title'][i]
-    content = df['abstract'][i]
-    url = df['doi'][i]
-    publication_date = df['publication_year'][i]
-    subtitle = df['subtitle'][i]
-    card = f"""
-    <div class="card" id="doc{i}">
-        <div class="card-content">
-            <h2>Doc {i+1} - {title}</h2>
-            <p>{content}</p>
-        </div>
-        <div class="card-footer">
-            <span>{subtitle}</span>
-            <a href="{url}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open paper">🔗</span>
-            </a>
-        </div>
-    </div>
-        """
-    return card
-def make_html_figure_sources(source,i,img_str):
-    meta = source.metadata
-    content = source.page_content.strip()
-    score = meta['reranking_score']
-    if score > 0.8:
-        color = "score-green"
-    elif score > 0.5:
-        color = "score-orange"
-    else:
-        color = "score-red"
-    toc_levels = []
-    if len(toc_levels) > 0:
-        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
-    else:
-        name = meta['name']
-    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"
-    if meta["figure_code"] != "N/A":
-        title = f"{meta['figure_code']} - {meta['short_name']}"
-    else:
-        title = f"{meta['short_name']}"
-    card = f"""
-    <div class="card card-image">
-        <div class="card-content">
-            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
-            <img src="data:image/png;base64, { img_str }" alt="Alt text" />
-            <p class='ai-generated'>AI-generated description</p>
-            <p>{content}</p>
-            {relevancy_score}
-        </div>
-        <div class="card-footer">
-            <span>{name}</span>
-            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
-    return card
-def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
-    if checked:
-        span = "<span class='checkmark'>&#10003;</span>"
-    else:
-        span = "<span class='loader'></span>"
-#     toolbox = f"""
-# <div class="dropdown">
-# <label for="{elem_id}" class="dropdown-toggle">
-#     {span}
-#     {tool_name}
-#     <span class="caret"></span>
-# </label>
-# <input type="checkbox" id="{elem_id}" hidden/>
-# <div class="dropdown-content">
-#     <p>{description}</p>
-# </div>
-# </div>
-# """
-    toolbox = f"""
-<div class="dropdown">
-<label for="{elem_id}" class="dropdown-toggle">
-    {span}
-    {tool_name}
-</label>
-</div>
-"""
-    return toolbox

requirements.txt CHANGED Viewed

@@ -1,21 +1,13 @@
-gradio==5.0.2
 azure-storage-file-share==12.11.1
 azure-storage-blob
 python-dotenv==1.0.0
-langchain==0.2.1
-langchain_openai==0.1.7
-langgraph==0.0.55
-pinecone-client==4.1.0
 sentence-transformers==2.6.0
 huggingface-hub
 pyalex==0.13
 networkx==3.2.1
-pyvis==0.3.2
-flashrank==0.2.5
-rerankers==0.3.0
-torch==2.3.0
-nvidia-cudnn-cu12==8.9.2.26
-langchain-community==0.2
-msal==1.31
-matplotlib==3.9.2
-gradio-modal==0.0.4

+gradio==4.19.1
 azure-storage-file-share==12.11.1
 azure-storage-blob
 python-dotenv==1.0.0
+langchain==0.1.4
+langchain_openai==0.0.6
+pinecone-client==3.0.2
 sentence-transformers==2.6.0
 huggingface-hub
+msal
 pyalex==0.13
 networkx==3.2.1
+pyvis==0.3.2

sandbox/20240310 - CQA - Semantic Routing 1.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

sandbox/20240702 - CQA - Graph Functionality.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

sandbox/20241104 - CQA - StepByStep CQA.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

style.css CHANGED Viewed

@@ -3,79 +3,6 @@
     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
   } */
-#tab-recommended_content{
-    padding-top: 0px;
-    padding-left : 0px;
-    padding-right: 0px;
-}
-#group-subtabs {
-    /* display: block; */
-    position : sticky;
-}
-}
-#papers-summary-popup button span{
-    /* make label of accordio in bold, center, and bigger */
-    font-size: 16px;
-    font-weight: bold;
-    text-align: center;
-}
-#papers-relevant-popup span{
-    /* make label of accordio in bold, center, and bigger */
-    font-size: 16px;
-    font-weight: bold;
-    text-align: center;
-}
-#tab-citations .button{
-    padding: 12px 16px;
-    font-size: 16px;
-    font-weight: bold;
-    cursor: pointer;
-    border: none;
-    outline: none;
-    text-align: left;
-    transition: background-color 0.3s ease;
-}
-.gradio-container {
-    width: 100%!important;
-    max-width: 100% !important;
-}
-/* fix for huggingface infinite growth*/
-main.flex.flex-1.flex-col {
-    max-height: 95vh !important;
-}
-button#show-figures{
-    /* Base styles */
-    background-color: #f5f5f5;
-    border: 1px solid #e0e0e0;
-    border-radius: 4px;
-    color: #333333;
-    cursor: pointer;
-    width: 100%;
-    text-align: center;
-}
-.avatar-container.svelte-1x5p6hu:not(.thumbnail-item) img {
-    width: 100%;
-    height: 100%;
-    object-fit: cover;
-    border-radius: 50%;
-    padding: 0px;
-    margin: 0px;
-}
 .warning-box {
     background-color: #fff3cd;
     border: 1px solid #ffeeba;
@@ -130,26 +57,14 @@ body.dark .tip-box * {
 .message{
     font-size:14px !important;
-}
-.card-content img {
-    display: block;
-    margin: auto;
-    max-width: 100%; /* Ensures the image is responsive */
-    height: auto;
 }
 a {
     text-decoration: none;
     color: inherit;
 }
-.doc-ref sup{
-    color:#dc2626!important;
-    /* margin-right:1px; */
-}
 .card {
     background-color: white;
     border-radius: 10px;
@@ -213,183 +128,94 @@ a {
     border:none;
 }
-label.selected{
-  background: #93c5fd !important;
 }
-#submit-button{
-    padding:0px !important;
 }
-#modal-config .block.modal-block.padded {
-    padding-top: 25px;
-    height: 100vh;
-}
-#modal-config .modal-container{
-    margin: 0px;
-    padding: 0px;
-}
-/* Modal styles */
-#modal-config {
-    position: fixed;
-    top: 0;
-    left: 0;
-    height: 100vh;
-    width: 500px;
-    background-color: white;
-    box-shadow: 2px 0 10px rgba(0, 0, 0, 0.1);
-    z-index: 1000;
-    padding: 15px;
-    transform: none;
-}
-#modal-config .close{
-    display: none;
 }
-/* Push main content to the right when modal is open */
-/* .modal ~ * {
-    margin-left: 300px;
-    transition: margin-left 0.3s ease;
 } */
-#modal-config .modal .wrap ul{
-    position:static;
-    top: 100%;
-    left: 0;
-    /* min-height: 100px; */
-    height: 100%;
-    /* margin-top: 0; */
-    z-index: 9999;
-    pointer-events: auto;
-    height: 200px;
-}
-#config-button{
-    background: none;
-    border: none;
-    padding: 8px;
-    cursor: pointer;
-    width: 40px;
-    height: 40px;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    border-radius: 50%;
-    transition: background-color 0.2s;
-}
-#config-button::before {
-    content: '⚙️';
-    font-size: 20px;
-}
-#config-button:hover {
-    background-color: rgba(0, 0, 0, 0.1);
-}
-#checkbox-config{
-    display: block;
     position: absolute;
-    background: none;
-    border: none;
-    padding: 8px;
-    cursor: pointer;
-    width: 40px;
-    height: 40px;
-    display: flex;
-    align-items: center;
-    justify-content: center;
     border-radius: 50%;
-    transition: background-color 0.2s;
-    font-size: 20px;
-    text-align: center;
-}
-#checkbox-config:checked{
-    display: block;
 }
 @media screen and (min-width: 1024px) {
-    /* Additional style for scrollable tab content */
-    /* div#tab-recommended_content {
-        overflow-y: auto;
-        max-height: 80vh;
-    } */
-    .gradio-container {
-        max-height: calc(100vh - 190px) !important;
-        overflow: hidden;
-    }
-    /* div#chatbot{
-        height:calc(100vh - 170px) !important;
-        max-height:calc(100vh - 170px) !important;
-    } */
     div#tab-examples{
         height:calc(100vh - 190px) !important;
-        overflow-y: scroll !important;
-        /* overflow-y: auto; */
     }
     div#sources-textbox{
         height:calc(100vh - 190px) !important;
-        overflow-y: scroll !important;
-        /* overflow-y: auto !important; */
-    }
-    div#graphs-container{
-        height:calc(100vh - 210px) !important;
-        overflow-y: scroll !important;
-    }
-    div#sources-figures{
-        height:calc(100vh - 300px) !important;
-        max-height: 90vh !important;
-        overflow-y: scroll !important;
-    }
-    div#graphs-container{
-        height:calc(100vh - 300px) !important;
-        max-height: 90vh !important;
-        overflow-y: scroll !important;
-    }
-    div#tab-citations{
-        height:calc(100vh - 300px) !important;
-        max-height: 90vh !important;
-        overflow-y: scroll !important;
     }
     div#tab-config{
         height:calc(100vh - 190px) !important;
-        overflow-y: scroll !important;
-        /* overflow-y: auto !important; */
     }
-    /* Force container to respect height limits */
-    .main-component{
-        contain: size layout;
-        overflow: hidden;
-    }
     div#chatbot-row{
-        max-height:calc(100vh - 90px) !important;
     }
-/*
     .max-height{
         height:calc(100vh - 90px) !important;
-        max-height:calc(100vh - 90px) !important;
         overflow-y: auto;
     }
-*/
 }
 footer {
@@ -432,33 +258,21 @@ footer {
     /* ... add other mobile-specific styles ... */
 }
-@media (prefers-color-scheme: dark) {
-    .card{
-        background-color: #374151;
-    }
-    .card-image > .card-content{
-        background-color: rgb(55, 65, 81) !important;
-    }
-    .card-footer {
-        background-color: #404652;
-    }
-    .container > .wrap{
-        background-color: #374151 !important;
-        color:white !important;
-    }
-    .card-content h2{
-        color:#e7754f !important;
-    }
-    .doc-ref sup{
-        color:rgb(235 109 35)!important;
-        /* margin-right:1px; */
-    }
-    .card-footer span {
-        color:white !important;
-    }
 }
@@ -504,7 +318,7 @@ span.chatbot > p > img{
 }
 .card-image > .card-content{
-    background-color:#f1f7fa;
 }
@@ -530,7 +344,8 @@ span.chatbot > p > img{
 }
 #dropdown-samples{
   background:none !important;
 }
@@ -548,190 +363,3 @@ span.chatbot > p > img{
 .a-doc-ref{
 	text-decoration: none !important;
 }
-.dropdown {
-    position: relative;
-    display:inline-block;
-    margin-bottom: 10px;
-  }
-  .dropdown-toggle {
-    background-color: #f2f2f2;
-    color: black;
-    padding: 10px;
-    font-size: 16px;
-    cursor: pointer;
-    display: block;
-    width: 400px; /* Adjust width as needed */
-    position: relative;
-    display: flex;
-    align-items: center; /* Vertically center the contents */
-    justify-content: left;
-  }
-  .dropdown-toggle .caret {
-    content: "";
-    position: absolute;
-    right: 10px;
-    top: 50%;
-    border-left: 5px solid transparent;
-    border-right: 5px solid transparent;
-    border-top: 5px solid black;
-    transform: translateY(-50%);
-  }
-  input[type="checkbox"] {
-    display: none !important;
-  }
-  input[type="checkbox"]:checked + .dropdown-content {
-    display: block;
-  }
-  #checkbox-chat input[type="checkbox"] {
-    display: flex !important;
-  }
-  .dropdown-content {
-    display: none;
-    position: absolute;
-    background-color: #f9f9f9;
-    min-width: 300px;
-    box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
-    z-index: 1;
-    padding: 12px;
-    border: 1px solid #ccc;
-  }
-  input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {
-    display: block;
-  }
-  input[type="checkbox"]:checked + .dropdown-toggle .caret {
-    border-top: 0;
-    border-bottom: 5px solid black;
-  }
-.loader {
-    border: 1px solid #d0d0d0 !important; /* Light grey background */
-    border-top: 1px solid #db3434 !important; /* Blue color */
-    border-right: 1px solid #3498db !important; /* Blue color */
-    border-radius: 50%;
-    width: 20px;
-    height: 20px;
-    animation: spin 2s linear infinite;
-    display:inline-block;
-    margin-right:10px !important;
-}
-.checkmark{
-    color:green !important;
-    font-size:18px;
-    margin-right:10px !important;
-}
-@keyframes spin {
-    0% { transform: rotate(0deg); }
-    100% { transform: rotate(360deg); }
-}
-.relevancy-score{
-    margin-top:10px !important;
-    font-size:10px !important;
-    font-style:italic;
-}
-.score-green{
-    color:green !important;
-}
-.score-orange{
-    color:orange !important;
-}
-.score-red{
-    color:red !important;
-}
-/* Mobile specific adjustments */
-@media screen and (max-width: 767px) {
-    div#tab-recommended_content {
-        max-height: 50vh; /* Reduce height for smaller screens */
-        overflow-y: auto;
-    }
-}
-/* Additional style for scrollable tab content */
-div#tab-saved-graphs {
-    overflow-y: auto; /* Enable vertical scrolling */
-    max-height: 80vh; /* Adjust height as needed */
-}
-/* Mobile specific adjustments */
-@media screen and (max-width: 767px) {
-    div#tab-saved-graphs {
-        max-height: 50vh; /* Reduce height for smaller screens */
-        overflow-y: auto;
-    }
-}
-.message-buttons-left.panel.message-buttons.with-avatar {
-    display: none;
-}
-/* Specific fixes for Hugging Face Space iframe */
-.h-full {
-    height: auto !important;
-    min-height: 0 !important;
-}
-.space-content {
-    height: auto !important;
-    max-height: 100vh !important;
-    overflow: hidden;
-}
-/* Mobile specific modal configuration */
-@media screen and (max-width: 767px) {
-    #modal-config {
-        width: 100%; /* Full width on mobile */
-        height: 100vh;
-        left: 0;
-        top: 0;
-        padding: 10px; /* Reduced padding for mobile */
-    }
-    #modal-config .block.modal-block.padded {
-        padding-top: 15px; /* Reduced top padding */
-        height: 100vh;
-        overflow-y: auto; /* Enable scrolling */
-    }
-    #modal-config .modal-container {
-        width: 100%;
-        height: 100%;
-    }
-    /* Show close button on mobile */
-    #modal-config .close {
-        display: block;
-        position: absolute;
-        top: 10px;
-        right: 10px;
-        z-index: 1001;
-        padding: 8px;
-        font-size: 24px;
-        background: none;
-        border: none;
-        cursor: pointer;
-    }
-    /* Ensure modal content is scrollable on mobile */
-    #modal-config .modal .wrap ul {
-        max-height: calc(100vh - 60px); /* Account for header space */
-        overflow-y: auto;
-    }
-}

     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
   } */
 .warning-box {
     background-color: #fff3cd;
     border: 1px solid #ffeeba;
 .message{
     font-size:14px !important;
 }
 a {
     text-decoration: none;
     color: inherit;
 }
 .card {
     background-color: white;
     border-radius: 10px;
     border:none;
 }
+/* .gallery-item > div:hover{
+    background-color:#7494b0 !important;
+    color:white!important;
 }
+.gallery-item:hover{
+    border:#7494b0 !important;
 }
+.gallery-item > div{
+    background-color:white !important;
+    color:#577b9b!important;
 }
+.label{
+    color:#577b9b!important;
 } */
+/* .paginate{
+    color:#577b9b!important;
+} */
+/* span[data-testid="block-info"]{
+    background:none !important;
+    color:#577b9b;
+  } */
+/* Pseudo-element for the circularly cropped picture */
+/* .message.bot::before {
+    content: '';
     position: absolute;
+    top: -10px;
+    left: -10px;
+    width: 30px;
+    height: 30px;
+    background-image: var(--user-image);
+    background-size: cover;
+    background-position: center;
     border-radius: 50%;
+    z-index: 10;
+  }
+   */
+label.selected{
+  background:none !important;
 }
+#submit-button{
+    padding:0px !important;
+}
 @media screen and (min-width: 1024px) {
     div#tab-examples{
         height:calc(100vh - 190px) !important;
+        overflow-y: auto;
     }
     div#sources-textbox{
         height:calc(100vh - 190px) !important;
+        overflow-y: auto !important;
     }
     div#tab-config{
         height:calc(100vh - 190px) !important;
+        overflow-y: auto !important;
     }
     div#chatbot-row{
+        height:calc(100vh - 90px) !important;
     }
+    div#chatbot{
+        height:calc(100vh - 170px) !important;
+    }
     .max-height{
         height:calc(100vh - 90px) !important;
         overflow-y: auto;
     }
+    /* .tabitem:nth-child(n+3) {
+        padding-top:30px;
+        padding-left:40px;
+        padding-right:40px;
+    } */
 }
 footer {
     /* ... add other mobile-specific styles ... */
 }
+body.dark .card{
+    background-color: #374151;
+}
+body.dark .card-content h2{
+    color:#f4dbd3 !important;
+}
+body.dark .card-footer {
+    background-color: #404652;
+}
+body.dark .card-footer span {
+    color:white !important;
 }
 }
 .card-image > .card-content{
+    background-color:#f1f7fa !important;
 }
 }
 #dropdown-samples{
+  /*! border:none !important; */
+  /*! border-width:0px !important; */
   background:none !important;
 }
 .a-doc-ref{
 	text-decoration: none !important;
 }

test.json DELETED Viewed

File without changes