import base64
import re
from collections import defaultdict
from io import BytesIO

from PIL import Image

from climateqa.engine.chains.prompts import audience_prompts
from climateqa.utils import get_image_from_azure_blob_storage

# NOTE(review): several string templates below consist only of text and
# newlines, and a number of locals (colour, name, url, ...) are computed but
# never interpolated. Presumably the surrounding markup was lost from this copy
# of the file -- confirm against the upstream version before relying on the
# rendered output. The literals are reproduced exactly as they appear here.

# Matches "[Doc 1]" as well as grouped references such as "[Doc 1, Doc 2]".
_DOC_REFERENCE_RE = re.compile(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]')

# Width, in pixels, that figures are resized to before being base64-encoded.
_MAX_IMAGE_LENGTH = 500


def _score_color(score):
    """Return the colour label for a relevancy score.

    Scores above 0.8 map to ``score-green``, scores above 0.5 to
    ``score-orange`` and everything else to ``score-red``.
    """
    if score > 0.8:
        return "score-green"
    if score > 0.5:
        return "score-orange"
    return "score-red"


def _figure_title(meta):
    """Return the display title of a figure from its metadata dict.

    The figure code is prepended to the short name unless it is ``"N/A"``.
    """
    if meta["figure_code"] != "N/A":
        return f"{meta['figure_code']} - {meta['short_name']}"
    return f"{meta['short_name']}"


def make_pairs(lst:list)->list:
    """Group a list of even length into consecutive 2-tuples.

    ``[a, b, c, d]`` becomes ``[(a, b), (c, d)]``. A list of odd length raises
    ``IndexError`` because the last element has no partner.
    """
    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]


def serialize_docs(docs:list)->list:
    """Convert document objects into plain dicts.

    Each item of *docs* must expose ``page_content`` and ``metadata``
    attributes; the result holds one ``{"page_content", "metadata"}`` dict per
    document, in the same order.
    """
    return [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in docs
    ]


def parse_output_llm_with_sources(output:str)->str:
    """Replace ``[Doc X]`` / ``[Doc X, Doc Y]`` references in *output*.

    Every bracketed reference is replaced by the concatenation of the
    per-document snippets built from its document numbers; all other text is
    returned untouched.
    """
    # With a single capturing group, re.split alternates plain text (even
    # indices) and captured references (odd indices). Relying on the position
    # rather than on ``startswith("Doc")`` keeps ordinary text that merely
    # begins with "Doc" (e.g. "Documents show ...") from being mangled.
    pieces = _DOC_REFERENCE_RE.split(output)
    parts = []
    for index, piece in enumerate(pieces):
        if index % 2 == 1:
            numbers = [
                subpart.lower().replace("doc", "").strip()
                for subpart in piece.split(",")
            ]
            parts.append("".join(f"""{number}""" for number in numbers))
        else:
            parts.append(piece)
    return "".join(parts)


def process_figures(docs:list)->tuple:
    """Build the figures HTML and the image gallery for the image documents.

    Only documents whose ``metadata["chunk_type"]`` is ``"image"`` are
    considered, and each distinct figure title is processed at most once. For
    every such document the image is fetched with
    ``get_image_from_azure_blob_storage``, resized to a width of 500 pixels
    (keeping the aspect ratio), encoded as base64 PNG and rendered through
    ``make_html_figure_sources``.

    Returns:
        A ``(figures, gallery)`` tuple: the accumulated HTML string and the
        list of fetched (un-resized) images.

    Images that cannot be fetched or converted are skipped with a printed
    message; their title still counts as seen.
    """
    gallery = []
    seen_titles = set()
    figures = "\n\n"
    image_docs = [d for d in docs if d.metadata["chunk_type"] == "image"]

    for i, doc in enumerate(image_docs):
        title = _figure_title(doc.metadata)
        if title in seen_titles:
            continue
        seen_titles.add(title)

        # Best effort: a single broken image must not prevent the remaining
        # figures from being rendered.
        try:
            image_path = doc.metadata["image_path"].split("documents/")[1]
            img = get_image_from_azure_blob_storage(image_path)

            # Keep the aspect ratio; clamp the height so that an extremely
            # wide image does not collapse to a zero-pixel height.
            width, height = img.size
            new_height = max(1, int(_MAX_IMAGE_LENGTH * height / width))
            img_resized = img.resize((_MAX_IMAGE_LENGTH, new_height))

            # Convert the resized image to a base64-encoded PNG.
            with BytesIO() as buffered:
                img_resized.save(buffered, format="PNG")
                img_str = base64.b64encode(buffered.getvalue()).decode()

            # NOTE(review): the card is numbered with the 0-based position
            # among the image documents -- confirm whether 1-based numbering
            # was intended.
            figures = figures + make_html_figure_sources(doc, i, img_str)
            gallery.append(img)
        except Exception as e:
            print(f"Skipped adding image {i} because of {e}")

    return figures, gallery


def generate_html_graphs(graphs:list)->str:
    """Assemble one HTML string with the graph embeddings grouped by category.

    Each item of *graphs* is a mapping providing ``["metadata"]["category"]``
    and ``["embedding"]`` (a string). Categories keep the order in which they
    first appear; embeddings keep their input order within a category.
    """
    # Organize graphs by category
    categories = defaultdict(list)
    for graph in graphs:
        categories[graph['metadata']['category']].append(graph['embedding'])

    # Begin constructing the HTML
    chunks = [" Graphs by Category\n"]

    # Add buttons for each category
    for i, category in enumerate(categories):
        # NOTE(review): not interpolated by the template visible here.
        active_class = 'active' if i == 0 else ''
        chunks.append("")
    chunks.append("\n")

    # Add content for each category
    for i, (category, embeds) in enumerate(categories.items()):
        # NOTE(review): not interpolated by the template visible here.
        active_class = 'active' if i == 0 else ''
        chunks.append("\n")
        chunks.extend(embeds)
        chunks.append("\n")

    chunks.append(" ")
    return "".join(chunks)


def make_html_source(source,i):
    """Render the card for one retrieved source.

    Args:
        source: object exposing ``page_content`` (str) and ``metadata`` (dict
            with ``toc_level0``, ``toc_level1``, ``name``, ``reranking_score``,
            ``chunk_type``, ``short_name``, ``page_number`` and, for non-text
            chunks, ``figure_code``).
        i: number shown in the card heading.

    Returns:
        The card string: a "Doc" card when ``chunk_type`` is ``"text"``, an
        "Image" card otherwise.
    """
    meta = source.metadata
    content = source.page_content.strip()

    # Collect the table-of-contents levels up to the first "N/A" entry.
    toc_levels = []
    for j in range(2):
        level = meta[f"toc_level{j}"]
        if level == "N/A":
            break
        toc_levels.append(level)
    toc_path = " > ".join(toc_levels)

    # NOTE(review): ``name`` and ``color`` are not interpolated by the
    # templates visible here.
    if toc_path:
        name = f"{toc_path}\n{meta['name']}"
    else:
        name = meta['name']

    score = meta['reranking_score']
    color = _score_color(score)
    relevancy_score = f"\n\nRelevancy score: {score:.1%}\n\n"

    if meta["chunk_type"] == "text":
        card = (
            "\n\n"
            f"Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}\n\n"
            f"{content}\n\n"
            f"{relevancy_score}\n"
        )
    else:
        title = _figure_title(meta)
        card = (
            "\n\n"
            f"Image {i} - {title} - Page {int(meta['page_number'])}\n\n"
            "AI-generated description\n\n"
            f"{content}\n\n"
            f"{relevancy_score}\n"
        )

    return card


def make_html_df(df,i):
    """Render the card for row *i* of a table of papers.

    *df* is indexed as ``df[column][i]`` and must provide the ``title``,
    ``abstract``, ``doi`` and ``publication_year`` columns. The heading shows
    the 1-based number ``i + 1``.
    """
    title = df['title'][i]
    content = df['abstract'][i]
    # NOTE(review): read but not interpolated by the template visible here.
    url = df['doi'][i]
    publication_date = df['publication_year'][i]

    card = (
        "\n\n"
        f"Doc {i+1} - {title}\n\n"
        f"{content}\n\n"
    )
    return card


def make_html_figure_sources(source,i,img_str):
    """Render the card for one figure.

    Args:
        source: object exposing ``page_content`` (str) and ``metadata`` (dict
            with ``reranking_score``, ``name``, ``figure_code``,
            ``short_name`` and ``page_number``).
        i: number shown in the card heading.
        img_str: base64-encoded image data.

    Returns:
        The card string.
    """
    meta = source.metadata
    content = source.page_content.strip()

    score = meta['reranking_score']
    # NOTE(review): ``color``, ``name`` and ``img_str`` are not interpolated by
    # the template visible here.
    color = _score_color(score)

    # No table-of-contents levels are gathered for figures, so the name is
    # always the bare document name.
    name = meta['name']

    relevancy_score = f"\n\nRelevancy score: {score:.1%}\n\n"
    title = _figure_title(meta)

    card = (
        "\n\n"
        f"Image {i} - {title} - Page {int(meta['page_number'])}\n\n"
        "Alt text\n\n"
        "AI-generated description\n\n"
        f"{content}\n\n"
        f"{relevancy_score}\n"
    )
    return card


def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
    """Return the toolbox snippet for a tool toggle.

    NOTE(review): with the template visible here none of the arguments
    influence the result and both ``checked`` branches yield the same value.
    """
    if checked:
        span = ""
    else:
        span = ""

    toolbox = " "
    return toolbox