# Streamlit app for the Constitution of the Republic of Poland: one tab runs
# an embedding-based search over the articles, the other renders the full
# text as Markdown.
import json
import re

import pandas as pd
import streamlit as st
from langchain.retrievers import EnsembleRetriever
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

# Location of the parsed constitution (JSON).
_DATA_PATH = "data/test/constitution.json"
# Separator between an article's text and the chapter/subchapter/article
# context appended to it for indexing; format_single() cuts on it again.
_SEARCH_META_SEPARATOR = "\n\n\n"
# Article key used for text that carries no article number (rendered as
# "Preambuła" in the search results).
_NO_ARTICLE_NUMBER = "-1"


@st.cache_data
def load_data():
    """Load the constitution from the JSON file at ``_DATA_PATH``.

    Returns:
        The decoded JSON document.
    """
    # JSON is UTF-8 by specification; being explicit keeps the Polish text
    # intact on platforms whose default locale encoding is not UTF-8.
    with open(_DATA_PATH, encoding="utf-8") as f:
        return json.load(f)


def format_for_search(chapter, subchapter, article_number, article_content):
    """Build the text that gets indexed for a single article.

    The article text comes first, followed by the separator and the
    chapter / subchapter / article labels, so that the labels are part of
    the indexed text and can be stripped again for display.

    Args:
        chapter: Mapping with ``"number"`` and ``"title"`` keys.
        subchapter: Mapping with a ``"heading"`` key (may be empty).
        article_number: Article key; ``"-1"`` means "no article label".
        article_content: Text of the article.

    Returns:
        ``article_content`` + separator + context labels.
    """
    heading = subchapter["heading"]
    subchapter_name = f"\n{heading}\n" if heading != "" else ""

    if article_number == _NO_ARTICLE_NUMBER:
        article_label = ""
    else:
        article_label = f"\nArtykuł {article_number}\n"

    # Chapter number 0 is labelled with its title only, without "Rozdział".
    if chapter["number"] == 0:
        chapter_name = chapter["title"]
    else:
        chapter_name = f"Rozdział {chapter['number']} {chapter['title']}"

    return (
        f"{article_content}{_SEARCH_META_SEPARATOR}"
        f"{chapter_name}\n{subchapter_name}{article_label}"
    )


def _build_documents(constitution):
    """Create one ``Document`` per article, with its position as metadata."""
    docs = []
    for chapter in constitution["chapters"]:
        for subchapter_number, subchapter in chapter["subchapters"].items():
            for article_number, article_content in subchapter["articles"].items():
                docs.append(
                    Document(
                        page_content=format_for_search(
                            chapter, subchapter, article_number, article_content
                        ),
                        metadata={
                            "chapter_number": chapter["number"],
                            "chapter_title": chapter["title"],
                            "subchapter_title": subchapter["heading"],
                            "subchapter_number": subchapter_number,
                            "article_number": article_number,
                        },
                    )
                )
    return docs


constitution = load_data()
# NOTE(review): debug dump of the whole document to stdout on every script
# run; kept to preserve existing behavior - consider removing.
print(constitution)

documents = _build_documents(constitution)


def get_full_content(constitution):
    """Render the whole constitution as a single Markdown string.

    Args:
        constitution: Mapping with a ``"chapters"`` list; each chapter has
            ``"number"``, ``"title"`` and ``"subchapters"``; each subchapter
            has ``"heading"`` and ``"articles"``.

    Returns:
        Markdown text: document header, then chapters, subchapter headings
        and articles in source order.
    """
    # Collect fragments and join once instead of repeated string "+=".
    parts = [
        "# KONSTYTUCJA\n# RZECZYPOSPOLITEJ POLSKIEJ\n\n## z dnia 2 kwietnia 1997 r.\n"
    ]
    for chapter in constitution["chapters"]:
        if chapter["number"] != 0:
            chapter_prefix = f"## **Rozdział {chapter['number']}**\n\n"
        else:
            chapter_prefix = ""
        chapter_title = f"**{chapter['title']}**" if chapter["title"].strip() else ""
        parts.append(f"\n\n{chapter_prefix} {chapter_title}")

        for subchapter in chapter["subchapters"].values():
            # Blank headings produce no heading line at all.
            if subchapter["heading"].strip():
                parts.append(f"\n\n### **{subchapter['heading']}**")

            for article_number, article_content in subchapter["articles"].items():
                if article_number == _NO_ARTICLE_NUMBER:
                    article_prefix = ""
                else:
                    article_prefix = f"**Art. {article_number}.**\n\n"
                parts.append(f"\n\n{article_prefix}{article_content}")
    return "".join(parts)


@st.cache_resource
def load_models():
    """Build the embedding model, the vector store and the query retriever.

    Reads the module-level ``documents`` list. Cached by Streamlit so the
    model and the index are built once rather than on every rerun.

    Returns:
        Tuple ``(embeddings, vector_store, retriever)`` where ``retriever``
        prefixes the query with ``"Pytanie: "`` before searching the store.
    """
    embeddings = HuggingFaceEmbeddings(model_name="ipipan/silver-retriever-base-v1")
    vector_store = Chroma.from_documents(documents=documents, embedding=embeddings)
    embeddings_retriever = vector_store.as_retriever()
    # NOTE(review): the "Pytanie: " prefix is presumably the query format the
    # embedding model expects - confirm against the model card.
    formatter = RunnableLambda(lambda x: f"Pytanie: {x}")
    formatting_retriever = formatter | embeddings_retriever
    # TODO: hybrid search - an EnsembleRetriever over a BM25Retriever and
    # formatting_retriever with weights [0.5, 0.5] was sketched here but is
    # disabled; the BM25 index is no longer built while it stays unused.
    return embeddings, vector_store, formatting_retriever


def format_single(doc):
    """Format one retrieved document as Markdown: bold label, then text."""
    # Drop the search context that format_for_search() appended.
    article_text = doc.page_content.partition(_SEARCH_META_SEPARATOR)[0]
    if doc.metadata["article_number"] == _NO_ARTICLE_NUMBER:
        prefix = "**Preambuła**"
    else:
        prefix = f"**Art. {doc.metadata['article_number']}.**"
    return f"{prefix}\n\n{article_text}"


def format_docs(docs):
    """Join the formatted documents, separated by blank lines."""
    return "\n\n".join(format_single(doc) for doc in docs)


embeddings, vector_store, retriever = load_models()

st.title('Konstytucja RP')

tab1, tab2 = st.tabs(["Wyszukiwarka", "Treść"])

with tab1:
    question = st.text_input(
        'Zadaj pytanie:',
        'Czy zgodnie z konstytucją wszyscy są równi wobec prawa?',
    )
    retrieved_docs = retriever.invoke(question)
    # Format once and render that result.
    result = format_docs(retrieved_docs)
    st.markdown(result)

with tab2:
    st.markdown(get_full_content(constitution))