# Spaces: Sleeping
import streamlit as st | |
import pandas as pd | |
import json | |
import re | |
from langchain_core.documents import Document | |
from langchain_chroma import Chroma | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain_community.retrievers import BM25Retriever | |
from langchain.retrievers import EnsembleRetriever | |
from langchain_core.runnables import ( | |
RunnableLambda | |
) | |
from langchain_core.documents import Document | |
def load_data():
    """Load the constitution from its JSON data file.

    The path is relative to the current working directory.

    Returns:
        The deserialized JSON document. The rest of this file indexes it
        as a mapping with a ``"chapters"`` entry.

    Raises:
        FileNotFoundError: If the data file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without it the platform's default
    # encoding is used, which is not UTF-8 everywhere and would corrupt
    # (or fail on) non-ASCII text such as Polish diacritics.
    with open("data/test/constitution.json", encoding="utf-8") as f:
        return json.load(f)
def format_for_search(chapter, subchapter, article_number, article_content):
    """Build the searchable text for a single article.

    The article body comes first, then a separator of three newlines, then
    a trailer naming the chapter, the subchapter heading (if any) and the
    article label (if any).

    Args:
        chapter: Mapping with ``"number"`` and ``"title"`` entries. A
            number of ``0`` means the title is used without a
            "Rozdział <number>" prefix.
        subchapter: Mapping with a ``"heading"`` entry; an empty heading
            contributes nothing to the trailer.
        article_number: Article identifier; the value ``"-1"`` suppresses
            the article label.
        article_content: The article body.

    Returns:
        The body and the trailer combined into one string.
    """
    heading = subchapter["heading"]
    heading_part = heading if heading == "" else f"\n{heading}\n"

    article_part = "" if article_number == "-1" else f"\nArtykuł {article_number}\n"

    chapter_label = (
        chapter["title"]
        if chapter["number"] == 0
        else f"Rozdział {chapter['number']} {chapter['title']}"
    )

    trailer = f"{chapter_label}\n{heading_part}{article_part}"
    return f"{article_content}\n\n\n{trailer}"
constitution = load_data()
# Dumps the loaded data to stdout.
print(constitution)


def _article_documents(source):
    """Create one ``Document`` per article found in *source*.

    Walks chapters, then subchapters, then articles. Each document's text
    comes from ``format_for_search`` and its metadata records the chapter
    number and title, the subchapter title and number, and the article
    number.
    """
    collected = []
    for chap in source["chapters"]:
        for sub_no, sub in chap["subchapters"].items():
            for art_no, art_text in sub["articles"].items():
                meta = {
                    "chapter_number": chap["number"],
                    "chapter_title": chap["title"],
                    "subchapter_title": sub["heading"],
                    "subchapter_number": sub_no,
                    "article_number": art_no,
                }
                collected.append(
                    Document(
                        page_content=format_for_search(chap, sub, art_no, art_text),
                        metadata=meta,
                    )
                )
    return collected


documents = _article_documents(constitution)
def get_full_content(constitution):
    """Render the whole constitution as a single Markdown string.

    The output starts with a fixed title header. Each chapter adds an
    optional "Rozdział <number>" heading (skipped when the number is 0)
    and its bold title (skipped when blank). Each subchapter adds its
    heading when non-blank, and each article adds an "Art. <n>." label
    (skipped for article number ``"-1"``) followed by its text.

    Args:
        constitution: Mapping with a ``"chapters"`` list. Every chapter has
            ``"number"``, ``"title"`` and a ``"subchapters"`` mapping whose
            values have ``"heading"`` and an ``"articles"`` mapping.

    Returns:
        The Markdown text.
    """
    # Collect fragments in order and join once at the end.
    pieces = ["# KONSTYTUCJA\n# RZECZYPOSPOLITEJ POLSKIEJ\n\n## z dnia 2 kwietnia 1997 r.\n"]

    for chapter in constitution["chapters"]:
        number = chapter['number']
        title = chapter['title']
        numbered_heading = '' if number == 0 else f"## **Rozdział {number}**\n\n"
        bold_title = '' if title.strip() == '' else f"**{title}**"
        pieces.append(f"\n\n{numbered_heading} {bold_title}")

        for subchapter in chapter["subchapters"].values():
            heading = subchapter['heading']
            if heading.strip() != '':
                pieces.append(f"\n\n### **{heading}**")

            for article_number, article_content in subchapter["articles"].items():
                label = "" if article_number == "-1" else f"**Art. {article_number}.**\n\n"
                pieces.append(f"\n\n{label}{article_content}")

    return "".join(pieces)
@st.cache_resource
def load_models():
    """Build the embedding model, vector store and retriever used for search.

    Streamlit re-executes the whole script on every user interaction.
    Without caching, each rerun would load the embedding model again and
    re-index every article; ``st.cache_resource`` makes the first call do
    the work and later calls return the same objects.

    Returns:
        A ``(embeddings, vector_store, retriever)`` tuple: the HuggingFace
        embedding model, the Chroma store built from the module-level
        ``documents`` list, and a retriever that prefixes the query before
        running the vector search.
    """
    embeddings = HuggingFaceEmbeddings(model_name="ipipan/silver-retriever-base-v1")
    vector_store = Chroma.from_documents(documents=documents, embedding=embeddings)
    embeddings_retriever = vector_store.as_retriever()

    # Every query is wrapped as "Pytanie: <query>" before retrieval.
    # NOTE(review): presumably the query format this embedding model
    # expects - confirm against the model card.
    formatter = RunnableLambda(lambda x: f"Pytanie: {x}")
    formatting_retriever = formatter | embeddings_retriever

    # TODO: hybrid search was sketched here as an EnsembleRetriever that
    # combines a BM25Retriever over `documents` with `formatting_retriever`
    # at weights [0.5, 0.5]. The BM25 index is no longer built while that
    # stays disabled, since nothing consumed it.
    return embeddings, vector_store, formatting_retriever
embeddings, vector_store, retriever = load_models()


def format_single(doc):
    """Render one retrieved document as Markdown.

    The indexed text is the article body followed by a three-newline
    separator and a search-only trailer; only the part before the first
    separator is shown. Article number ``"-1"`` is labelled "Preambuła";
    every other article gets an "Art. <number>." label.
    """
    meta_removed = doc.page_content.split("\n\n\n")[0]
    if doc.metadata['article_number'] == "-1":
        prefix = "**Preambuła**"
    else:
        prefix = f"**Art. {doc.metadata['article_number']}.**"
    return f"{prefix}\n\n{meta_removed}"


def format_docs(docs):
    """Render all retrieved documents, separated by blank lines."""
    return "\n\n".join(format_single(doc) for doc in docs)


# Page layout: a search tab and a tab with the full text.
st.title('Konstytucja RP')
tab1, tab2 = st.tabs(["Wyszukiwarka", "Treść"])

with tab1:
    question = st.text_input('Zadaj pytanie:', 'Czy zgodnie z konstytucją wszyscy są równi wobec prawa?')
    retrieved_docs = retriever.invoke(question)
    # Format once and reuse the result for display.
    result = format_docs(retrieved_docs)
    st.markdown(result)

with tab2:
    st.markdown(get_full_content(constitution))