konstytucja-rag / main.py
Szymon Woźniak
initial commit
33f20db
raw
history blame
4.32 kB
import streamlit as st
import pandas as pd
import json
import re
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.runnables import (
RunnableLambda
)
from langchain_core.documents import Document
@st.cache_data
def load_data():
    """Load the parsed constitution from its JSON file.

    The function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    parsed result instead of re-reading the file on each script rerun.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``data/test/constitution.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit UTF-8: the text is Polish, and relying on the platform default
    # encoding would mis-decode (or fail on) diacritics on non-UTF-8 locales.
    with open("data/test/constitution.json", encoding="utf-8") as f:
        return json.load(f)
def format_for_search(chapter, subchapter, article_number, article_content):
    """Build the text that is indexed for a single article.

    The result is the article text, a three-newline separator, and then a
    trailer made of the chapter line, the optional subchapter heading and the
    optional article label.

    Args:
        chapter: Mapping with ``"number"`` and ``"title"`` entries. A number
            equal to ``0`` means the title is used without the
            "Rozdział <number>" prefix.
        subchapter: Mapping with a ``"heading"`` entry; an empty heading is
            left out of the trailer.
        article_number: Article identifier; the string ``"-1"`` suppresses
            the "Artykuł" label.
        article_content: The article text placed before the separator.

    Returns:
        The combined string described above.
    """
    heading = subchapter["heading"]
    heading_part = "" if heading == "" else f"\n{heading}\n"

    article_part = "" if article_number == "-1" else f"\nArtykuł {article_number}\n"

    if chapter["number"] == 0:
        chapter_part = chapter["title"]
    else:
        chapter_part = f"Rozdział {chapter['number']} {chapter['title']}"

    # The "\n\n\n" separator splits the article text from the trailer.
    return f"{article_content}\n\n\n{chapter_part}\n{heading_part}{article_part}"
def _build_documents(data):
    """Create one ``Document`` per article of the parsed constitution.

    Walks chapters -> subchapters -> articles in file order. Each document's
    text comes from ``format_for_search``; its metadata records the chapter
    number and title, the subchapter heading and key, and the article key.
    """
    docs = []
    for chapter in data["chapters"]:
        for subchapter_number, subchapter in chapter["subchapters"].items():
            for article_number, article_content in subchapter["articles"].items():
                page_content = format_for_search(
                    chapter, subchapter, article_number, article_content
                )
                metadata = {
                    "chapter_number": chapter["number"],
                    "chapter_title": chapter["title"],
                    "subchapter_title": subchapter["heading"],
                    "subchapter_number": subchapter_number,
                    "article_number": article_number,
                }
                docs.append(Document(page_content=page_content, metadata=metadata))
    return docs


constitution = load_data()
# NOTE(review): this dumps the whole parsed document to stdout each time the
# script runs; it looks like a debugging leftover.
print(constitution)
documents = _build_documents(constitution)
def get_full_content(constitution):
    """Render the whole constitution as a single markdown string.

    The output starts with a fixed title header. Every chapter then adds an
    optional "Rozdział <number>" heading (skipped when the number is ``0``)
    and an optional bold title (skipped when the title is blank), followed by
    its subchapter headings and articles. Articles keyed ``"-1"`` are emitted
    without an "Art." label.

    Args:
        constitution: Mapping with a ``"chapters"`` list. Each chapter has
            ``"number"``, ``"title"`` and a ``"subchapters"`` mapping whose
            values carry ``"heading"`` and an ``"articles"`` mapping of
            article number to article text.

    Returns:
        The markdown text.
    """
    # Collect fragments and join once instead of growing a string with `+=`,
    # which re-copies the accumulated text on every append.
    parts = ["# KONSTYTUCJA\n# RZECZYPOSPOLITEJ POLSKIEJ\n\n## z dnia 2 kwietnia 1997 r.\n"]
    for chapter in constitution["chapters"]:
        chapter_prefix = (
            f"## **Rozdział {chapter['number']}**\n\n" if chapter["number"] != 0 else ""
        )
        chapter_title = f"**{chapter['title']}**" if chapter["title"].strip() != "" else ""
        # The single space between prefix and title is part of the existing output.
        parts.append(f"\n\n{chapter_prefix} {chapter_title}")
        for subchapter in chapter["subchapters"].values():
            if subchapter["heading"].strip() != "":
                parts.append(f"\n\n### **{subchapter['heading']}**")
            for article_number, article_content in subchapter["articles"].items():
                article_prefix = (
                    "" if article_number == "-1" else f"**Art. {article_number}.**\n\n"
                )
                parts.append(f"\n\n{article_prefix}{article_content}")
    return "".join(parts)
@st.cache_resource
def load_models():
    """Create the embedding model, the vector index and the query retriever.

    Wrapped in ``st.cache_resource`` so Streamlit can reuse the created
    objects instead of rebuilding them on each script rerun. The index is
    built from the module-level ``documents`` list.

    Returns:
        A ``(embeddings, vector_store, retriever)`` tuple, where ``retriever``
        prepends ``"Pytanie: "`` to the query before searching the vector store.
    """
    embeddings = HuggingFaceEmbeddings(model_name="ipipan/silver-retriever-base-v1")
    vector_store = Chroma.from_documents(documents=documents, embedding=embeddings)
    embeddings_retriever = vector_store.as_retriever()

    # NOTE(review): the "Pytanie: " prefix is presumably the query format this
    # embedding model expects -- confirm against the model card.
    formatter = RunnableLambda(lambda x: f"Pytanie: {x}")
    formatting_retriever = formatter | embeddings_retriever

    # A BM25 index was previously built here but never used: its only consumer
    # (an EnsembleRetriever over BM25 + embeddings, weights 0.5/0.5) was
    # disabled. Rebuild it here if hybrid retrieval is switched back on.
    return embeddings, vector_store, formatting_retriever
def format_single(doc):
    """Render one retrieved document as markdown: a bold label, then the article text.

    Documents whose ``article_number`` metadata is ``"-1"`` are labelled
    "Preambuła"; all others are labelled "Art. <number>.".
    """
    # page_content is "<article text>\n\n\n<trailer>" (see format_for_search);
    # only the article text is shown to the user.
    meta_removed = doc.page_content.split("\n\n\n")[0]
    if doc.metadata["article_number"] == "-1":
        prefix = "**Preambuła**"
    else:
        prefix = f"**Art. {doc.metadata['article_number']}.**"
    return f"{prefix}\n\n{meta_removed}"


def format_docs(docs):
    """Join the markdown rendering of every document, separated by blank lines."""
    return "\n\n".join(format_single(doc) for doc in docs)


embeddings, vector_store, retriever = load_models()

st.title('Konstytucja RP')
tab1, tab2 = st.tabs(["Wyszukiwarka", "Treść"])

# Search tab: ask a question and show the retrieved articles.
with tab1:
    question = st.text_input('Zadaj pytanie:', 'Czy zgodnie z konstytucją wszyscy są równi wobec prawa?')
    retrieved_docs = retriever.invoke(question)
    # Format once and reuse the result instead of formatting the same
    # documents a second time for display.
    result = format_docs(retrieved_docs)
    st.markdown(result)

# Content tab: the full text of the constitution.
with tab2:
    st.markdown(get_full_content(constitution))