File size: 4,324 Bytes
33f20db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
import pandas as pd
import json
import re
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.runnables import (
    RunnableLambda
)
from langchain_core.documents import Document

@st.cache_data
def load_data():
    """Load the constitution JSON from disk.

    The path is relative to the process working directory. The result is
    wrapped in ``st.cache_data`` so the file is not re-read and re-parsed on
    every Streamlit script rerun.

    Returns:
        The parsed JSON document (whatever ``json.load`` produces for the
        file's top-level value).

    Raises:
        FileNotFoundError: If ``data/test/constitution.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification and the content is Polish text; be
    # explicit so decoding does not depend on the platform's locale encoding.
    with open("data/test/constitution.json", encoding="utf-8") as f:
        return json.load(f)


def format_for_search(chapter, subchapter, article_number, article_content):
    """Compose the searchable text for a single article.

    The article body comes first, followed by three newlines and then the
    heading lines: the chapter, the subchapter (if it has a heading) and the
    article number (unless it is ``"-1"``).

    Args:
        chapter: Mapping with ``"number"`` and ``"title"`` keys.
        subchapter: Mapping with a ``"heading"`` key.
        article_number: Article identifier; ``"-1"`` suppresses the article
            heading line.
        article_content: Text of the article.

    Returns:
        The combined text as a single string.
    """
    heading = subchapter["heading"]
    subchapter_line = "" if heading == "" else f"\n{heading}\n"

    article_line = "" if article_number == "-1" else f"\nArtykuł {article_number}\n"

    # Chapter number 0 is rendered by its title alone, without the
    # "Rozdział <n>" prefix.
    chapter_label = (
        chapter["title"]
        if chapter["number"] == 0
        else f"Rozdział {chapter['number']} {chapter['title']}"
    )

    return f"{article_content}\n\n\n{chapter_label}\n{subchapter_line}{article_line}"

constitution = load_data()

# One Document per article. The page content is the article body plus its
# chapter/subchapter/article headings (see format_for_search); the metadata
# keeps the structural identifiers so a retrieved document can be labelled.
documents = [
    Document(
        page_content=format_for_search(chapter, subchapter, article_number, article_content),
        metadata={
            "chapter_number": chapter["number"],
            "chapter_title": chapter["title"],
            "subchapter_title": subchapter["heading"],
            "subchapter_number": subchapter_number,
            "article_number": article_number,
        },
    )
    for chapter in constitution["chapters"]
    for subchapter_number, subchapter in chapter["subchapters"].items()
    for article_number, article_content in subchapter["articles"].items()
]

def get_full_content(constitution):
    """Render the whole constitution as one Markdown string.

    Output starts with a fixed title header, then for each chapter emits a
    chapter heading, each subchapter heading that is not blank, and every
    article (prefixed with ``**Art. N.**`` unless its number is ``"-1"``).

    Args:
        constitution: Mapping with a ``"chapters"`` list; each chapter has
            ``"number"``, ``"title"`` and a ``"subchapters"`` mapping whose
            values have ``"heading"`` and an ``"articles"`` mapping.

    Returns:
        The Markdown text.
    """
    pieces = ["# KONSTYTUCJA\n# RZECZYPOSPOLITEJ POLSKIEJ\n\n## z dnia 2 kwietnia 1997 r.\n"]

    for chapter in constitution["chapters"]:
        # Chapter number 0 gets no "Rozdział" heading line.
        if chapter["number"] != 0:
            heading_prefix = f"## **Rozdział {chapter['number']}**\n\n"
        else:
            heading_prefix = ""

        if chapter["title"].strip() != "":
            bold_title = f"**{chapter['title']}**"
        else:
            bold_title = ""

        pieces.append(f"\n\n{heading_prefix} {bold_title}")

        for subchapter in chapter["subchapters"].values():
            if subchapter["heading"].strip() != "":
                pieces.append(f"\n\n### **{subchapter['heading']}**")

            for article_number, article_content in subchapter["articles"].items():
                label = "" if article_number == "-1" else f"**Art. {article_number}.**\n\n"
                pieces.append(f"\n\n{label}{article_content}")

    return "".join(pieces)


@st.cache_resource
def load_models():
    """Build the embedding model, the vector store and the query retriever.

    Wrapped in ``st.cache_resource`` so the model load and the indexing of
    the module-level ``documents`` list are not repeated on every Streamlit
    script rerun.

    Returns:
        A tuple ``(embeddings, vector_store, retriever)`` where ``retriever``
        is a runnable that prefixes its input with ``"Pytanie: "`` and passes
        it to the vector store's retriever.
    """
    embeddings = HuggingFaceEmbeddings(model_name="ipipan/silver-retriever-base-v1")
    vector_store = Chroma.from_documents(documents=documents, embedding=embeddings)

    # NOTE(review): the "Pytanie: " prefix is presumably the query format the
    # embedding model expects -- confirm against the model card.
    add_question_prefix = RunnableLambda(lambda x: f"Pytanie: {x}")
    formatting_retriever = add_question_prefix | vector_store.as_retriever()

    # Only dense (embedding) retrieval is used; no BM25/ensemble hybrid is
    # wired in, so no BM25 index is built here.
    return embeddings, vector_store, formatting_retriever

embeddings, vector_store, retriever = load_models()

st.title('Konstytucja RP')
tab1, tab2 = st.tabs(["Wyszukiwarka", "Treść"])

# Search tab: take a question, retrieve matching articles and show them.
with tab1:
    question = st.text_input('Zadaj pytanie:', 'Czy zgodnie z konstytucją wszyscy są równi wobec prawa?')

    retrieved_docs = retriever.invoke(question)

    def format_single(doc):
        """Return one retrieved document as Markdown: a bold label, then the article text."""
        # format_for_search appends the heading lines after three newlines;
        # keep only the article body that precedes them.
        meta_removed = doc.page_content.split("\n\n\n")[0]
        if doc.metadata['article_number'] == "-1":
            prefix = "**Preambuła**"
        else:
            prefix = f"**Art. {doc.metadata['article_number']}.**"
        return f"{prefix}\n\n{meta_removed}"

    def format_docs(docs):
        """Join the formatted documents, separated by blank lines."""
        return "\n\n".join(format_single(doc) for doc in docs)

    # Format once and render that result.
    result = format_docs(retrieved_docs)
    st.markdown(result)

# Content tab: the full text of the constitution.
with tab2:
    st.markdown(get_full_content(constitution))